In [11]:
import os
import tensorflow as tf
import numpy as np
import pandas as pd
%matplotlib inline

In [12]:
# Directory that holds the local copy of the CSV data. The DATA_DIR environment variable
# overrides it, so the notebook also runs on machines without this exact absolute path.
DATA_DIR = os.environ.get('DATA_DIR', '/home/jupyter/data')

### Dataset from a NumPy array

In [4]:
# Draw a 10x2 sample from the standard normal distribution (loc=0, scale=1 are the defaults).
x = np.random.normal(loc=0.0, scale=1.0, size=(10, 2))
x

array([[-0.2673395 ,  0.8397985 ],
       [ 0.4215114 ,  1.59650183],
       [ 0.32095545, -0.87147807],
       [ 1.68237833, -1.64231524],
       [-0.60458029,  0.23117716],
       [-0.11939795,  0.98115812],
       [-2.58634282, -0.06939115],
       [-0.40390687,  1.44146753],
       [-0.569293  , -1.22308503],
       [ 0.63920614,  0.04912573]])

In [5]:
# Build a Dataset from the array x by slicing it along its first axis (one row per element).
inp = tf.data.Dataset.from_tensor_slices(x)

In [6]:
# Group the rows into batches of two, then expose the "next batch" op of a one-shot iterator.
batched = inp.batch(2)
iterator = batched.make_one_shot_iterator()
b = iterator.get_next()

In [7]:
# Evaluate the get_next op twice; each evaluation returns the following batch.
with tf.Session() as sess:
    first_two = [sess.run(b), sess.run(b)]
_b, _c = first_two
_b, _c

(array([[-0.2673395 ,  0.8397985 ],
        [ 0.4215114 ,  1.59650183]]), array([[ 0.32095545, -0.87147807],
        [ 1.68237833, -1.64231524]]))

### Dataset from a generator

In [8]:
# Ten random 2-vectors that the generator below replays one row at a time.
sequence = np.random.normal(size=(10, 2))
def generator():
    """Yield the rows of the module-level `sequence` array, in order."""
    yield from sequence

In [9]:
# Wrap the Python generator as a Dataset; every element is declared as a float64 vector of length 2.
inp = tf.data.Dataset.from_generator(
    generator,
    output_types=tf.float64,
    output_shapes=tf.TensorShape([2]),
)
batch_iterator = inp.batch(2).make_one_shot_iterator()
b = batch_iterator.get_next()
with tf.Session() as sess:
    # Two evaluations of the get_next op give the first two batches.
    _b, _c = [sess.run(b) for _step in range(2)]
_b, _c

(array([[-1.65911584, -0.319281  ],
        [-0.38373948, -0.34553233]]), array([[-1.55824123,  0.70972351],
        [ 0.33166813,  0.16585409]]))

### Dataset from CSV file
A subset of the famous flight dataset in BigQuery: all flights that departed from Atlanta on any June day between 2006 and 2017.

37 MB in 403,358 records. That's easy to handle in memory for exploration.

In [14]:
# DataFrame.from_csv is deprecated and was removed in pandas 1.0. read_csv with
# index_col=0 and parse_dates=True is the documented equivalent: the first CSV
# column (FL_DATE) becomes the index and is parsed as dates.
atlanta_june = pd.read_csv(
    os.path.join(DATA_DIR, "atl_june.csv"),
    index_col=0,
    parse_dates=True,
)

  """Entry point for launching an IPython kernel.


In [15]:
# Peek at the first ten rows only, rather than rendering the whole frame.
atlanta_june.head(10)

Unnamed: 0_level_0,FL_YEAR,FL_MONTH,FL_DOM,FL_DOW,UNIQUE_CARRIER,FL_NUM,ORIGIN_AIRPORT_SEQ_ID,DEST_AIRPORT_SEQ_ID,ORIGIN,DEST,...,TAXI_OUT,WHEELS_OFF,WHEELS_ON,TAXI_IN,CRS_ARR_TIME,ARR_DELAY,CANCELLED,CANCELLATION_CODE,DIVERTED,DISTANCE
FL_DATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2016-06-01,2016,6,1,4,EV,5602,1039705,1295104,ATL,LFT,...,47,1803,1818,5,1705,78,0,,0,503
2016-06-01,2016,6,1,4,DL,2614,1039705,1295104,ATL,LFT,...,20,2028,2039,5,2056,-12,0,,0,503
2016-06-01,2016,6,1,4,EV,5071,1039705,1295104,ATL,LFT,...,21,1429,1445,5,1344,66,0,,0,503
2016-06-01,2016,6,1,4,EV,5597,1039705,1295104,ATL,LFT,...,16,1019,1041,7,1051,-3,0,,0,503
2016-06-02,2016,6,2,5,EV,5071,1039705,1295104,ATL,LFT,...,23,1322,1338,25,1344,19,0,,0,503
2016-06-02,2016,6,2,5,EV,5597,1039705,1295104,ATL,LFT,...,29,1254,1312,6,1051,147,0,,0,503
2016-06-02,2016,6,2,5,DL,2614,1039705,1295104,ATL,LFT,...,24,2029,2039,7,2056,-10,0,,0,503
2016-06-02,2016,6,2,5,EV,5602,1039705,1295104,ATL,LFT,...,25,1646,1654,5,1705,-6,0,,0,503
2016-06-03,2016,6,3,6,EV,5071,1039705,1295104,ATL,LFT,...,18,1344,1406,5,1344,27,0,,0,503
2016-06-03,2016,6,3,6,DL,2614,1039705,1295104,ATL,LFT,...,21,2027,2044,7,2056,-5,0,,0,503


In [16]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
atlanta_june.describe()

Unnamed: 0,FL_YEAR,FL_MONTH,FL_DOM,FL_DOW,FL_NUM,ORIGIN_AIRPORT_SEQ_ID,DEST_AIRPORT_SEQ_ID,CRS_DEP_TIME,DEP_DELAY,TAXI_OUT,WHEELS_OFF,WHEELS_ON,TAXI_IN,CRS_ARR_TIME,ARR_DELAY,CANCELLED,CANCELLATION_CODE,DIVERTED,DISTANCE
count,403358.0,403358.0,403358.0,403358.0,403358.0,403358.0,403358.0,403358.0,403358.0,403358.0,403358.0,403358.0,403358.0,403358.0,403358.0,403358.0,0.0,403358.0,403358.0
mean,2011.395998,6.0,15.575033,3.958536,2324.65421,1039704.0,1278804.0,1460.046948,15.029393,20.590726,1487.350684,1514.163475,6.049626,1552.945956,11.898465,0.0,,0.0,643.922352
std,3.438737,0.0,8.637871,1.968028,1826.668615,0.9995802,151929.0,463.110599,39.32574,11.310037,498.089658,538.377473,4.550384,499.354124,42.376085,0.0,,0.0,471.52315
min,2006.0,6.0,1.0,1.0,1.0,1039703.0,1013501.0,500.0,-54.0,0.0,1.0,1.0,0.0,0.0,-61.0,0.0,,0.0,79.0
25%,2008.0,6.0,8.0,2.0,873.0,1039703.0,1129802.0,1035.0,-3.0,14.0,1051.0,1128.0,4.0,1150.0,-10.0,0.0,,0.0,356.0
50%,2011.0,6.0,16.0,4.0,1711.0,1039703.0,1294502.0,1453.0,0.0,18.0,1510.0,1535.0,5.0,1557.0,-1.0,0.0,,0.0,547.0
75%,2014.0,6.0,23.0,6.0,4332.0,1039705.0,1410001.0,1856.0,15.0,24.0,1925.0,1948.0,7.0,1958.0,16.0,0.0,,0.0,743.0
max,2017.0,6.0,30.0,7.0,7425.0,1039705.0,1591902.0,2350.0,1427.0,226.0,2400.0,2400.0,230.0,2359.0,1444.0,0.0,,0.0,4502.0


#### Creating a `tf.data.TextLineDataset`

In [17]:
# Column labels of the frame. FL_DATE is not listed because it is the index, not a column.
atlanta_june.columns

Index(['FL_YEAR', 'FL_MONTH', 'FL_DOM', 'FL_DOW', 'UNIQUE_CARRIER', 'FL_NUM',
       'ORIGIN_AIRPORT_SEQ_ID', 'DEST_AIRPORT_SEQ_ID', 'ORIGIN', 'DEST',
       'CRS_DEP_TIME', 'DEP_DELAY', 'TAXI_OUT', 'WHEELS_OFF', 'WHEELS_ON',
       'TAXI_IN', 'CRS_ARR_TIME', 'ARR_DELAY', 'CANCELLED',
       'CANCELLATION_CODE', 'DIVERTED', 'DISTANCE'],
      dtype='object')

In [18]:
# One (column name, record default) pair per CSV field, in file order.
# A string default makes the field decode as a string; fields with an empty
# default come out as float32 in the decoded batches (and, per the
# tf.decode_csv documentation, an empty default marks the field as required).
_COLUMN_SPECS = [
    ('FL_DATE', ["-"]),
    ('FL_YEAR', []),
    ('FL_MONTH', []),
    ('FL_DOM', []),
    ('FL_DOW', []),
    ('UNIQUE_CARRIER', ["-"]),
    ('FL_NUM', ["-"]),
    ('ORIGIN_AIRPORT_SEQ_ID', ["-"]),
    ('DEST_AIRPORT_SEQ_ID', ["-"]),
    ('ORIGIN', ["-"]),
    ('DEST', ["-"]),
    ('CRS_DEP_TIME', []),
    ('DEP_DELAY', []),
    ('TAXI_OUT', []),
    ('WHEELS_OFF', []),
    ('WHEELS_ON', []),
    ('TAXI_IN', []),
    ('CRS_ARR_TIME', []),
    ('ARR_DELAY', []),
    ('CANCELLED', []),
    ('CANCELLATION_CODE', ['NONE']),
    ('DIVERTED', []),
    ('DISTANCE', []),
]
# Parallel lists consumed by the CSV decoding cell.
COLUMNS = [name for name, _default in _COLUMN_SPECS]
DEFAULTS = [default for _name, default in _COLUMN_SPECS]

We're not using the easier `tf.contrib.data.make_csv_dataset` API here because it apparently assumes the data is already clean. With a `TextLineDataset`, we can process each row as we see fit: we can set defaults and types, and handle erroneous rows.

In [19]:
# Indices (into COLUMNS) of the CSV fields to decode, in sorted order.
# Narrow this list to read only a subset of the columns.
SELECT = list(range(len(COLUMNS)))
inp = tf.data.TextLineDataset("gs://ingres/atl_june.csv")
def decode_csv(row):
    """Decode one CSV text line into a dict mapping column name to tensor.

    Args:
        row: one element of the TextLineDataset, i.e. a single line of the file.

    Returns:
        A dict keyed by the names of the columns listed in SELECT; each value
        is the tensor tf.decode_csv produced for that field.
    """
    # tf.decode_csv wants exactly one default per *selected* column, so the
    # defaults are filtered by SELECT just like the names below. Passing the
    # full DEFAULTS list only works while SELECT covers every column.
    selected_defaults = [DEFAULTS[i] for i in SELECT]
    cols = tf.decode_csv(row, select_cols=SELECT, record_defaults=selected_defaults)
    selected_names = [COLUMNS[i] for i in SELECT]
    return dict(zip(selected_names, cols))
# Drop the first line (the CSV header), decode every remaining line, batch two rows at a time.
inp = inp.skip(1).map(decode_csv).batch(2)

In [20]:
# One-shot iterator over the batched CSV dataset; `b` is its "next batch" op.
csv_iterator = inp.make_one_shot_iterator()
b = csv_iterator.get_next()

In [21]:
# Fetch the first two decoded batches; the tuple is evaluated left to right.
with tf.Session() as sess:
    _b, _c = sess.run(b), sess.run(b)
_b, _c

({'ARR_DELAY': array([ 78., -12.], dtype=float32),
  'CANCELLATION_CODE': array([b'NONE', b'NONE'], dtype=object),
  'CANCELLED': array([0., 0.], dtype=float32),
  'CRS_ARR_TIME': array([1705., 2056.], dtype=float32),
  'CRS_DEP_TIME': array([1621., 2010.], dtype=float32),
  'DEP_DELAY': array([55., -2.], dtype=float32),
  'DEST': array([b'LFT', b'LFT'], dtype=object),
  'DEST_AIRPORT_SEQ_ID': array([b'1295104', b'1295104'], dtype=object),
  'DISTANCE': array([503., 503.], dtype=float32),
  'DIVERTED': array([0., 0.], dtype=float32),
  'FL_DATE': array([b'2016-06-01', b'2016-06-01'], dtype=object),
  'FL_DOM': array([1., 1.], dtype=float32),
  'FL_DOW': array([4., 4.], dtype=float32),
  'FL_MONTH': array([6., 6.], dtype=float32),
  'FL_NUM': array([b'5602', b'2614'], dtype=object),
  'FL_YEAR': array([2016., 2016.], dtype=float32),
  'ORIGIN': array([b'ATL', b'ATL'], dtype=object),
  'ORIGIN_AIRPORT_SEQ_ID': array([b'1039705', b'1039705'], dtype=object),
  'TAXI_IN': array([5., 5.], dt