In [3]:
from __future__ import division
from __future__ import print_function
from __future__ import absolute_import

In [4]:
import tensorflow as tf

In [5]:
# Directory of the package that the functions defined in this notebook are written to.
PACKAGE = "./train"
from tools import make_src_dumper

# write_py(fn) is called right after each definition below to dump it into PACKAGE.
write_py = make_src_dumper(PACKAGE)

---
# ```tf.data``` input functions

#### These input functions read from any number of files containing pre-processed data
In our parlance, this data is at the **training stage**. It's been fetched from a BigQuery table and pre-processed. It's not what'll come in at prediction time!

That pre-processing function has been stored in a metadata directory that is available to us to treat data at prediction time (**signature stage**) exactly the same way that our training data has been treated.



```make_XXX_input_fn()``` returns an input_function. This function will be passed to the estimator, such that the estimator can call it in its own session/graph context to create a particularly useful input tensor. That input tensor will return the next batch of input records whenever it is evaluated. 

In [6]:
def make_input_fns():
    """Return the registry of input-function factories, keyed by file format.

    Returns:
        dict with the keys 'csv' and 'tfr', mapping to make_csv_input_fn and
        make_tfr_input_fn respectively.
    """
    # The imports stay inside the body so that they are part of the function
    # itself when it is handed to write_py below.
    from train.make_csv_input_fn import make_csv_input_fn as csv_factory
    from train.make_tfr_input_fn import make_tfr_input_fn as tfr_factory

    return dict(csv=csv_factory, tfr=tfr_factory)
write_py(make_input_fns)

'make_input_fns written to ./train/make_input_fns.py.'

----
#### Read from CSV
Not so good for production, but may come in handy for exploration.

In [7]:
def make_csv_input_fn(filename_pattern, batch_size, options):
    """Build an input function that reads pre-processed records from CSV files.

    Args:
        filename_pattern: glob pattern (expanded with tf.gfile.Glob) of the
            CSV files to read.
        batch_size: number of records per batch.
        options: dict-like with the optional keys
            'shuffle_buffer_size' -- buffer size for shuffling the raw lines;
                None or missing means no shuffling.
            'distribute' -- if truthy the input function returns the dataset
                itself, otherwise the next-batch tensors; defaults to False.

    Returns:
        A zero-argument function. The dataset it builds repeats indefinitely
        and yields (features, target) batches, where features is a dict keyed
        by the names in ORDERED_TRAINING_COLUMNS without LABEL_COLUMN.
    """
    import tensorflow as tf
    from train.model_config import ORDERED_TRAINING_DEFAULTS
    from train.model_config import ORDERED_TRAINING_COLUMNS
    from train.model_config import LABEL_COLUMN

    def _input_fn():
        # Missing keys fall back to "no shuffling, not distributed" instead
        # of failing with a KeyError.
        opts = options or {}
        shuffle_buffer_size = opts.get('shuffle_buffer_size')
        distribute = opts.get('distribute', False)

        filenames = tf.gfile.Glob(filename_pattern)
        dataset = tf.data.TextLineDataset(filenames)

        def decode_csv(row):
            # One tensor per column; the defaults also fix the column dtypes.
            cols = tf.decode_csv(row, record_defaults=ORDERED_TRAINING_DEFAULTS)
            return dict(zip(ORDERED_TRAINING_COLUMNS, cols))

        def pop_target(features):
            # Split the label off so that the features dict no longer holds it.
            target = features.pop(LABEL_COLUMN)
            return features, target

        if shuffle_buffer_size is not None:
            dataset = dataset.shuffle(buffer_size=shuffle_buffer_size)

        dataset = (dataset.repeat()
                   .map(decode_csv)
                   .map(pop_target)
                   .batch(batch_size))

        if distribute:
            return dataset
        return dataset.make_one_shot_iterator().get_next()

    return _input_fn

write_py(make_csv_input_fn)

'make_csv_input_fn written to ./train/make_csv_input_fn.py.'

#### Verify the input_function's behaviour

In [8]:
# Folder below the gs://going-tfx bucket that the verification cells read from.
DATASET = "samples"

In [10]:
# Smoke test: pull one batch of two records through the CSV input function.
make_input_fn = make_input_fns()['csv']
train_input_fn = make_input_fn(
    "gs://going-tfx/{}/eval_data/*".format(DATASET),
    batch_size=2,
    options={'shuffle_buffer_size': None,
             'distribute': False})

with tf.Session() as sess:
    # Deliberately not called `input`: that name would shadow the builtin
    # input() for the rest of the kernel session.
    next_batch = train_input_fn()
    # Increase the range to fetch more than one batch.
    res = [sess.run(next_batch) for _batch in range(1)]
res

AttributeError: 'module' object has no attribute 'compress'

---
#### Read from TFRecords File

In [18]:
def make_tfr_input_fn(filename_pattern, batch_size, options):
    """Build an input function that reads pre-processed records from TFRecord files.

    Args:
        filename_pattern: file pattern handed to
            tf.data.experimental.make_batched_features_dataset.
        batch_size: number of records per batch.
        options: dict-like, all keys optional:
            'shuffle_buffer_size' -- shuffle buffer size; None or missing
                disables shuffling (same meaning as in make_csv_input_fn).
            'prefetch_buffer_size', 'reader_num_threads',
            'parser_num_threads', 'sloppy_ordering' -- forwarded unchanged
                when present; otherwise TensorFlow's own defaults apply.
            'distribute' -- if truthy the input function returns the dataset
                itself, otherwise the next-batch tensors; defaults to False.

    Returns:
        A zero-argument function producing (features, label) batches, with
        LABEL_COLUMN used as the label key.
    """
    import tensorflow as tf
    from train.model_config import LABEL_COLUMN
    from train.model_config import TRAINING_METADATA

    feature_spec = TRAINING_METADATA.schema.as_feature_spec()

    # Tuning knobs that are passed through verbatim whenever the caller set them.
    tuning_keys = ('prefetch_buffer_size', 'reader_num_threads',
                   'parser_num_threads', 'sloppy_ordering')

    def _input_fn():
        opts = options or {}
        reader_kwargs = {key: opts[key] for key in tuning_keys if key in opts}

        shuffle_buffer_size = opts.get('shuffle_buffer_size')
        if shuffle_buffer_size is None:
            # Without this switch a None buffer size would be forwarded
            # while shuffling is still enabled.
            reader_kwargs['shuffle'] = False
        else:
            reader_kwargs['shuffle_buffer_size'] = shuffle_buffer_size

        dataset = tf.data.experimental.make_batched_features_dataset(
            file_pattern=filename_pattern,
            batch_size=batch_size,
            features=feature_spec,
            label_key=LABEL_COLUMN,
            **reader_kwargs)

        if opts.get('distribute', False):
            return dataset
        return dataset.make_one_shot_iterator().get_next()
    return _input_fn

write_py(make_tfr_input_fn)

'make_tfr_input_fn written to ./train/make_tfr_input_fn.py.'

#### Verify behaviour

In [19]:
# Smoke test: pull one batch of two records through the TFRecord input function.
make_input_fn = make_input_fns()['tfr']
train_input_fn = make_input_fn(
    "gs://going-tfx/{}/train_data/atl_june_tfr-00000-of-*".format(DATASET),
    batch_size=2,
    options={'shuffle_buffer_size': 10000,
             'prefetch_buffer_size': 10000,
             'reader_num_threads': 16,
             'parser_num_threads': 16,
             'sloppy_ordering': True,
             'distribute': False})

with tf.Session() as sess:
    # Deliberately not called `input`: that name would shadow the builtin
    # input() for the rest of the kernel session.
    next_batch = train_input_fn()
    # Increase the range to fetch more than one batch.
    res = [sess.run(next_batch) for _batch in range(1)]
res

[({'AIRLINE': array([0, 1]),
   'ARR': array([58, 94]),
   'ARR_LAT': array([30.19, 28.1 ], dtype=float32),
   'ARR_LON': array([-97.67, -80.64], dtype=float32),
   'DEP_DELAY': array([0.07692308, 0.07692308], dtype=float32),
   'DEP_DOW': array([5, 2]),
   'DEP_HOD': array([22, 16]),
   'DEP_LAT': array([33.63, 33.63], dtype=float32),
   'DEP_LON': array([-84.42, -84.42], dtype=float32),
   'DIFF_LAT': array([0.2873246 , 0.23924546], dtype=float32),
   'DIFF_LON': array([0.6470146, 0.8298969], dtype=float32),
   'DISTANCE': array([0.1659419 , 0.08237782], dtype=float32),
   'MEAN_TEMP_ARR': array([0.68592054, 0.59205776], dtype=float32),
   'MEAN_TEMP_DEP': array([0.605042  , 0.75210077], dtype=float32),
   'MEAN_VIS_ARR': array([0.45161292, 0.3817204 ], dtype=float32),
   'MEAN_VIS_DEP': array([0.875     , 0.81250006], dtype=float32),
   'WND_SPD_ARR': array([0.00860086, 0.00450045], dtype=float32),
   'WND_SPD_DEP': array([0.2868217 , 0.33333334], dtype=float32)},
  array([  9., -15