In [1]:
from __future__ import division
from __future__ import print_function
from __future__ import absolute_import

In [2]:
import os
import pandas as pd
import tensorflow as tf
import google.datalab.bigquery as dlbq

In [3]:
# Report the TensorFlow version, then one line per accelerator capability
# that this build / machine actually offers.
print(tf.__version__)
capability_checks = (
    (tf.test.is_built_with_cuda, "Built with cuda"),
    (tf.test.is_gpu_available, "GPU available."),
)
for is_present, message in capability_checks:
    if is_present():
        print(message)

1.12.0


In [4]:
tf.test.is_built_with_cuda()

False

In [5]:
tf.logging.set_verbosity(tf.logging.INFO)

---
##### From Jupyter notebook to python package
From exploration to production.

This little tool dumps a given function to a file with the same name in a certain package

In [6]:
PACKAGE="./train"
from tools import make_src_dumper
write_py = make_src_dumper(PACKAGE)

---

# Training and Evaluation Data
Training and evaluation data should be provided in files already.

If not, please go back and run ```Processing_ATL_JUNE.ipynb```

#### Fetch a sample file for examination

In [7]:
DATASET='samples'
a_training_file = !gsutil ls gs://going-tfx/$DATASET/train_data/atl_june_csv-00000-of-*
a_training_file = a_training_file[0]
TEMP_DIR='/tmp/atl_june/{}'.format(DATASET)
!rm -rf $TEMP_DIR
!mkdir -p $TEMP_DIR
!gsutil cp $a_training_file $TEMP_DIR
a_training_file = !ls $TEMP_DIR
a_training_file = os.path.join(TEMP_DIR,a_training_file[0])
res=!wc -l $a_training_file
res=res[0].split(" ")
print()
print("{} records in {}".format(res[0], res[1]))

Copying gs://going-tfx/samples/train_data/atl_june_csv-00000-of-00024...
/ [1 files][141.4 KiB/141.4 KiB]                                                
Operation completed over 1 objects/141.4 KiB.                                    

1000 records in /tmp/atl_june/samples/atl_june_csv-00000-of-00024


#### Have a look into the first training data file

This data is at the **training data** stage. It's got all and only the columns we want. It has been normalized and integerized. We'll use the ```tf.feature_column``` API to further prepare categorical features.

In [8]:
# The column names are supplied via names=, taken from the training
# package's configuration.
from train.model_config import ORDERED_TRAINING_COLUMNS
probe = pd.read_csv(a_training_file, names=ORDERED_TRAINING_COLUMNS)
# sample(frac=1.0) shuffles all rows; the slice then shows the first two of
# the shuffled frame, i.e. two randomly chosen records.
probe.sample(frac=1.0)[:2]

Unnamed: 0,AIRLINE,ARR,ARR_DELAY,ARR_LAT,ARR_LON,DEP_DELAY,DEP_DOW,DEP_HOD,DEP_LAT,DEP_LON,DIFF_LAT,DIFF_LON,DISTANCE,MEAN_TEMP_ARR,MEAN_TEMP_DEP,MEAN_VIS_ARR,MEAN_VIS_DEP,WND_SPD_ARR,WND_SPD_DEP
570,9,46,3.0,34.89,-82.21,0.078493,6,17,33.63,-84.42,0.395445,0.813037,0.016807,0.550541,0.760504,0.489247,1.0,0.0043,0.263566
614,0,68,-3.0,27.39,-82.55,0.076923,6,21,33.63,-84.42,0.222912,0.809386,0.08291,0.635379,0.760504,0.489247,1.0,0.007701,0.263566


In [9]:
probe.describe()

Unnamed: 0,AIRLINE,ARR,ARR_DELAY,ARR_LAT,ARR_LON,DEP_DELAY,DEP_DOW,DEP_HOD,DEP_LAT,DEP_LON,DIFF_LAT,DIFF_LON,DISTANCE,MEAN_TEMP_ARR,MEAN_TEMP_DEP,MEAN_VIS_ARR,MEAN_VIS_DEP,WND_SPD_ARR,WND_SPD_DEP
count,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0
mean,1.367,44.106,17.27,35.45627,-86.81613,0.107582,3.696,14.435,33.63,-84.42,0.408472,0.763572,0.123891,0.522581,0.556386,0.435118,0.700406,0.00608,0.374411
std,2.387889,36.831227,35.743373,5.25087,11.891074,0.052509,1.726431,4.623327,0.0,0.0,0.120793,0.127696,0.104027,0.133286,0.234825,0.077546,0.281966,0.002626,0.109097
min,0.0,0.0,-29.0,18.33,-157.92,0.058085,1.0,6.0,33.63,-84.42,0.014493,0.0,0.0,0.15343,0.109244,0.112903,0.0,0.0002,0.263566
25%,0.0,12.75,-5.0,31.32,-90.5,0.076923,2.0,10.0,33.63,-84.42,0.31332,0.724012,0.05906,0.431408,0.42437,0.408602,0.59375,0.0042,0.263566
50%,1.0,35.0,6.0,35.43,-83.31,0.083987,4.0,14.0,33.63,-84.42,0.407867,0.801224,0.103682,0.546931,0.592437,0.47043,0.6875,0.005701,0.364341
75%,2.0,68.0,28.25,39.87,-80.04,0.117739,6.0,18.0,33.63,-84.42,0.510007,0.83634,0.147357,0.615523,0.760504,0.489247,1.0,0.007401,0.449612
max,18.0,169.0,267.0,61.17,-64.97,0.503925,6.0,23.0,33.63,-84.42,1.0,0.998174,1.0,0.951263,0.928571,0.806452,1.0,0.018602,0.612403


---
# Feature engineering for categorical columns

Categorical columns need to be treated once more to arrive at numerical input suitable for model training. That involves bucketizing, the use of dictionaries, feature crossing and embedding.

#### Find ranges to bucketize latitude and longitude 
We can easily understand the range of values with the help of a bq query and ```pandas.describe()```

In [10]:
# Collect every distinct arrival airport together with its coordinates from
# the flights table, then summarise the latitude / longitude value ranges.
query="""
select 
    distinct arrival_airport as airport, arrival_lat as lat, arrival_lon as lon 
from 
    `bigquery-samples.airline_ontime_data.flights`
"""
# Run the query through the datalab BigQuery client and load the result
# into a DataFrame.
locations = dlbq.Query(query).execute().result().to_dataframe()
locations.describe()

Unnamed: 0,lat,lon
count,344.0,344.0
mean,38.49157,-98.531599
std,8.547964,21.746974
min,13.48,-176.64
25%,33.45,-111.675
50%,38.715,-93.3
75%,42.9075,-82.4975
max,71.28,-64.8


In [11]:
# Materialise the boundaries as a real list: on Python 3 a bare range() is a
# lazy range object, which neither displays as a list nor counts as one.
lat_boundaries = list(range(10, 80, 5))
lat_boundaries

[10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60, 65, 70, 75]

In [12]:
# Materialise the boundaries as a real list: on Python 3 a bare range() is a
# lazy range object, which neither displays as a list nor counts as one.
lon_boundaries = list(range(-100, -55, 5))
lon_boundaries

[-100, -95, -90, -85, -80, -75, -70, -65, -60]

We're going to use those boundaries in the function below

#### Using tf feature_column api for bucketizing, crossing and embedding

In [13]:
def create_feature_columns():
    """
    Build the ``tf.feature_column`` definitions used by the model.

    The imports live inside the function on purpose: the function source is
    dumped into its own module of the training package, so it has to be
    self-contained.

    returns: a dict of feature columns for wide and deep input
        'deep': the numeric columns plus three 10-dimensional embeddings
                (departure hour x day-of-week, arrival lat x lon,
                departure lat x lon)
        'wide': indicator columns for 'AIRLINE' and 'ARR'
    """

    from tensorflow.feature_column import indicator_column as ind
    from tensorflow.feature_column import numeric_column as num
    from tensorflow.feature_column import bucketized_column as buck
    from tensorflow.feature_column import crossed_column as cross
    from tensorflow.feature_column import embedding_column as emb
    from tensorflow.feature_column import categorical_column_with_identity as cid

    ################################################################
    #  Numerical columns for the pre-processed features
    ################################################################
    feature_columns = [
        num(col) for col in [
            'DEP_DELAY',
            'MEAN_TEMP_DEP', 'MEAN_VIS_DEP', 'WND_SPD_DEP',
            'MEAN_TEMP_ARR', 'MEAN_VIS_ARR', 'WND_SPD_ARR',
            'DIFF_LAT', 'DIFF_LON', 'DISTANCE']]

    ################################################################
    #  categorical from ints, bucket counts from examination of the
    #  full dataset
    ################################################################
    airline = ind(cid('AIRLINE', num_buckets=30))
    arrival = ind(cid('ARR', num_buckets=400))

    ################################################################
    #  Crossed and embedded
    ################################################################
    # bucketized_column expects its boundaries as a sorted list or tuple.
    # A bare range() is only a list on Python 2, so convert explicitly.
    lat_boundaries = list(range(10, 80, 5))
    # NOTE(review): every longitude west of -100 falls into the first
    # bucket -- confirm that this coarse treatment is intended.
    lon_boundaries = list(range(-100, -55, 5))
    # Hash bucket size for the lat x lon crosses.
    cross_size = len(lat_boundaries) * len(lon_boundaries)

    arr_geo_emb = emb(cross([
        buck(num('ARR_LAT'), lat_boundaries),
        buck(num('ARR_LON'), lon_boundaries)], cross_size), 10)

    dep_geo_emb = emb(cross([
        buck(num("DEP_LAT"), lat_boundaries),
        buck(num("DEP_LON"), lon_boundaries)], cross_size), 10)

    # Hour-of-day crossed with day-of-week, hashed into 7*24 buckets.
    dep_how_emb = emb(cross([
        cid("DEP_HOD", num_buckets=24),
        cid("DEP_DOW", num_buckets=8)], 7*24), 10)

    ################################################################
    #  all together
    ################################################################
    return {
        'deep': feature_columns + [dep_how_emb, arr_geo_emb, dep_geo_emb],
        'wide': [airline, arrival]}
    
write_py(create_feature_columns)

'create_feature_columns written to ./train/create_feature_columns.py.'

In [14]:
create_feature_columns()

{'deep': [_NumericColumn(key='DEP_DELAY', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
  _NumericColumn(key='MEAN_TEMP_DEP', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
  _NumericColumn(key='MEAN_VIS_DEP', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
  _NumericColumn(key='WND_SPD_DEP', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
  _NumericColumn(key='MEAN_TEMP_ARR', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
  _NumericColumn(key='MEAN_VIS_ARR', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
  _NumericColumn(key='WND_SPD_ARR', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
  _NumericColumn(key='DIFF_LAT', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
  _NumericColumn(key='DIFF_LON', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
  _NumericColumn(key='DISTANCE'

These feature columns encode a construction plan. The ```tf.feature_column.input_layer()``` helper will construct a sub-graph from this plan and feed the root (the *result*) of the graph into the model. You see the pattern: All parts of the tensor graph are created within the session/graph context of the ```Estimator``` API. Never outside of it.

# The available hypotheses
Please see ```Hypotheses_Workbench.ipynb``` for more insight into the various hypothesis functions

In [15]:
from train.make_hypotheses import make_hypotheses
all_hypotheses = make_hypotheses()
import inspect
print(inspect.getsource(all_hypotheses['linear']))

def hypothesis_linear(features, feature_columns, options):
    
    import tensorflow as tf
    from train.train_tools import weight_summary

    with tf.name_scope('Linear'):
    
        all_feature_columns = feature_columns['wide'] + feature_columns['deep']

        input_layer = tf.feature_column.input_layer( 
            features, feature_columns=all_feature_columns)

        out = tf.layers.dense(input_layer, 1, activation=None)
        weight_summary(out)
    
    return out



# The model function
The model function is responsible for providing different variants of the actual model suitable for training, evaluation and prediction

In [16]:
def make_model_fn(feature_columns, options, hypothesis):
    """
    Create the ``model_fn`` to hand to ``tf.estimator.Estimator``.

    Args:
        feature_columns: handed unchanged to ``hypothesis``.
        options: dict of settings; ``options['learning_rate']`` configures
            the gradient descent optimizer, and the whole dict is also
            handed to ``hypothesis``.
        hypothesis: callable ``(features, feature_columns, options)`` that
            builds the model and returns its output.

    Returns:
        A function ``_model_fn(features, labels, mode)`` that returns the
        ``EstimatorSpec`` for prediction, evaluation or training.
    """

    import tensorflow as tf

    def _model_fn(features, labels, mode):

        out = hypothesis(features, feature_columns, options)

        if mode == tf.estimator.ModeKeys.PREDICT:
            return tf.estimator.EstimatorSpec(mode, predictions=out)

        # Add a trailing axis to the labels before comparing them with the
        # model output.
        labels = tf.expand_dims(labels, -1)
        loss = tf.losses.mean_squared_error(labels, out)

        if mode == tf.estimator.ModeKeys.EVAL:
            # The metric is only reported during evaluation, so it is only
            # built here.
            mean_error = tf.metrics.mean(tf.abs(labels - out))
            return tf.estimator.EstimatorSpec(
                mode=mode,
                loss=loss,
                eval_metric_ops={'mean_error': mean_error}
            )

        optimizer = tf.train.GradientDescentOptimizer(options['learning_rate'])

        # Compute the gradients once and reuse them for both the histogram
        # summaries and the update step, so the summaries show exactly the
        # gradients that are applied.
        grads_and_vars = optimizer.compute_gradients(loss)
        for grad, var in grads_and_vars:
            if grad is None:
                # No gradient for this variable, so nothing to summarise.
                continue
            name = ("%s-grad" % var.name).replace(":", "_")
            tf.summary.histogram(name, grad)

        train_op = optimizer.apply_gradients(
            grads_and_vars,
            global_step=tf.train.get_or_create_global_step())

        return tf.estimator.EstimatorSpec(
            mode,
            loss=loss,
            train_op=train_op)

    return _model_fn
write_py(make_model_fn)

'make_model_fn written to ./train/make_model_fn.py.'

---
# Available input functions
Please examine **```Input_Functions.ipynb```** for more information about the available input functions.

For now, have a look at the tfrecord input function we're going to use.

In [17]:
from train.make_input_fns import make_input_fns
tfr_input_fn = make_input_fns()['tfr']
import inspect
print(inspect.getsource(tfr_input_fn))

def make_tfr_input_fn(filename_pattern, batch_size, options):
    
    import tensorflow as tf
    from train.model_config import LABEL_COLUMN
    from train.model_config import TRAINING_METADATA

    feature_spec = TRAINING_METADATA.schema.as_feature_spec()

    def _input_fn():
        dataset = tf.data.experimental.make_batched_features_dataset(
            file_pattern=filename_pattern,
            batch_size=batch_size,
            features=feature_spec,
            shuffle_buffer_size=options['shuffle_buffer_size'],
            prefetch_buffer_size=options['prefetch_buffer_size'],
            reader_num_threads=options['reader_num_threads'],
            parser_num_threads=options['parser_num_threads'],
            sloppy_ordering=options['sloppy_ordering'],
            label_key=LABEL_COLUMN)

        if options['distribute']:
            return dataset 
        else:
            return dataset.make_one_shot_iterator().get_next()
    return _input_fn



---
# Training and Evaluation

In [18]:
def train_and_evaluate(options):
    """
    Train and evaluate a model inside an mlflow run.

    Args:
        options: dict of settings. Keys read here:
            logged as mlflow params: 'base_dir', 'file_format',
                'train_batch_size', 'max_train_steps',
                'reader_num_threads', 'parser_num_threads',
                'prefetch_buffer_size';
            input: 'file_format', 'train_data_pattern', 'eval_data_pattern',
                'train_batch_size', 'eval_batch_size';
            model: 'hypothesis', 'metadata_dir';
            execution: 'max_train_steps', 'eval_steps', 'throttle_secs',
                'distribute', 'model_dir', 'save_summary_steps',
                'save_checkpoints_steps', 'log_step_count_steps'.
            The whole dict is also passed on to the input functions and to
            the model function.

    Returns:
        Whatever ``tf.estimator.train_and_evaluate`` returned. When its
        first element is a non-empty metrics dict, 'loss' and 'mean_error'
        from it are logged as mlflow metrics.
    """

    import tensorflow as tf
    from tensorflow.estimator import RunConfig
    from tensorflow.contrib.distribute import MirroredStrategy
    import mlflow

    from train.make_model_fn import make_model_fn
    from train.make_tft_serving_input_fn import make_tft_serving_input_fn
    from train.create_feature_columns import create_feature_columns
    from train.make_hypotheses import make_hypotheses
    from train.make_input_fns import make_input_fns

    with mlflow.start_run():

        log_params = [
            'base_dir',
            'file_format',
            'train_batch_size',
            'max_train_steps',
            'reader_num_threads',
            'parser_num_threads',
            'prefetch_buffer_size'
        ]

        for key in log_params:
            mlflow.log_param(key, options[key])

        ##################################################################
        #   Train and Eval Input Functions
        ##################################################################
        make_input_fn = make_input_fns()[options['file_format']]

        train_input_fn = make_input_fn(options['train_data_pattern'],
                                       options['train_batch_size'],
                                       options)

        eval_input_fn = make_input_fn(options['eval_data_pattern'],
                                      options['eval_batch_size'],
                                      options)

        ##################################################################
        #   Create the hypothesis and the model_fn
        ##################################################################
        hypothesis = make_hypotheses()[options['hypothesis']]
        feature_columns = create_feature_columns()
        model_fn = make_model_fn(feature_columns, options, hypothesis)

        ##################################################################
        #    Train and Eval Spec
        ##################################################################
        serving_input_fn = make_tft_serving_input_fn(options['metadata_dir'])
        exporter = tf.estimator.LatestExporter('exporter', serving_input_fn)

        train_spec = tf.estimator.TrainSpec(
            input_fn=train_input_fn,
            max_steps=options['max_train_steps'])

        eval_spec = tf.estimator.EvalSpec(
            input_fn=eval_input_fn, exporters=exporter,
            steps=options['eval_steps'],
            throttle_secs=options['throttle_secs'],
            start_delay_secs=0)

        ##################################################################
        #   Create and configure the estimator
        ##################################################################
        strategy = MirroredStrategy() if options['distribute'] else None
        config = RunConfig(model_dir=options['model_dir'],
                           save_summary_steps=options['save_summary_steps'],
                           train_distribute=strategy,
                           save_checkpoints_steps=options['save_checkpoints_steps'],
                           log_step_count_steps=options['log_step_count_steps'])

        estimator = tf.estimator.Estimator(
            config=config,
            model_fn=model_fn)

        ##################################################################
        #   Finally, train and evaluate the model
        ##################################################################
        final_eval = tf.estimator.train_and_evaluate(
            estimator,
            train_spec=train_spec,
            eval_spec=eval_spec)

        # TensorFlow documents the return value as undefined in distributed
        # training mode, so only log metrics when an evaluation result is
        # actually present instead of crashing after a finished training.
        eval_metrics = final_eval[0] if final_eval else None
        if eval_metrics:
            mlflow.log_metric('loss', eval_metrics['loss'])
            mlflow.log_metric('mean_error', eval_metrics['mean_error'])

        return final_eval
        
    
write_py(train_and_evaluate)

'train_and_evaluate written to ./train/train_and_evaluate.py.'

#### Run from within the notebook kernel

In [19]:
from train.train_tools import join_paths

args={}

# file locations
args['base_dir']='gs://going-tfx/{}'.format(DATASET)
args['metadata_dir']='metadata'
args['model_dir']='model'
args['train_data_pattern']='train_data/atl_june_tfr*'
args['eval_data_pattern']='eval_data/atl_june_tfr*'
args['file_format']='tfr'

# train and eval parameters
args['train_batch_size']=256
args['eval_batch_size']=1024
args['max_train_steps']=2000
args['eval_steps']=10

# Execution parameters
args['reader_num_threads']=16
args['parser_num_threads']=16
args['prefetch_buffer_size']=10000
args['shuffle_buffer_size']=10000
args['save_checkpoints_steps']=2000
args['log_step_count_steps']=200
args['throttle_secs']=30
args['distribute']=False
args['sloppy_ordering']=True
args['save_summary_steps']=100

# Model parameters
args['learning_rate']=1e-3
args['hypothesis']='linear'

args = join_paths(args)

model_dir = args['model_dir']
print("using directory {} to store the model. ")
print("Remove the directory if you want to start from scratch".format(model_dir))
_ = !gsutil -m rm -rf $model_dir

print("=====================================================================================================")
from tools import create_runpy
create_runpy("run_task.sh", args)
!cat ./run_task.sh
print("=====================================================================================================")
print()

# if you want to keep your jupyter notebook clean, rather use run_task.sh from a terminal 
res = train_and_evaluate(args)

using directory {} to store the model. 
Remove the directory if you want to start from scratch
export PYTHONPATH=${PYTHONPATH}:${PWD}
python -m train.task \
  --eval_steps="10"  \
  --parser_num_threads="16"  \
  --eval_data_pattern="gs://going-tfx/samples/eval_data/atl_june_tfr*"  \
  --train_batch_size="256"  \
  --shuffle_buffer_size="10000"  \
  --eval_batch_size="1024"  \
  --sloppy_ordering="True"  \
  --reader_num_threads="16"  \
  --file_format="tfr"  \
  --log_step_count_steps="200"  \
  --model_dir="gs://going-tfx/samples/model"  \
  --throttle_secs="30"  \
  --learning_rate="0.001"  \
  --hypothesis="linear"  \
  --save_summary_steps="100"  \
  --max_train_steps="2000"  \
  --prefetch_buffer_size="10000"  \
  --metadata_dir="gs://going-tfx/samples/metadata"  \
  --train_data_pattern="gs://going-tfx/samples/train_data/atl_june_tfr*"  \
  --save_checkpoints_steps="2000"  \
  --base_dir="gs://going-tfx/samples"  \

INFO:tensorflow:Using config: {'_save_checkpoints_secs': None, 

Created the bash runner script named **```run_task.sh```**. 

Execute

``` 
bash run_task.sh

``` 

to perform the training from a shell.

In [20]:
type(res)

tuple

In [21]:
res

({'global_step': 2000, 'loss': 1702.7211, 'mean_error': 25.239887},
 ['gs://going-tfx/samples/model/export/exporter/1543010833'])

In [22]:
res[0]['loss']

1702.7211

In [23]:
res

({'global_step': 2000, 'loss': 1702.7211, 'mean_error': 25.239887},
 ['gs://going-tfx/samples/model/export/exporter/1543010833'])