In [1]:
from __future__ import division
from __future__ import print_function
from __future__ import absolute_import

In [2]:
# Directory of the local package that receives the functions defined in this notebook.
PACKAGE="./prep"
from tools import make_src_dumper
# write_py(func) is called after each function definition below; the cell outputs
# show it reporting "<name> written to ./prep/<name>.py.".
write_py = make_src_dumper(PACKAGE)

In [1]:
%load_ext autoreload
%autoreload 2

---
# Preprocessing data for ML with TensorFlow

#### Signature data in BigQuery
We collected the raw data that we use from various sources into a single denormalized table holding the data in so-called signature format. That table's schema is meant to reflect the structure of the data/requests that we expect to be served at prediction time. 

In [3]:
%load_ext google.cloud.bigquery

In [4]:
%%bigquery sample
select * FROM `going-tfx.examples.ATL_JUNE_SIGNATURE` limit 3

Unnamed: 0,DATE,YEAR,MONTH,DAY,DEP_DOW,AIRLINE_NAME,AIRLINE,DEP_T,DEP,DEP_LAT,...,WND_SPD_DEP,ARR_T,ARR_DELAY,ARR,ARR_LAT,ARR_LON,ARR_W,MEAN_TEMP_ARR,MEAN_VIS_ARR,WND_SPD_ARR
0,2002-06-01,2002,6,1,7,US Airways Inc.: US (Merged with America West ...,US,610,ATL,33.63,...,6.9,712,-12.0,CLT,35.21,-80.94,CHARLOTTE/DOUGLAS INTERNATION,78.3,9.5,2.7
1,2002-06-01,2002,6,1,7,Delta Air Lines Inc.: DL,DL,620,ATL,33.63,...,6.9,740,9.0,MCO,28.42,-81.3,ORLANDO INTERNATIONAL AIRPORT,77.4,9.6,5.7
2,2002-06-01,2002,6,1,7,Delta Air Lines Inc.: DL,DL,620,ATL,33.63,...,6.9,738,55.0,TPA,27.97,-82.53,TAMPA INTERNATIONAL AIRPORT,79.1,9.9,5.5


In [5]:
from train.model_config import SIGNATURE_COLUMNS
print(SIGNATURE_COLUMNS)

['DEP_DOW', 'DEP_T', 'DEP_LAT', 'DEP_LON', 'DEP_DELAY', 'MEAN_TEMP_DEP', 'MEAN_VIS_DEP', 'WND_SPD_DEP', 'ARR_LAT', 'ARR_LON', 'ARR_DELAY', 'MEAN_TEMP_ARR', 'MEAN_VIS_ARR', 'WND_SPD_ARR', 'ARR', 'AIRLINE']


In [6]:
def sample_queries(columns, fractions, rate=0.1,
                   table='going-tfx.examples.ATL_JUNE_SIGNATURE'):
    """Build the BigQuery queries that select the train/eval/test samples.

    Every row is assigned a bucket ``MOD(hash + DEP_T, total)`` where ``hash``
    is the absolute FARM_FINGERPRINT of DATE, AIRLINE and ARR. The splits are
    consecutive, non-overlapping bucket ranges whose widths are the given
    fractions, so the three queries never return the same row.

    Args:
        columns: names of the columns to select.
        fractions: relative sizes of the train, eval and test splits, in that
            order (e.g. ``[80, 10, 10]``). Each entry is truncated to an int.
        rate: overall sampling rate in ``(0, 1]``. The number of buckets is
            ``int(sum(fractions) / rate)``, so a smaller rate selects a
            smaller share of the table.
        table: fully qualified name of the table to sample from.

    Returns:
        A dict mapping ``'train'``, ``'eval'`` and ``'test'`` to the
        corresponding standard-SQL query string.

    Raises:
        ValueError: if ``rate`` is not within ``(0, 1]``.
    """
    # A rate of 0 would divide by zero, and a negative rate or one above 1
    # yields a modulus smaller than the bucket ranges, i.e. meaningless splits.
    if not 0 < rate <= 1:
        raise ValueError(
            'rate must be in the interval (0, 1], got {!r}'.format(rate))

    def sample_query(columns, total, lower, upper):
        # NOTE(review): BigQuery's ABS raises an error on INT64 overflow, so
        # ABS(FARM_FINGERPRINT(...)) may fail for the minimum INT64 hash value
        # -- confirm whether that needs guarding for this table.
        col_string=", ".join(columns)
        return """
        SELECT
            {0}
        FROM 
            `{4}` 
        where
            MOD(ABS(FARM_FINGERPRINT(
                CONCAT(DATE,AIRLINE,ARR)
            )) + DEP_T, {1}) >= {2} 
        and
            MOD(ABS(FARM_FINGERPRINT(
                CONCAT( DATE, AIRLINE, ARR)
            )) + DEP_T, {1}) < {3} 
        """.format(col_string, total, lower, upper, table)

    total = int(sum(fractions) / rate)
    queries = {}
    start = 0
    # Walk the bucket space once; each split takes the next `width` buckets.
    for phase, fraction in zip(['train', 'eval', 'test'], fractions):
        width = int(fraction)
        queries[phase] = sample_query(columns, total, start, start + width)
        start += width
    return queries
write_py(sample_queries)

'sample_queries written to ./prep/sample_queries.py.'

In [8]:
import google.datalab.bigquery as dlbq
# Build the three split queries with fractions 90/5/5 and a sampling rate of 0.01.
queries = sample_queries(SIGNATURE_COLUMNS, [90,5,5], .01)
# Execute only the training query and load its result into a DataFrame.
df = dlbq.Query(queries['train']).execute().result().to_dataframe()
print('Only {} examples. Showing first three:'.format(len(df)))
df[:3]

Only 2661 examples. Showing first three:


Unnamed: 0,DEP_DOW,DEP_T,DEP_LAT,DEP_LON,DEP_DELAY,MEAN_TEMP_DEP,MEAN_VIS_DEP,WND_SPD_DEP,ARR_LAT,ARR_LON,ARR_DELAY,MEAN_TEMP_ARR,MEAN_VIS_ARR,WND_SPD_ARR,ARR,AIRLINE
0,1,1509,33.63,-84.42,39.0,76.0,7.5,11.1,40.65,-75.44,24.0,78.5,9.8,5.5,ABE,EV
1,3,2205,33.63,-84.42,204.0,85.3,10.0,5.1,40.65,-75.44,185.0,72.3,10.0,4.0,ABE,FL
2,1,1050,33.63,-84.42,9.0,75.7,9.3,6.5,35.04,-106.6,11.0,74.1,10.0,12.4,ABQ,DL


---
#### The pre-processing function

In [9]:
def pre_process(row):
    """Turn a dict of signature columns into a dict of training features.

    Works on a shallow copy of ``row``, so the caller's dict is left intact.
    The steps are:

    * replace ``DEP_T`` by ``DEP_HOD`` (``DEP_T // 100``),
    * add ``DIFF_LAT``, ``DIFF_LON`` and ``DISTANCE`` (via ``tf_haversine``),
    * pass the weather columns, ``DEP_DELAY`` and the engineered columns
      through ``tft.scale_to_0_1``,
    * pass ``AIRLINE`` and ``ARR`` through ``tft.string_to_int``.

    All other columns are passed through unchanged.
    """
    import tensorflow_transform as tft
    from tools import tf_haversine

    scaled_columns = (
        'MEAN_TEMP_DEP', 'MEAN_VIS_DEP', 'WND_SPD_DEP',
        'MEAN_TEMP_ARR', 'MEAN_VIS_ARR', 'WND_SPD_ARR',
        'DEP_DELAY', 'DIFF_LAT', 'DIFF_LON', 'DISTANCE',
    )
    vocabulary_columns = ('AIRLINE', 'ARR')

    features = row.copy()

    # --- engineered features -------------------------------------------
    origin_lat, origin_lon = features['DEP_LAT'], features['DEP_LON']
    dest_lat, dest_lon = features['ARR_LAT'], features['ARR_LON']

    # DEP_T is dropped once the hour of day has been derived from it.
    features['DEP_HOD'] = features.pop('DEP_T') // 100
    features['DIFF_LAT'] = dest_lat - origin_lat
    features['DIFF_LON'] = dest_lon - origin_lon
    features['DISTANCE'] = tf_haversine(dest_lat, dest_lon,
                                        origin_lat, origin_lon)

    # --- scaling of the float-valued columns ---------------------------
    for name in scaled_columns:
        features[name] = tft.scale_to_0_1(features[name])

    # --- string columns to integer ids ---------------------------------
    for name in vocabulary_columns:
        features[name] = tft.string_to_int(features[name])

    return features
write_py(pre_process)

'pre_process written to ./prep/pre_process.py.'

In [10]:
from train.model_config import TRAINING_COLUMNS
print(TRAINING_COLUMNS)

['DEP_DOW', 'DEP_HOD', 'AIRLINE', 'ARR', 'DEP_LAT', 'DEP_LON', 'MEAN_TEMP_DEP', 'MEAN_VIS_DEP', 'WND_SPD_DEP', 'DEP_DELAY', 'ARR_LAT', 'ARR_LON', 'ARR_DELAY', 'MEAN_TEMP_ARR', 'MEAN_VIS_ARR', 'WND_SPD_ARR', 'DIFF_LAT', 'DIFF_LON', 'DISTANCE']


---
#### The full pipeline

In [11]:
def exec_pipeline_prod(options, train_dir, eval_dir, test_dir,
                       metadata_dir, tmp_dir,
                       fractions, sample_rate, prefix,
                       encode='tfrecord',
                       runner='DirectRunner'):
    """Run the complete preprocessing pipeline for the train/eval/test splits.

    The training sample is read from BigQuery, analyzed and transformed with
    ``pre_process``; the resulting transform function is then applied to the
    eval and test samples. Each transformed split is written below its own
    directory, and the transform function is written to ``metadata_dir``.

    Args:
        options: pipeline options handed to ``beam.Pipeline``.
        train_dir, eval_dir, test_dir: output directories of the three splits.
        metadata_dir: target of ``WriteTransformFn``.
        tmp_dir: temp directory of the tf.Transform Beam context.
        fractions: relative split sizes, forwarded to ``sample_queries``.
        sample_rate: overall sampling rate, forwarded to ``sample_queries``.
        prefix: file name prefix; files are written as ``<dir>/<prefix>_tfr*``
            and/or ``<dir>/<prefix>_csv*``.
        encode: ``'tfrecord'``, ``'csv'``, ``'both'`` or ``None``
            (``None`` behaves like ``'both'``).
        runner: name of the Beam runner.

    Raises:
        ValueError: if ``encode`` is none of the supported values; without
            this check the pipeline would run but write no data at all.
    """
    import os
    import tensorflow_transform as tft
    import tensorflow_transform.beam.impl as beam_impl
    import apache_beam as beam
    from tensorflow_transform.beam.tft_beam_io import transform_fn_io

    from train.model_config import (SIGNATURE_COLUMNS, TRAINING_METADATA,
        SIGNATURE_METADATA, ORDERED_TRAINING_COLUMNS)
    from prep.pre_process import pre_process
    from prep.sample_queries import sample_queries

    if encode not in ('tfrecord', 'csv', 'both', None):
        raise ValueError(
            "encode must be 'tfrecord', 'csv', 'both' or None, got {!r}"
            .format(encode))

    def read_signature_dataset(pipeline, query, phase):
        # Reads one split from BigQuery and pairs it with the signature metadata.
        signature_data = (pipeline
                          | ('ReadFromBigQuery_' + phase)
                          >> beam.io.Read(beam.io.BigQuerySource(
                              query=query, use_standard_sql=True)))
        return (signature_data, SIGNATURE_METADATA)

    def write_to_files(data, metadata, file_prefix, phase):
        # Writes one transformed split in the requested encoding(s). The
        # metadata is passed in explicitly rather than read from the enclosing
        # scope, so each split is encoded with its own schema. Step labels
        # carry the phase because Beam labels must be unique per pipeline.
        if encode in ['tfrecord', 'both', None]:
            tfr_encoder = tft.coders.ExampleProtoCoder(metadata.schema)
            _ = (data
                 | ('EncodeTFRecord_' + phase) >> beam.Map(tfr_encoder.encode)
                 | ('WriteTFRecord_' + phase) >> beam.io.WriteToTFRecord(
                     file_prefix + '_tfr'))

        if encode in ['csv', 'both', None]:
            csv_encoder = tft.coders.CsvCoder(
                ORDERED_TRAINING_COLUMNS, TRAINING_METADATA.schema)
            _ = (data
                 | ('EncodeCSV_' + phase) >> beam.Map(csv_encoder.encode)
                 | ('WriteText_' + phase) >> beam.io.WriteToText(
                     file_path_prefix=file_prefix + '_csv'))

    with beam.Pipeline(runner, options=options) as p:
        with beam_impl.Context(temp_dir=tmp_dir):

            queries = sample_queries(SIGNATURE_COLUMNS, fractions, sample_rate)

            # Process training data and obtain transform_fn
            #
            signature_dataset = read_signature_dataset(
                p, queries['train'], 'train')
            tds, transform_fn = (signature_dataset | "AnalyzeAndTransform"
                                 >> beam_impl.AnalyzeAndTransformDataset(pre_process))
            t_data, t_metadata = tds
            write_to_files(t_data, t_metadata,
                           os.path.join(train_dir, prefix), 'train')

            # Process evaluation and test data with the obtained transform_fn
            #
            for phase, out_dir in [('eval', eval_dir), ('test', test_dir)]:
                signature_dataset = read_signature_dataset(
                    p, queries[phase], phase)
                t_dataset = ((signature_dataset, transform_fn)
                             | ('Transform' + phase.capitalize())
                             >> beam_impl.TransformDataset())
                t_data, t_metadata = t_dataset
                write_to_files(t_data, t_metadata,
                               os.path.join(out_dir, prefix), phase)

            # Save the transform function to disk for use at serving time
            #
            _ = (transform_fn
                 | 'WriteTransformFn' >> transform_fn_io.WriteTransformFn(metadata_dir))

write_py(exec_pipeline_prod)

'exec_pipeline_prod written to ./prep/exec_pipeline_prod.py.'

In [12]:
def run_job(args):
    """Assemble the Beam pipeline options from ``args`` and run the pipeline.

    ``args`` is a dict with the keys ``stage_dir``, ``tmp_dir``, ``project``,
    ``max_workers``, ``fractions`` (comma-separated integers), ``train_dir``,
    ``eval_dir``, ``test_dir``, ``metadata_dir``, ``sample_rate``, ``prefix``,
    ``encode`` and ``runner``. The job name is ``tft-tutorial-`` followed by
    the current local time formatted as ``yymmdd-HHMMSS``.
    """
    import datetime
    import apache_beam as beam
    from prep.exec_pipeline_prod import exec_pipeline_prod

    timestamp = datetime.datetime.now().strftime('%y%m%d-%H%M%S')

    pipeline_kwargs = dict(
        staging_location=args['stage_dir'],
        temp_location=args['tmp_dir'],
        job_name='-'.join(['tft-tutorial', timestamp]),
        project=args['project'],
        max_num_workers=int(args['max_workers']),
        teardown_policy='TEARDOWN_ALWAYS',
        no_save_main_session=True,
        requirements_file='dataflow_requirements.txt',
    )
    pipeline_options = beam.pipeline.PipelineOptions(flags=[], **pipeline_kwargs)

    # '80,10,10' -> [80, 10, 10]
    split_fractions = list(map(int, args['fractions'].split(",")))

    exec_pipeline_prod(
        pipeline_options,
        args['train_dir'],
        args['eval_dir'],
        args['test_dir'],
        args['metadata_dir'],
        args['tmp_dir'],
        split_fractions,
        float(args['sample_rate']),
        args['prefix'],
        encode=args['encode'],
        runner=args['runner'],
    )
write_py(run_job)

'run_job written to ./prep/run_job.py.'

In [13]:
def cleanup(subproject):
    """Remove the outputs of earlier runs below gs://going-tfx/<subproject>.

    Deletes everything matching train_data/*, eval_data/*, test_data/*, tmp/*,
    model/* and metadata/* with ``gsutil -m rm -rf``. The ``!`` lines are
    IPython shell escapes, so this function only works inside IPython/Jupyter.

    WARNING: this is destructive; the objects are removed without confirmation.
    """
    import os
    basedir = os.path.join('gs://going-tfx/', subproject)

    for d in ['train_data/*', 'eval_data/*', 'test_data/*', 'tmp/*', 'model/*', 'metadata/*']:
        target = os.path.join(basedir, d)
        # Echo the command first so the cell output records what gets removed.
        !echo gsutil -m rm -rf $target
        # Assigning to `_` captures gsutil's output instead of displaying it.
        _ = !gsutil -m rm -rf $target

In [14]:
DATASET='samples'

In [16]:
cleanup(DATASET)

gsutil -m rm -rf gs://going-tfx/samples/train_data/*
gsutil -m rm -rf gs://going-tfx/samples/eval_data/*
gsutil -m rm -rf gs://going-tfx/samples/test_data/*
gsutil -m rm -rf gs://going-tfx/samples/tmp/*
gsutil -m rm -rf gs://going-tfx/samples/model/*
gsutil -m rm -rf gs://going-tfx/samples/metadata/*


In [17]:
from prep.prep_tools import join_paths

# Job configuration: the directory entries are relative names below base_dir
# and are handed to join_paths before the job is started.
args = {
    'base_dir': "gs://going-tfx/{}".format(DATASET),
    'train_dir': 'train_data',
    'eval_dir': 'eval_data',
    'test_dir': 'test_data',
    'metadata_dir': 'metadata',
    'stage_dir': 'staging',
    'tmp_dir': 'tmp',
    'project': 'going-tfx',
    'prefix': 'atl_june',
    'fractions': '80,10,10',
    'sample_rate': 0.1,
    'max_workers': 10,
    'runner': 'DirectRunner',
    'encode': 'both',
}

run_job(join_paths(args))

INFO:tensorflow:Assets added to graph.


INFO:tensorflow:Assets added to graph.


INFO:tensorflow:No assets to write.


INFO:tensorflow:No assets to write.


INFO:tensorflow:SavedModel written to: gs://going-tfx/samples/tmp/tftransform_tmp/71470af6999e4d9ab97a29f4cb6d3c60/saved_model.pb


INFO:tensorflow:SavedModel written to: gs://going-tfx/samples/tmp/tftransform_tmp/71470af6999e4d9ab97a29f4cb6d3c60/saved_model.pb


INFO:tensorflow:Assets added to graph.


INFO:tensorflow:Assets added to graph.


INFO:tensorflow:No assets to write.


INFO:tensorflow:No assets to write.


INFO:tensorflow:SavedModel written to: gs://going-tfx/samples/tmp/tftransform_tmp/2a6f8963ab0c46dc864262dea93710b1/saved_model.pb


INFO:tensorflow:SavedModel written to: gs://going-tfx/samples/tmp/tftransform_tmp/2a6f8963ab0c46dc864262dea93710b1/saved_model.pb


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Assets added to graph.


INFO:tensorflow:Assets added to graph.


INFO:tensorflow:Assets written to: gs://going-tfx/samples/tmp/tftransform_tmp/f410d61a5bd9452588f77c6f4aceb110/assets


INFO:tensorflow:Assets written to: gs://going-tfx/samples/tmp/tftransform_tmp/f410d61a5bd9452588f77c6f4aceb110/assets


INFO:tensorflow:SavedModel written to: gs://going-tfx/samples/tmp/tftransform_tmp/f410d61a5bd9452588f77c6f4aceb110/saved_model.pb


INFO:tensorflow:SavedModel written to: gs://going-tfx/samples/tmp/tftransform_tmp/f410d61a5bd9452588f77c6f4aceb110/saved_model.pb


value: "\n\013\n\tConst_8:0\022-vocab_compute_and_apply_vocabulary_vocabulary"



value: "\n\013\n\tConst_8:0\022-vocab_compute_and_apply_vocabulary_vocabulary"



value: "\n\014\n\nConst_13:0\022/vocab_compute_and_apply_vocabulary_1_vocabulary"



value: "\n\014\n\nConst_13:0\022/vocab_compute_and_apply_vocabulary_1_vocabulary"



INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


value: "\n\013\n\tConst_8:0\022-vocab_compute_and_apply_vocabulary_vocabulary"



value: "\n\013\n\tConst_8:0\022-vocab_compute_and_apply_vocabulary_vocabulary"



value: "\n\014\n\nConst_13:0\022/vocab_compute_and_apply_vocabulary_1_vocabulary"



value: "\n\014\n\nConst_13:0\022/vocab_compute_and_apply_vocabulary_1_vocabulary"



INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


value: "\n\013\n\tConst_8:0\022-vocab_compute_and_apply_vocabulary_vocabulary"



value: "\n\013\n\tConst_8:0\022-vocab_compute_and_apply_vocabulary_vocabulary"



value: "\n\014\n\nConst_13:0\022/vocab_compute_and_apply_vocabulary_1_vocabulary"



value: "\n\014\n\nConst_13:0\022/vocab_compute_and_apply_vocabulary_1_vocabulary"



INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


value: "\n\013\n\tConst_8:0\022-vocab_compute_and_apply_vocabulary_vocabulary"



value: "\n\013\n\tConst_8:0\022-vocab_compute_and_apply_vocabulary_vocabulary"



value: "\n\014\n\nConst_13:0\022/vocab_compute_and_apply_vocabulary_1_vocabulary"



value: "\n\014\n\nConst_13:0\022/vocab_compute_and_apply_vocabulary_1_vocabulary"



INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


In [22]:
def dataframe_from_gs(gsglob):
    import os
    import pandas as pd
    from train.model_config import ORDERED_TRAINING_COLUMNS

    a_training_file = !gsutil ls $gsglob
    a_training_file = a_training_file[0]
    TEMP_DIR='/tmp/atl_june/{}'.format(DATASET)
    !mkdir -p $TEMP_DIR
    _ = !gsutil cp $a_training_file $TEMP_DIR
    a_training_file = !ls $TEMP_DIR
    a_training_file = os.path.join(TEMP_DIR,a_training_file[0])
    res=!wc -l $a_training_file
    res=res[0].split(" ")
    print()
    print("{} records in {}".format(res[0], res[1]))
    return pd.read_csv(a_training_file, names=ORDERED_TRAINING_COLUMNS)

In [23]:
# Keep the returned frame: the next line needs `probe`, which is otherwise
# undefined on a fresh kernel. The dataset name is filled in on the Python
# side instead of relying on a shell variable named DATASET.
probe = dataframe_from_gs(
    'gs://going-tfx/{}/train_data/atl_june_csv-00000-of-*'.format(DATASET))
probe.sample(frac=1.0)[:2]


1000 records in /tmp/atl_june/samples/atl_june_csv-00000-of-00024


Unnamed: 0,AIRLINE,ARR,ARR_DELAY,ARR_LAT,ARR_LON,DEP_DELAY,DEP_DOW,DEP_HOD,DEP_LAT,DEP_LON,DIFF_LAT,DIFF_LON,DISTANCE,MEAN_TEMP_ARR,MEAN_TEMP_DEP,MEAN_VIS_ARR,MEAN_VIS_DEP,WND_SPD_ARR,WND_SPD_DEP
296,0,3,-5.0,40.69,-74.16,0.094192,4,12,33.63,-84.42,0.52887,0.899484,0.150884,0.490975,0.592437,0.489247,0.734375,0.010801,0.263566
859,0,29,37.0,42.94,-87.89,0.135008,3,20,33.63,-84.42,0.58063,0.75204,0.133809,0.49639,0.277311,0.489247,0.609375,0.007901,0.27907


---
Turn the cell below into a code cell and execute it within the notebook or, better, run the command in a terminal. With `sample_rate` set to 1.0, the job takes up to 20 minutes.

```
%%bash
export PYTHONPATH=${PYTHONPATH}:${PWD}
python -m prep.task \
    --project=going-tfx \
    --base_dir=gs://going-tfx/$DATASET/ \
    --sample_rate=1.0 \
    --prefix=atl_june \
    --encode=tfrecord \
    --runner=DataflowRunner
```