In [1]:
from __future__ import division
from __future__ import print_function
from __future__ import absolute_import

In [2]:
# Directory of the package that receives the functions defined in this notebook.
PACKAGE="./prep"
from tools import make_src_dumper
# write_py(fn) writes fn to PACKAGE/<function name>.py (see the cell outputs
# further down), which is how the pipeline code can later import it from `prep`.
write_py = make_src_dumper(PACKAGE)

---
#### Signature data in BigQuery
We collected the raw data from various sources into a single denormalized table that holds the data in so-called signature format. The table's schema is meant to mirror the structure of the data/requests that we expect to receive at prediction time. 

In [3]:
%load_ext google.cloud.bigquery

In [4]:
%%bigquery sample
select * FROM `going-tfx.examples.ATL_JUNE_SIGNATURE` limit 3

Unnamed: 0,DATE,YEAR,MONTH,DAY,DEP_DOW,AIRLINE,DEP_T,DEP,DEP_LAT,DEP_LON,...,WND_SPD_DEP,ARR_T,ARR_DELAY,ARR,ARR_LAT,ARR_LON,ARR_W,MEAN_TEMP_ARR,MEAN_VIS_ARR,WND_SPD_ARR
0,2005-06-01,2005,6,1,4,Delta Air Lines Inc.: DL,1927,ATL,33.63,-84.42,...,14.4,2113,26.0,RNO,39.49,-119.76,RENO WBO,67.2,10.0,9.8
1,2005-06-02,2005,6,2,5,Delta Air Lines Inc.: DL,1927,ATL,33.63,-84.42,...,9.7,2113,40.0,RNO,39.49,-119.76,RENO WBO,62.1,10.0,7.2
2,2005-06-03,2005,6,3,6,Delta Air Lines Inc.: DL,1927,ATL,33.63,-84.42,...,7.4,2113,13.0,RNO,39.49,-119.76,RENO WBO,58.9,10.0,4.3


In [5]:
from train.model_config import SIGNATURE_COLUMNS
print(SIGNATURE_COLUMNS)

['DEP_DOW', 'DEP_T', 'DEP_LAT', 'DEP_LON', 'DEP_DELAY', 'MEAN_TEMP_DEP', 'MEAN_VIS_DEP', 'WND_SPD_DEP', 'ARR_LAT', 'ARR_LON', 'ARR_DELAY', 'MEAN_TEMP_ARR', 'MEAN_VIS_ARR', 'WND_SPD_ARR']


In [38]:
def sample_queries(columns, fractions, rate=0.1):
    """Build the BigQuery queries that select disjoint train/eval/test samples.

    Every row is mapped to a bucket
    ``MOD(ABS(FARM_FINGERPRINT(CONCAT(DATE, AIRLINE, ARR))) + DEP_T, total)``
    and each data set selects its own contiguous, non-overlapping bucket
    range ``[lower, upper)``. ``total`` is ``sum(fractions) / rate``, so all
    ranges together cover a share of ``rate`` of the buckets.

    Args:
        columns: names of the columns to select.
        fractions: integer bucket widths, in the order train, eval, test.
        rate: share of all buckets covered by the three ranges together.

    Returns:
        A dict mapping 'train', 'eval' and 'test' to SQL query strings. As the
        names are zipped with ``fractions``, only as many keys are present as
        there are fractions.
    """

    def sample_query(columns, total, lower, upper):
        col_string=", ".join(columns)
        return """
        SELECT
            {0}
        FROM 
            `going-tfx.examples.ATL_JUNE_SIGNATURE` 
        where
            MOD(ABS(FARM_FINGERPRINT(
                CONCAT(DATE,AIRLINE,ARR)
            )) + DEP_T, {1}) >= {2} 
        and
            MOD(ABS(FARM_FINGERPRINT(
                CONCAT( DATE, AIRLINE, ARR)
            )) + DEP_T, {1}) < {3} 
        """.format(col_string, total, lower, upper)

    # The quotient is a float and a mathematically integer result can come out
    # a hair too small (7 / 0.07 evaluates to 99.99999999999999), which plain
    # truncation would turn into one bucket too few. Snap such near-integers
    # to the intended value; genuinely fractional quotients are still
    # truncated exactly as before.
    exact_total = sum(fractions) / float(rate)
    nearest_total = round(exact_total)
    if abs(exact_total - nearest_total) < 1e-9:
        total = int(nearest_total)
    else:
        total = int(exact_total)

    start = 0
    res = []
    for f in fractions:
        f_ = int(f)
        # consecutive ranges: each one starts where the previous one ended
        res.append(sample_query(columns, total, start, start + f_))
        start = start + f_
    return dict(zip(['train', 'eval', 'test'], res))
write_py(sample_queries)

'sample_queries written to ./prep/sample_queries.py.'

---
#### The pre-processing function

In [6]:
def pre_process(row):
    """Turn a row in signature format into the features used for training.

    Works on a shallow copy of ``row``: replaces DEP_T by DEP_HOD
    (``DEP_T // 100``; presumably the hour of an HHMM departure time - confirm),
    adds DIFF_LAT, DIFF_LON and DISTANCE, and finally passes a fixed list of
    float columns through ``tft.scale_to_0_1``.

    Args:
        row: dict-like mapping of signature column names to values.

    Returns:
        The transformed copy of ``row``.
    """
    import tensorflow_transform as tft
    from tools import tf_haversine

    # Columns handed to tft.scale_to_0_1, in processing order.
    scaled_columns = (
        'MEAN_TEMP_DEP', 'MEAN_VIS_DEP', 'WND_SPD_DEP',
        'MEAN_TEMP_ARR', 'MEAN_VIS_ARR', 'WND_SPD_ARR',
        'DEP_DELAY',
        'DIFF_LAT', 'DIFF_LON', 'DISTANCE',
    )

    features = row.copy()

    # --- engineered features ---
    origin_lat = features['DEP_LAT']
    origin_lon = features['DEP_LON']
    dest_lat = features['ARR_LAT']
    dest_lon = features['ARR_LON']

    features['DEP_HOD'] = features['DEP_T'] // 100
    features.pop('DEP_T')  # only DEP_HOD is kept

    features['DIFF_LAT'] = dest_lat - origin_lat
    features['DIFF_LON'] = dest_lon - origin_lon
    features['DISTANCE'] = tf_haversine(dest_lat, dest_lon, origin_lat, origin_lon)

    # --- scaling ---
    for name in scaled_columns:
        features[name] = tft.scale_to_0_1(features[name])

    return features
write_py(pre_process)

'pre_process written to ./prep/pre_process.py.'

In [7]:
from train.model_config import TRAINING_COLUMNS
print(TRAINING_COLUMNS)

['DEP_DOW', 'DEP_HOD', 'DEP_LAT', 'DEP_LON', 'MEAN_TEMP_DEP', 'MEAN_VIS_DEP', 'WND_SPD_DEP', 'DEP_DELAY', 'ARR_LAT', 'ARR_LON', 'ARR_DELAY', 'MEAN_TEMP_ARR', 'MEAN_VIS_ARR', 'WND_SPD_ARR', 'DIFF_LAT', 'DIFF_LON', 'DISTANCE']


---
#### The full pipeline

In [57]:
def exec_pipeline_prod(options, train_dir, eval_dir, test_dir,
                       metadata_dir, tmp_dir,
                       fractions, sample_rate, prefix,
                       runner='DirectRunner'):
    """Sample, transform and export the train, eval and test data sets.

    The training sample is read from BigQuery and run through
    ``AnalyzeAndTransformDataset(pre_process)``, which also yields the
    transform function. The eval and test samples are then transformed with
    that same function. All three results are written as TFRecord files and
    the transform function is written to ``metadata_dir``.

    Args:
        options: pipeline options handed to ``beam.Pipeline``.
        train_dir: output directory for the training records.
        eval_dir: output directory for the evaluation records.
        test_dir: output directory for the test records.
        metadata_dir: directory that receives the transform function.
        tmp_dir: temp directory for the tf.Transform Beam context.
        fractions: bucket widths passed on to ``sample_queries``.
        sample_rate: sampling rate passed on to ``sample_queries``.
        prefix: file name prefix of the TFRecord output files.
        runner: Beam runner to use.
    """
    import os
    import tensorflow_transform as tft
    import tensorflow_transform.beam.impl as beam_impl
    import apache_beam as beam
    from tensorflow_transform.beam.tft_beam_io import transform_fn_io

    from train.model_config import SIGNATURE_COLUMNS, SIGNATURE_METADATA
    from prep.pre_process import pre_process
    from prep.sample_queries import sample_queries

    def read_signature_dataset(pipeline, split, query):
        # Read one sample from BigQuery and pair it with the signature metadata.
        rows = (pipeline
                | 'ReadFromBigQuery_{}'.format(split)
                >> beam.io.Read(beam.io.BigQuerySource(
                    query=query, use_standard_sql=True)))
        return (rows, SIGNATURE_METADATA)

    def write_tfrecords(records, encoder, split, directory):
        # Encode transformed rows and write them below directory/prefix.
        return (records
                | 'EncodeTFRecord_{}'.format(split) >> beam.Map(encoder.encode)
                | 'WriteTFRecord_{}'.format(split)
                >> beam.io.WriteToTFRecord(os.path.join(directory, prefix)))

    with beam.Pipeline(runner, options=options) as p:
        with beam_impl.Context(temp_dir=tmp_dir):

            queries = sample_queries(SIGNATURE_COLUMNS, fractions, sample_rate)

            # Process training data and obtain transform_fn
            #
            train_dataset = read_signature_dataset(p, 'train', queries['train'])
            (t_data, t_metadata), transform_fn = (
                train_dataset
                | "AnalyzeAndTransform"
                >> beam_impl.AnalyzeAndTransformDataset(pre_process))

            # The coder is built from the transformed training schema and is
            # reused for the eval and test output.
            encoder = tft.coders.ExampleProtoCoder(t_metadata.schema)
            write_tfrecords(t_data, encoder, 'train', train_dir)

            # Process evaluation and test data with the obtained transform_fn
            #
            for split, label, directory in (('eval', 'TransformEval', eval_dir),
                                            ('test', 'TransformTest', test_dir)):
                dataset = read_signature_dataset(p, split, queries[split])
                t_data, _ = ((dataset, transform_fn)
                             | label >> beam_impl.TransformDataset())
                write_tfrecords(t_data, encoder, split, directory)

            # save transform function to disk for use at serving time
            #
            _ = (transform_fn
                 | 'WriteTransformFn'
                 >> transform_fn_io.WriteTransformFn(metadata_dir))

write_py(exec_pipeline_prod)

'exec_pipeline_prod written to ./prep/exec_pipeline_prod.py.'

In [66]:
def run_job(args):
    """Assemble the Beam pipeline options from ``args`` and run the pipeline.

    Args:
        args: mapping with the keys 'stage_dir', 'tmp_dir', 'project',
            'max_workers', 'fractions' (comma-separated integers),
            'train_dir', 'eval_dir', 'test_dir', 'metadata_dir',
            'sample_rate', 'prefix' and 'runner'.
    """
    import datetime
    import apache_beam as beam
    from prep.exec_pipeline_prod import exec_pipeline_prod

    # The timestamp suffix gives every run its own job name.
    timestamp = datetime.datetime.now().strftime('%y%m%d-%H%M%S')
    job_name = '-'.join(['tft-tutorial', timestamp])

    pipeline_options = beam.pipeline.PipelineOptions(
        flags=[],
        staging_location=args['stage_dir'],
        temp_location=args['tmp_dir'],
        job_name=job_name,
        project=args['project'],
        max_num_workers=int(args['max_workers']),
        teardown_policy='TEARDOWN_ALWAYS',
        no_save_main_session=True,
        requirements_file='dataflow_requirements.txt',
    )

    # 'fractions' arrives as text such as '90,5,5'
    bucket_widths = list(map(int, args['fractions'].split(",")))

    exec_pipeline_prod(
        pipeline_options,
        args['train_dir'],
        args['eval_dir'],
        args['test_dir'],
        args['metadata_dir'],
        args['tmp_dir'],
        bucket_widths,
        float(args['sample_rate']),
        args['prefix'],
        runner=args['runner'],
    )
write_py(run_job)

'run_job written to ./prep/run_job.py.'

In [69]:
def cleanup(subproject):
    import os
    basedir = os.path.join('gs://going-tfx/', subproject)

    for d in ['train_data/*', 'eval_data/*', 'test_data/*', 'tmp/*', 'model/*', 'metadata/*']:
        target = os.path.join(basedir, d)
        !echo gsutil -m rm -rf $target
        _ = !gsutil -m rm -rf $target

In [16]:
cleanup('sandbox')

gsutil -m rm -rf gs://going-tfx/sandbox/train_data/*
CommandException: 1 files/objects could not be removed.
gsutil -m rm -rf gs://going-tfx/sandbox/eval_data/*
CommandException: 1 files/objects could not be removed.
gsutil -m rm -rf gs://going-tfx/sandbox/test_data/*
CommandException: 1 files/objects could not be removed.
gsutil -m rm -rf gs://going-tfx/sandbox/tmp/*
CommandException: 1 files/objects could not be removed.
gsutil -m rm -rf gs://going-tfx/sandbox/model/*
CommandException: 1 files/objects could not be removed.
gsutil -m rm -rf gs://going-tfx/sandbox/metadata/*
CommandException: 1 files/objects could not be removed.


In [59]:
from prep.prep_tools import join_paths

# Settings for a local test run of the pipeline.
args={}
# Directory names are given relative to base_dir; join_paths presumably expands
# them to full paths (the log output shows gs://going-tfx/sandbox/tmp/...) -
# confirm in prep.prep_tools.
args['base_dir'] = "gs://going-tfx/sandbox"
args['train_dir'] = 'train_data'
args['eval_dir'] = 'eval_data'
args['test_dir'] = 'test_data'
args['metadata_dir'] = 'metadata'
args['stage_dir'] = 'staging'
args['tmp_dir'] = 'tmp'
args['project'] = 'going-tfx'
# file name prefix of the TFRecord output files
args['prefix'] = 'atl_june'
# comma-separated integers, split by run_job; used in the order train, eval, test
args['fractions'] = '90,5,5'
# handed to sample_queries as `rate`
args['sample_rate'] = 0.1
# becomes the 'max_num_workers' pipeline option
args['max_workers'] = 24
args['runner'] = 'DirectRunner'


run_job(join_paths(args))

INFO:tensorflow:Assets added to graph.
INFO:tensorflow:No assets to write.
INFO:tensorflow:SavedModel written to: gs://going-tfx/sandbox/tmp/tftransform_tmp/04dc720d643541f5a1370289a1d8c6b9/saved_model.pb
INFO:tensorflow:Assets added to graph.
INFO:tensorflow:No assets to write.
INFO:tensorflow:SavedModel written to: gs://going-tfx/sandbox/tmp/tftransform_tmp/749ec2301cfe4692a80f7e54a7a0c6b7/saved_model.pb


  pipeline.replace_all(_get_transform_overrides(pipeline.options))


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Assets added to graph.


INFO:tensorflow:Assets added to graph.


INFO:tensorflow:No assets to write.


INFO:tensorflow:No assets to write.


INFO:tensorflow:SavedModel written to: gs://going-tfx/sandbox/tmp/tftransform_tmp/e3b2308c4b65417c9693d3802da79af7/saved_model.pb


INFO:tensorflow:SavedModel written to: gs://going-tfx/sandbox/tmp/tftransform_tmp/e3b2308c4b65417c9693d3802da79af7/saved_model.pb


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


In [70]:
cleanup('sandbox')

gsutil -m rm -rf gs://going-tfx/sandbox/train_data/*
gsutil -m rm -rf gs://going-tfx/sandbox/eval_data/*
gsutil -m rm -rf gs://going-tfx/sandbox/test_data/*
gsutil -m rm -rf gs://going-tfx/sandbox/tmp/*
gsutil -m rm -rf gs://going-tfx/sandbox/model/*
gsutil -m rm -rf gs://going-tfx/sandbox/metadata/*


In [71]:
%%bash
export PYTHONPATH=${PYTHONPATH}:${PWD}
python -m prep.task \
    --project=going-tfx \
    --base_dir=gs://going-tfx/sandbox/ \
    --sample_rate=0.1 \
    --prefix=atl_june

  from .qhull import *
  from .lbfgsb import _minimize_lbfgsb
2018-11-17 11:04:52.976471: I tensorflow/core/platform/cpu_feature_guard.cc:141] Your CPU supports instructions that this TensorFlow binary was not compiled to use: AVX2 AVX512F FMA
  pipeline.replace_all(_get_transform_overrides(pipeline.options))
