In [1]:
from __future__ import division
from __future__ import print_function
from __future__ import absolute_import

In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
PACKAGE="./prep"
from tools import make_src_dumper
write_py = make_src_dumper(PACKAGE)

---
# Preprocessing data for ML with tensorflow_transform

### 1.  Signature data in Bigquery
We collected the raw data that we use from various sources into a single denormalized table holding the data in so-called signature format. That table's schema is meant to reflect the structure of the data/requests that we expect to be served at prediction time. 

In [4]:
%load_ext google.cloud.bigquery

In [29]:
%%bigquery num_records
select count(*) as num_records FROM `going-tfx.examples.ATL_JUNE_SIGNATURE`

Unnamed: 0,num_records
0,298329


In [30]:
%%bigquery sample
select * FROM `going-tfx.examples.ATL_JUNE_SIGNATURE` limit 3

Unnamed: 0,DATE,YEAR,MONTH,DAY,DEP_DOW,AIRLINE_NAME,AIRLINE,DEP_T,DEP,DEP_LAT,...,WND_SPD_DEP,ARR_T,ARR_DELAY,ARR,ARR_LAT,ARR_LON,ARR_W,MEAN_TEMP_ARR,MEAN_VIS_ARR,WND_SPD_ARR
0,2002-06-01,2002,6,1,7,US Airways Inc.: US (Merged with America West ...,US,610,ATL,33.63,...,6.9,712,-12.0,CLT,35.21,-80.94,CHARLOTTE DOUGLAS MUNICIPAL A,78.3,9.5,2.7
1,2002-06-01,2002,6,1,7,Delta Air Lines Inc.: DL,DL,620,ATL,33.63,...,6.9,740,9.0,MCO,28.42,-81.3,ORLANDO INTERNATIONAL AIRPORT,77.4,9.6,5.7
2,2002-06-01,2002,6,1,7,Delta Air Lines Inc.: DL,DL,620,ATL,33.63,...,6.9,738,55.0,TPA,27.97,-82.53,TAMPA INTERNATIONAL AIRPORT,79.1,9.9,5.5


#### Querying reproducible random subsets

In [31]:
def sample_queries(columns, fractions=[80,10,10], rate=0.1):
    """Reproducible random sets of flights from Atlanta in any month of June.

    Requires Bigquery table ATL_JUNE_SIGNATURE to be present. This function is meant
    to encapsulate the aspect of reproducible random sets from the rest of the code.
    
    Creates three statements (strings), each of which will produce a non-intersecting
    random subset of the data in ATL_JUNE_SIGNATURE table. Will return a map with keys
    'train', 'eval', and 'test' to hold those statements. 
    
    Args:
        columns: A string holding a comma-separated list of column names to fetch.
        fractions: an array of exactly three doubles defining the precise splits.
        rate: an additional parameter determining the overall sample rate. 0.1 means 10% is
        being split into the three given fractions.
        
    Returns:
        a dict like {'train': stmt1, 'eval': stmt2, 'test': stmt3}, where stmt1, stmt2, stmt3
        are three strings that represent valid statements against bigquery.
    """
    
    def sample_query(columns, total, lower, upper):
        col_string=", ".join(columns)
        return """
        SELECT
            {0}
        FROM 
            `going-tfx.examples.ATL_JUNE_SIGNATURE` 
        where
            MOD(ABS(FARM_FINGERPRINT(
                CONCAT(DATE,AIRLINE,ARR)
            )) + DEP_T, {1}) >= {2} 
        and
            MOD(ABS(FARM_FINGERPRINT(
                CONCAT( DATE, AIRLINE, ARR)
            )) + DEP_T, {1}) < {3} 
        """.format(col_string, total, lower, upper)
    
    start = 0
    total = int(sum(fractions) / rate)
    res = []
    for f in fractions:
        f_ = int(f) 
        q = sample_query(columns, total, start, start+f_)
        start = start + f_
        res.append(q)
    return dict(zip(['train', 'eval', 'test'], res))
write_py(sample_queries)

'sample_queries written to ./prep/sample_queries.py.'

#### Verifying ```sample_queries()```

Let's verify the function's behaviour: from the where clause of the 'train' query you can tell that it reproducibly chooses those records that come with a hash value modulo between $0$ and $90$ out of $10000$ different values. That's $90\%$ of $1\%$, just as we specified in the call to ```sample_queries```. 

All the code specific to this signature structure is kept in a single file called ```model_config.py``` located in the train package 

In [32]:
from train.model_config import SIGNATURE_COLUMNS
queries = sample_queries(SIGNATURE_COLUMNS, [90,5,5], .01)
print(queries['train'])


        SELECT
            DEP_DOW, DEP_T, DEP_LAT, DEP_LON, DEP_DELAY, MEAN_TEMP_DEP, MEAN_VIS_DEP, WND_SPD_DEP, ARR_LAT, ARR_LON, ARR_DELAY, MEAN_TEMP_ARR, MEAN_VIS_ARR, WND_SPD_ARR, ARR, AIRLINE
        FROM 
            `going-tfx.examples.ATL_JUNE_SIGNATURE` 
        where
            MOD(ABS(FARM_FINGERPRINT(
                CONCAT(DATE,AIRLINE,ARR)
            )) + DEP_T, 10000) >= 0 
        and
            MOD(ABS(FARM_FINGERPRINT(
                CONCAT( DATE, AIRLINE, ARR)
            )) + DEP_T, 10000) < 90 
        


In [33]:
num_records = num_records['num_records'].iloc[0]

In [34]:
import google.datalab.bigquery as dlbq
df = dlbq.Query(queries['train']).execute().result().to_dataframe()
print('Only {} = {}% of all examples. Showing first three:'.format(len(df), len(df)/num_records*100))
df[:3]

Only 2661 = 0.891968263226% of all examples. Showing first three:


Unnamed: 0,DEP_DOW,DEP_T,DEP_LAT,DEP_LON,DEP_DELAY,MEAN_TEMP_DEP,MEAN_VIS_DEP,WND_SPD_DEP,ARR_LAT,ARR_LON,ARR_DELAY,MEAN_TEMP_ARR,MEAN_VIS_ARR,WND_SPD_ARR,ARR,AIRLINE
0,7,1125,33.63,-84.42,13.0,78.1,9.6,6.9,36.19,-95.88,11.0,80.3,9.9,7.8,TUL,DL
1,7,1540,33.63,-84.42,5.0,78.1,9.6,6.9,40.63,-73.77,16.0,73.8,9.2,11.4,JFK,DL
2,7,1630,33.63,-84.42,31.0,78.1,9.6,6.9,30.53,-91.15,20.0,77.2,8.5,2.2,BTR,DL


---
### 2. The Pre-processing Function

In [35]:
def pre_process(row):
    import tensorflow_transform as tft
    from tools import tf_haversine

    def add_engineered(row):
        dep_lat = row['DEP_LAT']
        dep_lon = row['DEP_LON']
        arr_lat = row['ARR_LAT']
        arr_lon = row['ARR_LON']

        row['DEP_HOD'] = row['DEP_T'] // 100
        row.pop('DEP_T')  # no longer needed

        row['DIFF_LAT'] = arr_lat - dep_lat
        row['DIFF_LON'] = arr_lon - dep_lon
        row['DISTANCE'] = tf_haversine(arr_lat, arr_lon, dep_lat, dep_lon)
        return row

    def scale_floats(row):
        for c in ['MEAN_TEMP_DEP', 'MEAN_VIS_DEP', 'WND_SPD_DEP', 'MEAN_TEMP_ARR', 'MEAN_VIS_ARR', 'WND_SPD_ARR', 'DEP_DELAY',
                 'DIFF_LAT', 'DIFF_LON', 'DISTANCE']:
            row[c] = tft.scale_to_0_1(row[c])
        return row

    def categorical_from_strings(row):
        row['AIRLINE'] = tft.string_to_int(row['AIRLINE'])
        row['ARR'] = tft.string_to_int(row['ARR'])
        return row
    
    row = row.copy()
    row = add_engineered(row)
    row = scale_floats(row)
    row = categorical_from_strings(row)
    return row
write_py(pre_process)

'pre_process written to ./prep/pre_process.py.'

Like before, let's now verify the pre-processing function. We're aware that putting the basic transformations into a wrapper function does not make it easily testable. But please consider that the all basic transformations are somewhat trivial and those with the ```tft``` functions inside are by their nature not trivial to test anyway. Btw: Another interesting question is: Does this wrapping impact performance? Honestly, I don't know for sure - that may become subject to a different tutorial. 

Now here's how we test our code: We create a super-small pipeline with five records and verify the output of the analyse-and-transform lifecycle.

#### Verifying ```pre_process()```

In [37]:
first5=df.head().to_dict(orient='records')
print(first5)

[{u'WND_SPD_DEP': 6.9, u'DEP_DELAY': 13.0, u'ARR_LAT': 36.19, u'WND_SPD_ARR': 7.8, u'MEAN_VIS_DEP': 9.6, u'DEP_T': 1125, u'MEAN_TEMP_ARR': 80.3, u'DEP_LON': -84.42, u'DEP_DOW': 7, u'MEAN_VIS_ARR': 9.9, u'ARR': 'TUL', u'AIRLINE': 'DL', u'MEAN_TEMP_DEP': 78.1, u'ARR_DELAY': 11.0, u'DEP_LAT': 33.63, u'ARR_LON': -95.88}, {u'WND_SPD_DEP': 6.9, u'DEP_DELAY': 5.0, u'ARR_LAT': 40.63, u'WND_SPD_ARR': 11.4, u'MEAN_VIS_DEP': 9.6, u'DEP_T': 1540, u'MEAN_TEMP_ARR': 73.8, u'DEP_LON': -84.42, u'DEP_DOW': 7, u'MEAN_VIS_ARR': 9.2, u'ARR': 'JFK', u'AIRLINE': 'DL', u'MEAN_TEMP_DEP': 78.1, u'ARR_DELAY': 16.0, u'DEP_LAT': 33.63, u'ARR_LON': -73.77}, {u'WND_SPD_DEP': 6.9, u'DEP_DELAY': 31.0, u'ARR_LAT': 30.53, u'WND_SPD_ARR': 2.2, u'MEAN_VIS_DEP': 9.6, u'DEP_T': 1630, u'MEAN_TEMP_ARR': 77.2, u'DEP_LON': -84.42, u'DEP_DOW': 7, u'MEAN_VIS_ARR': 8.5, u'ARR': 'BTR', u'AIRLINE': 'DL', u'MEAN_TEMP_DEP': 78.1, u'ARR_DELAY': 20.0, u'DEP_LAT': 33.63, u'ARR_LON': -91.15}, {u'WND_SPD_DEP': 7.4, u'DEP_DELAY': -1.0, u'A

In [38]:
from train.model_config import (SIGNATURE_COLUMNS, TRAINING_COLUMNS,
    TRAINING_METADATA, SIGNATURE_METADATA, ORDERED_TRAINING_COLUMNS)
import tensorflow_transform.beam.impl as beam_impl
import apache_beam as beam

signature_dataset = (first5, SIGNATURE_METADATA)

with beam_impl.Context(temp_dir='/tmp'):
    tds, transform_fn = (signature_dataset | beam_impl.AnalyzeAndTransformDataset(pre_process))
    t_data, t_metadata = tds

Instructions for updating:
Use `tft.compute_and_apply_vocabulary()` instead.
INFO:tensorflow:Assets added to graph.
INFO:tensorflow:No assets to write.
INFO:tensorflow:SavedModel written to: /tmp/tftransform_tmp/9afd2251e7d34c6d8957c26a3ff4c46d/saved_model.pb
INFO:tensorflow:Assets added to graph.
INFO:tensorflow:No assets to write.
INFO:tensorflow:SavedModel written to: /tmp/tftransform_tmp/43644c9114fd47c6b2f16be54969495d/saved_model.pb
INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Assets added to graph.


INFO:tensorflow:Assets added to graph.


INFO:tensorflow:Assets written to: /tmp/tftransform_tmp/23df1c4933e046dc8f5ecc6375d51970/assets


INFO:tensorflow:Assets written to: /tmp/tftransform_tmp/23df1c4933e046dc8f5ecc6375d51970/assets


INFO:tensorflow:SavedModel written to: /tmp/tftransform_tmp/23df1c4933e046dc8f5ecc6375d51970/saved_model.pb


INFO:tensorflow:SavedModel written to: /tmp/tftransform_tmp/23df1c4933e046dc8f5ecc6375d51970/saved_model.pb


value: "\n\t\n\007Const:0\022-vocab_compute_and_apply_vocabulary_vocabulary"



value: "\n\t\n\007Const:0\022-vocab_compute_and_apply_vocabulary_vocabulary"



value: "\n\014\n\nConst_13:0\022/vocab_compute_and_apply_vocabulary_1_vocabulary"



value: "\n\014\n\nConst_13:0\022/vocab_compute_and_apply_vocabulary_1_vocabulary"



INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


value: "\n\t\n\007Const:0\022-vocab_compute_and_apply_vocabulary_vocabulary"



value: "\n\t\n\007Const:0\022-vocab_compute_and_apply_vocabulary_vocabulary"



value: "\n\014\n\nConst_13:0\022/vocab_compute_and_apply_vocabulary_1_vocabulary"



value: "\n\014\n\nConst_13:0\022/vocab_compute_and_apply_vocabulary_1_vocabulary"



INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


---
Please confirm for yourself that some of the features now scale from $0$ to $1$, and the differences in longitude and latitude ```DIFF_LAT``` and ```DIFF_LON``` have been added to the feature set.

In [39]:
print(t_data[0])

{u'ARR_LAT': 36.19, u'DISTANCE': 0.7537452, u'WND_SPD_DEP': 0.0, u'DEP_DELAY': 0.4857143, u'DEP_HOD': 11, u'WND_SPD_ARR': 0.6086957, u'MEAN_VIS_DEP': 0.0, u'DIFF_LAT': 0.64928895, u'MEAN_TEMP_ARR': 0.73863673, u'DEP_LON': -84.42, u'DEP_DOW': 7, u'MEAN_VIS_ARR': 1.0, u'ARR': 0, u'AIRLINE': 0, u'MEAN_TEMP_DEP': 0.0, u'ARR_DELAY': 11.0, u'DEP_LAT': 33.63, u'ARR_LON': -95.88, u'DIFF_LON': 0.0}


Verify that the 5 different longitudinal differences are scaled between $0$ and $1$

In [40]:
print(sorted([r['DIFF_LON'] for r in t_data]))

[0.0, 0.05291715, 0.21393016, 0.6037991, 1.0]


---
### 3. The Full Pipeline

In [41]:
def exec_pipeline_prod (options, train_dir, eval_dir, test_dir, 
                        metadata_dir, tmp_dir, 
                        fractions, sample_rate, prefix, 
                        encode='tfrecord', 
                        runner='DirectRunner'):
    
    import os
    import tensorflow_transform as tft
    import tensorflow_transform.beam.impl as beam_impl
    import apache_beam as beam
    from tensorflow_transform.tf_metadata import dataset_metadata
    from tensorflow_transform.tf_metadata import dataset_schema
    from tensorflow_transform.beam.tft_beam_io import transform_fn_io
    
    from train.model_config import (SIGNATURE_COLUMNS, TRAINING_COLUMNS,
        TRAINING_METADATA, SIGNATURE_METADATA, ORDERED_TRAINING_COLUMNS)
    from prep.pre_process import pre_process
    from prep.sample_queries import sample_queries

    with beam.Pipeline(runner, options=options) as p:
        with beam_impl.Context(temp_dir=tmp_dir):

            def write_to_files(data, prefix, phase):
                tfr_encoder = tft.coders.ExampleProtoCoder(t_metadata.schema)            
                if encode in ['tfrecord', 'both', None]:
                    _ = (data
                        | ('EncodeTFRecord_' + phase) >> beam.Map(tfr_encoder.encode)
                        | ('WriteTFRecord_' + phase) >> beam.io.WriteToTFRecord(prefix+'_tfr'))

                if encode in ['csv', 'both', None]:
                    csv_encoder = tft.coders.CsvCoder(ORDERED_TRAINING_COLUMNS, TRAINING_METADATA.schema)    
                    _ = (data 
                        | ('EncodeCSV_train' + phase) >> beam.Map(csv_encoder.encode)
                        | ('WriteText_train' + phase) >> beam.io.WriteToText(file_path_prefix=prefix+'_csv'))
        
            # Process training data and obtain transform_fn
            #
            queries = sample_queries(SIGNATURE_COLUMNS, fractions, sample_rate)

            signature_data = (p | "ReadFromBigQuery_train"  
                              >> beam.io.Read(beam.io.BigQuerySource(
                                  query=queries['train'], use_standard_sql=True)))
            signature_dataset = (signature_data, SIGNATURE_METADATA)
            
            tds, transform_fn = (signature_dataset | "AnalyzeAndTransform" 
                        >> beam_impl.AnalyzeAndTransformDataset(pre_process))
            t_data, t_metadata = tds

            train_prefix = os.path.join(train_dir, prefix)
            write_to_files(t_data, train_prefix, 'train')
            
            #  Process evaluation data with the obtained transform_fn
            #
            signature_data = (p | "ReadFromBigQuery_eval"  
                              >> beam.io.Read(beam.io.BigQuerySource(
                                  query=queries['eval'], use_standard_sql=True))) 
            signature_dataset = (signature_data, SIGNATURE_METADATA)

            t_dataset = ((signature_dataset, transform_fn) 
                         | "TransformEval" >> beam_impl.TransformDataset())
            t_data, t_metadata = t_dataset

            eval_prefix = os.path.join(eval_dir, prefix)
            write_to_files(t_data, eval_prefix, 'eval')

            #  Also process test data with the obtained transform_fn
            #
            signature_data = (p | "ReadFromBigQuery_test"  
                              >> beam.io.Read(beam.io.BigQuerySource(
                                  query=queries['test'], use_standard_sql=True)))
            signature_dataset = (signature_data, SIGNATURE_METADATA)

            t_dataset = ((signature_dataset, transform_fn) 
                         | "TransformTest" >> beam_impl.TransformDataset())
            t_data, t_metadata = t_dataset           

            test_prefix = os.path.join(test_dir, prefix)
            write_to_files(t_data, test_prefix, 'text')
            
            # save transforma function to disk for use at serving time
            #
            transform_fn | 'WriteTransformFn' >> transform_fn_io.WriteTransformFn(metadata_dir)

write_py(exec_pipeline_prod)

'exec_pipeline_prod written to ./prep/exec_pipeline_prod.py.'

In [42]:
def run_job(args):
    
    import datetime
    import apache_beam as beam
    from prep.exec_pipeline_prod import exec_pipeline_prod
    
    job_name = 'tft-tutorial' + '-' + datetime.datetime.now().strftime('%y%m%d-%H%M%S')    
    
    options = {
        'staging_location': args['stage_dir'],
        'temp_location': args['tmp_dir'],
        'job_name': job_name,
        'project': args['project'],
        'max_num_workers': int(args['max_workers']),
        'teardown_policy': 'TEARDOWN_ALWAYS',
        'no_save_main_session': True,
        'requirements_file': 'dataflow_requirements.txt'
    }    
    opts = beam.pipeline.PipelineOptions(flags=[], **options)

    fractions = [int(n) for n in args['fractions'].split(",")]

    exec_pipeline_prod (opts, args['train_dir'], args['eval_dir'],args['test_dir'],
                        args['metadata_dir'], args['tmp_dir'],
                        fractions, float(args['sample_rate']), args['prefix'],
                        encode=args['encode'], runner=args['runner'])
write_py(run_job)

'run_job written to ./prep/run_job.py.'

---
#### Verifying the production pipeline
This little helper function cleans all sub-directories in a particular sub-project directory

In [47]:
def cleanup(subproject):
    import os
    basedir = os.path.join('gs://going-tfx/', subproject)

    for d in ['train_data/*', 'eval_data/*', 'test_data/*', 'tmp/*', 'model/*', 'metadata/*']:
        target = os.path.join(basedir, d)
        !echo gsutil -m rm -rf $target
        _ = !gsutil -m rm -rf $target

In [53]:
WORK_ROOT='experimental'

In [54]:
cleanup(WORK_ROOT)

gsutil -m rm -rf gs://going-tfx/experimental/train_data/*
gsutil -m rm -rf gs://going-tfx/experimental/eval_data/*
gsutil -m rm -rf gs://going-tfx/experimental/test_data/*
gsutil -m rm -rf gs://going-tfx/experimental/tmp/*
gsutil -m rm -rf gs://going-tfx/experimental/model/*
gsutil -m rm -rf gs://going-tfx/experimental/metadata/*


Run a local job on a small subset to verify everything is working fine

In [55]:
import tensorflow as tf
from prep.prep_tools import join_paths

args={}
args['base_dir'] = "gs://going-tfx/{}".format(WORK_ROOT)
args['train_dir'] = 'train_data'
args['eval_dir'] = 'eval_data'
args['test_dir'] = 'test_data'
args['metadata_dir'] = 'metadata'
args['stage_dir'] = 'staging'
args['tmp_dir'] = 'tmp'
args['project'] = 'going-tfx'
args['prefix'] = 'atl_june'
args['fractions'] = '80,10,10'
args['sample_rate'] = 0.01
args['max_workers'] = 1
args['runner'] = 'DirectRunner'
args['encode'] = 'both'

tf.logging.set_verbosity(tf.logging.WARN)
run_job(join_paths(args))



value: "\n\013\n\tConst_8:0\022-vocab_compute_and_apply_vocabulary_vocabulary"



value: "\n\013\n\tConst_8:0\022-vocab_compute_and_apply_vocabulary_vocabulary"



value: "\n\014\n\nConst_13:0\022/vocab_compute_and_apply_vocabulary_1_vocabulary"



value: "\n\014\n\nConst_13:0\022/vocab_compute_and_apply_vocabulary_1_vocabulary"



value: "\n\013\n\tConst_8:0\022-vocab_compute_and_apply_vocabulary_vocabulary"



value: "\n\013\n\tConst_8:0\022-vocab_compute_and_apply_vocabulary_vocabulary"



value: "\n\014\n\nConst_13:0\022/vocab_compute_and_apply_vocabulary_1_vocabulary"



value: "\n\014\n\nConst_13:0\022/vocab_compute_and_apply_vocabulary_1_vocabulary"



value: "\n\013\n\tConst_8:0\022-vocab_compute_and_apply_vocabulary_vocabulary"



value: "\n\013\n\tConst_8:0\022-vocab_compute_and_apply_vocabulary_vocabulary"



value: "\n\014\n\nConst_13:0\022/vocab_compute_and_apply_vocabulary_1_vocabulary"



value: "\n\014\n\nConst_13:0\022/vocab_compute_and_apply_vocabulary_1_vocabulary"



value: "\n\013\n\tConst_8:0\022-vocab_compute_and_apply_vocabulary_vocabulary"



value: "\n\013\n\tConst_8:0\022-vocab_compute_and_apply_vocabulary_vocabulary"



value: "\n\014\n\nConst_13:0\022/vocab_compute_and_apply_vocabulary_1_vocabulary"



value: "\n\014\n\nConst_13:0\022/vocab_compute_and_apply_vocabulary_1_vocabulary"



---
A quick sanity check: Fetch one file from gs storage and have a sneak preview

In [56]:
def dataframe_from_gs(gsglob):
    import os
    import pandas as pd
    from train.model_config import ORDERED_TRAINING_COLUMNS

    a_training_file = !gsutil ls $gsglob
    a_training_file = a_training_file[0]
    TEMP_DIR='/tmp/atl_june/{}'.format(WORK_ROOT)
    !mkdir -p $TEMP_DIR
    !gsutil cp $a_training_file $TEMP_DIR
    a_training_file = !ls $TEMP_DIR
    a_training_file = os.path.join(TEMP_DIR,a_training_file[0])
    res=!wc -l $a_training_file
    res=res[0].split(" ")
    print()
    print("{} records in {}".format(res[0], res[1]))
    return pd.read_csv(a_training_file, names=ORDERED_TRAINING_COLUMNS)

In [57]:
probe = dataframe_from_gs('gs://going-tfx/%s/train_data/atl_june_csv-00000-of-*' % WORK_ROOT)
probe.sample(frac=1.0)[:2]

Copying gs://going-tfx/experimental/train_data/atl_june_csv-00000-of-00003...
/ [1 files][142.6 KiB/142.6 KiB]                                                
Operation completed over 1 objects/142.6 KiB.                                    

1000 records in /tmp/atl_june/experimental/atl_june_csv-00000-of-00003


Unnamed: 0,AIRLINE,ARR,ARR_DELAY,ARR_LAT,ARR_LON,DEP_DELAY,DEP_DOW,DEP_HOD,DEP_LAT,DEP_LON,DIFF_LAT,DIFF_LON,DISTANCE,MEAN_TEMP_ARR,MEAN_TEMP_DEP,MEAN_VIS_ARR,MEAN_VIS_DEP,WND_SPD_ARR,WND_SPD_DEP
230,1,56,53.0,35.43,-82.54,0.103896,3,7,33.63,-84.42,0.39916,0.810974,0.019242,0.380682,0.739496,0.211679,1.0,0.001901,0.286822
196,1,96,51.0,31.53,-84.19,0.135065,6,9,33.63,-84.42,0.308123,0.793222,0.015087,0.700758,0.970588,0.532847,0.625,0.002601,0.248062


---
### 4. Run Your Code in Production

In [65]:
%%writefile preprocess.py
[ -z "$WORK_ROOT" ] && echo set WORK_ROOT to a particular value of your choice and find your results in gs://going-tfx/WORK_ROOT && exit -1
export PYTHONPATH=${PYTHONPATH}:${PWD}
python -m prep.task \
    --project=going-tfx \
    --base_dir=gs://going-tfx/$WORK_ROOT/ \
    --sample_rate=1.0 \
    --prefix=atl_june \
    --encode=tfrecord \
    --runner=DataflowRunner

Overwriting preprocess.py


---
Have a look at the files that now consitute the production-ready package

In [75]:
%%bash
ls -l ./prep | grep -v pyc
echo
echo preprocess.py:
cat preprocess.py

total 48
-rw-r--r-- 1 wgiersche wgiersche 4067 Dec 27 20:42 exec_pipeline_prod.py
-rw-r--r-- 1 wgiersche wgiersche    0 Nov 24 07:51 __init__.py
-rw-r--r-- 1 wgiersche wgiersche 1114 Dec 27 20:26 pre_process.py
-rw-r--r-- 1 wgiersche wgiersche  526 Nov 24 12:33 prep_tools.py
-rw-r--r-- 1 wgiersche wgiersche 1034 Dec 27 20:47 run_job.py
-rw-r--r-- 1 wgiersche wgiersche 1871 Dec 27 20:25 sample_queries.py
-rw-r--r-- 1 wgiersche wgiersche 3233 Dec 27 18:57 task.py

preprocess.py:
[ -z "$WORK_ROOT" ] && echo set WORK_ROOT to a particular value of your choice and find your results in gs://going-tfx/WORK_ROOT && exit -1
export PYTHONPATH=${PYTHONPATH}:${PWD}
python -m prep.task \
    --project=going-tfx \
    --base_dir=gs://going-tfx/$WORK_ROOT/ \
    --sample_rate=1.0 \
    --prefix=atl_june \
    --encode=tfrecord \
    --runner=DataflowRunner

---
Now we're ready to run the entire code as a script in production. All the functions that we developed in this notebook, have already been written into python files that make a complete package ```prep``` that we can send to Google's ```DataFlowRunner``` in the cloud.
Set WORK_ROOT to your choice and execute 

```bash preprocess.py``` 

on a terminal. It's going to take up to 20 minutes, if sample_rate is 1.0. Observe the progress in Google Cloud Dataflow and after the job is finished verify that there are indeed directories containing training, evaluation and test datasets. We'll use those files in [the training notebook](./05_Training.ipynb) 