In [5]:
from __future__ import division
from __future__ import print_function
from __future__ import absolute_import

In [6]:
import datetime
import glob
import math
import os
import shutil
import tempfile

import pandas as pd
import apache_beam as beam

import tensorflow as tf
import tensorflow_transform as tft
import tensorflow_transform.beam.impl as beam_impl
from tensorflow_transform.tf_metadata import dataset_metadata
from tensorflow_transform.tf_metadata import dataset_schema
from tensorflow_transform.beam.tft_beam_io import transform_fn_io

import google.datalab.bigquery as dlbq

from tools import haversine

  from ._conv import register_converters as _register_converters
  from .. import h5g, h5i, h5o, h5r, h5t, h5l, h5p
  from .qhull import *
  from .murmurhash import murmurhash3_32
  from .lbfgsb import _minimize_lbfgsb
  from ._logistic_sigmoid import _log_logistic_sigmoid
  from .expected_mutual_info_fast import expected_mutual_information
  from .pairwise_fast import _chi2_kernel_fast, _sparse_manhattan


In [7]:
%load_ext google.cloud.bigquery

In [8]:
PROJECT='going-tfx'
BUCKET='going-tfx'
DATASET='examples'
TMPDIR='/tmp'

---
# Raw data in BigQuery
We collected the raw data from various sources into a single denormalized table.

In [9]:
%%bigquery aj_sample
select * FROM `going-tfx.examples.ATL_JUNE_RAW` limit 3

Unnamed: 0,DATE,YEAR,MONTH,DAY,DEP_DOW,AIRLINE,DEP_T,DEP,DEP_LAT,DEP_LON,...,WND_SPD_DEP,ARR_T,ARR_DELAY,ARR,ARR_LAT,ARR_LON,ARR_W,MEAN_TEMP_ARR,MEAN_VIS_ARR,WND_SPD_ARR
0,2002-06-01,2002,6,1,7,US Airways Inc.: US (Merged with America West ...,610,ATL,33.63,-84.42,...,6.9,712,-12.0,CLT,35.21,-80.94,CHARLOTTE/DOUGLAS INTERNATION,78.3,9.5,2.7
1,2002-06-01,2002,6,1,7,Delta Air Lines Inc.: DL,620,ATL,33.63,-84.42,...,6.9,749,-1.0,ORF,36.89,-76.2,NORFOLK REGIONAL ARPT,80.9,9.7,9.4
2,2002-06-01,2002,6,1,7,Delta Air Lines Inc.: DL,620,ATL,33.63,-84.42,...,6.9,740,9.0,MCO,28.42,-81.3,ORLANDO INTERNATIONAL AIRPORT,77.4,9.6,5.7


In [10]:
ALL_COLUMNS = [str(key) for key in aj_sample.keys()]
print(ALL_COLUMNS)

['DATE', 'YEAR', 'MONTH', 'DAY', 'DEP_DOW', 'AIRLINE', 'DEP_T', 'DEP', 'DEP_LAT', 'DEP_LON', 'DEP_DELAY', 'DEP_W', 'MEAN_TEMP_DEP', 'MEAN_VIS_DEP', 'WND_SPD_DEP', 'ARR_T', 'ARR_DELAY', 'ARR', 'ARR_LAT', 'ARR_LON', 'ARR_W', 'MEAN_TEMP_ARR', 'MEAN_VIS_ARR', 'WND_SPD_ARR']


In [11]:
OUTPUT_COLUMNS = ALL_COLUMNS

---
# Repeatable random subsets 

### Here's how we create repeatable random subsets for training, evaluation, and test

In [12]:
def sample_query(total, lower, upper):
    """Build the BigQuery query that selects one hash bucket range of the raw table.

    Every row is assigned the bucket
    ``MOD(ABS(FARM_FINGERPRINT(CONCAT(DATE, AIRLINE, ARR))) + DEP_T, total)``;
    the query keeps the rows whose bucket lies in ``[lower, upper)``.

    Args:
        total: modulus, i.e. the number of buckets.
        lower: inclusive lower bucket bound.
        upper: exclusive upper bucket bound.

    Returns:
        The SQL text as a string.
    """
    # Positional placeholders: {0} = total, {1} = lower, {2} = upper.
    bucket_range_sql = """
        SELECT
            *
        FROM `going-tfx.examples.ATL_JUNE_RAW` 

        where
          MOD(ABS(FARM_FINGERPRINT(
            CONCAT(DATE,AIRLINE,ARR)
          )) + DEP_T, {0}) >= {1} 
        and
          MOD(ABS(FARM_FINGERPRINT(
            CONCAT( DATE, AIRLINE, ARR)
           )) + DEP_T, {0}) < {2} 
    """
    return bucket_range_sql.format(total, lower, upper)

In [13]:
def sample_queries(fractions, rate=0.1):
    """Build one disjoint bucket-range query per split.

    The number of buckets is ``int(sum(fractions) / rate)``. Each fraction,
    truncated to an int, claims the next consecutive run of buckets, so the
    resulting ranges never overlap.

    Args:
        fractions: relative split sizes, in the order train, eval, test.
        rate: divisor applied to ``sum(fractions)`` to obtain the bucket count.

    Returns:
        Dict mapping 'train', 'eval' and 'test' to their query strings. Because
        the names are zipped with the queries, entries beyond the third
        fraction (or names without a fraction) are dropped.
    """
    total = int(sum(fractions) / rate)
    split_queries = []
    lower = 0
    for fraction in fractions:
        upper = lower + int(fraction)
        split_queries.append(sample_query(total, lower, upper))
        # The next split starts where this one ended.
        lower = upper
    return dict(zip(['train', 'eval', 'test'], split_queries))

In [14]:
queries = sample_queries([80,10,10], .1)

In [15]:
print(queries['eval'])


        SELECT
            *
        FROM `going-tfx.examples.ATL_JUNE_RAW` 

        where
          MOD(ABS(FARM_FINGERPRINT(
            CONCAT(DATE,AIRLINE,ARR)
          )) + DEP_T, 1000) >= 80 
        and
          MOD(ABS(FARM_FINGERPRINT(
            CONCAT( DATE, AIRLINE, ARR)
           )) + DEP_T, 1000) < 90 
    


#### A super-small random subset 

In [16]:
tiny_query = sample_query(10000, 0, 1)
sample = dlbq.Query(tiny_query).execute().result().to_dataframe()
print('Only {} examples. Showing first three:'.format(len(sample)))
sample[:3]

Only 40 examples. Showing first three:


Unnamed: 0,DATE,YEAR,MONTH,DAY,DEP_DOW,AIRLINE,DEP_T,DEP,DEP_LAT,DEP_LON,...,WND_SPD_DEP,ARR_T,ARR_DELAY,ARR,ARR_LAT,ARR_LON,ARR_W,MEAN_TEMP_ARR,MEAN_VIS_ARR,WND_SPD_ARR
0,2004-06-07,2004,6,7,2,Delta Air Lines Inc.: DL,842,ATL,33.63,-84.42,...,4.4,949,16.0,SAV,32.12,-81.2,SAVANNAH/HILTON HEAD INTL AIR,79.4,9.8,5.7
1,2006-06-27,2006,6,27,3,Delta Air Lines Inc.: DL,2130,ATL,33.63,-84.42,...,6.3,2354,45.0,PVD,41.72,-71.42,PROVIDENCE T F GREEN ARPT,74.7,10.0,10.1
2,2005-06-22,2005,6,22,4,America West Airlines Inc.: HP (Merged with US...,805,ATL,33.63,-84.42,...,6.1,916,-25.0,PHX,33.43,-112.01,PHOENIX SKY HARBOR INTL AIRPO,102.4,10.0,8.5


---
# Reading from BigQuery into a Beam pipeline

In [17]:
STRING_COLUMNS = ['DATE', 'AIRLINE', 'DEP', 'DEP_W', 'ARR', 'ARR_W']
INT_COLUMNS = ['YEAR', 'MONTH', 'DAY', 'DEP_DOW', 'DEP_T', 'ARR_T']
FLOAT_COLUMNS = ['DEP_LAT', 'DEP_LON', 'DEP_DELAY', 'MEAN_TEMP_DEP', 'MEAN_VIS_DEP', 'WND_SPD_DEP', 'ARR_DELAY', 'ARR_LAT', 'ARR_LON', 'MEAN_TEMP_ARR', 'MEAN_VIS_ARR', 'WND_SPD_ARR']

In [18]:
# Describe every raw column by its dtype, an empty axes list and a fixed
# column representation, then wrap the schema into the dataset metadata
# handed to the tf.Transform pipelines.
raw_data_schema = {}

for dtype, columns in [(tf.string, STRING_COLUMNS), (tf.float32, FLOAT_COLUMNS), (tf.int64, INT_COLUMNS)]:
    for col in columns:
        raw_data_schema[col] = dataset_schema.ColumnSchema(
            dtype, [], dataset_schema.FixedColumnRepresentation())

raw_data_metadata = dataset_metadata.DatasetMetadata(dataset_schema.Schema(raw_data_schema))

In [19]:
# Pipeline configuration for the small development runs below.
in_test_mode = True
OUTPUT_DIR="./out"
# Use hyphens only: Dataflow rejects job names containing underscores, and this
# keeps the name consistent with the one built in exec_pipeline_prod.
job_name = 'tft-tutorial' + '-' + datetime.datetime.now().strftime('%y%m%d-%H%M%S')

options = {
    'staging_location': os.path.join(OUTPUT_DIR, 'tmp', 'staging'),
    'temp_location': os.path.join(OUTPUT_DIR, 'tmp'),
    'job_name': job_name,
    'project': PROJECT,
    'max_num_workers': 24,
    'teardown_policy': 'TEARDOWN_ALWAYS',
    'no_save_main_session': True,
    # NOTE(review): exec_pipeline_prod uses 'dataflow_requirements.txt' -- confirm
    # that 'requirements.txt' exists before switching this cell to Dataflow.
    'requirements_file': 'requirements.txt'
}
opts = beam.pipeline.PipelineOptions(flags=[], **options)

# Run locally while developing; flip in_test_mode to execute on Dataflow.
RUNNER = 'DirectRunner' if in_test_mode else 'DataflowRunner'

---
# Developing the pre-processing function

In [20]:
tf.logging.set_verbosity(tf.logging.ERROR)

---
### This simple pipeline reads the data, transforms it, and writes the result to a CSV file

In [21]:
def exec_pipeline(query, preprocessing_fn, output_columns, out_name = 'atl_june_transformed', write_header=False):
    """Run a read -> analyze/transform -> CSV pipeline and load the result.

    Uses the notebook-level ``RUNNER``, ``opts``, ``TMPDIR`` and
    ``raw_data_metadata``.

    Args:
        query: BigQuery standard-SQL query providing the raw rows.
        preprocessing_fn: function handed to ``AnalyzeAndTransformDataset``.
        output_columns: names of the columns to encode, in CSV column order.
        out_name: file name prefix of the output shards written below TMPDIR.
        write_header: when True, every shard starts with a header line made of
            ``output_columns``.

    Returns:
        A pandas DataFrame holding the rows of all output shards, with
        ``output_columns`` as column names.

    Raises:
        IOError: if the pipeline produced no output shard.
    """
    header = ",".join(output_columns) if write_header else None
    out_prefix = os.path.join(TMPDIR, out_name)

    # WriteToText names its shards "<prefix>-SSSSS-of-NNNNN" by default. Matching
    # exactly that shape avoids picking up unrelated files that merely contain
    # out_name (which the previous `ls | grep` did).
    shard_pattern = out_prefix + '-' + '[0-9]' * 5 + '-of-' + '[0-9]' * 5

    # Shards of an earlier run with a different shard count would otherwise be
    # mixed into this run's result.
    for stale_shard in glob.glob(shard_pattern):
        os.remove(stale_shard)

    transform_temp_dir = tempfile.mkdtemp()
    try:
        # The pipeline executes when the outer `with` block exits.
        with beam.Pipeline(RUNNER, options=opts) as p:
            with beam_impl.Context(temp_dir=transform_temp_dir):

                #   Read from Big Query
                #
                raw_data = p | "ReadFromBigQuery"  >> beam.io.Read(beam.io.BigQuerySource(query=query, use_standard_sql=True))
                raw_dataset = (raw_data, raw_data_metadata)

                #   Analyze and transform by calling a single function that has all the tft.transforms in it
                #
                t_dataset, _ = (raw_dataset | beam_impl.AnalyzeAndTransformDataset(preprocessing_fn))
                t_data, t_metadata = t_dataset

                # Encode back to CSV file(s)
                #
                csv_encode = tft.coders.CsvCoder(output_columns, t_metadata.schema).encode
                _ = (t_data
                     | beam.Map(csv_encode)
                     | beam.io.WriteToText(file_path_prefix=out_prefix, header=header))
    finally:
        # The analyzer scratch directory is only needed while the pipeline runs.
        shutil.rmtree(transform_temp_dir, ignore_errors=True)

    # Return a pandas dataframe containing the result
    #
    shards = sorted(glob.glob(shard_pattern))
    if not shards:
        raise IOError('No output shards found for prefix {}'.format(out_prefix))

    if write_header:
        read_options = {}
    else:
        # Without a header line the first data row must not be read as column names.
        read_options = {'header': None, 'names': output_columns}

    # Header-less shards may be completely empty; read_csv cannot parse those.
    frames = [pd.read_csv(shard, **read_options)
              for shard in shards if os.path.getsize(shard) > 0]
    if not frames:
        return pd.DataFrame(columns=output_columns)
    return pd.concat(frames, ignore_index=True)

---
#### Step 1: Do nothing

In [22]:
def do_nothing(inputs):
    """Identity preprocessing function: hand the inputs back untouched."""
    return inputs

In [23]:
res = exec_pipeline(tiny_query, do_nothing, ALL_COLUMNS, write_header=True)
res[:3]

  pipeline.replace_all(_get_transform_overrides(pipeline.options))


Unnamed: 0,DATE,YEAR,MONTH,DAY,DEP_DOW,AIRLINE,DEP_T,DEP,DEP_LAT,DEP_LON,...,WND_SPD_DEP,ARR_T,ARR_DELAY,ARR,ARR_LAT,ARR_LON,ARR_W,MEAN_TEMP_ARR,MEAN_VIS_ARR,WND_SPD_ARR
0,2009-06-14,2009,6,14,1,Delta Air Lines Inc.: DL,1405,ATL,33.63,-84.42,...,6.5,1529,-1.0,TPA,27.97,-82.53,TAMPA INTERNATIONAL AIRPORT,82.5,10.0,4.3
1,2004-06-27,2004,6,27,1,ExpressJet Airlines Inc.: EV,1025,ATL,33.63,-84.42,...,5.0,1047,-9.0,PNS,30.47,-87.18,PENSACOLA REGIONAL AP,79.7,10.0,4.8
2,2002-06-23,2002,6,23,1,Delta Air Lines Inc.: DL,1030,ATL,33.63,-84.42,...,9.8,1147,7.0,RDU,35.87,-78.78,RALEIGH-DURHAM INTERNATIONAL,78.6,9.0,4.6


---
#### Step 2: Select only the useful columns

In [24]:
SELECTED_COLUMNS=[
    'YEAR', 'MONTH', 'DEP_DOW', 'AIRLINE', 
    'DEP_T', 'DEP', 'DEP_LAT', 'DEP_LON', 'DEP_DELAY', 
    'MEAN_TEMP_DEP', 'MEAN_VIS_DEP', 'WND_SPD_DEP', 
    'ARR_DELAY', 
    'ARR', 'ARR_LAT', 'ARR_LON', 
    'MEAN_TEMP_ARR', 'MEAN_VIS_ARR', 'WND_SPD_ARR']

In [25]:
def make_select_cols(select_columns):
    """Create a function that projects a row onto ``select_columns``.

    Args:
        select_columns: keys to keep.

    Returns:
        A one-argument function that maps a row (a dict-like object) to a new
        dict holding only the selected keys. It raises KeyError when the row
        lacks one of them.
    """
    def _select_cols(row):
        return dict((column, row[column]) for column in select_columns)
    return _select_cols

one_row = sample.to_dict(orient='records')[0]
print(make_select_cols(SELECTED_COLUMNS)(one_row))

{'WND_SPD_DEP': 4.4, 'WND_SPD_ARR': 5.7, 'MEAN_TEMP_ARR': 79.4, 'DEP_DELAY': -2.0, 'DEP': 'ATL', 'ARR_LON': -81.2, 'MEAN_VIS_DEP': 6.0, 'DEP_T': 842, 'ARR': 'SAV', 'DEP_LON': -84.42, 'DEP_DOW': 2, 'MEAN_VIS_ARR': 9.8, 'ARR_LAT': 32.12, 'AIRLINE': 'Delta Air Lines Inc.: DL', 'YEAR': 2004, 'ARR_DELAY': 16.0, 'DEP_LAT': 33.63, 'MONTH': 6, 'MEAN_TEMP_DEP': 75.2}


In [26]:
res = exec_pipeline(tiny_query, make_select_cols(SELECTED_COLUMNS), SELECTED_COLUMNS, write_header=True)
res[:3]



Unnamed: 0,YEAR,MONTH,DEP_DOW,AIRLINE,DEP_T,DEP,DEP_LAT,DEP_LON,DEP_DELAY,MEAN_TEMP_DEP,MEAN_VIS_DEP,WND_SPD_DEP,ARR_DELAY,ARR,ARR_LAT,ARR_LON,MEAN_TEMP_ARR,MEAN_VIS_ARR,WND_SPD_ARR
0,2002,6,1,Delta Air Lines Inc.: DL,1030,ATL,33.63,-84.42,-1.0,74.5,6.9,9.8,7.0,RDU,35.87,-78.78,78.6,9.0,4.6
1,2002,6,2,Delta Air Lines Inc.: DL,1425,ATL,33.63,-84.42,-1.0,74.6,10.0,6.1,-12.0,CAE,33.93,-81.11,75.7,9.9,2.9
2,2002,6,2,Delta Air Lines Inc.: DL,1150,ATL,33.63,-84.42,19.0,73.1,9.9,6.2,37.0,DFW,32.89,-97.03,75.8,9.9,3.3


---
#### Step 3: Adding combined features

In [27]:
def tf_haversine(lat1, lon1, lat2, lon2, radius=6371.0):
    """Great-circle distance between two points, using the haversine formula.

    Args:
        lat1, lon1: latitude and longitude of the first point, in degrees.
        lat2, lon2: latitude and longitude of the second point, in degrees.
        radius: radius of the sphere; the result is expressed in the same unit.
            Defaults to 6371.0, the Earth's mean radius in kilometres.

    Returns:
        ``radius`` times the central angle between the two points, computed
        with TensorFlow ops.
    """
    def radians(degrees):
        return degrees * math.pi / 180.0

    half_dlat = radians(lat2 - lat1) / 2.0
    half_dlon = radians(lon2 - lon1) / 2.0
    a = (tf.sin(half_dlat) * tf.sin(half_dlat) +
         tf.cos(radians(lat1)) * tf.cos(radians(lat2)) *
         tf.sin(half_dlon) * tf.sin(half_dlon))
    # Mathematically 0 <= a <= 1, but rounding can push it marginally outside
    # that range, which would turn one of the square roots below into NaN.
    a = tf.clip_by_value(a, 0.0, 1.0)
    c = 2.0 * tf.atan2(tf.sqrt(a), tf.sqrt(1.0 - a))
    return radius * c

In [28]:
def add_combined(row):
    """Return a copy of ``row`` extended with the derived route features.

    Adds 'DIFF_LAT' and 'DIFF_LON' (arrival minus departure coordinate) and
    'DISTANCE' (haversine distance between the two airports). The dict passed
    in is left unmodified.

    Args:
        row: dict containing 'DEP_LAT', 'DEP_LON', 'ARR_LAT' and 'ARR_LON'.

    Returns:
        A new dict with all entries of ``row`` plus the three derived ones.
    """
    # Work on a shallow copy so callers never see their input change.
    combined = dict(row)
    dep_lat = row['DEP_LAT']
    dep_lon = row['DEP_LON']
    arr_lat = row['ARR_LAT']
    arr_lon = row['ARR_LON']
    combined['DIFF_LAT'] = arr_lat - dep_lat
    combined['DIFF_LON'] = arr_lon - dep_lon
    combined['DISTANCE'] = tf_haversine(arr_lat, arr_lon, dep_lat, dep_lon)
    return combined

In [29]:
select_columns = SELECTED_COLUMNS + ['DIFF_LAT', 'DIFF_LON', 'DISTANCE']

def pre_processor(row):
    """Keep only SELECTED_COLUMNS, then append the combined features."""
    selected = make_select_cols(SELECTED_COLUMNS)(row)
    return add_combined(selected)

res = exec_pipeline(tiny_query, pre_processor, select_columns, write_header=True)
res[:3]



Unnamed: 0,YEAR,MONTH,DEP_DOW,AIRLINE,DEP_T,DEP,DEP_LAT,DEP_LON,DEP_DELAY,MEAN_TEMP_DEP,...,ARR_DELAY,ARR,ARR_LAT,ARR_LON,MEAN_TEMP_ARR,MEAN_VIS_ARR,WND_SPD_ARR,DIFF_LAT,DIFF_LON,DISTANCE
0,2004,6,5,Comair Inc.: OH,931,ATL,33.63,-84.42,0.0,73.4,...,-16.0,CAK,40.91,-81.44,67.8,9.6,8.8,7.279999,2.979996,851.21136
1,2009,6,1,Delta Air Lines Inc.: DL,1405,ATL,33.63,-84.42,2.0,75.7,...,-1.0,TPA,27.97,-82.53,82.5,10.0,4.3,-5.660002,1.889999,654.69806
2,2002,6,1,Delta Air Lines Inc.: DL,1030,ATL,33.63,-84.42,-1.0,74.5,...,7.0,RDU,35.87,-78.78,78.6,9.0,4.6,2.239998,5.639999,572.19543


---
#### Step 4: Scaling floats

In [30]:
def scale_floats(row, columns=None):
    """Return a copy of ``row`` with the given columns passed through tft.scale_to_0_1.

    The dict passed in is left unmodified.

    Args:
        row: dict of column name -> value; must contain every column to scale.
        columns: names of the columns to scale. Defaults to the weather,
            departure-delay and combined route columns used in this notebook.

    Returns:
        A new dict in which the selected columns are replaced by their scaled
        versions and all other entries are carried over unchanged.
    """
    if columns is None:
        columns = ['MEAN_TEMP_DEP', 'MEAN_VIS_DEP', 'WND_SPD_DEP',
                   'MEAN_TEMP_ARR', 'MEAN_VIS_ARR', 'WND_SPD_ARR', 'DEP_DELAY',
                   'DIFF_LAT', 'DIFF_LON', 'DISTANCE']
    # Work on a shallow copy so callers never see their input change.
    scaled = dict(row)
    for column in columns:
        scaled[column] = tft.scale_to_0_1(row[column])
    return scaled

In [31]:
select_columns = SELECTED_COLUMNS + ['DIFF_LAT', 'DIFF_LON', 'DISTANCE']

def pre_processor(row):
    """Full preprocessing: select columns, add combined features, scale floats.

    This definition replaces the earlier ``pre_processor`` of the notebook,
    which stopped after adding the combined features.
    """
    working_copy = row.copy()
    selected = make_select_cols(SELECTED_COLUMNS)(working_copy)
    with_combined = add_combined(selected)
    return scale_floats(with_combined)

In [32]:
res = exec_pipeline(tiny_query, pre_processor, select_columns, write_header=True)
res[:3]



Unnamed: 0,YEAR,MONTH,DEP_DOW,AIRLINE,DEP_T,DEP,DEP_LAT,DEP_LON,DEP_DELAY,MEAN_TEMP_DEP,...,ARR_DELAY,ARR,ARR_LAT,ARR_LON,MEAN_TEMP_ARR,MEAN_VIS_ARR,WND_SPD_ARR,DIFF_LAT,DIFF_LON,DISTANCE
0,2004,6,2,Delta Air Lines Inc.: DL,842,ATL,33.63,-84.42,0.013453,0.33125,...,16.0,SAV,32.12,-81.2,0.507495,0.96,0.222973,0.354622,0.773389,0.044136
1,2006,6,3,Delta Air Lines Inc.: DL,2130,ATL,33.63,-84.42,0.246637,0.2125,...,45.0,PVD,41.72,-71.42,0.406852,1.0,0.52027,0.892437,0.976715,0.424967
2,2005,6,4,America West Airlines Inc.: HP (Merged with US...,805,ATL,33.63,-84.42,0.013453,0.23125,...,-25.0,PHX,33.43,-112.01,1.0,1.0,0.412162,0.428011,0.132848,0.802009


---
# Create the big files
Creating the files for training and evaluation

In [33]:
!cat dataflow_requirements.txt

tensorflow-transform

In [34]:
def exec_pipeline_prod (preprocessing_fn, output_columns, fractions, sample_rate, runner='DirectRunner', write_header=False):
    """Produce the transformed training and evaluation CSV files on GCS.

    The transform is analyzed on the 'train' split only and then applied to
    both the 'train' and the 'eval' split. Output goes to ``gs://BUCKET/prod``:
    shards prefixed 'atl_june_train' and 'atl_june_eval', plus the transform
    function under 'metadata'. Uses the notebook-level ``BUCKET``, ``PROJECT``
    and ``raw_data_metadata``.

    NOTE(review): the 'test' query returned by sample_queries is not used here.

    Args:
        preprocessing_fn: function handed to ``AnalyzeAndTransformDataset``.
        output_columns: names of the columns to encode, in CSV column order.
        fractions: split sizes, forwarded to ``sample_queries``.
        sample_rate: sampling rate, forwarded to ``sample_queries``.
        runner: Beam runner name.
        write_header: when True, every shard starts with a header line made of
            ``output_columns``.
    """
    header = ",".join(output_columns) if write_header else None

    tmpdir = 'gs://{}/tmp'.format(BUCKET)
    proddir = 'gs://{}/prod'.format(BUCKET)

    def prod_path(name):
        # GCS object names always use '/', whatever os.sep is on this machine.
        return '{}/{}'.format(proddir, name)

    job_name = 'tft-tutorial' + '-' + datetime.datetime.now().strftime('%y%m%d-%H%M%S')

    options = {
        'staging_location': tmpdir,
        'temp_location': tmpdir,
        'job_name': job_name,
        'project': PROJECT,
        'max_num_workers': 24,
        'teardown_policy': 'TEARDOWN_ALWAYS',
        'no_save_main_session': True,
        'requirements_file': 'dataflow_requirements.txt'
    }
    opts = beam.pipeline.PipelineOptions(flags=[], **options)

    queries = sample_queries(fractions, sample_rate)

    with beam.Pipeline(runner, options=opts) as p:
        with beam_impl.Context(temp_dir=tmpdir):

            def read_split(split):
                # One uniquely labelled BigQuery read per split, paired with the raw metadata.
                rows = p | "ReadFromBigQuery_{}".format(split) >> beam.io.Read(
                    beam.io.BigQuerySource(query=queries[split], use_standard_sql=True))
                return (rows, raw_data_metadata)

            #   Analyze and transform the training data by calling a single
            #   function that has all the tft.transforms in it
            #
            t_dataset, transform_fn = (read_split('train')
                                       | "AnalyzeAndTransform" >> beam_impl.AnalyzeAndTransformDataset(preprocessing_fn))
            train_data, t_metadata = t_dataset

            # Train and eval share the transformed schema, so one encoder serves both.
            csv_encode = tft.coders.CsvCoder(output_columns, t_metadata.schema).encode

            # Encode training data back to CSV file(s)
            #
            _ = (train_data
                 | "EncodeTraining" >> beam.Map(csv_encode)
                 | "WriteTraining" >> beam.io.WriteToText(file_path_prefix=prod_path('atl_june_train'), header=header))

            #   Transform the eval dataset with the transform function derived above
            #
            eval_data, _ = ((read_split('eval'), transform_fn)
                            | "TransformEval" >> beam_impl.TransformDataset())

            # Encode eval data back to CSV file(s)
            #
            _ = (eval_data
                 | "EncodeEval" >> beam.Map(csv_encode)
                 | "WriteEval" >> beam.io.WriteToText(file_path_prefix=prod_path('atl_june_eval'), header=header))

            # save transformation function to disk for use at serving time
            #
            _ = (transform_fn
                 | 'WriteTransformFn' >> transform_fn_io.WriteTransformFn(prod_path('metadata')))
            

#### From an excellent blog on the subject:
[The blog](https://cloud.google.com/blog/products/ai-machine-learning/pre-processing-tensorflow-pipelines-tftransform-google-cloud)

[Github](https://github.com/Fematich/tftransform-demo)

In [None]:
# NOTE(review): illustrative snippet that was never executed here.
# `working_dir`, `EXPORTED_MODEL_DIR` and `estimator` are not defined in the
# visible part of this notebook, and `_make_serving_input_fn` is only defined
# in the following cell.
# Load the tf.Transform output from `working_dir`, build a serving input
# function from it and export the estimator as a SavedModel below `working_dir`.
tf_transform_output = tft.TFTransformOutput(working_dir)
serving_input_fn = _make_serving_input_fn(tf_transform_output)
exported_model_dir = os.path.join(working_dir, EXPORTED_MODEL_DIR)
estimator.export_savedmodel(exported_model_dir, serving_input_fn)

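The helper below should be reusable as-is: it builds the `serving_input_fn` that the export snippet above passes to `export_savedmodel`.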

In [None]:
def _make_serving_input_fn(tf_transform_output):
  """Build a serving input function that applies the transform to raw features.

  NOTE(review): `RAW_DATA_METADATA`, `LABEL_KEY` and `input_fn_utils` are not
  defined or imported in the visible part of this notebook -- they must be
  provided before this helper can run.

  Args:
    tf_transform_output: object whose `transform_raw_features` method is
      applied to the parsed raw features (the export snippet in this notebook
      passes a `tft.TFTransformOutput`).

  Returns:
    A zero-argument function returning `input_fn_utils.InputFnOps` built from
    the transformed features, no labels, and the default inputs of the parsing
    input function.
  """
  raw_feature_spec = RAW_DATA_METADATA.schema.as_feature_spec()
  # Remove the label entry from the parsing spec (presumably because the label
  # is not supplied with serving requests -- TODO confirm).
  raw_feature_spec.pop(LABEL_KEY)

  def serving_input_fn():
    # Parse raw features according to the raw feature spec ...
    raw_input_fn = input_fn_utils.build_parsing_serving_input_fn(
        raw_feature_spec)
    raw_features, _, default_inputs = raw_input_fn()
    # ... and run them through the transform before handing them on.
    transformed_features = tf_transform_output.transform_raw_features(
        raw_features)
    return input_fn_utils.InputFnOps(transformed_features, None, default_inputs)

  return serving_input_fn

#### Cleaning up before we start

In [1]:
!rm -f /tmp/*_june*
!ls /tmp

In [2]:
!gsutil ls gs://going-tfx/prod

CommandException: One or more URLs matched no objects.


In [37]:
!gsutil -m rm -rf gs://going-tfx/prod
_ = !gsutil -m rm -rf gs://going-tfx/tmp/

Removing gs://going-tfx/prod/#1541699087434906...
Removing gs://going-tfx/prod/atl_june_train-00003-of-00025#1541699111140688...
Removing gs://going-tfx/prod/atl_june_eval-00000-of-00003#1541699109975135...
Removing gs://going-tfx/prod/atl_june_eval-00001-of-00003#1541699109990940...
Removing gs://going-tfx/prod/atl_june_train-00006-of-00025#1541699111165386...
Removing gs://going-tfx/prod/atl_june_train-00000-of-00025#1541699111140030...
Removing gs://going-tfx/prod/atl_june_train-00007-of-00025#1541699111196358...
Removing gs://going-tfx/prod/atl_june_train-00002-of-00025#1541699111177539...
Removing gs://going-tfx/prod/atl_june_eval-00002-of-00003#1541699110058062...
Removing gs://going-tfx/prod/atl_june_train-00008-of-00025#1541699111236032...
Removing gs://going-tfx/prod/atl_june_train-00001-of-00025#1541699111208878...
Removing gs://going-tfx/prod/atl_june_train-00004-of-00025#1541699111158326...
Removing gs://going-tfx/prod/atl_june_train-00012-of-00025#1541699111221294...
Remov

#### This takes about two minutes. Be patient.

In [35]:
exec_pipeline_prod (preprocessing_fn=pre_processor, output_columns=select_columns, fractions=[80, 10, 10], sample_rate=0.1)



---
#### This executes in Dataflow and takes some 12 - 20 minutes
If you watch the graph in the Dataflow console, you will see that the job stays alive until the VM is shut down. The files may be available a bit earlier.

In [38]:
!gsutil -m rm -rf gs://going-tfx/prod
_ = !gsutil -m rm -rf gs://going-tfx/tmp/

CommandException: 1 files/objects could not be removed.


In [39]:
exec_pipeline_prod (preprocessing_fn=pre_processor, output_columns=select_columns, fractions=[90, 5, 5], sample_rate=1.0, runner = 'DataflowRunner')

In [40]:
!gsutil ls gs://going-tfx/prod/*

gs://going-tfx/prod/
gs://going-tfx/prod/atl_june_eval-00000-of-00001
gs://going-tfx/prod/atl_june_train-00000-of-00005
gs://going-tfx/prod/atl_june_train-00001-of-00005
gs://going-tfx/prod/atl_june_train-00002-of-00005
gs://going-tfx/prod/atl_june_train-00003-of-00005
gs://going-tfx/prod/atl_june_train-00004-of-00005

gs://going-tfx/prod/metadata/:
gs://going-tfx/prod/metadata/
gs://going-tfx/prod/metadata/transform_fn/
gs://going-tfx/prod/metadata/transformed_metadata/


---
#### Copy the files into a local directory

In [41]:
mkdir /tmp/prod

In [42]:
!gsutil -m cp gs://going-tfx/prod/atl_june_eval* /tmp/prod/
_ = !gsutil -m cp gs://going-tfx/prod/atl_june_train* /tmp/prod/

Copying gs://going-tfx/prod/atl_june_eval-00000-of-00001...
/ [1/1 files][  2.7 MiB/  2.7 MiB] 100% Done                                    
Operation completed over 1 objects/2.7 MiB.                                      


---
#### Have a look at the result

In [36]:
ALL_COLUMNS = [
    'YEAR', 'MONTH', 'DEP_DOW', 'AIRLINE', 
    'DEP_T', 'DEP', 'DEP_LAT', 'DEP_LON', 'DEP_DELAY',
    'MEAN_TEMP_DEP', 'MEAN_VIS_DEP', 'WND_SPD_DEP', 
    'ARR_DELAY', 'ARR', 'ARR_LAT', 'ARR_LON', 
    'MEAN_TEMP_ARR', 'MEAN_VIS_ARR', 'WND_SPD_ARR', 'DIFF_LAT', 'DIFF_LON', 'DISTANCE']

In [38]:
a_training_file = !ls /tmp/prod/atl_june_train-00001-of-*
a_training_file = a_training_file[0]
!wc -l $a_training_file

1000 /tmp/prod/atl_june_train-00001-of-00025


In [41]:
t1 = pd.read_csv(a_training_file, names=select_columns)

In [42]:
t1.describe()

Unnamed: 0,YEAR,MONTH,DEP_DOW,DEP_T,DEP_LAT,DEP_LON,DEP_DELAY,MEAN_TEMP_DEP,MEAN_VIS_DEP,WND_SPD_DEP,ARR_DELAY,ARR_LAT,ARR_LON,MEAN_TEMP_ARR,MEAN_VIS_ARR,WND_SPD_ARR,DIFF_LAT,DIFF_LON,DISTANCE
count,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0
mean,2006.87,6.0,3.945,1443.381,33.63,-84.42,0.158073,0.633105,0.849453,0.352209,12.648,37.68447,-81.86891,0.486703,0.423871,0.005624,0.45973,0.8167,0.063288
std,2.940024,0.0,1.973796,470.859704,0.0,0.0,0.081216,0.186673,0.182565,0.141484,36.774889,2.941718,1.352395,0.118834,0.073301,0.002173,0.067672,0.014523,0.03032
min,2002.0,6.0,1.0,530.0,33.63,-84.42,0.099274,0.0,0.0,0.0,-29.0,32.89,-84.66,0.070652,0.129032,0.0013,0.349436,0.786727,0.033353
25%,2004.0,6.0,2.0,1015.0,33.63,-84.42,0.121065,0.491596,0.765625,0.255814,-7.0,35.21,-82.89,0.405797,0.38172,0.004,0.402807,0.805735,0.033353
50%,2007.0,6.0,4.0,1446.5,33.63,-84.42,0.128329,0.630252,0.921875,0.348837,1.0,39.04,-81.85,0.505435,0.451613,0.005201,0.490913,0.816903,0.066784
75%,2010.0,6.0,6.0,1805.0,33.63,-84.42,0.154964,0.769958,0.984375,0.434109,17.0,39.99,-80.94,0.574275,0.483871,0.007001,0.512768,0.826675,0.083416
max,2011.0,6.0,7.0,2340.0,33.63,-84.42,0.813559,1.0,1.0,1.0,273.0,41.4,-80.04,0.760869,0.489247,0.014001,0.545204,0.83634,0.107747
