In [12]:
from __future__ import division
from __future__ import print_function
from __future__ import absolute_import

In [13]:
import datetime
import os
import tempfile
import math
import shutil

import apache_beam as beam

import tensorflow as tf
import tensorflow_transform as tft
import tensorflow_transform.beam.impl as beam_impl
from tensorflow_transform.tf_metadata import dataset_metadata
from tensorflow_transform.tf_metadata import dataset_schema
from tensorflow_transform.beam.tft_beam_io import transform_fn_io

import pandas as pd

import google.datalab.bigquery as dlbq

from tools import tf_haversine

In [14]:
%load_ext google.cloud.bigquery

The google.cloud.bigquery extension is already loaded. To reload it, use:
  %reload_ext google.cloud.bigquery


In [15]:
PROJECT='going-tfx'
BUCKET='going-tfx'

from configuration_2 import PROJECT
from configuration_2 import BUCKET
from configuration_2 import DATASET

from configuration_2 import SIGNATURE_METADATA
from configuration_2 import SIGNATURE_SCHEMA
from configuration_2 import SIGNATURE_COLUMNS
from configuration_2 import TRAINING_COLUMNS
from configuration_2 import ORDERED_TRAINING_COLUMNS

from configuration_2 import directories

---
We distinguish between different stages, such as 'sample' and 'full', so that their files never get mixed up. Each stage gets its own set of directories.
For example:

- The first dictionary groups the directories containing *sample* stage data on Google cloud storage
- The second dictionary groups the directories containing *full* stage data on some local directory

directories('gs', 'sample'), directories('local', 'full')

---
# Signature data in BigQuery
We collected the raw data from various sources into a single denormalized table that holds the data in so-called signature format. That table's schema is meant to reflect the structure of the data/requests that we expect to be served at prediction time.

In [16]:
%%bigquery aj_sample
select * FROM `going-tfx.examples.ATL_JUNE_SIGNATURE` limit 3

Unnamed: 0,DATE,YEAR,MONTH,DAY,DEP_DOW,AIRLINE,DEP_T,DEP,DEP_LAT,DEP_LON,...,WND_SPD_DEP,ARR_T,ARR_DELAY,ARR,ARR_LAT,ARR_LON,ARR_W,MEAN_TEMP_ARR,MEAN_VIS_ARR,WND_SPD_ARR
0,2005-06-01,2005,6,1,4,Delta Air Lines Inc.: DL,1927,ATL,33.63,-84.42,...,14.4,2113,26.0,RNO,39.49,-119.76,RENO WBO,67.2,10.0,9.8
1,2005-06-02,2005,6,2,5,Delta Air Lines Inc.: DL,1927,ATL,33.63,-84.42,...,9.7,2113,40.0,RNO,39.49,-119.76,RENO WBO,62.1,10.0,7.2
2,2005-06-03,2005,6,3,6,Delta Air Lines Inc.: DL,1927,ATL,33.63,-84.42,...,7.4,2113,13.0,RNO,39.49,-119.76,RENO WBO,58.9,10.0,4.3


---
**Signature** columns are the columns that we expect to be provided at prediction time. All of them are available in the ```ATL_JUNE_SIGNATURE``` table in BigQuery.

In [17]:
from train.model_config import SIGNATURE_COLUMNS
print(SIGNATURE_COLUMNS)

['DEP_DOW', 'DEP_T', 'DEP_LAT', 'DEP_LON', 'DEP_DELAY', 'MEAN_TEMP_DEP', 'MEAN_VIS_DEP', 'WND_SPD_DEP', 'ARR_LAT', 'ARR_LON', 'ARR_DELAY', 'MEAN_TEMP_ARR', 'MEAN_VIS_ARR', 'WND_SPD_ARR']


---
These are the feature-engineered columns that we'll create and save to files for training (the pipelines in this notebook write them as CSV). Note that some columns will be dropped (e.g. ```DEP_T```) and others added (e.g. ```DISTANCE```).

In [18]:
from train.model_config import TRAINING_COLUMNS
print(TRAINING_COLUMNS)

['DEP_DOW', 'DEP_HOD', 'DEP_LAT', 'DEP_LON', 'MEAN_TEMP_DEP', 'MEAN_VIS_DEP', 'WND_SPD_DEP', 'DEP_DELAY', 'ARR_LAT', 'ARR_LON', 'ARR_DELAY', 'MEAN_TEMP_ARR', 'MEAN_VIS_ARR', 'WND_SPD_ARR', 'DIFF_LAT', 'DIFF_LON', 'DISTANCE']


---
# Repeatable random subsets 

In [19]:
def sample_query(columns, total, lower, upper,
                 table='going-tfx.examples.ATL_JUNE_SIGNATURE'):
    """Build a BigQuery standard-SQL query that selects a repeatable subset of rows.

    Every row is assigned the bucket
    ``MOD(ABS(FARM_FINGERPRINT(CONCAT(DATE, AIRLINE, ARR))) + DEP_T, total)``.
    The bucket depends only on the row's own column values, so running the
    same query again selects the same rows. The query keeps the rows whose
    bucket lies in the half-open range ``[lower, upper)``.

    Args:
        columns: iterable of column names to put into the SELECT list.
        total: number of buckets (the modulus).
        lower: first bucket to include.
        upper: first bucket to exclude.
        table: fully qualified table to read from. Defaults to the
            ATL_JUNE_SIGNATURE table used throughout this notebook. The table
            must provide the DATE, AIRLINE, ARR and DEP_T columns that the
            bucket expression refers to.

    Returns:
        The query text as a string.
    """
    # NOTE(review): ABS() cannot represent the negation of the smallest INT64,
    # so a row whose fingerprint is exactly that value would make the query
    # fail. Left unchanged because rewriting the expression would change which
    # rows land in which bucket - confirm whether this needs hardening.
    select_list = ", ".join(columns)
    return """
    SELECT
        {0}
    FROM 
        `{4}` 
    where
        MOD(ABS(FARM_FINGERPRINT(
            CONCAT(DATE,AIRLINE,ARR)
        )) + DEP_T, {1}) >= {2} 
    and
        MOD(ABS(FARM_FINGERPRINT(
            CONCAT( DATE, AIRLINE, ARR)
        )) + DEP_T, {1}) < {3} 
    """.format(select_list, total, lower, upper, table)

In [20]:
def sample_queries(columns, fractions, rate=0.1):
    """Build the sampling queries for the train / eval / test splits.

    The bucket space has ``int(sum(fractions) / rate)`` buckets. Each entry of
    ``fractions`` (truncated to an int) is the number of consecutive buckets
    given to one split, starting at bucket 0, so the splits never overlap.

    Args:
        columns: column names handed on to ``sample_query``.
        fractions: bucket counts, matched in order to 'train', 'eval', 'test'.
        rate: overall sampling rate; smaller values enlarge the bucket space.

    Returns:
        dict mapping split name to its query string. Only as many splits as
        there are entries in ``fractions`` (at most three) are present.
    """
    n_buckets = int(sum(fractions) / rate)
    widths = [int(fraction) for fraction in fractions]

    split_queries = {}
    lower = 0
    for split, width in zip(['train', 'eval', 'test'], widths):
        upper = lower + width
        split_queries[split] = sample_query(columns, n_buckets, lower, upper)
        lower = upper
    return split_queries

In [21]:
queries = sample_queries(SIGNATURE_COLUMNS, [80,10,10], .1)

In [22]:
print(queries['eval'])


    SELECT
        DEP_DOW, DEP_T, DEP_LAT, DEP_LON, DEP_DELAY, MEAN_TEMP_DEP, MEAN_VIS_DEP, WND_SPD_DEP, ARR_LAT, ARR_LON, ARR_DELAY, MEAN_TEMP_ARR, MEAN_VIS_ARR, WND_SPD_ARR
    FROM 
        `going-tfx.examples.ATL_JUNE_SIGNATURE` 
    where
        MOD(ABS(FARM_FINGERPRINT(
            CONCAT(DATE,AIRLINE,ARR)
        )) + DEP_T, 1000) >= 80 
    and
        MOD(ABS(FARM_FINGERPRINT(
            CONCAT( DATE, AIRLINE, ARR)
        )) + DEP_T, 1000) < 90 
    


#### A super-small random subset 

In [23]:
tiny_query = sample_query(SIGNATURE_COLUMNS, 10000, 0, 1)
sample = dlbq.Query(tiny_query).execute().result().to_dataframe()
print('Only {} examples. Showing first three:'.format(len(sample)))
sample[:3]

Only 40 examples. Showing first three:


Unnamed: 0,DEP_DOW,DEP_T,DEP_LAT,DEP_LON,DEP_DELAY,MEAN_TEMP_DEP,MEAN_VIS_DEP,WND_SPD_DEP,ARR_LAT,ARR_LON,ARR_DELAY,MEAN_TEMP_ARR,MEAN_VIS_ARR,WND_SPD_ARR
0,1,1025,33.63,-84.42,-3.0,74.0,7.6,5.0,30.47,-87.18,-9.0,79.7,10.0,4.8
1,1,1608,33.63,-84.42,16.0,76.7,9.5,7.7,35.81,-83.99,14.0,74.2,9.9,9.8
2,1,1405,33.63,-84.42,2.0,75.7,9.3,6.5,27.97,-82.53,-1.0,82.5,10.0,4.3


---
# Reading from Bigquery into a beam pipeline

In [24]:
# Pipeline options shared by the exploratory exec_pipeline() runs below.
# Staging and temp files go to local directories underneath OUTPUT_DIR.
OUTPUT_DIR="./out"

# Use a hyphen, not an underscore: Dataflow only accepts job names made of
# lower-case letters, digits and hyphens, and exec_pipeline_prod() names its
# jobs 'tft-tutorial-...' as well.
job_name = 'tft-tutorial' + '-' + datetime.datetime.now().strftime('%y%m%d-%H%M%S')    

options = {
    'staging_location': os.path.join(OUTPUT_DIR, 'tmp', 'staging'),
    'temp_location': os.path.join(OUTPUT_DIR, 'tmp'),
    'job_name': job_name,
    'project': PROJECT,
    'max_num_workers': 24,
    'teardown_policy': 'TEARDOWN_ALWAYS',
    'no_save_main_session': True,
    'requirements_file': 'requirements.txt'
}
opts = beam.pipeline.PipelineOptions(flags=[], **options)

# Only show TensorFlow errors; keeps the notebook output readable.
tf.logging.set_verbosity(tf.logging.ERROR)

---
This simple pipeline reads the query result, transforms it and writes the result as CSV. It returns a pair consisting of a pandas dataframe containing the data and the transformed schema.

In [25]:
LOCAL_TMPDIR='/tmp/'

In [26]:
def exec_pipeline(query, preprocessing_fn, output_columns, out_name = 'atl_june_transformed', write_header=False, runner='DirectRunner'):
    """Run a small read -> analyze/transform -> CSV pipeline and load its output.

    Relies on the notebook globals ``opts`` (pipeline options),
    ``LOCAL_TMPDIR`` (output directory) and ``SIGNATURE_METADATA`` (metadata
    paired with the rows read from BigQuery).

    Args:
        query: BigQuery standard-SQL query providing the input rows.
        preprocessing_fn: function handed to ``AnalyzeAndTransformDataset``.
        output_columns: column names, in the order they are written to CSV.
        out_name: file name prefix of the CSV shards inside ``LOCAL_TMPDIR``.
        write_header: whether each CSV shard starts with a header line.
        runner: name of the Beam runner to use.

    Returns:
        A pair ``(dataframe, schema)``: a pandas DataFrame with the rows of
        all output shards, its columns named after ``output_columns``, and the
        schema of the transformed data.

    Raises:
        IOError: if the pipeline produced no output shard.
    """
    import glob  # local import: only this helper needs it

    header=",".join(output_columns) if write_header else None

    out_prefix = os.path.join(LOCAL_TMPDIR, out_name)
    # WriteToText names its shards '<prefix>-SSSSS-of-NNNNN'.
    shard_pattern = out_prefix + '-*-of-*'

    # Drop shards left behind by an earlier run with the same prefix; they
    # would otherwise be read back together with (or instead of) the new ones.
    for stale_shard in glob.glob(shard_pattern):
        os.remove(stale_shard)

    # Scratch directory for tf.Transform; removed again once the pipeline has
    # run, so repeated calls do not pile up temp directories.
    transform_tmp = tempfile.mkdtemp()
    try:
        with beam.Pipeline(runner, options=opts) as p:
            with beam_impl.Context(temp_dir=transform_tmp):

                #   Read from Big Query
                #
                sig_data = p | "ReadFromBigQuery"  >> beam.io.Read(beam.io.BigQuerySource(query=query, use_standard_sql=True)) 
                sig_dataset = (sig_data, SIGNATURE_METADATA)

                #   Analyze and transform by calling a single function that has all the tft.transforms in it
                #
                t_dataset, t_fn = (sig_dataset | beam_impl.AnalyzeAndTransformDataset(preprocessing_fn))
                t_data, t_metadata = t_dataset

                # Encode back to CSV file(s)
                #
                csv_encode = tft.coders.CsvCoder(output_columns, t_metadata.schema).encode    
                res = (t_data 
                       | beam.Map(csv_encode)
                       | beam.io.WriteToText(file_path_prefix=out_prefix, header=header))
    finally:
        shutil.rmtree(transform_tmp, ignore_errors=True)

    # Return a pandas dataframe containing the result
    #
    shards = sorted(glob.glob(shard_pattern))
    if not shards:
        raise IOError("No output files found for prefix {}".format(out_prefix))

    # Name the columns explicitly. Without a header line pandas would
    # otherwise use the first data row as column names. Shards of size zero
    # are skipped because read_csv cannot parse an empty file.
    frames = [pd.read_csv(shard, names=output_columns,
                          header=0 if write_header else None)
              for shard in shards if os.path.getsize(shard) > 0]
    if frames:
        result = pd.concat(frames, ignore_index=True)
    else:
        result = pd.DataFrame(columns=output_columns)
    return result, t_metadata.schema

print("Output will be written to {}".format(LOCAL_TMPDIR))

Output will be written to /tmp/


---
# Developing the pre-processing function

#### Step 1: Do nothing
Here, we simply verify that the pipeline setup is correct.

In [14]:
def do_nothing(inputs):
    """Identity preprocessing function: return ``inputs`` unchanged.

    Handed to ``exec_pipeline`` to verify the pipeline set-up before any real
    transformation is added.
    """
    return inputs

In [15]:
res, _ = exec_pipeline(tiny_query, do_nothing, SIGNATURE_COLUMNS, write_header=True, runner='DirectRunner')
res[:3]

NameError: name 'tiny_query' is not defined

---
#### Step 2: Adding engineered features
We use the well-known haversine function (defined in ```tools.py```) to calculate the distance between two lat/lon coordinate pairs

In [20]:
!grep -A 13 tf_haversine tools.py

def tf_haversine(lat1, lon1, lat2, lon2):
    
    def radians(a):
        return a * math.pi / 180.0

    radius = 6371.0
    dlat = radians (lat2 - lat1) 
    dlon = radians (lon2 - lon1)
    a = (tf.sin(dlat / 2.0) * tf.sin(dlat/2.0) +
         tf.cos(radians(lat1)) * tf.cos(radians(lat2)) *
         tf.sin(dlon / 2.0) * tf.sin(dlon / 2.0))
    c = 2.0 * tf.atan2(tf.sqrt(a), tf.sqrt(1.0 - a))
    return radius * c


In [21]:
def add_engineered(row):
    """Return a copy of ``row`` with the engineered feature columns added.

    Replaces ``DEP_T`` by ``DEP_HOD`` (``DEP_T // 100``) and adds ``DIFF_LAT``,
    ``DIFF_LON`` and ``DISTANCE`` (``tf_haversine`` between the arrival and
    departure coordinates).

    Args:
        row: dict holding at least the keys DEP_LAT, DEP_LON, ARR_LAT,
            ARR_LON and DEP_T. It is not modified.

    Returns:
        A new dict with the remaining and the added columns.

    Raises:
        KeyError: if one of the required keys is missing.
    """
    # Work on a shallow copy so the caller's dict keeps DEP_T and does not
    # silently gain the new keys.
    engineered = dict(row)

    dep_lat = engineered['DEP_LAT']
    dep_lon = engineered['DEP_LON']
    arr_lat = engineered['ARR_LAT']
    arr_lon = engineered['ARR_LON']

    # DEP_T is no longer needed once DEP_HOD has been derived from it.
    engineered['DEP_HOD'] = engineered.pop('DEP_T') // 100

    engineered['DIFF_LAT'] = arr_lat - dep_lat
    engineered['DIFF_LON'] = arr_lon - dep_lon
    engineered['DISTANCE'] = tf_haversine(arr_lat, arr_lon, dep_lat, dep_lon)
    return engineered

In [22]:
def pre_processor(row):
    """Preprocessing function that only adds the engineered features.

    ``row`` is copied first, so the dict handed in by the pipeline is never
    modified, no matter what ``add_engineered`` does to its argument.
    """
    return add_engineered(row.copy())

res, _ = exec_pipeline(tiny_query, pre_processor, TRAINING_COLUMNS, write_header=True)
res[:3]



Unnamed: 0,DEP_DOW,DEP_HOD,DEP_LAT,DEP_LON,MEAN_TEMP_DEP,MEAN_VIS_DEP,WND_SPD_DEP,DEP_DELAY,ARR_LAT,ARR_LON,ARR_DELAY,MEAN_TEMP_ARR,MEAN_VIS_ARR,WND_SPD_ARR,DIFF_LAT,DIFF_LON,DISTANCE
0,1,14,33.63,-84.42,75.7,9.3,6.5,2.0,27.97,-82.53,-1.0,82.5,10.0,4.3,-5.660002,1.889999,654.69806
1,1,10,33.63,-84.42,74.0,7.6,5.0,-3.0,30.47,-87.18,-9.0,79.7,10.0,4.8,-3.160002,-2.760002,437.13605
2,1,10,33.63,-84.42,74.5,6.9,9.8,-1.0,35.87,-78.78,7.0,78.6,9.0,4.6,2.239998,5.639999,572.19543


---
#### Step 3: Scaling floats

In [23]:
def scale_floats(row, columns=None):
    """Return a copy of ``row`` with the given columns passed through ``tft.scale_to_0_1``.

    Args:
        row: dict of columns; it is not modified.
        columns: names of the columns to scale. Defaults to the weather,
            departure-delay and engineered distance columns used in this
            notebook.

    Returns:
        A new dict in which the listed columns are replaced by their scaled
        version and all other columns are carried over unchanged.

    Raises:
        KeyError: if a listed column is missing from ``row``.
    """
    if columns is None:
        columns = ['MEAN_TEMP_DEP', 'MEAN_VIS_DEP', 'WND_SPD_DEP', 'MEAN_TEMP_ARR', 'MEAN_VIS_ARR', 'WND_SPD_ARR', 'DEP_DELAY',
                   'DIFF_LAT', 'DIFF_LON', 'DISTANCE']

    # Shallow copy: the caller's dict keeps its unscaled values.
    scaled = dict(row)
    for c in columns:
        scaled[c] = tft.scale_to_0_1(row[c])
    return scaled

In [24]:
def pre_processor(row):
    """Full preprocessing function: engineered features first, then scaling.

    Operates on a copy of ``row``; the copy is passed through
    ``add_engineered`` and the result through ``scale_floats``.
    """
    return scale_floats(add_engineered(row.copy()))

In [25]:
res, _ = exec_pipeline(tiny_query, pre_processor, TRAINING_COLUMNS, write_header=True)
res[:3]



Unnamed: 0,DEP_DOW,DEP_HOD,DEP_LAT,DEP_LON,MEAN_TEMP_DEP,MEAN_VIS_DEP,WND_SPD_DEP,DEP_DELAY,ARR_LAT,ARR_LON,ARR_DELAY,MEAN_TEMP_ARR,MEAN_VIS_ARR,WND_SPD_ARR,DIFF_LAT,DIFF_LON,DISTANCE
0,1,10,33.63,-84.42,0.2875,0.225,0.75,0.017937,35.87,-78.78,7.0,0.490364,0.8,0.148649,0.564706,0.823701,0.122422
1,1,16,33.63,-84.42,0.425,0.875,0.4875,0.09417,35.81,-83.99,14.0,0.396146,0.98,0.5,0.561345,0.715385,0.01018
2,1,14,33.63,-84.42,0.3625,0.825,0.3375,0.03139,27.97,-82.53,-1.0,0.573876,1.0,0.128378,0.122129,0.745738,0.150773


---
# Create the big files

We tell Dataflow which packages our pipeline requires:

In [26]:
!cat dataflow_requirements.txt

tensorflow-transform

In [27]:
def exec_pipeline_prod (environment, stage, 
                        preprocessing_fn, output_columns, 
                        fractions, sample_rate, prefix,
                        runner='DirectRunner', write_header=False):
    """Write transformed train and eval CSV files plus the transform function.

    The training split is read from BigQuery, analyzed and transformed with
    ``preprocessing_fn`` and written as CSV. The eval split is transformed
    with the transform function derived from the training split and written
    as CSV as well. Finally the transform function itself is written out.
    The query for the 'test' split is built but not used here.

    Args:
        environment, stage: passed to ``directories`` to look up the 'tmp',
            'data' and 'metadata' locations.
        preprocessing_fn: function handed to ``AnalyzeAndTransformDataset``.
        output_columns: column names, in the order they are written to CSV.
        fractions, sample_rate: passed to ``sample_queries``.
        prefix: file name prefix; '_train' / '_eval' is appended per split.
        runner: name of the Beam runner to use.
        write_header: whether the CSV files start with a header line.
    """
    csv_header = None
    if write_header:
        csv_header = ",".join(output_columns)

    stage_dirs = directories(environment, stage)
    scratch_dir = stage_dirs['tmp']
    data_dir = stage_dirs['data']
    transform_fn_dir = stage_dirs['metadata']

    timestamp = datetime.datetime.now().strftime('%y%m%d-%H%M%S')
    pipeline_options = beam.pipeline.PipelineOptions(
        flags=[],
        staging_location=scratch_dir,
        temp_location=scratch_dir,
        job_name='tft-tutorial' + '-' + timestamp,
        project=PROJECT,
        max_num_workers=24,
        teardown_policy='TEARDOWN_ALWAYS',
        no_save_main_session=True,
        requirements_file='dataflow_requirements.txt')

    def read_split(pipeline, label, split_query):
        # Read one split from BigQuery and pair the rows with the signature metadata.
        rows = pipeline | label >> beam.io.Read(
            beam.io.BigQuerySource(query=split_query, use_standard_sql=True))
        return (rows, SIGNATURE_METADATA)

    def write_csv(rows, schema, split_suffix, encode_label, write_label):
        # Encode the transformed rows as CSV lines and write them below data_dir.
        split_prefix = os.path.join(data_dir, prefix + split_suffix)
        encode = tft.coders.CsvCoder(output_columns, schema).encode
        return (rows
                | encode_label >> beam.Map(encode)
                | write_label >> beam.io.WriteToText(file_path_prefix=split_prefix,
                                                     header=csv_header))

    with beam.Pipeline(runner, options=pipeline_options) as pipeline:
        with beam_impl.Context(temp_dir=scratch_dir):

            split_queries = sample_queries(SIGNATURE_COLUMNS, fractions, sample_rate)

            # Training split: analyze and transform in one go; this also
            # yields the transform function that is reused for the eval split.
            train_raw = read_split(pipeline, "ReadFromBigQuery_train", split_queries['train'])
            (train_rows, train_metadata), transform_fn = (
                train_raw
                | "AnalyzeAndTransform" >> beam_impl.AnalyzeAndTransformDataset(preprocessing_fn))
            write_csv(train_rows, train_metadata.schema, "_train",
                      "EncodeTraining", "WriteTraining")

            # Eval split: apply the transform function derived above. The CSV
            # encoder uses the schema obtained from the training split.
            eval_raw = read_split(pipeline, "ReadFromBigQuery_eval", split_queries['eval'])
            eval_rows, _ = ((eval_raw, transform_fn)
                            | "TransformEval" >> beam_impl.TransformDataset())
            write_csv(eval_rows, train_metadata.schema, "_eval",
                      "EncodeEval", "WriteEval")

            # save transformation function to disk for use at serving time
            transform_fn | 'WriteTransformFn' >> transform_fn_io.WriteTransformFn(transform_fn_dir)
            

---
## First stage: Run locally on a smaller sample

#### Cleaning up before we start
We distinguish between a sample scenario, which we still want to run with the DirectRunner, and a full scenario. That's why we use two different file locations, even in the local directory.

In [28]:
def cleanup(stage):
    local_dirs = directories('local', stage)
    gs_dirs = directories('gs', stage)

    for path in local_dirs.values():
        shutil.rmtree(path, ignore_errors=True)
        os.mkdir(path)
        res = !ls $path
        print("{} contains {} files.".format(path, len(res)))
        
    for path in gs_dirs.values():
        _ = !gsutil -m rm -rf $path
        res = !gsutil ls $path
        print("{}: {}".format(path, res[0]))

In [29]:
cleanup('sample')

/tmp/atl_june/sample/tmp contains 0 files.
/tmp/atl_june/sample/data contains 0 files.
/tmp/atl_june/sample/metadata contains 0 files.
gs://going-tfx/sample/tmp: CommandException: One or more URLs matched no objects.
gs://going-tfx/sample/data: CommandException: One or more URLs matched no objects.
gs://going-tfx/sample/metadata: CommandException: One or more URLs matched no objects.


#### Run the pipeline!
This takes about two minutes. Be patient.

In [30]:
exec_pipeline_prod (
    'gs', 'sample',
    preprocessing_fn=pre_processor, 
    output_columns=ORDERED_TRAINING_COLUMNS, 
    fractions=[80, 10, 10], 
    sample_rate=0.1,
    prefix='atl_june',
    runner = 'DirectRunner')



#### Examine and retrieve the results.

In [31]:
gsdirs = directories('gs', 'sample')
localdirs = directories('local', 'sample')
gsdirs, localdirs

({'data': 'gs://going-tfx/sample/data',
  'metadata': 'gs://going-tfx/sample/metadata',
  'tmp': 'gs://going-tfx/sample/tmp'},
 {'data': '/tmp/atl_june/sample/data',
  'metadata': '/tmp/atl_june/sample/metadata',
  'tmp': '/tmp/atl_june/sample/tmp'})

In [32]:
gsdatadir = gsdirs['data']
res = !gsutil ls $gsdatadir
print("Displaying first 6 of {}.".format(gsdatadir))
res[:6]

Displaying first 6 of gs://going-tfx/sample/data.


['gs://going-tfx/sample/data/atl_june_eval-00000-of-00003',
 'gs://going-tfx/sample/data/atl_june_eval-00001-of-00003',
 'gs://going-tfx/sample/data/atl_june_eval-00002-of-00003',
 'gs://going-tfx/sample/data/atl_june_train-00000-of-00025',
 'gs://going-tfx/sample/data/atl_june_train-00001-of-00025',
 'gs://going-tfx/sample/data/atl_june_train-00002-of-00025']

In [33]:
localdatadir = localdirs['data']
!mkdir -p $localdatadir
_ = !gsutil -m cp $gsdatadir/*eval* $localdatadir
_ = !gsutil -m cp $gsdatadir/*train* $localdatadir

In [34]:
res = !ls $localdatadir
print("{} contains {}.".format(localdatadir, res if len(res) > 0 else "no files"))

/tmp/atl_june/sample/data contains ['atl_june_eval-00000-of-00003', 'atl_june_eval-00001-of-00003', 'atl_june_eval-00002-of-00003', 'atl_june_train-00000-of-00025', 'atl_june_train-00001-of-00025', 'atl_june_train-00002-of-00025', 'atl_june_train-00003-of-00025', 'atl_june_train-00004-of-00025', 'atl_june_train-00005-of-00025', 'atl_june_train-00006-of-00025', 'atl_june_train-00007-of-00025', 'atl_june_train-00008-of-00025', 'atl_june_train-00009-of-00025', 'atl_june_train-00010-of-00025', 'atl_june_train-00011-of-00025', 'atl_june_train-00012-of-00025', 'atl_june_train-00013-of-00025', 'atl_june_train-00014-of-00025', 'atl_june_train-00015-of-00025', 'atl_june_train-00016-of-00025', 'atl_june_train-00017-of-00025', 'atl_june_train-00018-of-00025', 'atl_june_train-00019-of-00025', 'atl_june_train-00020-of-00025', 'atl_june_train-00021-of-00025', 'atl_june_train-00022-of-00025', 'atl_june_train-00023-of-00025', 'atl_june_train-00024-of-00025'].


---
#### Have a look into the first training file

In [35]:
a_training_file = !ls $localdatadir/atl_june_train-00000-of-*
a_training_file = a_training_file[0]
!wc -l $a_training_file

1000 /tmp/atl_june/sample/data/atl_june_train-00000-of-00025


In [36]:
!head -4 $a_training_file

-3.0,42.21,-83.35,0.10895884,5,12,33.63,-84.42,0.5638371,0.8007947,0.11696716,0.28985506,0.59663844,0.4784946,1.0,0.010901089,0.5658915
28.0,26.07,-80.15,0.19612591,5,20,33.63,-84.42,0.19254656,0.83515894,0.113756314,0.6249999,0.59663844,0.4784946,1.0,0.0046004597,0.5658915
-11.0,26.53,-81.75,0.11622277,7,13,33.63,-84.42,0.2031286,0.81797683,0.09888748,0.57971007,0.60924363,0.44623655,0.515625,0.0062006195,0.60465115
-14.0,39.29,-94.71,0.125908,7,12,33.63,-84.42,0.49666438,0.67880154,0.13878365,0.6340579,0.60924363,0.29032257,0.515625,0.009200919,0.60465115


In [37]:
t1 = pd.read_csv(a_training_file, names=ORDERED_TRAINING_COLUMNS)
t1[:3]

Unnamed: 0,ARR_DELAY,ARR_LAT,ARR_LON,DEP_DELAY,DEP_DOW,DEP_HOD,DEP_LAT,DEP_LON,DIFF_LAT,DIFF_LON,DISTANCE,MEAN_TEMP_ARR,MEAN_TEMP_DEP,MEAN_VIS_ARR,MEAN_VIS_DEP,WND_SPD_ARR,WND_SPD_DEP
0,-3.0,42.21,-83.35,0.108959,5,12,33.63,-84.42,0.563837,0.800795,0.116967,0.289855,0.596638,0.478495,1.0,0.010901,0.565891
1,28.0,26.07,-80.15,0.196126,5,20,33.63,-84.42,0.192547,0.835159,0.113756,0.625,0.596638,0.478495,1.0,0.0046,0.565891
2,-11.0,26.53,-81.75,0.116223,7,13,33.63,-84.42,0.203129,0.817977,0.098887,0.57971,0.609244,0.446237,0.515625,0.006201,0.604651


---
## Second stage: Full dataset

In [52]:
gsdirs = directories('gs', 'full')
localdirs = directories('local', 'full')
gsdirs, localdirs

({'data': 'gs://going-tfx/full/data',
  'metadata': 'gs://going-tfx/full/metadata',
  'tmp': 'gs://going-tfx/full/tmp'},
 {'data': '/tmp/atl_june/full/data',
  'metadata': '/tmp/atl_june/full/metadata',
  'tmp': '/tmp/atl_june/full/tmp'})

In [53]:
cleanup('full')

/tmp/atl_june/full/tmp contains 0 files.
/tmp/atl_june/full/data contains 0 files.
/tmp/atl_june/full/metadata contains 0 files.
gs://going-tfx/full/tmp: CommandException: One or more URLs matched no objects.
gs://going-tfx/full/data: CommandException: One or more URLs matched no objects.
gs://going-tfx/full/metadata: CommandException: One or more URLs matched no objects.


---
#### This executes in Dataflow and takes some 12 - 20 minutes
If you watch the graph in the Dataflow console, you will see that the job lives until the VM is shut down. The files may be available a bit earlier.

In [54]:
exec_pipeline_prod (
    'gs', 'full',
    preprocessing_fn=pre_processor, 
    output_columns=ORDERED_TRAINING_COLUMNS, 
    fractions=[90, 5, 5], 
    sample_rate=1.0, 
    prefix='atl_june',
    runner = 'DataflowRunner')

#### Examine and retrieve the results.

In [44]:
gsdatadir = gsdirs['data']
res = !gsutil ls $gsdatadir
print("Displaying first 6 of {}.".format(gsdatadir))
res[:6]

Displaying first 6 of gs://going-tfx/full/data.


['gs://going-tfx/full/data/atl_june_eval-00000-of-00001',
 'gs://going-tfx/full/data/atl_june_train-00000-of-00005',
 'gs://going-tfx/full/data/atl_june_train-00001-of-00005',
 'gs://going-tfx/full/data/atl_june_train-00002-of-00005',
 'gs://going-tfx/full/data/atl_june_train-00003-of-00005',
 'gs://going-tfx/full/data/atl_june_train-00004-of-00005']

In [45]:
localdatadir = localdirs['data']
!mkdir -p $localdatadir

In [46]:
_ = !gsutil -m cp $gsdatadir/*eval* $localdatadir
_ = !gsutil -m cp $gsdatadir/*train* $localdatadir

In [47]:
res = !ls $localdatadir
print("{} contains {}.".format(localdatadir, res if len(res) > 0 else "no files"))

/tmp/atl_june/full/data contains ['atl_june_eval-00000-of-00001', 'atl_june_train-00000-of-00005', 'atl_june_train-00001-of-00005', 'atl_june_train-00002-of-00005', 'atl_june_train-00003-of-00005', 'atl_june_train-00004-of-00005'].


---
#### Have a look into the first training file

In [48]:
a_training_file = !ls $localdatadir/atl_june_train-00000-of-*
a_training_file = a_training_file[0]
!wc -l $a_training_file

26137 /tmp/atl_june/full/data/atl_june_train-00000-of-00005


In [49]:
!head -1 $a_training_file

1,9,33.63,-84.42,0.6932772,0.875,0.21705428,0.052527256,34.99,-78.88,-1.0,0.60439557,0.0075052534,0.0056005595,0.3977456,0.84879726,0.056765903


In [51]:
t1 = pd.read_csv(a_training_file, names=ORDERED_TRAINING_COLUMNS)
t1[:3]

Unnamed: 0,DEP_DOW,DEP_HOD,DEP_LAT,DEP_LON,MEAN_TEMP_DEP,MEAN_VIS_DEP,WND_SPD_DEP,DEP_DELAY,ARR_LAT,ARR_LON,ARR_DELAY,MEAN_TEMP_ARR,MEAN_VIS_ARR,WND_SPD_ARR,DIFF_LAT,DIFF_LON,DISTANCE
0,1,9,33.63,-84.42,0.693277,0.875,0.217054,0.052527,34.99,-78.88,-1.0,0.604396,0.007505,0.005601,0.397746,0.848797,0.056766
1,1,20,33.63,-84.42,0.693277,0.875,0.217054,0.051536,31.53,-84.19,8.0,0.604396,0.007405,0.0025,0.31815,0.791774,0.015087
2,1,11,33.63,-84.42,0.693277,0.875,0.217054,0.047572,35.21,-80.94,0.0,0.604396,0.008006,0.0049,0.402807,0.826675,0.033353
