In [1]:
from __future__ import division
from __future__ import print_function
from __future__ import absolute_import

In [2]:
import datetime
import os
import tempfile
import math
import shutil

import pandas as pd
import apache_beam as beam

import tensorflow as tf
import tensorflow_transform as tft
import tensorflow_transform.beam.impl as beam_impl
from tensorflow_transform.tf_metadata import dataset_metadata
from tensorflow_transform.tf_metadata import dataset_schema
from tensorflow_transform.beam.tft_beam_io import transform_fn_io

import google.datalab.bigquery as dlbq

from tools import tf_haversine

  from ._conv import register_converters as _register_converters
  from .. import h5g, h5i, h5o, h5r, h5t, h5l, h5p
  from .qhull import *
  from .murmurhash import murmurhash3_32
  from .lbfgsb import _minimize_lbfgsb
  from ._logistic_sigmoid import _log_logistic_sigmoid
  from .expected_mutual_info_fast import expected_mutual_information
  from .pairwise_fast import _chi2_kernel_fast, _sparse_manhattan


In [3]:
%load_ext google.cloud.bigquery

In [81]:
# Google Cloud project; passed as the 'project' option to the Beam pipelines below
PROJECT='going-tfx'
# Cloud Storage bucket under which directories() places all 'gs' paths
BUCKET='going-tfx'
# NOTE(review): presumably the BigQuery dataset name; not referenced by the code in this section
DATASET='examples'

---
We will distinguish different stages, such as 'sample' or 'full', to avoid mixing up things. That's what we need the ```stage``` parameter for:

In [85]:
def directories(env, stage):
    """Return the tmp/data/metadata directories for an environment and stage.

    Args:
        env: 'gs' for Google Cloud Storage (below the BUCKET bucket) or
            'local' for the local file system (below /tmp/atl_june).
        stage: name of the stage, e.g. 'sample' or 'full'. It becomes a path
            component, so files of different stages end up in different
            directories.

    Returns:
        A dict mapping each usage ('tmp', 'data', 'metadata') to its path.

    Raises:
        ValueError: if env is neither 'gs' nor 'local'.
    """
    if env == 'gs':
        # Only the bucket is filled in here. The doubled braces survive this
        # first format() call as plain '{}' placeholders for stage and usage.
        form = "gs://{}/{{}}/{{}}".format(BUCKET)
    elif env == 'local':
        form = "/tmp/atl_june/{}/{}"
    else:
        raise ValueError("Environment {} not supported".format(env))

    return {usage: form.format(stage, usage)
            for usage in ['tmp', 'data', 'metadata']}

---
As an example:

- The first dictionary groups the directories containing *sample* stage data on Google cloud storage
- The second dictionary groups the directories containing *full* stage data on some local directory

In [86]:
directories('gs', 'sample'), directories('local', 'full')

({'data': 'gs://going-tfx/sample/data',
  'metadata': 'gs://going-tfx/sample/metadata',
  'tmp': 'gs://going-tfx/sample/tmp'},
 {'data': '/tmp/atl_june/full/data',
  'metadata': '/tmp/atl_june/full/metadata',
  'tmp': '/tmp/atl_june/full/tmp'})

---
# Signature data in Bigquery
We collected the raw data that we use from various sources into a single denormalized table holding the data in so-called signature format. That table's schema is meant to reflect the structure of the data/requests that we expect to be served at prediction time. 

In [7]:
%%bigquery aj_sample
select * FROM `going-tfx.examples.ATL_JUNE_SIGNATURE` limit 3

Unnamed: 0,DATE,YEAR,MONTH,DAY,DEP_DOW,AIRLINE,DEP_T,DEP,DEP_LAT,DEP_LON,...,WND_SPD_DEP,ARR_T,ARR_DELAY,ARR,ARR_LAT,ARR_LON,ARR_W,MEAN_TEMP_ARR,MEAN_VIS_ARR,WND_SPD_ARR
0,2002-06-01,2002,6,1,7,US Airways Inc.: US (Merged with America West ...,610,ATL,33.63,-84.42,...,6.9,712,-12.0,CLT,35.21,-80.94,CHARLOTTE/DOUGLAS INTERNATION,78.3,9.5,2.7
1,2002-06-01,2002,6,1,7,Delta Air Lines Inc.: DL,620,ATL,33.63,-84.42,...,6.9,749,-1.0,ORF,36.89,-76.2,NORFOLK REGIONAL ARPT,80.9,9.7,9.4
2,2002-06-01,2002,6,1,7,Delta Air Lines Inc.: DL,620,ATL,33.63,-84.42,...,6.9,740,9.0,MCO,28.42,-81.3,ORLANDO INTERNATIONAL AIRPORT,77.4,9.6,5.7


In [8]:
from data_schemas import SIGNATURE_METADATA
from data_schemas import SIGNATURE_SCHEMA
from data_schemas import SIGNATURE_COLUMNS
from data_schemas import TRAINING_COLUMNS

---
These are the columns that we expect to be provided at prediction time. Exactly these columns are available in the signature data in BigQuery.

In [9]:
print(SIGNATURE_COLUMNS)

['DATE', 'AIRLINE', 'DEP', 'DEP_W', 'ARR', 'ARR_W', 'YEAR', 'MONTH', 'DAY', 'DEP_DOW', 'DEP_T', 'ARR_T', 'DEP_LAT', 'DEP_LON', 'DEP_DELAY', 'MEAN_TEMP_DEP', 'MEAN_VIS_DEP', 'WND_SPD_DEP', 'ARR_DELAY', 'ARR_LAT', 'ARR_LON', 'MEAN_TEMP_ARR', 'MEAN_VIS_ARR', 'WND_SPD_ARR']


---
These are the feature-engineered columns that we intend to use for training. Note that some columns are dropped and others are added, e.g. ```DISTANCE```.

In [10]:
print(TRAINING_COLUMNS)

['YEAR', 'MONTH', 'DEP_DOW', 'AIRLINE', 'DEP', 'DEP_LAT', 'DEP_LON', 'MEAN_TEMP_DEP', 'MEAN_VIS_DEP', 'WND_SPD_DEP', 'DEP_T', 'DEP_DELAY', 'ARR', 'ARR_LAT', 'ARR_LON', 'MEAN_TEMP_ARR', 'MEAN_VIS_ARR', 'WND_SPD_ARR', 'ARR_T', 'ARR_DELAY', 'DIFF_LAT', 'DIFF_LON', 'DISTANCE']


---
# Repeatable random subsets 

In [11]:
def sample_query(total, lower, upper,
                 table='going-tfx.examples.ATL_JUNE_SIGNATURE'):
    """Build a standard-SQL query that selects a repeatable subset of a table.

    Every row is assigned the bucket
    MOD(ABS(FARM_FINGERPRINT(CONCAT(DATE, AIRLINE, ARR))) + DEP_T, total).
    The query returns the rows whose bucket lies in [lower, upper). Because
    the bucket depends only on the row's own columns, the same arguments
    always select the same rows.

    Args:
        total: number of buckets (the modulus).
        lower: first bucket to include.
        upper: first bucket to exclude.
        table: fully qualified BigQuery table to read from. It is inserted
            into the query text verbatim, so never pass untrusted input.
            The table must provide the DATE, AIRLINE, ARR and DEP_T columns.

    Returns:
        The query text as a string.
    """
    return """
        SELECT
            *
        FROM `{3}` 

        where
          MOD(ABS(FARM_FINGERPRINT(
            CONCAT(DATE,AIRLINE,ARR)
          )) + DEP_T, {0}) >= {1} 
        and
          MOD(ABS(FARM_FINGERPRINT(
            CONCAT( DATE, AIRLINE, ARR)
           )) + DEP_T, {0}) < {2} 
    """.format(total, lower, upper, table)

In [12]:
def sample_queries(fractions, rate=0.1):
    """Create non-overlapping sample queries for the train/eval/test splits.

    The bucket space has int(sum(fractions) / rate) buckets. Each fraction,
    truncated to an int, claims that many consecutive buckets, starting at
    bucket 0 and continuing where the previous fraction ended.

    Args:
        fractions: relative sizes of the splits, in the order train, eval, test.
        rate: divisor applied to the sum of the fractions to obtain the total
            number of buckets.

    Returns:
        A dict mapping 'train', 'eval' and 'test' to their query strings.
    """
    total = int(sum(fractions) / rate)
    queries = []
    lower = 0
    for fraction in fractions:
        upper = lower + int(fraction)
        queries.append(sample_query(total, lower, upper))
        # The next split starts right where this one ends
        lower = upper
    return dict(zip(['train', 'eval', 'test'], queries))

In [13]:
queries = sample_queries([80,10,10], .1)

In [14]:
print(queries['eval'])


        SELECT
            *
        FROM `going-tfx.examples.ATL_JUNE_SIGNATURE` 

        where
          MOD(ABS(FARM_FINGERPRINT(
            CONCAT(DATE,AIRLINE,ARR)
          )) + DEP_T, 1000) >= 80 
        and
          MOD(ABS(FARM_FINGERPRINT(
            CONCAT( DATE, AIRLINE, ARR)
           )) + DEP_T, 1000) < 90 
    


#### A super-small random subset 

In [15]:
# A single bucket out of 10000 keeps the result small enough for quick experiments
tiny_query = sample_query(10000, 0, 1)
executed = dlbq.Query(tiny_query).execute()
sample = executed.result().to_dataframe()
print('Only {} examples. Showing first three:'.format(len(sample)))
sample.head(3)

Only 40 examples. Showing first three:


Unnamed: 0,DATE,YEAR,MONTH,DAY,DEP_DOW,AIRLINE,DEP_T,DEP,DEP_LAT,DEP_LON,...,WND_SPD_DEP,ARR_T,ARR_DELAY,ARR,ARR_LAT,ARR_LON,ARR_W,MEAN_TEMP_ARR,MEAN_VIS_ARR,WND_SPD_ARR
0,2004-06-07,2004,6,7,2,Delta Air Lines Inc.: DL,842,ATL,33.63,-84.42,...,4.4,949,16.0,SAV,32.12,-81.2,SAVANNAH/HILTON HEAD INTL AIR,79.4,9.8,5.7
1,2006-06-27,2006,6,27,3,Delta Air Lines Inc.: DL,2130,ATL,33.63,-84.42,...,6.3,2354,45.0,PVD,41.72,-71.42,PROVIDENCE T F GREEN ARPT,74.7,10.0,10.1
2,2005-06-22,2005,6,22,4,America West Airlines Inc.: HP (Merged with US...,805,ATL,33.63,-84.42,...,6.1,916,-25.0,PHX,33.43,-112.01,PHOENIX SKY HARBOR INTL AIRPO,102.4,10.0,8.5


---
# Reading from Bigquery into a beam pipeline

In [16]:
OUTPUT_DIR="./out"
# Hyphens only: Dataflow rejects job names containing underscores, and this
# also matches the 'tft-tutorial' name used by exec_pipeline_prod.
job_name = 'tft-tutorial' + '-' + datetime.datetime.now().strftime('%y%m%d-%H%M%S')

# Options shared by the local development runs of exec_pipeline
options = {
    'staging_location': os.path.join(OUTPUT_DIR, 'tmp', 'staging'),
    'temp_location': os.path.join(OUTPUT_DIR, 'tmp'),
    'job_name': job_name,
    'project': PROJECT,
    'max_num_workers': 24,
    'teardown_policy': 'TEARDOWN_ALWAYS',
    'no_save_main_session': True,
    # NOTE(review): exec_pipeline_prod uses 'dataflow_requirements.txt' instead - confirm which file is intended
    'requirements_file': 'requirements.txt'
}
opts = beam.pipeline.PipelineOptions(flags=[], **options)
tf.logging.set_verbosity(tf.logging.ERROR)

---
This simple pipeline reads, transforms and emits the result into a csv file. It returns a pair consisting of a pandas dataframe containing the data, and the transformed schema

In [27]:
# Directory that receives the CSV output of the local development pipeline.
# Defined here so that the notebook also runs on a fresh kernel.
LOCAL_TMPDIR = "/tmp/atl_june"


def exec_pipeline(query, preprocessing_fn, output_columns, out_name = 'atl_june_transformed', write_header=False, runner='DirectRunner'):
    """Run query -> analyze/transform -> CSV and load the result into pandas.

    Args:
        query: standard-SQL BigQuery query providing the signature data.
        preprocessing_fn: tf.Transform preprocessing function handed to
            AnalyzeAndTransformDataset.
        output_columns: names and order of the columns in the CSV output.
        out_name: file name prefix of the output inside LOCAL_TMPDIR.
        write_header: whether the CSV files start with a header line.
        runner: name of the Beam runner that executes the pipeline.

    Returns:
        A pair (dataframe, schema): the content of the first output shard as
        a pandas DataFrame whose columns are named after output_columns, and
        the schema of the transformed data.

    Raises:
        IOError: if the pipeline left no output file behind.
    """
    import glob  # local import: nothing else in the notebook needs it

    header = ",".join(output_columns) if write_header else None

    if not os.path.isdir(LOCAL_TMPDIR):
        os.makedirs(LOCAL_TMPDIR)
    out_prefix = os.path.join(LOCAL_TMPDIR, out_name)

    with beam.Pipeline(runner, options=opts) as p:
        with beam_impl.Context(temp_dir=tempfile.mkdtemp()):

            #   Read from Big Query
            #
            sig_data = p | "ReadFromBigQuery" >> beam.io.Read(beam.io.BigQuerySource(query=query, use_standard_sql=True))
            sig_dataset = (sig_data, SIGNATURE_METADATA)

            #   Analyze and transform by calling a single function that has all the tft.transforms in it
            #
            t_dataset, t_fn = (sig_dataset | beam_impl.AnalyzeAndTransformDataset(preprocessing_fn))
            t_data, t_metadata = t_dataset

            # Encode back to CSV file(s)
            #
            csv_encode = tft.coders.CsvCoder(output_columns, t_metadata.schema).encode
            _ = (t_data
                 | beam.Map(csv_encode)
                 | beam.io.WriteToText(file_path_prefix=out_prefix, header=header))

    # Return a pandas dataframe containing the result. Only the first shard
    # (in sorted order) is read.
    #
    shards = sorted(glob.glob(out_prefix + '*'))
    if not shards:
        raise IOError("No output files found for prefix {}".format(out_prefix))

    # Always name the columns explicitly. Without a header line in the file,
    # pandas would otherwise consume the first data row as column names.
    frame = pd.read_csv(shards[0], names=output_columns,
                        header=0 if write_header else None)
    return frame, t_metadata.schema

print("Output will be written to {}".format(LOCAL_TMPDIR))

Output will be written to /tmp/atl_june


---
# Developing the pre-processing function

#### Step 1: Do nothing
Here, we simply verify that the pipeline setup is correct.

In [28]:
def do_nothing(inputs):
    """Identity pre-processing function: hand the inputs back unchanged."""
    return inputs

In [29]:
res, _ = exec_pipeline(tiny_query, do_nothing, SIGNATURE_COLUMNS, write_header=True, runner='DirectRunner')
res[:3]



Unnamed: 0,DATE,AIRLINE,DEP,DEP_W,ARR,ARR_W,YEAR,MONTH,DAY,DEP_DOW,...,DEP_DELAY,MEAN_TEMP_DEP,MEAN_VIS_DEP,WND_SPD_DEP,ARR_DELAY,ARR_LAT,ARR_LON,MEAN_TEMP_ARR,MEAN_VIS_ARR,WND_SPD_ARR
0,2002-06-10,Delta Air Lines Inc.: DL,ATL,HARTSFIELD-JACKSON ATLANTA IN,CAE,COLUMBIA METRO ARPT,2002,6,10,2,...,-1.0,74.6,10.0,6.1,-12.0,33.93,-81.11,75.7,9.9,2.9
1,2002-06-23,Delta Air Lines Inc.: DL,ATL,HARTSFIELD-JACKSON ATLANTA IN,RDU,RALEIGH-DURHAM INTERNATIONAL,2002,6,23,1,...,-1.0,74.5,6.9,9.8,7.0,35.87,-78.78,78.6,9.0,4.6
2,2002-06-17,Delta Air Lines Inc.: DL,ATL,HARTSFIELD-JACKSON ATLANTA IN,DFW,DALLAS-FORT WORTH INTL AP,2002,6,17,2,...,19.0,73.1,9.9,6.2,37.0,32.89,-97.03,75.8,9.9,3.3


---
#### Step 2: Select only the useful columns
These columns could, of course, have been excluded from the signature already. This step just demonstrates that the signature may indeed contain more data than is actually required for training.

In [31]:
IGNORE = ['ARR_W', 'DEP_W', 'DATE', 'DAY']
SELECTED_COLUMNS = [key for key in SIGNATURE_COLUMNS if key not in IGNORE]
print(SELECTED_COLUMNS)

['AIRLINE', 'DEP', 'ARR', 'YEAR', 'MONTH', 'DEP_DOW', 'DEP_T', 'ARR_T', 'DEP_LAT', 'DEP_LON', 'DEP_DELAY', 'MEAN_TEMP_DEP', 'MEAN_VIS_DEP', 'WND_SPD_DEP', 'ARR_DELAY', 'ARR_LAT', 'ARR_LON', 'MEAN_TEMP_ARR', 'MEAN_VIS_ARR', 'WND_SPD_ARR']


In [32]:
def make_ignore_cols(to_ignore):
    """Create a function that drops the given columns from a row.

    The returned function builds a new dict that holds every
    SIGNATURE_COLUMNS entry of the row except those named in to_ignore.
    """
    def _ignore_cols(row):
        kept = {}
        for column in SIGNATURE_COLUMNS:
            if column not in to_ignore:
                kept[column] = row[column]
        return kept
    return _ignore_cols

one_row = sample.to_dict(orient='records')[0]
print(make_ignore_cols(IGNORE)(one_row))

{'ARR': 'SAV', 'WND_SPD_ARR': 5.7, 'ARR_T': 949, 'DEP_DELAY': -2.0, 'DEP': 'ATL', 'ARR_LON': -81.2, 'MEAN_VIS_DEP': 6.0, 'DEP_T': 842, 'WND_SPD_DEP': 4.4, 'DEP_LON': -84.42, 'MONTH': 6, 'MEAN_VIS_ARR': 9.8, 'ARR_LAT': 32.12, 'AIRLINE': 'Delta Air Lines Inc.: DL', 'YEAR': 2004, 'ARR_DELAY': 16.0, 'DEP_LAT': 33.63, 'DEP_DOW': 2, 'MEAN_TEMP_DEP': 75.2, 'MEAN_TEMP_ARR': 79.4}


In [33]:
res, _ = exec_pipeline(tiny_query, make_ignore_cols(IGNORE), SELECTED_COLUMNS, write_header=True)
res[:3]



Unnamed: 0,AIRLINE,DEP,ARR,YEAR,MONTH,DEP_DOW,DEP_T,ARR_T,DEP_LAT,DEP_LON,DEP_DELAY,MEAN_TEMP_DEP,MEAN_VIS_DEP,WND_SPD_DEP,ARR_DELAY,ARR_LAT,ARR_LON,MEAN_TEMP_ARR,MEAN_VIS_ARR,WND_SPD_ARR
0,Delta Air Lines Inc.: DL,ATL,SAV,2004,6,2,842,949,33.63,-84.42,-2.0,75.2,6.0,4.4,16.0,32.12,-81.2,79.4,9.8,5.7
1,Delta Air Lines Inc.: DL,ATL,PVD,2006,6,3,2130,2354,33.63,-84.42,50.0,73.3,6.5,6.3,45.0,41.72,-71.42,74.7,10.0,10.1
2,America West Airlines Inc.: HP (Merged with US...,ATL,PHX,2005,6,4,805,916,33.63,-84.42,-2.0,73.6,8.0,6.1,-25.0,33.43,-112.01,102.4,10.0,8.5


---
#### Step 3: Adding engineered features
We use the well-known haversine function (defined in ```tools.py```) to calculate the distance between two lat/lon coordinate pairs

In [35]:
!grep -A 13 tf_haversine tools.py

def tf_haversine(lat1, lon1, lat2, lon2):
    
    def radians(a):
        return a * math.pi / 180.0

    radius = 6371.0
    dlat = radians (lat2 - lat1) 
    dlon = radians (lon2 - lon1)
    a = (tf.sin(dlat / 2.0) * tf.sin(dlat/2.0) +
         tf.cos(radians(lat1)) * tf.cos(radians(lat2)) *
         tf.sin(dlon / 2.0) * tf.sin(dlon / 2.0))
    c = 2.0 * tf.atan2(tf.sqrt(a), tf.sqrt(1.0 - a))
    return radius * c


In [38]:
def add_engineered(row):
    """Return a copy of row extended by DIFF_LAT, DIFF_LON and DISTANCE.

    DIFF_LAT and DIFF_LON are arrival minus departure coordinates. DISTANCE
    is whatever tf_haversine returns for the arrival and departure
    coordinate pairs.

    Args:
        row: mapping that provides DEP_LAT, DEP_LON, ARR_LAT and ARR_LON.

    Returns:
        A new dict; the row passed in is left untouched.
    """
    dep_lat = row['DEP_LAT']
    dep_lon = row['DEP_LON']
    arr_lat = row['ARR_LAT']
    arr_lon = row['ARR_LON']

    # Shallow copy, so the caller's mapping is not modified as a side effect
    engineered = dict(row)
    engineered['DIFF_LAT'] = arr_lat - dep_lat
    engineered['DIFF_LON'] = arr_lon - dep_lon
    engineered['DISTANCE'] = tf_haversine(arr_lat, arr_lon, dep_lat, dep_lon)
    return engineered

In [39]:
def pre_processor(row):
    """Drop the IGNORE columns, then add the engineered features."""
    drop_ignored = make_ignore_cols(IGNORE)
    selected = drop_ignored(row)
    return add_engineered(selected)

res, _ = exec_pipeline(tiny_query, pre_processor, TRAINING_COLUMNS, write_header=True)
res[:3]



Unnamed: 0,YEAR,MONTH,DEP_DOW,AIRLINE,DEP,DEP_LAT,DEP_LON,MEAN_TEMP_DEP,MEAN_VIS_DEP,WND_SPD_DEP,...,ARR_LAT,ARR_LON,MEAN_TEMP_ARR,MEAN_VIS_ARR,WND_SPD_ARR,ARR_T,ARR_DELAY,DIFF_LAT,DIFF_LON,DISTANCE
0,2004,6,2,Delta Air Lines Inc.: DL,ATL,33.63,-84.42,75.2,6.0,4.4,...,32.12,-81.2,79.4,9.8,5.7,949,16.0,-1.510002,3.220001,344.38202
1,2006,6,3,Delta Air Lines Inc.: DL,ATL,33.63,-84.42,73.3,6.5,6.3,...,41.72,-71.42,74.7,10.0,10.1,2354,45.0,8.09,13.0,1452.6074
2,2005,6,4,America West Airlines Inc.: HP (Merged with US...,ATL,33.63,-84.42,73.6,8.0,6.1,...,33.43,-112.01,102.4,10.0,8.5,916,-25.0,-0.200001,-27.590004,2549.8042


---
#### Step 4: Scaling floats

In [40]:
def scale_floats(row):
    """Return a copy of row with the float feature columns scaled to [0, 1].

    Each of the listed columns is replaced by tft.scale_to_0_1 of its value.
    All other columns are passed through unchanged.

    Args:
        row: mapping that provides all of the listed columns.

    Returns:
        A new dict; the row passed in is left untouched.
    """
    # Shallow copy, so the caller's mapping is not modified as a side effect
    scaled = dict(row)
    for c in ['MEAN_TEMP_DEP', 'MEAN_VIS_DEP', 'WND_SPD_DEP', 'MEAN_TEMP_ARR', 'MEAN_VIS_ARR', 'WND_SPD_ARR', 'DEP_DELAY',
              'DIFF_LAT', 'DIFF_LON', 'DISTANCE']:
        scaled[c] = tft.scale_to_0_1(row[c])
    return scaled

In [42]:
def pre_processor(row):
    """Full pre-processing: select columns, add features, scale floats.

    Works on a copy of row, then applies make_ignore_cols(IGNORE),
    add_engineered and scale_floats in that order.
    """
    working = row.copy()
    selected = make_ignore_cols(IGNORE)(working)
    return scale_floats(add_engineered(selected))

In [41]:
res, _ = exec_pipeline(tiny_query, pre_processor, TRAINING_COLUMNS, write_header=True)
res[:3]



Unnamed: 0,YEAR,MONTH,DEP_DOW,AIRLINE,DEP,DEP_LAT,DEP_LON,MEAN_TEMP_DEP,MEAN_VIS_DEP,WND_SPD_DEP,...,ARR_LAT,ARR_LON,MEAN_TEMP_ARR,MEAN_VIS_ARR,WND_SPD_ARR,ARR_T,ARR_DELAY,DIFF_LAT,DIFF_LON,DISTANCE
0,2004,6,2,Delta Air Lines Inc.: DL,ATL,33.63,-84.42,0.33125,0.0,0.075,...,32.12,-81.2,0.507495,0.96,0.222973,949,16.0,0.354622,0.773389,0.044136
1,2006,6,3,Delta Air Lines Inc.: DL,ATL,33.63,-84.42,0.2125,0.125,0.3125,...,41.72,-71.42,0.406852,1.0,0.52027,2354,45.0,0.892437,0.976715,0.424967
2,2005,6,4,America West Airlines Inc.: HP (Merged with US...,ATL,33.63,-84.42,0.23125,0.5,0.2875,...,33.43,-112.01,1.0,1.0,0.412162,916,-25.0,0.428011,0.132848,0.802009


---
# Create the big files

We tell dataflow what packages our pipeline requires

In [43]:
!cat dataflow_requirements.txt

tensorflow-transform

In [106]:
def exec_pipeline_prod (environment, stage,
                        preprocessing_fn, output_columns,
                        fractions, sample_rate, prefix,
                        runner='DirectRunner', write_header=False):
    """Create the transformed train and eval CSV files and the transform function.

    The training split is analyzed and transformed. The resulting transform
    function is then applied to the eval split and finally written to the
    metadata directory. Note that only the 'train' and 'eval' queries are
    used; no files are written for the 'test' split.

    Args:
        environment: environment name understood by directories().
        stage: stage name understood by directories().
        preprocessing_fn: tf.Transform preprocessing function handed to
            AnalyzeAndTransformDataset.
        output_columns: names and order of the columns in the CSV output.
        fractions: split sizes, passed on to sample_queries().
        sample_rate: sampling rate, passed on to sample_queries().
        prefix: file name prefix; output goes to '<prefix>_train' and
            '<prefix>_eval' inside the data directory.
        runner: name of the Beam runner that executes the pipeline.
        write_header: whether the CSV files start with a header line.
    """
    header = ",".join(output_columns) if write_header else None

    dirs = directories(environment, stage)
    tmpdir = dirs['tmp']
    datadir = dirs['data']
    metadir = dirs['metadata']

    job_name = 'tft-tutorial' + '-' + datetime.datetime.now().strftime('%y%m%d-%H%M%S')

    options = {
        'staging_location': tmpdir,
        'temp_location': tmpdir,
        'job_name': job_name,
        'project': PROJECT,
        'max_num_workers': 24,
        'teardown_policy': 'TEARDOWN_ALWAYS',
        'no_save_main_session': True,
        'requirements_file': 'dataflow_requirements.txt'
    }
    opts = beam.pipeline.PipelineOptions(flags=[], **options)

    queries = sample_queries(fractions, sample_rate)
    train_prefix = os.path.join(datadir, prefix + "_train")
    eval_prefix = os.path.join(datadir, prefix + "_eval")

    with beam.Pipeline(runner, options=opts) as p:
        with beam_impl.Context(temp_dir=tmpdir):

            #   Read training data from Big Query
            #
            train_data = p | "ReadFromBigQuery_train" >> beam.io.Read(
                beam.io.BigQuerySource(query=queries['train'], use_standard_sql=True))
            train_dataset = (train_data, SIGNATURE_METADATA)

            #   Analyze and transform by calling a single function that has all the tft.transforms in it
            #
            t_train_dataset, transform_fn = (
                train_dataset
                | "AnalyzeAndTransform" >> beam_impl.AnalyzeAndTransformDataset(preprocessing_fn))
            t_train_data, t_metadata = t_train_dataset

            # One encoder serves both splits: train and eval are encoded
            # with the schema of the transformed training data.
            csv_encode = tft.coders.CsvCoder(output_columns, t_metadata.schema).encode

            # Encode training data back to CSV file(s)
            #
            _ = (t_train_data
                 | "EncodeTraining" >> beam.Map(csv_encode)
                 | "WriteTraining" >> beam.io.WriteToText(file_path_prefix=train_prefix, header=header))

            #   Read eval data from Big Query
            #
            eval_data = p | "ReadFromBigQuery_eval" >> beam.io.Read(
                beam.io.BigQuerySource(query=queries['eval'], use_standard_sql=True))
            eval_dataset = (eval_data, SIGNATURE_METADATA)

            #   Transform the eval dataset with the transform function derived above
            #
            t_eval_dataset = ((eval_dataset, transform_fn)
                              | "TransformEval" >> beam_impl.TransformDataset())
            t_eval_data, _ = t_eval_dataset

            # Encode eval data back to CSV file(s)
            #
            _ = (t_eval_data
                 | "EncodeEval" >> beam.Map(csv_encode)
                 | "WriteEval" >> beam.io.WriteToText(file_path_prefix=eval_prefix, header=header))

            # save transformation function to disk for use at serving time
            #
            _ = transform_fn | 'WriteTransformFn' >> transform_fn_io.WriteTransformFn(metadir)
            

---
## First stage: Run locally on a smaller sample

#### Cleaning up before we start
We distinguish a sample scenario that we still want to run with DirectRunner, and then a full scenario. That's why we use two different file locations even in the local directory

In [118]:
gsdirs = directories('gs', 'sample')
localdirs = directories('local', 'sample')
gsdirs, localdirs

({'data': 'gs://going-tfx/sample/data',
  'metadata': 'gs://going-tfx/sample/metadata',
  'tmp': 'gs://going-tfx/sample/tmp'},
 {'data': '/tmp/atl_june/sample/data',
  'metadata': '/tmp/atl_june/sample/metadata',
  'tmp': '/tmp/atl_june/sample/tmp'})

In [108]:
def cleanup(stage):
    local_dirs = directories('local', stage)
    gs_dirs = directories('gs', stage)

    for path in local_dirs.values():
        shutil.rmtree(path, ignore_errors=True)
        os.mkdir(path)
        res = !ls $path
        print("{} contains {} files.".format(path, len(res)))
        
    for path in gs_dirs.values():
        _ = !gsutil -m rm -rf $path
        res = !gsutil ls $path
        print("{}: {}".format(path, res[0]))

In [109]:
cleanup('sample')

/tmp/atl_june/sample/tmp contains 0 files.
/tmp/atl_june/sample/data contains 0 files.
/tmp/atl_june/sample/metadata contains 0 files.
gs://going-tfx/sample/tmp: CommandException: One or more URLs matched no objects.
gs://going-tfx/sample/data: CommandException: One or more URLs matched no objects.
gs://going-tfx/sample/metadata: CommandException: One or more URLs matched no objects.


#### Run the pipeline!
This takes about two minutes. Be patient.

In [110]:
exec_pipeline_prod (
    'gs', 'sample',
    preprocessing_fn=pre_processor, 
    output_columns=TRAINING_COLUMNS, 
    fractions=[80, 10, 10], 
    sample_rate=0.1,
    prefix='atl_june',
    runner = 'DirectRunner')



#### Examine and retrieve the results.

In [127]:
gsdatadir = gsdirs['data']
res = !gsutil ls $gsdatadir
print("Displaying first 6 of {}.".format(gsdatadir))
res[:6]

Displaying first 6 of gs://going-tfx/sample/data.


['gs://going-tfx/sample/data/atl_june_eval-00000-of-00003',
 'gs://going-tfx/sample/data/atl_june_eval-00001-of-00003',
 'gs://going-tfx/sample/data/atl_june_eval-00002-of-00003',
 'gs://going-tfx/sample/data/atl_june_train-00000-of-00025',
 'gs://going-tfx/sample/data/atl_june_train-00001-of-00025',
 'gs://going-tfx/sample/data/atl_june_train-00002-of-00025']

In [128]:
localdatadir = localdirs['data']
_ = !gsutil -m cp $gsdatadir/*eval* $localdatadir
_ = !gsutil -m cp $gsdatadir/*train* $localdatadir

In [129]:
res = !ls $localdatadir
print("{} contains {}.".format(local_sample_dir, res if len(res) > 0 else "no files"))

/tmp/atl_june/data/sample contains ['atl_june_eval-00000-of-00003', 'atl_june_eval-00001-of-00003', 'atl_june_eval-00002-of-00003', 'atl_june_train-00000-of-00025', 'atl_june_train-00001-of-00025', 'atl_june_train-00002-of-00025', 'atl_june_train-00003-of-00025', 'atl_june_train-00004-of-00025', 'atl_june_train-00005-of-00025', 'atl_june_train-00006-of-00025', 'atl_june_train-00007-of-00025', 'atl_june_train-00008-of-00025', 'atl_june_train-00009-of-00025', 'atl_june_train-00010-of-00025', 'atl_june_train-00011-of-00025', 'atl_june_train-00012-of-00025', 'atl_june_train-00013-of-00025', 'atl_june_train-00014-of-00025', 'atl_june_train-00015-of-00025', 'atl_june_train-00016-of-00025', 'atl_june_train-00017-of-00025', 'atl_june_train-00018-of-00025', 'atl_june_train-00019-of-00025', 'atl_june_train-00020-of-00025', 'atl_june_train-00021-of-00025', 'atl_june_train-00022-of-00025', 'atl_june_train-00023-of-00025', 'atl_june_train-00024-of-00025'].


---
#### Have a look into the first training file

In [142]:
a_training_file = !ls $localdatadir/atl_june_train-00000-of-*
a_training_file = a_training_file[0]
!wc -l $a_training_file

47545 /tmp/atl_june/full/data/atl_june_train-00000-of-00005


In [126]:
!head -4 $a_training_file

2007,6,6,AirTran Airways Corporation: FL,ATL,33.63,-84.42,0.394958,0.98437494,0.33333334,1448,0.11622277,ROC,43.11,-77.67,0.3949275,0.4892473,0.0033003297,1652,-13.0,0.5845411,0.86179125,0.1518046
2007,6,4,ExpressJet Airlines Inc.: EV,ATL,33.63,-84.42,0.59243685,0.875,0.5426357,955,0.29539952,ROC,43.11,-77.67,0.35688406,0.46774197,0.010701069,1213,44.0,0.5845411,0.86179125,0.1518046
2007,6,5,AirTran Airways Corporation: FL,ATL,33.63,-84.42,0.7268908,1.0,0.45736435,2120,0.15496369,ROC,43.11,-77.67,0.35869563,0.4892473,0.008900889,2324,2.0,0.5845411,0.86179125,0.1518046
2007,6,6,Delta Air Lines Inc.: DL,ATL,33.63,-84.42,0.760504,1.0,0.2635659,2058,0.12348669,ROC,43.11,-77.67,0.30615938,0.4892473,0.0120012,2306,-3.0,0.5845411,0.86179125,0.1518046


In [125]:
t1 = pd.read_csv(a_training_file, names=TRAINING_COLUMNS)
t1[:3]

Unnamed: 0,YEAR,MONTH,DEP_DOW,AIRLINE,DEP,DEP_LAT,DEP_LON,MEAN_TEMP_DEP,MEAN_VIS_DEP,WND_SPD_DEP,...,ARR_LAT,ARR_LON,MEAN_TEMP_ARR,MEAN_VIS_ARR,WND_SPD_ARR,ARR_T,ARR_DELAY,DIFF_LAT,DIFF_LON,DISTANCE
0,2007,6,6,AirTran Airways Corporation: FL,ATL,33.63,-84.42,0.394958,0.984375,0.333333,...,43.11,-77.67,0.394927,0.489247,0.0033,1652,-13.0,0.584541,0.861791,0.151805
1,2007,6,4,ExpressJet Airlines Inc.: EV,ATL,33.63,-84.42,0.592437,0.875,0.542636,...,43.11,-77.67,0.356884,0.467742,0.010701,1213,44.0,0.584541,0.861791,0.151805
2,2007,6,5,AirTran Airways Corporation: FL,ATL,33.63,-84.42,0.726891,1.0,0.457364,...,43.11,-77.67,0.358696,0.489247,0.008901,2324,2.0,0.584541,0.861791,0.151805


---
## Second stage: Full dataset

In [148]:
gsdirs = directories('gs', 'full')
localdirs = directories('local', 'full')
gsdirs, localdirs

({'data': 'gs://going-tfx/full/data',
  'metadata': 'gs://going-tfx/full/metadata',
  'tmp': 'gs://going-tfx/full/tmp'},
 {'data': '/tmp/atl_june/full/data',
  'metadata': '/tmp/atl_june/full/metadata',
  'tmp': '/tmp/atl_june/full/tmp'})

In [133]:
cleanup('full')

/tmp/atl_june/full/tmp contains 0 files.
/tmp/atl_june/full/data contains 0 files.
/tmp/atl_june/full/metadata contains 0 files.
gs://going-tfx/full/tmp: CommandException: One or more URLs matched no objects.
gs://going-tfx/full/data: CommandException: One or more URLs matched no objects.
gs://going-tfx/full/metadata: CommandException: One or more URLs matched no objects.


---
#### This executes in dataflow and takes some 12 - 20 minutes
But if you watch the graph in the Dataflow console, you will see that the job lives until the VM is shut down. The files may be available a bit earlier.

In [134]:
exec_pipeline_prod (
    'gs', 'full',
    preprocessing_fn=pre_processor, 
    output_columns=TRAINING_COLUMNS, 
    fractions=[90, 5, 5], 
    sample_rate=1.0, 
    prefix='atl_june',
    runner = 'DataflowRunner')

  standard_options = transform_node.inputs[0].pipeline.options.view_as(


#### Examine and retrieve the results.

In [149]:
gsdatadir = gsdirs['data']
res = !gsutil ls $gsdatadir
print("Displaying first 6 of {}.".format(gsdatadir))
res[:6]

Displaying first 6 of gs://going-tfx/full/data.


['gs://going-tfx/full/data/atl_june_eval-00000-of-00001',
 'gs://going-tfx/full/data/atl_june_train-00000-of-00005',
 'gs://going-tfx/full/data/atl_june_train-00001-of-00005',
 'gs://going-tfx/full/data/atl_june_train-00002-of-00005',
 'gs://going-tfx/full/data/atl_june_train-00003-of-00005',
 'gs://going-tfx/full/data/atl_june_train-00004-of-00005']

In [150]:
localdatadir = localdirs['data']

In [139]:
_ = !gsutil -m cp $gsdatadir/*eval* $localdatadir
_ = !gsutil -m cp $gsdatadir/*train* $localdatadir

In [151]:
res = !ls $localdatadir
print("{} contains {}.".format(localdatadir, res if len(res) > 0 else "no files"))

/tmp/atl_june/full/data contains ['atl_june_eval-00000-of-00001', 'atl_june_train-00000-of-00005', 'atl_june_train-00001-of-00005', 'atl_june_train-00002-of-00005', 'atl_june_train-00003-of-00005', 'atl_june_train-00004-of-00005'].


---
#### Have a look into the first training file

In [152]:
a_training_file = !ls $localdatadir/atl_june_train-00000-of-*
a_training_file = a_training_file[0]
!wc -l $a_training_file

47545 /tmp/atl_june/full/data/atl_june_train-00000-of-00005


In [153]:
!head -1 $a_training_file

2010,6,2,Delta Air Lines Inc.: DL,ATL,33.63,-84.42,0.840336,1.0,0.41085273,1506,0.06144698,RIC,37.5,-77.31,0.78649926,0.009406583,0.010000999,1649,18.0,0.45548654,0.86565727,0.090910725


In [154]:
t1 = pd.read_csv(a_training_file, names=TRAINING_COLUMNS)

In [158]:
t1 = pd.read_csv(a_training_file, names=TRAINING_COLUMNS)
t1[:3]

Unnamed: 0,YEAR,MONTH,DEP_DOW,AIRLINE,DEP,DEP_LAT,DEP_LON,MEAN_TEMP_DEP,MEAN_VIS_DEP,WND_SPD_DEP,...,ARR_LAT,ARR_LON,MEAN_TEMP_ARR,MEAN_VIS_ARR,WND_SPD_ARR,ARR_T,ARR_DELAY,DIFF_LAT,DIFF_LON,DISTANCE
0,2010,6,2,Delta Air Lines Inc.: DL,ATL,33.63,-84.42,0.840336,1.0,0.410853,...,37.5,-77.31,0.786499,0.009407,0.010001,1649,18.0,0.455487,0.865657,0.090911
1,2010,6,2,AirTran Airways Corporation: FL,ATL,33.63,-84.42,0.840336,1.0,0.410853,...,37.5,-77.31,0.786499,0.009407,0.010001,1013,-2.0,0.455487,0.865657,0.090911
2,2010,6,2,Delta Air Lines Inc.: DL,ATL,33.63,-84.42,0.840336,1.0,0.410853,...,37.5,-77.31,0.786499,0.009407,0.010001,1825,75.0,0.455487,0.865657,0.090911
