In [1]:
from __future__ import division
from __future__ import print_function
from __future__ import absolute_import

In [2]:
import datetime
import os
import tempfile

import pandas as pd
import apache_beam as beam

import tensorflow as tf
import tensorflow_transform as tft
import tensorflow_transform.beam.impl as beam_impl
from tensorflow_transform.tf_metadata import dataset_metadata
from tensorflow_transform.tf_metadata import dataset_schema
from tensorflow_transform.beam.tft_beam_io import transform_fn_io


import google.datalab.bigquery as dlbq

  from ._conv import register_converters as _register_converters
  from .. import h5g, h5i, h5o, h5r, h5t, h5l, h5p
  from .qhull import *
  from .murmurhash import murmurhash3_32
  from .lbfgsb import _minimize_lbfgsb
  from ._logistic_sigmoid import _log_logistic_sigmoid
  from .expected_mutual_info_fast import expected_mutual_information
  from .pairwise_fast import _chi2_kernel_fast, _sparse_manhattan


In [3]:
%load_ext google.cloud.bigquery

In [4]:
# Google Cloud project id; passed as the 'project' pipeline option.
PROJECT='going-tfx'
# Cloud Storage bucket name; exec_pipeline_prod builds its gs:// temp and output paths from it.
BUCKET='going-tfx'
# BigQuery dataset name.
# NOTE(review): not referenced by the visible cells (the queries hard-code
# `going-tfx.examples`) — confirm whether it is still needed.
DATASET='examples'
# Local directory where exec_pipeline writes its CSV output and reads it back.
TMPDIR='/tmp'

---
# Raw data in BigQuery
We collected the raw data from various sources into a single denormalized BigQuery table.

In [5]:
%%bigquery aj_sample
select * FROM `going-tfx.examples.ATL_JUNE_RAW` limit 3

Unnamed: 0,DATE,YEAR,MONTH,DAY,DEP_DOW,AIRLINE,DEP_T,DEP,DEP_LAT,DEP_LON,...,WND_SPD_DEP,ARR_T,ARR_DELAY,ARR,ARR_LAT,ARR_LON,ARR_W,MEAN_TEMP_ARR,MEAN_VIS_ARR,WND_SPD_ARR
0,2002-06-01,2002,6,1,7,US Airways Inc.: US (Merged with America West ...,610,ATL,33.63,-84.42,...,6.9,712,-12.0,CLT,35.21,-80.94,CHARLOTTE/DOUGLAS INTERNATION,78.3,9.5,2.7
1,2002-06-01,2002,6,1,7,Delta Air Lines Inc.: DL,620,ATL,33.63,-84.42,...,6.9,738,55.0,TPA,27.97,-82.53,TAMPA INTERNATIONAL AIRPORT,79.1,9.9,5.5
2,2002-06-01,2002,6,1,7,Delta Air Lines Inc.: DL,620,ATL,33.63,-84.42,...,6.9,740,9.0,MCO,28.42,-81.3,ORLANDO INTERNATIONAL AIRPORT,77.4,9.6,5.7


In [6]:
# Column names of the sampled BigQuery result, converted to plain strings.
ALL_COLUMNS = list(map(str, aj_sample.keys()))
print(ALL_COLUMNS)

['DATE', 'YEAR', 'MONTH', 'DAY', 'DEP_DOW', 'AIRLINE', 'DEP_T', 'DEP', 'DEP_LAT', 'DEP_LON', 'DEP_W', 'MEAN_TEMP_DEP', 'MEAN_VIS_DEP', 'WND_SPD_DEP', 'ARR_T', 'ARR_DELAY', 'ARR', 'ARR_LAT', 'ARR_LON', 'ARR_W', 'MEAN_TEMP_ARR', 'MEAN_VIS_ARR', 'WND_SPD_ARR']


In [7]:
# Take a copy rather than a second reference to the same list, so that later
# changes to ALL_COLUMNS cannot silently alter OUTPUT_COLUMNS (and vice versa).
OUTPUT_COLUMNS = list(ALL_COLUMNS)

---
# Repeatable random subsets 

### Here's how we create repeatable random subsets for training, evaluation, and test

In [8]:
def sample_query(total, lower, upper, table='going-tfx.examples.ATL_JUNE_RAW'):
    """Build a BigQuery standard-SQL query selecting a repeatable subset of rows.

    Every row is assigned the bucket value
    ``MOD(ABS(FARM_FINGERPRINT(CONCAT(DATE, AIRLINE, ARR))) + DEP_T, total)``;
    the query keeps the rows whose bucket lies in the half-open range
    ``[lower, upper)``. Because the bucket depends only on column values, the
    same arguments always select the same rows.

    Args:
        total: modulus, i.e. the number of buckets.
        lower: first bucket to include.
        upper: first bucket to exclude.
        table: fully qualified table to read from; defaults to the table this
            notebook has always used.

    Returns:
        The query text as a string.
    """
    return """
        SELECT
            *
        FROM `{table}` 

        where
          MOD(ABS(FARM_FINGERPRINT(
            CONCAT(DATE,AIRLINE,ARR)
          )) + DEP_T, {total}) >= {lower} 
        and
          MOD(ABS(FARM_FINGERPRINT(
            CONCAT( DATE, AIRLINE, ARR)
           )) + DEP_T, {total}) < {upper} 
    """.format(table=table, total=total, lower=lower, upper=upper)

In [9]:
def sample_queries(fractions, rate=0.1):
    """Build the 'train', 'eval' and 'test' sampling queries.

    Each fraction, truncated to an int, becomes the width of a bucket range;
    the ranges are laid out back to back starting at bucket 0. All queries
    share the modulus ``int(sum(fractions) / rate)``.

    Returns:
        A dict pairing 'train', 'eval' and 'test', in that order, with the
        query built for the corresponding entry of ``fractions``.
    """
    modulus = int(sum(fractions) / rate)
    split_names = ['train', 'eval', 'test']
    split_queries = []
    lower = 0
    for fraction in fractions:
        upper = lower + int(fraction)
        split_queries.append(sample_query(modulus, lower, upper))
        lower = upper
    return dict(zip(split_names, split_queries))

In [10]:
# 80/10/10 split of a 10% sample: with modulus 1000 the three queries cover
# buckets 0-79 (train), 80-89 (eval) and 90-99 (test).
queries = sample_queries([80,10,10], .1)

In [11]:
# Show the SQL generated for the evaluation split.
print(queries['eval'])


        SELECT
            *
        FROM `going-tfx.examples.ATL_JUNE_RAW` 

        where
          MOD(ABS(FARM_FINGERPRINT(
            CONCAT(DATE,AIRLINE,ARR)
          )) + DEP_T, 1000) >= 80 
        and
          MOD(ABS(FARM_FINGERPRINT(
            CONCAT( DATE, AIRLINE, ARR)
           )) + DEP_T, 1000) < 90 
    


#### A super-small random subset 

In [12]:
# One bucket out of 10000: a very small sample for quick local experiments.
tiny_query = sample_query(10000, 0, 1)
sample = (dlbq.Query(tiny_query)
          .execute()
          .result()
          .to_dataframe())
print('Only {} examples. Showing first three:'.format(len(sample)))
sample.head(3)

Only 40 examples. Showing first three:


Unnamed: 0,DATE,YEAR,MONTH,DAY,DEP_DOW,AIRLINE,DEP_T,DEP,DEP_LAT,DEP_LON,...,WND_SPD_DEP,ARR_T,ARR_DELAY,ARR,ARR_LAT,ARR_LON,ARR_W,MEAN_TEMP_ARR,MEAN_VIS_ARR,WND_SPD_ARR
0,2004-06-07,2004,6,7,2,Delta Air Lines Inc.: DL,842,ATL,33.63,-84.42,...,4.4,949,16.0,SAV,32.12,-81.2,SAVANNAH/HILTON HEAD INTL AIR,79.4,9.8,5.7
1,2006-06-27,2006,6,27,3,Delta Air Lines Inc.: DL,2130,ATL,33.63,-84.42,...,6.3,2354,45.0,PVD,41.72,-71.42,PROVIDENCE T F GREEN ARPT,74.7,10.0,10.1
2,2005-06-22,2005,6,22,4,America West Airlines Inc.: HP (Merged with US...,805,ATL,33.63,-84.42,...,6.1,916,-25.0,PHX,33.43,-112.01,PHOENIX SKY HARBOR INTL AIRPO,102.4,10.0,8.5


---
# Reading from BigQuery into a Beam pipeline

In [13]:
# Raw columns grouped by the TensorFlow dtype assigned to them when the raw-data
# schema is built: tf.string, tf.int64 and tf.float32 respectively.
# Columns declared as tf.string.
STRING_COLUMNS = ['DATE', 'AIRLINE', 'DEP', 'DEP_W', 'ARR', 'ARR_W']
# Columns declared as tf.int64.
INT_COLUMNS = ['YEAR', 'MONTH', 'DAY', 'DEP_DOW', 'DEP_T', 'ARR_T']
# Columns declared as tf.float32.
FLOAT_COLUMNS = ['DEP_LAT', 'DEP_LON', 'MEAN_TEMP_DEP', 'MEAN_VIS_DEP', 'WND_SPD_DEP', 'ARR_DELAY', 'ARR_LAT', 'ARR_LON', 'MEAN_TEMP_ARR', 'MEAN_VIS_ARR', 'WND_SPD_ARR']

In [14]:
# Describe every raw column as a scalar (shape []) feature with a fixed
# representation, using the dtype of the column group it belongs to.
raw_data_schema = {
    col: dataset_schema.ColumnSchema(t, [], dataset_schema.FixedColumnRepresentation())
    for t, cols in [(tf.string, STRING_COLUMNS), (tf.float32, FLOAT_COLUMNS), (tf.int64, INT_COLUMNS)]
    for col in cols
}
# Metadata wrapper handed to tf.Transform together with the raw PCollection.
raw_data_metadata = dataset_metadata.DatasetMetadata(dataset_schema.Schema(raw_data_schema))

In [15]:
# True: run the development pipeline locally; False: submit it to Dataflow.
in_test_mode = True
OUTPUT_DIR="./out"
# Dataflow only accepts job names made of lowercase letters, digits and hyphens,
# so the prefix uses a hyphen (as exec_pipeline_prod does) rather than an underscore.
job_name = 'tft-tutorial' + '-' + datetime.datetime.now().strftime('%y%m%d-%H%M%S')

options = {
    'staging_location': os.path.join(OUTPUT_DIR, 'tmp', 'staging'),
    'temp_location': os.path.join(OUTPUT_DIR, 'tmp'),
    'job_name': job_name,
    'project': PROJECT,
    'max_num_workers': 24,
    'teardown_policy': 'TEARDOWN_ALWAYS',
    'no_save_main_session': True,
    # Same requirements file that exec_pipeline_prod passes to its pipeline.
    'requirements_file': 'dataflow_requirements.txt'
}
# Pipeline options and runner name used by exec_pipeline.
opts = beam.pipeline.PipelineOptions(flags=[], **options)
RUNNER = 'DirectRunner' if in_test_mode else 'DataflowRunner'

---
# Developing the pre-processing function

In [16]:
# Restrict TensorFlow logging to messages of level ERROR.
tf.logging.set_verbosity(tf.logging.ERROR)

---
### This simple pipeline reads the data, transforms it, and writes the result to a CSV file

In [24]:
# Scratch check of a conditional header expression: no header text unless
# the flag is set.
wh = False
header = None if not wh else "Header"
print(header)

None


In [26]:
def exec_pipeline(query, preprocessing_fn, output_columns, out_name = 'atl_june_transformed', write_header=False):
    """Run read -> analyze/transform -> CSV on the configured runner and load the result.

    The pipeline reads the rows returned by ``query`` from BigQuery, applies
    ``preprocessing_fn`` through ``AnalyzeAndTransformDataset``, encodes the
    transformed rows as CSV and writes them to ``TMPDIR/<out_name>-*-of-*``.
    The written shards are then read back into a single pandas DataFrame.

    Args:
        query: standard-SQL query whose rows are the pipeline input.
        preprocessing_fn: tf.Transform preprocessing function.
        output_columns: names and order of the columns in the CSV output.
        out_name: file-name prefix of the output shards inside TMPDIR.
        write_header: when True, each shard starts with a header line made of
            ``output_columns``.

    Returns:
        A DataFrame with the rows of all output shards; its columns are
        ``output_columns`` whether or not a header line was written.

    Raises:
        IOError: if the pipeline produced no output shard.

    Side effects:
        Shards left in TMPDIR by an earlier run with the same ``out_name`` are
        deleted before the pipeline starts, so they cannot be mistaken for the
        new result.
    """
    import glob
    import shutil

    header = ",".join(output_columns) if write_header else None

    out_prefix = os.path.join(TMPDIR, out_name)
    # WriteToText appends '-SSSSS-of-NNNNN' to the prefix by default.
    shard_pattern = out_prefix + '-*-of-*'

    for stale_shard in glob.glob(shard_pattern):
        os.remove(stale_shard)

    # tf.Transform needs a scratch directory; remove it afterwards instead of
    # leaving one behind in the system temp dir on every call.
    transform_tmp = tempfile.mkdtemp()
    try:
        with beam.Pipeline(RUNNER, options=opts) as p:
            with beam_impl.Context(temp_dir=transform_tmp):

                # Read from BigQuery
                raw_data = p | "ReadFromBigQuery" >> beam.io.Read(
                    beam.io.BigQuerySource(query=query, use_standard_sql=True))
                raw_dataset = (raw_data, raw_data_metadata)

                # Analyze and transform by calling a single function that has
                # all the tft.transforms in it. The transform function itself
                # is not needed here.
                t_dataset, _ = (raw_dataset | beam_impl.AnalyzeAndTransformDataset(preprocessing_fn))
                t_data, t_metadata = t_dataset

                # Encode back to CSV file(s)
                csv_encode = tft.coders.CsvCoder(output_columns, t_metadata.schema).encode
                _ = (t_data
                     | beam.Map(csv_encode)
                     | beam.io.WriteToText(file_path_prefix=out_prefix, header=header))
    finally:
        shutil.rmtree(transform_tmp, ignore_errors=True)

    # Return a pandas dataframe containing the result of every shard.
    shards = sorted(glob.glob(shard_pattern))
    if not shards:
        raise IOError('No output files matching {} were written'.format(shard_pattern))

    # A shard without any content cannot be parsed by read_csv; skip it.
    non_empty = [shard for shard in shards if os.path.getsize(shard) > 0]
    if write_header:
        frames = [pd.read_csv(shard) for shard in non_empty]
    else:
        # Without a header line the first data row must not be taken as one.
        frames = [pd.read_csv(shard, header=None, names=output_columns) for shard in non_empty]

    if not frames:
        return pd.DataFrame(columns=output_columns)
    return pd.concat(frames, ignore_index=True)

---
#### Step 1: Do nothing

In [27]:
def do_nothing(inputs):
    """Identity preprocessing function: hand the inputs back untouched."""
    unchanged = inputs
    return unchanged

In [29]:
# Run the tiny sample through the identity preprocessing function and show
# the first three rows of the result.
res = exec_pipeline(
    query=tiny_query,
    preprocessing_fn=do_nothing,
    output_columns=ALL_COLUMNS,
    write_header=True)
res.head(3)



Unnamed: 0,DATE,YEAR,MONTH,DAY,DEP_DOW,AIRLINE,DEP_T,DEP,DEP_LAT,DEP_LON,...,WND_SPD_DEP,ARR_T,ARR_DELAY,ARR,ARR_LAT,ARR_LON,ARR_W,MEAN_TEMP_ARR,MEAN_VIS_ARR,WND_SPD_ARR
0,2004-06-07,2004,6,7,2,Delta Air Lines Inc.: DL,842,ATL,33.63,-84.42,...,4.4,949,16.0,SAV,32.12,-81.2,SAVANNAH/HILTON HEAD INTL AIR,79.4,9.8,5.7
1,2006-06-27,2006,6,27,3,Delta Air Lines Inc.: DL,2130,ATL,33.63,-84.42,...,6.3,2354,45.0,PVD,41.72,-71.42,PROVIDENCE T F GREEN ARPT,74.7,10.0,10.1
2,2005-06-22,2005,6,22,4,America West Airlines Inc.: HP (Merged with US...,805,ATL,33.63,-84.42,...,6.1,916,-25.0,PHX,33.43,-112.01,PHOENIX SKY HARBOR INTL AIRPO,102.4,10.0,8.5


---
#### Step 2: Select only the useful columns

In [30]:
# Columns kept by select_cols, in the order in which they are passed to the
# pipelines as output columns. Compared with the raw table this leaves out
# DATE, DAY, DEP_W, ARR_T and ARR_W.
SELECTED_COLUMNS=[
    'YEAR', 'MONTH', 'DEP_DOW', 'AIRLINE', 
    'DEP_T', 'DEP', 'DEP_LAT', 'DEP_LON', 
    'MEAN_TEMP_DEP', 'MEAN_VIS_DEP', 'WND_SPD_DEP', 
    'ARR_DELAY', 
    'ARR', 'ARR_LAT', 'ARR_LON', 
    'MEAN_TEMP_ARR', 'MEAN_VIS_ARR', 'WND_SPD_ARR']

In [31]:
def select_cols(row):
    """Return a new dict holding only the SELECTED_COLUMNS entries of ``row``.

    Raises KeyError if ``row`` lacks one of the selected columns.
    """
    selected = {}
    for key in SELECTED_COLUMNS:
        selected[key] = row[key]
    return selected

# Try the column selection on the first record of the tiny sample.
one_row = sample.to_dict(orient='records')[0]
print(select_cols(one_row))

{'WND_SPD_DEP': 4.4, 'WND_SPD_ARR': 5.7, 'MEAN_TEMP_ARR': 79.4, 'DEP': 'ATL', 'ARR_LON': -81.2, 'MEAN_VIS_DEP': 6.0, 'DEP_T': 842, 'ARR': 'SAV', 'DEP_LON': -84.42, 'DEP_DOW': 2, 'MEAN_VIS_ARR': 9.8, 'ARR_LAT': 32.12, 'AIRLINE': 'Delta Air Lines Inc.: DL', 'YEAR': 2004, 'ARR_DELAY': 16.0, 'DEP_LAT': 33.63, 'MONTH': 6, 'MEAN_TEMP_DEP': 75.2}


In [32]:
# Same pipeline, now keeping only the selected columns.
res = exec_pipeline(
    query=tiny_query,
    preprocessing_fn=select_cols,
    output_columns=SELECTED_COLUMNS,
    write_header=True)
res.head(3)



Unnamed: 0,YEAR,MONTH,DEP_DOW,AIRLINE,DEP_T,DEP,DEP_LAT,DEP_LON,MEAN_TEMP_DEP,MEAN_VIS_DEP,WND_SPD_DEP,ARR_DELAY,ARR,ARR_LAT,ARR_LON,MEAN_TEMP_ARR,MEAN_VIS_ARR,WND_SPD_ARR
0,2002,6,1,Delta Air Lines Inc.: DL,1030,ATL,33.63,-84.42,74.5,6.9,9.8,7.0,RDU,35.87,-78.78,78.6,9.0,4.6
1,2002,6,2,Delta Air Lines Inc.: DL,1425,ATL,33.63,-84.42,74.6,10.0,6.1,-12.0,CAE,33.93,-81.11,75.7,9.9,2.9
2,2002,6,2,Delta Air Lines Inc.: DL,1150,ATL,33.63,-84.42,73.1,9.9,6.2,37.0,DFW,32.89,-97.03,75.8,9.9,3.3


---
#### Step 3: Scaling floats

In [33]:
def scale_floats(row, columns=('MEAN_TEMP_DEP', 'MEAN_VIS_DEP', 'WND_SPD_DEP',
                               'MEAN_TEMP_ARR', 'MEAN_VIS_ARR', 'WND_SPD_ARR')):
    """Return a copy of ``row`` with the given columns passed through tft.scale_to_0_1.

    The input mapping is left untouched, so the function can be called on a
    row that is still needed in its original form.

    Args:
        row: mapping from column name to value.
        columns: names of the columns to scale; defaults to the six weather
            columns this notebook has always scaled.

    Returns:
        A new dict with the same keys as ``row``; the entries named in
        ``columns`` hold the scaled values, all others are carried over.

    Raises:
        KeyError: if ``row`` lacks one of ``columns``.
    """
    scaled = dict(row)
    for c in columns:
        scaled[c] = tft.scale_to_0_1(row[c])
    return scaled

In [34]:
def pre_processor(row):
    """Preprocessing function: column selection followed by float scaling.

    Works on a copy of ``row``; the copy goes through select_cols and the
    result of that through scale_floats.
    """
    return scale_floats(select_cols(row.copy()))

In [36]:
# Full preprocessing (selection + scaling) on the tiny sample.
res = exec_pipeline(
    query=tiny_query,
    preprocessing_fn=pre_processor,
    output_columns=SELECTED_COLUMNS,
    write_header=True)
res.head(3)



Unnamed: 0,YEAR,MONTH,DEP_DOW,AIRLINE,DEP_T,DEP,DEP_LAT,DEP_LON,MEAN_TEMP_DEP,MEAN_VIS_DEP,WND_SPD_DEP,ARR_DELAY,ARR,ARR_LAT,ARR_LON,MEAN_TEMP_ARR,MEAN_VIS_ARR,WND_SPD_ARR
0,2004,6,2,Delta Air Lines Inc.: DL,842,ATL,33.63,-84.42,0.33125,0.0,0.075,16.0,SAV,32.12,-81.2,0.507495,0.96,0.222973
1,2006,6,3,Delta Air Lines Inc.: DL,2130,ATL,33.63,-84.42,0.2125,0.125,0.3125,45.0,PVD,41.72,-71.42,0.406852,1.0,0.52027
2,2005,6,4,America West Airlines Inc.: HP (Merged with US...,805,ATL,33.63,-84.42,0.23125,0.5,0.2875,-25.0,PHX,33.43,-112.01,1.0,1.0,0.412162


---
# Create the big files
Creating the files for training and evaluation

In [37]:
!cat dataflow_requirements.txt

tensorflow-transform

In [43]:
def exec_pipeline_prod (preprocessing_fn, output_columns, fractions, sample_rate, runner='DirectRunner', write_header=False):
    """Write transformed training and evaluation CSV files plus the transform function to GCS.

    The training split is read from BigQuery, analyzed and transformed with
    ``preprocessing_fn``; the evaluation split is transformed with the
    transform function derived from the training split. Both are written as
    CSV under ``gs://<BUCKET>/prod`` (prefixes ``atl_june_train`` and
    ``atl_june_eval``) and the transform function is saved under
    ``gs://<BUCKET>/prod/metadata``. The 'test' query produced by
    ``sample_queries`` is not used here.

    Args:
        preprocessing_fn: tf.Transform preprocessing function.
        output_columns: names and order of the columns in the CSV output.
        fractions: split sizes handed to ``sample_queries``.
        sample_rate: sampling rate handed to ``sample_queries``.
        runner: Beam runner name, e.g. 'DirectRunner' or 'DataflowRunner'.
        write_header: when True, each output shard starts with a header line
            made of ``output_columns``.
    """
    header = ",".join(output_columns) if write_header else None

    # GCS object names always use '/', so the paths below are built with
    # explicit separators instead of the platform-dependent os.path.join.
    tmpdir = 'gs://{}/tmp'.format(BUCKET)
    proddir = 'gs://{}/prod'.format(BUCKET)

    job_name = 'tft-tutorial' + '-' + datetime.datetime.now().strftime('%y%m%d-%H%M%S')

    options = {
        'staging_location': tmpdir,
        'temp_location': tmpdir,
        'job_name': job_name,
        'project': PROJECT,
        'max_num_workers': 24,
        'teardown_policy': 'TEARDOWN_ALWAYS',
        'no_save_main_session': True,
        'requirements_file': 'dataflow_requirements.txt'
    }
    opts = beam.pipeline.PipelineOptions(flags=[], **options)

    queries = sample_queries(fractions, sample_rate)

    def read_raw(pipeline, split):
        """Read one split from BigQuery and pair it with the raw-data metadata."""
        label = "ReadFromBigQuery_{}".format(split)
        source = beam.io.BigQuerySource(query=queries[split], use_standard_sql=True)
        return (pipeline | label >> beam.io.Read(source), raw_data_metadata)

    def write_csv(data, schema, stage, file_name):
        """CSV-encode transformed rows and write them to <proddir>/<file_name>-*."""
        csv_encode = tft.coders.CsvCoder(output_columns, schema).encode
        _ = (data
             | "Encode{}".format(stage) >> beam.Map(csv_encode)
             | "Write{}".format(stage) >> beam.io.WriteToText(
                 file_path_prefix='{}/{}'.format(proddir, file_name), header=header))

    with beam.Pipeline(runner, options=opts) as p:
        with beam_impl.Context(temp_dir=tmpdir):

            # Training data: analyze and transform by calling a single
            # function that has all the tft.transforms in it.
            t_dataset, transform_fn = (read_raw(p, 'train')
                                       | "AnalyzeAndTransform" >> beam_impl.AnalyzeAndTransformDataset(preprocessing_fn))
            t_data, t_metadata = t_dataset
            write_csv(t_data, t_metadata.schema, 'Training', 'atl_june_train')

            # Evaluation data: reuse the transform function derived from the
            # training data; the output schema is therefore the same.
            t_eval_data, _ = ((read_raw(p, 'eval'), transform_fn)
                              | "TransformEval" >> beam_impl.TransformDataset())
            write_csv(t_eval_data, t_metadata.schema, 'Eval', 'atl_june_eval')

            # Save the transformation function for use at serving time.
            transform_fn | 'WriteTransformFn' >> transform_fn_io.WriteTransformFn(
                '{}/{}'.format(proddir, 'metadata'))
            

In [44]:
!rm -f /tmp/*_june*
!ls /tmp

dataflow-requirements-cache  tmpb1cKdO	tmpFZwfeZ  tmpMXQtcR  tmpuTssa9
prod			     tmpCG3lqI	tmpHwbyq3  tmpNTwuRG  tmpxDPw8B
tmp4TS4t5		     tmpCQJByy	tmpkpwwCT  tmp_sbnfQ


In [45]:
!gsutil ls gs://going-tfx/prod

CommandException: One or more URLs matched no objects.


In [46]:
!gsutil -m rm -rf gs://going-tfx/prod
_ = !gsutil -m rm -rf gs://going-tfx/tmp/

CommandException: 1 files/objects could not be removed.


#### This takes about two minutes. Be patient.

In [30]:
# Local run: no runner is given, so the default 'DirectRunner' is used, on a 10% sample.
exec_pipeline_prod (preprocessing_fn=pre_processor, output_columns=SELECTED_COLUMNS, fractions=[80, 10, 10], sample_rate=0.1)



---
#### This executes in Dataflow and takes some 12 - 20 minutes
If you watch the job graph in the Dataflow console, you will see that the job stays alive until its VMs are shut down. The output files may be available a bit earlier.

In [47]:
# Full run: sample_rate=1.0 with the 'DataflowRunner'.
exec_pipeline_prod (preprocessing_fn=pre_processor, output_columns=SELECTED_COLUMNS, fractions=[80, 10, 10], sample_rate=1.0, runner = 'DataflowRunner')

In [48]:
!gsutil ls gs://going-tfx/prod/*

gs://going-tfx/prod/
gs://going-tfx/prod/atl_june_eval-00000-of-00002
gs://going-tfx/prod/atl_june_eval-00001-of-00002
gs://going-tfx/prod/atl_june_train-00000-of-00005
gs://going-tfx/prod/atl_june_train-00001-of-00005
gs://going-tfx/prod/atl_june_train-00002-of-00005
gs://going-tfx/prod/atl_june_train-00003-of-00005
gs://going-tfx/prod/atl_june_train-00004-of-00005

gs://going-tfx/prod/metadata/:
gs://going-tfx/prod/metadata/
gs://going-tfx/prod/metadata/transform_fn/
gs://going-tfx/prod/metadata/transformed_metadata/


---
#### Copy the files into a local directory

In [49]:
mkdir /tmp/prod

mkdir: cannot create directory ‘/tmp/prod’: File exists


In [50]:
!gsutil -m cp gs://going-tfx/prod/atl_june_eval* /tmp/prod/
_ = !gsutil -m cp gs://going-tfx/prod/atl_june_train* /tmp/prod/

Copying gs://going-tfx/prod/atl_june_eval-00001-of-00002...
Copying gs://going-tfx/prod/atl_june_eval-00000-of-00002...
/ [2/2 files][  4.1 MiB/  4.1 MiB] 100% Done                                    
Operation completed over 2 objects/4.1 MiB.                                      


---
#### Have a look at the result

In [54]:
# Column names for reading the CSV shards back; the list has the same entries,
# in the same order, as SELECTED_COLUMNS.
# NOTE(review): this rebinds ALL_COLUMNS, which earlier cells derive from the
# raw BigQuery sample, so re-running those cells after this one gives them a
# different column list — consider a separate name.
ALL_COLUMNS = [
    'YEAR', 'MONTH', 'DEP_DOW', 'AIRLINE', 
    'DEP_T', 'DEP', 'DEP_LAT', 'DEP_LON', 
    'MEAN_TEMP_DEP', 'MEAN_VIS_DEP', 'WND_SPD_DEP', 
    'ARR_DELAY', 'ARR', 'ARR_LAT', 'ARR_LON', 
    'MEAN_TEMP_ARR', 'MEAN_VIS_ARR', 'WND_SPD_ARR']

In [51]:
# Pick the local copy of training shard 00001 and count its lines.
a_training_file = !ls /tmp/prod/atl_june_train-00001-of-*
# The shell capture is a list of matching paths; keep the first one.
a_training_file = a_training_file[0]
!wc -l $a_training_file

29801 /tmp/prod/atl_june_train-00001-of-00005


In [55]:
# Column names are supplied explicitly, so every line of the file is read as data.
t1 = pd.read_csv(a_training_file, names=ALL_COLUMNS)

In [56]:
# Summary statistics of the numeric columns.
# NOTE(review): in the recorded output MEAN_VIS_ARR and WND_SPD_ARR have means
# below 0.01 after 0-1 scaling, which suggests extreme values in the raw data
# (possibly sentinel values) — confirm before training on these columns.
t1.describe()

Unnamed: 0,YEAR,MONTH,DEP_DOW,DEP_T,DEP_LAT,DEP_LON,MEAN_TEMP_DEP,MEAN_VIS_DEP,WND_SPD_DEP,ARR_DELAY,ARR_LAT,ARR_LON,MEAN_TEMP_ARR,MEAN_VIS_ARR,WND_SPD_ARR
count,29801.0,29801.0,29801.0,29801.0,29801.0,29801.0,29801.0,29801.0,29801.0,29801.0,29801.0,29801.0,29801.0,29801.0,29801.0
mean,2007.119962,6.0,3.95574,1466.346767,33.63,-84.42,0.636172,0.855615,0.355867,12.971276,36.015935,-95.366662,0.547756,0.008777,0.006683
std,2.656701,0.0,1.974099,459.790567,0.0,0.0,0.18359,0.179274,0.140529,36.297191,5.352652,16.99609,0.138877,0.00583,0.002888
min,2002.0,6.0,1.0,655.0,33.63,-84.42,0.0,0.0,0.0,-50.0,26.53,-122.37,0.141287,0.002702,0.0003
25%,2005.0,6.0,2.0,1028.0,33.63,-84.42,0.504202,0.765625,0.255814,-7.0,32.12,-111.97,0.433281,0.008506,0.0048
50%,2007.0,6.0,4.0,1455.0,33.63,-84.42,0.634454,0.921875,0.348837,3.0,37.5,-90.37,0.585557,0.009206,0.006101
75%,2009.0,6.0,6.0,1854.0,33.63,-84.42,0.773109,0.984375,0.434109,18.0,38.74,-81.2,0.660911,0.009407,0.008001
max,2011.0,6.0,7.0,2331.0,33.63,-84.42,1.0,1.0,1.0,593.0,47.45,-77.31,0.822606,1.0,0.024102
