In [1]:
from __future__ import division
from __future__ import print_function
from __future__ import absolute_import

In [3]:
import datetime
import glob
import os
import shutil
import tempfile

import apache_beam as beam
import pandas as pd

import tensorflow as tf
import tensorflow_transform as tft
import tensorflow_transform.beam.impl as beam_impl
from tensorflow_transform.beam.tft_beam_io import transform_fn_io
from tensorflow_transform.tf_metadata import dataset_metadata
from tensorflow_transform.tf_metadata import dataset_schema

import google.datalab.bigquery as dlbq

In [4]:
%load_ext google.cloud.bigquery

In [5]:
# Notebook-wide configuration.
# PROJECT is passed as the 'project' pipeline option of the Beam jobs below.
PROJECT='going-tfx'
# BUCKET names the GCS bucket under which exec_pipeline_prod writes gs://<BUCKET>/tmp and gs://<BUCKET>/prod.
BUCKET='going-tfx'
# NOTE(review): DATASET is not referenced by the visible cells; the queries spell out `going-tfx.examples` literally.
DATASET='examples'
# Local directory that receives the CSV output of the small development pipeline (exec_pipeline).
TMPDIR='/tmp'

---
# Raw data in BigQuery
We collected the raw data that we use from various sources into a single denormalized table.

In [6]:
%%bigquery aj_sample
select * FROM `going-tfx.examples.ATL_JUNE_RAW` limit 3

Unnamed: 0,DATE,YEAR,MONTH,DAY,DEP_DOW,AIRLINE,DEP_T,DEP,DEP_LAT,DEP_LON,...,WND_SPD_DEP,ARR_T,ARR_DELAY,ARR,ARR_LAT,ARR_LON,ARR_W,MEAN_TEMP_ARR,MEAN_VIS_ARR,WND_SPD_ARR
0,2002-06-01,2002,6,1,7,US Airways Inc.: US (Merged with America West ...,610,ATL,33.63,-84.42,...,6.9,712,-12.0,CLT,35.21,-80.94,CHARLOTTE/DOUGLAS INTERNATION,78.3,9.5,2.7
1,2002-06-01,2002,6,1,7,Delta Air Lines Inc.: DL,620,ATL,33.63,-84.42,...,6.9,738,55.0,TPA,27.97,-82.53,TAMPA INTERNATIONAL AIRPORT,79.1,9.9,5.5
2,2002-06-01,2002,6,1,7,Delta Air Lines Inc.: DL,620,ATL,33.63,-84.42,...,6.9,740,9.0,MCO,28.42,-81.3,ORLANDO INTERNATIONAL AIRPORT,77.4,9.6,5.7


In [7]:
# Column names of the three-row BigQuery sample fetched above, converted to plain str.
ALL_COLUMNS = [str(key) for key in aj_sample.keys()]
print(ALL_COLUMNS)

['DATE', 'YEAR', 'MONTH', 'DAY', 'DEP_DOW', 'AIRLINE', 'DEP_T', 'DEP', 'DEP_LAT', 'DEP_LON', 'DEP_W', 'MEAN_TEMP_DEP', 'MEAN_VIS_DEP', 'WND_SPD_DEP', 'ARR_T', 'ARR_DELAY', 'ARR', 'ARR_LAT', 'ARR_LON', 'ARR_W', 'MEAN_TEMP_ARR', 'MEAN_VIS_ARR', 'WND_SPD_ARR']


In [8]:
# Alias (not a copy) of ALL_COLUMNS: mutating one list mutates the other.
# NOTE(review): OUTPUT_COLUMNS is not used by the visible cells -- confirm it is still needed.
OUTPUT_COLUMNS = ALL_COLUMNS

---
# Repeatable random subsets 

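### Here's how we create repeatable random subsets for training, evaluation, and testing
Each row is assigned to a bucket by hashing its `DATE`, `AIRLINE` and `ARR` values (plus `DEP_T`), so the same row always lands in the same subset no matter how often the query runs.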

In [9]:
def sample_query(total, lower, upper, table='going-tfx.examples.ATL_JUNE_RAW'):
    """Build a standard-SQL query selecting a repeatable pseudo-random subset.

    Every row is mapped to a bucket in ``[0, total)`` by fingerprinting the
    concatenation of DATE, AIRLINE and ARR, adding DEP_T and taking the result
    modulo ``total``. The query keeps the rows whose bucket lies in the
    half-open range ``[lower, upper)``. Because the bucket depends only on the
    row's own values, the same rows are selected on every run.

    Args:
        total: number of buckets (the modulus); must be positive.
        lower: first bucket to include.
        upper: first bucket to exclude; must not be smaller than ``lower``.
        table: fully qualified BigQuery table to read from. Defaults to the
            table this notebook has always used.

    Returns:
        The query text as a string.

    Raises:
        ValueError: if ``total`` is not positive or ``lower`` exceeds ``upper``.
    """
    if total <= 0:
        raise ValueError('total must be positive, got {!r}'.format(total))
    if lower > upper:
        raise ValueError(
            'lower ({!r}) must not exceed upper ({!r})'.format(lower, upper))

    # The bucket expression is needed for both bounds; build it once so the two
    # predicates cannot drift apart.
    # NOTE(review): BigQuery's ABS() raises an error for the most negative
    # INT64, which FARM_FINGERPRINT can in principle return. The expression is
    # kept unchanged so that existing splits stay reproducible.
    bucket = (
        'MOD(ABS(FARM_FINGERPRINT(CONCAT(DATE, AIRLINE, ARR))) + DEP_T, {total})'
    ).format(total=total)

    return """
        SELECT
            *
        FROM `{table}`
        WHERE
            {bucket} >= {lower}
            AND {bucket} < {upper}
    """.format(table=table, bucket=bucket, lower=lower, upper=upper)

In [10]:
def sample_queries(fractions, rate=0.1):
    """Build the 'train', 'eval' and 'test' sampling queries.

    The splits occupy consecutive, non-overlapping bucket ranges whose widths
    are the (integer-truncated) entries of ``fractions``. The number of
    buckets is ``sum(fractions) / rate``, so ``rate`` is the share of the
    whole table that all splits together cover.

    Args:
        fractions: relative sizes of the splits, in the order train, eval,
            test (e.g. ``[80, 10, 10]``). Entries beyond the third are ignored.
        rate: overall sampling rate, ``0 < rate <= 1``.

    Returns:
        A dict mapping split name to query string.

    Raises:
        ValueError: if ``rate`` is outside ``(0, 1]``. A rate above 1 would
            make the bucket ranges exceed the modulus, and a rate of 0 would
            divide by zero.
    """
    if not 0 < rate <= 1:
        raise ValueError('rate must be in (0, 1], got {!r}'.format(rate))

    # Round instead of truncating so that floating-point noise in the division
    # cannot silently drop one bucket from the modulus.
    total = int(round(sum(fractions) / rate))

    queries = {}
    start = 0
    for split, fraction in zip(['train', 'eval', 'test'], fractions):
        width = int(fraction)
        queries[split] = sample_query(total, start, start + width)
        start += width
    return queries

In [11]:
queries = sample_queries([80,10,10], .1)

In [12]:
print(queries['eval'])


        SELECT
            *
        FROM `going-tfx.examples.ATL_JUNE_RAW` 

        where
          MOD(ABS(FARM_FINGERPRINT(
            CONCAT(DATE,AIRLINE,ARR)
          )) + DEP_T, 1000) >= 80 
        and
          MOD(ABS(FARM_FINGERPRINT(
            CONCAT( DATE, AIRLINE, ARR)
           )) + DEP_T, 1000) < 90 
    


#### A super-small random subset 

In [13]:
# Keep only bucket 0 out of 10000 buckets: a very small but repeatable sample.
tiny_query = sample_query(10000, 0, 1)
# Run the query through the datalab BigQuery client and pull the result into a pandas DataFrame.
sample = dlbq.Query(tiny_query).execute().result().to_dataframe()
print('Only {} examples. Showing first three:'.format(len(sample)))
sample[:3]

Only 40 examples. Showing first three:


Unnamed: 0,DATE,YEAR,MONTH,DAY,DEP_DOW,AIRLINE,DEP_T,DEP,DEP_LAT,DEP_LON,...,WND_SPD_DEP,ARR_T,ARR_DELAY,ARR,ARR_LAT,ARR_LON,ARR_W,MEAN_TEMP_ARR,MEAN_VIS_ARR,WND_SPD_ARR
0,2004-06-07,2004,6,7,2,Delta Air Lines Inc.: DL,842,ATL,33.63,-84.42,...,4.4,949,16.0,SAV,32.12,-81.2,SAVANNAH/HILTON HEAD INTL AIR,79.4,9.8,5.7
1,2006-06-27,2006,6,27,3,Delta Air Lines Inc.: DL,2130,ATL,33.63,-84.42,...,6.3,2354,45.0,PVD,41.72,-71.42,PROVIDENCE T F GREEN ARPT,74.7,10.0,10.1
2,2005-06-22,2005,6,22,4,America West Airlines Inc.: HP (Merged with US...,805,ATL,33.63,-84.42,...,6.1,916,-25.0,PHX,33.43,-112.01,PHOENIX SKY HARBOR INTL AIRPO,102.4,10.0,8.5


---
# Reading from BigQuery into a Beam pipeline

In [14]:
# Raw columns grouped by the TensorFlow dtype they get in raw_data_schema below:
# tf.string, tf.int64 and tf.float32 respectively.
STRING_COLUMNS = ['DATE', 'AIRLINE', 'DEP', 'DEP_W', 'ARR', 'ARR_W']
INT_COLUMNS = ['YEAR', 'MONTH', 'DAY', 'DEP_DOW', 'DEP_T', 'ARR_T']
FLOAT_COLUMNS = ['DEP_LAT', 'DEP_LON', 'MEAN_TEMP_DEP', 'MEAN_VIS_DEP', 'WND_SPD_DEP', 'ARR_DELAY', 'ARR_LAT', 'ARR_LON', 'MEAN_TEMP_ARR', 'MEAN_VIS_ARR', 'WND_SPD_ARR']

In [15]:
# Describe every raw column as a scalar (shape []) with a fixed representation
# and the TensorFlow dtype of the group it belongs to.
column_groups = (
    (tf.string, STRING_COLUMNS),
    (tf.float32, FLOAT_COLUMNS),
    (tf.int64, INT_COLUMNS),
)

raw_data_schema = {}
for dtype, column_names in column_groups:
    for column_name in column_names:
        raw_data_schema[column_name] = dataset_schema.ColumnSchema(
            dtype, [], dataset_schema.FixedColumnRepresentation())

# Metadata handed to tf.Transform together with the raw PCollection.
raw_data_metadata = dataset_metadata.DatasetMetadata(dataset_schema.Schema(raw_data_schema))

In [16]:
# Options for the small development pipeline (exec_pipeline).
# in_test_mode selects the local DirectRunner; set it to False to submit to Dataflow.
in_test_mode = True
OUTPUT_DIR="./out"

# Use hyphens only: Dataflow job names may contain just [-a-z0-9], and this
# matches the name built in exec_pipeline_prod.
job_name = 'tft-tutorial' + '-' + datetime.datetime.now().strftime('%y%m%d-%H%M%S')

options = {
    'staging_location': os.path.join(OUTPUT_DIR, 'tmp', 'staging'),
    'temp_location': os.path.join(OUTPUT_DIR, 'tmp'),
    'job_name': job_name,
    'project': PROJECT,
    'max_num_workers': 24,
    'teardown_policy': 'TEARDOWN_ALWAYS',
    'no_save_main_session': True,
    # Same requirements file that exec_pipeline_prod uses and that is shown
    # further down in this notebook.
    'requirements_file': 'dataflow_requirements.txt'
}
opts = beam.pipeline.PipelineOptions(flags=[], **options)
RUNNER = 'DirectRunner' if in_test_mode else 'DataflowRunner'

---
# Developing the pre-processing function

In [17]:
tf.logging.set_verbosity(tf.logging.ERROR)

---
### This simple pipeline reads, transforms, and emits the result into a CSV file

In [18]:
def exec_pipeline(query, preprocessing_fn, output_columns, out_name='atl_june_transformed'):
    """Run a small read -> analyze/transform -> CSV pipeline and load the result.

    The pipeline reads the rows returned by ``query`` from BigQuery, applies
    ``preprocessing_fn`` through tf.Transform's AnalyzeAndTransformDataset and
    writes the transformed rows as CSV shards under ``TMPDIR``. It runs with
    the module-level ``RUNNER`` and ``opts``.

    Args:
        query: standard-SQL query whose result is the raw dataset.
        preprocessing_fn: tf.Transform preprocessing function.
        output_columns: column names, in order, written to the CSV (also used
            as the header line).
        out_name: file name prefix of the CSV shards inside ``TMPDIR``.

    Returns:
        A pandas DataFrame holding the rows of all CSV shards that were written.

    Raises:
        IOError: if the pipeline finished without producing any shard.
    """
    out_prefix = os.path.join(TMPDIR, out_name)
    # WriteToText names its shards <prefix>-SSSSS-of-NNNNN.
    shard_pattern = out_prefix + '-*-of-*'

    # Remove shards of an earlier run with the same prefix; a run with a
    # different shard count would otherwise leave stale files behind that get
    # mixed into the result.
    for stale_shard in glob.glob(shard_pattern):
        os.remove(stale_shard)

    # mkdtemp() directories are never cleaned up automatically, so remove the
    # tf.Transform scratch directory once the pipeline has finished.
    transform_tmp = tempfile.mkdtemp()
    try:
        with beam.Pipeline(RUNNER, options=opts) as p:
            with beam_impl.Context(temp_dir=transform_tmp):

                #   Read from BigQuery
                #
                raw_data = p | "ReadFromBigQuery" >> beam.io.Read(
                    beam.io.BigQuerySource(query=query, use_standard_sql=True))
                raw_dataset = (raw_data, raw_data_metadata)

                #   Analyze and transform by calling a single function that has all the tft.transforms in it
                #   (the resulting transform function is not needed here)
                #
                t_dataset, _ = (raw_dataset | beam_impl.AnalyzeAndTransformDataset(preprocessing_fn))
                t_data, t_metadata = t_dataset

                # Encode back to CSV file(s)
                #
                csv_encode = tft.coders.CsvCoder(output_columns, t_metadata.schema).encode
                _ = (t_data
                     | beam.Map(csv_encode)
                     | beam.io.WriteToText(file_path_prefix=out_prefix, header=",".join(output_columns)))
    finally:
        shutil.rmtree(transform_tmp, ignore_errors=True)

    # Return a pandas dataframe containing the result. Every shard carries the
    # header line, so each one is read on its own and the frames are stacked.
    #
    shards = sorted(glob.glob(shard_pattern))
    if not shards:
        raise IOError('pipeline produced no output matching {}'.format(shard_pattern))
    return pd.concat([pd.read_csv(shard) for shard in shards], ignore_index=True)

---
#### Step 1: Do nothing

In [19]:
def do_nothing(inputs):
    """Identity preprocessing function: hand the inputs back untouched."""
    unchanged = inputs
    return unchanged

In [20]:
res = exec_pipeline(tiny_query, do_nothing, ALL_COLUMNS)
res[:3]

  pipeline.replace_all(_get_transform_overrides(pipeline.options))


Unnamed: 0,DATE,YEAR,MONTH,DAY,DEP_DOW,AIRLINE,DEP_T,DEP,DEP_LAT,DEP_LON,...,WND_SPD_DEP,ARR_T,ARR_DELAY,ARR,ARR_LAT,ARR_LON,ARR_W,MEAN_TEMP_ARR,MEAN_VIS_ARR,WND_SPD_ARR
0,2002-06-23,2002,6,23,1,Delta Air Lines Inc.: DL,1030,ATL,33.63,-84.42,...,9.8,1147,7.0,RDU,35.87,-78.78,RALEIGH-DURHAM INTERNATIONAL,78.6,9.0,4.6
1,2002-06-17,2002,6,17,2,Delta Air Lines Inc.: DL,1150,ATL,33.63,-84.42,...,6.2,1251,37.0,DFW,32.89,-97.03,DALLAS-FORT WORTH INTL AP,75.8,9.9,3.3
2,2002-06-10,2002,6,10,2,Delta Air Lines Inc.: DL,1425,ATL,33.63,-84.42,...,6.1,1522,-12.0,CAE,33.93,-81.11,COLUMBIA METRO ARPT,75.7,9.9,2.9


---
#### Step 2: Select only the useful columns

In [21]:
# Columns kept by select_cols. Compared with ALL_COLUMNS this leaves out
# DATE, DAY, DEP_W, ARR_T and ARR_W.
SELECTED_COLUMNS=[
    'YEAR', 'MONTH', 'DEP_DOW', 'AIRLINE', 
    'DEP_T', 'DEP', 'DEP_LAT', 'DEP_LON', 
    'MEAN_TEMP_DEP', 'MEAN_VIS_DEP', 'WND_SPD_DEP', 
    'ARR_DELAY', 
    'ARR', 'ARR_LAT', 'ARR_LON', 
    'MEAN_TEMP_ARR', 'MEAN_VIS_ARR', 'WND_SPD_ARR']

In [22]:
def select_cols(row):
    """Return a new dict holding only the SELECTED_COLUMNS entries of ``row``.

    Raises KeyError if ``row`` lacks one of the selected columns.
    """
    selected = {}
    for column in SELECTED_COLUMNS:
        selected[column] = row[column]
    return selected

one_row = sample.to_dict(orient='records')[0]
print(select_cols(one_row))

{'WND_SPD_DEP': 4.4, 'WND_SPD_ARR': 5.7, 'MEAN_TEMP_ARR': 79.4, 'DEP': 'ATL', 'ARR_LON': -81.2, 'MEAN_VIS_DEP': 6.0, 'DEP_T': 842, 'ARR': 'SAV', 'DEP_LON': -84.42, 'DEP_DOW': 2, 'MEAN_VIS_ARR': 9.8, 'ARR_LAT': 32.12, 'AIRLINE': 'Delta Air Lines Inc.: DL', 'YEAR': 2004, 'ARR_DELAY': 16.0, 'DEP_LAT': 33.63, 'MONTH': 6, 'MEAN_TEMP_DEP': 75.2}


In [23]:
res = exec_pipeline(tiny_query, select_cols, SELECTED_COLUMNS)
res[:3]



Unnamed: 0,YEAR,MONTH,DEP_DOW,AIRLINE,DEP_T,DEP,DEP_LAT,DEP_LON,MEAN_TEMP_DEP,MEAN_VIS_DEP,WND_SPD_DEP,ARR_DELAY,ARR,ARR_LAT,ARR_LON,MEAN_TEMP_ARR,MEAN_VIS_ARR,WND_SPD_ARR
0,2004,6,2,Delta Air Lines Inc.: DL,842,ATL,33.63,-84.42,75.2,6.0,4.4,16.0,SAV,32.12,-81.2,79.4,9.8,5.7
1,2006,6,3,Delta Air Lines Inc.: DL,2130,ATL,33.63,-84.42,73.3,6.5,6.3,45.0,PVD,41.72,-71.42,74.7,10.0,10.1
2,2005,6,4,America West Airlines Inc.: HP (Merged with US...,805,ATL,33.63,-84.42,73.6,8.0,6.1,-25.0,PHX,33.43,-112.01,102.4,10.0,8.5


---
#### Step 3: Scaling floats

In [24]:
def scale_floats(row):
    """Rescale the weather columns with ``tft.scale_to_0_1``.

    The temperature, visibility and wind-speed columns of both the departure
    and the arrival airport are replaced by their scaled version; every other
    entry is passed through as is.

    Args:
        row: dict mapping column name to the column's value (tensor).

    Returns:
        A new dict. The dict passed in is left unmodified, so the function can
        be used on its own without overwriting the caller's raw columns.
    """
    weather_columns = [
        'MEAN_TEMP_DEP', 'MEAN_VIS_DEP', 'WND_SPD_DEP',
        'MEAN_TEMP_ARR', 'MEAN_VIS_ARR', 'WND_SPD_ARR',
    ]
    scaled = dict(row)
    for column in weather_columns:
        scaled[column] = tft.scale_to_0_1(row[column])
    return scaled

In [25]:
def pre_processor(row):
    """Complete preprocessing function: pick the useful columns, then scale the floats.

    Works on a shallow copy of ``row`` so the dict passed in is not modified.
    """
    working_copy = row.copy()
    selected = select_cols(working_copy)
    return scale_floats(selected)

In [26]:
res = exec_pipeline(tiny_query, pre_processor, SELECTED_COLUMNS)
res[:3]



Unnamed: 0,YEAR,MONTH,DEP_DOW,AIRLINE,DEP_T,DEP,DEP_LAT,DEP_LON,MEAN_TEMP_DEP,MEAN_VIS_DEP,WND_SPD_DEP,ARR_DELAY,ARR,ARR_LAT,ARR_LON,MEAN_TEMP_ARR,MEAN_VIS_ARR,WND_SPD_ARR
0,2004,6,2,Delta Air Lines Inc.: DL,842,ATL,33.63,-84.42,0.33125,0.0,0.075,16.0,SAV,32.12,-81.2,0.507495,0.96,0.222973
1,2006,6,3,Delta Air Lines Inc.: DL,2130,ATL,33.63,-84.42,0.2125,0.125,0.3125,45.0,PVD,41.72,-71.42,0.406852,1.0,0.52027
2,2005,6,4,America West Airlines Inc.: HP (Merged with US...,805,ATL,33.63,-84.42,0.23125,0.5,0.2875,-25.0,PHX,33.43,-112.01,1.0,1.0,0.412162


---
# Create the big files
Creating the files for training and evaluation

In [27]:
!cat dataflow_requirements.txt

tensorflow-transform

In [145]:
def _read_raw_split(pipeline, split, query):
    """Read one split from BigQuery and pair the rows with the raw metadata."""
    rows = pipeline | "ReadFromBigQuery_{}".format(split) >> beam.io.Read(
        beam.io.BigQuerySource(query=query, use_standard_sql=True))
    return (rows, raw_data_metadata)


def _write_csv_split(rows, schema, output_columns, path_prefix, encode_label, write_label):
    """Encode transformed rows as CSV lines and write them, header included."""
    csv_encode = tft.coders.CsvCoder(output_columns, schema).encode
    return (rows
            | encode_label >> beam.Map(csv_encode)
            | write_label >> beam.io.WriteToText(file_path_prefix=path_prefix,
                                                 header=",".join(output_columns)))


def exec_pipeline_prod(preprocessing_fn, output_columns, fractions, sample_rate, runner='DirectRunner'):
    """Create the transformed training and evaluation CSV files on GCS.

    The training split is analyzed and transformed with ``preprocessing_fn``;
    the transform function derived from it is then applied to the evaluation
    split and finally saved for use at serving time. Outputs go to
    ``gs://<BUCKET>/prod`` (``atl_june_train*``, ``atl_june_eval*`` and
    ``metadata``); ``gs://<BUCKET>/tmp`` serves as staging/temp location.

    The 'test' query produced from ``fractions`` is not read here, so no test
    file is written.

    Args:
        preprocessing_fn: tf.Transform preprocessing function.
        output_columns: column names, in order, written to the CSV files (also
            used as the header line).
        fractions: relative sizes of the train, eval and test splits, passed
            to ``sample_queries``.
        sample_rate: overall sampling rate, passed to ``sample_queries``.
        runner: Beam runner name, e.g. 'DirectRunner' or 'DataflowRunner'.
    """
    tmpdir = 'gs://{}/tmp'.format(BUCKET)
    proddir = 'gs://{}/prod'.format(BUCKET)

    job_name = 'tft-tutorial' + '-' + datetime.datetime.now().strftime('%y%m%d-%H%M%S')

    options = {
        'staging_location': tmpdir,
        'temp_location': tmpdir,
        'job_name': job_name,
        'project': PROJECT,
        'max_num_workers': 24,
        'teardown_policy': 'TEARDOWN_ALWAYS',
        'no_save_main_session': True,
        'requirements_file': 'dataflow_requirements.txt'
    }
    opts = beam.pipeline.PipelineOptions(flags=[], **options)

    # Build the queries first so that bad arguments fail before a pipeline exists.
    queries = sample_queries(fractions, sample_rate)

    with beam.Pipeline(runner, options=opts) as p:
        with beam_impl.Context(temp_dir=tmpdir):

            #   Training data: read, then analyze and transform by calling a single
            #   function that has all the tft.transforms in it
            #
            raw_train = _read_raw_split(p, 'train', queries['train'])
            (train_data, t_metadata), transform_fn = (
                raw_train
                | "AnalyzeAndTransform" >> beam_impl.AnalyzeAndTransformDataset(preprocessing_fn))

            _write_csv_split(train_data, t_metadata.schema, output_columns,
                             os.path.join(proddir, 'atl_june_train'),
                             "EncodeTraining", "WriteTraining")

            #   Eval data: read, then transform with the transform function derived above
            #
            raw_eval = _read_raw_split(p, 'eval', queries['eval'])
            eval_data, _ = ((raw_eval, transform_fn)
                            | "TransformEval" >> beam_impl.TransformDataset())

            _write_csv_split(eval_data, t_metadata.schema, output_columns,
                             os.path.join(proddir, 'atl_june_eval'),
                             "EncodeEval", "WriteEval")

            # save transformation function to disk for use at serving time
            #
            transform_fn | 'WriteTransformFn' >> transform_fn_io.WriteTransformFn(
                os.path.join(proddir, 'metadata'))
            

In [28]:
!rm -f /tmp/*_june*
!ls /tmp

dataflow-requirements-cache  tmp8YBL_g	tmpHS0B1a  tmpou6hBU  tmpXwZx4b
prod			     tmpbBKz2b	tmpJLQG8B  tmpqfRWXi  tmpybU8er
tmp6kJdJS		     tmpegmjWh	tmpKnDIjq  tmpU_VGxW
tmp7V_xsh		     tmpGYVFuS	tmpkVEzLC  tmpXrdlV4


In [29]:
!gsutil -m rm -rf gs://going-tfx/prod
_ = !gsutil -m rm -rf gs://going-tfx/tmp/

Removing gs://going-tfx/prod/atl_june_eval-00001-of-00002#1541345707982957...
Removing gs://going-tfx/prod/#1541345504293050...
Removing gs://going-tfx/prod/atl_june_train-00000-of-00005#1541345563766542...
Removing gs://going-tfx/prod/atl_june_eval-00000-of-00002#1541345707979136...
Removing gs://going-tfx/prod/atl_june_train-00001-of-00005#1541345563729976...
Removing gs://going-tfx/prod/metadata/transform_fn/#1541345504838703...
Removing gs://going-tfx/prod/metadata/transform_fn/saved_model.pb#1541345505657304...
Removing gs://going-tfx/prod/metadata/#1541345504547882...
Removing gs://going-tfx/prod/metadata/transform_fn/variables/#1541345506576601...
Removing gs://going-tfx/prod/metadata/transformed_metadata/#1541345507788904...
Removing gs://going-tfx/prod/atl_june_train-00003-of-00005#1541345563747342...
Removing gs://going-tfx/prod/atl_june_train-00002-of-00005#1541345563749478...
Removing gs://going-tfx/prod/atl_june_train-00004-of-00005#1541345563745013...
Removing gs://going-

#### This takes about two minutes. Be patient.

In [110]:
exec_pipeline_prod (preprocessing_fn=pre_processor, output_columns=SELECTED_COLUMNS, fractions=[80, 10, 10], sample_rate=0.1)



---
#### This executes in Dataflow and takes some 12 - 20 minutes
If you watch the graph in the Dataflow console, you will see that the job stays alive until the VM is shut down. The files may be available a bit earlier.

In [151]:
exec_pipeline_prod (preprocessing_fn=pre_processor, output_columns=SELECTED_COLUMNS, fractions=[80, 10, 10], sample_rate=1.0, runner = 'DataflowRunner')

In [154]:
!gsutil ls gs://going-tfx/prod/*

gs://going-tfx/prod/
gs://going-tfx/prod/atl_june_eval-00000-of-00002
gs://going-tfx/prod/atl_june_eval-00001-of-00002
gs://going-tfx/prod/atl_june_train-00000-of-00005
gs://going-tfx/prod/atl_june_train-00001-of-00005
gs://going-tfx/prod/atl_june_train-00002-of-00005
gs://going-tfx/prod/atl_june_train-00003-of-00005
gs://going-tfx/prod/atl_june_train-00004-of-00005

gs://going-tfx/prod/metadata/:
gs://going-tfx/prod/metadata/
gs://going-tfx/prod/metadata/transform_fn/
gs://going-tfx/prod/metadata/transformed_metadata/


In [161]:
mkdir /tmp/prod

mkdir: cannot create directory ‘/tmp/prod’: File exists


In [162]:
!gsutil -m cp gs://going-tfx/prod/atl_june_eval* /tmp/prod/
_ = !gsutil -m cp gs://going-tfx/prod/atl_june_train* /tmp/prod/

Copying gs://going-tfx/prod/atl_june_eval-00000-of-00002...
Copying gs://going-tfx/prod/atl_june_eval-00001-of-00002...
/ [2/2 files][  4.1 MiB/  4.1 MiB] 100% Done                                    
Operation completed over 2 objects/4.1 MiB.                                      


In [30]:
t1 = pd.read_csv("/tmp/prod/atl_june_train-00001-of-00005")

In [31]:
t1.describe()

Unnamed: 0,YEAR,MONTH,DEP_DOW,DEP_T,DEP_LAT,DEP_LON,MEAN_TEMP_DEP,MEAN_VIS_DEP,WND_SPD_DEP,ARR_DELAY,ARR_LAT,ARR_LON,MEAN_TEMP_ARR,MEAN_VIS_ARR,WND_SPD_ARR
count,80222.0,80222.0,80222.0,80222.0,80222.0,80222.0,80222.0,80222.0,80222.0,80222.0,80222.0,80222.0,80222.0,80222.0,80222.0
mean,2006.82799,6.0,3.964112,1461.206016,33.63,-84.42,0.623959,0.847795,0.357151,15.579579,35.203398,-86.94595,0.594617,0.008532,0.006635
std,2.705807,0.0,1.968189,468.048983,0.0,1.421094e-14,0.184572,0.182979,0.142093,39.827289,5.255716,12.798527,0.126182,0.001261,0.014254
min,2002.0,6.0,1.0,545.0,33.63,-84.42,0.0,0.0,0.0,-46.0,18.0,-156.43,0.150706,0.0,0.0002
25%,2005.0,6.0,2.0,1037.0,33.63,-84.42,0.491596,0.765625,0.255814,-6.0,30.47,-90.25,0.513344,0.008206,0.0047
50%,2007.0,6.0,4.0,1446.0,33.63,-84.42,0.62605,0.921875,0.348837,3.0,35.04,-81.68,0.616954,0.009106,0.006101
75%,2009.0,6.0,6.0,1859.0,33.63,-84.42,0.764706,0.984375,0.434109,21.0,40.49,-78.78,0.676609,0.009407,0.007801
max,2011.0,6.0,7.0,2342.0,33.63,-84.42,1.0,1.0,1.0,924.0,45.58,-66.56,1.0,0.012108,1.0
