In [1]:
from __future__ import division
from __future__ import print_function
from __future__ import absolute_import

In [25]:
import os
import tempfile
import datetime

import pandas as pd
import numpy as np
import apache_beam as beam

import tensorflow as tf
import tensorflow_transform as tft
import tensorflow_transform.beam as tft_beam
from tensorflow_transform.tf_metadata import dataset_metadata
from tensorflow_transform.tf_metadata import dataset_schema
import tensorflow_transform.beam.impl as beam_impl

from utilities.transforms import MapAndFilterErrors
tf.__version__

'1.11.0'

In [3]:
# Local working directory plus the GCP bucket/project this notebook runs against.
PWD = os.getcwd()
BUCKET = PROJECT = 'going-tfx'
EXTRA_PACKAGE = './utilities'

# Bucket folders for the input data, the pipeline output and scratch space.
DATA_DIR, OUTPUT_DIR, TMP_DIR = (
    "gs://{}/{}".format(BUCKET, folder)
    for folder in ('flight_data', 'output', 'tmp')
)

In [4]:
# Every column of the raw CSV file, in file order.
ALL_COLUMNS = [
    'FL_DATE', 'FL_YEAR', 'FL_MONTH', 'FL_DOM', 'FL_DOW', 'UNIQUE_CARRIER', 'FL_NUM',
    'ORIGIN_AIRPORT_SEQ_ID', 'DEST_AIRPORT_SEQ_ID', 'ORIGIN', 'DEST',
    'CRS_DEP_TIME', 'DEP_DELAY', 'TAXI_OUT', 'WHEELS_OFF', 'WHEELS_ON',
    'TAXI_IN', 'CRS_ARR_TIME', 'ARR_DELAY', 'CANCELLED',
    'CANCELLATION_CODE', 'DIVERTED', 'DISTANCE',
]

# Columns that carry a string default; every other column gets an empty default ([]).
_STRING_DEFAULTS = {
    'FL_DATE': '-',
    'UNIQUE_CARRIER': '-',
    'FL_NUM': '-',
    'ORIGIN_AIRPORT_SEQ_ID': '-',
    'DEST_AIRPORT_SEQ_ID': '-',
    'ORIGIN': '-',
    'DEST': '-',
    'CANCELLATION_CODE': 'NONE',
}

# One record default per entry of ALL_COLUMNS, positionally aligned.
DEFAULTS = [
    [_STRING_DEFAULTS[name]] if name in _STRING_DEFAULTS else []
    for name in ALL_COLUMNS
]

# The columns we actually want, in the order they are written back out.
ORDERED_COLUMNS = [
    'FL_YEAR', 'FL_MONTH', 'FL_DOW', 'UNIQUE_CARRIER', 'ORIGIN', 'DEST',
    'CRS_DEP_TIME', 'DEP_DELAY', 'CRS_ARR_TIME', 'ARR_DELAY',
]

# Positions of the wanted columns within ALL_COLUMNS, in ascending order.
_selected_positions = [ALL_COLUMNS.index(name) for name in ORDERED_COLUMNS]
SELECT = list(np.sort(_selected_positions))

---
Get an impression of what's in the file

In [7]:
def decode_csv(row):
    """Parse one CSV line into a dict of the selected columns.

    Only the columns listed in SELECT are decoded; the keys of the returned
    dict are the matching names from ALL_COLUMNS and the values are the
    tensors produced by ``tf.decode_csv``.
    """
    selected_names = [ALL_COLUMNS[i] for i in SELECT]
    selected_defaults = [DEFAULTS[i] for i in SELECT]
    values = tf.decode_csv(row, record_defaults=selected_defaults, select_cols=SELECT)
    return {name: value for name, value in zip(selected_names, values)}


# Line-by-line view of the June file; drop the header row, parse, and batch by 10.
atlanta_june = tf.data.TextLineDataset(os.path.join(DATA_DIR, 'atl_june.csv'))
inp = (
    atlanta_june
    .skip(1)
    .map(decode_csv)
    .batch(10)
)

In [8]:
# Pull a single batch out of the dataset and show it as a table.
iterator = inp.make_one_shot_iterator()
input_raw = iterator.get_next()

with tf.Session() as sess:
    b = sess.run(input_raw)

pd.DataFrame(b)

Unnamed: 0,ARR_DELAY,CRS_ARR_TIME,CRS_DEP_TIME,DEP_DELAY,DEST,FL_DOW,FL_MONTH,FL_YEAR,ORIGIN,UNIQUE_CARRIER
0,78.0,1705.0,1621.0,55.0,LFT,4.0,6.0,2016.0,ATL,EV
1,-12.0,2056.0,2010.0,-2.0,LFT,4.0,6.0,2016.0,ATL,DL
2,66.0,1344.0,1305.0,63.0,LFT,4.0,6.0,2016.0,ATL,EV
3,-3.0,1051.0,1006.0,-3.0,LFT,4.0,6.0,2016.0,ATL,EV
4,19.0,1344.0,1305.0,-6.0,LFT,5.0,6.0,2016.0,ATL,EV
5,147.0,1051.0,1006.0,139.0,LFT,5.0,6.0,2016.0,ATL,EV
6,-10.0,2056.0,2010.0,-5.0,LFT,5.0,6.0,2016.0,ATL,DL
7,-6.0,1705.0,1621.0,0.0,LFT,5.0,6.0,2016.0,ATL,EV
8,27.0,1344.0,1305.0,21.0,LFT,6.0,6.0,2016.0,ATL,EV
9,-5.0,2056.0,2010.0,-4.0,LFT,6.0,6.0,2016.0,ATL,DL


---
### Metadata and Schema

In [5]:
# Every raw column is a scalar with a fixed representation; only the dtype differs.
raw_data_schema = {
    colname: dataset_schema.ColumnSchema(
        dtype, [], dataset_schema.FixedColumnRepresentation())
    for dtype, colnames in (
        (tf.string, ['ORIGIN', 'UNIQUE_CARRIER', 'DEST']),
        (tf.float32, ['DEP_DELAY', 'ARR_DELAY']),
        (tf.int64, ['FL_YEAR', 'FL_MONTH', 'FL_DOW', 'CRS_DEP_TIME', 'CRS_ARR_TIME']),
    )
    for colname in colnames
}

# Wrap the per-column schema into the metadata object used by the coders and the pipeline.
raw_data_metadata = dataset_metadata.DatasetMetadata(
    dataset_schema.Schema(raw_data_schema)
)

Another way of creating the metadata is as follows:

```
raw_data_metadata = dataset_metadata.DatasetMetadata(
    dataset_schema.from_feature_spec({
        'ORIGIN': tf.FixedLenFeature([], tf.string),
        'FL_YEAR': tf.FixedLenFeature([], tf.int64),
        'FL_MONTH': tf.FixedLenFeature([], tf.int64),
        'FL_DOW': tf.FixedLenFeature([], tf.int64),
        'UNIQUE_CARRIER': tf.FixedLenFeature([], tf.string),
        'DEST': tf.FixedLenFeature([], tf.string),
        'CRS_DEP_TIME': tf.FixedLenFeature([], tf.int64),
        'CRS_ARR_TIME': tf.FixedLenFeature([], tf.int64),
        'DEP_DELAY': tf.FixedLenFeature([], tf.float32),
        'ARR_DELAY': tf.FixedLenFeature([], tf.float32)
    }))
```

---
### The Preprocessing Pipeline 
Here, we scale ```DEP_DELAY``` to the range $[0,1]$ and select only the interesting columns

In [40]:
def preprocessing_fn(inputs):
    """Select the useful columns and rescale ``DEP_DELAY``.

    Args:
        inputs: dict mapping raw column names to their tensors. It is not
            modified; a new dict is returned.

    Returns:
        A dict holding only the columns named in ORDERED_COLUMNS, with
        ``DEP_DELAY`` replaced by its ``tft.scale_to_0_1`` result.

    Raises:
        KeyError: if a column named in ORDERED_COLUMNS is missing from ``inputs``.
    """
    # Projection: building a fresh dict already leaves `inputs` untouched,
    # so no separate copy is needed first.
    outputs = {k: inputs[k] for k in ORDERED_COLUMNS}

    outputs['DEP_DELAY'] = tft.scale_to_0_1(outputs['DEP_DELAY'])  # Scale

    return outputs

The CSV encoder and decoder

In [41]:
training_file = os.path.join(DATA_DIR, 'atl_june.csv') # 403'358 records

# Decoding reads the full raw column layout; encoding writes only the ordered subset.
raw_schema = raw_data_metadata.schema
csv_decode = tft.coders.CsvCoder(ALL_COLUMNS, raw_schema).decode
csv_encode = tft.coders.CsvCoder(ORDERED_COLUMNS, raw_schema).encode

In [52]:
# Override: use the smaller local test file. Comment this line out to process the full file instead.
training_file = 'atl_june_11.csv' # 10 records only

A quick sanity check of the error filter on a very small subset

In [43]:
# Load the small sample file as a list of lines (newlines kept).
with open('atl_june_11.csv') as sample_file:
    content = list(sample_file)

In [44]:
# Round trip on the in-memory lines: decode each line with the error-filtering
# transform, then encode the surviving records back to CSV using ORDERED_COLUMNS.
# The value of the expression is displayed as the cell output.
(content 
 | MapAndFilterErrors(csv_decode) 
 | beam.Map(csv_encode))

['2016,6,4,EV,ATL,LFT,1621,55.0,1705,78.0',
 '2016,6,4,DL,ATL,LFT,2010,-2.0,2056,-12.0',
 '2016,6,4,EV,ATL,LFT,1305,63.0,1344,66.0',
 '2016,6,4,EV,ATL,LFT,1006,-3.0,1051,-3.0',
 '2016,6,5,EV,ATL,LFT,1305,-6.0,1344,19.0',
 '2016,6,5,EV,ATL,LFT,1006,139.0,1051,147.0',
 '2016,6,5,DL,ATL,LFT,2010,-5.0,2056,-10.0',
 '2016,6,5,EV,ATL,LFT,1621,0.0,1705,-6.0',
 '2016,6,6,EV,ATL,LFT,1305,21.0,1344,27.0',
 '2016,6,6,DL,ATL,LFT,2010,-4.0,2056,-5.0']

Cleaning up any previously processed records...

In [45]:
# Delete the transformed shards left over from earlier runs (all files matching the output prefix).
!gsutil rm -f $OUTPUT_DIR/atl_june_transformed*

Removing gs://going-tfx/output/atl_june_transformed-00000-of-00001...
/ [1 objects]                                                                   
Operation completed over 1 objects.                                              


---
### The Pipeline
Runtime options:

In [46]:
# True runs locally with the DirectRunner, False submits the job to Dataflow.
test_mode = True

# The timestamp in the job name gives every run its own temp/staging folder.
job_name = 'tft-tutorial-' + datetime.datetime.now().strftime('%y%m%d-%H%M%S')
tmp_dir = os.path.join(TMP_DIR, job_name)

options = dict(
    staging_location=os.path.join(tmp_dir, 'staging'),
    temp_location=tmp_dir,
    job_name=job_name,
    project=PROJECT,
    max_num_workers=24,
    teardown_policy='TEARDOWN_ALWAYS',
    requirements_file='requirements.txt',
    setup_file=os.path.join(PWD, 'setup.py'),
)
opts = beam.pipeline.PipelineOptions(flags=[], **options)

RUNNER = 'DirectRunner' if test_mode else 'DataflowRunner'

print(RUNNER)
print(opts.get_all_options())

DirectRunner
{'machine_type': None, 'runner': None, 'labels': None, 'save_main_session': False, 'streaming': False, 'experiments': None, 'requirements_cache': None, 'harness_docker_image': None, 'max_num_workers': 24, 'template_location': None, 'requirements_file': 'requirements.txt', 'sdk_location': 'default', 'network': None, 'dry_run': False, 'profile_location': None, 'service_account_email': None, 'profile_cpu': False, 'profile_memory': False, 'direct_runner_use_stacked_bundle': True, 'type_check_strictness': 'DEFAULT_TO_ANY', 'job_name': 'tft-tutorial-181026-212541', 'use_public_ips': None, 'num_workers': None, 'hdfs_host': None, 'disk_size_gb': None, 'runtime_type_check': False, 'on_success_matcher': None, 'temp_location': 'gs://going-tfx/tmp/tft-tutorial-181026-212541', 'setup_file': '/home/jupyter/workspace/home-in-time/setup.py', 'disk_type': None, 'dataflow_endpoint': 'https://dataflow.googleapis.com', 'worker_harness_container_image': None, 'hdfs_port': None, 'autoscaling_al

---
### Construct and run the pipeline
Be aware that this can take quite a while. When running with the `DataflowRunner`, you can watch your pipeline run on the Dataflow console.

In [53]:
# Raise TensorFlow's log threshold to WARN from here on.
tf.logging.set_verbosity(tf.logging.WARN)

In [54]:
# Build the pipeline explicitly instead of using `with beam.Pipeline(...)`.
# The Pipeline context manager runs the pipeline (and waits for it) when the
# block exits, so following it with `pipeline.run()` executes the whole job
# twice. Here the pipeline runs exactly once, and `result` stays available
# for an optional `result.wait_until_finish()`.
pipeline = beam.Pipeline(RUNNER, options=opts)

# The tf.Transform context is only needed while the graph is being constructed.
with beam_impl.Context(temp_dir=tmp_dir):

    # Decode the raw data from the CSV file and filter outliers
    # (keep only records with a departure delay below 120).
    raw_data = (
        pipeline
        | 'ReadData' >> beam.io.ReadFromText(training_file, skip_header_lines=1)
        | 'Decode' >> MapAndFilterErrors(csv_decode)
        | 'Filter_outliers' >> beam.Filter(lambda r: r['DEP_DELAY'] < 120.0)
    )

    # Analyse and transform - a dataset is the pair (data, metadata)
    raw_dataset = (raw_data, raw_data_metadata)
    t_dataset, t_fn = (raw_dataset | beam_impl.AnalyzeAndTransformDataset(preprocessing_fn))
    t_data, t_metadata = t_dataset

    # Encode back to CSV file(s)
    res = (t_data
           | beam.Map(csv_encode)
           | beam.io.WriteToText(file_path_prefix=os.path.join(OUTPUT_DIR, "atl_june_transformed")))

# Single execution of the pipeline.
result = pipeline.run()



---
If you want to wait for the result, this would be the way to do it

In [48]:
# result.wait_until_finish()

---
The resulting CSV files

In [49]:
# List the transformed output shards with their sizes and timestamps.
!gsutil ls -l $OUTPUT_DIR/atl_june_transformed*

       403  2018-10-26T21:26:00Z  gs://going-tfx/output/atl_june_transformed-00000-of-00001
TOTAL: 1 objects, 403 bytes (403 B)


In [50]:
# Count the records written; the shard name is hard-coded, so this assumes a single output shard.
!gsutil cat $OUTPUT_DIR/atl_june_transformed-00000-of-00001 | wc -l

9


In [51]:
# Show the first 10 transformed records; the shard name is hard-coded, so this assumes a single output shard.
!gsutil cat $OUTPUT_DIR/atl_june_transformed-00000-of-00001 | head -10

2016,6,4,EV,ATL,LFT,1621,0.884058,1705,78.0
2016,6,4,DL,ATL,LFT,2010,0.057971016,2056,-12.0
2016,6,4,EV,ATL,LFT,1305,1.0,1344,66.0
2016,6,4,EV,ATL,LFT,1006,0.04347826,1051,-3.0
2016,6,5,EV,ATL,LFT,1305,0.0,1344,19.0
2016,6,5,DL,ATL,LFT,2010,0.014492754,2056,-10.0
2016,6,5,EV,ATL,LFT,1621,0.08695652,1705,-6.0
2016,6,6,EV,ATL,LFT,1305,0.39130434,1344,27.0
2016,6,6,DL,ATL,LFT,2010,0.028985508,2056,-5.0


<PCollection[WriteToText/Write/WriteImpl/FinalizeWrite.None] at 0x7ffa14205690>