In [1]:
from __future__ import print_function

import datetime
import os

import tensorflow as tf
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import apache_beam as beam

from tensorflow_transform.tf_metadata import dataset_metadata
from tensorflow_transform.tf_metadata import dataset_schema
import tensorflow_transform as tft
import tensorflow_transform.beam.impl as beam_impl
import google.datalab.bigquery as bq

  from .murmurhash import murmurhash3_32
  from .lbfgsb import _minimize_lbfgsb
  from ._logistic_sigmoid import _log_logistic_sigmoid
  from .expected_mutual_info_fast import expected_mutual_information
  from .pairwise_fast import _chi2_kernel_fast, _sparse_manhattan


In [2]:
# change these to try this notebook out
BUCKET = 'going-tfx'
PROJECT = 'going-tfx'
# Google Cloud region ids carry no hyphen before the trailing digit
# ('us-east1'); 'us-east-1' is the AWS spelling and is not a GCP region id.
REGION = 'us-east1'
BQ_DATASET = 'examples'
# Mirror the settings into the process environment
# (presumably so shell cells can read them -- confirm).
os.environ['BUCKET'] = BUCKET
os.environ['PROJECT'] = PROJECT
os.environ['REGION'] = REGION

In [3]:
# Print the current working directory via the shell.
!pwd

/home/jupyter/workspace/tutorials/kovalevskyi


In [4]:
def sample_query(project, dataset):
    """Build a reusable sampling-query template for the ATL_JUNE table.

    The table reference is filled in immediately from ``project`` and
    ``dataset``.  The lower (``>=``) and upper (``<``) bounds of the bucket
    filter are emitted as literal ``{}`` placeholders, so the caller supplies
    them later with a second ``str.format`` call.

    Each row's bucket is
    ``MOD(ABS(FARM_FINGERPRINT(date + carrier + dest)) + CRS_ARR_TIME, 10000)``.

    Args:
        project: project id used in the table reference.
        dataset: dataset name used in the table reference.

    Returns:
        str: the query text with two ``{}`` placeholders left unfilled.
    """
    # Formatting '{}' into a slot re-emits the placeholder for the second pass.
    deferred = '{}'
    template = """SELECT
      ORIGIN,
      FL_YEAR,
      FL_MONTH,
      FL_DOW,
      UNIQUE_CARRIER,
      DEST,
      CRS_ARR_TIME,
      DEP_DELAY,
      ARR_DELAY
    FROM `{}.{}.ATL_JUNE` 
    where
      MOD(ABS(FARM_FINGERPRINT(
        CONCAT(
          STRING(TIMESTAMP(FL_DATE)),
          UNIQUE_CARRIER,
          DEST
        )
      )) + CRS_ARR_TIME, 10000) >= {} and 
      MOD(ABS(FARM_FINGERPRINT(
        CONCAT(
          STRING(TIMESTAMP(FL_DATE)),
          UNIQUE_CARRIER,
          DEST
        )
      )) + CRS_ARR_TIME, 10000) < {} 
    """
    return template.format(project, dataset, deferred, deferred)

In [5]:
def create_queries(training_percentage, eval_percentage):
    """Return three queries selecting disjoint samples of the ATL_JUNE table.

    The template from ``sample_query`` assigns every row a bucket in
    ``0..9999`` and filters with ``bucket >= lower and bucket < upper``.
    The three queries therefore use adjacent half-open ranges that together
    cover every bucket exactly once:

    * train: ``[0, cut1)``
    * eval:  ``[cut1, cut2)``
    * test:  ``[cut2, 10000)``

    Use these for your convenience to define train, eval and test splits.

    Args:
        training_percentage: share of the buckets for training, in percent
            (``0.01`` means 0.01 %, i.e. 1 bucket out of 10000).
        eval_percentage: share of the buckets for evaluation, same unit.

    Returns:
        tuple: ``(train_query, eval_query, test_query)`` as strings.
    """
    # Round before converting: 100 * 0.29 is 28.999999999999996 in floating
    # point, and a bare int() would truncate it to 28.
    cut1 = int(round(100 * training_percentage))
    cut2 = cut1 + int(round(100 * eval_percentage))
    query = sample_query(PROJECT, BQ_DATASET)
    # The upper bound is exclusive, so the next range starts at the previous
    # range's upper bound, and the last one must end at 10000 to keep bucket 9999.
    q1 = query.format(0, cut1)
    q2 = query.format(cut1, cut2)
    q3 = query.format(cut2, 10000)
    return q1, q2, q3

---
The query below retrieves only 1/10,000 of the roughly 400k entries, i.e. about 40-50 rows.

In [6]:
# Build the three split queries, passing 0.01 and 0.02 as the training and
# eval percentages. Only the first query is used in the cells visible here;
# print it for inspection.
query_0001, e, s = create_queries(0.01,0.02)
print(query_0001)

SELECT
      ORIGIN,
      FL_YEAR,
      FL_MONTH,
      FL_DOW,
      UNIQUE_CARRIER,
      DEST,
      CRS_ARR_TIME,
      DEP_DELAY,
      ARR_DELAY
    FROM `going-tfx.examples.ATL_JUNE` 
    where
      MOD(ABS(FARM_FINGERPRINT(
        CONCAT(
          STRING(TIMESTAMP(FL_DATE)),
          UNIQUE_CARRIER,
          DEST
        )
      )) + CRS_ARR_TIME, 10000) >= 0 and 
      MOD(ABS(FARM_FINGERPRINT(
        CONCAT(
          STRING(TIMESTAMP(FL_DATE)),
          UNIQUE_CARRIER,
          DEST
        )
      )) + CRS_ARR_TIME, 10000) < 1 
    


In [7]:
# An alternative way of getting some data
# sample = pd.read_csv(os.path.join(DATA_DIR, "atl_june_46.csv"));

In [8]:
# Run the sample query through the Datalab BigQuery client and convert the
# result with to_dataframe(); the bare name on the last line displays it.
sample = bq.Query(query_0001).execute().result().to_dataframe()
sample

Unnamed: 0,ORIGIN,FL_YEAR,FL_MONTH,FL_DOW,UNIQUE_CARRIER,DEST,CRS_ARR_TIME,DEP_DELAY,ARR_DELAY
0,ATL,2009,6,7,AA,MIA,1610,0,24
1,ATL,2006,6,1,DL,ABQ,2307,20,8
2,ATL,2007,6,3,DL,SFO,1153,8,-7
3,ATL,2008,6,7,DL,ABQ,1255,3,12
4,ATL,2008,6,2,DL,LAX,1240,7,12
5,ATL,2008,6,5,DL,DEN,1206,13,30
6,ATL,2010,6,2,DL,ONT,2040,52,41
7,ATL,2010,6,3,DL,PHL,1206,-1,3
8,ATL,2011,6,6,DL,PWM,2311,-3,1
9,ATL,2013,6,6,DL,BWI,1819,-6,-23


---
Metadata and schema

In [48]:
# Describe the raw input columns as a TensorFlow feature spec: every column
# is a scalar FixedLenFeature, only the dtype differs per column.
raw_data_metadata = dataset_metadata.DatasetMetadata(
    dataset_schema.from_feature_spec({
        column: tf.FixedLenFeature([], dtype)
        for column, dtype in [
            ('ORIGIN', tf.string),
            ('FL_YEAR', tf.int64),
            ('FL_MONTH', tf.int64),
            ('FL_DOW', tf.int64),
            ('UNIQUE_CARRIER', tf.string),
            ('DEST', tf.string),
            ('CRS_ARR_TIME', tf.int64),
            ('DEP_DELAY', tf.float32),
            ('ARR_DELAY', tf.float32),
        ]
    }))

In [49]:
# Build the schema column by column from explicit ColumnSchema objects:
# string, float32 and int64 columns, each a scalar with a fixed representation.
# Note that this rebinds raw_data_metadata.
raw_data_schema = {
    colname: dataset_schema.ColumnSchema(dtype, [], dataset_schema.FixedColumnRepresentation())
    for dtype, colnames in [
        (tf.string, ['ORIGIN','UNIQUE_CARRIER','DEST']),
        (tf.float32, ['DEP_DELAY','ARR_DELAY']),
        (tf.int64, ['FL_YEAR','FL_MONTH','FL_DOW','CRS_ARR_TIME']),
    ]
    for colname in colnames
}
raw_data_metadata = dataset_metadata.DatasetMetadata(dataset_schema.Schema(raw_data_schema))

In [50]:
# Turn the sample into a list with one {column: value} dict per row and
# display the first two entries.
records = sample.to_dict(orient='records')
records[0:2]

[{u'ARR_DELAY': 24,
  u'CRS_ARR_TIME': 1610,
  u'DEP_DELAY': 0,
  u'DEST': 'MIA',
  u'FL_DOW': 7,
  u'FL_MONTH': 6,
  u'FL_YEAR': 2009,
  u'ORIGIN': 'ATL',
  u'UNIQUE_CARRIER': 'AA'},
 {u'ARR_DELAY': 8,
  u'CRS_ARR_TIME': 2307,
  u'DEP_DELAY': 20,
  u'DEST': 'ABQ',
  u'FL_DOW': 1,
  u'FL_MONTH': 6,
  u'FL_YEAR': 2006,
  u'ORIGIN': 'ATL',
  u'UNIQUE_CARRIER': 'DL'}]

---
The pre-processing function scales the arrival delay and leaves all other columns unchanged. Pay particular attention to the name of the returned ARR_DELAY tensor.

In [55]:
def preprocessing_fn(inputs):
    """Scale the arrival delay and pass every other column through.

    Args:
        inputs: mapping from column name to that column's value; must contain
            'ARR_DELAY' and the eight pass-through columns listed below.

    Returns:
        dict: same keys as the columns read from ``inputs``. 'ARR_DELAY' holds
        the result of ``tft.scale_to_0_1``; every other key holds
        ``tf.identity`` of its input.
    """
    passthrough_cols = ('ORIGIN', 'FL_YEAR', 'FL_MONTH', 'FL_DOW',
                        'UNIQUE_CARRIER', 'DEST', 'CRS_ARR_TIME', 'DEP_DELAY')
    # The scaled tensor is stored under the original column name.
    scaled_delay = tft.scale_to_0_1(inputs['ARR_DELAY'])
    outputs = {name: tf.identity(inputs[name]) for name in passthrough_cols}
    outputs['ARR_DELAY'] = scaled_delay
    return outputs

In [56]:
# Call the function directly on the first record to inspect what it returns.
preprocessing_fn(records[0])

{'ARR_DELAY': <tf.Tensor 'scale_by_min_max_2/add:0' shape=() dtype=float32>,
 'CRS_ARR_TIME': <tf.Tensor 'Identity_22:0' shape=() dtype=int32>,
 'DEP_DELAY': <tf.Tensor 'Identity_23:0' shape=() dtype=int32>,
 'DEST': <tf.Tensor 'Identity_21:0' shape=() dtype=string>,
 'FL_DOW': <tf.Tensor 'Identity_19:0' shape=() dtype=int32>,
 'FL_MONTH': <tf.Tensor 'Identity_18:0' shape=() dtype=int32>,
 'FL_YEAR': <tf.Tensor 'Identity_17:0' shape=() dtype=int32>,
 'ORIGIN': <tf.Tensor 'Identity_16:0' shape=() dtype=string>,
 'UNIQUE_CARRIER': <tf.Tensor 'Identity_20:0' shape=() dtype=string>}

In [57]:
# Column names in the order used when constructing the CsvCoder.
ORDERED_COLS = [
    'ORIGIN',
    'FL_YEAR',
    'FL_MONTH',
    'FL_DOW',
    'UNIQUE_CARRIER',
    'DEST',
    'CRS_ARR_TIME',
    'DEP_DELAY',
    'ARR_DELAY',
]

In [58]:
# True selects the DirectRunner, False the DataflowRunner.
in_test_mode = True
OUTPUT_DIR = "./out"
# A timestamp suffix gives every run its own job name.
job_name = 'tft_tutorial-' + datetime.datetime.now().strftime('%y%m%d-%H%M%S')

# Keyword options forwarded to PipelineOptions; staging and temp files go
# below OUTPUT_DIR/tmp.
options = dict(
    staging_location=os.path.join(OUTPUT_DIR, 'tmp', 'staging'),
    temp_location=os.path.join(OUTPUT_DIR, 'tmp'),
    job_name=job_name,
    project=PROJECT,
    max_num_workers=24,
    teardown_policy='TEARDOWN_ALWAYS',
    no_save_main_session=True,
    requirements_file='requirements.txt'
)
opts = beam.pipeline.PipelineOptions(flags=[], **options)
RUNNER = 'DirectRunner' if in_test_mode else 'DataflowRunner'

In [59]:
import tempfile

# Read the sample rows from BigQuery, then analyze and transform them with
# preprocessing_fn. The result is discarded: this cell only exercises the
# pipeline.
with beam.Pipeline(RUNNER, options=opts) as p:
    with beam_impl.Context(temp_dir=tempfile.mkdtemp()):
        raw_data = p | "ReadCSV" >> beam.io.Read(
            beam.io.BigQuerySource(query=query_0001, use_standard_sql=True))

        _ = (
            (raw_data, raw_data_metadata)
            | 'Decode' >> beam_impl.AnalyzeAndTransformDataset(preprocessing_fn)
        )



INFO:tensorflow:Assets added to graph.


INFO:tensorflow:Assets added to graph.


INFO:tensorflow:No assets to write.


INFO:tensorflow:No assets to write.


INFO:tensorflow:SavedModel written to: /tmp/tmpxFqKiS/tftransform_tmp/afd394cd93f841afb83acb5e6827faaa/saved_model.pb


INFO:tensorflow:SavedModel written to: /tmp/tmpxFqKiS/tftransform_tmp/afd394cd93f841afb83acb5e6827faaa/saved_model.pb


INFO:tensorflow:Assets added to graph.


INFO:tensorflow:Assets added to graph.


INFO:tensorflow:No assets to write.


INFO:tensorflow:No assets to write.


INFO:tensorflow:SavedModel written to: /tmp/tmpxFqKiS/tftransform_tmp/93a66cf053b14f999ba5cf382940ddb4/saved_model.pb


INFO:tensorflow:SavedModel written to: /tmp/tmpxFqKiS/tftransform_tmp/93a66cf053b14f999ba5cf382940ddb4/saved_model.pb
  pipeline.replace_all(_get_transform_overrides(pipeline.options))


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Assets added to graph.


INFO:tensorflow:Assets added to graph.


INFO:tensorflow:No assets to write.


INFO:tensorflow:No assets to write.


INFO:tensorflow:SavedModel written to: /tmp/tmpxFqKiS/tftransform_tmp/6edb9696bd6b4f4cb11e4b1a36e71316/saved_model.pb


INFO:tensorflow:SavedModel written to: /tmp/tmpxFqKiS/tftransform_tmp/6edb9696bd6b4f4cb11e4b1a36e71316/saved_model.pb


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


In [60]:
# Display the PCollection handle bound by the pipeline cell.
raw_data

<PCollection[ReadCSV.None] at 0x7f12b93f4b10>

import tempfile

# CSV-based variant of the pipeline: read the rows from a local text file
# instead of BigQuery, decode them, then analyze and transform them.
with beam.Pipeline() as p:
    with beam_impl.Context(temp_dir=tempfile.mkdtemp()):
        converter = tft.coders.CsvCoder(ORDERED_COLS, raw_data_metadata.schema)
        raw_data = (
            p
            | "ReadCSV" >> beam.io.ReadFromText('atl_june_46.csv')
            # ReadFromText yields one plain string per line, so each line is
            # decoded into a column dict before it reaches the transform.
            # NOTE(review): if the file starts with a header row, also pass
            # skip_header_lines=1 to ReadFromText -- confirm against the file.
            | "ParseCSV" >> beam.Map(converter.decode)
        )

        _ = (raw_data, raw_data_metadata)    |\
        'Decode' >> beam_impl.AnalyzeAndTransformDataset(preprocessing_fn)



In [30]:
# NOTE(review): `res` is not assigned in any cell visible here, and this
# cell's execution count (30) is out of sequence -- confirm where it comes
# from before relying on a fresh Restart & Run All.
res

'DONE'

In [None]:
transformed_dataseet, transform_fn = 