In [2]:
from __future__ import print_function

import datetime
import os

import tensorflow as tf
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import apache_beam as beam

from tensorflow_transform.tf_metadata import dataset_metadata
from tensorflow_transform.tf_metadata import dataset_schema
import tensorflow_transform as tft
import tensorflow_transform.beam.impl as beam_impl
import google.datalab.bigquery as bq

In [3]:
# change these to try this notebook out
BUCKET = 'ingres'
PROJECT = 'ticino-2018'
REGION = 'europe-west1'
import os
os.environ['BUCKET'] = BUCKET
os.environ['PROJECT'] = PROJECT
os.environ['REGION'] = REGION

In [16]:
!pwd

/home/jupyter


In [4]:
sample_query = """SELECT
  ORIGIN,
  FL_YEAR,
  FL_MONTH,
  FL_DOW,
  UNIQUE_CARRIER,
  DEST,
  CRS_ARR_TIME,
  DEP_DELAY,
  ARR_DELAY
FROM `ticino-2018.flightdata.ATL_JUNE` 
where
  MOD(ABS(FARM_FINGERPRINT(
    CONCAT(
      STRING(TIMESTAMP(FL_DATE)),
      UNIQUE_CARRIER,
      DEST
    )
  )) + CRS_ARR_TIME, 10000) >= {0} and 
  MOD(ABS(FARM_FINGERPRINT(
    CONCAT(
      STRING(TIMESTAMP(FL_DATE)),
      UNIQUE_CARRIER,
      DEST
    )
  )) + CRS_ARR_TIME, 10000) < {1} 
"""

In [5]:
def create_queries(training_percentage, eval_percentage):
    """
        returns 3 queries that return distinct samples of the ATL_JUNE table.
        Use these for your convenience to define train,eval,test splits
    """
    cut1 = int(100 * training_percentage)
    cut2 = cut1 + int(100 * eval_percentage)
    q1 = sample_query.format(0, cut1)
    q2 = sample_query.format(cut1+1, cut2)
    q3 = sample_query.format(cut2+1, 9999)
    return q1, q2, q3

In [6]:
query_0001, e, s = create_queries(0.01,0.02)
print(query_0001)

SELECT
  ORIGIN,
  FL_YEAR,
  FL_MONTH,
  FL_DOW,
  UNIQUE_CARRIER,
  DEST,
  CRS_ARR_TIME,
  DEP_DELAY,
  ARR_DELAY
FROM `ticino-2018.flightdata.ATL_JUNE` 
where
  MOD(ABS(FARM_FINGERPRINT(
    CONCAT(
      STRING(TIMESTAMP(FL_DATE)),
      UNIQUE_CARRIER,
      DEST
    )
  )) + CRS_ARR_TIME, 10000) >= 0 and 
  MOD(ABS(FARM_FINGERPRINT(
    CONCAT(
      STRING(TIMESTAMP(FL_DATE)),
      UNIQUE_CARRIER,
      DEST
    )
  )) + CRS_ARR_TIME, 10000) < 1 



In [7]:
bq.Query(query_0001).execute().result().to_dataframe()

Unnamed: 0,ORIGIN,FL_YEAR,FL_MONTH,FL_DOW,UNIQUE_CARRIER,DEST,CRS_ARR_TIME,DEP_DELAY,ARR_DELAY
0,ATL,2009,6,7,AA,MIA,1610,0.0,24.0
1,ATL,2006,6,1,DL,ABQ,2307,20.0,8.0
2,ATL,2007,6,3,DL,SFO,1153,8.0,-7.0
3,ATL,2008,6,7,DL,ABQ,1255,3.0,12.0
4,ATL,2008,6,2,DL,LAX,1240,7.0,12.0
5,ATL,2008,6,5,DL,DEN,1206,13.0,30.0
6,ATL,2010,6,2,DL,ONT,2040,52.0,41.0
7,ATL,2010,6,3,DL,PHL,1206,-1.0,3.0
8,ATL,2011,6,6,DL,PWM,2311,-3.0,1.0
9,ATL,2013,6,6,DL,BWI,1819,-6.0,-23.0


In [8]:
sample = pd.read_csv("atl_june_46.csv");

In [32]:
raw_data_metadata = dataset_metadata.DatasetMetadata(
    dataset_schema.from_feature_spec({
        'ORIGIN': tf.FixedLenFeature([], tf.string),
        'FL_YEAR': tf.FixedLenFeature([], tf.int64),
        'FL_MONTH': tf.FixedLenFeature([], tf.int64),
        'FL_DOW': tf.FixedLenFeature([], tf.int64),
        'UNIQUE_CARRIER': tf.FixedLenFeature([], tf.string),
        'DEST': tf.FixedLenFeature([], tf.string),
        'CRS_ARR_TIME': tf.FixedLenFeature([], tf.int64),
        'DEP_DELAY': tf.FixedLenFeature([], tf.float32),
        'ARR_DELAY': tf.FixedLenFeature([], tf.float32)
    }))

In [9]:
raw_data_schema = {
colname : dataset_schema.ColumnSchema(tf.string, [], dataset_schema.FixedColumnRepresentation())
               for colname in 'ORIGIN,UNIQUE_CARRIER,DEST'.split(',')
}
raw_data_schema.update({
  colname : dataset_schema.ColumnSchema(tf.float32, [], dataset_schema.FixedColumnRepresentation())
               for colname in 'DEP_DELAY,ARR_DELAY'.split(',')
})
raw_data_schema.update({
  colname : dataset_schema.ColumnSchema(tf.int64, [], dataset_schema.FixedColumnRepresentation())
               for colname in 'FL_YEAR,FL_MONTH,FL_DOW,CRS_ARR_TIME'.split(',')
})
raw_data_metadata = dataset_metadata.DatasetMetadata(dataset_schema.Schema(raw_data_schema))

In [10]:
records = sample.to_dict(orient='records')
records[0:2]

[{'ARR_DELAY': -5.0,
  'CRS_ARR_TIME': 947,
  'DEP_DELAY': 3.0,
  'DEST': 'CHS',
  'FL_DOW': 1,
  'FL_MONTH': 6,
  'FL_YEAR': 2006,
  'ORIGIN': 'ATL',
  'UNIQUE_CARRIER': 'DL'},
 {'ARR_DELAY': 3.0,
  'CRS_ARR_TIME': 1217,
  'DEP_DELAY': -1.0,
  'DEST': 'PHX',
  'FL_DOW': 1,
  'FL_MONTH': 6,
  'FL_YEAR': 2007,
  'ORIGIN': 'ATL',
  'UNIQUE_CARRIER': 'DL'}]

In [11]:
def preprocessing_fn(inputs):
    print(inputs)
    arr_delay=tft.scale_to_0_1(inputs['ARR_DELAY'])
    res = {'ARR_DELAY': arr_delay}
    for col in ['ORIGIN', 'FL_YEAR', 'FL_MONTH', 'FL_DOW', 'UNIQUE_CARRIER', 'DEST', 'CRS_ARR_TIME', 'DEP_DELAY']:
        res[col] = tf.identity(inputs[col])
    return res

In [12]:
preprocessing_fn(records[0])

{'ORIGIN': 'ATL', 'FL_MONTH': 6, 'DEP_DELAY': 3.0, 'DEST': 'CHS', 'FL_YEAR': 2006, 'CRS_ARR_TIME': 947, 'UNIQUE_CARRIER': 'DL', 'ARR_DELAY': -5.0, 'FL_DOW': 1}


{'ARR_DELAY': <tf.Tensor 'scale_by_min_max/add:0' shape=() dtype=float32>,
 'CRS_ARR_TIME': <tf.Tensor 'Identity_6:0' shape=() dtype=int32>,
 'DEP_DELAY': <tf.Tensor 'Identity_7:0' shape=() dtype=float32>,
 'DEST': <tf.Tensor 'Identity_5:0' shape=() dtype=string>,
 'FL_DOW': <tf.Tensor 'Identity_3:0' shape=() dtype=int32>,
 'FL_MONTH': <tf.Tensor 'Identity_2:0' shape=() dtype=int32>,
 'FL_YEAR': <tf.Tensor 'Identity_1:0' shape=() dtype=int32>,
 'ORIGIN': <tf.Tensor 'Identity:0' shape=() dtype=string>,
 'UNIQUE_CARRIER': <tf.Tensor 'Identity_4:0' shape=() dtype=string>}

In [13]:
ORDERED_COLS=['ORIGIN', 'FL_YEAR', 'FL_MONTH', 'FL_DOW', 'UNIQUE_CARRIER', 'DEST', 'CRS_ARR_TIME', 'DEP_DELAY', 'ARR_DELAY']

In [14]:
in_test_mode = True
OUTPUT_DIR="./out"
job_name = 'tft_tutorial' + '-' + datetime.datetime.now().strftime('%y%m%d-%H%M%S')    
PROJECT='ticino-2018'

options = {
    'staging_location': os.path.join(OUTPUT_DIR, 'tmp', 'staging'),
    'temp_location': os.path.join(OUTPUT_DIR, 'tmp'),
    'job_name': job_name,
    'project': PROJECT,
    'max_num_workers': 24,
    'teardown_policy': 'TEARDOWN_ALWAYS',
    'no_save_main_session': True,
    'requirements_file': 'requirements.txt'
}
opts = beam.pipeline.PipelineOptions(flags=[], **options)
if in_test_mode:
    RUNNER = 'DirectRunner'
else:
    RUNNER = 'DataflowRunner'

In [15]:
import tempfile
with beam.Pipeline(RUNNER, options=opts) as p:
    with beam_impl.Context(temp_dir=tempfile.mkdtemp()):
        raw_data = p | "ReadCSV"  >> beam.io.Read(beam.io.BigQuerySource(query=query_0001, use_standard_sql=True)) 

        _ = (raw_data, raw_data_metadata)    |\
        'Decode' >> beam_impl.AnalyzeAndTransformDataset(preprocessing_fn)



{'ORIGIN': <tf.Tensor 'inputs/ORIGIN_copy:0' shape=(?,) dtype=string>, 'FL_MONTH': <tf.Tensor 'inputs/FL_MONTH_copy:0' shape=(?,) dtype=int64>, 'DEP_DELAY': <tf.Tensor 'inputs/DEP_DELAY_copy:0' shape=(?,) dtype=float32>, 'DEST': <tf.Tensor 'inputs/DEST_copy:0' shape=(?,) dtype=string>, 'FL_YEAR': <tf.Tensor 'inputs/FL_YEAR_copy:0' shape=(?,) dtype=int64>, 'CRS_ARR_TIME': <tf.Tensor 'inputs/CRS_ARR_TIME_copy:0' shape=(?,) dtype=int64>, 'UNIQUE_CARRIER': <tf.Tensor 'inputs/UNIQUE_CARRIER_copy:0' shape=(?,) dtype=string>, 'ARR_DELAY': <tf.Tensor 'inputs/ARR_DELAY_copy:0' shape=(?,) dtype=float32>, 'FL_DOW': <tf.Tensor 'inputs/FL_DOW_copy:0' shape=(?,) dtype=int64>}
INFO:tensorflow:Assets added to graph.
INFO:tensorflow:No assets to write.
INFO:tensorflow:SavedModel written to: /tmp/tmpIiYwZz/tftransform_tmp/bbe8219b408e4744a47b504e7c5de252/saved_model.pb
INFO:tensorflow:Assets added to graph.
INFO:tensorflow:No assets to write.
INFO:tensorflow:SavedModel written to: /tmp/tmpIiYwZz/tftrans

  from compiler import parse, ast, pycodegen
  pipeline.replace_all(_get_transform_overrides(pipeline.options))
ERROR:root:Exception at bundle <apache_beam.runners.direct.bundle_factory._Bundle object at 0x7f86e8927950>, due to an exception.
 Traceback (most recent call last):
  File "/home/jupyter/.local/lib/python2.7/site-packages/apache_beam/runners/direct/executor.py", line 343, in call
    finish_state)
  File "/home/jupyter/.local/lib/python2.7/site-packages/apache_beam/runners/direct/executor.py", line 383, in attempt_call
    result = evaluator.finish_bundle()
  File "/home/jupyter/.local/lib/python2.7/site-packages/apache_beam/runners/direct/transform_evaluator.py", line 309, in finish_bundle
    bundles = _read_values_to_bundles(reader)
  File "/home/jupyter/.local/lib/python2.7/site-packages/apache_beam/runners/direct/transform_evaluator.py", line 296, in _read_values_to_bundles
    read_result = [GlobalWindows.windowed_value(e) for e in reader]
  File "/home/jupyter/.local/

HttpBadRequestError: HttpError accessing <https://www.googleapis.com/bigquery/v2/projects/ticino-2018/queries/79574857421f45abb9c9f1d7c7736ac4?alt=json&maxResults=10000>: response: <{'status': '400', 'content-length': '354', 'x-xss-protection': '1; mode=block', 'x-content-type-options': 'nosniff', 'transfer-encoding': 'chunked', 'vary': 'Origin, X-Origin, Referer', 'server': 'ESF', '-content-encoding': 'gzip', 'cache-control': 'private', 'date': 'Sun, 07 Oct 2018 11:04:24 GMT', 'x-frame-options': 'SAMEORIGIN', 'content-type': 'application/json; charset=UTF-8'}>, content <{
  "error": {
    "code": 400,
    "message": "Cannot read and write in different locations: source: EU, destination: US",
    "errors": [
      {
        "message": "Cannot read and write in different locations: source: EU, destination: US",
        "domain": "global",
        "reason": "invalid"
      }
    ],
    "status": "INVALID_ARGUMENT"
  }
}
>

In [36]:
import tempfile
with beam.Pipeline() as p:
    with beam_impl.Context(temp_dir=tempfile.mkdtemp()):
        converter = tft.coders.CsvCoder(ORDERED_COLS, raw_data_metadata.schema)
        raw_data = p | "ReadCSV"  >> beam.io.ReadFromText('atl_june_46.csv') 

        _ = (raw_data, raw_data_metadata)    |\
        'Decode' >> beam_impl.AnalyzeAndTransformDataset(preprocessing_fn)



{'ORIGIN': <tf.Tensor 'inputs/ORIGIN_copy:0' shape=(?,) dtype=string>, 'FL_MONTH': <tf.Tensor 'inputs/FL_MONTH_copy:0' shape=(?,) dtype=int64>, 'DEP_DELAY': <tf.Tensor 'inputs/DEP_DELAY_copy:0' shape=(?,) dtype=float32>, 'DEST': <tf.Tensor 'inputs/DEST_copy:0' shape=(?,) dtype=string>, 'FL_YEAR': <tf.Tensor 'inputs/FL_YEAR_copy:0' shape=(?,) dtype=int64>, 'CRS_ARR_TIME': <tf.Tensor 'inputs/CRS_ARR_TIME_copy:0' shape=(?,) dtype=int64>, 'UNIQUE_CARRIER': <tf.Tensor 'inputs/UNIQUE_CARRIER_copy:0' shape=(?,) dtype=string>, 'ARR_DELAY': <tf.Tensor 'inputs/ARR_DELAY_copy:0' shape=(?,) dtype=float32>, 'FL_DOW': <tf.Tensor 'inputs/FL_DOW_copy:0' shape=(?,) dtype=int64>}
INFO:tensorflow:Assets added to graph.


INFO:tensorflow:Assets added to graph.


INFO:tensorflow:No assets to write.


INFO:tensorflow:No assets to write.


INFO:tensorflow:SavedModel written to: /tmp/tmpJtWJXx/tftransform_tmp/59075e3c83b845f5ae62b3bf22a76e43/saved_model.pb


INFO:tensorflow:SavedModel written to: /tmp/tmpJtWJXx/tftransform_tmp/59075e3c83b845f5ae62b3bf22a76e43/saved_model.pb


INFO:tensorflow:Assets added to graph.


INFO:tensorflow:Assets added to graph.


INFO:tensorflow:No assets to write.


INFO:tensorflow:No assets to write.


INFO:tensorflow:SavedModel written to: /tmp/tmpJtWJXx/tftransform_tmp/8b9b6648e9734f5a99e73822cc2b2413/saved_model.pb


INFO:tensorflow:SavedModel written to: /tmp/tmpJtWJXx/tftransform_tmp/8b9b6648e9734f5a99e73822cc2b2413/saved_model.pb


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


TypeError: string indices must be integers [while running 'Decode/AnalyzeDataset/RunPhase[0]/ComputeAnalyzerInputs']

In [30]:
res

'DONE'

In [None]:
transformed_dataseet, transform_fn = 