# Building Neural Networks with TensorFlow Estimator API

## Imports

In [1]:
import tensorflow as tf
import pandas as pd

## Load Data

In [2]:
%%bigquery flights_df --project tensorflow-ml-course --verbose

-- Modelling table: one row per flight in January 2009 that departed late
-- (departure_delay > 0), with the input features and the binary target.
-- The result is stored in the pandas DataFrame `flights_df`.
SELECT 

  -- Departure delay
  departure_delay,
    
  -- Distance (computed in the inner query from the airport coordinates)
  distance,

  -- Airlines
  airline,
    
  -- Airports 
  departure_airport,
  arrival_airport, 

  -- Date information, cast to STRING so the values can be matched against
  -- string vocabularies when they are used as categorical features
  CAST(EXTRACT(DAYOFWEEK FROM departure_date) AS STRING) as departure_weekday,
  CAST(EXTRACT(MONTH FROM departure_date) AS STRING) as departure_month,

  -- Target column: 1 when arrival_delay is at least 15, otherwise 0
  -- (the unit is presumably minutes -- confirm against the table schema)
  CASE WHEN (arrival_delay >= 15) THEN 1 ELSE 0 END as delayed
  
  FROM ( 
    
    -- Inner Query: selects the raw columns and derives distance / departure_date
    SELECT
      
      departure_delay,
      -- Distance between the departure and arrival coordinates, divided by 1000
      -- and rounded (ST_DISTANCE is documented to return meters, so presumably km)
      ROUND(ST_DISTANCE(ST_GEOGPOINT(departure_lon, departure_lat), ST_GEOGPOINT(arrival_lon, arrival_lat))/1000) as distance,
      airline,
      arrival_airport,
      departure_airport,
      -- `date` is handled as a 'YYYY-MM-DD' string; parse it so EXTRACT can be applied
      PARSE_DATE("%Y-%m-%d", date) AS departure_date,
      
      arrival_delay
      
      
    FROM
      `bigquery-samples.airline_ontime_data.flights`
    -- Keep January 2009 only, and only flights that left late
    WHERE date >= '2009-01-01' 
    AND date <= '2009-01-31'
    AND departure_delay > 0
    
  )


Executing query with job ID: 0728281a-f017-4fe8-bc8c-4d69b962fb41
Query executing: 1.16s
Query complete after 1.48s


In [3]:
%%bigquery high_traffic_airports --project tensorflow-ml-course --verbose

-- Airports with more than 10,000 departures in 2009, sorted by airport code.
-- The result is later used as the vocabulary for the airport features, so its
-- row order defines the one-hot positions. The ORDER BY is therefore applied
-- to the final result (an ORDER BY inside a subquery does not guarantee the
-- order of the outer query's output), and the count filter is expressed with
-- HAVING instead of wrapping the aggregation in a subquery.
SELECT
  departure_airport AS airport_code,
  COUNT(*) AS flights

FROM
  `bigquery-samples.airline_ontime_data.flights`

WHERE date >= '2009-01-01'
  AND date <= '2009-12-31'

GROUP BY departure_airport
HAVING COUNT(*) > 10000
ORDER BY airport_code

Executing query with job ID: 3d6caff1-a3b7-4c0c-bb52-eb6c4af5edb2
Query executing: 0.86s
Query complete after 1.17s


In [4]:
%%bigquery airline_codes --project tensorflow-ml-course --verbose

-- Every airline code that appears in the 2009 flight data, sorted
-- alphabetically; stored in the pandas DataFrame `airline_codes`.
SELECT DISTINCT
  airline

FROM
  `bigquery-samples.airline_ontime_data.flights`

WHERE date BETWEEN '2009-01-01' AND '2009-12-31'

ORDER BY airline


Executing query with job ID: 6f2d5976-22cd-439e-9e29-dba45406ee0f
Query executing: 0.77s
Query complete after 1.05s


In [5]:
# Sanity check: (rows, columns) of the query result loaded into flights_df.
flights_df.shape

(188843, 8)

In [6]:
# Preview five random rows; the fixed seed keeps the displayed sample
# identical across "Restart & Run All" executions.
flights_df.sample(n=5, random_state=123)

Unnamed: 0,departure_delay,distance,airline,departure_airport,arrival_airport,departure_weekday,departure_month,delayed
13106,34.0,1537.0,AS,SEA,LAX,6,1,1
107040,11.0,376.0,NW,DTW,ORD,2,1,0
125472,2.0,1552.0,CO,FLL,IAH,5,1,0
12684,4.0,176.0,MQ,SAN,LAX,3,1,0
150694,13.0,572.0,DL,ATL,RDU,7,1,0


## Data Preprocessing

### Training-Testing-Split

In [7]:
# Reproducible 80/20 split: draw the training rows at random with a fixed
# seed, then use every row that was not drawn as the test set.
train_df = flights_df.sample(frac=0.8, random_state=123)
test_df = flights_df.drop(index=train_df.index)

In [8]:
# Report how many rows ended up in each split.
print(train_df.shape[0], 'train examples')
print(test_df.shape[0], 'test examples')

151074 train examples
37769 test examples


#### Check Label distribution

In [9]:
# Share of delayed flights (mean of the 0/1 `delayed` label) in the full data
# and in each split; similar values mean the random split kept the class balance.
# Each line is labelled with the data set it actually describes, and the
# percentage is formatted to one decimal instead of printing float artifacts.
for split_name, split_df in (('total', flights_df),
                             ('training', train_df),
                             ('test', test_df)):
    print('{:.1f} % delay in {} dataset'.format(split_df.delayed.mean() * 100, split_name))

47.599999999999994 % delay in total dataset
47.599999999999994 % delay in total dataset
47.4 % delay in total dataset


### Create input pipeline using tf.data

#### Build a tf.data.Dataset 

Create a batched `tf.data.Dataset` from a pandas DataFrame

In [10]:
# Separate the target from the inputs. `pop` removes the 'delayed' column from
# each frame in place, so afterwards train_df / test_df no longer contain the
# label (re-running this cell without re-creating the split raises a KeyError).
train_y, test_y = train_df.pop('delayed'), test_df.pop('delayed')

In [11]:
# Number of examples per mini-batch; used as the default `batch_size`
# argument of input_fn.
batch_size = 256

In [12]:
def input_fn(features, labels, shuffle=True, batch_size=batch_size,
             shuffle_buffer_size=1000):
    """Build a batched tf.data.Dataset of (features, labels) pairs.

    Args:
        features: pandas DataFrame (or other mapping of column name -> values)
            holding the input columns; it is converted with ``dict(features)``.
        labels: label values aligned element-by-element with ``features``.
        shuffle: if True (training mode) the examples are shuffled and the
            dataset repeats indefinitely, so the caller must bound training
            with a step count. If False the data is served once, in order.
        batch_size: number of examples per batch.
        shuffle_buffer_size: size of the shuffle buffer used when ``shuffle``
            is True. The default of 1000 matches the previously hard-coded
            value; pass a larger value for a more thorough shuffle.

    Returns:
        A tf.data.Dataset yielding ``(feature_dict, labels)`` batches.
    """
    # from_tensor_slices converts the inputs to tensors, so no defensive copy
    # of `features` is needed.
    dataset = tf.data.Dataset.from_tensor_slices((dict(features), labels))

    # Shuffle before batching so that individual examples (not whole batches)
    # are mixed; repeat so training is limited by `steps`, not by data size.
    if shuffle:
        dataset = dataset.shuffle(shuffle_buffer_size).repeat()

    return dataset.batch(batch_size)

#### Build Features using tf.feature_column

Define vocabularies for the categorical features. This time the numeric variables are not binned, because we want the network to learn their non-linear effects on its own.

In [13]:
# Vocabularies for the categorical features.
# Airports and airlines come from the BigQuery lookup tables loaded earlier;
# weekdays and months are the string forms of 1..7 and 1..12, matching the
# STRING casts in the flights query.
airports_voc = high_traffic_airports['airport_code']
airlines_voc = airline_codes['airline']
weekdays_voc = [str(day) for day in range(1, 8)]
months_voc = [str(month) for month in range(1, 13)]

Build feature columns

In [14]:
# --- Numeric inputs ---------------------------------------------------------
distance = tf.feature_column.numeric_column("distance")
departure_delay = tf.feature_column.numeric_column("departure_delay")

# --- Categorical inputs -----------------------------------------------------
# Each categorical column maps the raw string to a vocabulary id and is then
# wrapped in an indicator column, giving the network a one-hot representation.
arrival_airports = tf.feature_column.categorical_column_with_vocabulary_list(
    'arrival_airport', airports_voc)
arrival_airports_dummy = tf.feature_column.indicator_column(arrival_airports)

departure_airports = tf.feature_column.categorical_column_with_vocabulary_list(
    'departure_airport', airports_voc)
departure_airports_dummy = tf.feature_column.indicator_column(departure_airports)

airlines = tf.feature_column.categorical_column_with_vocabulary_list(
    'airline', airlines_voc)
airlines_dummy = tf.feature_column.indicator_column(airlines)

weekdays = tf.feature_column.categorical_column_with_vocabulary_list(
    'departure_weekday', weekdays_voc)
weekdays_dummy = tf.feature_column.indicator_column(weekdays)

# The month feature is disabled: the flights query only loads January 2009,
# so departure_month is constant in this data set.
#months = tf.feature_column.categorical_column_with_vocabulary_list('departure_month', months_voc)
#months_dummy = tf.feature_column.indicator_column(months)
#feature_columns.append(months_dummy)

# Columns handed to the model, in the order they were defined above.
feature_columns = [
    distance,
    departure_delay,
    arrival_airports_dummy,
    departure_airports_dummy,
    airlines_dummy,
    weekdays_dummy,
]

In [15]:
# Take the first batch of the unshuffled training data and keep only its
# feature dict (element 0 of the (features, labels) pair).
first_batch = next(iter(input_fn(train_df, train_y, shuffle=False)))
example_batch = first_batch[0]

# Run the feature columns on that batch and show the dense vector that the
# network would receive for the first example.
feature_layer_demo = tf.keras.layers.DenseFeatures(feature_columns)
dense_example = feature_layer_demo(example_batch).numpy()
dense_example[0]

W1022 16:06:09.947477 140681593272064 deprecation.py:323] From /usr/local/lib/python3.5/dist-packages/tensorflow/python/feature_column/feature_column_v2.py:2655: add_dispatch_support.<locals>.wrapper (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
W1022 16:06:09.951304 140681593272064 deprecation.py:323] From /usr/local/lib/python3.5/dist-packages/tensorflow/python/feature_column/feature_column_v2.py:4215: IndicatorColumn._variable_shape (from tensorflow.python.feature_column.feature_column_v2) is deprecated and will be removed in a future version.
Instructions for updating:
The old _FeatureColumn APIs are being deprecated. Please use the new FeatureColumn APIs instead.
W1022 16:06:09.952152 140681593272064 deprecation.py:323] From /usr/local/lib/python3.5/dist-packages/tensorflow/python/feature_column/feature_column_v2.py:4270: VocabularyListCateg

array([0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00,
       0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00,
       0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 1.00e+00, 0.00e+00,
       0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00,
       0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00,
       0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00,
       0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00,
       0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00,
       0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 1.00e+00, 0.00e+00,
       0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00,
       0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00,
       0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00,
       0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00,
       0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00,
       0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 0.00e

## Defining the model using TF.Estimator

**Advantages of Estimator**

Estimator and Keras are both high-level APIs for model abstraction in TensorFlow. `tf.estimator` provides some capabilities that are currently not available in `tf.keras`:

- Parameter server based training
- Full TFX integration.

**Important:** Estimators require the data input pipeline to be separated from the model itself; the data is supplied through an input function rather than being part of the model definition.

**Pre-made Estimators**

- tf.estimator.DNNClassifier (Deep models with multi-class classification) 
- tf.estimator.LinearClassifier (Classifiers based on linear models)
- tf.estimator.LinearRegressor (Linear regression problems)
- tf.estimator.DNNLinearCombinedClassifier (for wide & deep models)
- ...


In [19]:
# Build an ANN with 2 hidden layers with 10 hidden nodes each.
#
# A fixed tf_random_seed is passed through RunConfig so that the stochastic
# parts of training (e.g. weight initialisation) are seeded and repeated runs
# are comparable.
#
# NOTE: the Estimator writes checkpoints to model_dir and resumes from the
# latest checkpoint it finds there. Delete 'logs/DNN/' before re-running the
# notebook if training should start from scratch.
classifier = tf.estimator.DNNClassifier(
    feature_columns=feature_columns,
    # Two hidden layers of 10 nodes each.
    hidden_units=[10, 10],
    # The model must choose between 2 classes.
    n_classes=2,
    optimizer='Adam',
    model_dir='logs/DNN/',
    config=tf.estimator.RunConfig(tf_random_seed=123))

# By default loss is calculated using softmax cross entropy for the DNNClassifier class.

In [17]:
# Train the model. The training input repeats indefinitely, so the run is
# bounded by the number of steps (one step = one mini-batch).
def train_input_fn():
    """Shuffled, repeating batches of the training split."""
    return input_fn(train_df, train_y, shuffle=True)


classifier.train(input_fn=train_input_fn, steps=50000)

W1022 16:06:09.992830 140681593272064 deprecation.py:323] From /usr/local/lib/python3.5/dist-packages/tensorflow/python/training/training_util.py:236: Variable.initialized_value (from tensorflow.python.ops.variables) is deprecated and will be removed in a future version.
Instructions for updating:
Use Variable.read_value. Variables in 2.X are initialized automatically both in eager and graph (inside tf.defun) contexts.
W1022 16:06:12.809975 140681593272064 deprecation.py:323] From /usr/local/lib/python3.5/dist-packages/tensorflow_estimator/python/estimator/head/base_head.py:574: to_float (from tensorflow.python.ops.math_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use `tf.cast` instead.


<tensorflow_estimator.python.estimator.canned.dnn.DNNClassifierV2 at 0x7ff28e0f9eb8>

### Test results

In [18]:
# Evaluate on the held-out split: a single, unshuffled pass over the test data.
def test_input_fn():
    """Ordered, non-repeating batches of the test split."""
    return input_fn(test_df, test_y, shuffle=False)


eval_result = classifier.evaluate(input_fn=test_input_fn)

test_accuracy = eval_result['accuracy']
print('\nTest set accuracy: {accuracy:0.3f}\n'.format(accuracy=test_accuracy))

W1022 16:15:53.694638 140681593272064 deprecation.py:323] From /usr/local/lib/python3.5/dist-packages/tensorflow/python/training/saver.py:1276: checkpoint_exists (from tensorflow.python.training.checkpoint_management) is deprecated and will be removed in a future version.
Instructions for updating:
Use standard file APIs to check for files with this prefix.



Test set accuracy: 0.837

