# Monitoring Training Progress with TensorBoard

## Project Setup

In [18]:
import datetime
import os
import time

import numpy as np
import pandas as pd
import tensorflow as tf

In [19]:
# Display the TensorFlow version this notebook runs against.
tf.__version__

'2.0.0-beta1'

## Staging Data

In [20]:
%%bigquery flights_df --project tensorflow-ml-course --verbose

-- Builds the modelling table: one row per sampled 2009 flight that departed late,
-- with a binary "delayed" target derived from the arrival delay.
SELECT 

  -- Departure delay (the inner query keeps only rows where it is > 0)
  departure_delay,
    
  -- Distance between the departure and arrival airports, rounded to whole km
  distance,

  -- Airlines
  airline,
    
  -- Airports 
  departure_airport,
  arrival_airport, 

  -- Date information, cast to STRING
  -- (presumably so weekday/month can be used as categorical features -- TODO confirm)
  CAST(EXTRACT(DAYOFWEEK FROM departure_date) AS STRING) as departure_weekday,
  CAST(EXTRACT(MONTH FROM departure_date) AS STRING) as departure_month,

  -- Target column: 1 when arrival_delay >= 15, otherwise 0.
  -- A NULL arrival_delay also falls through to ELSE and is labelled 0.
  CASE WHEN (arrival_delay >= 15) THEN 1 ELSE 0 END as delayed
  
  FROM ( 
    
    -- Inner query: derives distance and a DATE-typed departure_date from the raw table
    SELECT
      -- ST_DISTANCE returns metres; dividing by 1000 converts to kilometres
      ROUND(ST_DISTANCE(ST_GEOGPOINT(departure_lon, departure_lat), ST_GEOGPOINT(arrival_lon, arrival_lat))/1000) as distance,
      departure_delay,
      arrival_delay,
      PARSE_DATE("%Y-%m-%d", date) AS departure_date,
      airline,
      departure_airport,
      arrival_airport
      
    FROM
      `bigquery-samples.airline_ontime_data.flights`
    -- Keep flights dated within 2009 that departed late, then sample roughly 30% of them.
    -- NOTE(review): RAND() is not seeded, so every execution returns a different sample
    -- and downstream row counts / metrics will not reproduce exactly.
    WHERE date >= '2009-01-01' 
    AND date <= '2009-12-31'
    AND departure_delay > 0
    AND RAND() < 0.3
  )


Executing query with job ID: a0c57ba3-ab4f-4e38-a5a6-96dfd331649c
Query executing: 3.94s
Query complete after 4.72s


## Data Preprocessing

### Training-Testing-Split

In [21]:
# Hold out 20% of the flights for evaluation; the fixed seed makes the split
# repeatable for a given flights_df.
train_df = flights_df.sample(frac=0.8, random_state=123)

# Everything that was not sampled into the training set becomes the test set.
in_train = flights_df.index.isin(train_df.index)
test_df = flights_df.loc[~in_train]

In [22]:
# Report how many rows ended up in each split.
print('{} train examples'.format(len(train_df)))
print('{} test examples'.format(len(test_df)))

552575 train examples
138144 test examples


#### Check Label distribution

In [23]:
# Share of delayed flights (label == 1) in the full data and in each split.
# Similar percentages mean the random split kept the label distribution intact.
# Scaling to percent before rounding avoids float artefacts such as 45.300000000000004.
print(round(flights_df.delayed.mean() * 100, 1), '% delay in total dataset')
print(round(train_df.delayed.mean() * 100, 1), '% delay in training dataset')
print(round(test_df.delayed.mean() * 100, 1), '% delay in test dataset')

45.1 % delay in total dataset
45.1 % delay in total dataset
45.1 % delay in total dataset


### Create input pipeline using tf.data

#### Build a tf.data.Dataset 

Create a batched `tf.data.Dataset` from a pandas DataFrame.

In [24]:
def dataframe_to_dataset(dataframe, labels = 'delayed', shuffle=True, batch_size=32):
    """Turn a pandas DataFrame into a batched ``tf.data.Dataset``.

    Args:
        dataframe: Source frame holding the feature columns and the label column.
        labels: Name of the column to split off as the label.
        shuffle: When true, shuffle the rows with a buffer as large as the
            frame before batching.
        batch_size: Number of rows per emitted batch.

    Returns:
        A dataset yielding ``(features, label)`` batches, where ``features`` is
        a dict keyed by the remaining column names.
    """
    # Work on a copy so that popping the label column leaves the caller's frame intact.
    features = dataframe.copy()
    target = features.pop(labels)

    ds = tf.data.Dataset.from_tensor_slices((dict(features), target))
    if shuffle:
        ds = ds.shuffle(buffer_size=len(features))
    return ds.batch(batch_size)

In [25]:
# Number of rows per batch, shared by the training and test datasets built below.
batch_size = 64

In [26]:
# Make Keras use float64 by default
# (presumably to match the float64 numeric columns coming from pandas -- TODO confirm).
tf.keras.backend.set_floatx('float64')

# Training batches are shuffled; test batches keep the frame's row order.
train_ds = dataframe_to_dataset(train_df, shuffle=True, batch_size=batch_size)
test_ds = dataframe_to_dataset(test_df, shuffle=False, batch_size=batch_size)

In [27]:
# Inspect the element structure (shapes and dtypes) of the training dataset.
train_ds

<BatchDataset shapes: ({arrival_airport: (None,), departure_delay: (None,), departure_month: (None,), departure_weekday: (None,), distance: (None,), departure_airport: (None,), airline: (None,)}, (None,)), types: ({arrival_airport: tf.string, departure_delay: tf.float64, departure_month: tf.string, departure_weekday: tf.string, distance: tf.float64, departure_airport: tf.string, airline: tf.string}, tf.int32)>

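Each element of the dataset is a `(features, labels)` pair: `features` is a dictionary that maps column names (from the dataframe) to a batch of the corresponding column values, and `labels` holds the matching batch of `delayed` values.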

#### Build Features using tf.feature_column

#### Setting Bins for numeric and vocabularies for categorical variables

In [28]:
# Bucket boundaries passed to tf.feature_column.bucketized_column below;
# n boundaries produce n + 1 buckets.
# Departure delay: 9 boundaries -> 10 buckets (units presumably minutes -- TODO confirm).
departure_delay_bins = [2, 3, 6, 9, 13, 19, 28, 44, 76]
# Distance in km (the query divides ST_DISTANCE by 1000): 2 boundaries -> 3 buckets.
distance_bins = [600, 1200]

#### Build the input pipeline

In [29]:
# Both numeric inputs are bucketized using the boundaries defined above.
distance = tf.feature_column.numeric_column("distance")
distance_buckets = tf.feature_column.bucketized_column(distance, boundaries=distance_bins)

departure_delay = tf.feature_column.numeric_column("departure_delay")
departure_delay_buckets = tf.feature_column.bucketized_column(
    departure_delay, boundaries=departure_delay_bins
)

# Columns handed to the model's feature layer: distance first, then departure delay.
feature_columns = [distance_buckets, departure_delay_buckets]

## Defining our model

### Define the feature layer

In [30]:
# Input layer that converts each feature dict into a dense tensor according to feature_columns.
feature_layer = tf.keras.layers.DenseFeatures(feature_columns)

### Build the model

#### Non-distributed model

#### Parameters for TensorBoard

In [31]:
# Give every run its own timestamped sub-directory under "logs" so that
# earlier TensorBoard logs are not overwritten by a new run.
# (datetime and os are imported in the notebook's import cell at the top.)
log_dir = os.path.join("logs", datetime.datetime.now().strftime("%Y%m%d-%H%M%S"))

In [32]:
# Show the directory the TensorBoard callback will write to.
log_dir

'logs/20190904-210346'

In [33]:
# Single sigmoid unit on top of the bucketized features, i.e. an
# L2-regularised logistic regression for the binary "delayed" label.
model_normal = tf.keras.models.Sequential()
model_normal.add(feature_layer)
model_normal.add(
    tf.keras.layers.Dense(
        1,
        activation='sigmoid',
        kernel_regularizer=tf.keras.regularizers.l2(0.0001)
    )
)

# Binary cross-entropy matches the 0/1 target; accuracy is tracked as the metric.
model_normal.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy']
)

#### Normal Training

In [34]:
# Time the whole fit call, including TensorBoard callback construction.
start_time = time.time()

# Log to log_dir: histograms every epoch, metrics after every batch.
tensorboard_callback = tf.keras.callbacks.TensorBoard(
    log_dir, histogram_freq=1, update_freq='batch'
)

history = model_normal.fit(
    train_ds,
    validation_data=test_ds,
    epochs=10,
    callbacks=[tensorboard_callback]
)
print("Normal training took: {}".format(time.time() - start_time))



Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Normal training took: 1346.3080098628998
