# Just Run All! 

In [1]:
from __future__ import absolute_import, division, print_function, unicode_literals

import pathlib

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

In [2]:
import tensorflow as tf

from tensorflow import keras
from tensorflow.keras import layers

# Report which TensorFlow version this kernel is running.
print(tf.__version__)

2.0.0


In [3]:
import tensorflow_docs as tfdocs
import tensorflow_docs.plots
import tensorflow_docs.modeling

# Import data
Note: please put train.csv and test.csv in the same directory as this IPython notebook.

Suggestion: 

Perhaps we can use scikit-learn to port our data pre-processing over to Python?

In [4]:
# Load the training and test tables from the CSV files that sit next to this
# notebook (training file first, then the test file).
train_dataset, test_dataset = map(pd.read_csv, ('train.csv', 'test.csv'))

In [5]:
# Preview the last rows of the raw training table (label column 'T+8' still included).
train_dataset.tail()

Unnamed: 0,T+8,T+0,T-1_T-4_DIFF_MEAN,T-9_T-12_DIFF_MEAN,T-1_T-4_DIFF_DIFF_MEAN,T-1_T-4_MEAN,T-5_T-8_MEAN,T-9_T-12_MEAN,POPULATION,RAIN_2_MAX,...,TEMP_2_-8_-12_MIN,TEMP_2_-8_-12_MEAN,TEMP_3_-4_-7_MIN,TEMP_3_-4_-7_MEAN,TEMP_3_-8_-12_MIN,TEMP_3_-8_-12_MEAN,TEMP_4_-4_-7_MIN,TEMP_4_-4_-7_MEAN,TEMP_4_-8_-12_MIN,TEMP_4_-8_-12_MEAN
320,-0.168397,-0.141869,0.006151,0.009227,-0.004614,-0.161476,-0.133506,-0.140427,0.312796,0.306902,...,-0.072614,0.080083,0.141365,0.226737,-0.180771,-0.02735,0.143312,0.228186,-0.17963,-0.027677
321,-0.155709,-0.124567,0.004998,0.009612,0.008651,-0.153691,-0.148789,-0.129469,0.312996,0.306902,...,-0.116183,0.052282,-0.028383,0.1843,-0.180771,-0.006103,-0.025794,0.185909,-0.17963,-0.00589
322,-0.141869,-0.10496,0.012303,0.002691,0.00519,-0.145617,-0.155709,-0.126586,0.313939,0.306902,...,-0.184647,0.029876,-0.141916,0.075416,-0.180771,0.055756,-0.141782,0.077532,-0.17963,0.056997
323,-0.158016,-0.122261,0.016532,-0.007305,0.00346,-0.131488,-0.158593,-0.129758,0.315006,0.420707,...,-0.184647,0.019087,-0.141916,0.114309,-0.180771,0.095254,-0.141782,0.115891,-0.17963,0.096737
324,-0.155709,-0.138408,0.006536,-0.010381,-0.017301,-0.123414,-0.161476,-0.133506,0.316072,0.420707,...,-0.184647,-0.079668,-0.141916,0.058524,-0.033744,0.174641,-0.141782,0.059241,-0.026822,0.177184


In [6]:
# Split the target from the features: the label is the 'T+8' column.
# DataFrame.pop removes the column in place, so both frames hold only
# feature columns afterwards (re-running this cell without reloading the
# CSVs raises KeyError because the column is already gone).
train_labels, test_labels = (frame.pop('T+8') for frame in (train_dataset, test_dataset))

In [7]:
# The data was already cleaned and normalized upstream (by MOPIKO), so no
# further scaling is applied here: the "normed_*" names simply refer to the
# loaded feature frames (same objects, not copies).
#
# Disabled z-score normalization, kept for reference. It relies on a
# `train_stats` table that is not defined in the cells shown here:
#   def norm(x):
#     return (x - train_stats['mean']) / train_stats['std']
#   normed_train_data = norm(train_dataset)
#   normed_test_data = norm(test_dataset)
normed_train_data, normed_test_data = train_dataset, test_dataset

In [8]:
# Preview the training frame again: after the pop above, 'T+8' is no longer a column.
train_dataset.tail()

Unnamed: 0,T+0,T-1_T-4_DIFF_MEAN,T-9_T-12_DIFF_MEAN,T-1_T-4_DIFF_DIFF_MEAN,T-1_T-4_MEAN,T-5_T-8_MEAN,T-9_T-12_MEAN,POPULATION,RAIN_2_MAX,RAIN_2_MEAN,...,TEMP_2_-8_-12_MIN,TEMP_2_-8_-12_MEAN,TEMP_3_-4_-7_MIN,TEMP_3_-4_-7_MEAN,TEMP_3_-8_-12_MIN,TEMP_3_-8_-12_MEAN,TEMP_4_-4_-7_MIN,TEMP_4_-4_-7_MEAN,TEMP_4_-8_-12_MIN,TEMP_4_-8_-12_MEAN
320,-0.141869,0.006151,0.009227,-0.004614,-0.161476,-0.133506,-0.140427,0.312796,0.306902,0.046323,...,-0.072614,0.080083,0.141365,0.226737,-0.180771,-0.02735,0.143312,0.228186,-0.17963,-0.027677
321,-0.124567,0.004998,0.009612,0.008651,-0.153691,-0.148789,-0.129469,0.312996,0.306902,0.049712,...,-0.116183,0.052282,-0.028383,0.1843,-0.180771,-0.006103,-0.025794,0.185909,-0.17963,-0.00589
322,-0.10496,0.012303,0.002691,0.00519,-0.145617,-0.155709,-0.126586,0.313939,0.306902,0.127465,...,-0.184647,0.029876,-0.141916,0.075416,-0.180771,0.055756,-0.141782,0.077532,-0.17963,0.056997
323,-0.122261,0.016532,-0.007305,0.00346,-0.131488,-0.158593,-0.129758,0.315006,0.420707,0.155916,...,-0.184647,0.019087,-0.141916,0.114309,-0.180771,0.095254,-0.141782,0.115891,-0.17963,0.096737
324,-0.138408,0.006536,-0.010381,-0.017301,-0.123414,-0.161476,-0.133506,0.316072,0.420707,0.139071,...,-0.184647,-0.079668,-0.141916,0.058524,-0.033744,0.174641,-0.141782,0.059241,-0.026822,0.177184


In [9]:
# Display the extracted training targets (the 'T+8' series).
train_labels

0     -0.129181
1     -0.115340
2     -0.124567
3     -0.130334
4     -0.140715
         ...   
320   -0.168397
321   -0.155709
322   -0.141869
323   -0.158016
324   -0.155709
Name: T+8, Length: 325, dtype: float64

# Building the model


## Loss
This defines the loss function. 
A Keras loss function only receives yTrue (the actual T+8) and yPred (the T+8 predicted by the model) as arguments. 
Because of that, we cannot use MSE(yPredicted + Persistence, yTrue) as the loss the way autocaffe does; the custom loss below is a plain RMSE instead. 

In [10]:
def customLoss(yTrue, yPred):
    """Root-mean-squared error (RMSE) between predictions and targets.

    Args:
        yTrue: tensor of ground-truth values supplied by Keras.
        yPred: tensor of model outputs supplied by Keras.

    Returns:
        A scalar tensor: sqrt(mean((yPred - yTrue) ** 2)).
    """
    backend = keras.backend
    squared_error = backend.square(yPred - yTrue)
    return backend.sqrt(backend.mean(squared_error))


## Model Network Definition
This section builds the model. 
It is a very naive and simple fully connected network. 
Suggestions: 
- Perhaps we could build an LSTM network following the Keras docs? I'm not sure how to do that yet. 

In [37]:
def build_model():
    """Build and compile the fully connected regression network.

    The network stacks Dense layers of 64 and 48 sigmoid units, 24 ReLU
    units and a single linear output unit. The input width is taken from the
    module-level ``train_dataset`` (one input per column), so that frame must
    hold only feature columns when this function is called.

    Returns:
        A compiled ``keras.Sequential`` model that minimizes ``customLoss``
        with the 'Adam' optimizer and also tracks the 'mae' and 'mse' metrics.
    """
    n_features = len(train_dataset.keys())

    model = keras.Sequential()
    model.add(layers.Dense(64, activation='sigmoid', input_shape=[n_features]))
    model.add(layers.Dense(48, activation='sigmoid'))
    model.add(layers.Dense(24, activation='relu'))
    model.add(layers.Dense(1, activation='linear'))

    # Disabled alternative optimizer configuration, kept for reference:
    # optimizer = keras.optimizers.Adam(learning_rate=0.0001, beta_1=0.99, beta_2=0.999, epsilon=0, amsgrad=False)
    model.compile(
        loss=customLoss,
        optimizer='Adam',
        metrics=['mae', 'mse'],
    )
    return model

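Uncomment the lines below to enable early stopping. Note that they use `EPOCHS`, which is only defined in the "Actual Training" cell further down, so define it first. In practice I just watch the test loss graph and adjust `EPOCHS` accordingly.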

In [38]:
# Instantiate a fresh, untrained network.
model = build_model()

# Optional early stopping (disabled). The patience parameter is the number of
# epochs to wait for an improvement in `val_loss` before training is stopped.
# NOTE: the fit call below uses EPOCHS, which is only assigned in the
# "Actual Training" cell further down; define it first if you uncomment this.
# early_stop = keras.callbacks.EarlyStopping(monitor='val_loss', patience=50)

# early_history = model.fit(normed_train_data, train_labels, 
#                    epochs=EPOCHS, validation_split = 0.2, verbose=0, 
#                    callbacks=[early_stop, tfdocs.modeling.EpochDots()])

## Early Model Check 
This segment checks that the still-untrained model can produce predictions for the first 10 training instances - to see if the model is functional without running the full training. 

In [39]:
# Smoke test: run the freshly built (still untrained) model on the first ten
# training rows to confirm that a forward pass works and yields one
# prediction per row.
example_batch = normed_train_data.iloc[:10]
example_result = model.predict(example_batch)
example_result



array([[0.31717172],
       [0.314717  ],
       [0.31479815],
       [0.31945243],
       [0.3193472 ],
       [0.31732133],
       [0.3135005 ],
       [0.33237946],
       [0.32971108],
       [0.33227757]], dtype=float32)

# Actual Training

In [None]:
EPOCHS = 500  # number of passes over the training data

# Train silently (verbose=0) and let EpochDots report progress: it prints the
# metrics every 100 epochs and one dot per epoch in between. 15% of the
# training data is reserved for validation.
history = model.fit(
    x=normed_train_data,
    y=train_labels,
    epochs=EPOCHS,
    validation_split=0.15,
    verbose=0,
    # To enable early stopping, use instead:
    # callbacks=[early_stop, tfdocs.modeling.EpochDots()]
    callbacks=[tfdocs.modeling.EpochDots()],
)



Epoch: 0, loss:0.2262,  mae:0.1809,  mse:0.0530,  val_loss:0.1944,  val_mae:0.1938,  val_mse:0.0378,  
....................................................................................................
Epoch: 100, loss:0.1316,  mae:0.0983,  mse:0.0177,  val_loss:0.0670,  val_mae:0.0551,  val_mse:0.0045,  
....................................................................................................
Epoch: 200, loss:0.1011,  mae:0.0751,  mse:0.0105,  val_loss:0.0487,  val_mae:0.0388,  val_mse:0.0024,  
....................................................................................................
Epoch: 300, loss:0.0882,  mae:0.0656,  mse:0.0079,  val_loss:0.0478,  val_mae:0.0411,  val_mse:0.0024,  
.........................................................

In [None]:
# Collect the per-epoch metrics into a table, append the epoch number as an
# extra column, and show the last few epochs.
hist = pd.DataFrame(history.history).assign(epoch=history.epoch)
hist.tail()

In [None]:
# Plot helper for training histories, configured with smoothing_std=2.
plotter = tfdocs.plots.HistoryPlotter(smoothing_std=2)

# Test Loss Graph

In [None]:
# Plot the MSE history of the run (labelled 'Basic') and zoom the y-axis to
# the range where the curves actually live.
plotter.plot({'Basic': history}, metric="mse")
plt.gca().set_ylim([0, 0.04])
plt.gca().set_ylabel('MSE [T+8^2]')

# Results

In [None]:
# Score the trained model on the test set. The three values are unpacked as
# the compiled loss followed by the 'mae' and 'mse' metrics.
loss, mae, mse = model.evaluate(normed_test_data, test_labels, verbose=2)

# NOTE(review): MSE is in squared T+8 units, although the message labels it "T+8".
print("Testing set MSE: {:5.10f} T+8".format(mse))

# Making Predictions

## scatter plot to test predictions

In [None]:
# Predict on the test features and flatten the model output to a 1-D array.
test_predictions = model.predict(normed_test_data).flatten()

# Predicted vs. true T+8 on square axes: points on the diagonal are perfect
# predictions.
lims = [-0.2, 0.8]
a = plt.axes(aspect='equal')
plt.scatter(test_labels, test_predictions)
plt.xlim(lims)
plt.ylim(lims)
plt.xlabel('True Values [T+8]')
plt.ylabel('Predictions [T+8]')
_ = plt.plot(lims, lims)  # y = x reference line


## Actual (blue) vs Model (orange) on training data 

In [None]:
# First line: actual T+8 values of the training set.
plt.plot(train_labels)
# Second line: the model's predictions for the same training rows, flattened to 1-D.
plt.plot(model.predict(normed_train_data).flatten())

## Actual (blue) vs Model (orange) on test data

In [None]:
# First line: actual T+8 values of the test set.
plt.plot(test_labels)
# Second line: the test-set predictions computed in the scatter-plot cell above.
plt.plot(test_predictions)

In [None]:
# Histogram of the signed residuals (prediction minus actual) on the test set.
error = test_predictions - test_labels
plt.hist(error, bins=25)
# Label corrected from "[T=8]": the target is named "T+8" on every other plot.
plt.xlabel("Prediction Error [T+8]")
_ = plt.ylabel("Count")

## Lagged Correlation plot

In [None]:
# Autocorrelation of the test-set predictions at different lags.
# NOTE(review): this correlates the predictions with themselves; if the intent
# is predictions vs. actual values at a lag, plt.xcorr(test_labels,
# test_predictions) would be the call - confirm.
plt.acorr(test_predictions)