# Imports

In [1]:
from collections import defaultdict

import IPython
import IPython.display
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow as tf
import tensorflow_probability as tfp
import keras_tuner as kt

mpl.rcParams['figure.figsize'] = (8, 6)
mpl.rcParams['axes.grid'] = False

# Function definitions

In [2]:
#new

def split_ts_data(data, val_start, test_start):
    year_min = min(data['Year'])
    year_max = max(data['Year'])
    year_range = year_max-year_min
    
    assert (val_start >= year_min) & (test_start >= year_min) & (val_start <= year_max) & (test_start <= year_max), "Parameter out of bounds"
    assert (val_start > year_min) & (test_start > year_min), "Training set is empty."
    assert val_start < test_start, "Validation set is empty."
    assert year_range > 0, "Data contains less than 2 years."
    
    
    train_data = data[(data['Year']<val_start) & (data['Year']<test_start)]
    val_data = data[(data['Year']>=val_start) & (data['Year']<test_start)]
    test_data = data[data['Year']>=test_start]
    
    return train_data, val_data, test_data

In [3]:
def make_dataset(df, input_width, label_width, shift):
    def create_window(tensor):
        #input -> length of time series used for training
        #shift -> how far off prediction is from last input
        #label -> points to predict
        total_window_size = input_width + shift
        label_start = total_window_size - label_width

        input_bounds = slice(0, input_width)
        label_bounds = slice(label_start, None)

        inputs = tensor[:,input_bounds,:]
        labels = tensor[:,label_bounds,:]

        inputs.set_shape([None, input_width, None])
        labels.set_shape([None, label_width, None])

        return inputs, labels
    
    total_window_size = input_width + shift
    
    arr = np.array(df, dtype=np.float32)
    ds = tf.keras.utils.timeseries_dataset_from_array(
      data=arr,
      targets=None,
      sequence_length=total_window_size,
      sequence_stride=1,
      shuffle=False,
      batch_size=32,)
    
    ds = ds.map(create_window)
    
    return ds

In [4]:
def compile_and_fit(model, epochs, input_optimizer='adam', input_loss='mse'):
    model.compile(optimizer=input_optimizer, loss=input_loss)
    history = model.fit(x=train_ds, epochs=epochs, validation_data=val_ds)
    
    return history

In [5]:
def col_dict(np_df):
    return_dict = {col:index for index, col in enumerate(np_df.columns)}
    
    return return_dict

In [6]:
def plot(df, ds, input_width, label_width, shift, model=None, plot_col='10101 m0.4', max_subplots=3):
    #ensure that df and ds match e.g. train_df must be accompanied by train_ds
    col_indices = col_dict(df)
    
    total_window_size = label_width + shift
    input_slice = slice(0,input_width)
    input_indices = np.arange(total_window_size)[input_slice]
    label_start = total_window_size - label_width
    labels_slice = slice(label_start, None)
    label_indices = np.arange(total_window_size)[labels_slice]
    
    inputs = next(iter(ds))[0]
    labels = next(iter(ds))[1]
    plt.figure(figsize=(12, 8))
    plot_col_index = col_indices[plot_col] 
    max_n = min(max_subplots, len(inputs))
    
    for n in range(max_n):
        plt.subplot(max_n, 1, n+1)
        plt.ylabel(plot_col)
        plt.plot(input_indices, inputs[n, :, plot_col_index],
             label='Inputs', marker='.', zorder=-10)
        
        plt.scatter(label_indices, labels[n, :, plot_col_index],
                edgecolors='k', label='Labels', c='#2ca02c', s=64)
        
        if model is not None:
          predictions = model(inputs)
          plt.scatter(label_indices, predictions[n, :, plot_col_index],
                  marker='X', edgecolors='k', label='Predictions',
                  c='#ff7f0e', s=64)
            
        if n == 0:
          plt.legend()
        
    plt.xlabel('Year')

# Main code

## Edit parameters here, but do not rename variables

## Read, preprocess data

In [7]:
raw_data = pd.read_csv('newSA3.csv')



#Parameters
validation_start = 2002
test_start = 2006
#



train_df, val_df, test_df = split_ts_data(raw_data, validation_start, test_start)

train_df = train_df[train_df.columns.difference(["Unnamed: 0","Year"])]
val_df = val_df[val_df.columns.difference(["Unnamed: 0","Year"])]
test_df = test_df[test_df.columns.difference(["Unnamed: 0","Year"])]

In [8]:
## Create datasets

In [9]:
#Parameters
input_width = 2 #data used in prediction
label_width = 1 #points to predict
shift = 1 #how many years away is the last point to predict
#



train_ds = make_dataset(train_df, input_width, label_width, shift)
val_ds = make_dataset(val_df, input_width, label_width, shift)
test_ds = make_dataset(test_df, input_width, label_width, shift)

num_cols = next(iter(train_ds))[0].shape[2]

## Create and fit model

In [10]:
class SACohortModel(kt.HyperModel):
    def build(self,hp):
        #### Hyperparameters
        # add hyperparameters as needed when adding layers
        
        ##layer hyperparameters
        hp_lstm1_units = hp.Choice('units',[10,30,50])
        hp_lstm1_act = hp.Choice('activation', ["relu"])

        ##model hyperparameters -> adjust tf.keras.models type and model.add layers
        model = tf.keras.models.Sequential()
        model.add(tf.keras.layers.LSTM(units = hp_lstm1_units, 
                                       activation=hp_lstm1_act, 
                                       return_sequences=False))
        
        model.add(tf.keras.layers.Dense(label_width * num_cols))
        model.add(tf.keras.layers.Reshape([label_width,num_cols]))
        
        ##compilation hyperparameters
        hp_epochs = hp.Choice("epochs",[10,20,30])
        hp_input_optimizer = hp.Choice('input_optimizer',["adam", "adadelta"])
        loss_fun = "mse"
        
        ####
        
        #Do not edit
        model.compile(loss = loss_fun)
        
        return model
        #Do not edit

In [11]:
#Parameter
num_epochs = 10
#

train_inputs = next(iter(train_ds))[0]
train_labels = next(iter(train_ds))[1]

val_inputs = next(iter(val_ds))[0]
val_labels = next(iter(val_ds))[1]

test_inputs = next(iter(test_ds))[0]

tuner = kt.RandomSearch(
    SACohortModel(),
    objective='val_loss',
    max_trials=5)

tuner.search(train_inputs, train_labels, epochs = num_epochs, validation_data = (train_inputs, train_labels))

INFO:tensorflow:Reloading Oracle from existing project .\untitled_project\oracle.json
INFO:tensorflow:Reloading Tuner from .\untitled_project\tuner0.json
INFO:tensorflow:Oracle triggered exit


In [12]:
for model in tuner.get_best_hyperparameters():
    print(model.values)

{'units': 30, 'activation': 'sigmoid', 'epochs': 20, 'input_optimizer': 'adam'}


## Create model with above parameters

In [13]:
model_optimizer = 'adam'
loss_fun = 'mse'

full_model = tf.keras.models.Sequential()
full_model.add(tf.keras.layers.LSTM(units = 30, 
                                       activation="relu", 
                                       return_sequences=False))
        
full_model.add(tf.keras.layers.Dense(label_width * num_cols))
full_model.add(tf.keras.layers.Reshape([label_width,num_cols]))
compile_and_fit(full_model, epochs=20, input_optimizer=model_optimizer, input_loss=loss_fun)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x1e93a0cdc70>

## Store full model prediction

In [14]:
#1991-2001

full_train_inputs = next(iter(train_ds))[0] #pairs from 1991-2000
full_train_labels = next(iter(train_ds))[1] #1993-2001
full_train_predictions = full_model(train_inputs) #1993-2001

In [16]:
#2002-2005

full_val_inputs = next(iter(val_ds))[0] #pairs 2002,2003 and 2003,2004
full_val_labels = next(iter(val_ds))[1] #2004 and 2005
full_val_predictions = full_model(val_inputs) #2004 and 2005

In [17]:
full_test_inputs = next(iter(test_ds))[0] #pairs from 2006-2010
full_test_labels = next(iter(test_ds))[1] #2008-2011
full_test_predictions = full_model(test_inputs) #2008-2011

In [None]:
train_df

In [None]:
full_train_inputs #pairs from 1991-2000

In [None]:
#2000-2001 input for predicting 2002
input_2002 = tf.stack([full_train_labels[7,0,:], full_train_labels[8,0,:]],0)
#2001-2002 input for predicting 2003
input_2003 = tf.stack([full_train_labels[8,0,:], full_val_inputs[0,0,:]],0)
#2000-2001 and 2001-2002 inputs as tensor
input_2002_2003 = tf.stack([input_2002,input_2003],0)

input_2002_2003

In [None]:
val_df

In [None]:
full_val_inputs #pairs from 2002-2004

In [None]:
#2004-2005 input for predicting 2006
input_2006 = tf.stack([full_val_labels[0,0,:],full_val_labels[1,0,:]],0)
#2005-2006 input for predicting 2007
input_2007 = tf.stack([full_val_labels[1,0,:], full_test_inputs[0,0,:]],0)
#2004-2005 and 2005-2006 inputs as tensor
input_2006_2007 = tf.stack([input_2006,input_2007],0)

input_2006_2007

In [None]:
full_test_inputs #pairs from 2006-2010

In [None]:
test_df

In [86]:
#all-in-one input

all_input = tf.concat([full_train_inputs,input_2002_2003,full_val_inputs,input_2006_2007,full_test_inputs],0)

In [87]:
#predictions for years 1993-2011

full_model(all_input) #1993-2011

<tf.Tensor: shape=(19, 1, 11700), dtype=float32, numpy=
array([[[2509.1895  , 3198.7053  ,  567.51215 , ...,  453.19766 ,
          268.1739  ,  111.13661 ]],

       [[2547.102   , 3238.1907  ,  571.01245 , ...,  460.8364  ,
          271.96707 ,  115.925186]],

       [[2582.126   , 3276.721   ,  575.42413 , ...,  467.70932 ,
          275.53125 ,  119.62692 ]],

       ...,

       [[3182.2969  , 4005.0186  ,  690.4095  , ...,  587.13934 ,
          331.17096 ,  168.4201  ]],

       [[3246.9346  , 4090.7864  ,  707.01306 , ...,  599.6581  ,
          337.07806 ,  171.4721  ]],

       [[3307.771   , 4165.9683  ,  719.47217 , ...,  612.18604 ,
          342.23685 ,  176.59187 ]]], dtype=float32)>