# Time series classification - cross validation

# Load Python packages
Import the Python packages that we will need.

In [None]:
from pathlib import Path
import time

import numpy as np
import pandas as pd
import sklearn
from sklearn.model_selection import train_test_split, RepeatedStratifiedKFold
import tensorflow as tf
import tensorflow.keras as keras
import matplotlib.pyplot as plt
import seaborn as sns

from tensorflow.keras.layers import Input, Dense, Activation, Dropout
from tensorflow.keras.models import Model

# General settings
# Use seaborn's 'whitegrid' style for the plots drawn in this notebook.
sns.set_style('whitegrid')

# User settings

In [None]:
# True: download the development dataset from GitHub.
# False: read it from the local '../../data' directory instead.
load_from_web = True

# Load the development dataset

In [None]:
# Read the development set into a 2-D array. Column 0 holds the class label
# and the remaining columns hold the time series values.
if not load_from_web:
    # Local copy of the dataset, relative to the notebook's directory.
    data_dir = '../../data'
    data_name = 'SonyAIBORobotSurface1_IoC'
    data_filename = '/'.join([data_dir, data_name, data_name+'_DEV.txt'])
    robot_data = np.loadtxt(Path(data_filename))
    print('Loaded from', data_filename)
else:
    # Tab-separated file without a header row, fetched over HTTP.
    url = 'https://raw.githubusercontent.com/Withington/deepscent/master/data/SonyAIBORobotSurface1_IoC/SonyAIBORobotSurface1_IoC_DEV.txt'
    robot_df = pd.read_csv(url, sep='\t', header=None)
    print('Loaded from', url)
    robot_data = robot_df.values

# Split labels (first column) from features (all other columns).
y_dev, x_dev = robot_data[:, 0], robot_data[:, 1:]
print('The shape of x_dev is', x_dev.shape)
print('The shape of y_dev is', y_dev.shape)

# Change from classes 1 and 2 to classes 0 and 1
# (min-max rescaling: the smallest label becomes 0 and the largest becomes 1).
y_min, y_max = y_dev.min(), y_dev.max()
y_dev = (y_dev - y_min) / (y_max - y_min)

# MLP 2
This time we will create a function that builds our model.


TODO - remove dropout at this stage and make introduction of dropout a separate exercise.

In [None]:
def build_model():
    """Create and compile a fully connected binary classifier.

    The input shape is read from the module-level ``x_dev`` array (all of its
    dimensions except the first). Three 16-unit ReLU layers feed a single
    sigmoid output unit.

    Returns:
        The compiled Keras ``Model``, using binary cross-entropy loss, an Adam
        optimiser with default settings and the accuracy metric.
    """
    inputs = Input(shape=(x_dev.shape[1:]), name='InputLayer')
    ### CHANGE PARAMETERS HERE ###
    hidden = Dense(16, activation='relu', name='Layer010Dense')(inputs)
    hidden = Dense(16, activation='relu', name='Layer020Dense')(hidden)
    hidden = Dense(16, activation='relu', name='Layer030Dense')(hidden)
    ### END OF CHANGE PARAMETERS ###
    outputs = Dense(1, activation='sigmoid', name='OutputLayer')(hidden)

    # Wire the layers into a model and compile it
    network = Model(inputs=inputs, outputs=outputs)
    network.compile(
        loss='binary_crossentropy',
        optimizer=keras.optimizers.Adam(),
        metrics=['accuracy'],
    )
    return network
    
# Build an initial model instance to check that build_model() runs.
model = build_model()

## Train the model

Train the model once to get a feel for how many epochs are needed.

In [None]:
# Hold out 100 samples for validation, keeping the class proportions of y_dev
# in both parts; the fixed random_state makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x_dev,
    y_dev,
    test_size=100,
    random_state=21,
    stratify=y_dev,
)

batch_size = 5
epochs = 50

# Start from a freshly initialised model.
model = build_model()

# Time the training run.
start = time.time()
hist = model.fit(
    x=x_train,
    y=y_train,
    batch_size=batch_size,
    epochs=epochs,
    validation_data=(x_test, y_test),
    verbose=1,
)
end = time.time()

# One row per epoch, one column per recorded loss or metric.
log = pd.DataFrame(hist.history)
print('Training complete in', round(end-start), 'seconds')

In [None]:
# Plot the training and validation loss recorded for each epoch.
log[['loss', 'val_loss']].plot()

In [None]:
# Keras records the metric requested as 'accuracy' under the key 'acc' in
# older releases and 'accuracy' in newer ones (TensorFlow 2.x). Use whichever
# key this run produced so the plot works with either version.
acc_key = 'acc' if 'acc' in log.columns else 'accuracy'
log[[acc_key, 'val_' + acc_key]].plot()

# evaluate() returns the loss followed by the compiled metrics, so index 1 is
# the accuracy on the held-out data.
result = model.evaluate(x_test, y_test, batch_size=batch_size)
print('Validation accuracy is', result[1])

# Repeated k-fold cross validation

In [None]:
### CHANGE PARAMETERS HERE ###
k = 3  # number of folds in each repeat (n_splits)
m = 5  # number of times the k-fold split is repeated (n_repeats)
batch_size = 10
epochs = 30
### END OF CHANGE PARAMETERS ###

# k * m stratified train/test splits; the fixed random_state makes them
# repeatable.
kfold = RepeatedStratifiedKFold(n_splits=k, n_repeats=m, random_state=76)
val_acc = list()
start = time.time()
for iteration, (train, test) in enumerate(kfold.split(x_dev, y_dev)):
    x_train, y_train, x_test, y_test = x_dev[train], y_dev[train], x_dev[test], y_dev[test]
    # Normalise the data. The mean and standard deviation are taken over all
    # values of the training fold only and then applied to both folds, so no
    # statistics from the test fold influence training.
    x_train_mean = x_train.mean()
    x_train_std = x_train.std()
    x_train = (x_train - x_train_mean)/(x_train_std)
    x_test = (x_test - x_train_mean)/(x_train_std)
    # Build and train a model. Clear the Keras global state first so that the
    # models created on earlier iterations do not accumulate in memory.
    keras.backend.clear_session()
    model = build_model()
    fold_start = time.time()
    hist = model.fit(x_train, y_train, batch_size=batch_size, epochs=epochs, validation_data=(x_test, y_test), verbose=1)
    fold_end = time.time()
    log = pd.DataFrame(hist.history)
    print('Training of iteration', iteration, 'complete in', round(fold_end-fold_start), 'seconds')
    # The validation accuracy key is 'val_acc' in older Keras releases and
    # 'val_accuracy' in newer ones (TensorFlow 2.x); accept either.
    val_acc_key = 'val_acc' if 'val_acc' in log.columns else 'val_accuracy'
    # Keep the validation accuracy reached after the final epoch.
    val_acc.append(log.iloc[-1][val_acc_key])

end = time.time()
# Number of completed iterations.
count = len(val_acc)
# The results column is always called 'val_acc', whatever key Keras used.
val_acc = pd.DataFrame(val_acc, columns=['val_acc'])

In [None]:
# Show the final validation accuracy of every fold, then the total run time.
print(val_acc)
# Format k and '-fold' together so the output reads "3-fold", not "3 -fold".
print('{} repeats of {}-fold cross validation completed in {} seconds'.format(m, k, round(end-start)))

## Plot the k-fold cross validation results

In [None]:
# Box plot of the per-fold validation accuracies, with each individual fold
# drawn on the same axes as a black point.
ax = sns.boxplot(data=val_acc)
ax = sns.swarmplot(data=val_acc, color='black', ax=ax)

# Summary statistics over all folds (pandas std() is the sample standard
# deviation).
fold_accuracy = val_acc['val_acc']
print('Validation accuracy mean and sample standard deviation', fold_accuracy.mean(), fold_accuracy.std())

# Speed - GPU
Using a GPU can speed up calculations. However, transferring the data to the GPU takes time, so small models or small batches may see little or no speed-up.

You are more likely to see a speed-up if the batch size is large. As you increase the batch size, check that the validation accuracy does not deteriorate.

To use a GPU in Colab, select Edit > Notebook settings and then set Hardware accelerator to GPU.

In [None]:
# Check to see if you are using a GPU.
# gpu_device_name() returns an empty string when no GPU is available, so
# report exactly one of the two outcomes.
device_name = tf.test.gpu_device_name()
if device_name:
    print('Found a GPU at: {}'.format(device_name))
else:
    print('GPU device not found')

# Generalisation - dropout
Try adding dropout layers to your model. An example of such a model is given below.

In [None]:
def build_model_with_dropout():
    """Create and compile a binary classifier with dropout regularisation.

    The input shape is read from the module-level ``x_dev`` array (all of its
    dimensions except the first). Dropout is applied to the input (rate 0.1)
    and after each of the three 16-unit ReLU layers (rates 0.2, 0.2 and 0.3)
    before the single sigmoid output unit.

    Returns:
        The compiled Keras ``Model``, using binary cross-entropy loss, an Adam
        optimiser with default settings and the accuracy metric.
    """
    inputs = Input(shape=(x_dev.shape[1:]), name='InputLayer')
    ### CHANGE PARAMETERS HERE ###
    hidden = Dropout(0.1, name='InputLayerDropout')(inputs)
    hidden = Dense(16, activation='relu', name='Layer010Dense')(hidden)
    hidden = Dropout(0.2, name='Layer010Dropout')(hidden)
    hidden = Dense(16, activation='relu', name='Layer020Dense')(hidden)
    hidden = Dropout(0.2, name='Layer020Dropout')(hidden)
    hidden = Dense(16, activation='relu', name='Layer030Dense')(hidden)
    hidden = Dropout(0.3, name='Layer030Dropout')(hidden)
    ### END OF CHANGE PARAMETERS ###
    outputs = Dense(1, activation='sigmoid', name='OutputLayer')(hidden)

    # Wire the layers into a model and compile it
    network = Model(inputs=inputs, outputs=outputs)
    network.compile(
        loss='binary_crossentropy',
        optimizer=keras.optimizers.Adam(),
        metrics=['accuracy'],
    )
    return network