In [1]:
%matplotlib inline  
import numpy as np
import pandas as pd
from collections import Counter
from datetime import datetime
import matplotlib.pyplot as plt
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.utils import np_utils
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import class_weight
from keras import metrics
from sklearn.preprocessing import OneHotEncoder
from keras.utils import to_categorical
# from imblearn.keras import BalancedBatchGenerator
# from imblearn.under_sampling import NearMiss
from sklearn.metrics import confusion_matrix
#from sklearn.metrics import  multilabel_confusion_matrix

from numpy.random import seed
from tensorflow import set_random_seed
set_random_seed(2)
seed(1)

Using TensorFlow backend.


In [2]:
train_data = pd.read_csv("data/train_data.csv")
test_data = pd.read_csv("data/test_data.csv")
train_labels = pd.read_csv("data/train_labels.csv",header=None).values.reshape(-1)
test_labels = pd.read_csv("data/test_labels.csv",header=None).values.reshape(-1)

In [3]:
# helper fucntion to convert labels into OHE vectors
def convert_to_cat(labels):
    enc = LabelEncoder()
    enc_y = enc.fit_transform(labels)
    categorical_y = np_utils.to_categorical(enc_y)
    print(categorical_y.shape,categorical_y[:2])
    return categorical_y

In [4]:
# Separate categorical fields from numeric fields

cat_cols = [k for k in list(train_data.columns) if train_data[k].dtype != "float64"]
num_cols = list(set(list(train_data.columns))-set(cat_cols))

# Construct disjoint datasets for cateorical and numeric types
categorical_data = train_data[cat_cols].copy()
numeric_data = train_data[num_cols].copy()
print(categorical_data.shape,numeric_data.shape)

categorical_test_data = test_data[cat_cols].copy()
numeric_test_data = test_data[num_cols].copy()
print(categorical_test_data.shape,numeric_test_data.shape)


# Have a copy of the merged, complete dataset. We will use both these datasets in further steps 
train_data = train_data.values
test_data = test_data.values

    

(65809, 132) (65809, 23)
(43201, 132) (43201, 23)


In [5]:
# If need be, feed the reduced dimensional data from PCA to the LSTM
# Just another experiment that was tried, tested and failed
from sklearn.decomposition import PCA
total_X = np.concatenate([train_data,test_data])

print("Original Shape ",total_X.shape)
pca = PCA(n_components=2,svd_solver="full")
total_X = pca.fit_transform(total_X)
print("Reduced ",total_X.shape)

train_x = total_X[:train_data.shape[0]]
test_x = total_X[train_data.shape[0]:]
print(train_x.shape,test_x.shape)

Original Shape  (109010, 155)
Reduced  (109010, 2)
(65809, 2) (43201, 2)


In [6]:
# Remove a specific numeric columns from our training pipeline.
# Here we use the disjoint datasets we created two cells ago.

drop_col = ["MODSTS.552051"] #,"VI552051.748"]

for col in drop_col:
    numeric_data.drop([col],inplace=True,axis=1)
    numeric_test_data.drop([col],inplace=True,axis=1)
    print(numeric_data.shape,categorical_data.shape)
    train_data = pd.concat([numeric_data,categorical_data],axis=1).values
    print(train_data.shape)
    
    print(numeric_test_data.shape,categorical_test_data.shape)
    test_data = pd.concat([numeric_test_data,categorical_test_data],axis=1).values
    print(test_data.shape)


(65809, 22) (65809, 132)
(65809, 154)
(43201, 22) (43201, 132)
(43201, 154)


In [7]:
from keras.optimizers import SGD, Adadelta, Adam
from keras.losses import mean_squared_error,mean_absolute_error, logcosh, categorical_crossentropy, sparse_categorical_crossentropy
from keras.initializers import RandomNormal, RandomUniform, glorot_normal
from keras.callbacks import TerminateOnNaN, LearningRateScheduler, EarlyStopping


# Parameters for Early stopping
cb1 = EarlyStopping(monitor='val_loss', min_delta=0.001, patience=3, verbose=0, mode='auto', restore_best_weights=False)
cb2 = TerminateOnNaN()
cb = [cb1,cb2]

# Setting Hyperparameters -  Though custom packages like AutoKeras perform search space construction,
# they dont specifically address time-series data requirements. We proceed with our own setting

more_optim = []
adam_beta = [[0.9,0.999],[0.8,0.888],[0.7,0.777]]
lr = [0.1,0.01,0.001]
for l in lr:
    more_optim.append(SGD(lr=l))
    more_optim.append(Adadelta(lr=l))
    for b in adam_beta:
        more_optim.append(Adam(lr=l,beta_1=b[0],beta_2=b[1]))

# A conveinient overwrite option if the above setting seems highly resource intensive
optim = ["sgd","adadelta","adam"]
loss = ["categorical_crossentropy"] #,"sparse_categorical_crossentropy"]
init = ['glorot_uniform','random_uniform','random_normal']

In [82]:

# Constructing the Vanilla LSTM here

n_steps = 6
n_features = train_data.shape[1]
model = Sequential()

# Embeddings are not useful in this setting
#model.add(Embedding(124,32,input_length=155))

#model.add(LSTM(units=124,activation='relu',return_sequences=True))
#model.add(LSTM(units=64, activation='relu',return_sequences=True))
model.add(LSTM(units=16,activation='relu',input_shape=(n_steps,n_features)))
model.add(Dense(3, activation='softmax'))



In [8]:
# Crucial to training this function convers sequential data 
# to "batch data with time_steps" to feed in to LSTM 

def process_batch(data,lbls,time_steps=3):
    dt = []
    labels = []
    i = time_steps
    while i < len(data):
        dt.append(data[i-time_steps:i])
        labels.append(lbls[i])
        i +=1  
    return np.asarray(dt),np.asarray(labels)


In [84]:
# Hardcoding here with the best parameters found so far

batch_size = 32
n_loss = "categorical_crossentropy"
opt = Adam(lr=0.001, beta_1=0.9, beta_2=0.999, amsgrad=False)
if n_loss == "categorical_crossentropy":
    categorical_y = convert_to_cat(train_labels)
    train_in,train_lbl = process_batch(train_data,categorical_y,n_steps)
else:
    train_in,train_lbl = process_batch(train_data,train_labels,n_steps)

print(train_in.shape,train_lbl.shape)
n_samples = len(train_in)
d_sample_weight = class_weight.compute_sample_weight('balanced',train_lbl)
print(d_sample_weight)


(65809, 3) [[1. 0. 0.]
 [1. 0. 0.]]
(65803, 6, 154) (65803, 3)
[0.12628954 0.12628954 0.12628954 ... 0.12628954 0.12628954 0.12628954]


In [85]:

# Let us run the model on the best parameters that I tuned.
model.compile(loss=n_loss, optimizer=opt,metrics=["accuracy","categorical_accuracy"])
history = model.fit(train_in,train_lbl,epochs=5,validation_split=0.2,sample_weight=d_sample_weight,batch_size=batch_size,callbacks=cb)


# If you wish to try out the hyperparameter setting we set with "adam","sgd" etc, uncomment this section 
# and run it. Spolier Alert : Highly resource intensive

# for opt in more_optim:
#     print(" OPT ",t)
#     for ls in loss:
#         print(opt,ls,"------------------------------")
#         model.compile(loss=ls, optimizer=opt,metrics=["accuracy","categorical_accuracy"])
#         history = model.fit(train_in,train_lbl,epochs=20,validation_split=0.2,sample_weight=d_sample_weight,batch_size=batch_size,callbacks=cb)


Train on 52642 samples, validate on 13161 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5


In [23]:
from keras.models import load_model

model.save('complete_model.h5') 
del model  

categorical_y = convert_to_cat(test_labels)
test_in,test_lbl = process_batch(test_data,categorical_y,time_steps)

results = model.evaluate(test_in, test_lbl)
predictions = model.predict(test_in)
print("Accuracy: %.2f%%" % (results[1]*100))


(43201, 3) [[1. 0. 0.]
 [1. 0. 0.]]
Accuracy: 99.34%


In [24]:
y_pred = predictions.argmax(1)
print(predictions[:2])
print(test_lbl[:2])

[[1. 0. 0.]
 [1. 0. 0.]]
[[1. 0. 0.]
 [1. 0. 0.]]


In [28]:
lbl = test_lbl.argmax(1)
print(confusion_matrix(lbl,y_pred))


[[42907    52     0]
 [   43     0     0]
 [  189     0     0]]
