# k-Nearest Neighbour on Time-Series data
Authors: Vivek Mahato & Pádraig Cunningham  
You will need to install `tslearn`  
`conda install -c conda-forge tslearn`   
As of 28/01/2021 tslearn requires Python 3.8   
`conda install python=3.8`


## Contents

1. Read IR_data.
2. Visualize raster scans.
3. Smooth/Filter the data.
4. Hyper-parameter tuning (Model Selection). 

In [None]:
import numpy as np
import random
import pandas as pd
from collections import Counter
import matplotlib.pyplot as plt
%matplotlib inline

## Reading Data

We will use the Pandas package to read `IR_data.csv` and represent it in a DataFrame (table) format.  
For more info on DataFrame and its functions visit: https://bit.ly/2RKLtd0


## Extract the class labels into a different variable

In [None]:
# Load the table; the first CSV column becomes the row index and the first
# line supplies the column names.
X = pd.read_csv("IR_data.csv", header=0, index_col=0)

# Keep the labels in their own array, then remove them from the feature table
# so that X holds only the time-series samples.
y = X["class"].values
X.drop(columns=["class"], inplace=True)

# Preview the first five rows.
X.head(5)

## Filtering The Data

We will use a low-pass Butterworth filter to remove possible noise from the IR data and smooth it.  
To learn how a Butterworth filter works, visit: https://bit.ly/3kBv0Uy

In [None]:
from scipy.signal import butter, freqz, lfilter

def butter_lowpass(cutoff, fs, order=5):
    """Design a digital low-pass Butterworth filter.

    Parameters
    ----------
    cutoff : float
        Cutoff frequency, in the same units as ``fs``.
    fs : float
        Sampling frequency of the signal.
    order : int, optional
        Order of the filter (default 5).

    Returns
    -------
    tuple
        The ``(b, a)`` numerator and denominator coefficients produced by
        ``scipy.signal.butter``.
    """
    # ``butter`` expects the cutoff as a fraction of the Nyquist frequency.
    nyquist = fs * 0.5
    return butter(order, cutoff / nyquist, btype='low', analog=False)

def butter_lowpass_filter(data, cutoff, fs, order=5):
    """Apply a low-pass Butterworth filter to ``data``.

    Parameters
    ----------
    data : array_like
        Signal to filter; it is handed straight to ``scipy.signal.lfilter``.
    cutoff : float
        Cutoff frequency, in the same units as ``fs``.
    fs : float
        Sampling frequency of the signal.
    order : int, optional
        Order of the filter (default 5).

    Returns
    -------
    The filtered signal returned by ``lfilter``.
    """
    numerator, denominator = butter_lowpass(cutoff, fs, order=order)
    return lfilter(numerator, denominator, data)

## Filtering the entire dataset 

In [None]:
# Settings passed to butter_lowpass_filter when smoothing the dataset.
fs = 100.0   # sampling frequency; the cutoff is normalised against fs / 2
cutoff = 10  # low-pass cutoff, in the same units as fs
order = 4    # order of the Butterworth filter

In [None]:
# Filter every row of X in a single vectorised call.  `lfilter` works along
# the last axis by default, so passing the 2-D array filters each row
# independently -- the same result as looping over the rows, but:
#   * the number of samples per row is taken from the data instead of being
#     hard-coded to 300,
#   * the result is a numeric frame rather than one grown row by row from an
#     empty frame,
#   * rows that share an index label are all kept rather than overwritten.
# The original row index is preserved; columns are numbered 0..n_samples-1.
X_fltr = pd.DataFrame(
    butter_lowpass_filter(X.values, cutoff=cutoff, fs=fs, order=order),
    index=X.index,
)

In [None]:
# Preview the first five filtered rows.
X_fltr.head(n=5)

# k-NN Classification

We will be using k-NN Time-Series Classifier (https://bit.ly/3kyYQcx) from the tslearn package. 

We shall use two versions of this classifier: 
<ol>
<li>Using Euclidean as our distance metric</li>
<li>Using Dynamic Time Warping (DTW) as the distance measure</li>
</ol> 

In [None]:
from tslearn.neighbors import KNeighborsTimeSeriesClassifier
from sklearn.metrics import roc_curve, auc, accuracy_score
from sklearn.model_selection import train_test_split, GridSearchCV

## Hold-Out Testing

In [None]:
# Hold out 33% of the filtered rows for testing; the fixed random_state makes
# the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X_fltr.values,
    y,
    test_size=0.33,
    random_state=42,
)

### Euclidean

In [None]:
# 5-NN with Euclidean distance: fit on the training split, then label the
# held-out rows.
EkNN = KNeighborsTimeSeriesClassifier(
    n_neighbors=5,
    metric='euclidean',
    n_jobs=-1,
).fit(X_train, y_train)
Ey_hat = EkNN.predict(X_test)

#### Calculate accuracy of the model

In [None]:
# Hold-out accuracy of the Euclidean model, expressed as a percentage.
euc_accuracy = 100 * accuracy_score(y_test, Ey_hat)
print("Model Accuracy = {:.2f}%".format(euc_accuracy))

### DTW

In [None]:
# DTW restricted to a Sakoe-Chiba band of radius 5.
metric_params = {
    'global_constraint': 'sakoe_chiba',
    'sakoe_chiba_radius': 5,
}

# 5-NN with DTW distance: fit on the training split, then label the held-out
# rows.
DkNN = KNeighborsTimeSeriesClassifier(
    n_neighbors=5,
    metric='dtw',
    metric_params=metric_params,
    n_jobs=-1,
).fit(X_train, y_train)
Dy_hat = DkNN.predict(X_test)

#### Calculate the accuracy of the model

In [None]:
# Hold-out accuracy of the DTW model, expressed as a percentage.
dtw_accuracy = 100 * accuracy_score(y_test, Dy_hat)
print("Model Accuracy = {:.2f}%".format(dtw_accuracy))

### Compare the two models

## ROC
`roc_curve` produces the values (false and true positive rates) needed to draw the ROC curves.  
`auc` calculates the AUC score for that ROC curve. 

In [None]:
# Class-membership scores for the held-out rows from the Euclidean model.
Ey_score = EkNN.fit(X_train, y_train).predict_proba(X_test)

# ROC points use the second score column.
# NOTE(review): this treats column 1 as the positive class -- confirm the
# labels are binary.
fprE, tprE, t = roc_curve(y_test, Ey_score[:, 1])
roc_aucE = auc(fprE, tprE)

In [None]:
# Class-membership scores for the held-out rows from the DTW model.
Dy_score = DkNN.fit(X_train, y_train).predict_proba(X_test)

# ROC points use the second score column.
# NOTE(review): this treats column 1 as the positive class -- confirm the
# labels are binary.
fprD, tprD, t = roc_curve(y_test, Dy_score[:, 1])
roc_aucD = auc(fprD, tprD)

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline
# Draw both ROC curves on one figure, with the AUC shown in the legend.
plt.figure()
lw = 2

plt.plot(fprE, tprE, color='red', lw=lw,
         label='ROC Euc (area = %0.2f)' % roc_aucE)
plt.plot(fprD, tprD, color='green', lw=lw,
         label='ROC DTW (area = %0.2f)' % roc_aucD)

# Diagonal reference line from (0, 0) to (1, 1).
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')

plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.0])
plt.title('ROC Analysis for the IR data')
plt.legend(loc="lower right")
plt.show()

In [None]:
# Bar chart of the two hold-out accuracies.
fig = plt.figure(figsize=(6, 5))

model = ['Euclidean', 'DTW']
accuracies = [euc_accuracy, dtw_accuracy]
plt.bar(model, accuracies)

plt.ylabel("Accuracy (%)")
plt.title("Model Comparison")
plt.show()

# Tuning parameters of the models

We will use `GridSearchCV` to perform grid search on the parameter options.  
For Euclidean distance we consider 10 options for `k` and two weighting options.  
So the grid has 10 x 2 options.

## Euclidean
Defining the parameter grid. 

In [None]:
# Search space for the Euclidean model: the odd values of k from 1 to 19,
# crossed with both neighbour-weighting schemes (10 x 2 combinations).
param_grid = dict(
    n_neighbors=np.arange(1, 20, 2),
    weights=["uniform", "distance"],
)
param_grid

In [None]:
# 5-fold cross-validated grid search over `param_grid` on the training split,
# scored by accuracy.
model = KNeighborsTimeSeriesClassifier(metric='euclidean', n_jobs=-1)
gs_model = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring="accuracy",
    cv=5,
    n_jobs=-1,
)
gs_model.fit(X_train, y_train)  # The fitted search object can itself be used as a model.

# Report the winning parameter combination and its score.
print("Best parameters: ", gs_model.best_params_)
print("Accuracy: ", gs_model.best_score_)

### Held-out test data
Providing the model with the best parameters, and evaluating it on the held-out test data.

In [None]:
# Tuned Euclidean model (k = 15, distance-weighted votes): fit on the training
# split, then label the held-out rows.
ETkNN = KNeighborsTimeSeriesClassifier(
    n_neighbors=15,
    metric='euclidean',
    weights="distance",
    n_jobs=-1,
).fit(X_train, y_train)
y_hat = ETkNN.predict(X_test)

#### Accuracy of the model

In [None]:
# Hold-out accuracy of the tuned Euclidean model, expressed as a percentage.
euc_accuracy_tuned = 100 * accuracy_score(y_test, y_hat)
print("Model Accuracy = {:.2f}%".format(euc_accuracy_tuned))

## DTW 
The DTW grid is more complicated because DTW has a parameter that has two components,  
the warping constraint and then the radius for that constraint. 

In [None]:
# One DTW configuration per candidate Sakoe-Chiba radius (1, 3, 5, 7, 9).
# Alternative wider range: np.arange(1, 25, 2).
m_params = [
    {
        "global_constraint": "sakoe_chiba",
        "sakoe_chiba_radius": radius,
    }
    for radius in np.arange(1, 10, 2)
]

# Search space for the DTW model.
# Alternative wider range for k: np.arange(1, 20, 2).
param_grid = dict(
    n_neighbors=[1, 3, 7],
    weights=["uniform", "distance"],
    metric_params=m_params,
)
param_grid

In [None]:
# 5-fold cross-validated grid search over the DTW `param_grid` on the training
# split, scored by accuracy.
model = KNeighborsTimeSeriesClassifier(metric='dtw', n_jobs=-1)
gs_model = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring="accuracy",
    cv=5,
    n_jobs=-1,
)
gs_model.fit(X_train, y_train)

# Report the winning parameter combination and its score.
print("Best parameters: ", gs_model.best_params_)
print("Accuracy: ", gs_model.best_score_)

In [None]:
# Build the tuned DTW model from the configuration the DTW grid search
# actually selected, rather than from hand-copied values.  The previous
# hard-coded radius (21) was not one of the searched radii (1, 3, 5, 7, 9),
# so the "tuned" model did not correspond to the search that had just run.
#
# The explicit key look-ups fail fast with a KeyError if `gs_model` is not
# the DTW search (for example if cells were run out of order and it still
# holds the Euclidean search, which has no "metric_params" entry).
best_dtw_params = gs_model.best_params_
metric_params = best_dtw_params["metric_params"]

DTkNN = KNeighborsTimeSeriesClassifier(n_neighbors=best_dtw_params["n_neighbors"],
                                       weights=best_dtw_params["weights"],
                                       metric='dtw',
                                       metric_params=metric_params,
                                       n_jobs=-1)

# Fit on the training split, then label the held-out rows.
DTkNN = DTkNN.fit(X_train, y_train)
y_hat = DTkNN.predict(X_test)

#### Accuracy of the model

In [None]:
# Hold-out accuracy of the tuned DTW model, expressed as a percentage.
dtw_accuracy_tuned = 100 * accuracy_score(y_test, y_hat)
print("Model Accuracy = {:.2f}%".format(dtw_accuracy_tuned))

## Compare the models

In [None]:
# Accuracies to compare: untuned vs tuned, for each distance measure.
model_acc = [euc_accuracy, dtw_accuracy]
model_tuned_acc = [euc_accuracy_tuned, dtw_accuracy_tuned]
rows = ["Euclidean", "DTW"]

barWidth = 0.25

# x positions: r1 for the untuned bars, r2 for the tuned bars shifted right by
# barWidth, and y_pos midway between each pair for the tick labels.
r1 = np.arange(len(model_acc))
r2 = list(r1 + barWidth)
y_pos = [(left + right) / 2 for left, right in zip(r1, r2)]

fig, ax = plt.subplots(figsize=(7, 5))
plt.xticks(y_pos, rows)

ax.bar(r1, model_acc, width=0.2, color='skyblue', label='Vanilla Version')
ax.bar(r2, model_tuned_acc, width=0.2, color='steelblue', label='Tuned Version')

ax.set_ylabel('Accuracy (%)')
ax.set_ylim(0, 100)
plt.title("Model Comparison")
ax.legend(loc="upper left", ncol=2, fontsize=12)

plt.show()

## ROC

In [None]:
# Untuned Euclidean model: scores for the held-out rows, then ROC points and
# AUC from the second score column.
# NOTE(review): this treats column 1 as the positive class -- confirm the
# labels are binary.
Ey_score = EkNN.fit(X_train, y_train).predict_proba(X_test)
fprE, tprE, t = roc_curve(y_test, Ey_score[:, 1])
roc_aucE = auc(fprE, tprE)

In [None]:
# Tuned Euclidean model: scores for the held-out rows, then ROC points and AUC
# from the second score column.
# NOTE(review): this treats column 1 as the positive class -- confirm the
# labels are binary.
ETy_score = ETkNN.fit(X_train, y_train).predict_proba(X_test)
fprET, tprET, t = roc_curve(y_test, ETy_score[:, 1])
roc_aucET = auc(fprET, tprET)

In [None]:
# Untuned DTW model: scores for the held-out rows, then ROC points and AUC
# from the second score column.
# NOTE(review): this treats column 1 as the positive class -- confirm the
# labels are binary.
Dy_score = DkNN.fit(X_train, y_train).predict_proba(X_test)
fprD, tprD, t = roc_curve(y_test, Dy_score[:, 1])
roc_aucD = auc(fprD, tprD)

In [None]:
# Tuned DTW model: scores for the held-out rows, then ROC points and AUC from
# the second score column.
# NOTE(review): this treats column 1 as the positive class -- confirm the
# labels are binary.
DTy_score = DTkNN.fit(X_train, y_train).predict_proba(X_test)
fprDT, tprDT, t = roc_curve(y_test, DTy_score[:, 1])
roc_aucDT = auc(fprDT, tprDT)

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline
# ROC curves for all four models: red = Euclidean, green = DTW;
# dashed = untuned, solid = tuned.  The AUC is shown in the legend.
plt.figure()
lw = 2

plt.plot(fprE, tprE, color='red', lw=lw, linestyle='--',
         label='ROC Euc (area = %0.2f)' % roc_aucE)
plt.plot(fprET, tprET, color='red', lw=lw,
         label='ROC Euc Tuned(area = %0.2f)' % roc_aucET)

plt.plot(fprD, tprD, color='green', lw=lw, linestyle='--',
         label='ROC DTW (area = %0.2f)' % roc_aucD)
plt.plot(fprDT, tprDT, color='green', lw=lw,
         label='ROC DTW Tuned (area = %0.2f)' % roc_aucDT)

# Diagonal reference line from (0, 0) to (1, 1).
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')

plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.0])
plt.title('ROC Analysis for IR data')
plt.legend(loc="lower right")
plt.show()