# K-Nearest Neighbour Classifier

In [None]:
import os
from pathlib import Path
from datetime import datetime
from dateutil.tz import gettz
import csv
import numpy as np
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier, DistanceMetric
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.metrics import confusion_matrix, roc_curve, roc_auc_score, classification_report
import matplotlib.pyplot as plt
import seaborn as sns

# Seed NumPy's global random number generator.
np.random.seed(455)
# Seed passed as random_state to RepeatedStratifiedKFold in the k-fold cell.
k_fold_seed = 765432

## User inputs

In [None]:
# List of dataset directory names; the cells below only read flist[0].
# Other names noted by the author: 'private_dog0_correct_plus',
# 'private_dog2_correct', WormsTwoClass, Lightning2, Earthquakes, GunPoint.
flist = ['private_balanced']

n_neighbors = 1  # Passed to KNeighborsClassifier as n_neighbors.

k = 3  # For k-fold cross validation. If k=1, the original test-train split is used.
m = 4  # Number of repetitions of k-fold cross validation (if k>1).

# Input directory, chosen from the name of the first dataset.
if flist[0] == 'private_dog0':
    fdir = '../data/private_data/private_events_dev'
elif 'private' in flist[0]:
    fdir = '../data/private_data/private_events_dev2'
else:
    fdir = '../data'

# Output directory: a sub-directory of ../logs named after the current
# Europe/London time, to the minute.
timestamp = datetime.now(gettz("Europe/London")).strftime('%Y-%m-%dT%H:%M')
logs_dir = '/'.join(['../logs', timestamp])

# The extra end test is only run for private "correct_plus" datasets.
do_end_test = 'private' in flist[0] and 'correct_plus' in flist[0]
    
def readucr(filename):
    """Load a dataset from a text file in UCR format.

    The file is whitespace delimited with one sample per row. The class
    label is in the first column and the remaining columns hold the
    sample's values.

    Parameters
    ----------
    filename : str or path-like
        Path of the file to read.

    Returns
    -------
    X : numpy.ndarray, shape (n_samples, n_values)
        Input data (every column after the first).
    Y : numpy.ndarray, shape (n_samples,)
        Class labels (the first column), as floats.
    """
    # ndmin=2 keeps a file that holds a single sample as one row. Without it
    # np.loadtxt returns a 1-D array and the column slicing below fails.
    data = np.loadtxt(Path(filename), ndmin=2)
    Y = data[:, 0]
    X = data[:, 1:]
    return X, Y

## Load data

In [None]:
# Read the TRAIN and TEST files of the first dataset in flist.
fname = flist[0]
x_train, y_train = readucr(fdir+'/'+fname+'/'+fname+'_TRAIN.txt')
x_test, y_test = readucr(fdir+'/'+fname+'/'+fname+'_TEST.txt')

nb_classes = 2
# Map the raw class labels onto 0..nb_classes-1. The label range is taken from
# the training set and applied to both splits, so a given raw label always maps
# to the same class even when the test split does not contain every class.
label_min, label_max = y_train.min(), y_train.max()
if label_max == label_min:
    raise ValueError('Training labels contain a single class; cannot rescale labels.')
y_train = (y_train - label_min)/(label_max - label_min)*(nb_classes-1)
y_test = (y_test - label_min)/(label_max - label_min)*(nb_classes-1)

# Standardise with the mean and standard deviation of the whole training
# array (a single scalar each, not per column); the test set reuses the
# training statistics.
x_train_mean = x_train.mean()
x_train_std = x_train.std()
x_train = (x_train - x_train_mean)/(x_train_std)
x_test = (x_test - x_train_mean)/(x_train_std)

print('Number of training samples of class 0', (y_train == 0).sum())
print('Number of training samples of class 1', (y_train == 1).sum())
print('Number of test samples of class 0', (y_test == 0).sum())
print('Number of test samples of class 1', (y_test == 1).sum())

## Fit classifier (single train and test)

In [None]:
# Nearest-neighbour classifier on the original train split, using Euclidean
# distance and the n_neighbors value set in the user inputs.
neigh = KNeighborsClassifier(n_neighbors=n_neighbors, metric='euclidean') # scikit-learn's default metric is 'minkowski'
neigh.fit(x_train, y_train) 

In [None]:
# Evaluate the fitted classifier on the test split.
y_pred = neigh.predict(x_test)
# Rows are true labels and columns are predicted labels, both in the order
# [1, 0], so the diagonal holds the correctly classified samples.
cm = confusion_matrix(y_test, y_pred, labels=[1,0])
acc_calc = np.trace(cm)/cm.sum()
# Divide each row by its total so that every row sums to 1.
cm_norm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
# Show the same leading slice of predictions and true labels, both as
# integers, so the two printed rows can be compared element by element.
n_show = 20
print('Pred', y_pred[:n_show].astype(int))
print('True', y_test[:n_show].astype(int))
print(cm)
print('Calculated accuracy:',acc_calc)
print('Normalised confusion matrix:\n', cm_norm)

## Test on other dataset

In [None]:
# Evaluate the fitted classifier on the separate END_TEST file of the dataset
# and compare it with the 'dog_pred' column of the accompanying metadata file.
if do_end_test:
    other = fname+'_END_TEST' #_dog_incorrect' # 'private_dog0_correct_plus_END_TEST'
    datadir = fdir+'/'+fname
    print('Testing on:', datadir+'/'+other+'.txt')
    x_other, y_other = readucr(datadir+'/'+other+'.txt')
    # The classifier was fitted on standardised data, so apply the same
    # training mean and standard deviation here, as is done for x_test.
    x_other = (x_other - x_train_mean)/(x_train_std)
    # NOTE(review): y_other is used as read from file; this assumes its labels
    # are already 0/1 like the rescaled training labels - confirm.
    y_other_pred = neigh.predict(x_other)

    # Results
    # Rows are true labels and columns are predicted labels, ordered [1, 0].
    cm = confusion_matrix(y_other, y_other_pred, labels=[1,0])
    acc_calc = (cm[0][0]+cm[1][1])/(cm.sum())
    cm_norm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
    print('KNN cm\n', cm)
    print('KNN cm_norm\n', cm_norm)
    print('KNN acc', acc_calc)

    # Get dog result
    meta = pd.read_csv(datadir+'/'+other+'_meta.txt', sep=',', parse_dates=['date'])
    cm = confusion_matrix(y_other, meta['dog_pred'], labels=[1,0])
    dog_acc = (cm[0][0]+cm[1][1])/(cm.sum())
    cm_norm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
    print('Dog cm\n', cm)
    print('Dog cm_norm\n', cm_norm)
    print('Dog acc', dog_acc)

# k-fold cross validation

In [None]:
# k-fold cross validation setup
# Both lists are created up front so that the save step below and the plotting
# cells work whichever branch runs.
scores = list()
other_scores = list() # accuracy on the other dataset, the realistic dataset
if k > 1:
    # Pool the original train and test splits and re-split them k ways,
    # repeated m times.
    x_all = np.concatenate((x_train, x_test), axis=0)
    y_all = np.concatenate((y_train, y_test), axis=0)
    kfold = RepeatedStratifiedKFold(n_splits=k, n_repeats=m, random_state=k_fold_seed)
    for train, test in kfold.split(x_all, y_all):
        # Fold-local names: the original x_train/x_test split and the
        # classifier fitted on it are left untouched, so earlier cells and
        # this cell give the same results when re-run.
        x_fold_train, y_fold_train = x_all[train], y_all[train]
        x_fold_test, y_fold_test = x_all[test], y_all[test]
        # Same metric as the single train/test cell ('euclidean' is equivalent
        # to scikit-learn's default minkowski metric with p=2).
        fold_neigh = KNeighborsClassifier(n_neighbors=n_neighbors, metric='euclidean')
        fold_neigh.fit(x_fold_train, y_fold_train)
        scores.append(fold_neigh.score(x_fold_test, y_fold_test))
        if do_end_test:
            other_scores.append(fold_neigh.score(x_other, y_other))
    print(scores)
    print('Estimated Accuracy and sample std dev:')
    print(np.mean(scores))
    print(np.std(scores, ddof=1))

    if do_end_test:
        print(other_scores)
        print('Estimated Accuracy and sample std dev on realistic dataset:')
        print(np.mean(other_scores))
        print(np.std(other_scores, ddof=1))
else:
    # Single evaluation on the original train/test split.
    neigh = KNeighborsClassifier(n_neighbors=n_neighbors, metric='euclidean')
    neigh.fit(x_train, y_train)
    scores.append(neigh.score(x_test, y_test))
    print('Accuracy', scores[0])
    if do_end_test:
        other_scores.append(neigh.score(x_other, y_other))
        print('Accuracy on realistic dataset', other_scores[0])

# Save the result to file
Path(logs_dir+'/'+fname).mkdir(parents=True, exist_ok=True)
# newline='' lets the csv module control line endings itself.
with open(logs_dir+'/'+fname+'/nearestneighbours_summary.csv', 'w', newline='') as f:
    w = csv.writer(f, dialect='excel')
    # Without an end test there are no other_scores; pad the second column
    # with empty values so the validation scores are still written.
    other_column = other_scores if other_scores else [''] * len(scores)
    for s, o in zip(scores, other_column):
        w.writerow([s, o])
    print('Added scores to ', f.name)

In [None]:
# Collect the cross-validation scores in a single-column DataFrame.
data = pd.DataFrame(scores, columns=['val_acc'])
# Box plot with the whiskers drawn at the 2.5th and 97.5th percentiles.
data.boxplot(whis=[2.5,97.5])

In [None]:
# Seaborn box plot of the validation accuracies, with a swarm plot of the
# individual scores drawn on top in black.
sns.set(style="whitegrid")
ax = sns.boxplot(data=data)
ax = sns.swarmplot(data=data, color='black')

## Compare classifiers

In [None]:
# Hard-coded results file from an earlier run.
# NOTE(review): presumably the summary written by the dev_net classifier - confirm.
file1 = '../logs/2019-03-17T12:59/private_dog0_correct/devnet_summary.csv'
# The file is read without a header row; the column names are supplied here.
data1 = pd.read_csv(file1, header=None, names=['run','loss','val_acc','epoch','time'])
name1 = 'dog0_correct'  # not used in the code visible here

# Box plots of the two sets of validation accuracies, with the individual
# scores overlaid as black points.
all_data = [data1['val_acc'], data['val_acc']]
sns.set(style="whitegrid")
ax = sns.boxplot(data=all_data)
ax = sns.swarmplot(data=all_data, color='black')
# Label the two boxes in the order of all_data.
plt.xticks([0, 1], ['dev_net', 'kNN'])