In [3]:
# Numerical and tabular-data libraries used by the cells below.
import numpy as np
import pandas as pd
# NOTE(review): this binds the top-level `matplotlib` package (not
# `matplotlib.pyplot`) to the name `plt` -- confirm that this is intended.
import matplotlib as plt

# Standard library: wall-clock timing and random sampling.
import time
import random
# Seed passed to np.random.seed() before every train/validation/test split so
# that the partitions are reproducible.
RANDOM_SEED = 42



In [7]:
import sys
# Invoke pip through the interpreter that runs this kernel (sys.executable) so
# the packages land in the environment the notebook actually uses.
!{sys.executable} -m pip install --upgrade pip
# python-mnist supplies the `mnist` module that the FMNIST cell imports.
!{sys.executable} -m pip install python-mnist



Requirement already up-to-date: pip in /usr/local/lib/python3.6/dist-packages (18.0)


In [19]:
#KNN

def euclideanDistance (x,y):
    """Euclidean (L2) distance between ``x`` and ``y``.

    The feature dimension is the last axis of both arguments and the usual
    NumPy broadcasting rules apply, so the function accepts

    * two vectors                     -> scalar distance,
    * a matrix and a vector (either argument order)
                                      -> one distance per matrix row,
    * two matrices of the same shape  -> row-wise distances.

    Parameters
    ----------
    x, y : array-like
        Points to compare; anything ``np.asarray`` can turn into numbers.

    Returns
    -------
    numpy.float64 or numpy.ndarray
        Distance(s) with the feature axis reduced away.
    """
    # Work in floating point: unsigned integer inputs (e.g. uint8 pixel data)
    # would otherwise wrap around in the subtraction and in the square.
    x = np.asarray(x, dtype=float)
    y = np.asarray(y, dtype=float)
    # Reduce over the trailing (feature) axis so that the result is per-row no
    # matter which of the two arguments carries the batch dimension.
    return np.sqrt(np.sum(np.square(x - y), axis=-1))

'''
def getKNeighbours(testX, X, K, distanceMetric):
    dists = distanceMetric(X, testX);
    ind = np.argpartition(dists, K)[0:K]
    return ind
'''

def getKNeighbours(testX, X, K, distanceMetric):
    """Indices of the ``K`` rows of ``X`` closest to every row of ``testX``.

    Parameters
    ----------
    testX : array-like, shape (n_test, n_features)
        Query points, one per row.
    X : array-like, shape (n_train, n_features)
        Reference points that are searched.
    K : int
        Number of neighbours to return; must satisfy ``1 <= K <= n_train``.
    distanceMetric : callable
        Called as ``distanceMetric(X, testx)`` with the whole reference matrix
        and a single query row; must return one distance per row of ``X``.

    Returns
    -------
    numpy.ndarray of int, shape (n_test, K)
        Row ``i`` holds the indices (into ``X``) of the ``K`` nearest
        neighbours of ``testX[i]``, in no particular order.

    Raises
    ------
    ValueError
        If ``K`` is outside ``1 .. n_train``.
    """
    X = np.asarray(X)
    testX = np.asarray(testX)
    n_train = X.shape[0]
    if not 1 <= K <= n_train:
        raise ValueError(
            "K must be between 1 and the number of training points "
            "(%d), got %r" % (n_train, K))

    # Partitioning around position K-1 puts the K smallest distances in the
    # first K slots and, unlike position K, stays valid when K == n_train.
    kth = K - 1
    # Handle one query row at a time so only a single distance vector is alive
    # at once instead of an (n_test, n_train) matrix.
    neighbours = [np.argpartition(distanceMetric(X, testx), kth)[:K]
                  for testx in testX]
    # reshape keeps the documented 2-D shape even when testX has no rows.
    return np.array(neighbours, dtype=np.intp).reshape(-1, K)


def knnClassifier(testX,trainX,trainY, K, distanceMetric):
    """Classify every row of ``testX`` by majority vote of its K nearest neighbours.

    Parameters
    ----------
    testX : array-like, shape (n_test, n_features)
        Points to classify.
    trainX : array-like, shape (n_train, n_features)
        Training points.
    trainY : array-like, shape (n_train,)
        Label of every training point. Any sortable label type works
        (negative integers, floats, strings, ...).
    K : int
        Number of neighbours that vote.
    distanceMetric : callable
        Forwarded unchanged to ``getKNeighbours``.

    Returns
    -------
    numpy.ndarray, shape (n_test,)
        Predicted label per test row, with the dtype of ``trainY``. When
        several labels tie for the most votes the smallest one wins.
    """
    trainY = np.asarray(trainY)
    indices = getKNeighbours(testX, trainX, K, distanceMetric)

    predY = []
    for neighbourIdx in indices:
        # np.unique returns the distinct labels sorted ascending together with
        # their counts; argmax takes the first maximum, i.e. the smallest
        # label among ties.
        labels, votes = np.unique(trainY[neighbourIdx], return_counts=True)
        predY.append(labels[np.argmax(votes)])
    # Explicit dtype so that an empty prediction list still yields label dtype.
    return np.array(predY, dtype=trainY.dtype)

             

In [20]:
#Evaluation Metrics

def accuracy(prediction, actual):
    """Fraction of predictions that equal the true labels.

    Parameters
    ----------
    prediction, actual : array-like
        Predicted and true labels; both must have exactly the same shape.

    Returns
    -------
    float
        Share of matching entries in ``[0, 1]``; ``nan`` when there are no
        entries at all.

    Raises
    ------
    ValueError
        If the two inputs differ in shape.
    """
    prediction = np.asarray(prediction)
    actual = np.asarray(actual)
    # Reject mismatched shapes explicitly: shapes such as (n,) and (n, 1)
    # would otherwise broadcast to an (n, n) comparison and give a wrong score.
    if prediction.shape != actual.shape:
        raise ValueError(
            "prediction and actual must have the same shape, got %s and %s"
            % (prediction.shape, actual.shape))
    if prediction.size == 0:
        # Accuracy over zero samples is undefined.
        return float("nan")
    return float(np.mean(prediction == actual))

#TODO: how should recall, precision and F1 score be defined for the multi-class case?
    


In [21]:
#Utility Functions


In [31]:

#FMNIST Data
# Loads the training and test files from the `fashion_data` directory, keeps
# the official test set as-is and splits the training set 4:1 into training
# and validation parts using a seeded permutation.

from mnist import MNIST
mndata = MNIST('fashion_data')
imagesTrain,labelsTrain = mndata.load_training()
imagesTest, labelsTest = mndata.load_testing()


X_test = np.array(imagesTest)
y_test = np.array(labelsTest)

# Convert the training containers to arrays exactly once; both the training
# and the validation subsets are then sliced from the same arrays.
allTrainX = np.array(imagesTrain)
allTrainY = np.array(labelsTrain)

import random
n = len(imagesTrain)
np.random.seed(RANDOM_SEED)
indices = np.random.permutation(n)

# First four fifths of the shuffled indices train, the remainder validates.
splitPoint = (4 * n) // 5
trainingIndex = indices[:splitPoint]
validationIndex = indices[splitPoint:]
assert len(trainingIndex) + len(validationIndex) == n

X_train = allTrainX[trainingIndex]
y_train = allTrainY[trainingIndex]

X_val = allTrainX[validationIndex]
y_val = allTrainY[validationIndex]

# The full arrays are no longer needed once the subsets have been copied out.
del allTrainX, allTrainY


print("Train, Validation, Test")
print(X_train.shape, y_train.shape)
print(X_val.shape, y_val.shape)
print(X_test.shape, y_test.shape)

Train, Validation, Test
(48000, 784) (48000,)
(12000, 784) (12000,)
(10000, 784) (10000,)


In [34]:
# Evaluate KNN on (a sample of) the test set loaded by the FMNIST cell.
# NOTE: X_train / y_train / X_test / y_test must still be the FMNIST arrays
# here; the medical and railway cells further down rebind the same names.
n_test = X_test.shape[0]
# Never ask for more rows than the test set actually contains.
n_samples = min(10000, n_test)
print("n_samples = ", n_samples)
# A private generator seeded with RANDOM_SEED makes the chosen subset
# reproducible without touching the global `random` state.
indices = random.Random(RANDOM_SEED).sample(range(n_test), n_samples)

for K in [25]:
    start = time.time()
    y_pred = knnClassifier(X_test[indices], X_train, y_train, K, euclideanDistance )
    acc = accuracy(y_pred, y_test[indices])
    end = time.time()
    #print("K: ", K)
    print("accuracy: ", acc)
    print("time taken: ", end-start)


n_samples =  10000
accuracy:  0.8323
time taken:  2597.575558900833


In [25]:
##MEDICAL DATA
def prepareMedicalData(scale = 0, path = 'Medical_data.csv', seed = None):
    """Load the medical CSV and split it 4:1:1 into train / validation / test.

    Column 0 of the file must be ``Health`` (one of ``HEALTHY``,
    ``MEDICATION``, ``SURGERY``); every remaining column is used as a numeric
    feature.

    Parameters
    ----------
    scale : int, optional
        ``1`` standardises every feature with the mean and standard deviation
        of the *training* rows; any other value leaves the features untouched.
    path : str, optional
        Location of the CSV file.
    seed : int or None, optional
        Seed for the shuffle; ``None`` uses the notebook-wide ``RANDOM_SEED``.

    Returns
    -------
    tuple
        ``(X_train, y_train, X_val, y_val, X_test, y_test)``.

    Raises
    ------
    ValueError
        If ``Health`` contains a value outside the three known labels.
    """
    medicalData = pd.read_csv(path)

    # Optional exploratory summary, kept disabled:
    # print("GROUPED Mean")
    # print(medicalData[['Health', 'TEST1', 'TEST2', 'TEST3']].groupby('Health').mean())
    # print("GROUPED Standard Deviation")
    # print(medicalData[['Health', 'TEST1', 'TEST2', 'TEST3']].groupby('Health').std())

    # Healthy == 0, Medication == 1, Surgery == 2
    health = medicalData['Health'].map({'HEALTHY': 0, 'MEDICATION': 1, 'SURGERY': 2})
    if health.isna().any():
        # Unknown labels map to NaN; fail with a readable message instead of
        # an opaque integer-cast error.
        unknown = sorted(set(map(str, medicalData['Health'][health.isna()])))
        raise ValueError("unexpected Health value(s): %s" % ", ".join(unknown))
    medicalData['Health'] = health.astype(int)

    values = medicalData.values
    X = values[:, 1:].astype(float)
    y = values[:, 0].astype(int)

    n = X.shape[0]
    np.random.seed(RANDOM_SEED if seed is None else seed)
    indices = np.random.permutation(n)
    # 4/6 of the shuffled rows train, the next 1/6 validate, the rest test.
    trainEnd = (4 * n) // 6
    valEnd = (5 * n) // 6
    trainingIndex = indices[:trainEnd]
    validationIndex = indices[trainEnd:valEnd]
    testIndex = indices[valEnd:]

    if(scale == 1):
        # Statistics come from the training rows only, so nothing about the
        # validation/test rows leaks into the scaling.
        trainRows = X[trainingIndex]
        mean = np.mean(trainRows, axis = 0)
        std = np.sqrt(np.var(trainRows, axis = 0))
        # A feature that is constant on the training rows has zero spread;
        # dividing by 1 leaves it centred instead of producing NaN/inf.
        std[std == 0] = 1.0
        X = (X - mean) / std

    return (X[trainingIndex], y[trainingIndex],
            X[validationIndex], y[validationIndex],
            X[testIndex], y[testIndex])

# Default (unscaled) split of the medical data.
X_train, y_train, X_val, y_val, X_test, y_test = prepareMedicalData()

# Blank line, header, then one "features-shape labels-shape" pair per partition.
print("\nTrain, Validation, Test")
print("\n".join(
    f"{features.shape} {labels.shape}"
    for features, labels in ((X_train, y_train), (X_val, y_val), (X_test, y_test))
))





Train, Validation, Test
(2000, 3) (2000,)
(500, 3) (500,)
(500, 3) (500,)


In [26]:
#KNN
(X_train, y_train, X_val, y_val, X_test, y_test) = prepareMedicalData(0)

# Validation sweep: remember the accuracy of every candidate K so that the
# best one is selected from the data instead of being copied by hand.
validationAccuracy = {}
for K in [1, 3, 10, 30, 100, 300]:
    y_pred = knnClassifier(X_val, X_train, y_train, K, euclideanDistance )
    acc = accuracy(y_pred, y_val)
    validationAccuracy[K] = acc
    print("K: ", K)
    print("accuracy_validation: ", acc)

print("\n\n\n")
# Evaluate once on the test set with the K that scored best on validation
# (the smallest such K when several tie).
K = max(validationAccuracy, key=validationAccuracy.get)
y_pred = knnClassifier(X_test, X_train, y_train, K, euclideanDistance )
acc = accuracy(y_pred, y_test)
print("K: ", K)
print("accuracy_test: ", acc)



K:  1
accuracy_validation:  0.886
K:  3
accuracy_validation:  0.874
K:  10
accuracy_validation:  0.868
K:  30
accuracy_validation:  0.87
K:  100
accuracy_validation:  0.856
K:  300
accuracy_validation:  0.796




K:  1
accuracy_test:  0.88


In [27]:
#Normalized KNN
(X_train, y_train, X_val, y_val, X_test, y_test) = prepareMedicalData(1)

# Validation sweep: remember the accuracy of every candidate K so that the
# best one is selected from the data instead of being copied by hand.
validationAccuracy = {}
for K in [1, 3, 10, 30, 100, 300]:
    y_pred = knnClassifier(X_val, X_train, y_train, K, euclideanDistance )
    acc = accuracy(y_pred, y_val)
    validationAccuracy[K] = acc
    print("K: ", K)
    print("accuracy_validation: ", acc)

print("\n\n\n")
# Evaluate once on the test set with the K that scored best on validation
# (the smallest such K when several tie).
K = max(validationAccuracy, key=validationAccuracy.get)
y_pred = knnClassifier(X_test, X_train, y_train, K, euclideanDistance )
acc = accuracy(y_pred, y_test)
print("K: ", K)
print("accuracy_test: ", acc)




K:  1
accuracy_validation:  0.882
K:  3
accuracy_validation:  0.87
K:  10
accuracy_validation:  0.87
K:  30
accuracy_validation:  0.874
K:  100
accuracy_validation:  0.858
K:  300
accuracy_validation:  0.802




K:  1
accuracy_test:  0.878


In [28]:
##RAILWAY BOOKING DATA

#membercount from 0 to 10, add 1
#preferredClass : FIRST_AC, NO_PREF, SECOND_AC, THIRD_AC
#Age is age category 0 to 8

def prepareRailwayData(scale = 0, path = 'railwayBookingList.csv', seed = None):
    """Load the railway booking CSV and split it 4:1:1 into train / validation / test.

    Column 1 of the file is used as the integer label and columns 2 onward as
    features. ``sex`` and ``preferredClass`` are encoded numerically, missing
    values become 0 and ``memberCount`` is shifted up by one.

    Parameters
    ----------
    scale : int, optional
        ``1`` standardises every feature with the mean and standard deviation
        of the *training* rows; any other value leaves the features untouched.
    path : str, optional
        Location of the CSV file.
    seed : int or None, optional
        Seed for the shuffle; ``None`` uses the notebook-wide ``RANDOM_SEED``.

    Returns
    -------
    tuple
        ``(X_train, y_train, X_val, y_val, X_test, y_test)``.
    """
    railwayData = pd.read_csv(path)

    # Encode BOTH categorical columns before filling the gaps. Filling first
    # would put a literal 0 into 'preferredClass', which the mapping below does
    # not know and would therefore turn into NaN again.
    railwayData['sex'] = railwayData['sex'].map({'female': 1, 'male': 0})
    railwayData['preferredClass'] = railwayData['preferredClass'].map({'FIRST_AC': 3, 'SECOND_AC': 2, 'THIRD_AC': 1, 'NO_PREF': 0})
    railwayData = railwayData.fillna(0)
    # Shift after filling so that a missing count ends up as 1, like a count of 0.
    railwayData['memberCount'] = railwayData['memberCount'] + 1

    values = railwayData.values
    X = values[:, 2:].astype(float)
    y = values[:, 1].astype(int)

    n = X.shape[0]
    np.random.seed(RANDOM_SEED if seed is None else seed)
    indices = np.random.permutation(n)
    # 4/6 of the shuffled rows train, the next 1/6 validate, the rest test.
    trainEnd = (4 * n) // 6
    valEnd = (5 * n) // 6
    trainingIndex = indices[:trainEnd]
    validationIndex = indices[trainEnd:valEnd]
    testIndex = indices[valEnd:]

    if(scale == 1):
        # Statistics come from the training rows only, so nothing about the
        # validation/test rows leaks into the scaling.
        trainRows = X[trainingIndex]
        mean = np.mean(trainRows, axis = 0)
        std = np.sqrt(np.var(trainRows, axis = 0))
        # A feature that is constant on the training rows has zero spread;
        # dividing by 1 leaves it centred instead of producing NaN/inf.
        std[std == 0] = 1.0
        X = (X - mean) / std

    return (X[trainingIndex], y[trainingIndex],
            X[validationIndex], y[validationIndex],
            X[testIndex], y[testIndex])


# Default (unscaled) split of the railway booking data.
X_train, y_train, X_val, y_val, X_test, y_test = prepareRailwayData()
#print(railwayData.head())
#print()
#print("GROUPED Mean")
#print(railwayData[['boarded', 'budget', 'preferredClass', 'memberCount', 'sex', 'age']].groupby('sex').mean())
#print()
# Header, then one "features-shape labels-shape" pair per partition.
print("Train, Validation, Test")
print("\n".join(
    f"{features.shape} {labels.shape}"
    for features, labels in ((X_train, y_train), (X_val, y_val), (X_test, y_test))
))



Train, Validation, Test
(873, 5) (873,)
(218, 5) (218,)
(219, 5) (219,)


In [29]:

#Unnormalized KNN with Euclidean Distance
(X_train, y_train, X_val, y_val, X_test, y_test) = prepareRailwayData(scale = 0)

# Validation sweep: remember the accuracy of every candidate K so that the
# best one is selected from the data instead of being copied by hand.
validationAccuracy = {}
for K in [1, 3, 10, 30, 100, 300]:
    y_pred = knnClassifier(X_val, X_train, y_train, K, euclideanDistance )
    acc = accuracy(y_pred, y_val)
    validationAccuracy[K] = acc
    print("K: ", K)
    print("accuracy_validation: ", acc)

print("\n\n\n")
# Evaluate once on the test set with the K that scored best on validation
# (the smallest such K when several tie).
K = max(validationAccuracy, key=validationAccuracy.get)
y_pred = knnClassifier(X_test, X_train, y_train, K, euclideanDistance )
acc = accuracy(y_pred, y_test)
print("K: ", K)
print("accuracy_test: ", acc)



K:  1
accuracy_validation:  0.7155963302752294
K:  3
accuracy_validation:  0.7844036697247706
K:  10
accuracy_validation:  0.7477064220183486
K:  30
accuracy_validation:  0.7385321100917431
K:  100
accuracy_validation:  0.6788990825688074
K:  300
accuracy_validation:  0.6972477064220184




K:  3
accuracy_test:  0.6940639269406392


In [30]:

#Normalized KNN with Euclidean Distance
(X_train, y_train, X_val, y_val, X_test, y_test) = prepareRailwayData(scale = 1)

# Validation sweep: remember the accuracy of every candidate K so that the
# best one is selected from the data instead of being copied by hand.
validationAccuracy = {}
for K in [1, 3, 10, 30, 100, 300]:
    y_pred = knnClassifier(X_val, X_train, y_train, K, euclideanDistance )
    acc = accuracy(y_pred, y_val)
    validationAccuracy[K] = acc
    print("K: ", K)
    print("accuracy_validation: ", acc)

print("\n\n\n")
# Evaluate once on the test set with the K that scored best on validation
# (the smallest such K when several tie).
K = max(validationAccuracy, key=validationAccuracy.get)
y_pred = knnClassifier(X_test, X_train, y_train, K, euclideanDistance)
acc = accuracy(y_pred, y_test)
print("K: ", K)
print("accuracy_test: ", acc)





K:  1
accuracy_validation:  0.7155963302752294
K:  3
accuracy_validation:  0.7889908256880734
K:  10
accuracy_validation:  0.7889908256880734
K:  30
accuracy_validation:  0.7935779816513762
K:  100
accuracy_validation:  0.7752293577981652
K:  300
accuracy_validation:  0.8027522935779816




K:  300
accuracy_test:  0.726027397260274
