In [17]:
import numpy as np
import pandas as pd
import matplotlib as plt

import time
import random
RANDOM_SEED = 42



In [18]:
# import sys
# !{sys.executable} -m pip3 install --upgrade pip
# !{sys.executable} -m pip3 install python-mnist



In [19]:
#KNN

def euclideanDistance (x,y):
    if len(x.shape)==1: return np.sqrt(np.sum(np.square(x-y)))
    else: return np.sqrt(np.sum((np.square(x - y)), axis = 1))

'''
def getKNeighbours(testX, X, K, distanceMetric):
    dists = distanceMetric(X, testX);
    ind = np.argpartition(dists, K)[0:K]
    return ind
'''

def getKNeighbours(testX, X, K, distanceMetric):
    #testX is a matrix
    dists = np.array([distanceMetric(X, testx) for testx in testX])
    ind = np.argpartition(dists, K, axis = 1)[:, 0:K]
    return ind


def knnClassifier(testX,trainX,trainY, K, distanceMetric):    
    indices = getKNeighbours(testX, trainX, K, distanceMetric)
    freqs = np.array([trainY[index] for index in indices])
    predY = [np.bincount(freq).argmax() for freq in freqs]
    return np.array(predY)

             

In [50]:
#Bayes
    

def parzenWindowEstimation_gaussian(testX, trainX, h):
    #each vector is d-dimensional
    #testX: (n,d), trainX: (m,d)
    d = trainX.shape[1]
    estimates = [np.mean(np.exp(-np.sum(np.square((testx - trainX)), axis = 1)/(2*(h*h)))/(np.float_power(np.sqrt(2*np.pi), d)*h)) for testx in testX]
    return np.array(estimates)
    
def bayesClassifier(testX, trainX, trainY, estimator, h = 1):
    A, priors = np.unique(trainY, return_counts = True)
    q = np.array([priors[idx]*parzenWindowEstimation_gaussian(testX, trainX[np.where(trainY == A[idx])], h) for idx in range(len(A))])
    return np.array([A[idx] for idx in np.argmax(q, axis = 0)])
    
def naiveBayesClassifier(testX, trainX, trainY, estimator, h = 1):
    A, priors = np.unique(trainY, return_counts = True)
    d = trainX.shape[1]
    q = np.array([priors[idx]*np.prod([parzenWindowEstimation_gaussian(testX[:,i], np.array([trainX[np.where(trainY == A[idx])][:,i]]).transpose(), h) for i in range(d)], axis=0) for idx in range(len(A))])
    return np.array([A[idx] for idx in np.argmax(q, axis = 0)])
    


In [30]:
#Evaluation Metrics

def accuracy(prediction, actual):
    return np.sum(prediction==actual)/prediction.shape[0]

#How to define with multi-class recall, precision and F1 score?
    


In [22]:
#Utility Functions


In [25]:

#FMNIST Data
def prepareFMNISTData(scale = 0):
    from mnist import MNIST
    mndata = MNIST('fashion_data')
    imagesTrain,labelsTrain = mndata.load_training()
    imagesTest, labelsTest = mndata.load_testing()

    X_test = np.array(imagesTest)
    y_test = np.array(labelsTest)

    import random
    n = len(imagesTrain)
    np.random.seed(RANDOM_SEED)
    indices = np.random.permutation(n)

    trainingIndex = indices[:int(4*n/5)]
    validationIndex = indices[int(4*n/5):]

    X_train = np.array(imagesTrain)[trainingIndex]
    y_train = np.array(labelsTrain)[trainingIndex]
    
    X_val = np.array(imagesTrain)[validationIndex]
    y_val = np.array(labelsTrain)[validationIndex]
    
    if(scale == 1):
        mean = np.mean(X_train, axis = 0)
        X_train = X_train - mean
        X_test = X_test - mean
        X_val = X_val - mean
        
        variance = np.var(X_train, axis = 0)
        X_train = X_train/np.sqrt(variance)
        X_test = X_test/np.sqrt(variance)
        X_val = X_val/np.sqrt(variance)


    return (X_train, y_train, X_val, y_val, X_test, y_test) 

(X_train, y_train, X_val, y_val, X_test, y_test) = prepareFMNISTData();


print("Train, Validation, Test")
print(X_train.shape, y_train.shape)
print(X_val.shape, y_val.shape)
print(X_test.shape, y_test.shape)


Train, Validation, Test
(48000, 784) (48000,)
(12000, 784) (12000,)
(10000, 784) (10000,)


In [24]:
(X_train, y_train, X_val, y_val, X_test, y_test) = prepareFMNISTData();
n_samples = 5
print("n_samples = ", n_samples)
indices = random.sample(range(0, 10000), n_samples)

for K in [1000, 100, 25, 5]:
    start = time.time()
    y_pred = knnClassifier(X_test[indices], X_train, y_train, K, euclideanDistance )
    acc = accuracy(y_pred, y_test[indices])
    end = time.time()
    #print("K: ", K)
    print("accuracy: ", acc)
    print("time taken: ", end-start)


n_samples =  5


KeyboardInterrupt: 

In [26]:
##MEDICAL DATA
def prepareMedicalData(scale = 0):
    medicalData = pd.read_csv('Medical_data.csv')
    
    '''
    print("GROUPED Mean")
    print(medicalData[['Health', 'TEST1', 'TEST2', 'TEST3']].groupby('Health').mean())
    print("GROUPED Standard Deviation")
    print(medicalData[['Health', 'TEST1', 'TEST2', 'TEST3']].groupby('Health').std())
    '''

    medicalData['Health'] = medicalData['Health'].map({'HEALTHY': 0, 'MEDICATION': 1, 'SURGERY': 2}).astype(int)
# Healthy == 0
# Medication == 1
# Surgery == 2
    X = medicalData.values[::, 1::]
    y = medicalData.values[::, 0].astype(int)

    n = X.shape[0]
    np.random.seed(RANDOM_SEED)
    indices = np.random.permutation(n)
    trainingIndex = indices[:int(4*n/6)]
    validationIndex = indices[int(4*n/6): int(5*n/6)]
    testIndex = indices[int(5*n/6):]

    X_train = X[trainingIndex]
    y_train = y[trainingIndex]

    if(scale == 1):
        mean = np.mean(X_train, axis = 0)
        X = X - mean
        variance = np.var(X_train, axis = 0)
        X = X/np.sqrt(variance)
        
        X_train = X[trainingIndex]
        y_train = y[trainingIndex]
        
    
    X_val = X[validationIndex]
    y_val = y[validationIndex]

    X_test = X[testIndex]
    y_test = y[testIndex]

    return (X_train, y_train, X_val, y_val, X_test, y_test)

(X_train, y_train, X_val, y_val, X_test, y_test) = prepareMedicalData()


print()
print("Train, Validation, Test")
print(X_train.shape, y_train.shape)
print(X_val.shape, y_val.shape)
print(X_test.shape, y_test.shape)





Train, Validation, Test
(2000, 3) (2000,)
(500, 3) (500,)
(500, 3) (500,)


In [10]:
#KNN
(X_train, y_train, X_val, y_val, X_test, y_test) = prepareMedicalData(0)
for K in [1, 3, 10, 30, 100, 300]:
    start = time.time()
    y_pred = knnClassifier(X_val, X_train, y_train, K, euclideanDistance )
    acc = accuracy(y_pred, y_val)
    end = time.time()
    print("K: ", K)
    print("accuracy_validation: ", acc)
    #print("time taken: ", end-start)

print("\n\n\n")
#Accuracy peaks for K=1
K = 1
y_pred = knnClassifier(X_test, X_train, y_train, K, euclideanDistance )
acc = accuracy(y_pred, y_test)
print("K: ", K)
print("accuracy_test: ", acc)



K:  1
accuracy_validation:  0.886
K:  3
accuracy_validation:  0.874
K:  10
accuracy_validation:  0.868
K:  30
accuracy_validation:  0.87
K:  100
accuracy_validation:  0.856
K:  300
accuracy_validation:  0.796




K:  1
accuracy_test:  0.88


In [11]:
#Normalized KNN
(X_train, y_train, X_val, y_val, X_test, y_test) = prepareMedicalData(1)
for K in [1, 3, 10, 30, 100, 300]:
    start = time.time()
    y_pred = knnClassifier(X_val, X_train, y_train, K, euclideanDistance )
    acc = accuracy(y_pred, y_val)
    end = time.time()
    print("K: ", K)
    print("accuracy_validation: ", acc)
    #print("time taken: ", end-start)

print("\n\n\n")
#Accuracy peaks for K=1
K = 1
y_pred = knnClassifier(X_test, X_train, y_train, K, euclideanDistance )
acc = accuracy(y_pred, y_test)
print("K: ", K)
print("accuracy_test: ", acc)




K:  1
accuracy_validation:  0.882
K:  3
accuracy_validation:  0.87
K:  10
accuracy_validation:  0.87
K:  30
accuracy_validation:  0.874
K:  100
accuracy_validation:  0.858
K:  300
accuracy_validation:  0.802




K:  1
accuracy_test:  0.878


In [54]:
#ParzenWindow Naive Bayes
(X_train, y_train, X_val, y_val, X_test, y_test) = prepareRailwayData(1)
for H in [0.001, 0.003, 0.01, 0.03, 0.1, 0.3, 1, 3, 10, 30, 100, 300]:
    start = time.time()
    y_pred = naiveBayesClassifier(X_val, X_train, y_train, parzenWindowEstimation_gaussian, H)
    acc = accuracy(y_pred, y_val)
    end = time.time()
    print("H: ", H)
    print("accuracy_validation: ", acc)
    print("time taken: ", end-start)

print("\n\n\n")
#Accuracy peaks for H = 0.1
H = 0.1
y_pred = naiveBayesClassifier(X_test, X_train, y_train, parzenWindowEstimation_gaussian, H)
acc = accuracy(y_pred, y_test)
print("H: ", H)
print("accuracy_test: ", acc)



H:  0.001
accuracy_validation:  0.7568807339449541
time taken:  0.08260321617126465
H:  0.003
accuracy_validation:  0.7568807339449541
time taken:  0.07741570472717285
H:  0.01
accuracy_validation:  0.7568807339449541
time taken:  0.07424426078796387
H:  0.03
accuracy_validation:  0.7568807339449541
time taken:  0.08136725425720215
H:  0.1
accuracy_validation:  0.7614678899082569
time taken:  0.07462048530578613
H:  0.3
accuracy_validation:  0.7614678899082569
time taken:  0.07416844367980957
H:  1
accuracy_validation:  0.7889908256880734
time taken:  0.07696366310119629
H:  3
accuracy_validation:  0.6559633027522935
time taken:  0.08341336250305176
H:  10
accuracy_validation:  0.6467889908256881
time taken:  0.07584881782531738
H:  30
accuracy_validation:  0.6467889908256881
time taken:  0.07637262344360352
H:  100
accuracy_validation:  0.6467889908256881
time taken:  0.07621955871582031
H:  300
accuracy_validation:  0.6467889908256881
time taken:  0.07447075843811035




H:  0.1
accu

In [52]:
#ParzenWindow Bayes
(X_train, y_train, X_val, y_val, X_test, y_test) = prepareMedicalData(1)
for H in [0.001, 0.003, 0.01, 0.03, 0.1, 0.3, 1, 3, 10, 30, 100, 300]:
    start = time.time()
    y_pred = bayesClassifier(X_val, X_train, y_train, parzenWindowEstimation_gaussian, H)
    acc = accuracy(y_pred, y_val)
    end = time.time()
    print("H: ", H)
    print("accuracy_validation: ", acc)
    print("time taken: ", end-start)

print("\n\n\n")
#Accuracy peaks for H = 0.1
H = 0.1
y_pred = bayesClassifier(X_test, X_train, y_train, parzenWindowEstimation_gaussian, H)
acc = accuracy(y_pred, y_test)
print("H: ", H)
print("accuracy_test: ", acc)




H:  0.001
accuracy_validation:  0.366
time taken:  0.08209037780761719
H:  0.003
accuracy_validation:  0.662
time taken:  0.08033323287963867
H:  0.01
accuracy_validation:  0.874
time taken:  0.07841181755065918
H:  0.03
accuracy_validation:  0.884
time taken:  0.09086418151855469
H:  0.1
accuracy_validation:  0.89
time taken:  0.08337211608886719
H:  0.3
accuracy_validation:  0.882
time taken:  0.08438897132873535
H:  1
accuracy_validation:  0.796
time taken:  0.09182190895080566
H:  3
accuracy_validation:  0.716
time taken:  0.08039522171020508
H:  10
accuracy_validation:  0.322
time taken:  0.07985830307006836
H:  30
accuracy_validation:  0.322
time taken:  0.08328819274902344
H:  100
accuracy_validation:  0.322
time taken:  0.08427667617797852
H:  300
accuracy_validation:  0.322
time taken:  0.09776496887207031




H:  0.1
accuracy_test:  0.882


In [13]:
##RAILWAY BOOKING DATA

#membercount from 0 to 10, add 1
#preferredClass : FIRST_AC, NO_PREF, SECOND_AC, THIRD_AC
#Age is age category 0 to 8

def prepareRailwayData(scale = 0):
    railwayData = pd.read_csv('railwayBookingList.csv')

    railwayData['sex'] = railwayData['sex'].map({'female': 1, 'male': 0})
    railwayData.fillna(0, inplace = True)
    railwayData['memberCount'] = railwayData['memberCount'] + 1
    railwayData['preferredClass'] = railwayData['preferredClass'].map({'FIRST_AC': 3, 'SECOND_AC': 2, 'THIRD_AC': 1, 'NO_PREF': 0})

    X = railwayData.values[::, 2::]
    y = railwayData.values[::, 1].astype(int)
                
        
    n = X.shape[0]
    np.random.seed(RANDOM_SEED)
    indices = np.random.permutation(n)
    trainingIndex = indices[:int(4*n/6)]
    validationIndex = indices[int(4*n/6): int(5*n/6)]
    testIndex = indices[int(5*n/6):]

    X_train = X[trainingIndex]
    y_train = y[trainingIndex]

    if(scale == 1):
        mean = np.mean(X_train, axis = 0)
        X = X - mean
        variance = np.var(X_train, axis = 0)
        X = X/np.sqrt(variance)
        
        X_train = X[trainingIndex]
        y_train = y[trainingIndex]

            
    X_val = X[validationIndex]
    y_val = y[validationIndex]
    
    X_test = X[testIndex]
    y_test = y[testIndex]

    return (X_train, y_train, X_val, y_val, X_test, y_test)


(X_train, y_train, X_val, y_val, X_test, y_test) = prepareRailwayData()
#print(railwayData.head())
#print()
#print("GROUPED Mean")
#print(railwayData[['boarded', 'budget', 'preferredClass', 'memberCount', 'sex', 'age']].groupby('sex').mean())
#print()
print("Train, Validation, Test")
print(X_train.shape, y_train.shape)
print(X_val.shape, y_val.shape)
print(X_test.shape, y_test.shape)



Train, Validation, Test
(873, 5) (873,)
(218, 5) (218,)
(219, 5) (219,)


In [14]:

#Unnormalized KNN with Euclidean Distance
(X_train, y_train, X_val, y_val, X_test, y_test) = prepareRailwayData(scale = 0)
for K in [1, 3, 10, 30, 100, 300]:
    start = time.time()
    y_pred = knnClassifier(X_val, X_train, y_train, K, euclideanDistance )
    acc = accuracy(y_pred, y_val)
    end = time.time()
    print("K: ", K)
    print("accuracy_validation: ", acc)
    #print("time taken: ", end-start)

print("\n\n\n")
#Accuracy peaks for K=3
K = 3
y_pred = knnClassifier(X_test, X_train, y_train, K, euclideanDistance )
acc = accuracy(y_pred, y_test)
print("K: ", K)
print("accuracy_test: ", acc)



K:  1
accuracy_validation:  0.7155963302752294
K:  3
accuracy_validation:  0.7844036697247706
K:  10
accuracy_validation:  0.7477064220183486
K:  30
accuracy_validation:  0.7385321100917431
K:  100
accuracy_validation:  0.6788990825688074
K:  300
accuracy_validation:  0.6972477064220184




K:  3
accuracy_test:  0.6940639269406392


In [15]:

#Normalized KNN with Euclidean Distance
(X_train, y_train, X_val, y_val, X_test, y_test) = prepareRailwayData(scale = 1)
for K in [1, 3, 10, 30, 100, 300]:
    start = time.time()
    y_pred = knnClassifier(X_val, X_train, y_train, K, euclideanDistance )
    acc = accuracy(y_pred, y_val)
    end = time.time()
    print("K: ", K)
    print("accuracy_validation: ", acc)
    #print("time taken: ", end-start)

print("\n\n\n")
#Accuracy peaks for K=30
K = 300
y_pred = knnClassifier(X_test, X_train, y_train, K, euclideanDistance)
acc = accuracy(y_pred, y_test)
print("K: ", K)
print("accuracy_test: ", acc)





K:  1
accuracy_validation:  0.7155963302752294
K:  3
accuracy_validation:  0.7889908256880734
K:  10
accuracy_validation:  0.7889908256880734
K:  30
accuracy_validation:  0.7935779816513762
K:  100
accuracy_validation:  0.7752293577981652
K:  300
accuracy_validation:  0.8027522935779816




K:  300
accuracy_test:  0.726027397260274


In [16]:
a = np.random.rand(10000)
%timeit np.square(a)
%timeit np.multiply(a.transpose(), a)

4.65 µs ± 39.5 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)
4.58 µs ± 28.1 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)


In [None]:
a = np.array([1,2,3,4])
for idx, i in a:
    print(i)