In [5]:
%load_ext autoreload
%autoreload 2
#Reloads import libraries before every piece of code is run so that changes in function.py reflect immediately


import numpy as np
import pandas as pd
import matplotlib as plt

import time
import random
RANDOM_SEED = 42
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)

#import functions
from functions import prepareFMNISTData, prepareRailwayData, prepareMedicalData
from functions import euclideanDistance, manhattenDistance, chebyschevDistance
from functions import accuracy
from functions import knnClassifier, bayesClassifier, naiveBayesClassifier
from functions import kNearestNeighboursEstimation, parzenWindowEstimation_gaussian

# import sys
# !{sys.executable} -m pip3 install --upgrade pip
# !{sys.executable} -m pip3 install python-mnist


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
(X_train, y_train, X_val, y_val, X_test, y_test) = prepareFMNISTData(0);
n_samples = 30
print("n_samples = ", n_samples)
indices = random.sample(range(0, 10000), n_samples)
for K in [1]:
    start = time.time()
    y_pred = bayesClassifier(X_test[indices], X_train, y_train, parzenWindowEstimation_gaussian, K) 
    #print(y_pred, y_test[indices])
    acc = accuracy(y_pred, y_test[indices])
    end = time.time()
    #print("K: ", K)
    print("accuracy: ", acc)
    print("time taken: ", end-start)


n_samples =  30
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0] [5 1 7 4 3 0 4 7 1 9 7 2 0 1 0 4 5 4 2 4 6 3 2 2 0 1 1 5 2 0]
accuracy:  0.16666666666666666
time taken:  7.5740578174591064


In [3]:
##MEDICAL DATA
(X_train, y_train, X_val, y_val, X_test, y_test) = prepareMedicalData()
print()
print("Train, Validation, Test")
print(X_train.shape, y_train.shape)
print(X_val.shape, y_val.shape)
print(X_test.shape, y_test.shape)





Train, Validation, Test
(2000, 3) (2000,)
(500, 3) (500,)
(500, 3) (500,)


In [4]:
#KNN
(X_train, y_train, X_val, y_val, X_test, y_test) = prepareMedicalData(0)
for K in [1, 3, 10, 30, 100, 300]:
    start = time.time()
    y_pred = knnClassifier(X_val, X_train, y_train, K, euclideanDistance )
    acc = accuracy(y_pred, y_val)
    end = time.time()
    print("K: ", K)
    print("accuracy_validation: ", acc)
    #print("time taken: ", end-start)

print("\n\n\n")
#Accuracy peaks for K=1
K = 1
y_pred = knnClassifier(X_test, X_train, y_train, K, euclideanDistance )
acc = accuracy(y_pred, y_test)
print("K: ", K)
print("accuracy_test: ", acc)



K:  1
accuracy_validation:  0.886
K:  3
accuracy_validation:  0.874
K:  10
accuracy_validation:  0.868
K:  30
accuracy_validation:  0.87
K:  100
accuracy_validation:  0.856
K:  300
accuracy_validation:  0.796




K:  1
accuracy_test:  0.88


In [5]:
#Normalized KNN
(X_train, y_train, X_val, y_val, X_test, y_test) = prepareMedicalData(1)
for K in [1, 3, 10, 30, 100, 300]:
    start = time.time()
    y_pred = knnClassifier(X_val, X_train, y_train, K, euclideanDistance )
    acc = accuracy(y_pred, y_val)
    end = time.time()
    print("K: ", K)
    print("accuracy_validation: ", acc)
    #print("time taken: ", end-start)

print("\n\n\n")
#Accuracy peaks for K=1
K = 1
y_pred = knnClassifier(X_test, X_train, y_train, K, euclideanDistance )
acc = accuracy(y_pred, y_test)
print("K: ", K)
print("accuracy_test: ", acc)




K:  1
accuracy_validation:  0.882
K:  3
accuracy_validation:  0.87
K:  10
accuracy_validation:  0.87
K:  30
accuracy_validation:  0.874
K:  100
accuracy_validation:  0.858
K:  300
accuracy_validation:  0.802




K:  1
accuracy_test:  0.878


In [6]:
#ParzenWindow  Bayes
(X_train, y_train, X_val, y_val, X_test, y_test) = prepareMedicalData(1)
for H in [np.sqrt, np.abs, np.log, lambda n: n/2]:
    start = time.time()
    y_pred = bayesClassifier(X_val, X_train, y_train, kNearestNeighboursEstimation, H)
    
    acc = accuracy(y_pred, y_val)
    end = time.time()
    print("H: ", H)
    print("accuracy_validation: ", acc)
    print("time taken: ", end-start)

print("\n\n\n")
#Accuracy peaks for H = 0.1
H = np.log
y_pred = bayesClassifier(X_test, X_train, y_train, kNearestNeighboursEstimation, H)
acc = accuracy(y_pred, y_test)
print("H: ", H)
print("accuracy_test: ", acc)



H:  <ufunc 'sqrt'>
accuracy_validation:  0.85
time taken:  0.09196090698242188
H:  <ufunc 'absolute'>
accuracy_validation:  0.526
time taken:  0.07190608978271484
H:  <ufunc 'log'>
accuracy_validation:  0.876
time taken:  0.08338785171508789
H:  <function <lambda> at 0x10b0ca730>
accuracy_validation:  0.73
time taken:  0.09158515930175781




H:  <ufunc 'log'>
accuracy_test:  0.86


In [7]:
#ParzenWindow Bayes
(X_train, y_train, X_val, y_val, X_test, y_test) = prepareMedicalData(1)
for H in [0.001, 0.003, 0.01, 0.03, 0.1, 0.3, 1, 3, 10, 30, 100, 300]:
    start = time.time()
    y_pred = bayesClassifier(X_val, X_train, y_train, parzenWindowEstimation_gaussian, H)
    acc = accuracy(y_pred, y_val)
    end = time.time()
    print("H: ", H)
    print("accuracy_validation: ", acc)
    print("time taken: ", end-start)

print("\n\n\n")
#Accuracy peaks for H = 0.1
H = 0.1
y_pred = bayesClassifier(X_test, X_train, y_train, parzenWindowEstimation_gaussian, H)
acc = accuracy(y_pred, y_test)
print("H: ", H)
print("accuracy_test: ", acc)




H:  0.001
accuracy_validation:  0.368
time taken:  0.10215091705322266
H:  0.003
accuracy_validation:  0.664
time taken:  0.09500885009765625
H:  0.01
accuracy_validation:  0.874
time taken:  0.09470391273498535
H:  0.03
accuracy_validation:  0.884
time taken:  0.10492205619812012
H:  0.1
accuracy_validation:  0.89
time taken:  0.10155606269836426
H:  0.3
accuracy_validation:  0.882
time taken:  0.10697698593139648
H:  1
accuracy_validation:  0.796
time taken:  0.11120080947875977
H:  3
accuracy_validation:  0.716
time taken:  0.10945320129394531
H:  10
accuracy_validation:  0.322
time taken:  0.10027408599853516
H:  30
accuracy_validation:  0.322
time taken:  0.10784316062927246
H:  100
accuracy_validation:  0.322
time taken:  0.1041557788848877
H:  300
accuracy_validation:  0.322
time taken:  0.10130476951599121




H:  0.1
accuracy_test:  0.882


In [8]:



(X_train, y_train, X_val, y_val, X_test, y_test) = prepareRailwayData()
#print(railwayData.head())
#print()
#print("GROUPED Mean")
#print(railwayData[['boarded', 'budget', 'preferredClass', 'memberCount', 'sex', 'age']].groupby('sex').mean())
#print()
print("Train, Validation, Test")
print(X_train.shape, y_train.shape)
print(X_val.shape, y_val.shape)
print(X_test.shape, y_test.shape)



Train, Validation, Test
(873, 5) (873,)
(218, 5) (218,)
(219, 5) (219,)


In [11]:

#Unnormalized KNN with Euclidean Distance
(X_train, y_train, X_val, y_val, X_test, y_test) = prepareRailwayData(scale = 0)
for K in [1, 3, 10, 30, 100, 300]:
    start = time.time()
    y_pred = knnClassifier(X_val, X_train, y_train, K, euclideanDistance )
    acc = accuracy(y_pred, y_val)
    end = time.time()
    print("K: ", K)
    print("accuracy_validation: ", acc)
    #print("time taken: ", end-start)

print("\n\n\n")
#Accuracy peaks for K=3
K = 3
y_pred = knnClassifier(X_test, X_train, y_train, K, euclideanDistance )
acc = accuracy(y_pred, y_test)
print("K: ", K)
print("accuracy_test: ", acc)



K:  1
accuracy_validation:  0.7155963302752294
K:  3
accuracy_validation:  0.7844036697247706
K:  10
accuracy_validation:  0.7477064220183486
K:  30
accuracy_validation:  0.7385321100917431
K:  100
accuracy_validation:  0.6788990825688074
K:  300
accuracy_validation:  0.6972477064220184




K:  3
accuracy_test:  0.6940639269406392


In [3]:
#Normalized KNN with Euclidean Distance
(X_train, y_train, X_val, y_val, X_test, y_test) = prepareRailwayData(scale = 1)
for K in [1, 3, 10, 30, 100, 300]:
    start = time.time()
    y_pred = knnClassifier(X_val, X_train, y_train, K, euclideanDistance )
    acc = accuracy(y_pred, y_val)
    end = time.time()
    print("K: ", K)
    print("accuracy_validation: ", acc)
    #print("time taken: ", end-start)

print("\n\n\n")
#Accuracy peaks for K=30
K = 30
y_pred = knnClassifier(X_test, X_train, y_train, K, euclideanDistance)
acc = accuracy(y_pred, y_test)
print("K: ", K)
print("accuracy_test: ", acc)





1234
K:  1
accuracy_validation:  0.7155963302752294
1234
K:  3
accuracy_validation:  0.7889908256880734
1234
K:  10
accuracy_validation:  0.7889908256880734
1234
K:  30
accuracy_validation:  0.7935779816513762
1234
K:  100
accuracy_validation:  0.7752293577981652
1234
K:  300
accuracy_validation:  0.8027522935779816




1234
K:  30
accuracy_test:  0.7488584474885844
