In [1]:
from matplotlib import pyplot as plt
from sklearn import datasets, metrics
from sklearn.neighbors import KNeighborsClassifier
from keras.datasets import mnist
import pickle
import time
import pandas as pd

Using TensorFlow backend.


In [2]:

















# Fetch MNIST: load_data() yields a (images, labels) pair for the training
# split followed by a (images, labels) pair for the test split.
(img_train, val_train), (img_test, val_test) = mnist.load_data()
# Report both image-array shapes, one per line.
print(img_train.shape, img_test.shape, sep='\n')

Downloading data from https://s3.amazonaws.com/img-datasets/mnist.npz
(60000, 28, 28)
(10000, 28, 28)


In [3]:
# Flatten every training image (a 2-D matrix) into a single row vector,
# keeping one row per image and letting reshape infer the row length.
n = img_train.shape[0]
data = img_train.reshape((n, -1))
print(data.shape)

(60000, 784)


In [None]:
# Experiment settings and the accumulators filled in by the later cells.
max_k = 29              # upper bound on the neighbour count k
dist_algo = 'jaccard'   # distance metric name handed to the classifier
# Column labels of the final results table.
headers = ['train-time', 'predict-time', 'accuracy']
# Four independent, initially empty lists: row labels and per-model metrics.
index_name, train_time, predict_time, accuracy = [], [], [], []

In [5]:
# training: fit one KNN classifier per odd k in [1, max_k], time the fit,
# and persist each fitted model to '<k>-NN <dist_algo>.sav'.
#
# The accumulators are reset first so that re-running this cell does not
# append duplicate rows to the results gathered for the summary table.
index_name = []
train_time = []
for k in range(1, max_k+1, 2):
    print('Training ' + str(k) + '-NN with ' + dist_algo + ' distance algorithm')
    index_name.append(str(k) + '-NN')

    # create KNN classifier; only construction and fit are inside the timer
    start = time.time()
    # NOTE(review): p is documented as the Minkowski power parameter, so it
    # presumably has no effect with a non-Minkowski metric - confirm.
    knn = KNeighborsClassifier(algorithm='auto', metric=dist_algo, p=2, n_neighbors=k)
    knn.fit(data, val_train)
    end = time.time()
    print("Training time: %s seconds" % str(end - start))
    train_time.append(round(end-start,2))

    # save model; the context manager guarantees the file handle is flushed
    # and closed even if pickling fails part-way
    filename = str(k) + '-NN ' + dist_algo + '.sav'
    with open(filename, 'wb') as model_file:
        pickle.dump(knn, model_file)
    print("%s saved\n" %filename)
print('Train complete!')

Training 1-NN with jaccard distance algorithm
Training time: 35.2480731010437 seconds
1-NN jaccard.sav saved

Training 3-NN with jaccard distance algorithm
Training time: 35.08888292312622 seconds
3-NN jaccard.sav saved

Training 5-NN with jaccard distance algorithm
Training time: 35.00754189491272 seconds
5-NN jaccard.sav saved

Training 7-NN with jaccard distance algorithm
Training time: 34.887356996536255 seconds
7-NN jaccard.sav saved

Training 9-NN with jaccard distance algorithm
Training time: 34.91658806800842 seconds
9-NN jaccard.sav saved

Training 11-NN with jaccard distance algorithm
Training time: 34.990676403045654 seconds
11-NN jaccard.sav saved

Training 13-NN with jaccard distance algorithm
Training time: 34.83969831466675 seconds
13-NN jaccard.sav saved

Training 15-NN with jaccard distance algorithm
Training time: 35.03442430496216 seconds
15-NN jaccard.sav saved

Training 17-NN with jaccard distance algorithm
Training time: 35.08758020401001 seconds
17-NN jaccard.sav

In [None]:
# convert test images (matrices) to vectors: one flattened row per image.
# No timer is started here; the prediction cell starts its own timer
# immediately before each predict call.
n = len(img_test)
test_data = img_test.reshape(n, -1)

In [7]:
# predict data: reload each saved model, time its prediction over the test
# vectors, and record the fraction of correctly classified test samples.
#
# The accumulators are reset first so that re-running this cell does not
# append duplicate rows to the results gathered for the summary table.
predict_time = []
accuracy = []
for k in range(1, max_k+1, 2):
    filename = str(k) + '-NN ' + dist_algo + '.sav'
    # SECURITY: unpickling executes arbitrary code; only load .sav files
    # written by the training cell of this notebook, never untrusted ones.
    with open(filename, 'rb') as model_file:
        loaded_knn = pickle.load(model_file)

    start = time.time()
    print('Predicting ' + str(k) + '-NN')
    predicted = loaded_knn.predict(test_data)
    end = time.time()  # stop the timer before any reporting output
    print(len(predicted))
    print("Predict time: %s seconds" % str(end - start))
    predict_time.append(round(end-start,2))

    # accuracy = correctly classified samples / total samples
    accuracy.append(float(metrics.accuracy_score(val_test, predicted)))
print('Predict complete')

Predicting 1-NN
10000
Predict time: 1272.4507057666779 seconds
Predicting 3-NN
10000
Predict time: 1273.8691248893738 seconds
Predicting 5-NN
10000
Predict time: 1285.8642988204956 seconds
Predicting 7-NN
10000
Predict time: 1266.8742537498474 seconds
Predicting 9-NN
10000
Predict time: 1268.8411684036255 seconds
Predicting 11-NN
10000
Predict time: 1259.9489018917084 seconds
Predicting 13-NN
10000
Predict time: 1264.0600035190582 seconds
Predicting 15-NN
10000
Predict time: 1256.814787387848 seconds
Predicting 17-NN
10000
Predict time: 1265.665010213852 seconds
Predicting 19-NN
10000
Predict time: 1265.4880576133728 seconds
Predicting 21-NN
10000
Predict time: 1262.929990530014 seconds
Predicting 23-NN
10000
Predict time: 1245.5977976322174 seconds
Predicting 25-NN
10000
Predict time: 1254.9027009010315 seconds
Predicting 27-NN
10000
Predict time: 1255.219843864441 seconds
Predicting 29-NN
10000
Predict time: 1269.1429278850555 seconds
Predict complete


In [8]:
# Build the results table: one (train-time, predict-time, accuracy) row per
# model, labelled by index_name, then show it and write it out as CSV.
list_of_result = [row for row in zip(train_time, predict_time, accuracy)]
df = pd.DataFrame(data=list_of_result, index=index_name, columns=headers)
print(df)
df.to_csv('result.csv', sep=',')

       train-time  predict-time  accuracy
1-NN        35.25       1272.45    0.9598
3-NN        35.09       1273.87    0.9618
5-NN        35.01       1285.86    0.9628
7-NN        34.89       1266.87    0.9622
9-NN        34.92       1268.84    0.9613
11-NN       34.99       1259.95    0.9605
13-NN       34.84       1264.06    0.9607
15-NN       35.03       1256.81    0.9594
17-NN       35.09       1265.67    0.9595
19-NN       35.22       1265.49    0.9588
21-NN       35.05       1262.93    0.9588
23-NN       35.11       1245.60    0.9584
25-NN       35.10       1254.90    0.9569
27-NN       35.18       1255.22    0.9563
29-NN       35.21       1269.14    0.9564
