In [1]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.decomposition import PCA
from sklearn.preprocessing import normalize
import numpy as np

In [2]:
def get_data_and_labels(images_filename, labels_filename):
    """Load images and their labels from a pair of binary files.

    The images file is read as: 4 bytes that are skipped, then three
    big-endian 4-byte integers (item count, rows, columns), then
    ``rows * columns`` single-byte values per image.  The labels file is read
    as: 8 bytes that are skipped, then one single-byte label per image.
    The skipped header bytes are presumably magic numbers / counts; they are
    not validated here.

    Args:
        images_filename: Path of the binary images file.
        labels_filename: Path of the binary labels file.

    Returns:
        A tuple ``(data, labels)`` where ``data`` is a list with one list of
        integer values (0-255) per image and ``labels`` is a list with one
        integer per image.

    Raises:
        OSError: If either file cannot be opened.
        ValueError: If either file ends before all announced bytes were read
            (previously such files were silently decoded as zeros).
    """

    def read_exact(handle, count, what):
        # A short read means the file is truncated; int.from_bytes(b"") would
        # otherwise quietly turn the missing bytes into 0.
        chunk = handle.read(count)
        if len(chunk) != count:
            raise ValueError(
                "Unexpected end of file while reading %s: wanted %d byte(s), "
                "got %d" % (what, count, len(chunk)))
        return chunk

    print("Opening files ...")
    # Opening both files in one `with` guarantees the first handle is released
    # even when opening the second one fails.
    with open(images_filename, "rb") as images_file, \
            open(labels_filename, "rb") as labels_file:
        try:
            print("Reading files ...")
            read_exact(images_file, 4, "image header")  # skipped, not checked
            num_of_items = int.from_bytes(
                read_exact(images_file, 4, "image header"), byteorder="big")
            num_of_rows = int.from_bytes(
                read_exact(images_file, 4, "image header"), byteorder="big")
            num_of_colums = int.from_bytes(
                read_exact(images_file, 4, "image header"), byteorder="big")
            read_exact(labels_file, 8, "label header")  # skipped, not checked

            num_of_image_values = num_of_rows * num_of_colums
            data = []
            labels = []
            for item in range(num_of_items):
                if item % 10000 == 0:
                    print("Current image number: %7d" % item)
                # One bulk read per image; iterating a bytes object yields the
                # same integers the byte-by-byte decoding produced.
                data.append(list(read_exact(
                    images_file, num_of_image_values, "image %d" % item)))
                labels.append(read_exact(
                    labels_file, 1, "label %d" % item)[0])
            return data, labels
        finally:
            # Emitted on success and on failure, right before the `with`
            # statement releases both handles.
            print("Files closed.")


In [3]:
# Load the training images and labels from the local MNIST files
# (paths are relative to the notebook's working directory).
train_x, train_y  = get_data_and_labels("mnist_train_images", "mnist_train_labels")

Opening files ...
Reading files ...
Current image number:       0
Current image number:   10000
Current image number:   20000
Current image number:   30000
Current image number:   40000
Current image number:   50000
Files closed.


In [4]:
# Load the test images and labels from the local MNIST files
# (paths are relative to the notebook's working directory).
test_x, test_y  = get_data_and_labels("mnist_test_images", "mnist_test_labels")

Opening files ...
Reading files ...
Current image number:       0
Files closed.


In [9]:
# Normalize the entire training set with sklearn's `normalize` (default
# settings). my_pipeline draws its prototypes from this normalized copy,
# while the raw train_x is kept for the un-normalized baseline.
normal_train_x = normalize(train_x)

In [23]:
import random

def get_samples_random(x, y, n):
    """Draw ``n`` distinct random positions and return the matching items.

    Args:
        x: Indexable collection of samples; its length defines the index pool.
        y: Indexable collection of labels, indexed with the same positions.
        n: Number of positions to draw without replacement.

    Returns:
        A tuple ``(random_x, random_y)`` of two lists of length ``n`` holding
        the elements of ``x`` and ``y`` at the drawn positions, in draw order.
    """
    chosen = random.sample(range(len(x)), n)
    picked_x = [x[idx] for idx in chosen]
    picked_y = [y[idx] for idx in chosen]
    return picked_x, picked_y

In [24]:
def my_prototypes(x_norm, y, n):
    """Pick ``n`` prototypes uniformly at random, without replacement.

    Args:
        x_norm: Indexable collection of samples (the caller in this notebook
            passes the normalized training set).
        y: Indexable collection of labels aligned with ``x_norm``.
        n: Number of prototypes to select.

    Returns:
        A tuple ``(proto_x, proto_y)`` of two lists of length ``n`` with the
        selected samples and their labels, in selection order.
    """
    selected = random.sample(range(len(x_norm)), n)

    proto_x, proto_y = [], []
    for position in selected:
        proto_x.append(x_norm[position])
        proto_y.append(y[position])

    return proto_x, proto_y

In [25]:
def _test_error_rate(classifier, train_features, train_labels,
                     test_features, test_labels):
    """Fit ``classifier`` on the training pair and return its test error rate.

    The error rate is the number of predictions that differ from
    ``test_labels`` divided by the number of test samples.
    """
    classifier.fit(train_features, train_labels)
    predictions = classifier.predict(test_features)
    # Vectorized mismatch count replaces the per-sample Python loop.
    num_wrong = int(np.count_nonzero(
        np.asarray(predictions) != np.asarray(test_labels)))
    return num_wrong / len(test_features)


def my_pipeline(num_samples, num_runs=10):
    """Compare 1-NN test error for raw random samples vs. normalized prototypes.

    For each run, two training subsets of ``num_samples`` items are drawn
    independently: one from the raw training set via ``get_samples_random``
    (evaluated on the raw test set) and one from the normalized training set
    via ``my_prototypes`` (evaluated on the normalized test set).  One line
    per run is printed: ``Run <k> errors: <raw error> <normalized error>``.

    Relies on the notebook globals ``train_x``, ``train_y``,
    ``normal_train_x``, ``test_x`` and ``test_y``.

    Args:
        num_samples: Number of training items drawn for each classifier.
        num_runs: Number of repetitions; defaults to 10, the value that was
            previously hard-coded.

    Returns:
        None.  Results are only printed.
    """
    my_1nn = KNeighborsClassifier(n_neighbors=1)

    # normalized test set for use with prototype
    test_x_norm = normalize(test_x)

    for i in range(num_runs):
        # baseline: random raw samples, evaluated on the raw test set
        my_x, my_y = get_samples_random(train_x, train_y, num_samples)
        my_error1 = _test_error_rate(my_1nn, my_x, my_y, test_x, test_y)

        # prototypes: drawn from the normalized training set, evaluated on
        # the normalized test set
        my_x, my_y = my_prototypes(normal_train_x, train_y, num_samples)
        my_error2 = _test_error_rate(my_1nn, my_x, my_y, test_x_norm, test_y)

        # print results
        print("Run %d errors: %s %s" % (i + 1, my_error1, my_error2))
    

In [26]:
# Compare both variants using 1000 training samples per classifier.
my_pipeline(1000)

Run 1 errors: 0.1179 0.095
Run 2 errors: 0.1103 0.0998
Run 3 errors: 0.1174 0.1014
Run 4 errors: 0.1235 0.0954
Run 5 errors: 0.1093 0.1036
Run 6 errors: 0.1222 0.1005
Run 7 errors: 0.1144 0.1039
Run 8 errors: 0.1117 0.0948
Run 9 errors: 0.115 0.1012
Run 10 errors: 0.1206 0.1034


In [28]:
# Compare both variants using 5000 training samples per classifier.
my_pipeline(5000)

Run 1 errors: 0.0633 0.0572
Run 2 errors: 0.0642 0.0562
Run 3 errors: 0.0629 0.0557
Run 4 errors: 0.0663 0.0556
Run 5 errors: 0.0635 0.0548
Run 6 errors: 0.0654 0.0525
Run 7 errors: 0.0635 0.0524
Run 8 errors: 0.0638 0.0578
Run 9 errors: 0.0648 0.0578
Run 10 errors: 0.0602 0.0544


In [27]:
# Compare both variants using 10000 training samples per classifier.
my_pipeline(10000)

Run 1 errors: 0.0528 0.0461
Run 2 errors: 0.0513 0.0467
Run 3 errors: 0.0495 0.0445
Run 4 errors: 0.0489 0.0461
Run 5 errors: 0.0498 0.0471
Run 6 errors: 0.0505 0.045
Run 7 errors: 0.0523 0.0466
Run 8 errors: 0.0532 0.0468
Run 9 errors: 0.05 0.044
Run 10 errors: 0.0518 0.0466
