In [170]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Collect the full path of every file below the Kaggle input directory,
# then print them one per line in walk order.
input_file_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/mnist-data-for-digit-recognation/t10k-images-idx3-ubyte.gz
/kaggle/input/mnist-data-for-digit-recognation/train-images.idx3-ubyte
/kaggle/input/mnist-data-for-digit-recognation/t10k-labels-idx1-ubyte.gz
/kaggle/input/mnist-data-for-digit-recognation/train-labels-idx1-ubyte.gz
/kaggle/input/mnist-data-for-digit-recognation/t10k-labels.idx1-ubyte
/kaggle/input/mnist-data-for-digit-recognation/t10k-images.idx3-ubyte
/kaggle/input/mnist-data-for-digit-recognation/train-labels.idx1-ubyte
/kaggle/input/mnist-data-for-digit-recognation/train-images-idx3-ubyte.gz
/kaggle/input/digit-recognizer/sample_submission.csv
/kaggle/input/digit-recognizer/train.csv
/kaggle/input/digit-recognizer/test.csv


In [171]:
# Directory holding the uncompressed MNIST IDX files. Every path below is
# derived from it, so the dataset location only has to be changed in one place.
# NOTE(review): the directory listing printed by the first cell shows these
# files under /kaggle/input/mnist-data-for-digit-recognation/ — confirm that
# DATA_DIR matches the environment this notebook is run in.
DATA_DIR="./mnist"
TEST_DATA_FILENAME=f"{DATA_DIR}/t10k-images.idx3-ubyte"
TEST_LABELS_FILENAME=f"{DATA_DIR}/t10k-labels.idx1-ubyte"
TRAIN_DATA_FILENAME=f"{DATA_DIR}/train-images.idx3-ubyte"
TRAIN_LABELS_FILENAME=f"{DATA_DIR}/train-labels.idx1-ubyte"

In [172]:

def bytes_to_int(byte_data):
    """Interpret ``byte_data`` as an unsigned big-endian integer.

    An empty byte string converts to 0.
    """
    return int.from_bytes(byte_data, byteorder="big")
    


def read_images(filename,n_max_images=None):
    """Read images from an uncompressed IDX3 image file (MNIST format).

    The 16-byte header holds four big-endian 32-bit integers: a magic number
    (skipped, not validated), the image count, the row count and the column
    count. Pixel data follows, one byte per pixel, row by row.

    Args:
        filename: Path to the IDX3 file.
        n_max_images: Optional upper bound on the number of images to read.
            ``None`` (or any other falsy value such as 0) reads every image.
            A value larger than the count stored in the header is clamped to
            that count instead of producing images made of empty pixels.

    Returns:
        A list of images. Each image is a list of rows and each row is a list
        of length-1 ``bytes`` objects, one per pixel.

    Raises:
        ValueError: If the file ends before the header or the requested pixel
            data has been read completely.
    """
    header_size=16
    images=[]
    with open(filename,"rb") as f:
        header=f.read(header_size)
        if len(header)<header_size:
            raise ValueError(f"{filename}: truncated IDX3 header")
        # header[0:4] is the magic number; it is skipped.
        n_images=int.from_bytes(header[4:8],"big")
        n_rows=int.from_bytes(header[8:12],"big")
        n_columns=int.from_bytes(header[12:16],"big")

        if n_max_images:
            # Never try to read more images than the file declares.
            n_images=min(n_images,n_max_images)

        for image_idx in range(n_images):
            image=[]
            for row_idx in range(n_rows):
                # One read per row instead of one read per pixel.
                row_bytes=f.read(n_columns)
                if len(row_bytes)!=n_columns:
                    raise ValueError(
                        f"{filename}: pixel data ends inside image {image_idx}"
                    )
                # Slice (not index) so every pixel stays a 1-byte bytes object.
                image.append([row_bytes[i:i+1] for i in range(n_columns)])
            images.append(image)
    return images

    
    

In [173]:
def read_labels(filename,n_max_labels=None):
    """Read labels from an uncompressed IDX1 label file (MNIST format).

    The 8-byte header holds two big-endian 32-bit integers: a magic number
    (skipped, not validated) and the label count. One byte per label follows.

    Args:
        filename: Path to the IDX1 file.
        n_max_labels: Optional upper bound on the number of labels to read.
            ``None`` (or any other falsy value such as 0) reads every label.
            A value larger than the count stored in the header is clamped to
            that count instead of producing empty labels.

    Returns:
        A list of length-1 ``bytes`` objects, one per label.

    Raises:
        ValueError: If the file ends before the header or the requested
            labels have been read completely.
    """
    header_size=8
    with open(filename,"rb") as f:
        header=f.read(header_size)
        if len(header)<header_size:
            raise ValueError(f"{filename}: truncated IDX1 header")
        # header[0:4] is the magic number; it is skipped.
        n_labels=int.from_bytes(header[4:8],"big")
        if n_max_labels:
            # Never try to read more labels than the file declares.
            n_labels=min(n_labels,n_max_labels)
        # A negative limit means "no labels"; it must not reach f.read(),
        # where a negative size would read the whole file.
        n_labels=max(n_labels,0)

        label_bytes=f.read(n_labels)
        if len(label_bytes)!=n_labels:
            raise ValueError(
                f"{filename}: expected {n_labels} labels, found {len(label_bytes)}"
            )

    # Slice (not index) so every label stays a 1-byte bytes object.
    return [label_bytes[i:i+1] for i in range(n_labels)]

In [174]:
# read_images(TRAIN_DATA_FILENAME)
# print("done")

In [175]:
# Load the first 1000 training images and report how many were read.
X_train = read_images(TRAIN_DATA_FILENAME, n_max_images=1000)
print(len(X_train))

1000


In [176]:
# Load the labels of the first 1000 training images and report how many were read.
y_train = read_labels(TRAIN_LABELS_FILENAME, n_max_labels=1000)
print(len(y_train))

1000


In [177]:
# Load the first 10 test images and report how many were read.
X_test = read_images(TEST_DATA_FILENAME, n_max_images=10)
print(len(X_test))


10


In [178]:
# Load the labels of the first 10 test images and report how many were read.
y_test = read_labels(TEST_LABELS_FILENAME, n_max_labels=10)
print(len(y_test))

10


In [179]:
def flatten_list(l):
    """Flatten one level of nesting: concatenate the items of every sublist.

    Returns a new list; ``l`` itself is not modified.
    """
    flat = []
    for sublist in l:
        flat.extend(sublist)
    return flat

In [180]:
def extract_features(X):
    """Turn every sample in ``X`` into a flat feature list.

    Each sample (a list of rows) is passed through ``flatten_list``; the
    result is a new list with one flattened sample per input sample.
    """
    features = []
    for sample in X:
        features.append(flatten_list(sample))
    return features

In [181]:
# Length of the first sample in each split (number of rows per image at this point).
for samples in (X_train, X_test):
    print(len(samples[0]))

28
28


In [182]:
# Replace each image (a list of rows) with a flat list of pixels.
# Not idempotent: run this cell once per kernel session. Re-running it would
# flatten the already-flat samples again, turning the 1-byte pixels into ints.
X_train, X_test = extract_features(X_train), extract_features(X_test)

In [183]:
# Length of the first sample in each split (number of features per sample after flattening).
for samples in (X_train, X_test):
    print(len(samples[0]))

784
784


In [184]:
# Show the first pixel of the first training sample; pixels are 1-byte bytes objects.
print(X_train[0][0])

b'\x00'


In [185]:
def dist(train_sample,test_sample):
    """Euclidean distance between two samples of byte-encoded pixels.

    Every pixel is converted with ``bytes_to_int`` before subtracting. Pixels
    are paired with ``zip``, so the comparison stops at the shorter sample.
    """
    squared_sum = 0
    for train_pixel, test_pixel in zip(train_sample, test_sample):
        difference = bytes_to_int(train_pixel) - bytes_to_int(test_pixel)
        squared_sum += difference ** 2
    return squared_sum ** (0.5)

In [186]:
def get_training_distances_for_test_sample(X_train,test_sample):
    """Return the distance from ``test_sample`` to every training sample.

    The result has one entry per element of ``X_train``, in the same order,
    each computed with ``dist``.
    """
    distances = []
    for train_sample in X_train:
        distances.append(dist(train_sample, test_sample))
    return distances

In [187]:
def get_most_frequent(l):
    """Return the element that occurs most often in ``l``.

    On a tie, the element that appears first in ``l`` wins, because ``max``
    keeps the first maximum it sees. Raises ``ValueError`` for an empty ``l``.
    """
    def occurrences(item):
        return l.count(item)

    return max(l, key=occurrences)

In [188]:
def knn(X_train,y_train,X_test,y_test,k=1):
    """Predict a label for every test sample with k-nearest-neighbours voting.

    For each sample in ``X_test`` the distances to all training samples are
    computed, the labels of the ``k`` closest training samples are collected
    (nearest first) and the most frequent one becomes the prediction. The
    true label, the candidate labels and the prediction are printed per sample.

    Args:
        X_train: Training samples.
        y_train: Training labels as length-1 bytes objects (decoded with ``ord``).
        X_test: Samples to classify.
        y_test: True labels of ``X_test``; used only for the printed report.
        k: Number of neighbours that vote.

    Returns:
        A list with one predicted integer label per test sample.
    """
    y_pred = []
    for test_sample_idx, sample in enumerate(X_test):
        distances = get_training_distances_for_test_sample(X_train, sample)
        # Stable sort by distance: equally distant samples keep training order.
        ranked = sorted(enumerate(distances), key=lambda pair: pair[1])
        candidates = [ord(y_train[train_idx]) for train_idx, _ in ranked[:k]]

        top_candidate = get_most_frequent(candidates)
        print(f'Point is {ord(y_test[test_sample_idx])} guesses are {(candidates)} and the prediction is {top_candidate} ')

        y_pred.append(top_candidate)
    return y_pred

In [189]:
# Classify the test samples using the 10 nearest training samples for each vote.
y_pred = knn(X_train, y_train, X_test, y_test, k=10)

Point is 7 guesses are [7, 7, 7, 7, 7, 7, 7, 7, 7, 7] and the prediction is 7 
Point is 2 guesses are [2, 2, 8, 3, 2, 6, 3, 3, 2, 2] and the prediction is 2 
Point is 1 guesses are [1, 1, 1, 1, 1, 1, 1, 1, 1, 1] and the prediction is 1 
Point is 0 guesses are [0, 0, 0, 0, 0, 0, 0, 0, 0, 0] and the prediction is 0 
Point is 4 guesses are [4, 9, 4, 4, 7, 9, 9, 4, 4, 4] and the prediction is 4 
Point is 1 guesses are [1, 1, 1, 1, 1, 1, 1, 1, 1, 1] and the prediction is 1 
Point is 4 guesses are [9, 4, 9, 4, 9, 4, 4, 4, 4, 4] and the prediction is 4 
Point is 9 guesses are [9, 9, 9, 9, 4, 9, 9, 9, 7, 9] and the prediction is 9 
Point is 5 guesses are [4, 9, 6, 6, 4, 5, 4, 9, 4, 5] and the prediction is 4 
Point is 9 guesses are [9, 7, 7, 9, 7, 7, 9, 7, 9, 9] and the prediction is 9 


In [190]:
# Predicted digit for each test sample, in the order the samples appear in X_test.
print(y_pred)
# print(y_test)

[7, 2, 1, 0, 4, 1, 4, 9, 4, 9]


In [191]:
def accuracy(y_pred,y_test):
    """Percentage of predictions that equal the true labels.

    Args:
        y_pred: Predicted labels as integers.
        y_test: True labels as length-1 bytes objects (decoded with ``ord``).

    Returns:
        The share of matching positions as a float between 0 and 100.
        Returns 0.0 when there are no predictions, instead of dividing by zero.

    Raises:
        ValueError: If ``y_pred`` and ``y_test`` differ in length. ``zip``
            would otherwise drop the surplus items silently and the result
            would be computed over the wrong number of samples.
    """
    if len(y_pred)!=len(y_test):
        raise ValueError(
            f"y_pred has {len(y_pred)} items but y_test has {len(y_test)}"
        )
    if len(y_pred)==0:
        return 0.0
    n_correct=sum(int(y_pred_i==ord(y_test_i)) for y_pred_i,y_test_i in zip(y_pred,y_test))
    return n_correct/len(y_pred)*100

In [192]:
# Accuracy of the k-NN predictions on the loaded test samples, in percent.
accuracy(y_pred=y_pred, y_test=y_test)

90.0

In [193]:
from PIL import Image
import numpy as np

def read_image(path):
    """Load the image file at ``path`` as a grayscale NumPy array.

    The file is opened in a ``with`` block so it is closed as soon as the
    pixels have been converted, rather than whenever Pillow or the garbage
    collector gets around to it.

    Args:
        path: Location of the image file to open.

    Returns:
        A NumPy array of the image converted to PIL mode ``'L'``
        (8-bit grayscale).
    """
    with Image.open(path) as source_image:
        # convert() produces a new, fully loaded image, so the array does not
        # depend on the file staying open.
        return np.asarray(source_image.convert('L'))
def show_image(image):
    """Display ``image`` as a grayscale picture through PIL's ``Image.show``.

    ``image`` is first converted to a NumPy array and then wrapped in a PIL
    image using mode ``"L"``.
    """
    pixels = np.array(image)
    picture = Image.fromarray(pixels, "L")
    picture.show()


In [None]:
# Create our own test case from a custom image.
# FIXME: read_image() requires a `path` argument; calling it without one, as
# below, raises TypeError. Pass the path of the custom digit image here.
own_sample=[read_image()]