In [1]:
import numpy as np
import pandas as pd

# Defining Functions

In [2]:
def normalise(X, max_value=None):
    """Scale ``X`` by dividing every element by a single maximum value.

    Intended for non-negative data (e.g. pixel intensities), which this
    maps into the range [0, 1].

    Parameters
    ----------
    X : array-like
        Values to scale.
    max_value : scalar, optional
        Divisor to use instead of ``np.max(X)``. Pass the maximum of the
        training set here to put another data set on the same scale.

    Returns
    -------
    numpy.ndarray
        ``X / max_value``. If the divisor is exactly 0, an array of float
        zeros with the shape of ``X`` is returned instead of dividing by
        zero.
    """
    max_X = np.max(X) if max_value is None else max_value

    # Dividing by a zero maximum would fill the result with NaN/inf, which
    # then poisons every distance computed from it.
    if np.ndim(max_X) == 0 and max_X == 0:
        return np.zeros_like(X, dtype=float)

    return X / max_X

In [3]:
def dist(X_train, x):
    """Euclidean distance from one point to every row of ``X_train``.

    Parameters
    ----------
    X_train : array-like, 2-D
        One sample per row.
    x : array-like
        A single sample, broadcastable against the rows of ``X_train``.

    Returns
    -------
    numpy.ndarray
        1-D float64 array with one distance per row of ``X_train``.
    """
    # Subtract in float64: with unsigned integer inputs (e.g. raw uint8
    # pixels) the difference and its square would wrap around silently.
    diff = np.subtract(X_train, x, dtype=np.float64)
    return np.sqrt(np.sum(diff ** 2, axis=1))

In [6]:
def majority_count(y):
    """Return the most frequent value in ``y``.

    Ties are resolved in favour of the smallest value: ``np.unique``
    returns the distinct values in ascending order and ``np.argmax``
    picks the first maximum.

    Parameters
    ----------
    y : array-like
        Labels to vote over.

    Returns
    -------
    scalar
        The value of ``y`` that occurs most often.

    Raises
    ------
    ValueError
        If ``y`` is empty.
    """
    # One pass of np.unique replaces the per-value list.count() loop.
    values, counts = np.unique(y, return_counts=True)
    if values.size == 0:
        raise ValueError("majority_count() requires at least one label")
    return values[np.argmax(counts)]

In [69]:
def knn(X_train, X_test, y_train, k, verbose=True):
    """Classify each row of ``X_test`` by a vote among its k nearest rows of ``X_train``.

    Distances come from ``dist`` and the vote is decided by
    ``majority_count``.

    Parameters
    ----------
    X_train : array-like, 2-D
        Training samples, one per row.
    X_test : array-like, 2-D
        Samples to classify, one per row.
    y_train : array-like, 1-D
        Label of each training sample, aligned with the rows of ``X_train``.
    k : int
        Number of nearest neighbours that vote; must be at least 1.
    verbose : bool, optional
        When True (the default) a text progress bar is printed to stdout.

    Returns
    -------
    list
        One predicted label per row of ``X_test``.

    Raises
    ------
    ValueError
        If ``k`` is smaller than 1, or ``X_train`` and ``y_train`` have a
        different number of rows.
    """
    # A negative k would slice off "all but the last |k|" neighbours and
    # k == 0 would vote over nothing, so reject both up front.
    if k < 1:
        raise ValueError(f"k must be a positive integer, got {k!r}")

    # Plain arrays guarantee positional indexing even when pandas objects
    # are passed in.
    X_train = np.asarray(X_train)
    X_test = np.asarray(X_test)
    y_train = np.asarray(y_train)
    if X_train.shape[0] != y_train.shape[0]:
        raise ValueError(
            f"X_train has {X_train.shape[0]} rows but y_train has "
            f"{y_train.shape[0]} labels"
        )

    m = X_test.shape[0]
    # Print a progress tick roughly every 2% of the test rows.
    step = max(1, int(np.ceil(m / 50)))
    pred = []
    if verbose:
        print('Loading ', end='')
    for i in range(m):
        d_i = dist(X_train, X_test[i])
        nearest = np.argsort(d_i)[:k]
        pred.append(majority_count(y_train[nearest]))
        if verbose and i % step == 0:
            print('==', end='')
    if verbose:
        print(' Done!!!')
    return pred

In [53]:
def accuracy(pred, y_test):
    """Fraction of predictions that equal the true labels.

    Parameters
    ----------
    pred : array-like
        Predicted labels.
    y_test : array-like
        True labels, in the same order as ``pred``.

    Returns
    -------
    float
        Number of matching positions divided by the number of predictions.

    Raises
    ------
    ValueError
        If ``pred`` and ``y_test`` hold a different number of labels.
    ZeroDivisionError
        If ``pred`` is empty.
    """
    # Flatten both sides so a column vector cannot broadcast against a row,
    # and compare with == so lists and non-numeric labels work too.
    pred = np.asarray(pred).ravel()
    y_test = np.asarray(y_test).ravel()
    if pred.shape != y_test.shape:
        raise ValueError(
            f"pred has {pred.size} labels but y_test has {y_test.size}"
        )
    correct = int(np.count_nonzero(pred == y_test))
    return correct / pred.size

# Loading and Splitting the Training, Cross-Validation and Test Data Sets

In [76]:
# Read the training CSV and the test CSV (both expected in the working
# directory) into data frames, in that order.
df, df_test = map(pd.read_csv, ('Classification_train.csv', 'Classification_test.csv'))

In [78]:
# Sizes of the hold-out split: the first N_TRAIN rows are used for training,
# the last N_CV rows for cross-validation.
N_TRAIN = 25000
N_CV = 5000

# head()/tail() would silently overlap (leaking training rows into the
# cross-validation set) if the file held fewer rows than the two sizes need.
assert len(df) >= N_TRAIN + N_CV, (
    f"need at least {N_TRAIN + N_CV} rows for a disjoint split, got {len(df)}"
)

X = df.drop('label', axis=1)
# Select the label by name so it always matches the column dropped above.
y = df['label']
X_train = X.head(N_TRAIN).to_numpy()
X_cv = X.tail(N_CV).to_numpy()
y_train = y.head(N_TRAIN).to_numpy()
y_cv = y.tail(N_CV).to_numpy()

# The first column of the test file is skipped; presumably it is an id/index
# column rather than a feature -- TODO confirm against the data description.
X_test = df_test.iloc[:, 1:].to_numpy()

print("X_train.shape:", X_train.shape)
print("y_train.shape:", y_train.shape)
print("X_cv.shape:", X_cv.shape)
print("y_cv.shape:", y_cv.shape)
print("X_test.shape:", X_test.shape)

X_train.shape: (25000, 784)
y_train.shape: (25000,)
X_cv.shape: (5000, 784)
y_cv.shape: (5000,)
X_test.shape: (10000, 784)


In [79]:
# Scale every set by the maximum of the TRAINING data. KNN compares test
# points with training points directly, so all sets must share one scale;
# dividing each set by its own maximum would misalign them whenever the
# maxima differ.
train_max = np.max(X_train)
X_train_norm = normalise(X_train)
X_cv_norm = X_cv / train_max
X_test_norm = X_test / train_max
print(f"Peak to peak value before normalisation: {np.ptp(X_train)}")
print(f"Peak to peak value after normalisation: {np.ptp(X_train_norm)}")

Peak to peak value before normalisation: 255
Peak to peak value after normalisation: 1.0


# Running KNN on Cross-Validation Data Set

In [70]:
# Predict a label for every cross-validation sample from its 3 nearest
# training neighbours.
pred = knn(X_train_norm, X_cv_norm, y_train, k=3)



In [71]:
# Sanity check: the number of predictions should match the number of
# cross-validation labels.
print(len(pred), y_cv.shape, sep='\n')

5000
(5000,)


In [72]:
# Share of cross-validation labels that were predicted correctly,
# reported as a percentage.
acc = accuracy(pred=pred, y_test=y_cv)
print(f"Accuracy on cross-validation set: {acc*100}%")

Accuracy on cross-validation set: 98.08%


# Running KNN on Test Data Set

In [81]:
# Predict a label for every test sample with the same k used on the
# cross-validation set.
pred_test = knn(X_train_norm, X_test_norm, y_train, k=3)



In [87]:
# Collect the test-set predictions in a single-column frame and display it.
df_t = pd.DataFrame({'Predictions': pred_test})
df_t

Unnamed: 0,Predictions
0,0
1,4
2,5
3,5
4,9
...,...
9995,9
9996,9
9997,0
9998,8


In [88]:
# Save the predictions to disk; with the default settings the row index is
# written as an unnamed first column.
df_t.to_csv(path_or_buf='KNN_pred.csv')