In [2]:

import random
import numpy as np
from data_utils import load_CIFAR10
import matplotlib.pyplot as plt

from __future__ import print_function

# Read the CIFAR-10 train/test splits from the local dataset directory.
cifar10_dir = 'datasets/cifar-10-batches-py'
X_train, y_train, X_test, y_test = load_CIFAR10(cifar10_dir)

# Sanity check: report the shape of every array that was just loaded.
for caption, split in (
    ('Training data shape: ', X_train),
    ('Training labels shape: ', y_train),
    ('Test data shape: ', X_test),
    ('Test labels shape: ', y_test),
):
    print(caption, split.shape)

Training data shape:  (50000, 32, 32, 3)
Training labels shape:  (50000,)
Test data shape:  (10000, 32, 32, 3)
Test labels shape:  (10000,)


In [3]:
# Flatten every image into a single row of 32 * 32 * 3 values.
X_train = np.reshape(X_train, (X_train.shape[0], 32 * 32 * 3))
X_test = np.reshape(X_test, (X_test.shape[0], 32 * 32 * 3))

# Keep only the first 5000 training and first 1000 test samples (and their labels).
X_train, y_train = X_train[:5000], y_train[:5000]
X_test, y_test = X_test[:1000], y_test[:1000]

In [4]:
# Show the shapes of the training and test matrices, one per line.
print(X_train.shape, X_test.shape, sep='\n')

(5000, 3072)
(1000, 3072)


In [7]:
def Nearest_Neighbours_l1(Xtr, Xte, ytr,k=1):
    """Classify each test row by a k-nearest-neighbour vote under L1 distance.

    The L1 (Manhattan) distance between two rows is the sum of the absolute
    differences of their entries.

    Args:
        Xtr: 2-D array with one training sample per row.
        Xte: 2-D array with one test sample per row; same column count as Xtr.
        ytr: 1-D array of labels, where ytr[j] is the label of Xtr[j].
        k: number of closest training samples that vote on each prediction.
            With the default k=1 the label of the single closest sample is
            used. If k exceeds the number of training samples, all of them vote.
            Vote ties are resolved in favour of the smallest label.

    Returns:
        A tuple (Ypred, distances). Ypred holds one predicted label per test
        row, with dtype ytr.dtype. distances holds the L1 distances from the
        LAST test row to every training row (kept for backward compatibility);
        it is an empty array when Xte has no rows.

    Raises:
        ValueError: if k is smaller than 1.
    """
    if k < 1:
        raise ValueError("k must be at least 1, got %r" % (k,))

    Xtr, Xte, ytr = np.asarray(Xtr), np.asarray(Xte), np.asarray(ytr)
    # Unsigned integers (e.g. raw uint8 pixels) wrap around on subtraction and
    # NumPy rejects boolean subtraction, so promote such inputs to float first.
    if Xtr.dtype.kind in 'ub' or Xte.dtype.kind in 'ub':
        Xtr = Xtr.astype(np.float64)
        Xte = Xte.astype(np.float64)

    num_test = Xte.shape[0]
    Ypred = np.zeros(num_test, dtype=ytr.dtype)
    # Bound up front so that an empty test set still returns a value.
    distances = np.zeros(0)

    for i in range(num_test):
        distances = np.sum(np.abs(Xtr - Xte[i, :]), axis=1)
        # 'mergesort' is NumPy's stable sort: equal distances keep training
        # order, so k=1 picks the same sample np.argmin would.
        nearest = np.argsort(distances, kind='mergesort')[:k]
        labels, votes = np.unique(ytr[nearest], return_counts=True)
        Ypred[i] = labels[np.argmax(votes)]

    return Ypred, distances


In [8]:
def Nearest_Neighbours_l2(Xtr, Xte, ytr,k=1):
    """Classify each test row by a k-nearest-neighbour vote under L2 distance.

    The L2 (Euclidean) distance between two rows is the square root of the sum
    of the squared differences of their entries.

    Args:
        Xtr: 2-D array with one training sample per row.
        Xte: 2-D array with one test sample per row; same column count as Xtr.
        ytr: 1-D array of labels, where ytr[j] is the label of Xtr[j].
        k: number of closest training samples that vote on each prediction.
            With the default k=1 the label of the single closest sample is
            used. If k exceeds the number of training samples, all of them vote.
            Vote ties are resolved in favour of the smallest label.

    Returns:
        A tuple (Ypred, distances). Ypred holds one predicted label per test
        row, with dtype ytr.dtype. distances holds the L2 distances from the
        LAST test row to every training row (kept for backward compatibility);
        it is an empty array when Xte has no rows.

    Raises:
        ValueError: if k is smaller than 1.
    """
    if k < 1:
        raise ValueError("k must be at least 1, got %r" % (k,))

    Xtr, Xte, ytr = np.asarray(Xtr), np.asarray(Xte), np.asarray(ytr)
    # Unsigned integers (e.g. raw uint8 pixels) wrap around on subtraction and
    # again when squared, and NumPy rejects boolean subtraction, so promote
    # such inputs to float first.
    if Xtr.dtype.kind in 'ub' or Xte.dtype.kind in 'ub':
        Xtr = Xtr.astype(np.float64)
        Xte = Xte.astype(np.float64)

    num_test = Xte.shape[0]
    Ypred = np.zeros(num_test, dtype=ytr.dtype)
    # Bound up front so that an empty test set still returns a value.
    distances = np.zeros(0)

    for i in range(num_test):
        distances = np.sqrt(np.sum(np.square(Xtr - Xte[i, :]), axis=1))
        # 'mergesort' is NumPy's stable sort: equal distances keep training
        # order, so k=1 picks the same sample np.argmin would.
        nearest = np.argsort(distances, kind='mergesort')[:k]
        labels, votes = np.unique(ytr[nearest], return_counts=True)
        Ypred[i] = labels[np.argmax(votes)]

    return Ypred, distances


In [None]:
# Predict test labels with the L1 classifier; `dists` receives its second return value.
y_pred_l1, dists = Nearest_Neighbours_l1(Xtr=X_train, Xte=X_test, ytr=y_train)

In [None]:
# Predict test labels with the L2 classifier; `dists` receives its second return value.
y_pred_l2, dists = Nearest_Neighbours_l2(Xtr=X_train, Xte=X_test, ytr=y_train)

In [None]:
# Count the test samples whose L1 prediction equals the true label.
num_correct_l1 = (y_pred_l1 == y_test).sum()

In [None]:
# Count the test samples whose L2 prediction equals the true label.
num_correct_l2 = (y_pred_l2 == y_test).sum()

In [None]:
# Report the share of correct L1 predictions as a percentage of the test set.
print("The accuracy of nearest neighbour with l1 distance is",
      100 * float(num_correct_l1) / len(y_test),
      "%")

In [None]:
# Report the share of correct L2 predictions as a percentage of the test set.
print("The accuracy of nearest neighbour with l2 distance is",
      100 * float(num_correct_l2) / len(y_test),
      "%")

In [None]:
# Cross-validation settings: number of folds and the candidate values of k.
num_folds = 5
k_choices = [1, 3, 5, 8, 10, 12, 15, 20]

# Cut the training samples and labels into num_folds consecutive chunks and
# stack the chunks into one array each, so folds can be selected by index.
# NOTE(review): the stacking assumes all folds have the same length - confirm
# whenever the training-set size or num_folds changes.
X_train_folds, y_train_folds = (
    np.array(np.array_split(data, num_folds)) for data in (X_train, y_train)
)

In [None]:
# Inspect the contents of the training matrix.
print (X_train)

In [None]:
# Inspect the shape of the training matrix.
print(X_train.shape)

In [None]:
# Inspect the contents of the stacked cross-validation folds.
print(X_train_folds)

In [None]:
# Inspect the shape of the stacked cross-validation folds.
print (X_train_folds.shape)

In [None]:
def Nearest_Neighbours_l1_cv(Xtr, Xte, ytr,k, metric='l2'):
    """Predict a label for each test row by majority vote of its k nearest training rows.

    NOTE(review): despite the "l1" in the name, this function has always ranked
    neighbours by L2 (Euclidean) distance. That behaviour is kept as the
    default so existing results do not change; pass metric='l1' to rank by
    L1 (Manhattan) distance instead. Confirm which metric was intended.

    Args:
        Xtr: 2-D array with one training sample per row.
        Xte: 2-D array with one test sample per row; same column count as Xtr.
        ytr: 1-D array of labels, where ytr[j] is the label of Xtr[j].
        k: number of closest training samples that vote. If k exceeds the
            number of training samples, all of them vote.
        metric: 'l2' (default) or 'l1', the distance used to rank neighbours.

    Returns:
        A list with one predicted label per test row. Vote ties are resolved
        in favour of the smallest label.

    Raises:
        ValueError: if k is smaller than 1 or metric is not 'l1' or 'l2'.
    """
    if k < 1:
        raise ValueError("k must be at least 1, got %r" % (k,))
    if metric not in ('l1', 'l2'):
        raise ValueError("metric must be 'l1' or 'l2', got %r" % (metric,))

    Xtr, Xte, ytr = np.asarray(Xtr), np.asarray(Xte), np.asarray(ytr)
    # Unsigned integers (e.g. raw uint8 pixels) wrap around on subtraction and
    # NumPy rejects boolean subtraction, so promote such inputs to float first.
    if Xtr.dtype.kind in 'ub' or Xte.dtype.kind in 'ub':
        Xtr = Xtr.astype(np.float64)
        Xte = Xte.astype(np.float64)

    predicted_val = []
    for i in range(Xte.shape[0]):
        diff = Xtr - Xte[i, :]
        if metric == 'l1':
            distances = np.sum(np.abs(diff), axis=1)
        else:
            distances = np.sqrt(np.sum(np.square(diff), axis=1))
        # Rank by index rather than looking distances up by value: a value
        # lookup returns the same training row for every tied distance, so
        # that row would be counted several times in the vote.
        # 'mergesort' is NumPy's stable sort, so ties keep training order.
        nearest = np.argsort(distances, kind='mergesort')[:k]
        labels, votes = np.unique(ytr[nearest], return_counts=True)
        predicted_val.append(labels[np.argmax(votes)])
    return predicted_val



In [None]:
# Maps each candidate k to a list of validation accuracies, one entry per fold.
k_to_accuracies = {}

for k in k_choices:
    # `range` replaces the Python-2-only `xrange`, matching the rest of the notebook.
    for n in range(num_folds):
        # Train on every fold except fold n, which is held out for validation.
        combinat = [x for x in range(num_folds) if x != n]
        x_training_dat = np.concatenate(X_train_folds[combinat])
        y_training_dat = np.concatenate(y_train_folds[combinat])
        predicted_val = Nearest_Neighbours_l1_cv(x_training_dat, X_train_folds[n], y_training_dat,k)
        num_correct = np.sum(np.asarray(predicted_val) == y_train_folds[n])
        # Divide by the size of the held-out fold. y_train_folds.shape[0] is the
        # number of folds, not the number of validation samples, and would give
        # values far above 1.
        accuracy = float(num_correct) / y_train_folds[n].shape[0]
        k_to_accuracies.setdefault(k, []).append(accuracy)



In [None]:
# Raw results: one point per fold, stacked vertically above each k.
for k in k_choices:
    accuracies = k_to_accuracies[k]
    plt.scatter([k] * len(accuracies), accuracies)

# Trend line: mean accuracy per k, with the per-k standard deviation as error
# bars. Both statistics are collected in ascending order of k.
accuracies_mean = np.array([np.mean(k_to_accuracies[k]) for k in sorted(k_to_accuracies)])
accuracies_std = np.array([np.std(k_to_accuracies[k]) for k in sorted(k_to_accuracies)])
plt.errorbar(k_choices, accuracies_mean, yerr=accuracies_std)

plt.title('Cross-validation on k')
plt.xlabel('k')
plt.ylabel('Cross-validation accuracy')
plt.show()
