In [14]:
import os
import cPickle
import numpy as np

In [15]:
def unpickle(file):
    with open(file, 'rb') as fo:
        #data = cPickle.load(fo, encoding='bytes')
        data = cPickle.load(fo)
    return data

In [16]:
def load_cifar10_dataset(data_path, negatives=False):
    meta_data_dict = unpickle(data_path+'/batches.meta')
    #print(meta_data_dict)
    cifar_label_names = meta_data_dict[b'label_names']
    #print(cifar_label_names)
    cifar_label_names = np.array(cifar_label_names)
    
    # training data
    cifar_train_data = None
    cifar_train_filenames = []
    cifar_train_labels = []
    
    for i in range(1, 6):
        cifar_train_data_dict = unpickle(data_path+'/data_batch_{}'.format(i))
        #print(cifar_train_data_dict)
        if i == 1:
            cifar_train_data = cifar_train_data_dict[b'data']
            #print(cifar_train_data)
        else:
            cifar_train_data = np.vstack((cifar_train_data, cifar_train_data_dict[b'data']))
            
        cifar_train_filenames += cifar_train_data_dict[b'filenames']
        cifar_train_labels += cifar_train_data_dict[b'labels']
        
        #print(cifar_train_filenames, cifar_train_labels)
        
    cifar_train_data = cifar_train_data.reshape((len(cifar_train_data), 3, 32, 32))
    if negatives:
        cifar_train_data = cifar_train_data.transpose(0, 2, 3, 1).astype(np.float32)
    else:
        cifar_train_data = np.rollaxis(cifar_train_data, 1, 4)
    cifar_train_filenames = np.array(cifar_train_filenames)
    cifar_train_labels = np.array(cifar_train_labels)
    
    cifar_test_data_dict = unpickle(data_path+'/test_batch')
    cifar_test_data = cifar_test_data_dict[b'data']
    cifar_test_filenames = cifar_test_data_dict[b'filenames']
    cifar_test_labels = cifar_test_data_dict[b'labels']
    
    cifar_test_data = cifar_test_data.reshape((len(cifar_test_data), 3, 32, 32))
    if negatives:
        cifar_test_data = cifar_test_data.transpose(0, 2, 3, 1).astype(np.float32)
    else:
        cifar_test_data = np.rollaxis(cifar_test_data, 1, 4)
    cifar_test_filenames = np.array(cifar_test_filenames)
    cifar_test_labels = np.array(cifar_test_labels)
    
    return cifar_train_data, cifar_train_filenames, cifar_train_labels, cifar_test_data, cifar_test_filenames, cifar_test_labels, cifar_label_names

In [17]:
data_path = '../../data/cifar-10-python/cifar-10-batches-py/'
train_data, train_filenames, train_labels, test_data, test_filenames, test_labels, label_names = load_cifar10_dataset(data_path)
#print(train_data.shape, test_data.shape)

x_train_rows = train_data.reshape(train_data.shape[0], train_data.shape[1]*train_data.shape[2]*train_data.shape[3])
x_test_rows = test_data.reshape(test_data.shape[0], test_data.shape[1]*test_data.shape[2]*test_data.shape[3])
#print(x_train_rows.shape, x_test_rows.shape)

In [30]:
class NearestNeighbor(object):
    def __init__(self):
        pass
    
    def train(self, X, y):
        self.Xtr = X
        self.ytr = y
        
    def predict(self, X):
        count = 0
        num_test = X.shape[0]
        Ypred = np.zeros(num_test, dtype=self.ytr.dtype)
        
        for i in xrange(num_test):
            distances = np.sum(np.abs(self.Xtr-X[i,:]), axis=1)
            print(distances)
            print(sorted(distances))
            min_index = np.argmin(distances)
            Ypred[i] = self.ytr[min_index]
            count += 1
            if count > 5:
                print(self.Xtr.shape, X.shape, num_test)
                break
                
        return Ypred

In [31]:
nn = NearestNeighbor()
nn.train(x_train_rows, train_labels)
y_test_predict = nn.predict(x_test_rows)

print('accuracy: %f' %(np.mean(y_test_predict == test_labels)))

[428732 371593 458561 ... 462035 352688 377070]
[257312, 264462, 266714, 272313, 272705, 272818, 273074, 273153, 273745, 274376, 274556, 274572, 274590, 275998, 276267, 276344, 276737, 277045, 277335, 277615, 277641, 278080, 278177, 278755, 278778, 278870, 278880, 279434, 279474, 279639, 279660, 279816, 279956, 280138, 280249, 280261, 280366, 280778, 280995, 281111, 281474, 281512, 281578, 281745, 282117, 282515, 282631, 282741, 282855, 282904, 283018, 283037, 283068, 283097, 283202, 283276, 283407, 283577, 283605, 283631, 283806, 283881, 283924, 284015, 284133, 284155, 284186, 284292, 284316, 284339, 284346, 284491, 284508, 284542, 284583, 284604, 284624, 284632, 284659, 284822, 284871, 284880, 284927, 285052, 285110, 285244, 285588, 285612, 285802, 285844, 286167, 286213, 286345, 286645, 286758, 286814, 286870, 286886, 286924, 286977, 287098, 287102, 287370, 287433, 287494, 287516, 287554, 287566, 287572, 287680, 287681, 287764, 287784, 287784, 287867, 287989, 288082, 288198, 288241,

In [11]:
Xval_rows = x_train_rows[:1000, :]
Yval = train_labels[:1000]
Xtr_rows = x_train_rows[1000:, :]
Ytr = train_labels[1000:]

print(Xval_rows.shape, Yval.shape, Xtr_rows.shape, Ytr.shape)

((1000, 3072), (1000,), (49000, 3072), (49000,))


In [13]:
validation_accuracies = []
for k in [1, 3, 5, 10, 20, 50, 100]:
    nn = NearestNeighbor()
    nn.train(Xtr_rows, Ytr)
    Yval_predict = nn.predict(Xval_rows, k=k)
    acc = np.mean(Yval_predict==Yval)
    print("accuracy: %f"%(acc,))
    validation_accuracies.append(acc)

TypeError: predict() got an unexpected keyword argument 'k'