In [24]:
"This demo shows how to do K-Means Clustering"
"on the CIFAR-10 dataset."

%matplotlib inline

# Numpy is a python library for scientific computing
import numpy as np
# library allowing us to handle serialization in python
import cPickle as pickle
# library that allows us to write shell commands in python
import os
# library to generate plots
import matplotlib
import matplotlib.pyplot as plt

In [25]:

""" load all of cifar """
xs = []
ys = []
for b in range(1,6):
    filename = os.path.join('cifar-10-batches-py/data_batch_%d' % (b, ))
    with open(filename, 'rb') as f:
        datadict = pickle.load(f)
        X = datadict['data']
        Y = datadict['labels']
        X = X.reshape(10000, 3, 32, 32).transpose(0,2,3,1).astype("float")
        Y = np.array(Y)
    xs.append(X)
    ys.append(Y)    
    Xtr = np.concatenate(xs)
    Ytr = np.concatenate(ys)
    del X, Y
with open('cifar-10-batches-py/test_batch', 'rb') as f:
    datadict = pickle.load(f)
    X = datadict['data']
    Y = datadict['labels']
    Xte = X.reshape(10000, 3, 32, 32).transpose(0,2,3,1).astype("float")
    Yte = np.array(Y)
    
print 'Training data shape: ', Xtr.shape
print 'Training labels shape: ', Ytr.shape
print 'Test data shape: ', Xte.shape
print 'Test labels shape: ', Yte.shape

Training data shape:  (50000, 32, 32, 3)
Training labels shape:  (50000,)
Test data shape:  (10000, 32, 32, 3)
Test labels shape:  (10000,)


In [48]:
# Now we train our model with only a K nearests neighbors

k = 20

# Subsample the data for more efficient code execution in this exercise
num_training = 1000
mask = range(num_training)
X_train = Xtr[mask]
y_train = Ytr[mask]

num_test = 100
mask = range(num_test)
X_test = Xte[mask]
y_test = Yte[mask]

# Reshape the image data into rows
train_data = np.reshape(X_train, (X_train.shape[0], -1))
test_data = np.reshape(X_test, (X_test.shape[0], -1))
print train_data.shape, test_data.shape

dists = np.zeros((num_test, num_training))

for i in xrange(num_test):
  for j in xrange(num_training):
    
    #####################################################################
    # TODO:                                                             #
    # Implement the L2 distance that we discusses in class by           #
    # substituting the currently used L1 distance                       #
    #####################################################################

    dists[i,j] = np.sum(np.abs(train_data[j,:] - test_data[i,:]))


  #######################################################################
  #                         END OF YOUR CODE                            #
  #######################################################################
    
y_pred = np.zeros(num_test)
for i in xrange(num_test):
    
      dists_i_row_sorted=np.argsort(dists[i,:]) 
      k_closest_labels=y_train[dists_i_row_sorted[:k]]   
      
      # count the number of times each label is repeated for the k closest training points
      arr_w_k_incidences = np.bincount(k_closest_labels)
      # find the most repeating label, or the smaller label to break ties
      y_pred[i]=np.argmax(arr_w_k_incidences)

(1000, 3072) (100, 3072)


In [49]:
    print(sum(y_pred.astype(int) == y_test) *100 /num_test)

25


In [57]:
# We will now implement cross validation on the cifar-10 dataset

num_folds = 5
k_choices = [1, 3, 5, 8, 10, 12, 15, 20, 50, 100]

X_train_folds = []
y_train_folds = []
################################################################################
# Split up the training data into folds. After splitting, X_train_folds and    #
# y_train_folds should each be lists of length num_folds, where                #
# y_train_folds[i] is the label vector for the points in X_train_folds[i].     #
# Hint: Look up the numpy array_split function.                                #
################################################################################
X_train_folds = np.array_split(Xtr, num_folds)
y_train_folds = np.array_split(Ytr, num_folds)

# print y_train_folds

# A dictionary holding the accuracies for different values of k that we find
# when running cross-validation. After running cross-validation,
# k_to_accuracies[k] should be a list of length num_folds giving the different
# accuracy values that we found when using that value of k.
k_to_accuracies = {}

################################################################################
# Perform k-fold cross validation to find the best value of k. For each        #
# possible value of k, run the k-nearest-neighbor algorithm num_folds times,   #
# where in each case you use all but one of the folds as training data and the #
# last fold as a validation set. Store the accuracies for all fold and all     #
# values of k in the k_to_accuracies dictionary.                               #
################################################################################

for k in k_choices:
    k_to_accuracies[k] = []

for k in k_choices:
    print 'evaluating k=%d' % k
    for j in range(num_folds):
        X_train_cv = np.vstack(X_train_folds[0:j]+X_train_folds[j+1:])
        X_test_cv = X_train_folds[j]
        
        #print len(y_train_folds), y_train_folds[0].shape
        
        y_train_cv = np.hstack(y_train_folds[0:j]+y_train_folds[j+1:])
        y_test_cv = y_train_folds[j]
        
#         #print 'Training data shape: ', X_train_cv.shape
#         #print 'Training labels shape: ', y_train_cv.shape
#         #print 'Test data shape: ', X_test_cv.shape
#         #print 'Test labels shape: ', y_test_cv.shape
        
#         classifier.train(X_train_cv, y_train_cv)
#         dists_cv = classifier.compute_distances_no_loops(X_test_cv)
#         #print 'predicting now'
#         y_test_pred = classifier.predict_labels(dists_cv, k)

        X_train_cv = np.reshape(X_train_cv, (X_train_cv.shape[0], -1))
        X_test_cv = np.reshape(X_test_cv, (X_test_cv.shape[0], -1))

        print(X_train_cv.shape)
        print(X_test_cv.shape)
        num_train = X_train_cv.shape[0]
        num_test = X_test_cv.shape[0]
        
        dists = np.zeros((num_test, num_training))

        for i in xrange(num_test):
          for j in xrange(num_training):

            #####################################################################
            # TODO:                                                             #
            # Implement the L2 distance that we discusses in class by           #
            # substituting the currently used L1 distance                       #
            #####################################################################

            dists[i,j] = np.sum(np.abs(X_train_cv[j,:] - X_test_cv[i,:]))


          #######################################################################
          #                         END OF YOUR CODE                            #
          #######################################################################

        y_pred = np.zeros(num_test)
        for i in xrange(num_test):

              dists_i_row_sorted=np.argsort(dists[i,:]) 
              k_closest_labels=y_train_cv[dists_i_row_sorted[:k]]   

              # count the number of times each label is repeated for the k closest training points
              arr_w_k_incidences = np.bincount(k_closest_labels)
              # find the most repeating label, or the smaller label to break ties
              y_pred[i]=np.argmax(arr_w_k_incidences)



        num_correct = np.sum(y_pred == y_test_cv)
        accuracy = float(num_correct) / num_test
        
        k_to_accuracies[k].append(accuracy)

################################################################################
#                                 END OF YOUR CODE                             #
################################################################################

# Print out the computed accuracies
for k in sorted(k_to_accuracies):
    for accuracy in k_to_accuracies[k]:
        print 'k = %d, accuracy = %f' % (k, accuracy)

evaluating k=1
(40000, 3072)
(10000, 3072)
(40000, 3072)
(10000, 3072)
(40000, 3072)
(10000, 3072)


KeyboardInterrupt: 

In [None]:
# plot the raw observations
for k in k_choices:
  accuracies = k_to_accuracies[k]
  plt.scatter([k] * len(accuracies), accuracies)

# plot the trend line with error bars that correspond to standard deviation
accuracies_mean = np.array([np.mean(v) for k,v in sorted(k_to_accuracies.items())])
accuracies_std = np.array([np.std(v) for k,v in sorted(k_to_accuracies.items())])
plt.errorbar(k_choices, accuracies_mean, yerr=accuracies_std)
plt.title('Cross-validation on k')
plt.xlabel('k')
plt.ylabel('Cross-validation accuracy')
plt.show()