In [1]:
import random
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import clear_output
from cs231n.data_utils import load_CIFAR10
%matplotlib inline
plt.rcParams['figure.figsize'] = (10.0, 8.0) # set default size of plots
plt.rcParams['image.interpolation'] = 'nearest'
plt.rcParams['image.cmap'] = 'gray'

# for auto-reloading extenrnal modules
# see http://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython
%load_ext autoreload
%autoreload 2

In [2]:
def get_CIFAR10_data(num_training=49000, num_validation=1000, num_test=1000, num_dev=500):
    
    """
    Load the CIFAR-10 dataset from disk and perform preprocessing to prepare
    it for the linear classifier. These are the same steps as we used for the
    SVM, but condensed to a single function.  
    """
    # Load the raw CIFAR-10 data
    cifar10_dir = 'cs231n/datasets/cifar-10-batches-py'
    X_train, y_train, X_test, y_test = load_CIFAR10(cifar10_dir)

    # subsample the data
    mask = range(num_training, num_training + num_validation)
    X_val = X_train[mask]
    y_val = y_train[mask]
    mask = range(num_training)
    X_train = X_train[mask]
    y_train = y_train[mask]
    mask = range(num_test)
    X_test = X_test[mask]
    y_test = y_test[mask]
    mask = np.random.choice(num_training, num_dev, replace=False)
    X_dev = X_train[mask]
    y_dev = y_train[mask]

    # Preprocessing: reshape the image data into rows
    X_train = np.reshape(X_train, (X_train.shape[0], -1))
    X_val = np.reshape(X_val, (X_val.shape[0], -1))
    X_test = np.reshape(X_test, (X_test.shape[0], -1))
    X_dev = np.reshape(X_dev, (X_dev.shape[0], -1))

    # Normalize the data: subtract the mean image
    mean_image = np.mean(X_train, axis = 0)
    X_train -= mean_image
    X_val -= mean_image
    X_test -= mean_image
    X_dev -= mean_image

    # add bias dimension and transform into columns
    X_train = np.hstack([X_train, np.ones((X_train.shape[0], 1))])
    X_val = np.hstack([X_val, np.ones((X_val.shape[0], 1))])
    X_test = np.hstack([X_test, np.ones((X_test.shape[0], 1))])
    X_dev = np.hstack([X_dev, np.ones((X_dev.shape[0], 1))])

    return X_train, y_train, X_val, y_val, X_test, y_test, X_dev, y_dev


# Invoke the above function to get our data.
X_train, y_train, X_val, y_val, X_test, y_test, X_dev, y_dev = get_CIFAR10_data()

print 'Train data shape: ', X_train.shape
print 'Train labels shape: ', y_train.shape
print 'Validation data shape: ', X_val.shape
print 'Validation labels shape: ', y_val.shape
print 'Test data shape: ', X_test.shape
print 'Test labels shape: ', y_test.shape
print 'dev data shape: ', X_dev.shape
print 'dev labels shape: ', y_dev.shape

Train data shape:  (49000, 3073)
Train labels shape:  (49000,)
Validation data shape:  (1000, 3073)
Validation labels shape:  (1000,)
Test data shape:  (1000, 3073)
Test labels shape:  (1000,)
dev data shape:  (500, 3073)
dev labels shape:  (500,)


<h1><center> Vectorized Softmax </center></h1>

In [3]:
## Randomly initialising a weight matrix of shape (3073,10).
W = np.random.randn(3073, 10) * 0.0001
print 'Weights Matrix Shape : ', W.shape

Weights Matrix Shape :  (3073, 10)


In [4]:
## Computing the raw scores or performing the linear mathematical operation Wx + b.
scores = X_dev.dot(W)
print 'Raw Scores Matrix Shape : ', scores.shape

Raw Scores Matrix Shape :  (500, 10)


In [5]:
## Normalise the raw scores to avoid exponential score blow-up.
## To do so, subtract the maximum score from each score value for each image.
expScores = np.exp(scores - np.max(scores, axis = 1, keepdims = True))

## Compute the probabilities (or softmax scores) of each class.
softmaxScores = expScores/np.sum(expScores, axis = 1, keepdims = True)

In [6]:
## Creating a 1-D matrix containing the softmax score of the correct class.
corrSoftScore = np.choose(y_dev, softmaxScores.T)

In [7]:
## Computing the cross-entropy loss.
loss = -np.sum(np.log(corrSoftScore), axis = 0, keepdims = True)

In [8]:
dW = np.zeros_like(W)

## Initialising dO to softmaxScores.
dO = softmaxScores

## Computing dL/dO.
dO[np.arange(500), y_dev] -= 1

## Computing dL/dW with the help of chain rule.
dW = X_dev.T.dot(dO)

## Dividing by the number of training instances.
dW /= 500

dW

array([[ -2.65248087e+00,   7.12894356e-01,   1.94097841e-01, ...,
          1.77945060e+00,  -8.80867114e-01,  -4.96769947e+00],
       [ -3.70659854e+00,   1.18530696e+00,   1.01148681e-01, ...,
          8.90939910e-01,  -1.85701988e+00,  -5.43101914e+00],
       [ -5.50411560e+00,   1.18004350e+00,   7.25357844e-01, ...,
          1.16776482e+00,  -3.49751995e+00,  -6.77117904e+00],
       ..., 
       [ -1.33043570e+00,   7.20431146e-01,   3.02391793e-01, ...,
         -7.52522978e-01,   2.19324805e+00,  -1.38864516e+00],
       [ -2.37606838e+00,   2.69552322e-01,   8.33697046e-01, ...,
          4.92608758e-01,   4.69409191e-01,  -1.58037107e+00],
       [  3.75702408e-03,   1.04273182e-02,   5.12071035e-03, ...,
         -1.68357177e-02,   1.64161617e-02,   1.01501768e-02]])

<h1><center> Naive Softmax </center></h1>

In [9]:
## Randomly initialising a weight matrix of shape (3073,10).
W = np.random.randn(3073, 10) * 0.0001
print 'Weights Matrix Shape : ', W.shape

Weights Matrix Shape :  (3073, 10)


In [10]:
## Initialising the gradients matrix.
dW = np.zeros_like(W)

## Initialising the gradients matrix of the logits i.e. Wx+b.
dO = np.zeros_like(scores)

## Initialising loss to zero.
loss = 0.0

## Computing number of training instances.
numTrain = X_dev.shape[0]

## Computing the number of classes.
numClasses = W.shape[1]

# for i in range(0, numTrain):
for i in range(0, 5):
     
    ## Computing the raw scores for each image.
    imgScoreMat = X_dev[i].dot(W)
    
    ## Finding the maximum class score in above array.
    maxClsScore = np.max(imgScoreMat)
    
    ## Normalise the raw scores to avoid exponential score blow-up.
    ## To do so, subtract the maximum score from each score value for each image.
    normScoreMat = imgScoreMat - maxClsScore
    
    ## Exponentiate the normalised class scores.
    expScoreMat = np.exp(normScoreMat)
    
    ## Computing the sum of all the exponentiated scores.
    expScoresSum = np.sum(expScoreMat, axis = 0, keepdims = True)
        
    ## Compute the probabilities (or softmax scores) of each class.
    imgSoftmaxScores = expScoreMat/expScoresSum
 
    ## Finding the softmax score for the correct class.
    corrSoftScore = imgSoftmaxScores[y_dev[i]]
    
    ## Computing the loss for the particular image.
    loss = loss + -np.log(corrSoftScore/np.sum(imgSoftmaxScores))
    
    ## Updating the gradients wrt the logits.
    for j in range(0, numClasses):

        if (j == y_dev[i]):

            dO = imgSoftmaxScores[j] - 1

        else:

            dO = imgSoftmaxScores[j]
        
        ## Updating the gradients wrt the weights.
        dW[:,j] += dO * X_dev[i]
    
          
print loss

10.2277599727


<h1><center> Tuning HyperParameters </center></h1>

In [11]:
from cs231n.classifiers import Softmax

In [27]:
results = {}
best_val = -1
best_softmax = None
# learning_rates = [1e-7, 5e-7]
# regularization_strengths = [5e4, 1e8]
learning_rates = [2.8e-6, 1e-7]
regularization_strengths = [1e+03, 5e4]



## Converting the learning rate and regularization strenghts list into numpy arrays.
learning_rates = np.array(learning_rates)
regularization_strengths = np.array(regularization_strengths)

## Creating a grid search array which contains the different hyperparameter combinations.
gridArray = np.array(np.meshgrid(learning_rates, regularization_strengths)).T.reshape(4, 2)

In [28]:
## Iterating over each hyperparameter combination.
for i in range(gridArray.shape[0]):
    
    ## Setting the learning rate and the regularisation strength.
    learning_rate = gridArray[i][0]
    regularization_strength = gridArray[i][1]
    
    ## Instantiating the model.
    softmaxModel = Softmax()
    
    ## Training the model on the training data.
    softmaxModel.train(X_train, y_train, learningRate = learning_rate, reg = regularization_strength, numIters = 1000)
    
    ## Training Predictions.
    yPredTrain = softmaxModel.predict(X_train)
    
    ## Validation Predictions.
    yPredValidation = softmaxModel.predict(X_val)
    
    ## Computing training accuracy.
    trainAccuracy = np.mean(yPredTrain == y_train)
    
    ## Computing validation accuracy.
    validationAccuracy = np.mean(yPredValidation == y_val)
    
    ## Comparing the current validation accuracy with the previous one.
    if (validationAccuracy > best_val):
        
        best_val = validationAccuracy
        best_softmax = softmaxModel
        
    results[learning_rate, regularization_strength] = (trainAccuracy, validationAccuracy)
    
    print "Validation Accuracy : ", validationAccuracy
    
    
# Print out results.
for lr, reg in sorted(results):
    train_accuracy, val_accuracy = results[(lr, reg)]
    print 'lr %e reg %e train accuracy: %f val accuracy: %f' % (
                lr, reg, train_accuracy, val_accuracy)
    
print 'best validation accuracy achieved during cross-validation: %f' % best_val

Validation Accuracy :  0.36
Validation Accuracy :  0.376
Validation Accuracy :  0.214
Validation Accuracy :  0.23
lr 1.000000e-07 reg 1.000000e+03 train accuracy: 0.231531 val accuracy: 0.214000
lr 1.000000e-07 reg 5.000000e+04 train accuracy: 0.228286 val accuracy: 0.230000
lr 2.800000e-06 reg 1.000000e+03 train accuracy: 0.374204 val accuracy: 0.360000
lr 2.800000e-06 reg 5.000000e+04 train accuracy: 0.392551 val accuracy: 0.376000
best validation accuracy achieved during cross-validation: 0.376000
