In [1]:
# A bit of setup

import numpy as np
import matplotlib.pyplot as plt

from cs231n.classifiers.neural_net import TwoLayerNet

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Plot defaults used by every figure in this notebook.
plt.rcParams['figure.figsize'] = (10.0, 8.0) # default figure size
plt.rcParams['image.interpolation'] = 'nearest' # default image interpolation
plt.rcParams['image.cmap'] = 'gray' # default image colormap

# Auto-reload external modules before executing code, so that edits to
# imported modules are picked up without restarting the kernel.
# see http://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython
%load_ext autoreload
%autoreload 2

def rel_error(x, y):
  """Return the largest element-wise relative error between x and y.

  For every element the error is |x - y| / max(1e-8, |x| + |y|). The 1e-8
  floor on the denominator avoids a division by zero when both entries are
  zero.
  """
  abs_diff = np.abs(x - y)
  scale = np.maximum(1e-8, np.abs(x) + np.abs(y))
  return np.max(abs_diff / scale)

<h1><center>Basic Setup</center></h1>

In [2]:
from cs231n.data_utils import load_CIFAR10

def get_CIFAR10_data(num_training=49000, num_validation=1000, num_test=1000):
    """
    Read CIFAR-10 from disk and preprocess it for the two-layer network.

    The first `num_training` training examples become the training split, the
    next `num_validation` examples become the validation split, and the first
    `num_test` test examples become the test split. The mean training image is
    subtracted from every split and each example is flattened into a row.

    Returns the tuple (X_train, y_train, X_val, y_val, X_test, y_test).
    """
    # Raw CIFAR-10 arrays as returned by the project loader.
    data_root = 'cs231n/datasets/cifar-10-batches-py'
    X_train, y_train, X_test, y_test = load_CIFAR10(data_root)

    # Index ranges for each split; validation is carved out of the training
    # data directly after the examples kept for training.
    val_idx = range(num_training, num_training + num_validation)
    train_idx = range(num_training)
    test_idx = range(num_test)

    # Validation must be selected before X_train / y_train are rebound below.
    X_val, y_val = X_train[val_idx], y_train[val_idx]
    X_train, y_train = X_train[train_idx], y_train[train_idx]
    X_test, y_test = X_test[test_idx], y_test[test_idx]

    # Centre every split on the mean image of the training split (in place).
    mean_image = np.mean(X_train, axis=0)
    for split in (X_train, X_val, X_test):
        split -= mean_image

    # Flatten each example into a single row.
    X_train = X_train.reshape(num_training, -1)
    X_val = X_val.reshape(num_validation, -1)
    X_test = X_test.reshape(num_test, -1)

    return X_train, y_train, X_val, y_val, X_test, y_test


# Invoke the above function to get our data.
X_train, y_train, X_val, y_val, X_test, y_test = get_CIFAR10_data()
print 'Train data shape: ', X_train.shape
print 'Train labels shape: ', y_train.shape
print 'Validation data shape: ', X_val.shape
print 'Validation labels shape: ', y_val.shape
print 'Test data shape: ', X_test.shape
print 'Test labels shape: ', y_test.shape

Train data shape:  (49000, 3072)
Train labels shape:  (49000,)
Validation data shape:  (1000, 3072)
Validation labels shape:  (1000,)
Test data shape:  (1000, 3072)
Test labels shape:  (1000,)


In [3]:
## Layer sizes and the scale applied to the random initial weights.
inputSize, hiddenSize, outputSize = 32 * 32 * 3, 50, 10
std = 1e-4

## Network parameters: small Gaussian weights (scaled by std) and zero biases.
## W1 is drawn before W2, so the order of the random draws is unchanged.
params = {
    'W1': std * np.random.randn(inputSize, hiddenSize),
    'b1': np.zeros(hiddenSize),
    'W2': std * np.random.randn(hiddenSize, outputSize),
    'b2': np.zeros(outputSize),
}

<h1><center>Computing Scores</center></h1>

In [4]:
## Pull the weights and biases out of the parameter dictionary.
W1, b1, W2, b2 = (params[key] for key in ('W1', 'b1', 'W2', 'b2'))

## N rows (examples) and D columns (features) in the validation data.
N, D = X_val.shape

## Regularisation strength applied to the squared weights in the loss.
reg = 0.5

In [5]:
## Forward pass over the validation data.

## First affine layer.
hiddenLayer1 = np.dot(X_val, W1) + b1

## ReLU: negative pre-activations are replaced by zero.
activatedHiddenLayer1 = np.maximum(hiddenLayer1, 0)

## Second affine layer; its output is the matrix of raw class scores.
hiddenLayer2 = np.dot(activatedHiddenLayer1, W2) + b2
scores = hiddenLayer2

In [6]:
## Shift every row so that its largest score is 0 before exponentiating.
## This avoids overflow in np.exp and leaves the softmax output unchanged.
expScores = np.exp(scores - np.max(scores, axis = 1, keepdims = True))

## Probabilities (softmax scores) of each class, one row per image.
softmaxScores = expScores / np.sum(expScores, axis = 1, keepdims = True)

## 1-D array holding the softmax score of the correct class of each image.
## Integer-array indexing is used rather than np.choose, which only accepts a
## limited number of choice arrays and therefore fails for larger class counts.
corrSoftScore = softmaxScores[np.arange(N), y_val]

## Cross-entropy loss: cumulative negative log-likelihood divided by the
## number of instances.
loss = -np.sum(np.log(corrSoftScore)) / N

## Add the regularisation loss of both weight matrices.
loss = loss + 0.5 * reg * np.sum(W1 * W1) + 0.5 * reg * np.sum(W2 * W2)

loss

2.302951688604538

In [7]:
grads = {}

# Backward pass: compute gradients.

## Start from a copy of the softmax probabilities. The updates below are done
## in place, so without the copy they would overwrite softmaxScores and
## re-running this cell would produce different (wrong) gradients.
dO = softmaxScores.copy()

## Computing dL/dO (softmax gradient): subtract 1 at the correct class and
## average over the N instances.
dO[np.arange(N), y_val] -= 1
dO /= N

## Computing dL/db2.
grads['b2'] = np.sum(dO, axis = 0)

## Computing dL/dW2 (data term plus regularisation term).
grads['W2'] = activatedHiddenLayer1.T.dot(dO) + reg * W2

## Computing dL/dActivatedHiddenLayer1.
dActivatedHiddenLayer1 = dO.dot(W2.T)

## Backprop through ReLU: no gradient flows where the activation is zero.
## From here on dActivatedHiddenLayer1 holds dL/dHiddenLayer1.
dActivatedHiddenLayer1[activatedHiddenLayer1 <= 0] = 0

## Computing dL/db1.
grads['b1'] = np.sum(dActivatedHiddenLayer1, axis = 0)

## Computing dL/dW1 (data term plus regularisation term).
grads['W1'] = X_val.T.dot(dActivatedHiddenLayer1) + reg * W1

In [12]:
## Second, more compact implementation of the forward pass and loss.

def reLU(x):
    """Element-wise ReLU: max(0, x)."""
    return np.maximum(0, x)

hidden_layer = reLU(X_val.dot(W1) + b1)
scores = hidden_layer.dot(W2) + b2

## Shift each row by its own maximum before exponentiating. Shifting by the
## global maximum instead can make every exponential in a row underflow to 0,
## which turns the normalisation below into 0/0. The shift is applied to a
## temporary, so `scores` keeps the raw class scores.
exp_scores = np.exp(scores - np.max(scores, axis=1, keepdims=True))
probs = exp_scores / np.sum(exp_scores, axis=1, keepdims=True)

## Mean cross-entropy of the correct classes plus the regularisation loss.
loss = np.mean(-np.log(probs[np.arange(N), y_val]))
loss += 0.5 * reg * (np.sum(W1**2) + np.sum(W2**2))

In [15]:
grads2 = {}

## Gradient of the loss with respect to the scores. A copy is taken because
## the updates below are in place; without it they would overwrite probs and
## re-running this cell would give different (wrong) gradients.
dscores = probs.copy()
dscores[np.arange(N), y_val] -= 1
dscores /= N

## Second layer gradients.
grads2['W2'] = hidden_layer.T.dot(dscores)
grads2['b2'] = np.sum(dscores, axis=0)

## Backprop into the hidden layer; no gradient flows where the ReLU output
## is zero.
dhidden = dscores.dot(W2.T)
dhidden[hidden_layer <= 0] = 0

## First layer gradients.
grads2['W1'] = X_val.T.dot(dhidden)
grads2['b1'] = np.sum(dhidden, axis=0)

## Regularisation terms of the weight gradients.
grads2['W1'] += reg * W1
grads2['W2'] += reg * W2

In [23]:
# Debug check: hidden_layer is a ReLU output (np.maximum(0, x)), so it has no
# negative entries and this selection is always empty.
dhidden[hidden_layer < 0]

array([], dtype=float64)