In [20]:
# As usual, a bit of setup

import time
import numpy as np
import matplotlib.pyplot as plt
from cs231n.classifiers.fc_net import *
from cs231n.data_utils import get_CIFAR10_data
from cs231n.gradient_check import eval_numerical_gradient, eval_numerical_gradient_array
from cs231n.solver import Solver

%matplotlib inline
plt.rcParams['figure.figsize'] = (10.0, 8.0) # set default size of plots
plt.rcParams['image.interpolation'] = 'nearest'
plt.rcParams['image.cmap'] = 'gray'

# for auto-reloading external modules
# see http://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython
%load_ext autoreload
%autoreload 2

def rel_error(x, y):
  """Return the largest element-wise relative error between x and y.

  Each element's error is |x - y| / max(1e-8, |x| + |y|); the 1e-8 floor
  keeps the division defined where both entries are zero.
  """
  abs_diff = np.abs(x - y)
  scale = np.maximum(1e-8, np.abs(x) + np.abs(y))
  return np.max(abs_diff / scale)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [21]:
# Load the (preprocessed) CIFAR10 data.

data = get_CIFAR10_data()
for k, v in data.iteritems():
  print '%s: ' % k, v.shape

X_val:  (1000, 3, 32, 32)
X_train:  (49000, 3, 32, 32)
X_test:  (1000, 3, 32, 32)
y_val:  (1000,)
y_train:  (49000,)
y_test:  (1000,)


<h1><center> Forward Pass </center></h1>

In [22]:
## Specifying the different inputs required for the batch normalization forward pass.

## Simulate the forward pass for a two-layer network
## (random input -> affine -> ReLU -> affine).
N, D1, D2, D3 = 200, 50, 60, 3
X = np.random.randn(N, D1)
W1 = np.random.randn(D1, D2)
W2 = np.random.randn(D2, D3)
## X is rebound to the second layer's activations, shape (N, D3);
## these activations are what batch normalization is applied to below.
X = np.maximum(0, X.dot(W1)).dot(W2)

## Setting gamma = 1 and beta = 0 (identity scale, no shift), so the
## normalized output should have roughly zero mean and unit variance.
gamma = np.ones(D3)
beta = np.zeros(D3)

## Additional parameters: mode, numerical-stability constant, momentum of the
## running averages, and the running statistics themselves (one per feature).
batchNormDict = {'mode' : 'train',
                 'eps' : 1e-5,
                 'momentum' : 0.9,
                 'runningMean' : np.zeros(D3),
                 'runningVar' : np.zeros(D3)}

In [23]:
## Forward pass of batch normalization, written out as a chain of small steps
## (a computational graph) so that every intermediate value is available when
## backpropagating through the layer.

## Per-feature mean of the batch.
invBatchSize = 1.0 / N
sampleMean = invBatchSize * X.sum(axis = 0)

## Centered input (X - E[X]): the numerator of the normalization.
numExpression = X - sampleMean

## Variance of X over the batch, then the eps-stabilised standard deviation
## (the denominator of the normalization).
interMediate = np.square(numExpression)
varianceInput = invBatchSize * interMediate.sum(axis = 0)
stableSD = np.sqrt(batchNormDict['eps'] + varianceInput)

## Reciprocal of the standard deviation, so normalizing becomes a multiplication.
invertedSD = np.reciprocal(stableSD)

## Normalized input.
xHat = invertedSD * numExpression

## Scale by gamma ...
xHatScaled = xHat * gamma

## ... then shift by beta; the shifted value is the layer output.
out = xHatShifted = beta + xHatScaled

In [24]:
## Compact version of the batch normalization forward pass, applied to the
## layer input X (not to an already-normalized output).

## Computing the mean and variance of the input along each dimension (feature).
sampleMean = np.mean(X, axis = 0)
sampleVariance = np.var(X, axis = 0)

## Normalizing the input: divide by the eps-stabilised standard deviation,
## i.e. the square root of the variance, not by the variance itself.
out = (X - sampleMean) / np.sqrt(sampleVariance + batchNormDict['eps'])

## Scaling and Shifting the normalized data.
out = (gamma * out + beta)

## Updating the running mean and running variance as exponential moving
## averages of the batch statistics. NOTE: every execution of this cell
## folds the current batch statistics into the running values once more.
momentum = batchNormDict['momentum']
batchNormDict['runningMean'] = momentum * batchNormDict['runningMean'] + (1 - momentum) * sampleMean
batchNormDict['runningVar'] = momentum * batchNormDict['runningVar'] + (1 - momentum) * sampleVariance

<h1><center> Backward Pass </center></h1>

In [34]:
## Simulation Parameters.
N, D = 4, 5
x = 5 * np.random.randn(N, D) + 12
gamma = np.random.randn(D)
beta = np.random.randn(D)
dOut = np.random.randn(N, D)
bn_param = {'mode': 'train'}

## Computing the forward pass.
## The cache is unpacked into the intermediates of the forward computational
## graph; note that this rebinds out, gamma and beta to the cached values.
out, cache = batchnorm_forward(x, gamma, beta, bn_param)
out, xHatShifted, xHatScaled, xHat, invertedSD, stableSD, sampleVariance, interMediate, numExpression, sampleMean, gamma, beta, eps = cache

## Defining functions of a single variable (input, gamma, beta) so that the numerical
## gradient of the output with respect to each can be evaluated for gradient checking.
## Each lambda must use its own argument: relying on the outer gamma/beta only works
## as long as the gradient checker perturbs those very arrays in place.
fx = lambda x: batchnorm_forward(x, gamma, beta, bn_param)[0]
fg = lambda a: batchnorm_forward(x, a, beta, bn_param)[0]
fb = lambda b: batchnorm_forward(x, gamma, b, bn_param)[0]

## Computing the numerical gradients for performing sanity checks.
dx_num = eval_numerical_gradient_array(fx, x, dOut)
da_num = eval_numerical_gradient_array(fg, gamma, dOut)
db_num = eval_numerical_gradient_array(fb, beta, dOut)

In [41]:
## Backward pass of batch normalization: walk the forward computational graph
## in reverse and apply the chain rule one node at a time.

## The output adds beta to every row, so beta collects dOut summed over the batch.
dBeta = dOut.sum(axis = 0)

## The output scales xHat by gamma, so gamma collects xHat * dOut summed over the batch.
dGamma = (xHat * dOut).sum(axis = 0)

## Gradient with respect to xHat.
dXhat = dOut * gamma

## xHat is the product numExpression * invertedSD: first the branch into the
## inverted standard deviation (one value per feature, hence the sum) ...
dInvertedSD = (numExpression * dXhat).sum(axis = 0)

## ... then the direct branch into the numerator expression (P1).
dNumExpressionP1 = dXhat * invertedSD

## Derivative of 1 / s with respect to s is -1 / s**2, i.e. -invertedSD**2.
dStableSD = -(invertedSD ** 2) * dInvertedSD

## Derivative of sqrt(v + eps) with respect to v is 0.5 / sqrt(v + eps).
sqrtSlope = 0.5 * (1.0 / np.sqrt(sampleVariance + eps))
dSampleVariance = sqrtSlope * dStableSD

## The variance averages interMediate over the batch, so each of the N rows
## receives 1/N of the variance gradient.
dInterMediate = np.full((N, D), 1.0 / N) * dSampleVariance

## interMediate is the square of the numerator expression: second branch (P2).
dNumExpressionP2 = (2.0 * numExpression) * dInterMediate

## Combining both branches gives the full gradient of the numerator expression.
dNumExpression = dNumExpressionP1 + dNumExpressionP2

## The numerator subtracts the sample mean from every row, so the mean
## receives the negated sum over the batch.
dSampleMean = -dNumExpression.sum(axis = 0)

## Gradient with respect to the input: the path through the mean (1/N per row)
## plus the direct path through the numerator expression.
dXP1 = np.full((N, D), 1.0 / N) * dSampleMean
dXP2 = dNumExpression
dx = dXP1 + dXP2

In [42]:
print 'dbeta error: ', rel_error(db_num, dBeta)
print 'dgamma error: ', rel_error(da_num, dGamma)
print 'dx error: ', rel_error(dx_num, dx)

dbeta error:  5.53012302586e-11
dgamma error:  9.97382936252e-12
dx error:  1.89593804747e-09
