In [1]:
import numpy as np

## Some tips about lists

### zip is a useful function to merge and unmerge two lists

In [2]:
# Two parallel lists to pair up element-wise.
x = [1, 2, 3]
y = [4, 5, 6]
# On Python 3 zip() returns a one-shot iterator; wrapping it in list() keeps
# `z` displayable and reusable, and is a no-op change on Python 2.
z = list(zip(x, y))

In [3]:
# Display the zipped pairs.
z

[(1, 4), (2, 5), (3, 6)]

Use zip with * to unzip

In [4]:
# zip(*z) passes each pair to zip as a separate argument, which regroups the
# first items and the second items into two tuples.
x2, y2 = zip(*z)
# Single-argument print with %-formatting emits the same text on Python 2 and 3.
print("x2 is %s" % (x2,))
print("y2 is %s" % (y2,))

x2 is (1, 2, 3)
y2 is (4, 5, 6)


### you can construct lists using something like set builder notation

In math, we easily understand a statement like this

$$\left\{2x_i+y_i \;\vert\; i=0, \ldots, 3\right\}$$


In [5]:
# Each inner list is one (x, y) pair; the cells below unpack them in a loop.
mylist = [
    [1, 2],
    [2, 5],
    [5, 4],
    [4, 6],
]

In [6]:
# Tuple-unpacking in the loop header splits each inner list into x and y.
for x, y in mylist:
    # print(...) with a single argument behaves the same on Python 2 and 3.
    print(2 * x + y)

4
9
14
14


In [7]:
# Same computation, stored in a pre-allocated float array (hence the
# floating-point values in the printed result).
newlist = np.zeros(len(mylist))
# enumerate supplies the slot index, so no hand-maintained counter is needed.
for counter, (x, y) in enumerate(mylist):
    newlist[counter] = 2 * x + y
print(newlist)

[  4.   9.  14.  14.]


In [8]:
# The list comprehension builds one value per (x, y) pair, then np.array
# converts the resulting list of ints into an integer array.
newlist2 = np.array([2 * x + y for x, y in mylist])
print(newlist2)

[ 4  9 14 14]


# Defining a network and computing the gradient

We'll need to define the activation function.

In [9]:
def sigmoid(x):
    """Logistic function 1 / (1 + e^(-x)).

    Accepts a scalar or a NumPy array; for an array the function is applied
    element-wise because np.exp and the arithmetic operators broadcast.
    """
    decay = np.exp(-x)
    return 1.0 / (1.0 + decay)

def dsigmoid(x):
    """Derivative of the logistic function: sigmoid(x) * (1 - sigmoid(x)).

    sigmoid is evaluated once and reused rather than being computed twice.
    """
    s = sigmoid(x)
    return s * (1 - s)

To keep our parameters organized, it will help to have a picture.  Suppose, for example, somewhere in our net we have a layer of three nodes connected to a layer of four nodes.  Then we will have $12$ weights on the edges between these layers.  Denote the weight connecting the $k$-th node from the layer on the left to the $j$-th node of the layer on the right by $w_{jk}$.  For example, in the picture, $w_{21}=1.2$, $w_{22}=0.7$, and $w_{23}=-0.4$.  The other parameters are the biases $b_1, b_2, b_3, b_4$.  In the picture $b_2=2.1$.

<img src="Figure.png">  

The outputs of the nodes on the left are numbers, labelled here by $(z_1, z_2, z_3)$.  Then the outputs of the nodes in the next layer are determined by the weights and the biases.  In the picture, the output of the second node on the right is 

$$\sigma \left( \begin{bmatrix} 1.2 & 0.7 & -0.4\end{bmatrix} \begin{bmatrix} z_{1} \\ z_{2}\\ z_{3}\end{bmatrix} + 2.1\right)$$

More generally, the outputs of all the nodes on the right are given by 

$$\sigma \left( \begin{bmatrix} w_{11} & w_{12}& w_{13}\\
w_{21} & w_{22}& w_{23}\\w_{31} & w_{32}& w_{33}\\
w_{41} & w_{42}& w_{43}\end{bmatrix} \begin{bmatrix} z_{1} \\ z_{2}\\ z_{3}\end{bmatrix} + \begin{bmatrix}  b_1 \\ b_2 \\b_3\\b_4 \end{bmatrix}\right)$$

where we use the convention that $\sigma$ applied to a vector means $\sigma$ applied to each element.  

Finally, keep in mind this picture is just part of our net, say connecting the $i$-th layer to the $i+1$-st layer.  To make this explicit, we'll decorate everything with a superscript $i$ to keep track of which layer we're in.  So, we let

<ul>
<li>$W^i=\{w^i_{jk}\}$ be the matrix of weights connecting the $k$-th node of the $i$-th layer to the $j$-th node of the $i+1$-st layer</li>
<li>$z^i$ be the vector of outputs of the $i$-th layer</li>
<li>$B^i$ be the vector of biases that we input in the $i+1$-st layer</li>
</ul>

In this notation, the output of the $i+1$-st layer is

$$z^{i+1}=\sigma \left( W^iz^i+B^i\right)$$



In [10]:
# Layer widths from input to output: 3 input nodes, 5 hidden nodes, 2 output nodes.
mysizes = [3, 5, 2]

In [11]:
def makeinitial(sizes):
    """Draw random starting parameters for a network with the given layer sizes.

    sizes lists the number of nodes in each layer, input first.  Returns the
    two-element list [biases, weights]: biases[i] has shape (sizes[i+1], 1)
    and weights[i] has shape (sizes[i+1], sizes[i]), all sampled with
    np.random.randn from the global NumPy random state.
    """
    fan_in = sizes[:-1]
    fan_out = sizes[1:]

    # All biases are drawn before any weight so the global random stream is
    # consumed in a fixed order.
    biases = []
    for n_out in fan_out:
        biases.append(np.random.randn(n_out, 1))

    weights = []
    for n_in, n_out in zip(fan_in, fan_out):
        weights.append(np.random.randn(n_out, n_in))

    return [biases, weights]

In [12]:
# Random starting parameters for the layer sizes in mysizes.
# NOTE(review): the NumPy RNG is not seeded in the cells shown, so these
# values change on every run -- consider seeding for reproducibility.
mybiases, myweights = makeinitial(mysizes)

In [13]:
def finaloutput(biases, weights, x):
    """Feed x forward through every layer and return the last activation.

    biases and weights are parallel per-layer sequences (paired with zip);
    each layer maps its input a to sigmoid(np.dot(w, a) + b).  With no
    layers at all, x is returned unchanged.
    """
    activation = x
    for bias, weight in zip(biases, weights):
        pre_activation = np.dot(weight, activation) + bias
        activation = sigmoid(pre_activation)
    return activation

In [14]:
def cost(biases, weights, x, y):
    """Quadratic cost 0.5 * ||finaloutput(x) - y||^2 for one input/target pair.

    The squared norm is formed as a matrix product, so for column-vector
    inputs the result is a 1x1 array rather than a bare float.
    """
    residual = finaloutput(biases, weights, x) - y
    squared_norm = np.dot(residual.transpose(), residual)
    return 0.5 * squared_norm

In [15]:
# Random column vector with one entry per input node (3, matching mysizes[0]).
myx = np.random.randn(3, 1)
# print(...) with a single argument behaves the same on Python 2 and 3.
print(myx)

[[-1.76095814]
 [ 0.10334163]
 [ 1.16587609]]


In [16]:
# Network output for the sample input myx.
finaloutput(mybiases,myweights,myx)

array([[ 0.23534959],
       [ 0.17733111]])

In [17]:
# Random target vector, uniform on [0, 1), with one entry per output node.
myy = np.random.rand(2, 1)

In [18]:
# Cost of the current parameters on the (myx, myy) pair.
cost(mybiases,myweights,myx,myy)

array([[ 0.44992961]])

In [19]:
def alloutputs(biases, weights, x):
    """Forward pass that keeps the activation of every layer.

    Returns a list whose first entry is the input x itself, followed by
    sigmoid(np.dot(w, previous) + b) for each (b, w) pair in order, so the
    last entry is the network output.
    """
    activations = [x]
    for bias, weight in zip(biases, weights):
        weighted_input = np.dot(weight, activations[-1]) + bias
        activations.append(sigmoid(weighted_input))
    return activations

In [20]:
# Activations of every layer for myx, input first.
alloutputs(mybiases,myweights,myx)

[array([[-1.76095814],
        [ 0.10334163],
        [ 1.16587609]]), array([[ 0.58959644],
        [ 0.06096265],
        [ 0.31746618],
        [ 0.81024644],
        [ 0.21492882]]), array([[ 0.23534959],
        [ 0.17733111]])]

In [21]:
def dcost(biases, weights, x, y):
    """Gradient of the quadratic cost for one (x, y) pair, by backpropagation.

    biases and weights are the per-layer parameter lists, x the input column
    vector and y the target.  Returns the tuple (nabla_b, nabla_w): two lists
    parallel to biases and weights holding the partial derivatives of the
    cost with respect to each bias and each weight.
    """
    nabla_b = [np.zeros(b.shape) for b in biases]
    nabla_w = [np.zeros(w.shape) for w in weights]

    # Forward pass: reuse alloutputs rather than repeating its loop here.
    # layers[0] is the input and layers[-1] the network output.
    layers = alloutputs(biases, weights, x)

    # Error at the output layer.  a * (1 - a) is the sigmoid derivative
    # expressed through the activation a itself.
    output = layers[-1]
    delta = (output - y) * output * (1 - output)
    nabla_b[-1] = delta
    nabla_w[-1] = np.dot(delta, layers[-2].transpose())

    # Walk backwards through the remaining layers; l counts from the end, so
    # weights[-l + 1] is the matrix leading out of the layer being processed.
    num_layers = len(biases) + 1
    for l in range(2, num_layers):
        activation = layers[-l]
        ds = activation * (1 - activation)
        delta = np.dot(weights[-l + 1].transpose(), delta) * ds
        nabla_b[-l] = delta
        nabla_w[-l] = np.dot(delta, layers[-l - 1].transpose())
    return (nabla_b, nabla_w)

In [22]:
# Analytic gradient at the sample point, unpacked into its bias and weight parts.
dbiases, dweights = dcost(mybiases, myweights, myx, myy)

In [23]:
# Draw a second random parameter set.
# NOTE(review): mybiases2 is reassigned in the next cell and myweights2 is not
# read by any later cell shown here, so this cell looks redundant -- confirm.
mybiases2, myweights2 = makeinitial(mysizes)

In [25]:
# Copy every bias array so that perturbing the copy leaves mybiases untouched.
# The comprehension covers however many layers there are instead of
# hard-coding exactly two.
mybiases2 = [np.copy(b) for b in mybiases]

In [26]:
# Nudge a single bias (first bias vector, entry 1) by a small step for a
# finite-difference check.  The value is computed from the unmodified
# mybiases so that re-running this cell does not accumulate extra steps.
mybiases2[0][1] = mybiases[0][1] + 1e-5

In [27]:
# Forward-difference estimate of d(cost)/d(bias): the change in cost between
# the nudged and original biases, scaled by 100000 (i.e. divided by the
# 0.00001 step applied to mybiases2).
(
    cost(mybiases2, myweights, myx, myy)
    - cost(mybiases, myweights, myx, myy)
) * 100000

array([[-0.00393743]])

In [28]:
# Backpropagation value for the same bias entry, for comparison with the
# finite-difference estimate.
dbiases[0][1]

array([-0.00393741])

Following Nielsen's book <a href="http://neuralnetworksanddeeplearning.com">Neural Networks
and Deep Learning</a>, we can define a general network class to store the parameters for our neural networks. 

In [None]:
class Network(object):
    """Fully connected feed-forward network with sigmoid activations.

    The parameters are stored as two parallel lists, self.biases and
    self.weights, with one entry per connection between consecutive layers.
    The methods rely on a module-level sigmoid function.
    """

    def __init__(self, sizes):
        """sizes is a list of the number of neurons in each layer.
        The weights and biases are initialized randomly.

        biases[i] has shape (sizes[i+1], 1) and weights[i] has shape
        (sizes[i+1], sizes[i]); both are drawn with np.random.randn.
        """
        self.num_layers = len(sizes)
        self.sizes = sizes
        self.biases = [np.random.randn(y, 1) for y in sizes[1:]]
        self.weights = [np.random.randn(y, x)
                        for x, y in zip(sizes[:-1], sizes[1:])]

    def finaloutput(self, x):
        """Feed x forward through every layer and return the last activation."""
        for b, w in zip(self.biases, self.weights):
            x = sigmoid(np.dot(w, x)+b)
        return x

    def alloutputs(self, x):
        """Forward pass that keeps every activation.

        Returns a list starting with the input x, followed by the activation
        of each subsequent layer; the last entry is the network output.
        """
        layers = [x]
        for b, w in zip(self.biases, self.weights):
            z = np.dot(w, layers[-1])+b
            layers.append(sigmoid(z))
        return layers

    def dcost(self, x, y):
        """Gradient of the quadratic cost for one (x, y) pair, by backpropagation.

        Returns the tuple (nabla_b, nabla_w): two lists parallel to
        self.biases and self.weights holding the partial derivatives of the
        cost with respect to each bias and each weight.
        """
        nabla_b = [np.zeros(b.shape) for b in self.biases]
        nabla_w = [np.zeros(w.shape) for w in self.weights]

        # Forward pass: reuse alloutputs rather than repeating its loop here.
        layers = self.alloutputs(x)

        # Error at the output layer.  a * (1 - a) is the sigmoid derivative
        # expressed through the activation a itself.
        output = layers[-1]
        delta = (output - y) * output * (1 - output)
        nabla_b[-1] = delta
        nabla_w[-1] = np.dot(delta, layers[-2].transpose())

        # Walk backwards through the remaining layers; l counts from the end,
        # so weights[-l + 1] is the matrix leading out of the current layer.
        for l in range(2, self.num_layers):
            activation = layers[-l]
            ds = activation * (1 - activation)
            delta = np.dot(self.weights[-l+1].transpose(), delta) * ds
            nabla_b[-l] = delta
            nabla_w[-l] = np.dot(delta, layers[-l-1].transpose())
        return (nabla_b, nabla_w)