In [1]:
import numpy as np

![1](https://cs231n.github.io/assets/nn1/neuron.png)  ![2](https://cs231n.github.io/assets/nn1/neuron_model.jpeg)
Historically, a common choice of activation function is the sigmoid function σ, since it takes a real-valued input (the signal strength after the sum) and squashes it into the range between 0 and 1. We will see details of these activation functions later in this section.

$\sigma(x) = 1/(1+e^{-x})$

In [2]:
class Neuron(object):
    """A single sigmoid neuron.

    The class defines no constructor: ``self.weights`` and ``self.bias``
    must be assigned on the instance before ``forward`` is called.
    """

    def forward(self, inputs):
        """Compute the neuron's activation for one input vector.

        Assumes ``inputs`` and ``self.weights`` are 1-D vectors of the same
        length and ``self.bias`` is a single number.

        Parameters
        ----------
        inputs : array-like
            Input signal, multiplied element-wise with ``self.weights``.

        Returns
        -------
        firing_rate : float
            Sigmoid of the weighted sum plus bias, in the open interval (0, 1).
        """
        # Weighted sum of the incoming signals plus the bias term.
        cell_body_sum = np.sum(inputs * self.weights) + self.bias
        # Sigmoid activation squashes the sum into (0, 1).
        firing_rate = 1.0 / (1.0 + np.exp(-cell_body_sum))
        # The original computed this value but never returned it.
        return firing_rate

sigmoid 函数会导致梯度饱和或梯度消失：当激活值接近 0 或 1 时，这些区域的梯度几乎为 0。

如果初始权重设置过大，大多数神经元会进入饱和区，梯度接近 0，神经网络几乎无法学习。

有没有其他激活函数呢？

ReLU 线性整流单元 $$f(x) = \max(0, x)$$
![3](https://cs231n.github.io/assets/nn1/relu.jpeg)

神经网络架构

In [16]:
# Weight matrices drawn from a standard normal distribution.
# Shapes chain as 3 inputs -> 1 unit -> 1 unit -> 1 output.
W1 = np.random.randn(1, 3)
W2 = np.random.randn(1, 1)
W3 = np.random.randn(1, 1)
# Every layer uses the same scalar bias.
b1 = b2 = b3 = 1


In [17]:
# Forward pass through a three-layer network with sigmoid hidden units.
def f(x):
    """Sigmoid activation, applied element-wise."""
    return 1.0 / (1.0 + np.exp(-x))

x = np.random.randn(3, 1)  # random input column vector (3x1)
h1 = f(W1.dot(x) + b1)  # first hidden layer: (rows of W1) x 1
h2 = f(W2.dot(h1) + b2)  # second hidden layer: (rows of W2) x 1
out = W3.dot(h2) + b3  # output layer, no activation: (rows of W3) x 1

In [18]:
out

array([[0.07528657]])

In [25]:
def eval_numerical_gradient(f, x):
    """
    Estimate the gradient of f at x with centered finite differences.

    Parameters
    ----------
    f : callable
        Function taking a single argument (the array ``x``) and returning a
        scalar.
    x : numpy.ndarray
        Floating-point array giving the point at which to evaluate the
        gradient. Each entry is perturbed in place by ``+h`` and ``-h`` and
        then restored, so ``x`` holds its original values on return.

    Returns
    -------
    grad : numpy.ndarray
        Array with the same shape as ``x``; entry ``ix`` approximates the
        partial derivative of ``f`` with respect to ``x[ix]``.

    Raises
    ------
    TypeError
        If ``x`` does not have a floating-point dtype. Writing
        ``old_value +/- h`` into an integer array would truncate the
        perturbation and silently produce wrong gradients.
    """
    if not np.issubdtype(x.dtype, np.floating):
        raise TypeError("x must be a floating-point numpy array")

    grad = np.zeros(x.shape)
    h = 0.00001
    # nditer flags are strings; the original passed bare names, which raised
    # NameError before any work was done.
    it = np.nditer(x, flags=['multi_index'], op_flags=['readwrite'])
    while not it.finished:
        ix = it.multi_index
        old_value = x[ix]
        x[ix] = old_value + h  # step forward by h
        fx_plus = f(x)  # f(x + h)
        x[ix] = old_value - h  # step backward by h
        fx_minus = f(x)  # f(x - h)
        x[ix] = old_value  # restore the entry before moving on (very important!)
        # Centered difference: slope of f along dimension ix.
        grad[ix] = (fx_plus - fx_minus) / (2 * h)
        it.iternext()  # advance to the next entry of x
    return grad

In [None]:
# Standard momentum update for parameters x.
# mu scales the previous velocity v; alpha scales the gradient grad_x.
v = mu * v - alpha * grad_x  # new velocity: decayed old velocity minus gradient step
x += v  # move the parameters along the velocity

In [None]:
# Adagrad-style update, given the gradient dx and the parameter vector x.
# cache accumulates the squared gradient seen so far for every parameter.
cache += dx ** 2
# Scale each step by the root of the accumulated squares; the 1e-8 inside
# the square root keeps the division away from zero.
x -= learning_rate * dx / np.sqrt(cache + 1e-8)

In [None]:
# RMSprop update rule.
# cache is an exponential moving average of the squared gradient,
# weighted by decay_rate.
cache = decay_rate * cache + (1 - decay_rate) * dx ** 2
# eps is added to the denominator to avoid dividing by zero.
x -= learning_rate * dx / (np.sqrt(cache) + eps)

In [None]:
# Adam update (as written here there is no bias-correction step for m or v).
m = beta1 * m + (1 - beta1) * dx  # moving average of the gradient
v = beta2 * v + (1 - beta2) * (dx ** 2)  # moving average of the squared gradient
# Step along the averaged gradient, scaled per parameter; eps guards the division.
x -= learning_rate * m / (np.sqrt(v) + eps)

$$f(x,y,z) = (x + y) z$$

In [26]:

# Example inputs for f(x, y, z) = (x + y) * z
x, y, z = -2, 5, -4

# Forward pass, one elementary operation per line
q = x + y  # intermediate sum: 3
f = q * z  # final output: -12

# Backward pass: visit the operations in reverse order.
# Through f = q * z, each factor's gradient is the other factor.
dfdq = z  # df/dq = z = -4
dfdz = q  # df/dz = q = 3
# Through q = x + y, the local derivative is 1 for both inputs; multiplying it
# by the upstream gradient dfdq is the chain rule.
dfdx = dfdq * 1.0  # dq/dx = 1
dfdy = dfdq * 1.0  # dq/dy = 1

$$f(w,x) = \frac{1}{1+e^{-(w_0x_0 + w_1x_1 + w_2)}}$$

In [34]:
# Example weights (w[2] plays the role of the bias) and a 2-D input point
w = [2, -3, -3]
x = [-1, -2]

# Forward pass: weighted sum of the inputs plus bias, then the sigmoid
dot = sum(w_i * x_i for w_i, x_i in zip(w, x)) + w[2]
f = 1.0 / (1 + np.exp(-dot))

# Backward pass through the neuron.
# The sigmoid's derivative with respect to its input is f * (1 - f).
ddot = f * (1 - f)
# dot is linear in x and w, so each gradient is the partner factor times ddot.
dx = [w_i * ddot for w_i in w[:2]]  # gradient on the inputs
dw = [x_i * ddot for x_i in x] + [1.0 * ddot]  # gradient on the weights and bias
# dx and dw now hold the gradients on all inputs to the circuit
# we're done! we have the gradients on the inputs to the circuit

In [37]:
dx,dw

([0.3932238664829637, -0.5898357997244456],
 [-0.19661193324148185, -0.3932238664829637, 0.19661193324148185])

$$ f(x,y) = \frac{x + \sigma(y)}{\sigma(x) + (x+y)^2} $$

In [30]:
import math

# Example inputs
x, y = 3, -4

# Forward pass, broken into numbered stages so that every intermediate value
# is available by name when backpropagating.
sigy = 1.0 / (1 + math.exp(-y))  # sigmoid(y), used in the numerator     #(1)
num = x + sigy                   # numerator                             #(2)

sigx = 1.0 / (1 + math.exp(-x))  # sigmoid(x), used in the denominator   #(3)
xpy = x + y                      # sum of the inputs                     #(4)
xpysqr = xpy ** 2                # squared sum                           #(5)
den = sigx + xpysqr              # denominator                           #(6)

invden = 1.0 / den               # reciprocal of the denominator         #(7)
f = num * invden                 # final value                           #(8)

In [31]:
f

1.5456448841066441

In [38]:
# Backward pass: revisit forward stages (8) down to (1) in reverse order,
# multiplying each local derivative by the gradient arriving from above.
# stage (8): f = num * invden
dnum = invden                          # gradient on the numerator     #(8)
dinvden = num                          # gradient on 1/den             #(8)
# stage (7): invden = 1.0 / den
dden = -1.0 / den**2 * dinvden                                         #(7)
# stage (6): den = sigx + xpysqr, addition passes the gradient to both terms
dsigx = 1 * dden                                                       #(6)
dxpysqr = 1 * dden                                                     #(6)
# stage (5): xpysqr = xpy**2
dxpy = 2 * xpy * dxpysqr                                               #(5)
# stage (4): xpy = x + y
dx = 1 * dxpy                                                          #(4)
dy = 1 * dxpy                                                          #(4)
# stage (3): sigx = 1.0 / (1 + math.exp(-x))
# x also feeds stages (4) and (2), so its gradient contributions are
# accumulated with += rather than overwritten.
dx += (1 - sigx) * sigx * dsigx                                        #(3)
# stage (2): num = x + sigy
dx += 1 * dnum                                                         #(2)
dsigy = 1 * dnum                                                       #(2)
# stage (1): sigy = 1.0 / (1 + math.exp(-y)); y also feeds stage (4), hence +=
dy += (1 - sigy) * sigy * dsigy                                        #(1)
# dx and dy now hold df/dx and df/dy

In [39]:
dx

2.0595697955721652