# $\bar Y = W \times X+\vec b$

$(k \times m) = (k \times  n) \times (n \times m) + (k \times 1)$

- $\vec x$ is an $n$-dimensional vector representing $n$ features
- $X$ is an $(n \times m)$ tensor, where $m$ is the batch size; each column is one feature vector $\vec x$
- $W$ is the $(k \times n)$ weight matrix of the fully connected layer, with $k$ hidden units (neurons), or $k$ outputs if it is the last layer
- $\vec b$ is a $k$-dimensional bias vector, added to each of the $m$ columns of $W \times X$
- $\bar Y$ is the $(k \times m)$ output matrix, where $m$ is the batch size and $k$ is the number of outputs of the last layer. The bar in $\bar Y$ marks it as the model's prediction, to distinguish it from the ground truth $Y^\ast$



# Autograd Python example

In [1]:
class array(object):
    """Minimal scalar wrapper that supports reverse-mode autodiff.

    Every instance carries a forward ``value`` and a ``grad`` callable.
    ``grad(g)`` takes the upstream gradient ``g`` and returns a dict mapping
    the name of each named leaf that contributed to this value to its
    gradient.

    Args:
        value: the forward value held by this node.
        name: optional leaf name. A named array reports ``{name: g}`` from
            ``grad``; an unnamed one is treated as a constant and reports
            an empty dict.
    """

    def __init__(self, value, name=None):
        self.value = value
        if name:
            self.grad = lambda g: {name: g}
        else:
            # Unnamed arrays act as constants. Results of ``+`` / ``*``
            # overwrite this default with their own backward function.
            self.grad = lambda g: {}

    @staticmethod
    def _accumulate(total, extra):
        """Add the gradients in ``extra`` into ``total`` and return ``total``.

        Contributions for the same name are summed rather than overwritten,
        so a variable used several times in an expression gets its full
        gradient.
        """
        for key, contribution in extra.items():
            if key in total:
                total[key] = total[key] + contribution
            else:
                total[key] = contribution
        return total

    def __add__(self, other):
        """Return ``self + other`` for a plain number or another ``array``.

        Raises:
            AssertionError: if ``other`` is neither an ``array`` nor an
                int/float.
        """
        if isinstance(other, array):
            ret = array(self.value + other.value)
            # d(a + b)/da = d(a + b)/db = 1: both operands receive g unchanged.
            ret.grad = lambda g: self._accumulate(self.grad(g), other.grad(g))
            return ret
        assert isinstance(other, (int, float))
        ret = array(self.value + other)
        # A numeric constant has no gradient; pass g straight through.
        ret.grad = lambda g: self.grad(g)
        return ret

    def __mul__(self, other):
        """Return ``self * other`` for another ``array``.

        Raises:
            AssertionError: if ``other`` is not an ``array``.
        """
        assert isinstance(other, array)
        ret = array(self.value * other.value)

        def grad(g):
            # Product rule: each operand receives g scaled by the other's value.
            total = self.grad(g * other.value)
            return self._accumulate(total, other.grad(g * self.value))

        ret.grad = grad
        return ret
    
# Leaves of the expression: two named arrays plus a plain integer bias.
x, W = array(1, 'x'), array(2, 'W')
b = 3

# Forward pass; the result keeps a backward function built from its operands.
y = (W * x) + b
print(y.value)
# Seed the backward pass with an upstream gradient of 1.
print(y.grad(1))

5
{'x': 2, 'W': 1}


# Autograd TensorFlow example

In [2]:
import tensorflow as tf
import numpy as np
# x is fed at run time; W and b are the variables we differentiate against.
x = tf.placeholder(tf.float32, shape=(10,1))
# dtype must be passed by keyword: the second positional parameter of
# tf.Variable is `trainable`, not the dtype.
W = tf.Variable(tf.ones([1, 10])*2, dtype=tf.float32)
b = tf.Variable(tf.ones([1])*3, dtype=tf.float32)

y = tf.matmul(W, x) + b
# Symbolic gradient ops for dy/dW and dy/db.
op_grad = tf.gradients(y, [W, b])
init = tf.global_variables_initializer()

# Launch the graph in a session; the context manager closes the session
# once the values have been fetched.
with tf.Session() as sess:
    sess.run(init)
    result, grad = sess.run([y, op_grad], {x: np.ones((10,1), np.float32)})
print(result)
print(grad)

[[ 23.]]
[array([[ 1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.]], dtype=float32), array([ 1.], dtype=float32)]


# Autograd PyTorch example

In [3]:
import torch
from torch.autograd import Variable
# Input column vector (no gradient tracked) and the two trainable parameters.
x = Variable(torch.ones(10, 1), requires_grad=False)
W = Variable(torch.ones(1, 10) * 2, requires_grad=True)
b = Variable(torch.ones(1) * 3, requires_grad=True)

# Forward pass: (1 x 10) times (10 x 1) plus the bias gives a single-element
# result, which is why backward() is called without an explicit gradient.
y = W.mm(x) + b
y.backward()

print(y)
# Gradients accumulated on the parameters by backward(), W first and then b.
for param in (W, b):
    print(param.grad.data)

Variable containing:
 23
[torch.FloatTensor of size 1x1]


    1     1     1     1     1     1     1     1     1     1
[torch.FloatTensor of size 1x10]


 1
[torch.FloatTensor of size 1]



# Autograd MXNet example

In [4]:
import mxnet as mx
import numpy as np
# Declare symbolic placeholders and compose the expression y = dot(W, x) + b.
# No values are attached yet; they are supplied when the symbol is bound below.
x = mx.symbol.Variable('x')
W = mx.symbol.Variable('W')
b = mx.symbol.Variable('b')
y = mx.symbol.dot(W, x) + b

# Concrete values for every symbol: x is all ones, W is all twos, b is 3.
args={'x': mx.nd.ones([10,1]),
      'W':mx.nd.ones(10)*2, 
      'b':mx.nd.ones(1)*3}
# Buffers that receive the gradients. Only W and b are listed, so no gradient
# is produced for x (its slot in the printed grad_arrays is None).
# NOTE(review): the initial contents reportedly do not matter, presumably
# because bind() overwrites these buffers by default -- confirm against the
# MXNet `grad_req` documentation.
args_grad={'W':mx.nd.ones(10), # changing to `zeros` has no impact
           'b':mx.nd.ones(1)}  # changing to `zeros` has no impact

# NOTE(review): MakeLoss presumably marks y as a loss output so that
# backward() can be called without an explicit head gradient -- confirm.
y=mx.symbol.MakeLoss(y)
exe = y.bind(mx.cpu(), args=args, args_grad=args_grad)# args_grad is necessary
# Forward pass in training mode; take the first (only) output as a NumPy array.
result = exe.forward(is_train=True)[0].asnumpy()
exe.backward()
# grad_arrays is positional. In the printed output index 0 holds the
# 10-element gradient (W), index 1 is None (x) and index 2 holds the
# 1-element gradient (b).
grad = exe.grad_arrays
print(result)
print(grad)
print(grad[0].asnumpy())
print(grad[2].asnumpy())

[ 23.]
[<NDArray 10 @cpu(0)>, None, <NDArray 1 @cpu(0)>]
[ 1.  1.  1.  1.  1.  1.  1.  1.  1.  1.]
[ 1.]
