# Define the network

In [13]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class Net(nn.Module):
    """LeNet-style CNN: two 3x3 conv + 2x2 max-pool stages, then three linear layers.

    ``fc1`` expects 16 * 6 * 6 = 576 flattened features, which is what a
    single-channel 32x32 image yields after the two conv/pool stages
    (32 -> 30 -> 15 -> 13 -> 6 per spatial side).
    """

    def __init__(self):
        super().__init__()
        # 1 input image channel, 6 output channels, 3x3 square convolution
        self.conv1 = nn.Conv2d(1, 6, 3)
        self.conv2 = nn.Conv2d(6, 16, 3)
        # an affine operation: y = Wx + b
        self.fc1 = nn.Linear(16 * 6 * 6, 120)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, 10)

    def forward(self, x):
        """Run the network on a batch and return the raw 10-way scores.

        Args:
            x: tensor of shape (N, 1, 32, 32); other spatial sizes change the
               flattened feature count and will not match ``fc1``.

        Returns:
            Tensor of shape (N, 10). No activation is applied after ``fc3``.
        """
        # Max pooling over a (2, 2) window
        x = F.max_pool2d(F.relu(self.conv1(x)), (2, 2))
        # If the size is a square you can only specify a single number
        x = F.max_pool2d(F.relu(self.conv2(x)), 2)
        # Flatten every dimension except the batch dimension. torch.flatten keeps
        # dim 0 as-is instead of inferring it through view(-1, ...).
        x = torch.flatten(x, 1)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x

    def num_flat_features(self, x):
        """Return the number of elements per sample in ``x``.

        This is the product of all dimensions except the first (batch) one.
        ``forward`` no longer needs it; it is kept so existing callers keep working.
        """
        num_features = 1
        for s in x.size()[1:]:  # all dimensions except the batch dimension
            num_features *= s
        return num_features
    
# Instantiate the network and print its module summary (the layers registered in __init__).
net = Net()
print(net)

Net(
  (conv1): Conv2d(1, 6, kernel_size=(3, 3), stride=(1, 1))
  (conv2): Conv2d(6, 16, kernel_size=(3, 3), stride=(1, 1))
  (fc1): Linear(in_features=576, out_features=120, bias=True)
  (fc2): Linear(in_features=120, out_features=84, bias=True)
  (fc3): Linear(in_features=84, out_features=10, bias=True)
)


只需要定义`forward`函数，而`backward`函数(计算梯度的)将使用`autograd`自动定义。你可以在`forward`函数中使用任意张量运算。

In [14]:
# Gather every learnable tensor exposed by the model into a plain list.
params = [p for p in net.parameters()]
print(len(params))
print(params[0].shape)  # the first entry is conv1's .weight

10
torch.Size([6, 1, 3, 3])


尝试一个随机的32x32输入。注意:这个网络(LeNet)的期望输入大小是32x32。要在MNIST数据集上使用此网络，请将数据集的图像大小调整为32x32。

In [15]:
# Random batch of shape (1, 1, 32, 32): one sample, one channel, 32x32 pixels.
# NOTE: the name `input` shadows Python's built-in input(); later cells reuse
# this variable, so it is left unchanged here.
input = torch.randn(1,1,32,32)
out = net(input)
print(out)

tensor([[-0.0946,  0.0125,  0.0042,  0.0064, -0.0006,  0.1194, -0.0851,  0.0173,
          0.0040, -0.1435]], grad_fn=<AddmmBackward>)


将所有参数的梯度缓冲区置零，然后使用随机梯度进行反向传播:

In [16]:
# Clear any gradients already accumulated on the parameters.
net.zero_grad()
# `out` has shape (1, 10) rather than being a scalar, so backward() is given an
# explicit gradient tensor of the same shape; random values are used here.
out.backward(torch.randn(1,10))

> `torch.nn`只支持mini-batch：整个`torch.nn`包只接受由多个样本组成的mini-batch作为输入，而不接受单个样本。

>例如`nn.Conv2d`将接受一个4D张量（nSamples x nChannels x Height x Width）。

>如果只有一个样本，只需使用`input.unsqueeze(0)`来添加一个假的批次维度（batch dimension）。

# Loss Function

In [17]:
# Mean-squared-error criterion used to compare the prediction with a target.
criterion = nn.MSELoss()
output = net(input)
# Dummy target: 10 random values reshaped to (1, 10) so it has the same shape as output.
target = torch.randn(10).view(1, -1)

loss = criterion(output, target)
print(loss)
print(loss.grad_fn)

tensor(0.9839, grad_fn=<MseLossBackward>)
<MseLossBackward object at 0x00000271C9352908>


沿着`.grad_fn`属性反向跟踪`loss`，可以看到如下的计算图：

```
input -> conv2d -> relu -> maxpool2d -> conv2d -> relu
      -> maxpool2d -> view -> linear -> relu -> linear
      -> relu -> linear
      -> MSELoss
      -> loss
```

当我们调用`loss.backward()`时，整个计算图都会对loss求导，图中所有`requires_grad=True`的张量都会把梯度累加到各自的`.grad`张量中。

In [18]:
# Walk backwards through the autograd graph, starting from the loss node.
print(loss.grad_fn) # MSELoss
print(loss.grad_fn.next_functions[0][0]) # Linear (printed as AddmmBackward)
# NOTE(review): the recorded output for the next line is an AccumulateGrad node,
# not ReLU. Index [0][0] of the Addmm node is presumably the bias term; the ReLU
# node is likely reachable via next_functions[1][0] instead. TODO confirm on the
# installed PyTorch version.
print(loss.grad_fn.next_functions[0][0].next_functions[0][0])  # AccumulateGrad in the recorded output (originally labelled ReLU)

<MseLossBackward object at 0x00000271C9364CC8>
<AddmmBackward object at 0x00000271C9364C48>
<AccumulateGrad object at 0x00000271C9364CC8>


# Backprop

要反向传播误差，只需调用`loss.backward()`。但在此之前需要先用`.zero_grad()`清空已有的梯度，否则新的梯度会累加到已有的梯度上。

In [19]:
# Demonstrates gradient accumulation on conv1's bias around a single backward pass.
# Order matters: gradients are zeroed first so the values printed afterwards come
# from this loss.backward() call only.
net.zero_grad()     # zeroes the gradient buffers of all parameters
print('conv1.bias.grad before backward')
print(net.conv1.bias.grad)

# Backpropagate from the scalar loss; this fills .grad on the parameters.
loss.backward()

print('conv1.bias.grad after backward')
print(net.conv1.bias.grad)

conv1.bias.grad before backward
tensor([0., 0., 0., 0., 0., 0.])
conv1.bias.grad after backward
tensor([ 0.0097, -0.0015, -0.0119,  0.0157, -0.0054,  0.0020])


# Update the weights

最简单的更新规则就是随机梯度下降。

$weight = weight - learning\_rate * gradient$

In [31]:
learning_rate = 0.01
# Manual SGD step: weight <- weight - learning_rate * gradient.
# The in-place update runs under torch.no_grad() so autograd does not record it,
# without going through the legacy `.data` attribute, which bypasses autograd's
# in-place modification tracking.
with torch.no_grad():
    for f in net.parameters():
        print(f.shape)
        if f.grad is None:
            # No gradient has been accumulated for this parameter; nothing to apply.
            continue
        f.sub_(f.grad * learning_rate)

torch.Size([6, 1, 3, 3])
torch.Size([6])
torch.Size([16, 6, 3, 3])
torch.Size([16])
torch.Size([120, 576])
torch.Size([120])
torch.Size([84, 120])
torch.Size([84])
torch.Size([10, 84])
torch.Size([10])


`torch.optim`包实现了多种优化器，对应不同的更新规则，例如SGD、Nesterov-SGD、Adam、RMSProp等。

In [33]:
import torch.optim as optim

# create your optimizer
# Plain SGD over all of the network's parameters, with the same 0.01 learning
# rate as the manual update above.
optimizer = optim.SGD(net.parameters(),lr=0.01)

# in your training loop:
# (a single iteration is shown; the statement order below is significant)
optimizer.zero_grad()# zero the gradient buffers
output = net(input)
loss = criterion(output, target)
loss.backward()
optimizer.step()# Does the update