In [3]:
import torch
import numpy as np
# http://tangshusen.me/Dive-into-DL-PyTorch/#/


**torch**
- torch.utils.data模块提供了有关数据处理的工具
- torch.nn模块定义了大量神经网络的层
- torch.nn.init模块定义了各种初始化方法
- torch.optim模块提供了模型参数初始化的各种方法。


**torchvision**
- torchvision.datasets: 一些加载数据的函数及常用的数据集接口；
- torchvision.models: 包含常用的模型结构（含预训练模型），例如AlexNet、VGG、ResNet等；
- torchvision.transforms: 常用的图片变换，例如裁剪、旋转等；
- torchvision.utils: 其他的一些有用的方法。


**常用操作**
- torch.cat
- torch.mm

### tensor 操作

In [4]:
# numpy(), torch.from_numpy()
print(torch.tensor([1.,2.]).numpy())
print(torch.from_numpy(np.array([1.,2.])))

[1. 2.]
tensor([1., 2.], dtype=torch.float64)


In [5]:
# unsqueeze
print(torch.tensor([1.,2.]))
print(torch.tensor([1.,2.]).unsqueeze(0))

tensor([1., 2.])
tensor([[1., 2.]])


In [6]:
a = torch.tensor(range(4), dtype=torch.float32).view(2,2)
print(a)
print(a.sum())
print(a.sum(dim=0))
print(a.sum(dim=1))
print()
print(a.sum(dim=0, keepdim=True))
print(a.sum(dim=1, keepdim=True))

tensor([[0., 1.],
        [2., 3.]])
tensor(6.)
tensor([2., 4.])
tensor([1., 5.])

tensor([[2., 4.]])
tensor([[1.],
        [5.]])


In [7]:
# gather
a = torch.tensor(range(9), dtype=torch.float32).view(3,3)
print(a)
print(a.gather(1, torch.tensor([[1],[0], [1]])))
print(a.gather(0, torch.tensor([[1, 1, 2]]))) # out[i][j] = input[index[i][j]][j]
print(a.gather(0, torch.tensor([[1, 1, 2], [0, 0, 1]])))

tensor([[0., 1., 2.],
        [3., 4., 5.],
        [6., 7., 8.]])
tensor([[1.],
        [3.],
        [7.]])
tensor([[3., 4., 8.]])
tensor([[3., 4., 8.],
        [0., 1., 5.]])


In [8]:
y_hat = torch.tensor([[0.1, 0.3, 0.6], [0.3, 0.2, 0.5]])
y = torch.LongTensor([0, 2])
y_hat.gather(1, y.view(-1, 1))
torch.gather??

### 学习率的调整

```
optimizer =optim.SGD([
    # 如果对某个参数不指定学习率，就使用最外层的默认学习率
    {'params': net.subnet1.parameters()}, # lr=0.03
    {'params': net.subnet2.parameters(), 'lr': 0.01}
], lr=0.03)
```
有时候我们不想让学习率固定成一个常数，那如何调整学习率呢？主要有两种做法。一种是修改optimizer.param_groups中对应的学习率，另一种是更简单也是较为推荐的做法——新建优化器，由于optimizer十分轻量级，构建开销很小，故而可以构建新的optimizer。但是后者对于使用动量的优化器（如Adam），会丢失动量等状态信息，可能会造成损失函数的收敛出现震荡等情况。

```
# 调整学习率
for param_group in optimizer.param_groups:
    param_group['lr'] *= 0.1 # 学习率为之前的0.1倍
```

### 定义模型

In [9]:
# 写法一
net = torch.nn.Sequential(
    torch.nn.Linear(2, 1)
)

# 写法二
net = torch.nn.Sequential()
net.add_module('linear', torch.nn.Linear(2, 1))
# net.add_module ......

# 写法三
from collections import OrderedDict
net = torch.nn.Sequential(OrderedDict([
    ('linear', torch.nn.Linear(2, 1))
    # ......
]))

print(net)

Sequential(
  (linear): Linear(in_features=2, out_features=1, bias=True)
)


#### 构造复杂模型

- 可以通过继承Module类来构造模型。
- Sequential、ModuleList、ModuleDict类都继承自Module类。
- 与Sequential不同，ModuleList和ModuleDict并没有定义一个完整的网络，它们只是将不同的模块存放在一起，需要自己定义forward函数。
- 虽然Sequential等类可以使模型构造更加简单，但直接继承Module类可以极大地拓展模型构造的灵活性。


### 参数的相关

net[0] 这样根据下标访问子模块的写法只有当 net 是个 ModuleList 或者 Sequential 实例时才可以

In [27]:
class LinearNet(torch.nn.Module):
    def __init__(self, input, output):
        super(LinearNet, self).__init__()
        self.linear = torch.nn.Linear(input, output, bias=True)
        self.w1 = torch.randn(4,3)
        self.w2 = torch.nn.Parameter(torch.randn(4,3))
    
    def forward(self, x):
        return self.linear(x)

net = torch.nn.Sequential(LinearNet(4,3), torch.nn.ReLU(), torch.nn.Linear(3,1))

#### 访问参数

`torch.nn.Parameter` 是 `tensor` 的子类, 并且会自动被添加到模型的参数列表 `net.named_parameters()` 或 `net.parameters()` 里

In [30]:
for param in net.parameters():
    print(param)

Parameter containing:
tensor([[ 1.3584,  0.5445,  0.6224],
        [ 1.6710,  0.4676,  1.0141],
        [-0.1355,  1.3659, -0.2936],
        [ 0.0757,  0.5898, -1.4875]], requires_grad=True)
Parameter containing:
tensor([[-0.3151, -0.3208,  0.3802,  0.0899],
        [-0.3079,  0.3529,  0.3170,  0.4978],
        [ 0.4353, -0.3208, -0.4605,  0.1064]], requires_grad=True)
Parameter containing:
tensor([ 0.0731,  0.1902, -0.0534], requires_grad=True)
Parameter containing:
tensor([[-0.2613, -0.4719, -0.1674]], requires_grad=True)
Parameter containing:
tensor([-0.2197], requires_grad=True)


In [31]:
print(net)
for name, param in net.named_parameters():
    print(name, param.size())

Sequential(
  (0): LinearNet(
    (linear): Linear(in_features=4, out_features=3, bias=True)
  )
  (1): ReLU()
  (2): Linear(in_features=3, out_features=1, bias=True)
)
0.w2 torch.Size([4, 3])
0.linear.weight torch.Size([3, 4])
0.linear.bias torch.Size([3])
2.weight torch.Size([1, 3])
2.bias torch.Size([1])


In [38]:
net[0].w2.shape
net[0].linear.weight.shape
net[0].linear.bias.shape
net[2].weight.shape
net[2].bias.shape

torch.Size([1])

#### 初始化参数

In [42]:
torch.nn.init.normal_(net[0].w2, 0., 0.1)
torch.nn.init.constant_(net[2].bias, 0.)

Parameter containing:
tensor([0.], requires_grad=True)

In [45]:
for name, param in net.named_parameters():
    if 'weight' in name:
        torch.nn.init.normal_(param, mean=0, std=0.01)
        print(name, param.data)
    if 'bias' in name:
        torch.nn.init.constant_(param, val=0)
        print(name, param.data) 

0.linear.weight tensor([[-0.0026, -0.0153,  0.0033,  0.0024],
        [-0.0085,  0.0047,  0.0049, -0.0101],
        [ 0.0018,  0.0073, -0.0047, -0.0097]])
0.linear.bias tensor([0., 0., 0.])
2.weight tensor([[ 0.0027,  0.0004, -0.0091]])
2.bias tensor([0.])


#### 自定义初始化方法

In [46]:
def normal_(tensor, mean=0, std=1):
    with torch.no_grad():
        return tensor.normal_(mean, std)

In [47]:
def init_weight_(tensor):
    with torch.no_grad():
        tensor.uniform_(-10, 10)
        tensor *= (tensor.abs() >= 5).float()

for name, param in net.named_parameters():
    if 'weight' in name:
        init_weight_(param)
        print(name, param.data)

0.linear.weight tensor([[ 8.2923,  0.0000,  0.0000, -0.0000],
        [ 0.0000,  0.0000,  8.5411,  8.0008],
        [ 6.9095,  5.2381, -0.0000, -6.1619]])
2.weight tensor([[8.7914, 0.0000, 8.0843]])


#### 共享参数

> 因为模型参数里包含了梯度，所以在反向传播计算时，这些共享的参数的梯度是累加的:

#### 正则

```
# 初始化参数
nn.init.normal_(net.weight, mean=0, std=1)
nn.init.normal_(net.bias, mean=0, std=1)
    
# 定义优化器
optimizer_w = torch.optim.SGD(params=[net.weight], lr=lr, weight_decay=wd) # 对权重参数衰减
optimizer_b = torch.optim.SGD(params=[net.bias], lr=lr)  # 不对偏差参数衰减

#训练
    optimizer_w.zero_grad()
    optimizer_b.zero_grad()
    l = ...
    l.backward()
    
    optimizer_w.step()
    optimizer_b.step()
```
    


### 训练的过程

```
optimizer.zero_grad() # 梯度清零，等价于net.zero_grad()
output = net(X)
l = loss(output, y.view(output.shape))
l.backward()
optimizer.step()
```

### 其它

注意：torch.nn仅支持输入一个batch的样本不支持单个样本输入，如果只有单个样本，可使用input.unsqueeze(0)来添加一维。

In [25]:
print(torch.tensor([1.,2.]))
print(torch.tensor([1.,2.]).unsqueeze(0))

tensor([1., 2.])
tensor([[1., 2.]])


#### 修改数值, 不影响梯度的方式
```
#1.
with torch.no_grad():
    ...

#2.
param.data.init_()
```

### tricks

#### 两个参数矩阵计算后求和 等价化

XW_x+HW_h 等价于 (X, H 列合并)(W_x, W_h 行合并)

In [50]:
X = torch.randn(3,1)
W_x = torch.randn(1,4)

H = torch.randn(3,4)
W_h = torch.randn(4,4)

In [52]:
torch.mm(X, W_x) + torch.mm(H, W_h)

tensor([[-0.5841, -0.5694,  1.6718, -0.9662],
        [ 0.3362, -0.4847,  2.6763, -2.8878],
        [-0.5670,  1.0877, -2.4146, -0.8737]])

In [53]:
torch.mm(
    torch.cat([X, H], dim=1), torch.cat([W_x, W_h], dim=0)
)

tensor([[-0.5841, -0.5694,  1.6718, -0.9662],
        [ 0.3362, -0.4847,  2.6763, -2.8878],
        [-0.5670,  1.0877, -2.4146, -0.8737]])

### scatter & gather
- https://pytorch.org/docs/stable/tensors.html?highlight=scatter#torch.Tensor.scatter_
- https://pytorch.org/docs/stable/tensors.html?highlight=scatter#torch.Tensor.gather_

In [56]:
y=torch.tensor([0,1,2])
y_hat = torch.tensor([
    [0.7,0.2,0.1],
    [0.2,0.5,0.3],
    [0.1,0.1,0.8],
])
y_hat.gather(dim=1, index = y.view(-1,1))

tensor([[0.7000],
        [0.5000],
        [0.8000]])

In [61]:
x = torch.tensor([0,1,2])
torch.zeros(3,10).scatter(1, x.view(-1,1), 1)

tensor([[1., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 1., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 1., 0., 0., 0., 0., 0., 0., 0.]])