<a href="https://colab.research.google.com/github/Xinsen-Zhang/torch-study/blob/master/06_%E5%8F%82%E6%95%B0%E5%88%9D%E5%A7%8B%E5%8C%96.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 参数初始化
## 使用numpy 来进行初始化

In [0]:
import numpy as np
from torch import nn
import torch

In [0]:
# Build a feed-forward Sequential model: three Linear layers (30 -> 40 -> 50 -> 10)
# with a ReLU activation after each of the first two.
net = nn.Sequential(
    nn.Linear(in_features=30, out_features=40),
    nn.ReLU(),
    nn.Linear(in_features=40, out_features=50),
    nn.ReLU(),
    nn.Linear(in_features=50, out_features=10),
)

In [0]:
# Grab the learnable parameters (weight and bias) of the first Linear layer,
# then display the weight as the cell output.
w, b = net[0].weight, net[0].bias
w

Parameter containing:
tensor([[-0.0268,  0.1415, -0.0282,  ...,  0.1673,  0.1451,  0.0021],
        [-0.1532,  0.1785,  0.1576,  ..., -0.0034,  0.0799, -0.1583],
        [ 0.1280, -0.0858, -0.0718,  ..., -0.0276, -0.1234,  0.0570],
        ...,
        [ 0.0993,  0.0550,  0.0195,  ...,  0.1647, -0.0602, -0.0076],
        [-0.0036,  0.1045,  0.1380,  ..., -0.0491, -0.0052, -0.1135],
        [-0.0563, -0.1181, -0.1406,  ..., -0.0420,  0.0672, -0.0190]],
       requires_grad=True)

In [0]:
# Inspect the bias of the first layer
b

Parameter containing:
tensor([ 0.1319,  0.1057,  0.0205, -0.1668, -0.1257,  0.0357, -0.1138, -0.0904,
        -0.0881,  0.0118,  0.1764,  0.1765, -0.0450, -0.0436,  0.0717, -0.1118,
         0.0099, -0.0633, -0.1169, -0.0802,  0.0527, -0.1542, -0.0071,  0.0579,
         0.0389,  0.0649,  0.0438,  0.0459, -0.1451,  0.0062, -0.1632, -0.0240,
         0.0356, -0.0187,  0.0014, -0.1230,  0.0950, -0.0169,  0.0885,  0.1291],
       requires_grad=True)

In [0]:
# Overwrite the first layer's weight with uniform samples generated by NumPy.
# nn.Linear(30, 40) stores its weight as (out_features, in_features) = (40, 30),
# so the sample shape is read from `w` itself instead of being hard-coded; the
# previous (30, 40) array silently replaced the weight with a transposed-shape
# tensor that cannot be used in the layer's forward pass.
# The bounds are written in ascending order: the former call passed only low=5
# with the default high=1.0, and NumPy documents low > high as undefined behavior.
w.data = torch.from_numpy(np.random.uniform(low=1.0, high=5.0, size=tuple(w.shape))).float()

In [0]:
# Re-initialize the weight of every Linear layer with samples from a normal
# distribution with mean 0 and standard deviation 0.5 (the second argument of
# np.random.normal is the standard deviation, not the variance).
for layer in net:
    if not isinstance(layer, nn.Linear):
        # Activation layers carry no weight to initialize.
        continue
    param_shape = layer.weight.shape
    layer.weight.data = torch.from_numpy(
        np.random.normal(0.0, 0.5, size=param_shape)
    ).float()

In [0]:
# Show the value left in `param_shape` by the initialization loop
# (the weight shape of the last Linear layer it visited).
param_shape

torch.Size([30, 40])

## 模块的children 和 modules属性

In [0]:
class sim_net(nn.Module):
    """Small three-stage MLP (30 -> 40 -> 50 -> 10).

    Each stage is an ``nn.Sequential`` holding a Linear layer followed by a
    ReLU, exposed as the attributes ``l1``, ``l2`` and ``l3``. The weight of
    the first Linear layer is replaced with standard-normal samples at
    construction time.
    """

    def __init__(self):
        super(sim_net, self).__init__()
        # First stage
        self.l1 = nn.Sequential(
            nn.Linear(30, 40),
            nn.ReLU()
        )
        # nn.Linear keeps its weight as (out_features, in_features), i.e.
        # (40, 30) here. Sampling with the layer's own shape avoids installing
        # a transposed (30, 40) tensor, which makes forward() fail with a
        # matrix-shape mismatch.
        first_linear = self.l1[0]
        first_linear.weight.data = torch.randn(first_linear.weight.shape)
        # Second stage
        self.l2 = nn.Sequential(
            nn.Linear(40, 50),
            nn.ReLU()
        )
        # Third stage
        self.l3 = nn.Sequential(
            nn.Linear(50, 10),
            nn.ReLU()
        )

    def forward(self, x):
        """Run ``x`` through the three stages in order.

        Args:
            x: Input tensor whose last dimension is 30 (the ``in_features``
                of the first Linear layer).

        Returns:
            Tensor whose last dimension is 10; values are non-negative because
            the last stage ends with a ReLU.
        """
        x = self.l1(x)
        x = self.l2(x)
        x = self.l3(x)
        return x

In [0]:
# Instantiate the model
net = sim_net()

### children 属性

In [0]:
# children() yields only the direct sub-modules of the network.
for child in net.children():
    print(child)

Sequential(
  (0): Linear(in_features=30, out_features=40, bias=True)
  (1): ReLU()
)
Sequential(
  (0): Linear(in_features=40, out_features=50, bias=True)
  (1): ReLU()
)
Sequential(
  (0): Linear(in_features=50, out_features=10, bias=True)
  (1): ReLU()
)


### modules 属性

In [0]:
# modules() yields the network itself followed by every nested sub-module;
# a dashed line is printed after each one to separate the entries.
for module in net.modules():
    print(module)
    print('-' * 10)

sim_net(
  (l1): Sequential(
    (0): Linear(in_features=30, out_features=40, bias=True)
    (1): ReLU()
  )
  (l2): Sequential(
    (0): Linear(in_features=40, out_features=50, bias=True)
    (1): ReLU()
  )
  (l3): Sequential(
    (0): Linear(in_features=50, out_features=10, bias=True)
    (1): ReLU()
  )
)
----------
Sequential(
  (0): Linear(in_features=30, out_features=40, bias=True)
  (1): ReLU()
)
----------
Linear(in_features=30, out_features=40, bias=True)
----------
ReLU()
----------
Sequential(
  (0): Linear(in_features=40, out_features=50, bias=True)
  (1): ReLU()
)
----------
Linear(in_features=40, out_features=50, bias=True)
----------
ReLU()
----------
Sequential(
  (0): Linear(in_features=50, out_features=10, bias=True)
  (1): ReLU()
)
----------
Linear(in_features=50, out_features=10, bias=True)
----------
ReLU()
----------


### 做到和Sequential相同的初始化

In [0]:
# Apply the NumPy-based normal initialization (mean 0, standard deviation 0.5)
# to every Linear layer of the custom module. Iterating over modules() reaches
# the Linear layers nested inside the Sequential containers.
for layer in net.modules():
    if not isinstance(layer, nn.Linear):
        # Containers and activations have no weight to initialize.
        continue
    param_shape = layer.weight.shape
    layer.weight.data = torch.from_numpy(
        np.random.normal(0, 0.5, size=param_shape)
    ).float()

## 使用nn.init 进行初始化

In [0]:
# Build a fresh Sequential model (30 -> 40 -> 50 -> 10, ReLU between the
# Linear layers) to demonstrate initialization with torch.nn.init.
net = nn.Sequential(
    nn.Linear(in_features=30, out_features=40),
    nn.ReLU(),
    nn.Linear(in_features=40, out_features=50),
    nn.ReLU(),
    nn.Linear(in_features=50, out_features=10),
)

In [0]:
def show_layer1_weight(net):
    """Print the raw weight tensor (``.data``) of the first layer of ``net``.

    Args:
        net: An indexable model (e.g. ``nn.Sequential``) whose element 0 has
            a ``weight`` attribute.
    """
    print(net[0].weight.data)

In [0]:
# Print the weight tensor of the network's first layer
show_layer1_weight(net)

tensor([[-0.1179, -0.1249, -0.1570,  ...,  0.0865,  0.1321,  0.1274],
        [-0.0028,  0.0153,  0.0756,  ...,  0.0731,  0.0931,  0.1412],
        [-0.0337, -0.0081,  0.1360,  ...,  0.1670,  0.0336,  0.0642],
        ...,
        [ 0.1302,  0.1505, -0.0491,  ..., -0.0996, -0.0958,  0.1496],
        [-0.0747, -0.1467,  0.0303,  ..., -0.0830, -0.0682, -0.1583],
        [ 0.0900,  0.1189, -0.0640,  ...,  0.1602,  0.0683,  0.0845]])


In [0]:
from torch.nn import init

In [0]:
# Re-initialize the first layer's weight in place with Xavier (Glorot) uniform values.
# The trailing-underscore function is the in-place API; the underscore-less
# `init.xavier_uniform` is a deprecated alias that emits a warning when called.
init.xavier_uniform_(net[0].weight)

  """Entry point for launching an IPython kernel.


Parameter containing:
tensor([[ 0.1556,  0.2491, -0.2216,  ..., -0.2683, -0.2462,  0.0542],
        [ 0.1898, -0.1853, -0.1136,  ...,  0.1802,  0.1790,  0.2750],
        [-0.1759,  0.1154, -0.1481,  ...,  0.1774,  0.2898,  0.0716],
        ...,
        [ 0.0311,  0.0592, -0.1929,  ..., -0.0719,  0.1532, -0.1360],
        [ 0.2500,  0.0261,  0.0246,  ...,  0.1088, -0.0410, -0.2483],
        [-0.0355, -0.0696, -0.0105,  ..., -0.0314,  0.2350,  0.1622]],
       requires_grad=True)

## Xavier 初始化方法能够使每一层的输出方差尽可能相等
$$w \sim \mathrm{Uniform}\left[-\frac{\sqrt{6}}{\sqrt{n_j + n_{j + 1}}}, \frac{\sqrt{6}}{\sqrt{n_j + n_{j + 1}}}\right]$$