In [2]:
import torch.nn as nn

In [11]:
# nn.Module is the base class of every model and allows flexible model definitions.
# A new model overrides __init__ and forward; no backward function has to be written,
# because autograd derives the backward pass automatically.

class MLP(nn.Module):
    """Multilayer perceptron with a single hidden layer followed by ReLU.

    Args:
        in_features: size of each input sample (default 784).
        hidden_features: width of the hidden layer (default 256).
        out_features: size of each output sample (default 10).
        **kwargs: forwarded unchanged to ``nn.Module.__init__``.
    """

    def __init__(self, in_features=784, hidden_features=256, out_features=10, **kwargs):
        super().__init__(**kwargs)
        # Layers are created in the same order as before so seeded initialisation is unchanged.
        self.hidden = nn.Linear(in_features, hidden_features)
        self.act = nn.ReLU()
        self.output = nn.Linear(hidden_features, out_features)

    def forward(self, x):
        """Return ``output(relu(hidden(x)))``; the last dimension of ``x`` must be ``in_features``."""
        act_x = self.act(self.hidden(x))
        return self.output(act_x)

In [13]:
import torch

# Push one random 784-element vector through a freshly constructed MLP and print the result.
# NOTE(review): `input` shadows the Python builtin; the name is kept because the
# Sequential demo cell further down reuses this tensor.
input = torch.randn(784)
mlp = MLP()
output = mlp(input)
print(output)

tensor([ 0.1967,  0.0433,  0.1620,  0.0980, -0.0162,  0.4275,  0.0996,  0.0702,
         0.1785,  0.3364], grad_fn=<AddBackward0>)


In [14]:
# Subclasses of Module: Module is a general-purpose building block, and PyTorch also ships
# Module subclasses that make model construction easier: Sequential, ModuleList and ModuleDict.
# Sequential accepts an ordered dict of submodules, or a series of submodules, adds them one by one,
# and its forward pass simply runs them in that order.
sequentialMLP = nn.Sequential(nn.Linear(784, 256), nn.ReLU(), nn.Linear(256, 10))
print(sequentialMLP)
# `input` is the 784-element tensor created in the MLP demo cell.
sequentialOutput = sequentialMLP(input)
print(sequentialOutput)

Sequential(
  (0): Linear(in_features=784, out_features=256, bias=True)
  (1): ReLU()
  (2): Linear(in_features=256, out_features=10, bias=True)
)
tensor([-0.1167,  0.0088,  0.3210,  0.1750,  0.2873,  0.2245,  0.0148,  0.0309,
        -0.1957, -0.2048], grad_fn=<AddBackward0>)


In [19]:
# nn.ModuleList holds a list of submodules and supports list-style append and extend.
moduleLstNet = nn.ModuleList()
moduleLstNet.extend([nn.Linear(784, 256), nn.ReLU()])
moduleLstNet.append(nn.Linear(256, 10))
print(moduleLstNet)


ModuleList(
  (0): Linear(in_features=784, out_features=256, bias=True)
  (1): ReLU()
  (2): Linear(in_features=256, out_features=10, bias=True)
)


In [20]:
# nn.ModuleDict is built from a dict of submodules and can then be extended and indexed like a dict.
moduleDictNet = nn.ModuleDict(dict(linear=nn.Linear(784, 256), act=nn.ReLU()))
moduleDictNet.update({'output': nn.Linear(256, 10)})
print(moduleDictNet)

ModuleDict(
  (act): ReLU()
  (linear): Linear(in_features=784, out_features=256, bias=True)
  (output): Linear(in_features=256, out_features=10, bias=True)
)


In [24]:
class FancyMLP(nn.Module):
    """Toy model showing constant weights, layer reuse and Python control flow in ``forward``.

    The module owns one trainable ``nn.Linear(20, 20)`` that is applied twice, and a
    constant 20x20 random matrix ``randWeight`` that is never trained.
    """

    def __init__(self, **kwargs):
        super(FancyMLP, self).__init__(**kwargs)
        # Constant, non-trainable weight. Registering it as a buffer (instead of a plain
        # attribute) makes it follow .to()/.cuda()/dtype casts together with the module.
        # persistent=False keeps the state_dict keys identical to the previous version.
        self.register_buffer('randWeight', torch.rand((20, 20)), persistent=False)
        self.linear = nn.Linear(20, 20)

    def forward(self, x):
        """Return a scalar tensor computed from ``x``, whose last dimension must be 20."""
        x = self.linear(x)
        # Buffers do not require grad, so no gradient flows into the constant weight.
        x = nn.functional.relu(torch.mm(x, self.randWeight) + 1)
        x = self.linear(x)    # the same linear layer is reused, so its gradients accumulate

        # Plain Python control flow driven by tensor values.
        while x.norm().item() > 1:
            x /= 2
        if x.norm().item() < 0.8:
            x *= 10

        return x.sum()

In [26]:
# Run FancyMLP on a random batch of shape (2, 20) and print the scalar it returns.
input = torch.rand(2, 20)
fancyMLP = FancyMLP()
output = fancyMLP(input)
print(output)

tensor(-3.0820, grad_fn=<SumBackward0>)


In [27]:
# Print randWeight before and after back-propagating through `output` so both printouts
# can be compared; randWeight is created without gradient tracking in FancyMLP.__init__.
print(fancyMLP.randWeight)
output.backward()
print(fancyMLP.randWeight)

tensor([[0.4661, 0.8697, 0.1475, 0.9463, 0.5911, 0.6075, 0.2942, 0.2041, 0.4409,
         0.2621, 0.2661, 0.4380, 0.3620, 0.6661, 0.2655, 0.4746, 0.6140, 0.5595,
         0.0487, 0.3103],
        [0.2772, 0.2375, 0.8919, 0.0775, 0.1416, 0.2169, 0.1617, 0.1309, 0.3342,
         0.5981, 0.2956, 0.5038, 0.5102, 0.8190, 0.0090, 0.3568, 0.3549, 0.4321,
         0.8918, 0.9339],
        [0.5713, 0.9271, 0.8925, 0.4028, 0.3958, 0.2615, 0.3960, 0.2584, 0.5241,
         0.1695, 0.7487, 0.2654, 0.6945, 0.4166, 0.6598, 0.0292, 0.9648, 0.5710,
         0.5854, 0.0750],
        [0.1632, 0.3876, 0.0704, 0.2999, 0.0604, 0.3231, 0.7501, 0.1435, 0.2435,
         0.0902, 0.6501, 0.1142, 0.6470, 0.6841, 0.0378, 0.9717, 0.0030, 0.7530,
         0.2392, 0.1436],
        [0.7349, 0.5020, 0.0594, 0.3663, 0.7038, 0.3981, 0.1894, 0.5450, 0.4313,
         0.7990, 0.6736, 0.8064, 0.7079, 0.8475, 0.4513, 0.7996, 0.6589, 0.3753,
         0.7734, 0.0187],
        [0.4264, 0.0035, 0.4294, 0.9358, 0.0998, 0.3618, 0.0

In [29]:
# Nesting models inside one another

class NestMLP(nn.Module):
    """Wrap a small ``Linear(40, 30) -> ReLU`` stack stored as ``self.net``."""

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        projection = nn.Linear(40, 30)
        self.net = nn.Sequential(projection, nn.ReLU())

    def forward(self, x):
        """Apply the wrapped Sequential to ``x`` and return its output."""
        features = self.net(x)
        return features

# Chain NestMLP (40 -> 30), a Linear layer (30 -> 20) and FancyMLP (which consumes 20 features),
# then run a random batch of shape (2, 40) through the composite model.
nestModule = nn.Sequential(NestMLP(), nn.Linear(30, 20), FancyMLP())
input = torch.rand(2, 40)
print(nestModule(input))

tensor(2.6561, grad_fn=<SumBackward0>)


In [32]:
# Access model parameters through parameters(): print the type of the returned object,
# then print every parameter it yields.
print(type(nestModule.parameters()))
for param in nestModule.parameters(): print(param)

<class 'generator'>
Parameter containing:
tensor([[ 1.0632e-01, -1.0758e-02,  1.1995e-04,  ...,  4.5790e-02,
          6.8271e-02,  9.8536e-02],
        [-4.1305e-02,  2.6749e-02,  3.3654e-02,  ...,  8.4781e-02,
         -9.0875e-03, -5.2351e-02],
        [-1.3303e-01,  5.5016e-02, -1.2562e-01,  ...,  1.0229e-01,
         -2.8073e-02,  3.4376e-02],
        ...,
        [-1.0138e-01,  3.5060e-02,  5.2653e-02,  ..., -1.0547e-01,
          2.3706e-02, -1.2087e-02],
        [-1.3556e-01, -4.0240e-02,  3.3253e-02,  ...,  8.6094e-02,
          1.5793e-01, -7.9311e-02],
        [-9.8206e-02,  1.5464e-01, -5.9911e-02,  ..., -1.0440e-01,
          1.3490e-01,  1.2286e-01]], requires_grad=True)
Parameter containing:
tensor([-1.2516e-01, -6.8050e-02,  1.2106e-01,  1.0394e-01,  3.0253e-02,
        -8.7294e-02, -1.1264e-04,  5.2160e-02,  5.3115e-02,  1.5100e-01,
        -1.4198e-01, -1.5284e-01, -1.0123e-01,  1.4491e-01,  5.6987e-02,
        -1.5461e-01,  2.6199e-02,  1.3271e-01,  1.3999e-01, -1.28

In [37]:
# Access model parameters through named_parameters(), which yields (name, parameter) pairs.
print(type(nestModule.named_parameters()))
for name, param in nestModule.named_parameters(): print(name)
# Inside a container every name is prefixed with the index of its submodule;
# a standalone layer has no such prefix.
# Each item is a torch.nn.parameter.Parameter, which is a subclass of Tensor.

# FancyMLP.randWeight does not appear in this list: it is not an nn.Parameter.

print('\n')
# Parameters of a single layer
for name, param in nestModule[1].named_parameters(): print(name) 

<class 'generator'>
0.net.0.weight
0.net.0.bias
1.weight
1.bias
2.linear.weight
2.linear.bias


weight
bias


In [40]:
from torch.nn import init
# Re-initialise every parameter whose name contains 'weight' from a normal distribution
# (mean 0, std 0.01), in place, and print the new values.
for name, param in nestModule.named_parameters():
    if 'weight' in name: 
        init.normal_(param, mean=0, std=0.01)
        print(name, param.data)

0.net.0.weighttensor([[-4.4612e-03, -9.7813e-03,  1.6284e-02,  ...,  5.6344e-03,
         -1.1865e-03, -1.6980e-02],
        [ 1.8055e-02, -9.5757e-03,  5.2357e-03,  ...,  3.5864e-03,
         -2.4506e-03, -3.8248e-03],
        [-1.8526e-02, -5.3761e-03, -2.0859e-03,  ...,  9.7163e-03,
         -3.0434e-05, -3.6126e-03],
        ...,
        [ 4.1569e-03,  1.2405e-02,  6.5253e-04,  ...,  1.4102e-02,
         -9.1181e-04, -1.6776e-02],
        [-6.7079e-03,  1.4770e-02, -1.1613e-02,  ..., -3.4333e-03,
          3.4166e-03,  9.4195e-04],
        [ 2.3919e-03, -1.1134e-02, -2.8177e-03,  ...,  8.7555e-03,
          3.2819e-03, -8.2593e-03]])
1.weighttensor([[-1.7538e-03,  1.4744e-02, -4.6096e-03,  2.9317e-03,  1.5924e-03,
         -7.1981e-03, -6.9165e-03, -1.2513e-02, -2.6573e-03,  7.5224e-03,
         -9.8681e-03,  1.1862e-02, -1.3333e-02,  6.5121e-03,  3.1250e-03,
         -4.8340e-03,  1.4213e-03,  1.1448e-02, -7.2540e-03, -1.0928e-02,
         -1.3325e-02, -9.8972e-03, -6.1179e-03, -2

In [44]:
# Set every parameter whose name contains 'bias' to the constant 0, in place, and print it.
for name, param in nestModule.named_parameters():
    if 'bias' in name:
        init.constant_(param, val=0)
        print(name, '\n', param)

0.net.0.bias
Parameter containing:
tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0.], requires_grad=True)
1.bias
Parameter containing:
tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       requires_grad=True)
2.linear.bias
Parameter containing:
tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       requires_grad=True)


In [46]:
# Custom initialisation routine
def init_weight_(tensor):
    """Fill ``tensor`` in place with U(-10, 10) samples, zeroing those with magnitude below 5.

    The update runs under ``torch.no_grad()`` so autograd does not record it.
    Returns ``None``; the argument itself is modified.
    """
    with torch.no_grad():
        tensor.uniform_(-10, 10)  # in-place sampling
        keep_mask = (tensor.abs() >= 5).float()
        tensor.mul_(keep_mask)

# Apply the custom initialiser to every parameter whose name contains 'weight' and print it.
for name, param in nestModule.named_parameters():
    if 'weight' in name:
        init_weight_(param)
        print(name, '\n',  param)

0.net.0.weight
Parameter containing:
tensor([[ 0.0000, -8.2964, -0.0000,  ..., -7.9558,  0.0000, -0.0000],
        [-0.0000,  0.0000,  7.5248,  ...,  8.4141,  7.5898,  0.0000],
        [ 6.5668, -0.0000,  0.0000,  ..., -0.0000, -0.0000, -6.3137],
        ...,
        [-6.3530,  6.5531,  6.2616,  ...,  0.0000,  6.6434,  8.4557],
        [ 0.0000, -8.7917,  0.0000,  ..., -6.8890,  6.4532, -0.0000],
        [ 0.0000,  0.0000,  6.5248,  ...,  6.5456, -0.0000, -8.6588]],
       requires_grad=True)
1.weight
Parameter containing:
tensor([[-0.0000, -9.7025,  0.0000,  8.6028,  0.0000,  7.9704, -8.1251,  8.3370,
         -8.1020, -0.0000, -0.0000, -0.0000, -9.5632, -9.2052, -6.7319,  6.6002,
          9.0449,  0.0000, -9.3713, -5.1876, -0.0000,  6.5189,  0.0000,  5.6039,
          0.0000,  0.0000,  0.0000, -0.0000,  0.0000, -0.0000],
        [ 6.6082, -7.1986,  0.0000,  7.2345, -0.0000,  6.5871,  0.0000,  0.0000,
         -8.5550, -0.0000,  6.1927,  0.0000,  0.0000,  5.5509, -6.4474, -0.0000,
  

In [50]:
# Change parameter values without affecting gradients: edits made through the .data
# attribute are not recorded in the computation graph.
# NOTE(review): this cell is not idempotent - every run adds 1 to each bias again.
for name, param in nestModule.named_parameters():
    if 'bias' in name:
        param.data += 1
        print(name, '\n', param.data)

0.net.0.bias
tensor([4., 4., 4., 4., 4., 4., 4., 4., 4., 4., 4., 4., 4., 4., 4., 4., 4., 4.,
        4., 4., 4., 4., 4., 4., 4., 4., 4., 4., 4., 4.])
1.bias
tensor([4., 4., 4., 4., 4., 4., 4., 4., 4., 4., 4., 4., 4., 4., 4., 4., 4., 4.,
        4., 4.])
2.linear.bias
tensor([4., 4., 4., 4., 4., 4., 4., 4., 4., 4., 4., 4., 4., 4., 4., 4., 4., 4.,
        4., 4.])


In [59]:
# Sharing model parameters: a parameter carries its own gradient, so during back-propagation
# the gradients from every use of a shared parameter are accumulated.
linear = nn.Linear(in_features=2, out_features=2, bias=False)
net = nn.Sequential(*([linear] * 3))
print(net)

# The shared weight is reported only once by named_parameters().
for name, param in net.named_parameters():
    nn.init.constant_(param, val=3)
    print(name, '\n', param.data)

print(net[0] is net[1])  # every position holds the very same Linear object


Sequential(
  (0): Linear(in_features=2, out_features=2, bias=False)
  (1): Linear(in_features=2, out_features=2, bias=False)
  (2): Linear(in_features=2, out_features=2, bias=False)
)
0.weight
tensor([[3., 3.],
        [3., 3.]])
True


In [69]:
# Feed a 2x2 tensor of ones through the shared-weight network, reset any stored gradients,
# back-propagate the summed output, then print the sum and the gradient of the shared weight.
x = torch.ones(2,2)
y = net(x).sum()
net.zero_grad()
y.backward()
print(y, '\n', net[0].weight.grad)

tensor(864., grad_fn=<SumBackward0>)
tensor([[216., 216.],
        [216., 216.]])


In [70]:
# ---------------------------------------------------- Custom layers built on nn.Module

# A custom layer that has no model parameters
class CenteredLayer(nn.Module):
    """Shift the input so that the mean over all of its elements becomes zero."""

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def forward(self, x):
        """Return ``x`` minus the scalar mean of all elements of ``x``."""
        mean = x.mean()
        return x.sub(mean)


In [71]:
# Apply CenteredLayer to a random 10x10 float32 tensor and print the mean-subtracted result.
layer = CenteredLayer()
input = torch.rand(10,10,dtype=torch.float32)
output = layer(input)
print(output)

tensor([[ 0.2500,  0.4736,  0.3497,  0.0024,  0.3314, -0.2915,  0.3861, -0.4774,
         -0.2092, -0.3287],
        [-0.4933, -0.3188, -0.0129,  0.2012,  0.0104,  0.3017, -0.0969,  0.2819,
          0.1212, -0.4554],
        [-0.4830, -0.2102,  0.1164,  0.0963,  0.4228,  0.3864,  0.4056,  0.4598,
         -0.5140, -0.1571],
        [ 0.0879, -0.0751,  0.1490,  0.0675,  0.2524,  0.0194, -0.5143,  0.2055,
         -0.1395,  0.1076],
        [ 0.3162, -0.1657,  0.1506,  0.4765, -0.3470, -0.0255, -0.4793, -0.4856,
          0.0706, -0.0695],
        [ 0.3854,  0.4766,  0.1097, -0.0528, -0.1431, -0.1592,  0.3191, -0.0310,
          0.3549, -0.2005],
        [-0.2625,  0.0125,  0.2340,  0.2316, -0.2667, -0.3291, -0.4060, -0.0379,
          0.0322, -0.0586],
        [-0.2046, -0.3730,  0.4627, -0.4936,  0.1201,  0.3746, -0.0205, -0.5010,
         -0.0655, -0.2492],
        [-0.0530,  0.1380,  0.1048,  0.4034, -0.3485,  0.4689,  0.2296,  0.0114,
         -0.0359, -0.4422],
        [ 0.1902,  

In [74]:
# Use CenteredLayer inside a larger model: Linear(8, 128) followed by centering.
# CenteredLayer subtracts the overall mean, so the printed mean should be close to zero.
net = nn.Sequential(nn.Linear(8, 128), CenteredLayer())
input = torch.rand(4, 8)
output = net(input)
print(output.mean().item())

-3.725290298461914e-09


In [86]:
### A custom layer that owns model parameters
# A Tensor that is an nn.Parameter is added to the module's parameter list automatically.
# Parameters can be declared with Parameter, ParameterList and ParameterDict.

class MyDense(nn.Module):
    """Custom layer demonstrating ``nn.ParameterList`` and ``nn.ParameterDict``.

    Attributes (documented here, created in ``__init__``):
        lstparams: three 4x4 weights followed by one 4x1 weight.
        dictparams: weights 'linear1' (4x4), 'linear2' (4x1) and 'linear3' (4x2).
    """

    def __init__(self):
        super(MyDense, self).__init__()
        self.lstparams = nn.ParameterList([nn.Parameter(torch.randn(4, 4)) for _ in range(3)])
        self.lstparams.append(nn.Parameter(torch.randn(4, 1)))

        self.dictparams = nn.ParameterDict({
            'linear1': nn.Parameter(torch.randn(4, 4)),
            'linear2': nn.Parameter(torch.randn(4, 1))
        })
        self.dictparams.update({'linear3': nn.Parameter(torch.randn(4, 2))})

    def forward(self, x, choice=None):
        """Multiply ``x`` (shape (n, 4)) by the layer's weights.

        Args:
            x: 2-D input tensor with 4 columns.
            choice: optional key into ``dictparams``. When given, ``x`` is multiplied by
                that single weight. When omitted, ``x`` is multiplied by every weight in
                ``lstparams`` in order, producing a tensor of shape (n, 1).

        The previous implementation chained the list weights and then every dict weight;
        the (n, 1) result of the list chain can never be multiplied by the 4-row dict
        weights, so that forward pass always raised a shape error.

        Raises:
            KeyError: if ``choice`` is not a key of ``dictparams``.
        """
        if choice is not None:
            return torch.mm(x, self.dictparams[choice])
        for weight in self.lstparams:
            x = torch.mm(x, weight)
        return x
# Build one MyDense layer, print how many entries its ParameterList and ParameterDict hold,
# then print the 'linear1' weight from the dict.
net = MyDense()
print(len(net.lstparams), '\n', len(net.dictparams.keys()))
print(net.dictparams['linear1'])

4
3
Parameter containing:
tensor([[-0.8814, -0.3411, -0.0116,  0.5605],
        [-2.9033,  0.7493,  1.9043, -0.0604],
        [ 0.7425, -0.3993, -1.3453, -1.7525],
        [ 0.2149,  0.4505,  0.5775,  0.5321]], requires_grad=True)


In [87]:
# Stack two MyDense layers in a Sequential container and print the module tree.
# Only the structure is inspected here; the network is not called on any input.
net = nn.Sequential(MyDense(), MyDense())
print(net)

Sequential(
  (0): MyDense(
    (lstparams): ParameterList(
        (0): Parameter containing: [torch.FloatTensor of size 4x4]
        (1): Parameter containing: [torch.FloatTensor of size 4x4]
        (2): Parameter containing: [torch.FloatTensor of size 4x4]
        (3): Parameter containing: [torch.FloatTensor of size 4x1]
    )
    (dictparams): ParameterDict(
        (linear1): Parameter containing: [torch.FloatTensor of size 4x4]
        (linear2): Parameter containing: [torch.FloatTensor of size 4x1]
        (linear3): Parameter containing: [torch.FloatTensor of size 4x2]
    )
  )
  (1): MyDense(
    (lstparams): ParameterList(
        (0): Parameter containing: [torch.FloatTensor of size 4x4]
        (1): Parameter containing: [torch.FloatTensor of size 4x4]
        (2): Parameter containing: [torch.FloatTensor of size 4x4]
        (3): Parameter containing: [torch.FloatTensor of size 4x1]
    )
    (dictparams): ParameterDict(
        (linear1): Parameter containing: [torch.F

In [89]:
############################# Saving and loading models
# Only layers that have learnable parameters get entries in state_dict.
# Print the key of every entry in the network's state_dict.
for key, value in net.state_dict().items():
    print(key)

0.lstparams.0
0.lstparams.1
0.lstparams.2
0.lstparams.3
0.dictparams.linear1
0.dictparams.linear2
0.dictparams.linear3
1.lstparams.0
1.lstparams.1
1.lstparams.2
1.lstparams.3
1.dictparams.linear1
1.dictparams.linear2
1.dictparams.linear3


In [93]:
# Report the CUDA devices visible to PyTorch.
print(torch.cuda.device_count())
if torch.cuda.is_available():
    # These queries raise when no CUDA device is usable, so they only run when CUDA is available.
    print(torch.cuda.current_device())
    print(torch.cuda.get_device_capability(), torch.cuda.get_device_name())
else:
    print('CUDA is not available; skipping per-device queries')

1
0
(6, 1)GeForce GTX 1070 Ti


In [96]:
# Select CUDA device 0 and move the network onto it, casting its parameters to float16.
# NOTE(review): this cell requires a CUDA-capable machine.
device = torch.device(type='cuda', index=0)
print(device)
net.to(device, dtype=torch.float16)
# The result of an operation on GPU data stays on the GPU.
# Data stored on different devices cannot be combined directly in one operation.

cuda:0


Sequential(
  (0): MyDense(
    (lstparams): ParameterList(
        (0): Parameter containing: [torch.cuda.HalfTensor of size 4x4 (GPU 0)]
        (1): Parameter containing: [torch.cuda.HalfTensor of size 4x4 (GPU 0)]
        (2): Parameter containing: [torch.cuda.HalfTensor of size 4x4 (GPU 0)]
        (3): Parameter containing: [torch.cuda.HalfTensor of size 4x1 (GPU 0)]
    )
    (dictparams): ParameterDict(
        (linear1): Parameter containing: [torch.cuda.HalfTensor of size 4x4 (GPU 0)]
        (linear2): Parameter containing: [torch.cuda.HalfTensor of size 4x1 (GPU 0)]
        (linear3): Parameter containing: [torch.cuda.HalfTensor of size 4x2 (GPU 0)]
    )
  )
  (1): MyDense(
    (lstparams): ParameterList(
        (0): Parameter containing: [torch.cuda.HalfTensor of size 4x4 (GPU 0)]
        (1): Parameter containing: [torch.cuda.HalfTensor of size 4x4 (GPU 0)]
        (2): Parameter containing: [torch.cuda.HalfTensor of size 4x4 (GPU 0)]
        (3): Parameter containing: 