In [2]:
import torch
from torch import nn
from torch.nn import functional as F

In [3]:
# Two-layer MLP built from stock layers: 20 inputs -> 256 hidden units (ReLU) -> 10 outputs.
net = nn.Sequential(nn.Linear(20,256), nn.ReLU(), nn.Linear(256,10))

# torch.rand(2,20) creates a 2x20 tensor holding 40 random numbers drawn
# uniformly from the interval [0, 1).
X = torch.rand(2,20)     # input

net(X)   # output

tensor([[ 0.0574,  0.3315,  0.1166,  0.0199,  0.3483, -0.1876, -0.1673,  0.0476,
         -0.1516, -0.1820],
        [-0.1105,  0.1474,  0.0903, -0.0106,  0.4121, -0.1440, -0.1650,  0.0741,
         -0.1555, -0.1497]], grad_fn=<AddmmBackward>)

In [4]:
class MyMLP(nn.Module):
    """MLP written as a custom module: Linear(20, 256) -> ReLU -> Linear(256, 10)."""
    def __init__(self):
        super().__init__()
        # Registered in this order: hidden layer first, then the output layer.
        self.hidden = nn.Linear(20, 256)
        self.out = nn.Linear(256, 10)
    def forward(self, X):
        """Run ``X`` through the hidden layer, the ReLU and the output layer."""
        hidden_activations = F.relu(self.hidden(X))
        return self.out(hidden_activations)

# Instantiate the custom MLP and run it on a fresh random batch of two 20-feature rows.
net = MyMLP()
X = torch.rand(2,20)
net(X)

tensor([[-0.1130, -0.2605, -0.0028, -0.1315,  0.2831, -0.0219, -0.1388, -0.3198,
          0.0520,  0.1621],
        [-0.1664, -0.2344, -0.0618, -0.1139,  0.2405,  0.1695, -0.1188, -0.3584,
          0.0692,  0.0633]], grad_fn=<AddmmBackward>)

In [8]:
class MySequential(nn.Module):
    """Minimal re-implementation of ``nn.Sequential``.

    Args:
        *args: child modules; they are applied to the input in the order given.
    """
    def __init__(self,*args):
        super().__init__()
        for idx, block in enumerate(args):
            print(block)     # echo each child as it is registered
            # The module registry must be keyed by *string* names: named_parameters()
            # and state_dict() build prefixes such as '0.weight' from these keys, so
            # keying by the module object itself breaks them. Use the position.
            self._modules[str(idx)] = block
    def forward(self,X):
        """Feed ``X`` through every registered child in insertion order."""
        for block in self._modules.values():
            X = block(X)
        return X

# Build the hand-rolled container from a Linear -> ReLU -> Linear stack and call it.
# NOTE(review): X is not defined in this cell; it must already exist in the kernel.
net = MySequential(nn.Linear(20,256),nn.ReLU(),nn.Linear(256,10))
net(X)

Linear(in_features=20, out_features=256, bias=True)
ReLU()
Linear(in_features=256, out_features=10, bias=True)


tensor([[ 0.1493, -0.0089,  0.2427, -0.0798,  0.0935,  0.0015,  0.1714,  0.1016,
         -0.2196, -0.0520],
        [ 0.0919, -0.0402,  0.1483, -0.0247, -0.0209,  0.0968,  0.0856,  0.1403,
         -0.2901, -0.1206]], grad_fn=<AddmmBackward>)

In [9]:
class NestMLP(nn.Module):
    """Module that nests an ``nn.Sequential`` body (20 -> 64 -> 32) under a Linear(32, 16) head."""
    def __init__(self):
        super().__init__()
        body_layers = [
            nn.Linear(20, 64), nn.ReLU(),
            nn.Linear(64, 32), nn.ReLU(),
        ]
        self.net = nn.Sequential(*body_layers)
        self.linear = nn.Linear(32, 16)
    def forward(self,X):
        """Apply the nested body, then the linear head."""
        features = self.net(X)
        return self.linear(features)

# Modules compose freely: a NestMLP, a plain Linear(16, 20) and a MyMLP chained together.
chimera = nn.Sequential(NestMLP(),nn.Linear(16,20),MyMLP())
X = torch.rand(2,20)
chimera(X)

tensor([[ 0.0210,  0.1105,  0.0292,  0.0055, -0.1473, -0.1418,  0.0247, -0.0770,
          0.0096, -0.0914],
        [ 0.0166,  0.1126,  0.0327, -0.0024, -0.1452, -0.1435,  0.0254, -0.0761,
          0.0077, -0.0984]], grad_fn=<AddmmBackward>)

In [13]:
# Small MLP (4 -> 8 -> 1) and a random 2x4 batch, used to demonstrate parameter access.
net = nn.Sequential(nn.Linear(4,8), nn.ReLU(), nn.Linear(8,1))
X = torch.rand(size=(2,4))
net(X)

tensor([[0.1520],
        [0.1294]], grad_fn=<AddmmBackward>)

In [22]:
# Inspect the parameters of the layer at index 2 of the Sequential.
print(net[2].state_dict())      # mapping of parameter name -> tensor for this layer
print()
print(type(net[2].bias))        # the bias is an nn.Parameter
print()
print(net[2].bias)              # parameter repr: value plus requires_grad flag
print()
print(net[2].bias.data)         # the underlying tensor value only
print()
# .grad stays None until a backward pass populates it. Test for None by
# identity (`is`), not with `==`.
print(net[2].weight.grad is None)

OrderedDict([('weight', tensor([[-0.0261, -0.2381,  0.2371,  0.3306,  0.0700,  0.2761, -0.1394,  0.1811]])), ('bias', tensor([0.0377]))])

<class 'torch.nn.parameter.Parameter'>

Parameter containing:
tensor([0.0377], requires_grad=True)

tensor([0.0377])

True


In [23]:
# (name, shape) pairs for the parameters of the first layer only.
print(*((pname, p.shape) for pname, p in net[0].named_parameters()))
print()
# Same listing for the whole network; names are prefixed with the layer index.
print(*((pname, p.shape) for pname, p in net.named_parameters()))
print()
# A parameter can also be fetched by its qualified name from the state dict.
print(net.state_dict()['2.bias'].data)

('weight', torch.Size([8, 4])) ('bias', torch.Size([8]))

('0.weight', torch.Size([8, 4])) ('0.bias', torch.Size([8])) ('2.weight', torch.Size([1, 8])) ('2.bias', torch.Size([1]))

tensor([0.0377])


In [26]:
def block1():
    """Return a fresh Linear(4, 8) -> ReLU -> Linear(8, 4) -> ReLU block."""
    layers = (
        nn.Linear(4, 8),
        nn.ReLU(),
        nn.Linear(8, 4),
        nn.ReLU(),
    )
    return nn.Sequential(*layers)

def block2():
    """Return a Sequential holding four independent ``block1()`` instances.

    The children are registered under the names 'block 0' .. 'block 3'.
    """
    container = nn.Sequential()
    for idx in range(4):
        child_name = f'block {idx}'
        container.add_module(child_name, block1())
    return container

# Put a Linear(4, 1) head after the stacked blocks returned by block2().
# NOTE(review): X is not defined in this cell; it must already exist in the kernel.
rgnet = nn.Sequential(block2(), nn.Linear(4,1))
rgnet(X)

tensor([[0.1841],
        [0.1841]], grad_fn=<AddmmBackward>)

In [27]:
# Print the module tree to show how the nested children are named.
print(rgnet)

Sequential(
  (0): Sequential(
    (block 0): Sequential(
      (0): Linear(in_features=4, out_features=8, bias=True)
      (1): ReLU()
      (2): Linear(in_features=8, out_features=4, bias=True)
      (3): ReLU()
    )
    (block 1): Sequential(
      (0): Linear(in_features=4, out_features=8, bias=True)
      (1): ReLU()
      (2): Linear(in_features=8, out_features=4, bias=True)
      (3): ReLU()
    )
    (block 2): Sequential(
      (0): Linear(in_features=4, out_features=8, bias=True)
      (1): ReLU()
      (2): Linear(in_features=8, out_features=4, bias=True)
      (3): ReLU()
    )
    (block 3): Sequential(
      (0): Linear(in_features=4, out_features=8, bias=True)
      (1): ReLU()
      (2): Linear(in_features=8, out_features=4, bias=True)
      (3): ReLU()
    )
  )
  (1): Linear(in_features=4, out_features=1, bias=True)
)


In [33]:
def init_normal(m):
    """Initializer for ``Module.apply``: Gaussian weights, zero bias.

    Args:
        m: a module; anything whose type is not exactly ``nn.Linear`` is left untouched.
    """
    if type(m) is not nn.Linear:
        return
    # The trailing underscore marks an in-place op: the tensor is overwritten
    # directly, so the return value is not needed.
    nn.init.normal_(m.weight, mean=0, std=0.01)   # weights ~ N(0, 0.01^2)
    nn.init.zeros_(m.bias)                        # bias set to 0

net = nn.Sequential(nn.Linear(4,8), nn.ReLU(), nn.Linear(8,1))
X = torch.rand(size=(2,4))
net(X)

net.apply(init_normal)      # visit every submodule of the network and run init_normal on it
# Show the first weight row and the first bias entry of layer 0 after initialization.
net[0].weight.data[0],net[0].bias.data[0]

(tensor([-0.0028, -0.0195,  0.0069, -0.0052]), tensor(0.))

In [35]:
def init_constant(m):
    """Initializer for ``Module.apply``: every weight set to 1, bias set to 0.

    Args:
        m: a module; anything whose type is not exactly ``nn.Linear`` is left untouched.
    """
    if type(m) is not nn.Linear:
        return
    nn.init.constant_(m.weight, 1)
    nn.init.zeros_(m.bias)

net = nn.Sequential(nn.Linear(4,8), nn.ReLU(), nn.Linear(8,1))
X = torch.rand(size=(2,4))
net(X)

# Run init_constant on every submodule, then show layer 0's first weight row and first bias entry.
net.apply(init_constant)
net[0].weight.data[0], net[0].bias.data[0]

(tensor([1., 1., 1., 1.]), tensor(0.))

In [38]:
def xavier(m):
    """Apply Xavier-uniform initialization to the weight of an ``nn.Linear``.

    Modules whose type is not exactly ``nn.Linear`` are ignored; the bias is not touched.
    """
    if type(m) is not nn.Linear:
        return
    nn.init.xavier_uniform_(m.weight)

def init_42(m):
    """Fill the weight of an ``nn.Linear`` with the constant 42.

    Modules whose type is not exactly ``nn.Linear`` are ignored; the bias is not touched.
    """
    if type(m) is not nn.Linear:
        return
    nn.init.constant_(m.weight, 42)

net = nn.Sequential(nn.Linear(4,8), nn.ReLU(), nn.Linear(8,1))
X = torch.rand(size=(2,4))
net(X)

# Different initializers can be applied to individual layers instead of the whole net.
net[0].apply(xavier)
net[2].apply(init_42)

print(net[0].weight.data[0])    # first weight row of layer 0 (Xavier-initialized)
print(net[2].weight.data)       # all weights of layer 2 (filled with 42)

tensor([ 0.6672, -0.2456, -0.2529,  0.2186])
tensor([[42., 42., 42., 42., 42., 42., 42., 42.]])


In [41]:
def my_init(m):
    """Custom initializer: uniform weights in (-10, 10), with small magnitudes zeroed.

    Only modules whose type is exactly ``nn.Linear`` are affected. For those, the
    name and shape of the first registered parameter are printed, the weight is
    filled from U(-10, 10), and every entry with absolute value below 5 is set to 0.
    """
    if type(m) is not nn.Linear:
        return
    first_name, first_param = next(iter(m.named_parameters()))
    print("Init", first_name, first_param.shape)
    nn.init.uniform_(m.weight, -10, 10)
    # Boolean mask acts as 1 where |w| >= 5 and as 0 elsewhere when multiplied in.
    keep_mask = m.weight.data.abs() >= 5
    m.weight.data *= keep_mask

net = nn.Sequential(nn.Linear(4,8), nn.ReLU(), nn.Linear(8,1))
X = torch.rand(size=(2,4))
net(X)

# Run the custom initializer on every submodule, then show the first two weight rows of layer 0.
net.apply(my_init)
net[0].weight[:2]

Init weight torch.Size([8, 4])
Init weight torch.Size([1, 8])


tensor([[0.0000, 0.0000, 7.5092, -0.0000],
        [0.0000, 5.3086, 7.6778, 0.0000]], grad_fn=<SliceBackward>)

In [42]:
# Parameters can be edited directly through .data.
# NOTE: this mutates the weights in place, so re-running the cell adds 1 again.
net[0].weight.data[:] += 1     # add 1 to every weight
net[0].weight.data[0, 0] = 42     # set the first element of the first row to 42
net[0].weight.data[0]      # show the first row

tensor([42.0000,  1.0000,  8.5092,  1.0000])

In [43]:
# Parameter sharing: the very same Linear object is placed at positions 2 and 4.
shared = nn.Linear(8,8)
net = nn.Sequential(
    nn.Linear(4,8),nn.ReLU(),shared,nn.ReLU(),shared,nn.ReLU(),nn.Linear(8,1)
)
# NOTE(review): X is not defined in this cell; it must already exist in the kernel.
net(X)
print(net[2].weight.data[0] == net[4].weight.data[0])

# Changing the weight through one position is visible through the other,
# because both positions refer to one module.
net[2].weight.data[0,0] = 100
print(net[2].weight.data[0] == net[4].weight.data[0])

tensor([True, True, True, True, True, True, True, True])
tensor([True, True, True, True, True, True, True, True])


In [45]:
class CenteredLayer(nn.Module):
    """Parameter-free layer that subtracts the mean of the whole input tensor."""
    def forward(self, X):
        """Return ``X`` shifted so that its overall mean is zero."""
        overall_mean = X.mean()
        return X - overall_mean

# Sanity check: centering the values 1..5 (mean 3) should give -2..2.
layer = CenteredLayer()
layer(torch.FloatTensor([1,2,3,4,5]))

tensor([-2., -1.,  0.,  1.,  2.])

In [46]:
# The custom layer can be placed inside nn.Sequential alongside built-in layers.
net = nn.Sequential(nn.Linear(8,128), CenteredLayer())

Y = net(torch.rand(4,8))
Y.mean()    # expected to be ~0, up to floating-point rounding

tensor(-7.4506e-09, grad_fn=<MeanBackward0>)