In [1]:
import torch
import torch.nn as nn
import torchvision.models as models
import torch.nn.functional as F
import numpy as np
import torch.optim as optim


### 搭建模型

In [2]:
class LinearBNAC(nn.Module):
    """Fully connected building block with an optional output head.

    Despite the name, the hidden-layer pipeline is
    ``Linear -> Dropout -> BatchNorm1d -> LeakyReLU`` (dropout comes before
    the batch norm). When ``is_output`` is true the block is instead
    ``Linear -> Sigmoid`` for a single output unit, or
    ``Linear -> Softmax(dim=1)`` for several output units.

    Args:
        in_channels: Number of input features of the linear layer.
        out_channels: Number of output features of the linear layer.
        bias: Whether the linear layer has a bias term.
        dropout: Drop probability used by the hidden-layer variant only.
        is_output: Build the output head instead of a hidden layer.
    """

    def __init__(self, in_channels, out_channels, bias=True, dropout=0.3, is_output=False):
        super(LinearBNAC, self).__init__()
        # Every variant starts with the same affine projection.
        stages = [nn.Linear(in_channels, out_channels, bias=bias)]
        if not is_output:
            stages += [
                nn.Dropout(dropout),
                nn.BatchNorm1d(out_channels),
                nn.LeakyReLU(inplace=True),
            ]
        elif out_channels == 1:
            # Single unit: squash to (0, 1).
            stages.append(nn.Sigmoid())
        else:
            # Several units: normalise across the feature dimension.
            stages.append(nn.Softmax(dim=1))
        # Keep the attribute name and module order so parameter names
        # (``linear.0.weight`` ...) stay the same.
        self.linear = nn.Sequential(*stages)

    def forward(self, x):
        """Run ``x`` through the configured pipeline and return the result."""
        return self.linear(x)

In [3]:
class Model(nn.Module):
    """Small multilayer perceptron: three hidden blocks plus an output head.

    Hidden widths are fixed at 128, 64 and 32. The head is a ``LinearBNAC``
    built with ``is_output=True``, so it ends in Sigmoid when
    ``output_classes`` is 1 and in Softmax over dim 1 otherwise.

    Args:
        input_dimention: Number of features in each input row.
        output_classes: Number of output units of the head.
    """

    def __init__(self, input_dimention, output_classes=1):
        super(Model, self).__init__()
        # Attribute names and registration order are kept as-is because other
        # cells index into them (e.g. ``model.layer1.linear[0]``).
        self.layer1 = LinearBNAC(input_dimention, 128)
        self.layer2 = LinearBNAC(128, 64)
        self.layer3 = LinearBNAC(64, 32)
        self.output = LinearBNAC(32, output_classes, is_output=True)

    def forward(self, x):
        """Feed ``x`` through the hidden blocks and the head, in order."""
        for stage in (self.layer1, self.layer2, self.layer3, self.output):
            x = stage(x)
        return x
        

### 準備輸入資料、優化器、標籤資料、模型輸出

In [15]:
# Ten-way classifier over 256-dimensional inputs.
model = Model(
    input_dimention=256,
    output_classes=10,
)
# Adam over every model parameter, with L2-style weight decay.
optimizer = optim.Adam(
    model.parameters(),
    lr=1e-3,
    weight_decay=1e-3,
)

In [16]:
batch_size = 4
input_features = 256
# Random stand-in batch of shape (batch_size, input_features).
dummy_input = torch.randn(batch_size, input_features)

# Fixed class-index labels (one per row of the batch), stored as int64.
target = torch.tensor([9, 5, 4, 4], dtype=torch.long)

In [17]:
# Forward pass. The model was built with output_classes=10, so its head ends
# in Softmax(dim=1): each row of `output` is a probability distribution over
# the 10 classes, not raw logits.
output = model(dummy_input)
print(output)

tensor([[0.1308, 0.1106, 0.1113, 0.0831, 0.1228, 0.1229, 0.0762, 0.0569, 0.0602,
         0.1253],
        [0.0909, 0.0893, 0.1677, 0.0562, 0.1352, 0.1536, 0.0561, 0.0709, 0.0628,
         0.1173],
        [0.0799, 0.0681, 0.1347, 0.0490, 0.2014, 0.0863, 0.0562, 0.0624, 0.1348,
         0.1272],
        [0.0754, 0.0946, 0.1109, 0.0913, 0.1357, 0.0958, 0.0855, 0.0810, 0.1075,
         0.1224]], grad_fn=<SoftmaxBackward>)


### 計算 CrossEntropy Loss
* 請注意哪一個 Loss最適合：我們已經使用 softmax
* 因為我們有使用 dropout，並隨機產生 dummy_input，所以各位學員得到的值會與解答不同，然而步驟原理需要相同

In [25]:
from torch.nn import NLLLoss, LogSoftmax, CrossEntropyLoss

In [26]:
# The model already ends in Softmax and the loss is fed torch.log(output),
# i.e. log-probabilities. NLLLoss is the criterion that consumes
# log-probabilities directly; CrossEntropyLoss expects raw logits and would
# apply log-softmax a second time.
criterion = NLLLoss()

In [27]:
# `output` holds softmax probabilities, so convert them to log-probabilities
# before handing them to the criterion.
log_probs = output.log()
loss = criterion(log_probs, target)
print(loss)

tensor(1.8874, grad_fn=<NllLossBackward>)


### 完成 back propagation 並更新權重

In [28]:
# Backpropagate from the scalar loss; this fills in `.grad` on the model
# parameters (inspected in the next cell). Weights are not changed yet.
loss.backward()

In [29]:
# Inspect the first Linear layer after backward(): weights unchanged so far,
# gradient now populated.
first_linear = model.layer1.linear[0]
print('weight : {}'.format(first_linear.weight))
print('\n')
print('grad : {}'.format(first_linear.weight.grad))

weight : Parameter containing:
tensor([[ 0.0561, -0.0243, -0.0138,  ...,  0.0052,  0.0050, -0.0564],
        [-0.0596, -0.0209, -0.0368,  ..., -0.0110, -0.0238,  0.0553],
        [-0.0248,  0.0382,  0.0375,  ...,  0.0294,  0.0419, -0.0398],
        ...,
        [ 0.0136, -0.0296,  0.0268,  ...,  0.0523,  0.0272,  0.0394],
        [ 0.0623, -0.0134, -0.0263,  ..., -0.0568, -0.0504, -0.0492],
        [-0.0460,  0.0328,  0.0032,  ...,  0.0151,  0.0242,  0.0274]],
       requires_grad=True)


grad : tensor([[ -1.0324, -11.3790,   5.2907,  ...,   7.2664,  -3.2597,  11.6453],
        [  1.3716,  -1.2888,  -0.0212,  ...,   0.9908,  -0.3720,  -0.7623],
        [  0.5514,  -0.3592,   0.0582,  ...,   0.0372,  -0.0315,  -0.1132],
        ...,
        [ -0.8486,   0.7287,  -0.3879,  ...,   0.0782,   0.3995,  -0.6859],
        [  0.5797,  -0.6876,  -0.1785,  ...,   0.7804,   0.0864,  -0.9109],
        [  0.6162,   0.3066,  -0.1558,  ...,  -0.3587,  -0.2968,  -0.3819]])


In [30]:
# Apply one Adam update using the gradients currently stored in `.grad`.
# This changes the weights; it does not clear the gradients.
optimizer.step()

In [31]:
# Inspect the first Linear layer after step(): weights have moved, while the
# stored gradient is still the one computed by backward().
first_linear = model.layer1.linear[0]
print('weight : {}'.format(first_linear.weight))
print('\n')
print('grad : {}'.format(first_linear.weight.grad))

weight : Parameter containing:
tensor([[ 0.0571, -0.0233, -0.0148,  ...,  0.0042,  0.0060, -0.0574],
        [-0.0606, -0.0199, -0.0358,  ..., -0.0120, -0.0228,  0.0563],
        [-0.0258,  0.0392,  0.0365,  ...,  0.0284,  0.0429, -0.0388],
        ...,
        [ 0.0146, -0.0306,  0.0278,  ...,  0.0513,  0.0262,  0.0404],
        [ 0.0613, -0.0124, -0.0253,  ..., -0.0578, -0.0514, -0.0482],
        [-0.0470,  0.0318,  0.0042,  ...,  0.0161,  0.0252,  0.0284]],
       requires_grad=True)


grad : tensor([[ -1.0324, -11.3790,   5.2907,  ...,   7.2664,  -3.2597,  11.6453],
        [  1.3716,  -1.2888,  -0.0212,  ...,   0.9908,  -0.3720,  -0.7623],
        [  0.5514,  -0.3592,   0.0582,  ...,   0.0372,  -0.0315,  -0.1132],
        ...,
        [ -0.8486,   0.7287,  -0.3879,  ...,   0.0782,   0.3995,  -0.6859],
        [  0.5797,  -0.6876,  -0.1785,  ...,   0.7804,   0.0864,  -0.9109],
        [  0.6162,   0.3066,  -0.1558,  ...,  -0.3587,  -0.2968,  -0.3819]])


### 清空 gradient

In [32]:
# Reset the stored gradients so the next backward() does not accumulate on
# top of the old ones. In a training loop this normally runs once per
# iteration, before backward().
# NOTE(review): the recorded output below shows zero-filled tensors; newer
# PyTorch releases reportedly default to set_to_none=True, in which case
# `.grad` would print as None instead — confirm against the installed version.
optimizer.zero_grad()

In [33]:
# Inspect the first Linear layer after zero_grad(): weights keep their
# updated values, gradient has been reset.
first_linear = model.layer1.linear[0]
print('weight : {}'.format(first_linear.weight))
print('\n')
print('grad : {}'.format(first_linear.weight.grad))

weight : Parameter containing:
tensor([[ 0.0571, -0.0233, -0.0148,  ...,  0.0042,  0.0060, -0.0574],
        [-0.0606, -0.0199, -0.0358,  ..., -0.0120, -0.0228,  0.0563],
        [-0.0258,  0.0392,  0.0365,  ...,  0.0284,  0.0429, -0.0388],
        ...,
        [ 0.0146, -0.0306,  0.0278,  ...,  0.0513,  0.0262,  0.0404],
        [ 0.0613, -0.0124, -0.0253,  ..., -0.0578, -0.0514, -0.0482],
        [-0.0470,  0.0318,  0.0042,  ...,  0.0161,  0.0252,  0.0284]],
       requires_grad=True)


grad : tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]])
