PyTorch Cookbook（常用代码段整理合集）: https://zhuanlan.zhihu.com/p/59205847

In [1]:
import collections 
import os 
import shutil
import tqdm
import numpy as np
import PIL.Image
import torch
import torchvision
import torch.nn as nn
import torch

# 基础配置

In [1]:
# Check the PyTorch, CUDA and cuDNN versions and the name of GPU 0
print(torch.__version__) 
print(torch.version.cuda)
print(torch.backends.cudnn.version())
torch.cuda.get_device_name(0)

# 1. Basic configuration
# Update PyTorch
#conda update pytorch torchvision -c pytorch

# Fix the random seeds
torch.manual_seed(0)
torch.cuda.manual_seed(0)

# Run the program on specific GPU cards
# From the command line:
# CUDA_VISIBLE_DEVICES=0,1 python train.py

# From code:
# NOTE(review): presumably this has to run before the first CUDA call to take
# effect, yet torch.cuda.get_device_name(0) above already touches CUDA - confirm.
os.environ['CUDA_VISIBLE_DEVICES'] = '0,1'

# Check whether CUDA is supported
torch.cuda.is_available()
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

# Enable cuDNN benchmark mode
# Benchmark mode speeds up computation, but because the computation involves
# randomness, the network's forward results differ slightly between runs
torch.backends.cudnn.benchmark = True
# To avoid such fluctuations, set
torch.backends.cudnn.deterministic = True

# Release GPU memory
# Sometimes GPU memory is not released promptly after a run is stopped with
# control-c, and it has to be cleared manually
torch.cuda.empty_cache()
# On the command line, first find the program's PID with ps, then end the process with kill
# ps aux | grep python
# kill -9 [pid]
# Or directly reset the GPU that was not cleared
# nvidia-smi --gpu-reset -i [gpu_id]


NameError: name 'torch' is not defined

# 张量处理
## 定义张量

    LongTensor; tensor --> torch.int64
    Tensor; FloatTensor --> torch.float32
    DoubleTensor --> torch.float64

In [23]:
# 1-D tensor built from Python ints: report its class, dtype and device.
points = torch.tensor([1, 2, 3, 4])
for described in (type(points), points.dtype, points.device):
    print(described)

# 2-D tensor built with the capital-T constructor (float32 in the output shown here).
points = torch.Tensor([[1, 2], [3, 4]])
print(points.dtype)



<class 'torch.Tensor'>
torch.int64
cpu
torch.float32


## 内存（Storage）
     = 赋值：不拷贝数据，只是给同一个张量对象再绑定一个名字
     .clone() 深拷贝（分配新的内存）
     .t() 转置(共享一块内存，只是stride变了)

In [51]:
# Inspect how the 2-D `points` tensor is laid out in its underlying storage.
points_storage = points.storage()
offset = points[0, 1].storage_offset()  # position of element [0, 1] inside the storage
print(offset)
stride = points.stride()
print(stride)  # (row stride, column stride)

# Plain assignment binds a second name to the very same tensor object.
points_t = points
print(id(points_t) == id(points))

# clone() returns a different tensor object.
points_t = points.clone()
print(id(points_t) == id(points))

# t() does not allocate new memory: the transposed tensor shares the same
# storage and only the strides change.
# Compare the storages' data pointers rather than id(): each .storage() call
# builds a temporary Python wrapper, so comparing the id() of two such
# temporaries says nothing about whether the underlying memory is shared.
points_t = points.t()
print(points_t.storage().data_ptr() == points.storage().data_ptr())

1
(2, 1)
True
False
True


## 处理

In [105]:
# Basic tensor information
tensor = torch.zeros(2,1,3)
tensor.type()
tensor.size()
tensor.dim()

# Data type conversion
# set default tensor type, float in pytorch is much faster than double
torch.set_default_tensor_type(torch.FloatTensor)
# type conversions
tensor = tensor.cuda()
tensor = tensor.cpu()
tensor = tensor.float()
tensor = tensor.long()

# Conversion between torch.Tensor and np.ndarray
# PyTorch tensors use the N×D×H×W order by default with values in [0, 1],
# so transposition and normalization are needed.
ndarray = tensor.cpu().numpy()

# NOTE(review): `device` is not defined in this cell; presumably it comes from
# the basic-configuration cell - confirm.
tensor = torch.from_numpy(ndarray.astype(np.float32)).to(device)
tensor = torch.from_numpy(ndarray).float()
tensor = torch.from_numpy(ndarray.copy()).float() # if ndarray has negative stride like np.array([1,2,3])[::-1]

# torch.Tensor -> PIL.Image(H×W×D)
image = PIL.Image.fromarray(torch.clamp(tensor*255, min=0, max=255).byte().permute(1, 2, 0).cpu().numpy())
image = torchvision.transforms.functional.to_pil_image(tensor)

# PIL.Image(H×W×D) -> torch.Tensor
tensor = torch.from_numpy(np.asarray(PIL.Image.open("leslie2.jpg"))).permute(2, 0, 1).float() / 255.
tensor = torchvision.transforms.functional.to_tensor(PIL.Image.open("leslie2.jpg"))

# Conversion between np.ndarray and PIL.Image
# np.ndarray -> PIL.Image
ndarray = np.random.random((2,2))
# np.random.random yields floats in [0, 1); scale to [0, 255] before the cast,
# otherwise every value truncates to 0 and the image is entirely black.
image = PIL.Image.fromarray((ndarray * 255).astype(np.uint8)) # uint(unsigned int)
# PIL.Image -> np.ndarray
ndarray = np.asarray(PIL.Image.open("leslie.jpg"))

# Extract the value from a tensor that holds a single element
# This is especially useful for tracking the loss during training; otherwise
# the computation graph keeps accumulating and GPU memory usage keeps growing.
tensor = torch.zeros(1)
value = tensor.item()

# Reshaping: often used to feed convolutional features into a fully connected
# layer. Compared with torch.view, torch.reshape automatically handles
# non-contiguous input tensors.
# tensor = torch.reshape(tensor, shape)

# Shuffle
tensor = torch.arange(20).reshape((1, 1, 4, 5))
# print(tensor)
#tensor = tensor[torch.randperm(tensor.size(0))] # shuffle the first dimension
#print(tensor) # size == tensor.size(0)

# Horizontal flip: PyTorch does not support negative-step slicing such as
# tensor[::-1]; a horizontal flip can be done with tensor indexing instead
# assume tensor has shape n*d*h*w
# print(torch.arange(tensor.size(3)-1, -1, -1).long())
tensor = tensor[:, :, :, torch.arange(tensor.size(3)-1, -1, -1).long()]
# print(tensor)

# Copying tensors: there are three ways to copy, for different needs.
# Operation                 |  New/Shared memory | Still in computation graph |
tensor.clone()            # |        New         |          Yes               |
tensor.detach()           # |      Shared        |          No                |
tensor.detach().clone()   # |        New         |          No                |

# Concatenating tensors
# torch.cat joins along an existing dimension, whereas torch.stack adds a new one.
# tensor_2 has to exist before torch.stack uses it; its definition used to be
# commented out, which made the stack call raise NameError.
tensor_2 = tensor.detach()
#tensor = torch.cat([tensor, tensor_2], dim=0)
#print(tensor.size())
tensor_3 = torch.stack([tensor, tensor_2])
# print(tensor_3.size())

# Convert integer labels to one-hot encoding
# Labels in PyTorch start from 0 by default
tensor = torch.arange(10)
# print(tensor)
N = tensor.size(0)
one_hot = torch.zeros(N, 10).long()
# print(torch.unsqueeze(tensor, dim=1))
one_hot.scatter_(dim=1, index=torch.unsqueeze(tensor, dim=1), src=torch.ones(N, 10).long())

# Get non-zero / zero elements
tensor = tensor *2
# print(tensor==0) # tensor([1, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=torch.uint8)
# print(torch.nonzero(tensor))     # Index of non-zero elements
# print(torch.nonzero(tensor == 0))  # Index of zero elements
torch.nonzero(tensor).size(0)
torch.nonzero(tensor == 0).size(0)

tensor = tensor.unsqueeze(dim=1)
tensor1 = tensor.clone()
# Check whether two tensors are equal
torch.allclose(tensor1, tensor) # float tensor
torch.equal(tensor1, tensor)

tensor1 = tensor1.permute(1,0)

# Matrix multiplication
# Matrix multiplication: (m*n) * (n*p) -> (m*p).
result = torch.mm(tensor1, tensor) 
tensor, tensor1 = tensor.unsqueeze(dim=0), tensor1.unsqueeze(dim=0)
# print(tensor.size(), tensor1.size())

# Batch matrix multiplication: (b*m*n) * (b*n*p) -> (b*m*p).
result = torch.bmm(tensor1, tensor)

# Element-wise multiplication
# result = tensor1 * tensor

# Pairwise Euclidean distances between two sets of points
# X1 is of shape m*d, X2 is of shape n*d.
np.random.seed(0)
X1, X2 = torch.from_numpy(np.random.random((4, 5))).float(), torch.from_numpy(np.random.random((4, 5))).float()
dist = torch.sqrt(torch.sum((X1[:,None,:] - X2) ** 2, dim=2))
print(dist)
X1[:,None,:], X2 # indexing with [:, None, :] inserts a new dimension

tensor([[-0.3028,  0.1709,  0.7531, -0.6369,  0.6141],
        [ 0.1843,  0.6579,  1.2402, -0.1499,  1.1012],
        [-0.2528,  0.2208,  0.8031, -0.5869,  0.6641],
        [-0.5499, -0.0763,  0.5060, -0.8841,  0.3670]])


(tensor([[[0.5488, 0.7152, 0.6028, 0.5449, 0.4237]],
 
         [[0.6459, 0.4376, 0.8918, 0.9637, 0.3834]],
 
         [[0.7917, 0.5289, 0.5680, 0.9256, 0.0710]],
 
         [[0.0871, 0.0202, 0.8326, 0.7782, 0.8700]]]),
 tensor([[0.9786, 0.7992, 0.4615, 0.7805, 0.1183],
         [0.6399, 0.1434, 0.9447, 0.5218, 0.4147],
         [0.2646, 0.7742, 0.4562, 0.5684, 0.0188],
         [0.6176, 0.6121, 0.6169, 0.9437, 0.6818],
         [0.3595, 0.4370, 0.6976, 0.0602, 0.6668]]))

## numel
    .numel() 返回元素个数

In [33]:
# Count the elements of a small 1-D tensor and print the count.
points = torch.tensor([1, 2, 3, 4])
num = torch.numel(points)
print(num)

4


# 模型定义（不能运行）

# Network
## Sequential

In [1]:
import torch.nn as nn
from collections import OrderedDict

# Anonymous layers: a Linear(100 -> 128) followed by a ReLU.
model = nn.Sequential(nn.Linear(100, 128), nn.ReLU())

# Named layers: the same stack, built from an OrderedDict so that each child
# carries an explicit name ('linear', 'relu').
named_layers = OrderedDict()
named_layers['linear'] = nn.Linear(100, 128)
named_layers['relu'] = nn.ReLU()
model = nn.Sequential(named_layers)

## Embedding

In [7]:
# nn.Embedding(num_embeddings, embedding_dim)
#   num_embeddings (int): size of the dictionary of embeddings
#   embedding_dim (int): size of each embedding vector
embedding = nn.Embedding(num_embeddings=10, embedding_dim=3)
inputs = torch.tensor([[1, 2, 3, 4], [5, 6, 7, 8]], dtype=torch.long)
embedding(inputs)  # every index is replaced by its 3-element row of the embedding table


tensor([[[-1.6299, -0.1615, -1.0212],
         [-0.5135,  1.4760, -0.0193],
         [-1.4137,  0.9403,  0.8197],
         [-1.1023,  1.1103,  1.6325]],

        [[ 0.5707, -2.1696, -0.7352],
         [ 0.7828,  0.4089,  1.1962],
         [-0.3928,  0.3144,  0.0257],
         [-1.9752, -0.5378, -0.3947]]], grad_fn=<EmbeddingBackward>)

## model info(#params, structure)

### named_parameters() and parameters()
    [name, parameters] = model.named_parameters()
    parameters = model.parameters()
        # type(parameters) <class 'torch.nn.parameter.Parameter'>
        # attr: requires_grad, shape, numel, mean, std

In [48]:
# Show the name, element count, shape and Python type of the first parameter.
# parameters() and named_parameters() were zipped in lockstep, so the object
# from parameters() is the `p` of the (name, p) pair; iterating once is enough.
for n, p in model.named_parameters():
    x = p  # x is the weights
    summary = "x is the weight, its name: {0}, its parameters: p, its num of paramters# {1}, shape: {2}".format(n, p.numel(), p.shape)
    print(summary)
    print(type(x))
    break

x is the weight, its name: inconv.UConv.0.weight, its parameters: p, its num of paramters# 1728, shape: torch.Size([64, 3, 3, 3])
<class 'torch.nn.parameter.Parameter'>


In [2]:
def model_info(model):
    """Print a per-parameter table and a summary line for a PyTorch model.

    One row is printed for every entry of ``model.named_parameters()``: index,
    name (with any ``'module_list.'`` substring removed), ``requires_grad``
    flag, element count, shape, mean and standard deviation.  The last line
    reports the number of parameter tensors, the total number of elements and
    the number of elements that require gradients.

    Args:
        model: object exposing ``named_parameters()`` (e.g. an ``nn.Module``).

    Returns:
        None. Everything is written to stdout.
    """
    named = list(model.named_parameters())
    n_p = sum(p.numel() for _, p in named)  # number parameters
    n_g = sum(p.numel() for _, p in named if p.requires_grad)  # number gradients
    print('\n%5s %50s %9s %12s %20s %12s %12s' % ('layer', 'name', 'gradient', 'parameters', 'shape', 'mu', 'sigma'))
    for i, (name, p) in enumerate(named):
        name = name.replace('module_list.', '')
        print('%5g %50s %9s %12g %20s %12.3g %12.3g' % (
            i, name, p.requires_grad, p.numel(), list(p.shape), p.mean(), p.std()))
    # Use len() instead of the loop index: for a model without parameters the
    # loop never runs and the index would be undefined (NameError).
    print('Model Summary: %g layers, %g parameters, %g gradients\n' % (len(named), n_p, n_g))

## UNet

In [282]:
import torch.nn as nn
import torch.nn.functional as F
from torch.nn import Sequential as S
"""
clear UNet implementation.
"""
class UNetConv(nn.Module):
    """
    Double convolution block: (3x3 conv -> batch norm -> ReLU) applied twice.

    Both convolutions use padding 1 and no bias; the channel count goes
    c_in -> c_out -> c_out.
    """
    def __init__(self, c_in, c_out):
        super(UNetConv, self).__init__()
        stages = []
        # The first stage maps c_in -> c_out, the second keeps c_out.
        for channels in (c_in, c_out):
            stages += [
                nn.Conv2d(channels, c_out, kernel_size=3, padding=1, bias=False),
                nn.BatchNorm2d(c_out),
                nn.ReLU(inplace=True),
            ]
        self.UConv = nn.Sequential(*stages)

    def forward(self, x):
        """Run the input through the six-layer stack."""
        return self.UConv(x)

class Up(nn.Module):
    """
    Decoder stage: upsample x1, pad it to the size of x2, concatenate, double conv.
    (implemented by https://github.com/milesial/Pytorch-UNet)
    """
    def __init__(self, c_in, c_out, bilinear=True):
        super(Up, self).__init__()

        # Bilinear upsampling carries no weights; the alternative is a
        # stride-2 transposed convolution over c_in // 2 channels.
        half = c_in // 2
        self.up = (
            nn.Upsample(scale_factor=2, mode='bilinear', align_corners=True)
            if bilinear
            else nn.ConvTranspose2d(half, half, kernel_size=2, stride=2)
        )
        self.conv = UNetConv(c_in, c_out)

    def forward(self, x1, x2):
        """Upsample ``x1``, align it with ``x2`` and convolve ``cat([x2, x1])``."""
        upsampled = self.up(x1)
        # Size gap on dims 2 and 3 (used here as height and width).
        gap_h = x2.size()[2] - upsampled.size()[2]
        gap_w = x2.size()[3] - upsampled.size()[3]
        left, top = gap_w // 2, gap_h // 2
        # F.pad takes the last dimension first: [left, right, top, bottom].
        upsampled = F.pad(upsampled, [left, gap_w - left, top, gap_h - top])
        # if you have padding issues, see
        # https://github.com/HaiyongJiang/U-Net-Pytorch-Unstructured-Buggy/commit/0e854509c2cea854e247a9c615f175f76fbb2e3a
        # https://github.com/xiaopeng-liao/Pytorch-UNet/commit/8ebac70e633bac59fc22bb5195e513d5832fb3bd
        return self.conv(torch.cat([x2, upsampled], dim=1))

class UNet(nn.Module):
    """U-Net with four max-pool down stages and four ``Up`` stages.

    Channel widths are multiples of a base width of 16; a final 1x1
    convolution maps the base width to ``n_classes`` output channels.
    """
    def __init__(self, c_in, n_classes, bilinear=True):
        super(UNet, self).__init__()

        c_base = 16
        w1, w2, w4, w8 = c_base, c_base*2, c_base*4, c_base*8
        self.inconv = UNetConv(c_in, w1)
        self.down_1 = S(nn.MaxPool2d(2), UNetConv(w1, w2))
        self.down_2 = S(nn.MaxPool2d(2), UNetConv(w2, w4))
        self.down_3 = S(nn.MaxPool2d(2), UNetConv(w4, w8))
        self.down_4 = S(nn.MaxPool2d(2), UNetConv(w8, w8))
        # Each Up stage receives the skip tensor concatenated with the
        # upsampled tensor, hence the doubled input widths.
        self.up_1 = Up(w8*2, w4, bilinear)
        self.up_2 = Up(w4*2, w2, bilinear)
        self.up_3 = Up(w2*2, w1, bilinear)
        self.up_4 = Up(w1*2, w1, bilinear)
        self.outconv = nn.Conv2d(w1, n_classes, kernel_size=1)

    def forward(self, x):
        """Return the per-class logits produced by the final 1x1 convolution."""
        # Encoder: keep every intermediate output for the skip connections.
        skips = [self.inconv(x)]
        for down in (self.down_1, self.down_2, self.down_3, self.down_4):
            skips.append(down(skips[-1]))
        # Decoder: start from the deepest output and consume skips in reverse.
        x = skips.pop()
        for up in (self.up_1, self.up_2, self.up_3, self.up_4):
            x = up(x, skips.pop())
        return self.outconv(x)

In [283]:
import numpy as np 
import torch
import time
# Build a 3-channel-in / 10-class UNet, print its parameter table and time a
# single forward pass on one random 64x64 input.
model = UNet(3, 10)
model_info(model)
noise = np.random.normal(0, 1, [1, 3, 64, 64])
inp = torch.from_numpy(noise.astype(np.float32))
s = time.time()
preds = model(inp)
e = time.time()
elapsed = e - s
print(elapsed)
# print(preds.shape)


layer                                               name  gradient   parameters                shape           mu        sigma
    0                              inconv.UConv.0.weight      True          432        [16, 3, 3, 3]      0.00164        0.116
    1                              inconv.UConv.1.weight      True           16                 [16]            1            0
    2                                inconv.UConv.1.bias      True           16                 [16]            0            0
    3                              inconv.UConv.3.weight      True         2304       [16, 16, 3, 3]     0.000406        0.048
    4                              inconv.UConv.4.weight      True           16                 [16]            1            0
    5                                inconv.UConv.4.bias      True           16                 [16]            0            0
    6                            down_1.1.UConv.0.weight      True         4608       [32, 16, 3, 3]     -0.00

## Numerical Coordinates within Heatmap Regression Network

In [3]:
import torch.nn as nn
import torch.nn.functional as F
from torch.nn import Sequential as S
"""
clear UNet implementation.
"""
class UNetConv(nn.Module):
    """
    Two consecutive conv(3x3, padding 1, no bias) -> batch norm -> ReLU stages.
    """
    def __init__(self, c_in, c_out):
        super(UNetConv, self).__init__()

        def conv_bn_relu(channels):
            # One stage taking `channels` inputs and producing c_out outputs.
            return (
                nn.Conv2d(channels, c_out, kernel_size=3, padding=1, bias=False),
                nn.BatchNorm2d(c_out),
                nn.ReLU(inplace=True),
            )

        layers = list(conv_bn_relu(c_in))
        layers.extend(conv_bn_relu(c_out))
        self.UConv = nn.Sequential(*layers)

    def forward(self, x):
        """Apply both stages to ``x``."""
        return self.UConv(x)

class Up(nn.Module):
    """
    Upscale, pad to the skip tensor's size, concatenate, then double conv.
    (implemented by https://github.com/milesial/Pytorch-UNet)
    """
    def __init__(self, c_in, c_out, bilinear=True):
        super(Up, self).__init__()

        if not bilinear:
            # Learned upsampling over c_in // 2 channels.
            self.up = nn.ConvTranspose2d(c_in // 2, c_in // 2, kernel_size=2, stride=2)
        else:
            # Parameter-free bilinear upsampling.
            self.up = nn.Upsample(scale_factor=2, mode='bilinear', align_corners=True)

        self.conv = UNetConv(c_in, c_out)

    def forward(self, x1, x2):
        """Upsample ``x1``, align it with ``x2`` and convolve ``cat([x2, x1])``."""
        x1 = self.up(x1)
        # Dims 2 and 3 are used as height and width.
        pad_h = x2.size()[2] - x1.size()[2]
        pad_w = x2.size()[3] - x1.size()[3]
        padding = [pad_w // 2, pad_w - pad_w // 2,
                   pad_h // 2, pad_h - pad_h // 2]
        x1 = F.pad(x1, padding)
        # if you have padding issues, see
        # https://github.com/HaiyongJiang/U-Net-Pytorch-Unstructured-Buggy/commit/0e854509c2cea854e247a9c615f175f76fbb2e3a
        # https://github.com/xiaopeng-liao/Pytorch-UNet/commit/8ebac70e633bac59fc22bb5195e513d5832fb3bd
        merged = torch.cat([x2, x1], dim=1)

        return self.conv(merged)

class NHCHRConv(nn.Module):
    """
    Contains X-axis and Y-axis Map and Heatmap Conv.

    All three heads (``HConv``, ``XConv``, ``YConv``) are ``UNetConv`` blocks
    that consume the same merged tensor: ``x1`` upsampled, padded to the size
    of ``x2`` and concatenated with ``x2`` on dim 1.

    Args:
        c_in: channels of the merged tensor fed to each head.
        c_out: output channels of each head.
        bilinear: use bilinear upsampling when True, otherwise a stride-2
            transposed convolution over ``c_in // 2`` channels.
    """

    def __init__(self, c_in, c_out, bilinear=True):
        super(NHCHRConv, self).__init__()
        if bilinear:
            self.up = nn.Upsample(scale_factor=2, mode='bilinear', align_corners=True)
        else:
            self.up = nn.ConvTranspose2d(c_in // 2, c_in // 2, kernel_size=2, stride=2)

        self.HConv = UNetConv(c_in, c_out)
        self.XConv = UNetConv(c_in, c_out)
        self.YConv = UNetConv(c_in, c_out)

    def _up_pad_cat(self, x1, x2):
        """Upsample ``x1``, pad it to ``x2``'s size and return ``cat([x2, x1], dim=1)``."""
        x1 = self.up(x1)
        # Dims 2 and 3 are used as height and width.
        diffY = x2.size()[2] - x1.size()[2]
        diffX = x2.size()[3] - x1.size()[3]

        x1 = F.pad(x1, [diffX // 2, diffX - diffX // 2,
                        diffY // 2, diffY - diffY // 2])
        # if you have padding issues, see
        # https://github.com/HaiyongJiang/U-Net-Pytorch-Unstructured-Buggy/commit/0e854509c2cea854e247a9c615f175f76fbb2e3a
        # https://github.com/xiaopeng-liao/Pytorch-UNet/commit/8ebac70e633bac59fc22bb5195e513d5832fb3bd
        return torch.cat([x2, x1], dim=1)

    def conv_pad_cat(self, conv, x1, x2):
        """Apply ``conv`` to the merged (upsampled, padded, concatenated) tensor."""
        return conv(self._up_pad_cat(x1, x2))

    def forward(self, x1, x2):
        """Return ``(HOut, XOut, YOut)``, the outputs of the three heads."""
        # The merged tensor is the same for every head, so build it once
        # instead of repeating the upsample/pad/concat three times.
        merged = self._up_pad_cat(x1, x2)
        HOut = self.HConv(merged)
        XOut = self.XConv(merged)
        YOut = self.YConv(merged)

        return HOut, XOut, YOut


class NCHRNet(nn.Module):
    """U-Net style network that predicts a heatmap and a pair of coordinates.

    The encoder and the first three decoder stages are the same as in the
    plain U-Net; the last decoder stage is an ``NHCHRConv`` with three heads
    (heatmap, x-map, y-map).

    Args:
        c_in: number of input channels.
        n_classes: number of heatmap channels.
        bilinear: forwarded to every upsampling stage.

    ``forward`` returns ``(H, coordinates)``: ``H`` is the output of the 1x1
    heatmap convolution and ``coordinates`` is the concatenation, on dim 1, of
    the spatially averaged x-map and y-map.
    """
    def __init__(self, c_in, n_classes, bilinear=True):
        super(NCHRNet, self).__init__()

        c_base = 16
        self.inconv = UNetConv(c_in, c_base)
        self.down_1 = S(nn.MaxPool2d(2), UNetConv(c_base, c_base * 2), )
        self.down_2 = S(nn.MaxPool2d(2), UNetConv(c_base * 2, c_base * 4), )
        self.down_3 = S(nn.MaxPool2d(2), UNetConv(c_base * 4, c_base * 8), )
        self.down_4 = S(nn.MaxPool2d(2), UNetConv(c_base * 8, c_base * 8), )
        self.up_1 = Up(c_base * 16, c_base * 4, bilinear)
        self.up_2 = Up(c_base * 8, c_base * 2, bilinear)
        self.up_3 = Up(c_base * 4, c_base * 1, bilinear)
        self.NHCHRConv = NHCHRConv(c_base * 2, c_base * 1, bilinear)
        self.HOutconv = nn.Conv2d(c_base, n_classes, kernel_size=1)
        # The X/Y heads consume cat([H, X]) / cat([H, Y]): n_classes heatmap
        # channels plus c_base feature channels. The input width used to be
        # hard-coded as c_base + 1, which only matched n_classes == 1.
        self.XOutconv = S(
            nn.Conv2d(c_base + n_classes, 1, kernel_size=1),)
        self.YOutconv = S(
            nn.Conv2d(c_base + n_classes, 1, kernel_size=1),)

    def forward(self, x):
        x1 = self.inconv(x)
        x2 = self.down_1(x1)
        x3 = self.down_2(x2)
        x4 = self.down_3(x3)
        x5 = self.down_4(x4)
        x = self.up_1(x5, x4)
        x = self.up_2(x, x3)
        x = self.up_3(x, x2)
        H, X, Y = self.NHCHRConv(x, x1)

        H = self.HOutconv(H)  # final heatmap prediction
        X = self.XOutconv(torch.cat([H, X], dim=1))  # intermediate x-axis map
        Y = self.YOutconv(torch.cat([H, Y], dim=1))  # intermediate y-axis map
        # Reduce each single-channel map to one value per sample by averaging
        # over the spatial dimensions.
        x = F.adaptive_avg_pool2d(X, (1, 1)).view(-1, 1)
        y = F.adaptive_avg_pool2d(Y, (1, 1)).view(-1, 1)
        coordinates = torch.cat([x, y], dim=1)

        return H, coordinates

In [17]:
import numpy as np 
import torch
# Build a 1-channel-in / 1-class NCHRNet and push a random batch of ten
# 64x64 inputs through it, then print the shapes of both outputs.
model = NCHRNet(1, 1)
# model_info(model)
batch = np.random.normal(0, 1, [10, 1, 64, 64]).astype(np.float32)
inp = torch.from_numpy(batch)
H, coor = model(inp)

print(H.shape, coor.shape)



torch.Size([10, 1, 64, 64]) torch.Size([10, 2])
inconv.UConv.0.weight False
inconv.UConv.1.weight False
inconv.UConv.1.bias False
inconv.UConv.3.weight False
inconv.UConv.4.weight False
inconv.UConv.4.bias False
down_1.1.UConv.0.weight False
down_1.1.UConv.1.weight False
down_1.1.UConv.1.bias False
down_1.1.UConv.3.weight False
down_1.1.UConv.4.weight False
down_1.1.UConv.4.bias False
down_2.1.UConv.0.weight False
down_2.1.UConv.1.weight False
down_2.1.UConv.1.bias False
down_2.1.UConv.3.weight False
down_2.1.UConv.4.weight False
down_2.1.UConv.4.bias False
down_3.1.UConv.0.weight False
down_3.1.UConv.1.weight False
down_3.1.UConv.1.bias False
down_3.1.UConv.3.weight False
down_3.1.UConv.4.weight False
down_3.1.UConv.4.bias False
down_4.1.UConv.0.weight False
down_4.1.UConv.1.weight False
down_4.1.UConv.1.bias False
down_4.1.UConv.3.weight False
down_4.1.UConv.4.weight False
down_4.1.UConv.4.bias False
up_1.conv.UConv.0.weight False
up_1.conv.UConv.1.weight False
up_1.conv.UConv.1.bias

## Freeze Layers

In [19]:
## Variant A (disabled): freeze only the x/y conv layers
# for (name, param) in model.named_parameters():
#     if 'X' in name or 'Y' in name:
#         param.requires_grad = False
#     print(name, param.requires_grad)
## Variant B (active): freeze every layer except the x/y conv layers
for name, param in model.named_parameters():
    # A parameter counts as part of the x/y layers when its name contains 'X' or 'Y'.
    in_xy_layer = 'X' in name or 'Y' in name
    if not in_xy_layer:
        param.requires_grad = False
    print(name, param.requires_grad)
# model_info(model)

inconv.UConv.0.weight False
inconv.UConv.1.weight False
inconv.UConv.1.bias False
inconv.UConv.3.weight False
inconv.UConv.4.weight False
inconv.UConv.4.bias False
down_1.1.UConv.0.weight False
down_1.1.UConv.1.weight False
down_1.1.UConv.1.bias False
down_1.1.UConv.3.weight False
down_1.1.UConv.4.weight False
down_1.1.UConv.4.bias False
down_2.1.UConv.0.weight False
down_2.1.UConv.1.weight False
down_2.1.UConv.1.bias False
down_2.1.UConv.3.weight False
down_2.1.UConv.4.weight False
down_2.1.UConv.4.bias False
down_3.1.UConv.0.weight False
down_3.1.UConv.1.weight False
down_3.1.UConv.1.bias False
down_3.1.UConv.3.weight False
down_3.1.UConv.4.weight False
down_3.1.UConv.4.bias False
down_4.1.UConv.0.weight False
down_4.1.UConv.1.weight False
down_4.1.UConv.1.bias False
down_4.1.UConv.3.weight False
down_4.1.UConv.4.weight False
down_4.1.UConv.4.bias False
up_1.conv.UConv.0.weight False
up_1.conv.UConv.1.weight False
up_1.conv.UConv.1.bias False
up_1.conv.UConv.3.weight False
up_1.conv.

## AUNet

In [203]:
import torch.nn as nn
import torch.nn.functional as F
from torch.nn import Sequential as S
"""
clear UNet implementation.
"""
class UNetConv(nn.Module):
    """
    conv-bn-relu twice: 3x3 convolutions with padding 1 and no bias, the first
    mapping c_in -> c_out and the second c_out -> c_out.
    """
    def __init__(self, c_in, c_out):
        super(UNetConv, self).__init__()
        first_stage = [
            nn.Conv2d(c_in, c_out, kernel_size=3, padding=1, bias=False),
            nn.BatchNorm2d(c_out),
            nn.ReLU(inplace=True),
        ]
        second_stage = [
            nn.Conv2d(c_out, c_out, kernel_size=3, padding=1, bias=False),
            nn.BatchNorm2d(c_out),
            nn.ReLU(inplace=True),
        ]
        self.UConv = S(*(first_stage + second_stage))

    def forward(self, x):
        """Apply the stacked stages to ``x``."""
        return self.UConv(x)
    
class AtrousConv(nn.Module):
    """Parallel dilated 3x3 convolutions fused by a 1x1 convolution.

    One conv -> batch norm -> ReLU branch is created per entry of ``dilation``
    and registered as ``conv_3x3_1``, ``conv_3x3_2``, ... (the same names the
    fixed four-branch version used).  Every branch uses padding equal to its
    dilation and produces ``c_out // 2`` channels; the branch outputs are
    concatenated on dim 1 and reduced to ``c_out`` channels by ``out_conv``.

    Args:
        c_in: number of input channels.
        c_out: number of output channels.
        dilation: non-empty sequence of dilation rates, one per branch.

    Raises:
        ValueError: if ``dilation`` is empty.
    """
    def __init__(self, c_in, c_out, dilation=(1,2,5,9)):
        super(AtrousConv, self).__init__()
        if len(dilation) == 0:
            raise ValueError('dilation must contain at least one rate')
        inter_out = c_out // 2

        # Branches are built from the tuple instead of four hard-wired blocks,
        # so any number of rates works (a 3-tuple used to raise IndexError).
        for idx, rate in enumerate(dilation, start=1):
            branch = S(
                nn.Conv2d(c_in, inter_out, kernel_size=3, padding=rate, bias=False, dilation=rate),
                nn.BatchNorm2d(inter_out),
                nn.ReLU(inplace=True),
            )
            self.add_module('conv_3x3_%d' % idx, branch)

        self.out_conv = S(
            nn.Conv2d(inter_out * len(dilation), c_out, kernel_size=1, bias=False),
            nn.BatchNorm2d(c_out),
            nn.ReLU(inplace=True),
        )

    def forward(self, x):
        """Run every dilated branch on ``x`` and fuse the concatenated outputs."""
        # named_children() yields submodules in registration order, i.e.
        # conv_3x3_1, conv_3x3_2, ... followed by out_conv.
        branch_outputs = [
            branch(x)
            for name, branch in self.named_children()
            if name.startswith('conv_3x3_')
        ]
        out = self.out_conv(torch.cat(branch_outputs, dim=1))

        return out
        

class Up(nn.Module):
    """
    Upscaling then double conv(implemented by https://github.com/milesial/Pytorch-UNet)

    Args:
        c_in: channels of the concatenated tensor fed to the double conv
            (channels of ``x2`` plus channels of the upsampled ``x1``).
        c_out: output channels of the double conv.
        bilinear: use bilinear upsampling when True, otherwise a stride-2
            transposed convolution.
        up_channels: channels of ``x1``, used only by the transposed
            convolution. Defaults to ``c_in // 2``, the previous fixed value.
    """
    def __init__(self, c_in, c_out, bilinear=True, up_channels=None):
        super(Up, self).__init__()

        if up_channels is None:
            up_channels = c_in // 2
        if bilinear:
            self.up = nn.Upsample(scale_factor=2, mode='bilinear', align_corners=True)
        else:
            self.up = nn.ConvTranspose2d(up_channels, up_channels, kernel_size=2, stride=2)

        self.conv = UNetConv(c_in, c_out)

    def forward(self, x1, x2):
        """Upsample ``x1``, pad it to ``x2``'s size and convolve ``cat([x2, x1])``."""
        x1 = self.up(x1)
        # Dims 2 and 3 are used as height and width.
        diffY = x2.size()[2] - x1.size()[2]
        diffX = x2.size()[3] - x1.size()[3]

        x1 = F.pad(x1, [diffX // 2, diffX - diffX // 2,
                        diffY // 2, diffY - diffY // 2])
        # if you have padding issues, see
        # https://github.com/HaiyongJiang/U-Net-Pytorch-Unstructured-Buggy/commit/0e854509c2cea854e247a9c615f175f76fbb2e3a
        # https://github.com/xiaopeng-liao/Pytorch-UNet/commit/8ebac70e633bac59fc22bb5195e513d5832fb3bd
        x = torch.cat([x2, x1], dim=1)
        return self.conv(x)


class AUNet(nn.Module):
    """Two-level U-Net whose down stages use ``AtrousConv`` blocks.

    Args:
        c_in: number of input channels.
        n_classes: number of output channels of the final 1x1 convolution.
        bilinear: forwarded to both ``Up`` stages.
    """
    def __init__(self, c_in, n_classes, bilinear=True):
        super(AUNet, self).__init__()

        c_base = 16
        self.inconv = UNetConv(c_in, c_base)
        self.down_1 = S(nn.MaxPool2d(2), AtrousConv(c_base, c_base*2),)
        self.down_2 = S(nn.MaxPool2d(2), AtrousConv(c_base*2, c_base*4),)
        # The tensor being upsampled does not have c_in // 2 channels here
        # (up_3 receives c_base*4 of c_base*6, up_4 receives c_base*2 of
        # c_base*3), so the real counts are passed for the transposed-conv
        # mode; previously bilinear=False built mismatching layers.
        self.up_3 = Up(c_base*6, c_base*2, bilinear, up_channels=c_base*4)
        self.up_4 = Up(c_base*3, c_base*1, bilinear, up_channels=c_base*2)
        self.outconv = nn.Conv2d(c_base, n_classes, kernel_size=1)

    def forward(self, x):
        """Return the logits of the final 1x1 convolution."""
        x1 = self.inconv(x)
        x2 = self.down_1(x1)
        x3 = self.down_2(x2)
        x = self.up_3(x3, x2)
        x = self.up_4(x, x1)
        logits = self.outconv(x)
        return logits



In [193]:
import numpy as np 
import torch
import time
# Build a 3-channel-in / 10-class AUNet, print its parameter table and time a
# single forward pass on one random 64x64 input.
model = AUNet(3, 10)
model_info(model)
sample = np.random.normal(0, 1, [1, 3, 64, 64])
inp = torch.from_numpy(sample.astype(np.float32))
s = time.time()
preds = model(inp)
e = time.time()
elapsed = e - s
print(elapsed)


layer                                               name  gradient   parameters                shape           mu        sigma
    0                              inconv.UConv.0.weight      True          432        [16, 3, 3, 3]     -0.00142        0.109
    1                              inconv.UConv.1.weight      True           16                 [16]            1            0
    2                                inconv.UConv.1.bias      True           16                 [16]            0            0
    3                              inconv.UConv.3.weight      True         2304       [16, 16, 3, 3]    -0.000408       0.0484
    4                              inconv.UConv.4.weight      True           16                 [16]            1            0
    5                                inconv.UConv.4.bias      True           16                 [16]            0            0
    6                       down_1.1.conv_3x3_1.0.weight      True         2304       [16, 16, 3, 3]     0.000

In [230]:
class DUC(nn.Module):
    """Double conv to ``c_out * scale_factor**2`` channels followed by a pixel shuffle.

    The constructor prints the expanded channel count.
    """
    def __init__(self,c_in, c_out, scale_factor=4):
        super(DUC, self).__init__()

        expanded = c_out*scale_factor*scale_factor
        self.conv = UNetConv(c_in, expanded)
        print(expanded)
        self.pixel_shuffle = nn.PixelShuffle(scale_factor)

    def forward(self, x):
        """Convolve ``x`` and rearrange the result with the pixel shuffle."""
        return self.pixel_shuffle(self.conv(x))
    
class ADUNet(nn.Module):
    """Atrous encoder followed by a single DUC upsampling stage.

    Two max-pool + ``AtrousConv`` stages are followed by one ``DUC`` with
    scale factor 4 and a final 1x1 convolution to ``n_classes`` channels.
    The ``bilinear`` argument is accepted but not used.
    """
    def __init__(self, c_in, n_classes, bilinear=True):
        super(ADUNet, self).__init__()

        c_base = 16
        c_mid, c_deep = c_base*2, c_base*4
        self.inconv = UNetConv(c_in, c_base)
        self.down_1 = S(nn.MaxPool2d(2), AtrousConv(c_base, c_mid))
        self.down_2 = S(nn.MaxPool2d(2), AtrousConv(c_mid, c_deep))
        self.duc = DUC(c_deep, c_base*1, scale_factor=4)
        self.outconv = nn.Conv2d(c_base, n_classes, kernel_size=1)

    def forward(self, x):
        """Return the logits of the final 1x1 convolution."""
        features = self.inconv(x)
        for stage in (self.down_1, self.down_2):
            features = stage(features)
        return self.outconv(self.duc(features))

In [246]:
import numpy as np 
import torch
import time
# Build a 3-channel-in / 10-class ADUNet, print its parameter table and time a
# single forward pass on one random 64x64 input.
model = ADUNet(3, 10)
model_info(model)
sample = np.random.normal(0, 1, [1, 3, 64, 64])
inp = torch.from_numpy(sample.astype(np.float32))
s = time.time()
preds = model(inp)
e = time.time()
elapsed = e - s
print(elapsed)

256

layer                                               name  gradient   parameters                shape           mu        sigma
    0                              inconv.UConv.0.weight      True          432        [16, 3, 3, 3]     -0.00648        0.109
    1                              inconv.UConv.1.weight      True           16                 [16]            1            0
    2                                inconv.UConv.1.bias      True           16                 [16]            0            0
    3                              inconv.UConv.3.weight      True         2304       [16, 16, 3, 3]    -1.87e-05       0.0477
    4                              inconv.UConv.4.weight      True           16                 [16]            1            0
    5                                inconv.UConv.4.bias      True           16                 [16]            0            0
    6                       down_1.1.conv_3x3_1.0.weight      True         2304       [16, 16, 3, 3]     -

## ResUNet

In [392]:
def conv3x3(in_planes, out_planes, stride=1, groups=1, dilation=1):
    """Build a bias-free 3x3 ``nn.Conv2d``.

    The padding is set equal to ``dilation``, so at stride 1 the spatial
    size of the input is preserved.
    """
    conv_kwargs = dict(
        kernel_size=3,
        stride=stride,
        padding=dilation,
        groups=groups,
        bias=False,
        dilation=dilation,
    )
    return nn.Conv2d(in_planes, out_planes, **conv_kwargs)


def conv1x1(in_planes, out_planes, stride=1):
    """Build a bias-free 1x1 (pointwise) ``nn.Conv2d``."""
    pointwise = nn.Conv2d(
        in_planes,
        out_planes,
        kernel_size=1,
        stride=stride,
        bias=False,
    )
    return pointwise


class BasicBlock(nn.Module):
    """Residual block made of two 3x3 convolutions.

    Layout: conv3x3 -> norm -> ReLU -> conv3x3 -> norm, then the shortcut is
    added and a final ReLU is applied.

    Args:
        inplanes: Number of input channels.
        planes: Number of channels produced by both convolutions.
        stride: Stride of the first convolution.
        downsample: Optional module applied to the input to build the
            shortcut; when ``None`` the input itself is the shortcut.
        groups: Must be 1.
        base_width: Must be 64.
        dilation: Must not exceed 1.
        norm_layer: Normalisation layer factory; defaults to ``nn.BatchNorm2d``.

    Raises:
        ValueError: If ``groups != 1`` or ``base_width != 64``.
        NotImplementedError: If ``dilation > 1``.
    """

    # Output channels are ``planes * expansion``.
    expansion = 1

    def __init__(self, inplanes, planes, stride=1, downsample=None, groups=1,
                 base_width=64, dilation=1, norm_layer=None):
        super().__init__()
        norm_layer = nn.BatchNorm2d if norm_layer is None else norm_layer
        if not (groups == 1 and base_width == 64):
            raise ValueError('BasicBlock only supports groups=1 and base_width=64')
        if dilation > 1:
            raise NotImplementedError("Dilation > 1 not supported in BasicBlock")

        # When stride != 1, both conv1 and the downsample branch reduce the
        # spatial size of the input.
        self.conv1 = conv3x3(inplanes, planes, stride)
        self.bn1 = norm_layer(planes)
        self.relu = nn.ReLU(inplace=True)
        self.conv2 = conv3x3(planes, planes)
        self.bn2 = norm_layer(planes)
        self.downsample = downsample
        self.stride = stride

    def forward(self, x):
        """Apply the residual branch, add the shortcut, then ReLU."""
        out = self.relu(self.bn1(self.conv1(x)))
        out = self.bn2(self.conv2(out))

        shortcut = x if self.downsample is None else self.downsample(x)

        out += shortcut
        return self.relu(out)


class Bottleneck(nn.Module):
    """Residual bottleneck block: 1x1 -> 3x3 -> 1x1 convolutions.

    Each convolution is followed by a normalisation layer; ReLU follows the
    first two, and a final ReLU is applied after the shortcut is added.

    Args:
        inplanes: Number of input channels.
        planes: Base channel count; the block outputs ``planes * expansion``.
        stride: Stride of the 3x3 convolution.
        downsample: Optional module applied to the input to build the
            shortcut; when ``None`` the input itself is the shortcut.
        groups: Group count of the 3x3 convolution.
        base_width: Scales the inner width as ``planes * base_width / 64``.
        dilation: Dilation of the 3x3 convolution.
        norm_layer: Normalisation layer factory; defaults to ``nn.BatchNorm2d``.
    """

    # Set to 1 in this file, so the block does not widen its output.
    expansion = 1

    def __init__(self, inplanes, planes, stride=1, downsample=None, groups=1,
                 base_width=64, dilation=1, norm_layer=None):
        super().__init__()
        norm_layer = nn.BatchNorm2d if norm_layer is None else norm_layer
        out_planes = planes * self.expansion
        width = int(planes * (base_width / 64.)) * groups

        # When stride != 1, both conv2 and the downsample branch reduce the
        # spatial size of the input.
        self.conv1 = conv1x1(inplanes, width)
        self.bn1 = norm_layer(width)
        self.conv2 = conv3x3(width, width, stride, groups, dilation)
        self.bn2 = norm_layer(width)
        self.conv3 = conv1x1(width, out_planes)
        self.bn3 = norm_layer(out_planes)
        self.relu = nn.ReLU(inplace=True)
        self.downsample = downsample
        self.stride = stride

    def forward(self, x):
        """Apply the three-conv residual branch, add the shortcut, then ReLU."""
        out = self.relu(self.bn1(self.conv1(x)))
        out = self.relu(self.bn2(self.conv2(out)))
        out = self.bn3(self.conv3(out))

        shortcut = x if self.downsample is None else self.downsample(x)

        out += shortcut
        return self.relu(out)
    
class Up(nn.Module):
    """
    Decoder step: upsample, pad to the skip feature's size, concatenate, convolve
    (adapted from https://github.com/milesial/Pytorch-UNet).

    Args:
        c_in: Channel count of the concatenated tensor fed to ``UNetConv``.
        c_out: Channel count produced by ``UNetConv``.
        bilinear: Use parameter-free bilinear upsampling when true, otherwise a
            stride-2 transposed convolution over ``c_in // 2`` channels.
    """
    def __init__(self, c_in, c_out, bilinear=True):
        super().__init__()

        if not bilinear:
            self.up = nn.ConvTranspose2d(c_in // 2, c_in // 2, kernel_size=2, stride=2)
        else:
            # Bilinear upsampling keeps the channel count; the convolutions in
            # self.conv are what reduce the number of channels.
            self.up = nn.Upsample(scale_factor=2, mode='bilinear', align_corners=True)

        self.conv = UNetConv(c_in, c_out)

    def forward(self, x1, x2):
        """Upsample ``x1``, align it with ``x2`` and fuse the two.

        Args:
            x1: Lower-resolution feature map to upsample.
            x2: Skip-connection feature map that sets the target size.

        Returns:
            ``self.conv`` applied to ``cat([x2, upsampled x1], dim=1)``.
        """
        x1 = self.up(x1)

        # Dims 2 and 3 are treated as height and width.
        gap_h = x2.size(2) - x1.size(2)
        gap_w = x2.size(3) - x1.size(3)
        left, top = gap_w // 2, gap_h // 2

        # F.pad takes the last dimension first: (left, right, top, bottom).
        x1 = F.pad(x1, [left, gap_w - left, top, gap_h - top])
        # if you have padding issues, see
        # https://github.com/HaiyongJiang/U-Net-Pytorch-Unstructured-Buggy/commit/0e854509c2cea854e247a9c615f175f76fbb2e3a
        # https://github.com/xiaopeng-liao/Pytorch-UNet/commit/8ebac70e633bac59fc22bb5195e513d5832fb3bd
        merged = torch.cat([x2, x1], dim=1)
        return self.conv(merged)
    
class ResNet(nn.Module):
    """ResNet-style encoder with a U-Net-style decoder built from ``Up`` blocks.

    The encoder is a 3x3 stem convolution followed by a max-pool and four
    residual stages; the decoder fuses the encoder features back together
    through four ``Up`` blocks and ends in a 1x1 convolution.

    Args:
        block: Residual block class (``BasicBlock`` or ``Bottleneck``); must
            expose an ``expansion`` attribute.
        layers: Sequence of four ints, the number of blocks in each stage.
        num_classes: Accepted but not used anywhere in this class; the output
            convolution always produces 1 channel.
        zero_init_residual: If true, zero the weight of the last norm layer of
            every residual block.
        groups: Forwarded to every block.
        width_per_group: Forwarded to every block as ``base_width``.
        replace_stride_with_dilation: ``None`` or a 3-element sequence of
            bools; a true entry replaces the stride of layer2/3/4 by dilation.
        norm_layer: Normalisation layer factory; defaults to ``nn.BatchNorm2d``.

    Raises:
        ValueError: If ``replace_stride_with_dilation`` does not have 3 elements.
    """

    def __init__(self, block, layers, num_classes=1000, zero_init_residual=False,
                 groups=1, width_per_group=64, replace_stride_with_dilation=None,
                 norm_layer=None):
        super(ResNet, self).__init__()
        if norm_layer is None:
            norm_layer = nn.BatchNorm2d
        self._norm_layer = norm_layer
        # Decoder upsampling mode is fixed to bilinear (not configurable).
        bilinear = True
        # Running input-channel count, updated by _make_layer after each stage.
        self.inplanes = 16
        # Base channel width; every stage/decoder width is a multiple of it.
        c_base = 16
        self.dilation = 1
        if replace_stride_with_dilation is None:
            # each element in the tuple indicates if we should replace
            # the 2x2 stride with a dilated convolution instead
            replace_stride_with_dilation = [False, False, False]
        if len(replace_stride_with_dilation) != 3:
            raise ValueError("replace_stride_with_dilation should be None "
                             "or a 3-element tuple, got {}".format(replace_stride_with_dilation))
        self.groups = groups
        self.base_width = width_per_group
        # Stem: stride-1 3x3 conv, so x1 in forward() keeps the input resolution.
        # The input channel count is hard-coded to 1.
        self.conv1 = nn.Conv2d(1, c_base, kernel_size=3, stride=1, padding=1,
                               bias=False)
        self.bn1 = norm_layer(self.inplanes)
        self.relu = nn.ReLU(inplace=True)
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
        self.layer1 = self._make_layer(block, c_base, layers[0])
        self.layer2 = self._make_layer(block, c_base*2, layers[1], stride=2,
                                       dilate=replace_stride_with_dilation[0])
        self.layer3 = self._make_layer(block, c_base*4, layers[2], stride=2,
                                       dilate=replace_stride_with_dilation[1])
        self.layer4 = self._make_layer(block, c_base*8, layers[3], stride=2,
                                       dilate=replace_stride_with_dilation[2])

        # Each Up block receives the concatenation of a skip feature and the
        # upsampled deeper feature, e.g. up_1: c_base*4 (layer3) + c_base*8
        # (layer4) = c_base*12. These counts assume block.expansion == 1.
        self.up_1 = Up(c_base*12, c_base*4, bilinear)
        self.up_2 = Up(c_base*6, c_base*2, bilinear)
        self.up_3 = Up(c_base*3, c_base*1, bilinear)
        self.up_4 = Up(c_base*2, c_base*1, bilinear)

        # 1x1 projection to a single output channel.
        self.outconv = nn.Conv2d(c_base, 1, kernel_size=1)

        # Initialise every conv (including the decoder and outconv) with
        # Kaiming-normal weights and every norm layer to weight 1 / bias 0.
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
            elif isinstance(m, (nn.BatchNorm2d, nn.GroupNorm)):
                nn.init.constant_(m.weight, 1)
                nn.init.constant_(m.bias, 0)

        # Zero-initialize the last BN in each residual branch,
        # so that the residual branch starts with zeros, and each residual block behaves like an identity.
        # This improves the model by 0.2~0.3% according to https://arxiv.org/abs/1706.02677
        if zero_init_residual:
            for m in self.modules():
                if isinstance(m, Bottleneck):
                    nn.init.constant_(m.bn3.weight, 0)
                elif isinstance(m, BasicBlock):
                    nn.init.constant_(m.bn2.weight, 0)

    def _make_layer(self, block, planes, blocks, stride=1, dilate=False):
        """Build one residual stage as an ``nn.Sequential`` of ``blocks`` blocks.

        Only the first block receives ``stride`` and the optional projection
        shortcut; the remaining blocks use the defaults. Updates
        ``self.inplanes`` (and ``self.dilation`` when ``dilate`` is true).
        """
        norm_layer = self._norm_layer
        downsample = None
        # The first block uses the dilation in effect before this stage.
        previous_dilation = self.dilation
        if dilate:
            # Trade the stride for dilation: keep resolution, widen the receptive field.
            self.dilation *= stride
            stride = 1
        # A projection shortcut is needed whenever the shape of the identity changes.
        if stride != 1 or self.inplanes != planes * block.expansion:
            downsample = nn.Sequential(
                conv1x1(self.inplanes, planes * block.expansion, stride),
                norm_layer(planes * block.expansion),
            )

        layers = []
        layers.append(block(self.inplanes, planes, stride, downsample, self.groups,
                            self.base_width, previous_dilation, norm_layer))
        self.inplanes = planes * block.expansion
        for _ in range(1, blocks):
            layers.append(block(self.inplanes, planes, groups=self.groups,
                                base_width=self.base_width, dilation=self.dilation,
                                norm_layer=norm_layer))

        return nn.Sequential(*layers)

    def forward(self, x):
        """Encode ``x`` through the residual stages, then decode with skips.

        Returns the output of ``self.outconv`` (a single-channel map).
        """
        x = self.conv1(x)
        x = self.bn1(x)
        # x1..x5 are the encoder features, shallowest to deepest.
        x1 = self.relu(x)
        x2 = self.layer1(self.maxpool(x1))
        x3 = self.layer2(x2)
        x4 = self.layer3(x3)
        x5 = self.layer4(x4)
        # Decoder: each step upsamples the running feature and fuses it with
        # the next shallower encoder feature.
        x = self.up_1(x5, x4)
        x = self.up_2(x, x3)
        x = self.up_3(x, x2)
        x = self.up_4(x, x1)
        x = self.outconv(x)

        return x

def _resnet(arch, block, layers, pretrained, progress, **kwargs):
    """Construct a ``ResNet`` and optionally load weights into it.

    Args:
        arch: Key looked up in ``model_urls`` when ``pretrained`` is true.
        block: Residual block class forwarded to ``ResNet``.
        layers: Per-stage block counts forwarded to ``ResNet``.
        pretrained: If true, fetch a state dict and load it into the model.
        progress: Forwarded to ``load_state_dict_from_url``.
        **kwargs: Extra keyword arguments forwarded to ``ResNet``.

    Returns:
        The constructed model.
    """
    # NOTE(review): load_state_dict_from_url and model_urls are expected to be
    # defined elsewhere -- confirm they are in scope before using pretrained=True.
    model = ResNet(block, layers, **kwargs)
    if not pretrained:
        return model
    weights = load_state_dict_from_url(model_urls[arch],
                                       progress=progress)
    model.load_state_dict(weights)
    return model


def resnet50(pretrained=False, progress=True, **kwargs):
    r"""Build the ResNet-50 layout (``Bottleneck`` blocks, depths 3-4-6-3).

    Architecture reference:
    `"Deep Residual Learning for Image Recognition" <https://arxiv.org/pdf/1512.03385.pdf>`_

    Args:
        pretrained (bool): If True, ``_resnet`` loads the weights registered
            under ``'resnet50'`` into the model
        progress (bool): If True, displays a progress bar of the download to stderr
        **kwargs: Forwarded to ``ResNet``
    """
    stage_depths = [3, 4, 6, 3]
    return _resnet('resnet50', Bottleneck, stage_depths, pretrained, progress,
                   **kwargs)

In [402]:
import time

import numpy as np
import torch

# Smoke test: build the ResNet-50-style model, print its parameter table and
# time one forward pass on a single 1x64x64 sample drawn from N(0, 1).
model = resnet50()
model_info(model)
inp = torch.from_numpy(np.random.normal(0, 1, [1, 1, 64, 64]).astype(np.float32))
s = time.time()
preds = model(inp)
# Stop the clock before printing so console I/O is not counted as model time.
e = time.time()
print(preds.shape)
print(e-s)



layer                                               name  gradient   parameters                shape           mu        sigma
    0                                       conv1.weight      True          144        [16, 1, 3, 3]     2.86e-05        0.126
    1                                         bn1.weight      True           16                 [16]            1            0
    2                                           bn1.bias      True           16                 [16]            0            0
    3                              layer1.0.conv1.weight      True          256       [16, 16, 1, 1]     -0.00419        0.363
    4                                layer1.0.bn1.weight      True           16                 [16]            1            0
    5                                  layer1.0.bn1.bias      True           16                 [16]            0            0
    6                              layer1.0.conv2.weight      True         2304       [16, 16, 3, 3]     0.000

# 数据准备、特征提取与微调

In [None]:
# Recipe 1: fine-tune only the fully connected layer.
# Freeze every pretrained weight, then swap in a new (trainable) head and
# hand only the head's parameters to the optimizer.
model = torchvision.models.resnet18(pretrained=True)
for param in model.parameters():
    param.requires_grad = False
model.fc = nn.Linear(512, 10) # Replace the last fc layer
optimizer = torch.optim.SGD(model.fc.parameters(), lr=1e-2, momentum=0.9, weight_decay=1e-4)

# Recipe 2: fine-tune the fully connected layer with a larger learning rate
# and the convolutional layers with a smaller one.
model = torchvision.models.resnet18(pretrained=True)
# Parameters are told apart by identity, since tensors cannot be compared with `in`.
finetuned_parameters = set(map(id, model.fc.parameters()))
conv_parameters = (p for p in model.parameters() if id(p) not in finetuned_parameters)
# The first group overrides lr; the second inherits the optimizer default (1e-2).
parameters = [{'params': conv_parameters, 'lr':1e-3},
              {'params': model.fc.parameters()}]
optimizer = torch.optim.SGD(parameters, lr=1e-2, momentum=0.9, weight_decay=1e-5)


## 数据预处理

In [None]:
# Common preprocessing pipelines for training and validation.
# ToTensor converts a PIL.Image or an H x W x D ndarray with values in [0, 255]
# into a D x H x W torch.Tensor with values in [0.0, 1.0].
train_transform = torchvision.transforms.Compose([
    torchvision.transforms.RandomResizedCrop(size=224,
                                             scale=(0.08, 1.0)),
    torchvision.transforms.RandomHorizontalFlip(),
    torchvision.transforms.ToTensor(),
    torchvision.transforms.Normalize(mean=(0.485, 0.456, 0.406),
                                     std=(0.229, 0.224, 0.225)),
])
# Validation uses a deterministic resize + center crop instead of random augmentation.
val_transform = torchvision.transforms.Compose([
    torchvision.transforms.Resize(256),
    torchvision.transforms.CenterCrop(224),
    torchvision.transforms.ToTensor(),
    torchvision.transforms.Normalize(mean=(0.485, 0.456, 0.406),
                                     std=(0.229, 0.224, 0.225)),
])

# 模型训练
## 训练基本代码框架

In [None]:
# Basic training loop skeleton: 80 epochs over train_loader.
for t in range(80):
    for images, labels in tqdm.tqdm(train_loader, desc='Epoch %3d' %(t+1)):
        images, labels = images.cuda(), labels.cuda()
        scores = model(images)
        loss = loss_function(scores, labels)
        # Clear gradients accumulated by the previous step before backprop.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        

    


## 学习率衰减

In [None]:
# Get the current learning rate.
# If there is one global learning rate (which is the common case)
lr = next(iter(optimizer.param_groups))['lr']

# If there are multiple learning rates for different layers.
all_lr = [param_group['lr'] for param_group in optimizer.param_groups]

# Learning rate decay.
# NOTE: since PyTorch 1.1.0 scheduler.step() has to be called AFTER the
# epoch's optimizer.step() calls (i.e. after train(...)); calling it first
# skips the first value of the schedule.
# `train(...)` / `val(...)` are placeholders for the user's own routines.

# Reduce learning rate when validation accuracy plateaus.
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='max', patience=5, verbose=True)
for t in range(0, 80):
    # val(...) is assumed to return the validation accuracy.
    train(...); val_acc = val(...)
    scheduler.step(val_acc)

# The next two schedulers are alternatives -- keep only the one you need.
# Cosine annealing learning rate.
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=80)
# Reduce learning rate by 10 at given epochs.
scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, milestones=[50, 70], gamma=0.1)
for t in range(0, 80):
    train(...); val(...)
    scheduler.step()

# Learning rate warmup by 10 epochs.
# (t + 1) / 10 ramps 0.1, 0.2, ..., 1.0 so the first epoch does not run with
# lr == 0; min() keeps the factor at 1.0 if training continues past warmup.
scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=lambda t: min(1.0, (t + 1) / 10))
for t in range(0, 10):
    train(...); val(...)
    scheduler.step()

# Step learning rate decay: multiply lr by 0.1 every 30 epochs.
# The loop must run longer than step_size for the decay to take effect.
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=30, gamma=0.1)
for t in range(0, 80):
    train(...); val(...)
    scheduler.step()

## 梯度检验与clip

## 模型保存与导入

In [6]:
import torch
# Saving and loading checkpoints.
# To be able to resume training, the model state, the optimizer state and the
# current epoch have to be saved together.
# Save checkpoint.
optimizer = torch.optim.Adam(params=filter(lambda p: p.requires_grad, model.parameters()), lr=0.01)
epoch = 10
current_acc = 0.6
best_acc = 0.5
resume = True
is_best = current_acc > best_acc
best_acc = max(best_acc, current_acc)
checkpoint = {
    'best_acc': best_acc,
    # Use the epoch defined in this cell rather than a loop variable that may
    # or may not be left over from another cell.
    'epoch': epoch,
    'model': model.state_dict(),
    'optimizer': optimizer.state_dict(),
}
# torch.save does not create missing directories.
os.makedirs('model', exist_ok=True)
model_path = os.path.join('model', 'checkpoint.pth.tar')
torch.save(checkpoint, model_path)
if is_best:
    # Keep a separate copy of the best checkpoint so that later, worse
    # checkpoints do not overwrite it.
    shutil.copy(model_path, os.path.join('model', 'best_checkpoint.pth.tar'))

# Load checkpoint
if resume:
    model_path = os.path.join('model', 'checkpoint.pth.tar')
    # Raise explicitly: an assert would be stripped when running with -O.
    if not os.path.isfile(model_path):
        raise FileNotFoundError('Checkpoint not found: %s' % model_path)
    checkpoint = torch.load(model_path)
    best_acc = checkpoint['best_acc']
    start_epoch = checkpoint['epoch']
    model.load_state_dict(checkpoint['model'])
    optimizer.load_state_dict(checkpoint['optimizer'])
    print('Load checkpoint at epoch %d.' % start_epoch)



# states = {"state_dict":model.state_dict(),
#                  "epcoh":epoch,
#                  "optimizer":optimizer.state_dict(),}
#                  #"best_acc":accuracy,}
# torch.save(states, "./model_pred.pth")

# states = torch.load("./model_pred.pth")
print(checkpoint)

{'optimizer': {'state': {}, 'param_groups': [{'eps': 1e-08, 'lr': 0.01, 'betas': (0.9, 0.999), 'amsgrad': False, 'params': [2511406941960, 2511406942032], 'weight_decay': 0}]}, 'epcoh': 10, 'state_dict': OrderedDict([('linear.weight', tensor([[-0.0343, -0.0826, -0.0592,  ..., -0.0346, -0.0260,  0.0300],
        [-0.0804,  0.0180,  0.0716,  ..., -0.0354,  0.0807, -0.0505],
        [ 0.0981, -0.0978, -0.0455,  ...,  0.0182, -0.0366,  0.0263],
        ...,
        [ 0.0200,  0.0230, -0.0919,  ...,  0.0215, -0.0111,  0.0640],
        [ 0.0044, -0.0459,  0.0360,  ..., -0.0753,  0.0333,  0.0291],
        [-0.0323,  0.0726, -0.0105,  ...,  0.0155,  0.0662,  0.0221]])), ('linear.bias', tensor([ 0.0585,  0.0424, -0.0641, -0.0947,  0.0217,  0.0401, -0.0748,  0.0789,
        -0.0505, -0.0208,  0.0741,  0.0627,  0.0322, -0.0263, -0.0026,  0.0054,
        -0.0913, -0.0785, -0.0099,  0.0071,  0.0515,  0.0358, -0.0862,  0.0155,
        -0.0215, -0.0404,  0.0097, -0.0053,  0.0829, -0.0823,  0.0351,  0

# 多GPU
## 单机多GPU

# 注意事项

PyTorch其他注意事项

模型定义

建议有参数的层和汇合（pooling）层使用torch.nn模块定义，激活函数直接使用torch.nn.functional。torch.nn模块和torch.nn.functional的区别在于，torch.nn模块在计算时底层调用了torch.nn.functional，但torch.nn模块包括该层参数，还可以应对训练和测试两种网络状态。使用torch.nn.functional时要注意网络状态，如
def forward(self, x):
    ...
    # Functional dropout holds no train/eval state of its own, so the module's
    # current mode has to be passed in explicitly via `training=`.
    x = torch.nn.functional.dropout(x, p=0.5, training=self.training)
model(x)前用model.train()和model.eval()切换网络状态。
不需要计算梯度的代码块用with torch.no_grad()包含起来。model.eval()和torch.no_grad()的区别在于，model.eval()是将网络切换为测试状态，例如BN和随机失活（dropout）在训练和测试阶段使用不同的计算方法。torch.no_grad()是关闭PyTorch张量的自动求导机制，以减少存储使用和加速计算，得到的结果无法进行loss.backward()。
torch.nn.CrossEntropyLoss的输入不需要经过Softmax。torch.nn.CrossEntropyLoss等价于torch.nn.functional.log_softmax + torch.nn.NLLLoss。
loss.backward()前用optimizer.zero_grad()清除累积梯度。当optimizer包含模型的全部参数时，optimizer.zero_grad()和model.zero_grad()效果一样；若optimizer只管理部分参数，则optimizer.zero_grad()只清除这部分参数的梯度。

PyTorch性能与调试

torch.utils.data.DataLoader中尽量设置pin_memory=True，对特别小的数据集如MNIST设置pin_memory=False反而更快一些。num_workers的设置需要在实验中找到最快的取值。
用del及时删除不用的中间变量，节约GPU存储。
使用inplace操作可节约GPU存储，如
x = torch.nn.functional.relu(x, inplace=True)
此外，还可以通过torch.utils.checkpoint前向传播时只保留一部分中间结果来节约GPU存储使用，在反向传播时需要的内容从最近中间结果中计算得到。

减少CPU和GPU之间的数据传输。例如如果你想知道一个epoch中每个mini-batch的loss和准确率，先将它们累积在GPU中等一个epoch结束之后一起传输回CPU会比每个mini-batch都进行一次GPU到CPU的传输更快。
使用半精度浮点数half()会有一定的速度提升，具体效率依赖于GPU型号。需要小心数值精度过低带来的稳定性问题。
时常使用assert tensor.size() == (N, D, H, W)作为调试手段，确保张量维度和你设想中一致。
除了标记y外，尽量少使用一维张量，使用n*1的二维张量代替，可以避免一些意想不到的一维张量计算结果。
统计代码各部分耗时
# Profile the code placed inside the `with` block (use_cuda=False: CUDA
# timings are not recorded), then print the collected results.
with torch.autograd.profiler.profile(enabled=True, use_cuda=False) as profile:
    ...
print(profile)
或者在命令行运行

python -m torch.utils.bottleneck main.py

# Deep Learning with PyTorch

## PixelShuffle

In [118]:
import torch.nn as nn

# PixelShuffle(r) rearranges a (N, C*r^2, H, W) tensor into (N, C, H*r, W*r):
# here 4 channels of 8x8 become 1 channel of 16x16.
ps = nn.PixelShuffle(upscale_factor=2)
# Only the shape matters for this demo, so the tensor values are left uninitialised.
tensor = torch.Tensor(1, 4, 8, 8)
ps_tensor = ps(tensor)
ps_tensor.shape

torch.Size([1, 1, 16, 16])