In [1]:
'''
Resnet18在cifar10上训练
'''
import paddle
import paddle.nn as nn 

class Identity(nn.Layer):
    """Pass-through layer whose ``forward`` returns its input unchanged.

    Used as the shortcut branch of a residual block when no projection
    is required.
    """

    def __init__(self):
        super(Identity, self).__init__()

    def forward(self,x):
        """Return ``x`` as-is."""
        return x

class Block(nn.Layer):
    """Residual block made of two 3x3 convolutions with a shortcut connection.

    Main branch: conv3x3(stride) -> BN -> ReLU -> conv3x3(1) -> BN.
    The shortcut output is added to the main branch and a final ReLU is applied.

    Args:
        in_dim: number of input channels.
        out_dim: number of output channels.
        stride: stride of the first 3x3 convolution (and of the shortcut
            projection when one is created).
    """

    def __init__(self,in_dim,out_dim,stride):
        super().__init__()
        self.conv1 = nn.Conv2D(in_channels=in_dim,
                               out_channels=out_dim,
                               kernel_size=3,
                               stride=stride,
                               padding=1,
                               bias_attr=False)
        self.bn1 = nn.BatchNorm2D(out_dim)
        self.conv2 = nn.Conv2D(in_channels=out_dim,
                               out_channels=out_dim,
                               kernel_size=3,
                               stride=1,
                               padding=1,
                               bias_attr=False)
        self.bn2 = nn.BatchNorm2D(out_dim)
        self.relu = nn.ReLU()
        # The shortcut must be projected whenever the main branch changes the
        # tensor shape: any non-unit stride (not only stride 2) or a change in
        # channel count. Otherwise the addition in forward() would mismatch.
        if stride != 1 or in_dim != out_dim:
            self.downsample = nn.Sequential(
                nn.Conv2D(in_dim,out_dim,1,stride=stride),
                nn.BatchNorm2D(out_dim),
            )
        else:
            self.downsample = Identity()

    def forward(self,x):
        """Apply the residual block to ``x`` and return the activated sum."""
        shortcut_in = x
        x = self.relu(self.bn1(self.conv1(x)))
        x = self.bn2(self.conv2(x))
        # Add the (possibly projected) input back before the last activation.
        x = x + self.downsample(shortcut_in)
        return self.relu(x)


class ResNet18(nn.Layer):
    """ResNet-18 style classifier with a 3x3 stride-1 stem.

    The network is a stem (conv3x3 -> BN -> ReLU), four stages of residual
    ``Block`` layers with 64/128/256/512 channels, global average pooling and
    a linear classifier.

    Args:
        in_dim: number of channels produced by the stem convolution.
        num_classes: size of the classifier output.
    """

    def __init__(self,in_dim=64,num_classes=10):
        super().__init__()
        # Tracks the channel count feeding the next stage; updated by _make_layer.
        self.in_dim = in_dim
        self.conv1 = nn.Conv2D(in_channels=3,
                               out_channels=in_dim,
                               kernel_size=3,
                               stride=1,
                               padding=1,
                               bias_attr=False)
        self.bn1 = nn.BatchNorm2D(in_dim)
        self.relu = nn.ReLU()
        # blocks
        self.layer1 = self._make_layer(dim=64,n_blocks=2,stride=1)
        self.layer2 = self._make_layer(dim=128,n_blocks=2,stride=2)
        self.layer3 = self._make_layer(dim=256,n_blocks=2,stride=2)
        self.layer4 = self._make_layer(dim=512,n_blocks=2,stride=2)

        # head layer
        # Global pooling reduces every feature map to 1x1, so the classifier
        # input is just the channel count of the last stage (512) and no
        # longer depends on the input image resolution.
        self.avgpool = nn.AdaptiveAvgPool2D(1)
        self.classifier = nn.Linear(512,num_classes)

    def _make_layer(self,dim,n_blocks,stride):
        """Build one stage containing exactly ``n_blocks`` residual blocks.

        The first block applies ``stride`` and maps ``self.in_dim`` channels to
        ``dim``; the remaining ``n_blocks - 1`` blocks keep shape unchanged.
        """
        layer_list = [Block(self.in_dim,dim,stride=stride)]
        self.in_dim = dim
        # The first block is already in the list, so only n_blocks - 1 remain.
        for _ in range(n_blocks - 1):
            layer_list.append(Block(self.in_dim,dim,stride=1))
        return nn.Sequential(*layer_list)

    def forward(self,x):
        """Return class logits for a batch of images ``x``."""
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu(x)
        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)
        x = self.avgpool(x)
        x = x.flatten(1)
        x = self.classifier(x)
        return x

def test_model():
    """Smoke test: run one forward pass on random input, then print a summary."""
    input_shape = (4,3,32,32)
    dummy = paddle.randn(list(input_shape))
    net = ResNet18()
    # The forward result is not inspected; the call only checks that it runs.
    net(dummy)
    paddle.summary(net,input_shape)

# dataset----------------------------------------------------------------------------------
from paddle.io import Dataset
from paddle.io import DataLoader 
from paddle.vision import datasets 
import paddle.vision.transforms as T  

def get_transforms(mode='train'):
    """Build the image preprocessing pipeline for the given split.

    Args:
        mode: ``'train'`` enables augmentation (random crop with padding and
            random horizontal flip); any other value yields the plain
            evaluation pipeline.

    Returns:
        A ``T.Compose`` that ends with tensor conversion and normalization.
    """
    # Per-channel statistics shared by both pipelines.
    normalize = T.Normalize(mean=[0.4914,0.4822,0.4465],std=[0.2023, 0.1994, 0.2010])
    if mode == 'train':
        return T.Compose([
            T.RandomCrop(32,padding=4),
            T.RandomHorizontalFlip(),
            T.ToTensor(),
            normalize,
        ])
    # The transforms module is imported as `T`; use it here as well.
    return T.Compose([
        T.ToTensor(),
        normalize,
    ])

def get_dataset(name='cifar10',mode='train'):
    """Return the dataset called ``name`` for the requested split.

    Args:
        name: dataset identifier; only ``'cifar10'`` is supported.
        mode: split passed to the dataset and to ``get_transforms``.

    Raises:
        ValueError: if ``name`` is not a supported dataset.
    """
    # Fail with a clear message instead of falling through to an unbound local.
    if name != 'cifar10':
        raise ValueError(f"Unsupported dataset name: {name!r} (only 'cifar10' is available)")
    return datasets.Cifar10(mode=mode,transform=get_transforms(mode=mode))

def get_dataloader(dataset,batch_size=128,mode='train'):
    """Wrap ``dataset`` in a ``DataLoader``.

    Args:
        dataset: dataset to iterate over.
        batch_size: number of samples per batch.
        mode: ``'train'`` shuffles the samples every epoch; any other value
            keeps the dataset order.
    """
    # Only the training split is shuffled; evaluation keeps the dataset order.
    dataloader = DataLoader(dataset,batch_size=batch_size,shuffle=(mode == 'train'))
    return dataloader

# utils----------------------------------------------------------------------------------
class AverageMeter():
    """Running weighted mean of a scalar metric.

    Attributes:
        sum: weighted total of all values seen since the last reset.
        cnt: total weight (e.g. number of samples) seen since the last reset.
        avg: ``sum / cnt``, refreshed on every ``update``.
    """

    def __init__(self):
        self.reset()

    def reset(self):
        """Set all accumulated statistics back to zero."""
        self.avg = self.sum = self.cnt = 0

    def update(self,val,n=1):
        """Fold in ``val`` observed with weight ``n`` and refresh the mean."""
        weighted = val * n
        self.sum += weighted
        self.cnt += n
        self.avg = self.sum / self.cnt

# start train!
def train_one_epoch(model,dataloader,criterion,optimizer,epoch):
    """Train ``model`` for one pass over ``dataloader``.

    Args:
        model: network to train; switched to training mode here.
        dataloader: iterable yielding batches whose first two items are the
            images and the labels.
        criterion: loss function called as ``criterion(logits, label)``.
        optimizer: optimizer stepped and cleared once per batch.
        epoch: index of the current epoch (currently unused).

    Returns:
        Tuple ``(mean_loss, mean_accuracy)`` averaged over all samples seen.
    """
    model.train()
    loss_meter = AverageMeter()
    acc_meter = AverageMeter()
    for batch_id,data in enumerate(dataloader):
        image = data[0]
        label = data[1]

        out = model(image)
        loss = criterion(out,label)

        loss.backward()
        optimizer.step()
        optimizer.clear_grad()

        pred = nn.functional.softmax(out,axis=1)
        acc = paddle.metric.accuracy(pred,label.unsqueeze(-1))

        batch_size = image.shape[0]
        # .item() extracts the scalar from both 0-D and single-element 1-D
        # tensors; indexing numpy()[0] fails when the tensor is 0-D.
        loss_meter.update(loss.item(),batch_size)
        acc_meter.update(acc.item(),batch_size)

        # Progress is reported for every batch except the first one.
        if batch_id > 0:
            print(f'----- Batch {batch_id}, Loss {loss_meter.avg}, Acc {acc_meter.avg}')

    return loss_meter.avg, acc_meter.avg
            
def main():
    """Train ResNet18 on the CIFAR-10 training split.

    Uses momentum SGD with weight decay and a cosine-annealed learning rate
    that is stepped once per epoch.
    """
    EPOCH_NUM = 200
    BATCH_SIZE = 16

    model = ResNet18(num_classes=10)
    train_dataset = get_dataset(mode='train')
    train_dataloader = get_dataloader(train_dataset,mode='train',batch_size=BATCH_SIZE)
    criterion = nn.CrossEntropyLoss()
    # One name for the scheduler everywhere: it is handed to the optimizer
    # and stepped at the end of every epoch below.
    scheduler = paddle.optimizer.lr.CosineAnnealingDecay(0.02,EPOCH_NUM)
    optimizer = paddle.optimizer.Momentum(learning_rate=scheduler,
                                          parameters=model.parameters(),
                                          momentum=0.9,
                                          weight_decay=5e-4)
    for epoch in range(EPOCH_NUM):
        train_one_epoch(model,train_dataloader,criterion,optimizer,epoch)
        scheduler.step()



# Start training only when executed directly (script or notebook cell), so
# importing the definitions above does not launch the full training run.
if __name__ == "__main__":
    main()

KeyboardInterrupt: 

![title](http://tiebapic.baidu.com/forum/w%3D580/sign=4bb503daaa36afc30e0c3f6d8318eb85/6fbede00baa1cd113a713c3ae412c8fcc2ce2d43.jpg)
  
- vit并没有使用decoder,只用了encoder  
- ViT的结构很简单: 
image tokens -> encoders -> class label
- 如何得到图像分词(image tokens):
![title](http://tiebapic.baidu.com/forum/w%3D580/sign=abcddbc5931b9d168ac79a69c3deb4eb/82ac39d3d539b600e7f846adb450352ac65cb75d.jpg)
- 通常在Feed Forward中我们用的激活函数是Gelu 
![title](https://pic4.zhimg.com/v2-efda13b22f7bee7fbb02eb529ba017c3_r.jpg)




In [4]:
import paddle 
from PIL import Image 
import numpy as np 
import matplotlib.pyplot as plt 
import paddle.nn as nn
'''
预备内容教学
'''
# Scratch cell: tensor-creation warm-up followed by a minimal ViT demo.
# `if True:` only groups the cell body under a single indented block.
if True:
    '''创建'''
    # (marker above: "creation" -- ways of building a tensor)
    # t = paddle.zeros([3,3])
    # t = paddle.randn([3,3])
    # Load the sample image, resize it to 560x560 and wrap it in a tensor.
    img = np.array(Image.open('./bingdundun.jpg').resize((560,560)) )
    #plt.imshow(img)
    t = paddle.to_tensor(img)
    '''type是tensor类型，dtype是里面存的数据的类型'''
    # (note above: `type` is the Tensor class, `dtype` is the type of the
    # elements stored inside it)
    # t = paddle.randint(0,10,[5,15])
    # qkv = t.chunk(3,-1)
    # q,k,v = qkv
    '''patch_embeded'''
    # (marker above: the patch-embedding section starts here)
    #----------------------------------
    class Identity(nn.Layer):
        """Pass-through layer whose ``forward`` returns its input unchanged."""

        def __init__(self):
            super(Identity, self).__init__()

        def forward(self,x):
            """Return ``x`` as-is."""
            return x
    #----------------------------------
    class MLP(nn.Layer):
        """Two-layer feed-forward network: Linear -> GELU -> Dropout -> Linear -> Dropout.

        Args:
            embed_dim: size of the input and output features.
            mlp_ratio: hidden width as a multiple of ``embed_dim``.
            dropout: dropout probability applied after each linear layer.
        """

        def __init__(self,embed_dim,mlp_ratio=4.0,dropout=0.):
            super().__init__()
            hidden_dim = int(embed_dim * mlp_ratio)
            self.fc1 = nn.Linear(embed_dim,hidden_dim)
            self.fc2 = nn.Linear(hidden_dim,embed_dim)
            self.act = nn.GELU()
            # Honour the requested rate instead of the layer's built-in default.
            self.dropout = nn.Dropout(dropout)

        def forward(self,x):
            """Apply the feed-forward network to the last dimension of ``x``."""
            x = self.fc1(x)
            x = self.act(x)
            x = self.dropout(x)
            x = self.fc2(x)
            x = self.dropout(x)

            return x
    #----------------------------------
    class PatchEmbedding(nn.Layer):
        """Turn an image batch into a sequence of patch tokens.

        A convolution whose kernel size and stride both equal ``patch_size``
        maps every non-overlapping patch to an ``embed_dim`` vector. The
        convolution has no bias and its weights are initialised to 1.0.

        Args:
            image_size: accepted for interface symmetry; not used here.
            patch_size: side length of each square patch.
            in_channels: number of channels in the input images.
            embed_dim: size of each produced token.
            dropout: dropout probability applied to the tokens.
        """

        def __init__(self,image_size,patch_size,in_channels,embed_dim,dropout=0):
            super().__init__()
            ones_init = paddle.ParamAttr(initializer=nn.initializer.Constant(1.0))
            self.patch_embed = nn.Conv2D(
                in_channels,
                embed_dim,
                kernel_size=patch_size,
                stride=patch_size,
                weight_attr=ones_init,
                bias_attr=False,
            )
            self.dropout = nn.Dropout(dropout)

        def forward(self,x):
            """Map ``x`` of shape [n, c, h, w] to tokens [n, h'*w', embed_dim]."""
            patches = self.patch_embed(x)      # [n, embed_dim, h', w']
            # Merge the spatial grid into one axis, then put it before channels.
            tokens = patches.flatten(2).transpose([0,2,1])
            return self.dropout(tokens)
    #----------------------------------
    class Encoder(nn.Layer):
        """Pre-norm encoder layer with two residual sub-blocks.

        The attention sub-block is an ``Identity`` placeholder here; the
        second sub-block is the ``MLP``. Each one is preceded by a LayerNorm
        and wrapped in a residual connection.

        Args:
            embed_dim: feature size of the tokens.
        """

        def __init__(self,embed_dim):
            super().__init__()
            self.attn = Identity()
            self.attn_norm = nn.LayerNorm(embed_dim)
            self.mlp = MLP(embed_dim)
            self.mlp_norm = nn.LayerNorm(embed_dim)

        def forward(self,x):
            """Run both residual sub-blocks on ``x`` and return the result."""
            # Residual around norm -> attention placeholder.
            x = x + self.attn(self.attn_norm(x))
            # Residual around norm -> MLP.
            x = x + self.mlp(self.mlp_norm(x))
            return x

    #----------------------------------
    class ViT(nn.Layer):
        """Minimal ViT-style classifier: patch embedding -> encoders -> pooled head.

        All arguments are optional; the defaults reproduce the original
        fixed configuration.

        Args:
            image_size: image side length forwarded to ``PatchEmbedding``.
            patch_size: side length of each square patch.
            in_channels: number of channels in the input images.
            embed_dim: token feature size used throughout the model.
            depth: number of stacked ``Encoder`` layers.
            num_classes: size of the classifier output.
        """

        def __init__(self,image_size=224,patch_size=7,in_channels=3,
                     embed_dim=16,depth=5,num_classes=10):
            super().__init__()
            self.patch_embed = PatchEmbedding(image_size,patch_size,in_channels,embed_dim)
            layer_list = [Encoder(embed_dim) for _ in range(depth)]
            self.encoders = nn.LayerList(layer_list)
            self.head = nn.Linear(embed_dim,num_classes)
            self.avgpool = nn.AdaptiveAvgPool1D(1)

        def forward(self,x):
            """Return class logits of shape [n, num_classes] for images ``x``."""
            x = self.patch_embed(x)
            for encoder in self.encoders:
                x = encoder(x)
            # Average over the token axis to get one feature vector per image.
            x = x.transpose([0,2,1]) # [n,h'*w',c] ---> [n,c,h'*w']
            x = self.avgpool(x) # [n,c,1]
            x = x.flatten(1) # [n,c]
            x = self.head(x)
            return x
    # ====================================================
    # sample = paddle.randn([28,28]).astype('float32')
    # sample = sample.reshape([1,1,28,28])
    # patch_embeded = PatchEmbedding(image_size=28,patch_size=7,in_channels=1,embed_dim=1)
    # out = patch_embeded(sample)
    # mlp = MLP(embed_dim=1)
    # out = mlp(out)
    # print(out.shape)
    # Demo: push a batch of four copies of the sample image through the model.
    # Force three channels so the array always matches the model's 3-channel input.
    img = np.array(Image.open('./bingdundun.jpg').convert('RGB').resize((224,224)))
    # The array is laid out (H, W, C); move channels first WITHOUT swapping
    # height and width, giving (C, H, W).
    img = img.transpose(2,0,1)
    imgs = np.stack([img,img,img,img]).astype('float32')
    imgs_t = paddle.to_tensor(imgs,dtype='float32')
    model = ViT()
    out = model(imgs_t)
    print(out.shape)



[4, 10]


- ViT的本质就是使用CNN将图片转为序列（sequence），然后再使用Attention