### 视觉问题中的注意力机制 
> 1,ViT （图片分类）模型定义  
> 2,DeiT (图片分类)  
> 3,自己写paddle.vision.transforms  

- 训练ViT很不容易
  - 1,vit需要大量的GPU资源
  - 2，ViT与训练数据集JFT-300M没有公开 
  - 3，超参数设置不好很容易Train不出效果  
- 如何更有效的训练ViT模型？   
DeiT方法(Data-efficient image Transforms)

- 把[3,28,28]的图片裁剪成16*[3,7,7]个小图片
- 对每个[3,7,7]的小图片拉直成为[None，147]整张大图变为[16,147]的图片，我们管这个[16,147]的东西叫patches
- patches[16,147] * PatchEmbedding[147,96] = image tokens[16,96]这里的Embedding层中参数是可学习的
- 把得到的结果image tokens[16,96]和图片[3,28,28]一起输入到transformer  

这里的[3,7,7]方块就相当于一个单词，整张[3,28,28]的图片就相当于一个16个单词的句子。最后根据每个“单词”的特点生成[96]的特征向量。  
在单个序列中使用不同位置的注意力用于实现该序列的表征方法  
  
#### Seq2seq注意力机制
![title](http://tiebapic.baidu.com/forum/w%3D580/sign=70182e764d3853438ccf8729a313b01f/f5b2f803918fa0ec4dc8b9177b9759ee3d6ddb04.jpg)  
这里的x1就代表一个单词或者我们上面一个小图转化成的[96]的特征向量
  
注意力机制中x1--> [q1,k1,v1]的过程就叫Embeding

![title](http://tiebapic.baidu.com/forum/w%3D580/sign=401e08b6d1eef01f4d1418cdd0ff99e0/a248c95c103853430f6336f3d613b07ecb808813.jpg)

In [15]:
import paddle 
import paddle.nn as nn 

class Attention(nn.Layer):
    '''
    输入是patches:变成特征向量之后的[8,num_batches,96]
    输出是tokens:[8,num_batches,96]
    '''
    def __init__(self,embed_dim,num_heads,qkv_bias=False,qk_scale=None,dropout=0.,attention_dropout=0.):
        super().__init__()
        self.num_heads  = num_heads
        self.head_dim = int(embed_dim/num_heads)
        self.all_head_dim = self.head_dim * num_heads
        self.qkv = nn.Linear(embed_dim,self.all_head_dim*3,bias_attr=False if qkv_bias is False else None)
        self.scale = self.head_dim ** -0.5 if qk_scale is None else qk_scale 
        self.softmax = nn.Softmax(-1)
        self.proj = nn.Linear(self.all_head_dim,embed_dim)
    
    def transpose_multi_head(self,x):
        new_shape = x.shape[:-1] + [self.num_heads,self.head_dim] 
        x = x.reshape(new_shape)
        x = x.transpose([0,2,1,3])
        # x [B,num_heads,num_patches,head_dim]
        return x 
    def forward(self,x):
        B, N, _ = x.shape # B是batch_size,N是num_patches
        qkv = self.qkv(x).chunk(3,-1)
        # qkv [B,N,all_head_dim] * 3
        q,k,v = map(self.transpose_multi_head,qkv)
        # q,k,v [B,num_heads,num_patches,head_dim]
       
        attn = paddle.matmul(q,k,transpose_y=True) # q*k' 
        attn = self.scale * attn 
        attn = self.softmax(attn) 
        # attn [B,num_heads,N]

        out = paddle.matmul(attn,v)
        # out [B,num_heads,num_patches,head_dim]
        out = out.transpose([0,2,1,3])
        # out [B,num_patches,num_heads,head_dim] [8,16,4,24]
        out = out.reshape([B,N,-1])
        out = self.proj(out)

        return out 

def main():
    t = paddle.randn([8,16,96]) # batchsize=8,16个单词
    model = Attention(embed_dim=96, num_heads=4, qkv_bias=False, qk_scale=None)
    print(model)
    out = model(t)
    print(out.shape)


Attention(
  (qkv): Linear(in_features=96, out_features=288, dtype=float32)
  (softmax): Softmax(axis=-1)
  (proj): Linear(in_features=96, out_features=96, dtype=float32)
)
[8, 16, 96]


有两种方案，PostNorm和PreNorm，大量实验表明，PreNorm效果更好。  
- PostNorm   
$$
BatchNorm(input + Multi-Attention(input))
$$   
- PreNorm  
$$  
input + Multi-Attention(BatchNorm(input))
$$
  
##### BN是把一摞书中每本书的同一页取出来归一化
##### LN是对每本书自己做归一化

### ViT实践 论文复现  
![title](http://tiebapic.baidu.com/forum/w%3D580/sign=14272c957c12b31bc76ccd21b61a3674/457901f41bd5ad6eba6ed438c4cb39dbb7fd3c2b.jpg)

In [35]:
'''
ViT
'''
import paddle 
import paddle.nn as nn
from encoder import MLP as MLP
# Encoder
class EncoderLayer(nn.Layer):
    def __init__(self,embed_dim=768,num_heads=4,qkv_bias=True,mlp_ratio=4,dropout=0.):
        super().__init__()
        self.attn_norm = nn.LayerNorm(embed_dim)
        self.attn = Attention(embed_dim,num_heads)
        self.mlp_norm = nn.LayerNorm(embed_dim)
        self.mlp = MLP(embed_dim,mlp_ratio)
    def forward(self,x):
        h = x 
        x = self.attn_norm(x)
        x = self.attn(x) 
        x = x + h
        return x


class Encoder(nn.Layer):
    def __init__(self,embed_dim,depth):
        super().__init__()
        layer_list = []
        for i in range(depth):
            encoder_layer = EncoderLayer()
            layer_list.append(encoder_layer)
        self.layers = nn.LayerList(layer_list)
        self.norm = nn.LayerNorm(embed_dim)
    def forward(self,x):
        for layer in self.layers:
            x = layer(x)
        x = self.norm(x)
        return x

# PathchEmbedding 
class PatchEmbedding(nn.Layer):
    def __init__(self, image_size=224, patch_size=16, in_channels=3, embed_dim=768, dropout=0.):
        super().__init__()
        self.embed_dim = embed_dim
        n_patches = (image_size // patch_size) * (image_size // patch_size) 
        self.patch_embedding = nn.Conv2D(in_channels=in_channels,
                                         out_channels=embed_dim,
                                         kernel_size=patch_size,
                                         stride=patch_size)
        self.dropout = nn.Dropout(dropout)
        # add class token 
        self.class_token = paddle.create_parameter(
            shape = [1,1,embed_dim],
            dtype='float32',
            default_initializer=nn.initializer.Constant(0.)) #这个不是层，只是一个可学习参数 
        # add position embedding 
        self.position_embedding = paddle.create_parameter(
            shape = [1,n_patches+1,embed_dim],
            dtype='float32',
            default_initializer=nn.initializer.TruncatedNormal())
    def forward(self,x):
        # [n,c,h,w]
        cls_tokens = self.class_token.expand([x.shape[0],1,self.embed_dim])
        x = self.patch_embedding(x) # [n,embed_dim,h',w']
        x = x.flatten(2)
        x = x.transpose([0,2,1])
        x = paddle.concat([cls_tokens,x],axis=1)
        x = x + self.position_embedding 
        return x


# main network
class VisualTransformer(nn.Layer):
    def __init__(self,
                 image_size=224,
                 patch_size=16,
                 in_channels=3,
                 num_classes=1000,
                 embed_dim=768,
                 depth=3,
                 num_heads=8,
                 mlp_ratio=4,
                 qkv_bias=True,
                 dropout=0.,
                 attention_dropout=0.,
                 droppath=0.):
        super().__init__()
        self.embed_dim = embed_dim
        self.patch_embedding = PatchEmbedding(image_size,patch_size,in_channels,embed_dim)
        self.encoder = Encoder(embed_dim,depth)
        self.classifier = nn.Linear(embed_dim,num_classes)
    def forward(self,x):
        # x [N,C,H,W]
        N,C,H,W = x.shape
        x = self.patch_embedding(x)
        # x [N,embed_dim,h',w'] h'=H/patch_size w'=W/patchsize
        x = x.flatten(2) #把二三维度变成h*w
        # x [N,embed_dim,h'*w'] = [N,embed_dim,num_patches]
        # x = x.transpose([0,2,1])
        # x [N,num_patches,embed_dim]
        x = self.encoder(x)
        x = self.classifier(x[:,0])
        return x 




def main():
    vit = VisualTransformer()
    paddle.summary(vit,(4,3,224,224))

# main()

ViT模型需要很强大的GPU，预训练数据集JFT-300M没有公开 
鉴于此，DeiT模型很好地解决了此问题  
![title](https://gimg2.baidu.com/image_search/src=http%3A%2F%2Fpic1.zhimg.com%2Fv2-957ba527a96523b4882173801897b860_b.jpg&refer=http%3A%2F%2Fpic1.zhimg.com&app=2002&size=f9999,10000&q=a80&n=0&g=0n&fmt=jpeg?sec=1647598499&t=c4880640ae34d415c97c077a31948c6a)  
DeiT加入了Distillation(知识蒸馏)，就是teacher-student  
  
DeiT知识蒸馏 
下图是soft-Distillation  
![title](http://tiebapic.baidu.com/forum/w%3D580/sign=fa0191f3d01001e94e3c1407880f7b06/296b6463f6246b6055fe6a6eb6f81a4c500fa29f.jpg)


![title](http://tiebapic.baidu.com/forum/w%3D580/sign=3aa4a80954f79052ef1f47363cf3d738/3e153612b31bb051e21968326b7adab44aede04a.jpg)  
Data Augmentation 
- (1)Random Erease
- (2)Mixup ,Cutmix
- (3)Droppath
- (4)EMA
- (5)AutoAug  
![title](http://tiebapic.baidu.com/forum/w%3D580/sign=02e4de64781f95caa6f592bef9167fc5/b84cb6389b504fc2408c6b1fb8dde71191ef6db8.jpg)  
  
AutoAugmentation是25种变化打包在一起的一种方式，谷歌自己搞得。 每次Aug时从25种里面随机选一个用。  
RandAug比上面的简便一些，是13个policy。  
- Model EMA 是一种利用上一轮和这一轮模型参数 加权求和的方法

In [39]:
# 普通ViT的patch_embedding,模型主干稍微修改
'''
DeiT
'''
class Encoder_DeiT(nn.Layer):
    def __init__(self,embed_dim,depth):
        super().__init__()
        layer_list = []
        for i in range(depth):
            encoder_layer = EncoderLayer()
            layer_list.append(encoder_layer)
        self.layers = nn.LayerList(layer_list)
        self.norm = nn.LayerNorm(embed_dim)
    def forward(self,x):
        for layer in self.layers:
            x = layer(x)
        x = self.norm(x)
        return x[:,0],x[:,1]

class PatchEmbedding_DeiT(nn.Layer):
    def __init__(self, image_size=224, patch_size=16, in_channels=3, embed_dim=768, dropout=0.):
        super().__init__()
        self.embed_dim = embed_dim
        n_patches = (image_size // patch_size) * (image_size // patch_size) 
        self.patch_embedding = nn.Conv2D(in_channels=in_channels,
                                         out_channels=embed_dim,
                                         kernel_size=patch_size,
                                         stride=patch_size)
        self.dropout = nn.Dropout(dropout)
        # add class token 
        self.class_token = paddle.create_parameter(
            shape = [1,1,embed_dim],
            dtype='float32',
            default_initializer=nn.initializer.Constant(0.)) #这个不是层，只是一个可学习参数 
        # add distill token 
        self.distill_token = paddle.create_parameter(
            shape = [1,1,embed_dim],
            dtype='float32',
            default_initializer=nn.initializer.TruncatedNormal(0.02)) #这个不是层，只是一个可学习参数 
        # add position embedding 
        self.position_embedding = paddle.create_parameter(
            shape = [1,n_patches+2,embed_dim],
            dtype='float32',
            default_initializer=nn.initializer.TruncatedNormal(.02))
    def forward(self,x):
        # [n,c,h,w]
        cls_tokens = self.class_token.expand([x.shape[0],1,self.embed_dim])
        distill_tokens = self.distill_token.expand((x.shape[0],-1,-1))
        x = self.patch_embedding(x) # [n,embed_dim,h',w']
        x = x.flatten(2)
        x = x.transpose([0,2,1])
        x = paddle.concat([cls_tokens,distill_tokens,x],axis=1)
        x = x + self.position_embedding 
        x = self.dropout(x)
        return x


class DeiT(nn.Layer): 
    def __init__(self,
                img_size=224,
                patch_size=16, 
                in_channels=3,
                num_classes=1000,
                embed_dim=768,
                depth=3,
                num_heads=8,
                mlp_ratio=1,
                qkv_bias=True,
                dropout=0.,
                attention_dropout=0.,
                droppath=0.):
        super().__init__() 
        self.patch_embedding = PatchEmbedding_DeiT(224,16,3,768)
        self.encoder = Encoder_DeiT(embed_dim,depth)
        self.head = nn.Linear(embed_dim,num_classes)
        self.head_distill = nn.Linear(embed_dim,num_classes)
        
    def forward(self,x):
        x = self.patch_embedding(x)  
        x,x_distill = self.encoder(x)
        x = self.head(x)
        x_distill = self.head_distill(x_distill)
        if self.training:
            return x,x_distill 
        else: 
            return (x + x_distill)/2 
        

def main():
    model = DeiT()
    paddle.summary(model,(4,3,224,224)) 


---------------------------------------------------------------------------------
    Layer (type)          Input Shape          Output Shape         Param #    
      Conv2D-8         [[4, 3, 224, 224]]    [4, 768, 14, 14]       590,592    
     Dropout-26         [[4, 198, 768]]       [4, 198, 768]            0       
PatchEmbedding_DeiT-3  [[4, 3, 224, 224]]     [4, 198, 768]         153,600    
    LayerNorm-43        [[4, 198, 768]]       [4, 198, 768]          1,536     
     Linear-108         [[4, 198, 768]]       [4, 198, 2304]       1,769,472   
     Softmax-33        [[4, 4, 198, 198]]    [4, 4, 198, 198]          0       
     Linear-109         [[4, 198, 768]]       [4, 198, 768]         590,592    
    Attention-33        [[4, 198, 768]]       [4, 198, 768]            0       
   EncoderLayer-20      [[4, 198, 768]]       [4, 198, 768]            0       
    LayerNorm-45        [[4, 198, 768]]       [4, 198, 768]          1,536     
     Linear-112         [[4, 198, 768]

### 自己写paddle.vision.transforms

In [53]:
import paddle.vision.transforms as T  
import numpy as np 
from PIL import Image 
import paddle 
import matplotlib.pyplot as plt


def crop(img,region):
    img = T.crop(img,*region)
    return img
class CenterCrop():
    def __init__(self,size):
        self.size = size 
    def __call__(self,image):
        w,h,_ = image.size
        ch,cw = self.size 
        crop_top = int(round(h-ch)/2.)
        crop_left = int(round(w-cw)/2.)
        return crop(image,(crop_top,crop_left,ch,cw))
class Resize():
    def __init__(self,size):
        self.size=size 
    def __call__(self,image):
        return T.Resize(image,self.size)

class ToTensor():
    def __init__(self):
        pass
    def __call__(self,image): 
        w,h = image.size
        image = paddle.to_tensor(np.array(image))
        if image.dtype == paddle.uint8:
            image = paddle.cast(image,dtype='float32')/255.0 #强制类型转换 + 归一化 
        image = image.transpose([2,0,1]) 
        return image 

class Compose():
    def __init__(self,transforms):
        self.transforms = transforms 
    def __call__(self,image):
        '''
        把class当函数用
        '''
        for t in self.transforms:
            image = t(image)
        return image

def main():
    img = Image.open('img.jpg')
    transforms = Compose([
                        Resize([256,256]),
                        CenterCrop([112,112]),
                        ToTensor()
                        ])
    out = transforms(img)
ss



AssertionError: 