![title](2019040813401989.png)

In [7]:
import torch
from torch import nn
from torch.functional import F
import copy
import numpy as np
from torch import Variable

In [14]:
class EncoderDecoder(nn.Module):
    """Overall model skeleton that holds the Transformer's building blocks.

    The constructor only stores the five components it receives;
    ``generate`` is the classifier applied after the decoder.  The forward
    pass is not implemented here.
    """

    def __init__(self, encoder, decoder, input_emd, out_emd, generate):
        super().__init__()
        # Attach each component under the attribute named after its argument,
        # in the same order as the constructor signature.
        components = (
            ("encoder", encoder),
            ("decoder", decoder),
            ("input_emd", input_emd),
            ("out_emd", out_emd),
            ("generate", generate),
        )
        for attr_name, component in components:
            setattr(self, attr_name, component)

    def forward(self, inputword, outputword):
        """Placeholder: performs no computation and returns ``None``."""
        return None

In [15]:
# Output head ("generate"): a linear layer followed by a log-softmax.
class generate(nn.Module):
    """Project features of width ``out_size`` to ``vocabsize`` log-probabilities."""

    def __init__(self, out_size, vocabsize):
        super().__init__()
        self.linear = nn.Linear(out_size, vocabsize)

    def forward(self, x):
        """Apply the linear projection, then log-softmax over the last axis."""
        logits = self.linear(x)
        return F.log_softmax(logits, dim=-1)

In [1]:
def clones(module, N):
    """Return an ``nn.ModuleList`` holding ``N`` deep copies of ``module``.

    The encoder and decoder are both stacks of N layers; deep-copying one
    layer keeps the construction code short while giving each copy its own
    parameters.
    """
    copies = nn.ModuleList()
    for _ in range(N):
        copies.append(copy.deepcopy(module))
    return copies

In [5]:
# Encoder: a stack of identical encoder layers applied one after another.
class encoder(nn.Module):
    """Run the input through ``N`` deep-copied instances of ``encoderlayer``.

    Args:
        encoderlayer: the layer module to replicate.
        N: number of copies in the stack (defaults to 6, the previously
            hard-coded depth).
    """

    def __init__(self, encoderlayer, N=6):
        super().__init__()
        self.layers = clones(encoderlayer, N)

    def forward(self, x, mask=None):
        """Feed ``x`` through every layer in order and return the result.

        When ``mask`` is given it is forwarded to each layer as a second
        argument; when it is omitted each layer is called with ``x`` only,
        which keeps the original single-argument behaviour.
        """
        for layer in self.layers:
            x = layer(x) if mask is None else layer(x, mask)
        return x

In [6]:
# Residual connection: the "Add & Norm" box in the architecture figure.
class SublayerConnection(nn.Module):
    """Residual connection around a sub-layer, followed by layer normalisation.

    Author's rationale for LayerNorm: sentences have different lengths, so
    batches are zero-padded, and BatchNorm is not suitable in that situation.

    Args:
        sentance_size: size of the second-to-last input axis.
        word_size: size of the last input axis.
        dropout: dropout probability applied to the sub-layer output
            (defaults to 0.2, the previously hard-coded value).

    Note:
        ``nn.LayerNorm`` is built with the shape
        ``[sentance_size, word_size]``, so it normalises over the last two
        axes together and the input must have exactly that trailing shape.
    """

    def __init__(self, sentance_size, word_size, dropout=0.2):
        super().__init__()
        self.layernorm = nn.LayerNorm([sentance_size, word_size], eps=1e-5)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, sublayer):
        """Return ``layernorm(x + dropout(sublayer(x)))``.

        ``sublayer`` is any callable that maps ``x`` to a tensor which can be
        added to ``x``.
        """
        residual = self.dropout(sublayer(x))
        return self.layernorm(x + residual)

In [8]:
# Encoder layer: self-attention followed by a feed-forward network.
class encoderlayer(nn.Module):
    """One encoder layer made of two residual sub-layers.

    Args:
        feedforward: callable applied in the second sub-layer.
        self_attn: attention callable invoked as
            ``self_attn(query, key, value, mask)``.
        sentance_size, word_size: forwarded to ``SublayerConnection``.
    """

    def __init__(self, feedforward, self_attn, sentance_size, word_size):
        super().__init__()
        self.self_attn = self_attn
        self.feedforward = feedforward
        self.layers = clones(SublayerConnection(sentance_size, word_size), 2)

    def forward(self, x, mask=None):
        """Apply self-attention and then the feed-forward sub-layer.

        ``mask`` is handed to the attention callable unchanged; per the
        author's note its purpose is to remove the influence of padding
        during training.
        """
        # SublayerConnection expects a callable, so the attention call is
        # wrapped in a lambda that uses the same tensor as query, key and value.
        x = self.layers[0](x, lambda t: self.self_attn(t, t, t, mask))
        return self.layers[1](x, self.feedforward)

In [14]:
# Decoder: a stack of N identical decoder layers.
class Decoder(nn.Module):
    """Chain ``N`` deep copies of ``layer``, each fed the previous one's output."""

    def __init__(self, layer, N):
        super().__init__()
        self.layers = clones(layer, N)

    def forward(self, x, memory, second_mask, first_mask):
        """Thread ``x`` through the stack.

        ``memory`` and both masks are passed unchanged to every layer.
        """
        hidden = x
        for block in self.layers:
            hidden = block(hidden, memory, second_mask, first_mask)
        return hidden

In [15]:
# Decoder layer: self-attention, attention over the encoder output, feed-forward.
class DecoderLayer(nn.Module):
    """One decoder layer made of three residual sub-layers.

    Args:
        sentance_size, word_size: forwarded to ``SublayerConnection``.
        self_attn: first attention callable (over the decoder input itself).
        out_attn: second attention callable (over ``memory``).
        feed_forward: callable applied in the last sub-layer.

    Both attention callables are invoked as ``attn(query, key, value, mask)``.
    """

    def __init__(self, sentance_size, word_size, self_attn, out_attn, feed_forward):
        super().__init__()
        self.self_attn = self_attn  # first attention sub-layer
        self.out_attn = out_attn  # second attention sub-layer
        self.feed_forward = feed_forward
        self.layers = clones(SublayerConnection(sentance_size, word_size), 3)

    def forward(self, x, memory, second_mask, first_mask):
        """Run the three sub-layers in order and return the result.

        ``first_mask`` goes to the self-attention call and ``second_mask`` to
        the attention over ``memory`` (the encoder output).
        """
        m = memory
        x = self.layers[0](x, lambda t: self.self_attn(t, t, t, first_mask))
        # Queries come from the decoder stream; keys and values come from the
        # encoder output.
        x = self.layers[1](x, lambda t: self.out_attn(t, m, m, second_mask))
        return self.layers[2](x, self.feed_forward)
    
        

In [25]:

# Demo: boolean masks taken from the strict (k=1) and inclusive (k=0) upper
# triangles of a 4x4 matrix of ones; True marks the entries that are zero.
m = np.ones((4, 4))
k1, k2 = np.triu(m, k=1), np.triu(m, k=0)
print(torch.from_numpy(k1) == 0, torch.from_numpy(k2) == 0, sep="\n")


tensor([[ True, False, False, False],
        [ True,  True, False, False],
        [ True,  True,  True, False],
        [ True,  True,  True,  True]])
tensor([[False, False, False, False],
        [ True, False, False, False],
        [ True,  True, False, False],
        [ True,  True,  True, False]])


In [22]:
# Build the mask; see the printed demo above for what it looks like.
def subsequent_mask(word_size):
    """Return a boolean tensor of shape ``(1, word_size, word_size)``.

    Entries on and below the main diagonal are True; entries strictly above
    it are False.
    """
    ones = np.ones((1, word_size, word_size))
    above_diagonal = np.triu(ones, k=1).astype('uint8')
    return torch.from_numpy(above_diagonal).eq(0)

![title](20190408141116387.png)

In [26]:
# Scaled dot-product attention.
def attention(query, key, value, mask=None):
    """Compute ``softmax(query @ key^T / sqrt(dk)) @ value``.

    Args:
        query, key, value: tensors whose last two axes are multiplied with
            ``torch.matmul``; ``dk`` is the size of ``query``'s last axis.
        mask: optional tensor broadcastable to the score matrix.  Positions
            where ``mask == 0`` are filled with -1e9 before the softmax, so
            they receive (numerically) zero weight.

    Returns:
        A tuple ``(output, weights)``: the weighted sum of ``value`` and the
        attention weights themselves.
    """
    dk = query.size(-1)
    # For 3-D inputs the matmul is equivalent to
    # torch.einsum('bij,bkj->bik', query, key); the multi-head case adds an axis.
    # dk is a plain Python int, so scale with ``** 0.5``: torch.sqrt only
    # accepts tensors.
    s = torch.matmul(query, key.transpose(-2, -1)) / (dk ** 0.5)
    if mask is not None:
        s = s.masked_fill(mask == 0, -1e9)
    p = F.softmax(s, dim=-1)
    return torch.matmul(p, value), p
    

多头自注意力模型
![title](transformer_38_0.png)

In [8]:
class MultiHeadedAttention(nn.Module):
    """Multi-head attention built from four ``word_size x word_size`` linear layers.

    Args:
        word_size: width of the input and output features.
        h: number of heads; must be positive and divide ``word_size``.

    Raises:
        ValueError: if ``h`` is not positive or does not divide ``word_size``.
    """

    def __init__(self, word_size, h):
        super().__init__()
        # Fail fast: with floor division h * dk would differ from word_size
        # and the reshapes in forward() would break or mis-align the heads.
        if h <= 0 or word_size % h != 0:
            raise ValueError(
                "word_size (%r) must be divisible by a positive number of heads h (%r)"
                % (word_size, h)
            )
        self.h = h
        self.word_size = word_size
        self.dk = word_size // h  # per-head width
        self.linears = clones(nn.Linear(word_size, word_size), 4)

    def forward(self, query, key, value, mask=None):
        """Project q/k/v, attend per head, merge the heads and project again."""
        if mask is not None:
            # Insert an axis at position 1 so the mask broadcasts over heads.
            mask = mask.unsqueeze(1)
        batches = query.size(0)
        # The first three linear layers project q, k and v; each result is
        # split into heads with shape (batch, h, seq, dk).
        query, key, value = [
            linear(x).view(batches, -1, self.h, self.dk).transpose(1, 2)
            for linear, x in zip(self.linears, (query, key, value))
        ]
        x, _ = attention(query, key, value, mask)
        # Merge the heads back into a single (batch, seq, h * dk) tensor.
        x = x.transpose(1, 2).contiguous().view(batches, -1, self.h * self.dk)
        # The fourth linear layer is the output projection.
        return self.linears[-1](x)

In [9]:
# Feed-forward block: a plain two-layer network.
class PositionwiseFeedForward(nn.Module):
    """Linear -> ReLU -> dropout -> Linear, mapping ``word_size`` back to ``word_size``."""

    def __init__(self, word_size, d_ff, dropout=0.1):
        super().__init__()
        self.w_1 = nn.Linear(word_size, d_ff)
        self.w_2 = nn.Linear(d_ff, word_size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Expand to ``d_ff`` features, apply ReLU and dropout, then project back."""
        hidden = F.relu(self.w_1(x))
        hidden = self.dropout(hidden)
        return self.w_2(hidden)

In [10]:
# Embedding layer: a thin wrapper around nn.Embedding.
class Embeddings(nn.Module):
    """Look up a ``word_size``-wide vector for each of ``vocab`` indices."""

    def __init__(self, word_size, vocab):
        super().__init__()
        self.emd = nn.Embedding(num_embeddings=vocab, embedding_dim=word_size)

    def forward(self, x):
        """Return the embedding rows selected by the index tensor ``x``."""
        vectors = self.emd(x)
        return vectors

![title](20190918202641449.png)

In [12]:
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position information to a batch of embeddings.

    Args:
        word_size: width of each word vector.
        dropout: dropout probability applied to the sum of input and encoding.
        num_word: number of positions to precompute (maximum sequence length).

    The table has shape ``(1, num_word, word_size)``.  Even feature columns
    hold ``sin(pos / 10000^(i / word_size))`` and odd columns hold the
    matching ``cos``, with ``i`` running over the even column indices.
    """

    def __init__(self, word_size, dropout, num_word=2000):
        super().__init__()
        self.dropout = nn.Dropout(dropout)
        # Column vector of positions, shape (num_word, 1), so it broadcasts
        # against the per-feature divisors below.
        position = torch.arange(0, num_word, dtype=torch.float32).unsqueeze(1)
        # One divisor per sin/cos pair; the exponent ranges over the feature
        # axis (word_size), not over the positions.
        exponent = torch.arange(0, word_size, 2, dtype=torch.float32) / word_size
        div = torch.pow(10000.0, exponent)
        angles = position / div
        pe = torch.zeros(num_word, word_size)
        pe[:, 0::2] = torch.sin(angles)
        # With an odd word_size there is one fewer odd column than even ones.
        pe[:, 1::2] = torch.cos(angles[:, : word_size // 2])
        # Stored as a buffer rather than a Parameter, so it is not trained.
        self.register_buffer('pe', pe.unsqueeze(0))

    def forward(self, x):
        """Return ``dropout(x + pe[:, :x.size(1)])``.

        Only the first ``x.size(1)`` precomputed positions are used, so
        axis 1 of ``x`` must not be longer than ``num_word``.
        """
        return self.dropout(x + self.pe[:, :x.size(1)])

后面的部分就不再实现了。完整的具体实现建议去看 BERT 和 GPT 的源码，它们才是面试时经常被问到的内容：BERT 采用的是 Transformer 中的 encoder 部分，而 GPT 采用的是 decoder 部分。