In [71]:
import torch 
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
class SingleScaledDotAttention():
    def __init__(self, model_dim, ffn_dim):
        self.linear_q = nn.Linear(model_dim, ffn_dim)
        self.linear_k = nn.Linear(model_dim, ffn_dim)
        self.linear_v = nn.Linear(model_dim, ffn_dim)
    def forward(query, key, value, leng, scale):
        query = self.linear_q(query)
        key = self.linear_k(key)
        value = self.linear_v(value)
        attention = torch.bmm(query, key.transpose(1,2))
        if scale:
            scale = self.key.size(-1) ** -0.5
            attention = torch.bmm(attention, scale)
        mask = torch.zeros(attention.size(1), attention.size(-1))
        for id_, len_ in enumerate(leng):
            mask[id_][len_:] = 1 ###
        attention.masked_fill_(mask.byte(), float('-inf'))
        attention = nn.Softmax(attention, dim = 2)
        context = torch.bmm(attention, value)
        return attention, context
        
class scaledDotAttention(nn.Module):
    def __init__(self, attentin_dropout=0.0):
        super(scaledDotAttention, self).__init__()
        self.dropout = nn.Dropout(attentin_dropout)
        self.softmax = nn.Softmax(dim=2)
    def forward(self, query, key, value, attn_mask, scale):
        attention = torch.bmm(query, key.transpose(1,2))
#         if scale id not None:
        print(attention.shape)
        print("attn_mask:", attn_mask)
        attention = attention * scale #.cuda()
        print(attention.shape)
#         when use leng replace attn_mask:
#         mask = torch.zeros(attention.size(0), attention.size(1), attention.size(-1))
#         print(leng)
#         for e_id, src_len in enumerate(leng):
#             mask[e_id][src_len:][:] = 1
#             for row in range(mask.size(1)):
#                 mask[e_id][row][src_len:] = 1
#         print("mask:", mask)
#         if attn_mask:
        attention = attention.masked_fill_(attn_mask.byte(), float('-inf'))
        print("attention:::", attention)
        attention = self.softmax(attention)
        print(attention)
        attention = self.dropout(attention)
#         print(attention)
        context = torch.bmm(attention, value)
        return attention, context
        
            
        
class pointwiseFeedForward(nn.Module):
    def __init__(self, model_dim=512, ffn_dim=2048, dropout=0.0):
        super(pointwiseFeedForward, self).__init__()
        self.W1 = nn.Conv1d(model_dim, ffn_dim, 1)
#         self.W2 = nn.Conv1d(model_dim, ffn_dim, 1)
        self.W2 = nn.Conv1d(ffn_dim, model_dim, 1)
        self.dropout = nn.Dropout(dropout)
        self.layer_norm = nn.LayerNorm(model_dim)
    def forward(self, X):
        #[5,6,512]
        print(X.shape)
        output = F.relu(self.W1(X.transpose(1,2)))
        output = self.W2(output)
#         output = self.dropout(output.transpose(1,2))
        output = output.transpose(1,2)
        #add residual and norm layer
        output = self.layer_norm(X + output)
        print("sp:", output.shape)
        return output
    
class MultiHeadAttention(nn.Module):
    def __init__(self, model_dim=512, num_heads=8, dropout=0.0):
        super(MultiHeadAttention, self).__init__()
        self.dropout = nn.Dropout(dropout)
        self.dim_per_head = model_dim // num_heads
        self.num_heads = num_heads
        print(model_dim, self.dim_per_head)
        self.linear_q = nn.Linear(model_dim, self.dim_per_head * num_heads)#avoid float perheadDim
        self.linear_k = nn.Linear(model_dim, self.dim_per_head * num_heads)#avoid float perheadDim
        self.linear_v = nn.Linear(model_dim, self.dim_per_head * num_heads)#avoid float perheadDim
        self.dot_product_attention = scaledDotAttention(dropout)
        self.linear_final = nn.Linear(model_dim, model_dim)
        self.layer_norm = nn.LayerNorm(model_dim)
    def forward(self, query, key, value, attn_mask=None):
        residual = query
        dim_per_head = self.dim_per_head
        num_heads = self.num_heads
        batch_size = query.size(0)
        query = self.linear_q(query)
        key = self.linear_q(key)
        value = self.linear_q(value)
        
        #split by heads 
        query = query.view(batch_size * num_heads, -1, dim_per_head)
        key = query.view(batch_size * num_heads, -1, dim_per_head)
        value = query.view(batch_size * num_heads, -1, dim_per_head)
        
#         if attn_mask:
        attn_mask = attn_mask.repeat(num_heads, 1, 1)
        scale = key.size(-1) ** -0.5###diff?
        attention, context = self.dot_product_attention(query, key, value, attn_mask, scale)
        
        #concat heads
        context = context.view(batch_size, -1, dim_per_head*num_heads)
        output = self.linear_final(context)
        
        output = self.dropout(output)
        output = self.layer_norm(residual + output)
#         print(residual.shape, output.shape)
        return attention, context
    
class EncoderLayer(nn.Module):
    def __init__(self, model_dim=512, num_heads=8, ffn_dim=2048, dropout=0.0 ):
        super(EncoderLayer, self).__init__()
        self.attention = MultiHeadAttention(model_dim, num_heads, dropout)
        self.feed_forward = pointwiseFeedForward(model_dim, ffn_dim, dropout)
    def forward(self, inputs, attn_mask=None):
        leng = torch.tensor([6,5,4,3,2,1])
        attention, context = self.attention(inputs, inputs, inputs, attn_mask)
        output = self.feed_forward(context)
        return output, attention
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_seq_len):
        super(PositionalEncoding, self).__init__()
        position_encoding = np.array([
            [pos / np.power(10000, 2.0*(j // 2) / d_model) for j in range(d_model)]
            for pos in range(max_seq_len)
            ])
        position_encoding[:, 0::2] = np.sin(position_encoding[:, 0::2])
        position_encoding[:, 1::2] = np.cos(position_encoding[:, 1::2])
        position_encoding = torch.Tensor(position_encoding)
        
        pad_row = torch.zeros([1, d_model])
        position_encoding = torch.cat((pad_row, position_encoding))
        
#         print("***))", position_encoding.shape)
        self.position_encoding = nn.Embedding(max_seq_len+1, d_model)
        self.position_encoding.weight = nn.Parameter(position_encoding, requires_grad=False)

    def forward(self, input_len):
        print("***input_len:", input_len)
        #[bs, single_len_perSeq]
        max_len = torch.max(input_len).item()
        print(max_len)
#         tensor = 
        input_pos = torch.tensor([list(range(1, len.item()+1)) + [0]*(max_len-len.item()) for len in input_len])
        
        print("***input_pos:", input_pos)
        return self.position_encoding(input_pos)
class Encoder(nn.Module):
    def __init__(self, 
                 vocab_size=31,
                 max_seq_len=3,
                 num_layers=6,
                 model_dim=512,
                 num_heads=8,
                 ffn_dim=2048,
                 dropout=0.0
                ):
        super(Encoder, self).__init__()
        self.encoder_layers = nn.ModuleList(
            [EncoderLayer(model_dim, num_heads, ffn_dim, dropout) for _ in range(num_layers)])
        
        self.seq_embedding = nn.Embedding(vocab_size+1, model_dim, padding_idx=0)
        
        self.pos_embedding = PositionalEncoding(model_dim, max_seq_len)
#         self.pos_embedding = PositionalEncoding(model_dim, max_seq_len)

    def forward(self, inputs, intpus_len):
        def padding_mask(inputs, inputb):
            print("inputs:", inputs)
            pad_mask = inputs.eq(0)
            print("pad_mask:", pad_mask)
            len_q = inputb.size(1)
            pad_mask = pad_mask.unsqueeze(1).expand(-1, len_q, -1)
            print("pad_mask:", pad_mask)
            return pad_mask
        output = self.seq_embedding(inputs)
        output += self.pos_embedding(inputs_len)
#         print("X:", inputs)
        self_attention_mask = padding_mask(inputs, inputs)
        attentions = []
        for encoder in self.encoder_layers:
            output, attention = encoder(output, self_attention_mask)
            attentions.append(attention)
        return output, attentions
        
        

In [72]:
X = torch.rand(5,6,512)
encode1 = EncoderLayer() #需要实例方法化（）
output = encode1(X)
# print("X:", X)
# print(output)
# print(X.type)

512 64


AttributeError: 'NoneType' object has no attribute 'repeat'

In [73]:
X = torch.rand(5,6,512)
vocab = {"some": 1, "words": 2, "pad":0}
word_index = torch.Tensor([[vocab[w] for w in ["some", "words", "pad"]]for _ in range(2)])
input = word_index.to(torch.int64)# word2index = [X[w] for w in ["some", "words"]]

# X = torch.tensor(X).to(torch.float64)
encoder = Encoder()
# print("X:", X)
print("input:", input.shape)
inputs_len = torch.tensor([[len(seq)] for seq in input])
print("inputs_len:", inputs_len.shape)
output = encoder(input, inputs_len) #实例调用encoder()(...)
print("output----:", output)

512 64
512 64
512 64
512 64
512 64
512 64
input: torch.Size([2, 3])
inputs_len: torch.Size([2, 1])
***input_len: tensor([[3],
        [3]])
3
***input_pos: tensor([[1, 2, 3],
        [1, 2, 3]])
inputs: tensor([[1, 2, 0],
        [1, 2, 0]])
pad_mask: tensor([[0, 0, 1],
        [0, 0, 1]], dtype=torch.uint8)
pad_mask: tensor([[[0, 0, 1],
         [0, 0, 1],
         [0, 0, 1]],

        [[0, 0, 1],
         [0, 0, 1],
         [0, 0, 1]]], dtype=torch.uint8)
torch.Size([16, 3, 3])
attn_mask: tensor([[[0, 0, 1],
         [0, 0, 1],
         [0, 0, 1]],

        [[0, 0, 1],
         [0, 0, 1],
         [0, 0, 1]],

        [[0, 0, 1],
         [0, 0, 1],
         [0, 0, 1]],

        [[0, 0, 1],
         [0, 0, 1],
         [0, 0, 1]],

        [[0, 0, 1],
         [0, 0, 1],
         [0, 0, 1]],

        [[0, 0, 1],
         [0, 0, 1],
         [0, 0, 1]],

        [[0, 0, 1],
         [0, 0, 1],
         [0, 0, 1]],

        [[0, 0, 1],
         [0, 0, 1],
         [0, 0, 1]],

       