In [None]:
# attention

import torch
import torch.nn as nn

import torch.nn.functional as F

d_model = 6
d_k = 4
batch_size = 2
seq_len = 3
x = torch.rand(batch_size, seq_len, d_model)

print("input shape: ", x.shape)

class SelfAttention(nn.Module):
    def __init__(self, d_model, d_k):
        super().__init__()
        self.W_q = nn.Linear(d_model, d_k)
        self.W_k = nn.Linear(d_model, d_k)
        self.W_v = nn.Linear(d_model, d_k)

    def forward(self, input):   # [B, S, d_model]
        Q = self.W_q(input)     # [B, S, d_k]
        K = self.W_k(input)     # [B, S, d_k]
        V = self.W_v(input)     # [B, S, d_k]

        scores = torch.matmul(Q, K.transpose(-2, -1)) / (Q.size(-1) ** 0.5)     # [B, S, S]
        weights = F.softmax(scores, dim=-1)
        output = torch.matmul(weights, V)

        return output, weights

attention_layer = SelfAttention(d_model, d_k)

output, weights = attention_layer(x)

print(output)



print(weights)



input shape:  torch.Size([2, 3, 6])
tensor([[[ 0.3468,  0.3706, -0.2624,  0.2491],
         [ 0.3442,  0.3662, -0.2685,  0.2541],
         [ 0.3465,  0.3706, -0.2625,  0.2492]],

        [[ 0.4045,  0.2085, -0.3606,  0.2901],
         [ 0.4062,  0.2053, -0.3626,  0.2926],
         [ 0.4031,  0.2112, -0.3585,  0.2880]]], grad_fn=<UnsafeViewBackward0>)
tensor([[[0.3342, 0.3263, 0.3395],
         [0.3281, 0.3408, 0.3310],
         [0.3327, 0.3262, 0.3411]],

        [[0.3364, 0.3301, 0.3335],
         [0.3289, 0.3408, 0.3303],
         [0.3396, 0.3216, 0.3388]]], grad_fn=<SoftmaxBackward0>)


In [None]:
# mask

import torch
import torch.nn as nn

import torch.nn.functional as F

d_model = 6
d_k = 4
batch_size = 2
seq_len = 3
x = torch.rand(batch_size, seq_len, d_model)

print("input shape: ", x.shape)

def causal_mask(seq_len):
    return torch.tril(torch.ones(seq_len, seq_len)).bool()  # [L, L]

mask = causal_mask(seq_len)  # [L, L]
print(mask)


class SelfAttention(nn.Module):
    def __init__(self, d_model, d_k):
        super().__init__()
        self.W_q = nn.Linear(d_model, d_k)
        self.W_k = nn.Linear(d_model, d_k)
        self.W_v = nn.Linear(d_model, d_k)

    def forward(self, input, attn_mask=None):   # [B, S, d_model]
        Q = self.W_q(input)     # [B, S, d_k]
        K = self.W_k(input)     # [B, S, d_k]
        V = self.W_v(input)     # [B, S, d_k]

        scores = torch.matmul(Q, K.transpose(-2, -1)) / (Q.size(-1) ** 0.5)     # [B, S, S]

        if attn_mask != None:
            scores = scores.masked_fill(attn_mask==0, float('-inf'))
            print("socres:", scores)

        weights = F.softmax(scores, dim=-1)
        output = torch.matmul(weights, V)

        return output, weights

attention_layer = SelfAttention(d_model, d_k)

output, weights = attention_layer(x, mask)

print("output: ", output)

print("weights: ", weights)



input shape:  torch.Size([2, 3, 6])
tensor([[ True, False, False],
        [ True,  True, False],
        [ True,  True,  True]])
socres: tensor([[[0.0288,   -inf,   -inf],
         [0.3989, 0.3232,   -inf],
         [0.1370, 0.1270, 0.1868]],

        [[0.4134,   -inf,   -inf],
         [0.1390, 0.0844,   -inf],
         [0.1108, 0.0873, 0.0983]]], grad_fn=<MaskedFillBackward0>)
tensor([[[-0.2365, -0.4558, -0.2359, -0.0013],
         [-0.2038, -0.4263, -0.2420, -0.1248],
         [-0.1586, -0.4470, -0.2827, -0.1519]],

        [[-0.3216, -0.5624, -0.3853, -0.1773],
         [-0.2795, -0.5421, -0.4857, -0.2161],
         [-0.2835, -0.5046, -0.4398, -0.1511]]], grad_fn=<UnsafeViewBackward0>)
tensor([[[1.0000, 0.0000, 0.0000],
         [0.5189, 0.4811, 0.0000],
         [0.3288, 0.3256, 0.3456]],

        [[1.0000, 0.0000, 0.0000],
         [0.5137, 0.4863, 0.0000],
         [0.3373, 0.3295, 0.3332]]], grad_fn=<SoftmaxBackward0>)


In [None]:
# Multi Head Attention

class MultiHeadAttention(nn.Module):
    def __init__(self, d_model, num_heads, dropout=0.1):
        super().__init__()
        assert d_model % num_heads == 0, "d_model must be divisible by num_head"
        self.num_heads = num_heads
        self.d_k = d_model // num_heads
        self.d_model = d_model

        self.W_q = nn.Linear(d_model, d_model)
        self.W_k = nn.Linear(d_model, d_model)
        self.W_v = nn.Linear(d_model, d_model)
        self.W_o = nn.Linear(d_model, d_model)

        self.dropout = nn.Dropout(dropout)

    def forward(self, input, attn_mask=None):
        B, S, E = input.size()
        Q = self.W_q(input).view(B, S, self.num_heads, self.d_k).transpose(1, 2) # [B, S, E] -> [B, S, n, d_k] -> [B, n, S, d_k]
        K = self.W_k(input).view(B, S, self.num_heads, self.d_k).transpose(1, 2)
        V = self.W_v(input).view(B, S, self.num_heads, self.d_k).transpose(1, 2)

        scores = torch.matmul(Q, K.transpose(-2,-1)) / (self.d_k ** 0.5)    # [B, n, S, S]

        if attn_mask is not None:
            scores = scores.masked_fill(attn_mask==0, float('-inf'))
        
        weights = F.softmax(scores, dim=-1)

        output = torch.matmul(weights, V)   # [B, n, S, d_k]

        output = output.transpose(1,2).contiguous().view(B, S, self.d_model) # [B, S, d_model]

        output = self.dropout(self.W_o(output))
    

        return output, weights


class FeedForward(nn.Module):
    def __init__(self, d_model, d_ff, dropout=0.1):
        super().__init__()

        self.ff = nn.Sequential(
            nn.Linear(d_model, d_ff),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(d_ff, d_model)
        )
    
    def forward(self, input):
        output = self.ff(input)

        return output
    
class TransformerBlock(nn.Module):
    def __init__(self, d_model, d_ff, n_heads, dropout=0.1):
        super().__init__()
        self.d_model = d_model
        self.n_heads = n_heads
        self.attention_layer = MultiHeadAttention(d_model, n_heads, dropout)
        self.ff = FeedForward(d_model, d_ff)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)

        self.dropout_1 = nn.Dropout(dropout)
        self.dropout_2 = nn.Dropout(dropout)

    def forward(self, x, attn_mask):   # [B, S, E]
        attn_output, _ = self.attention_layer(x, attn_mask)   # [B, S, E]
        x = self.norm1(x + self.dropout_1(attn_output))

        ff_output = self.ff(x)
        x = self.norm2(x + self.dropout_2(ff_output))

        return x



In [8]:
import torch
import torch.nn as nn

import torch.nn.functional as F

d_model = 6
num_heads = 2
d_k = 4
batch_size = 2
seq_len = 3
x = torch.rand(batch_size, seq_len, d_model)

print("input shape: ", x.shape)

def causal_mask(seq_len):
    return torch.tril(torch.ones(seq_len, seq_len)).bool()  # [L, L]

mask = causal_mask(seq_len)  # [L, L]
print(mask)

multi_attention_layer = MultiHeadAttention(d_model, num_heads)

output, weights = multi_attention_layer(x, mask)

print("output: ", output)

print("weights: ", weights)

input shape:  torch.Size([2, 3, 6])
tensor([[ True, False, False],
        [ True,  True, False],
        [ True,  True,  True]])
output:  tensor([[[-0.1975, -0.0850, -0.1854,  0.1128,  0.1626,  0.0000],
         [-0.1770, -0.1456, -0.3501,  0.0945,  0.1552,  0.7728],
         [-0.1121, -0.1763, -0.4046,  0.1056,  0.1187,  0.8214]],

        [[-0.0216, -0.1186, -0.6042,  0.1005,  0.1399,  0.9678],
         [-0.0302, -0.1038, -0.4704,  0.1165,  0.1151,  0.8514],
         [-0.0285, -0.0904, -0.5128,  0.1219,  0.1011,  0.8995]]],
       grad_fn=<MulBackward0>)
weights:  tensor([[[[1.0000, 0.0000, 0.0000],
          [0.4953, 0.5047, 0.0000],
          [0.3346, 0.3354, 0.3300]],

         [[1.0000, 0.0000, 0.0000],
          [0.4939, 0.5061, 0.0000],
          [0.3271, 0.3307, 0.3422]]],


        [[[1.0000, 0.0000, 0.0000],
          [0.4991, 0.5009, 0.0000],
          [0.3393, 0.3337, 0.3270]],

         [[1.0000, 0.0000, 0.0000],
          [0.4998, 0.5002, 0.0000],
          [0.3451, 0.3

In [None]:
import torch
import torch.nn
import torch.nn.functional as F

class MultiHeadAttention(nn.Module):
    def __init__(self, d_model, n_heads):
        super().__init__()
        self.d_model = d_model
        self.n_heads = n_heads
        self.d_k = d_model//n_heads

        assert d_model % n_heads == 0, "d_model must be divisible by n_heads."

        self.W_q = nn.Linear(d_model, d_model)
        self.W_k = nn.Linear(d_model, d_model)
        self.W_v = nn.Linear(d_model, d_model)

        self.W_o = nn.Linear(d_model, d_model)

    def forward(self, x, attn_mask=None):
        B, S, E = x.shape # [B, S, d_model]

        Q = self.W_q(x).view(B, S, self.n_heads, self.d_k).transpose(1,2)   # [B, S, d_model] -> [B, S, n_heads, d_k] -> [B, n_heads, S, d_k]
        K = self.W_k(x).view(B, S, self.n_heads, self.d_k).transpose(1,2)
        V = self.W_v(x).view(B, S, self.n_heads, self.d_k).transpose(1,2)

        scores = torch.matmul(Q, K.transpose(-2,-1)) / (self.d_k**0.5)  # [B, n_heads, S, d_k] * [B, n_heads, d_k, S] -> [B, n_heads, S, S]

        if attn_mask is not None:
            scores = scores.masked_fill(attn_mask == 0, float('-inf'))
            
        att_weights = F.softmax(scores, dim=-1)

        output = torch.matmul(att_weights, V)   # [B, n_heads, S, S] * [B, n_heads, S, d_k] -> [B, n_heads, S, d_k]
        output = output.transpose(1,2).contiguous() # [B, n_heads, S, d_k] -> [B, S, n_heads, d_k] contiguous的作用是让output再内存上连续，接下来可以用view
        output = output.view(B, S, E)   # [B, S, n_heads, d_k] -> [B, S, d_model]

        output = self.W_o(output)

        return output, weights
    
class FeedForward(nn.Module):
    def __init__(self, d_model, d_ff, dropout=0.1):
        super().__init__()

        self.ff = nn.Sequential(
            nn.Linear(d_model, d_ff),
            

        )
    

batch_size = 2
seq_len = 3
d_model = 4
n_heads = 2
x = torch.randn(batch_size, seq_len, d_model)   # 
print(x)
causal_mask = torch.tril(torch.ones(seq_len, seq_len)).bool()
print("causal mask: ", causal_mask)
mha_layer = MultiHeadAttention(d_model=d_model, n_heads=n_heads)

output, attn_score = mha_layer(x, causal_mask)
print(output)

print(attn_score)


tensor([[[-1.2497, -0.5584,  0.6232, -1.8979],
         [-0.9936, -1.4758,  0.3628,  1.2568],
         [-1.2396, -0.7735, -0.2151, -1.6438]],

        [[ 0.6813,  0.9366, -1.1800,  0.3557],
         [-1.3757, -1.8015,  1.0607, -0.7222],
         [-0.0020,  0.0931, -1.1953, -0.5659]]])
causal mask:  tensor([[ True, False, False],
        [ True,  True, False],
        [ True,  True,  True]])
tensor([[[-0.0836,  0.0493,  0.7319,  0.0353],
         [-0.4484, -0.5112,  0.6098, -0.1604],
         [-0.2701, -0.2868,  0.6053, -0.0994]],

        [[ 0.4637, -0.2665, -0.8256, -0.4612],
         [ 0.0328, -0.3005, -0.0077, -0.2496],
         [-0.2698, -0.3900,  0.3720, -0.0704]]], grad_fn=<ViewBackward0>)
tensor([[[[1.0000, 0.0000, 0.0000],
          [0.5217, 0.4783, 0.0000],
          [0.3498, 0.3204, 0.3298]],

         [[1.0000, 0.0000, 0.0000],
          [0.4930, 0.5070, 0.0000],
          [0.3249, 0.3154, 0.3596]]],


        [[[1.0000, 0.0000, 0.0000],
          [0.4969, 0.5031, 0.0000],
 