In [1]:
# Tokenization: split the sentence on whitespace.

def build_vocab(tokens):
    """Build token<->id lookup tables from a token sequence.

    Ids are assigned to *unique* tokens in first-seen order, so the ids are
    contiguous (0 .. vocab_size - 1) even when a token occurs more than once.
    This keeps ``len(str2idx)`` usable as the size of an embedding table.

    Args:
        tokens: iterable of hashable tokens (here: strings).

    Returns:
        (str2idx, idx2str): token -> id dict and id -> token dict.
    """
    # dict.fromkeys de-duplicates while preserving insertion order.
    unique_tokens = dict.fromkeys(tokens)
    token_to_id = {token: idx for idx, token in enumerate(unique_tokens)}
    id_to_token = {idx: token for token, idx in token_to_id.items()}
    return token_to_id, id_to_token


input_text = "나는 최근 파리 여행을 다녀왔다"
input_text_list = input_text.split()
print("input_text_list: ", input_text_list)

# Token -> id dictionary and id -> token dictionary.
str2idx, idx2str = build_vocab(input_text_list)
print(f"str2idx: {str2idx}")
print(f"idx2str: {idx2str}")

# Convert each token to its token id.
input_ids = [str2idx[word] for word in input_text_list]
print(input_ids)

input_text_list:  ['나는', '최근', '파리', '여행을', '다녀왔다']
str2idx: {'나는': 0, '최근': 1, '파리': 2, '여행을': 3, '다녀왔다': 4}
idx2str: {0: '나는', 1: '최근', 2: '파리', 3: '여행을', 4: '다녀왔다'}
[0, 1, 2, 3, 4]


In [2]:
import torch
import torch.nn as nn

# Sizes of the embedding vectors and of the learned position table.
embedding_dim = 16
max_position = 12

# Token table first, position table second: the creation order is kept so the
# random initial weights are drawn in the same sequence.
embed_layer = nn.Embedding(num_embeddings=len(str2idx), embedding_dim=embedding_dim)
position_embed_layer = nn.Embedding(num_embeddings=max_position, embedding_dim=embedding_dim)

# Position ids 0..len-1 with a leading batch axis added by unsqueeze(0).
token_id_tensor = torch.tensor(input_ids)
position_ids = torch.arange(len(input_ids), dtype=torch.long).unsqueeze(0)
print(position_ids.shape, token_id_tensor.shape)

# Look up both embeddings; the token embeddings get the same leading batch
# axis so the two tensors can be added element-wise.
position_encodings = position_embed_layer(position_ids)
token_embeddings = embed_layer(token_id_tensor).unsqueeze(0)
input_embeddings = token_embeddings + position_encodings
input_embeddings.shape

torch.Size([1, 5]) torch.Size([5])


torch.Size([1, 5, 16])

In [3]:
from math import sqrt
import torch.nn.functional as F

def compute_attention(querys, keys, values, is_casual=False):
    """Scaled dot-product attention.

    The last two axes of each input are treated as (sequence, feature); any
    leading axes are batch-like and handled by matmul broadcasting.

    Args:
        querys: tensor whose last two axes are (query_len, dim_k).
        keys: tensor whose last two axes are (key_len, dim_k).
        values: tensor whose last two axes are (key_len, dim_v).
        is_casual: when True, apply a causal (lower-triangular) mask so that
            query position i only attends to key positions j <= i. The
            parameter name (sic) is kept for backward compatibility.

    Returns:
        Tensor whose last two axes are (query_len, dim_v).
    """
    dim_k = querys.size(-1)
    scores = querys @ keys.transpose(-2, -1) / sqrt(dim_k)
    if is_casual:
        query_length, key_length = scores.size(-2), scores.size(-1)
        # Lower-triangular "allowed" mask; every row keeps at least column 0,
        # so softmax never sees a row made entirely of -inf.
        allowed = torch.ones(
            query_length, key_length, dtype=torch.bool, device=scores.device
        ).tril()
        scores = scores.masked_fill(~allowed, float("-inf"))
    weights = F.softmax(scores, dim=-1)
    return weights @ values

In [4]:
import torch
import torch.nn as nn
class AttentionHead(nn.Module):
    """A single attention head: linear q/k/v projections followed by attention.

    Args:
        token_embed_dim: feature size of the incoming embeddings.
        head_dim: feature size of the projected queries, keys and values.
        is_casual: flag stored as ``self.is_causal`` and forwarded to
            ``compute_attention`` on every call.
    """

    def __init__(self, token_embed_dim, head_dim, is_casual=False):
        super().__init__()
        self.is_causal = is_casual
        # Projections are created in q, k, v order.
        self.weight_q = nn.Linear(token_embed_dim, head_dim)
        self.weight_k = nn.Linear(token_embed_dim, head_dim)
        self.weight_v = nn.Linear(token_embed_dim, head_dim)

    def forward(self, querys, keys, values):
        """Project the three inputs and return the attention output."""
        projected_q = self.weight_q(querys)
        projected_k = self.weight_k(keys)
        projected_v = self.weight_v(values)
        return compute_attention(
            projected_q, projected_k, projected_v, is_casual=self.is_causal
        )

In [5]:
# Run one self-attention head over the input embeddings: the same tensor is
# passed as queries, keys and values.
embedding_dim = 16
head_dim = 16

attention_head = AttentionHead(token_embed_dim=embedding_dim, head_dim=head_dim)
after_attention_embeddings = attention_head(
    querys=input_embeddings,
    keys=input_embeddings,
    values=input_embeddings,
)

In [6]:
class MultiheadAttention(nn.Module):
    """Multi-head attention with a shared q/k/v projection split across heads.

    Args:
        token_embed_dim: feature size of the incoming embeddings.
        d_model: total projected feature size; split evenly across heads.
        n_head: number of attention heads.
        is_causal: flag forwarded to ``compute_attention``.

    Raises:
        ValueError: if ``d_model`` is not divisible by ``n_head``.
    """

    def __init__(self, token_embed_dim, d_model, n_head, is_causal=False):
        super().__init__()
        if d_model % n_head != 0:
            raise ValueError(
                f"d_model ({d_model}) must be divisible by n_head ({n_head})"
            )
        self.n_head = n_head
        self.d_model = d_model
        self.head_dim = d_model // n_head
        self.is_causal = is_causal
        self.weight_q = nn.Linear(token_embed_dim, d_model)
        self.weight_k = nn.Linear(token_embed_dim, d_model)
        self.weight_v = nn.Linear(token_embed_dim, d_model)
        self.concat_linear = nn.Linear(d_model, d_model)

    def _split_heads(self, projected):
        """Reshape (B, L, d_model) -> (B, n_head, L, head_dim)."""
        batch_size, length, _ = projected.size()
        return projected.view(
            batch_size, length, self.n_head, self.head_dim
        ).transpose(1, 2)

    def forward(self, querys, keys, values):
        """Attend from ``querys`` to ``keys``/``values``.

        Inputs are 3-D (batch, length, token_embed_dim); keys and values may
        have a different length than querys. Returns (batch, query_len, d_model).
        """
        batch_size, query_len, _ = querys.size()
        # The head split is derived from the projected width (d_model), not
        # from the input width, and each tensor keeps its own sequence length.
        querys = self._split_heads(self.weight_q(querys))
        keys = self._split_heads(self.weight_k(keys))
        values = self._split_heads(self.weight_v(values))

        attention = compute_attention(querys, keys, values, self.is_causal)
        # Merge heads back: (B, n_head, T, head_dim) -> (B, T, d_model).
        output = attention.transpose(1, 2).contiguous().view(
            batch_size, query_len, self.d_model
        )
        output = self.concat_linear(output)
        return output

# Multi-head self-attention over the input embeddings; the model width equals
# the embedding width here.
n_head = 4
mh_attention = MultiheadAttention(
    token_embed_dim=embedding_dim,
    d_model=embedding_dim,
    n_head=n_head,
)
after_attention_embeddings = mh_attention(
    querys=input_embeddings,
    keys=input_embeddings,
    values=input_embeddings,
)
after_attention_embeddings.shape

torch.Size([1, 5, 16])

In [7]:
# Apply layer normalization over the last (embedding) axis and inspect the
# resulting shape, per-token mean and per-token standard deviation.
norm = nn.LayerNorm(embedding_dim)
norm_x = norm(input_embeddings)

# Note: Tensor.std defaults to the unbiased estimator, so the printed value is
# slightly above 1 even though the normalized variance is 1.
for statistic in (norm_x.shape, norm_x.mean(dim=-1), norm_x.std(dim=-1)):
    print(statistic)

torch.Size([1, 5, 16])
tensor([[0.0000e+00, 3.7253e-09, 0.0000e+00, 7.4506e-09, 1.8626e-08]],
       grad_fn=<MeanBackward1>)
tensor([[1.0328, 1.0328, 1.0328, 1.0328, 1.0328]], grad_fn=<StdBackward0>)


In [8]:
class PreLayerNormFeedForward(nn.Module):
    """Pre-LayerNorm position-wise feed-forward sublayer with a residual.

    Computes ``src + dropout2(linear2(dropout1(gelu(linear1(norm(src))))))``.

    Args:
        d_model: feature size of the input and of the output.
        dim_feedforward: hidden size of the intermediate projection.
        dropout: dropout probability used after the activation and on the
            sublayer output.
    """

    def __init__(self, d_model, dim_feedforward, dropout):
        super().__init__()
        self.linear1 = nn.Linear(d_model, dim_feedforward)
        # Project back down to d_model so the residual addition is valid for
        # any dim_feedforward.
        self.linear2 = nn.Linear(dim_feedforward, d_model)
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)
        self.activation = nn.GELU()
        self.norm = nn.LayerNorm(d_model)

    def forward(self, src):
        """Return ``src`` plus the feed-forward transform of its normalized form."""
        x = self.norm(src)
        x = self.linear2(self.dropout1(self.activation(self.linear1(x))))
        # The residual uses the un-normalized input, and dropout is applied to
        # the sublayer output only, never to the skip path.
        return src + self.dropout2(x)

In [9]:
class TransformerEncoderLayer(nn.Module):
    """One encoder layer: normalized self-attention with a residual, then the
    feed-forward sublayer.

    Args:
        d_model: feature size of the layer input and output.
        n_head: number of attention heads.
        dim_feedforward: hidden size passed to the feed-forward sublayer.
        dropout: dropout probability for the attention output and the
            feed-forward sublayer.
    """

    def __init__(self, d_model, n_head, dim_feedforward, dropout):
        super().__init__()
        self.attention = MultiheadAttention(d_model, d_model, n_head)
        self.norm1 = nn.LayerNorm(d_model)
        self.dropout1 = nn.Dropout(dropout)
        self.feed_forward = PreLayerNormFeedForward(d_model, dim_feedforward, dropout)

    def forward(self, src):
        """Run self-attention on the normalized input, add it back to ``src``,
        and pass the sum through the feed-forward sublayer."""
        normalized = self.norm1(src)
        attended = self.attention(normalized, normalized, normalized)
        with_residual = src + self.dropout1(attended)
        return self.feed_forward(with_residual)

In [None]:
import copy
def get_clones(module, N):
    """Return an ``nn.ModuleList`` holding ``N`` independent deep copies of ``module``."""
    clones = nn.ModuleList()
    for _ in range(N):
        clones.append(copy.deepcopy(module))
    return clones

class TransformerEncoder(nn.Module):
    """A stack of ``num_layers`` independent copies of ``encoder_layer``.

    Args:
        encoder_layer: module to be deep-copied for each layer of the stack.
        num_layers: number of copies to stack.
        norm: optional module applied to the output of the last layer (for
            example a final ``nn.LayerNorm``). ``None`` (default) applies
            nothing.
    """

    def __init__(self, encoder_layer, num_layers, norm=None):
        super().__init__()
        self.layers = get_clones(encoder_layer, num_layers)
        self.num_layers = num_layers
        # Previously this attribute held the integer num_layers; it now holds
        # the optional final normalization module (or None).
        self.norm = norm

    def forward(self, src):
        """Feed ``src`` through every layer in order, then the optional norm."""
        output = src
        for mod in self.layers:
            output = mod(output)
        if self.norm is not None:
            output = self.norm(output)
        return output