In [1]:
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [2]:
# 3.1 Transformer Architecture

### 인코더 유형
- BERT, RoBerta, DistilBERT
- text sequence 입력을 수치표현으로 변환

### 디코더 유형
- GPT 계열의 생성모델이 이에 해당
- 왼쪽 문맥에 따라 다음 토큰을 순차적으로 계산

### 인코더-디코더 유형
- 한 텍스트 시퀀스를 다른 시퀀스로 매핑함.
- BART, T5 모델이 이에 해당.

In [3]:
#!pip install bertviz
from transformers import AutoTokenizer
from bertviz.transformers_neuron_view import BertModel
from bertviz.neuron_view import show

Collecting bertviz
  Downloading bertviz-1.4.0-py3-none-any.whl (157 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/157.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m30.7/157.6 kB[0m [31m1.8 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━[0m [32m92.2/157.6 kB[0m [31m1.4 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m157.6/157.6 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
Collecting boto3 (from bertviz)
  Downloading boto3-1.34.74-py3-none-any.whl (139 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m139.3/139.3 kB[0m [31m6.7 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.0->bertviz)
  Downloading nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [15]:
model_ckpt = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
model = BertModel.from_pretrained(model_ckpt)
text = "time flies like an arrow"
show(model, "bert", tokenizer, text, display_mode="light", layer=0, head=8)

Output hidden; open in https://colab.research.google.com to view.

In [16]:
# Query = (seq_len, embedding_dim)
# Key = (embedding_dim, seq_len)
# qk = matmul(Query, Key) (seq_len, seq_len)
# softmax(qk / scaling_factor, axis=1), scaling_factor is broadcasted. (seq_len, seq_len)

In [17]:
"""
pytorch nn function

nn.Linear,
nn.Module,
nn.Dropout,
nn.LayerNorm,
nn.Embedding
nn.GELU (Gaussian Error Linear Unit)
nn.bmm (batch matrix multiplication)
model.forward() -> pytorch forward method
"""

'\npytorch nn function\n\nnn.Linear,\nnn.Module,\nnn.Dropout,\nnn.LayerNorm,\nnn.Embedding\nnn.GELU (Gaussian Error Linear Unit)\nnn.bmm (batch matrix multiplication)\nmodel.forward() -> pytorch forward method\n'

# 3.2 Encoder

In [122]:
inputs = tokenizer(text, padding=True, max_length=10, return_tensors="pt", truncation=True)
config.max_length = 10

In [123]:
def get_padded_sequence(ids, max_length):
    batch_size, seq_len = ids.shape
    padded_seq_ids = torch.zeros(size=(1, max_length), dtype=torch.long)
    padded_seq_ids[0, :seq_len] = ids
    return padded_seq_ids

In [50]:
import torch.nn as nn
from transformers import AutoConfig

config = AutoConfig.from_pretrained(model_ckpt) # pretrained 임베딩 로드
token_emb = nn.Embedding(config.vocab_size, config.hidden_size) # embedding layer를 거쳐서 embedding으로 변환
print(token_emb) # (vocab_size, embedding_dim)

Embedding(30522, 768)


In [126]:
input_embeds = token_emb(inputs.input_ids)
input_embeds.size() # (batch_size, seq_size, embedding_dim)

torch.Size([1, 10, 768])

In [52]:
# calculating normalizaing factor
import torch
from math import sqrt

query = key = value = input_embeds # query == key == value in encoder-self-attnetion case
dim_k = key.size(-1) # == embedding_dim
scores = torch.bmm(query, key.transpose(1, 2)) / sqrt(dim_k)
# (batch_size, seq_size, embedding_dim) * (batch_size, embedding_dim, seq_size) -> (batch_size, seq_size, seq_size)

print(scores.size())
print(scores)

torch.Size([1, 7, 7])
tensor([[[ 2.5603e+01,  6.4121e-01,  3.3966e-01, -1.5425e+00, -1.5452e+00,
           5.8863e-02,  2.9096e-01],
         [ 6.4121e-01,  2.7255e+01, -2.0742e-01,  1.2523e+00, -1.4954e+00,
          -3.6402e-01, -1.3188e-02],
         [ 3.3966e-01, -2.0742e-01,  2.5836e+01,  3.9427e-02,  4.5119e-01,
           4.7003e-01,  1.4218e+00],
         [-1.5425e+00,  1.2523e+00,  3.9427e-02,  2.6389e+01, -3.1081e-02,
          -5.1921e-01,  8.0605e-01],
         [-1.5452e+00, -1.4954e+00,  4.5119e-01, -3.1081e-02,  2.9287e+01,
          -5.8324e-01,  5.8137e-01],
         [ 5.8863e-02, -3.6402e-01,  4.7003e-01, -5.1921e-01, -5.8324e-01,
           2.6304e+01,  1.1723e+00],
         [ 2.9096e-01, -1.3187e-02,  1.4218e+00,  8.0605e-01,  5.8137e-01,
           1.1723e+00,  2.6959e+01]]], grad_fn=<DivBackward0>)


In [53]:
import torch.nn.functional as F

weights = F.softmax(scores, dim=-1)
print(weights.sum(dim=-1))
print(weights)

tensor([[1., 1., 1., 1., 1., 1., 1.]], grad_fn=<SumBackward1>)
tensor([[[1.0000e+00, 1.4431e-11, 1.0674e-11, 1.6252e-12, 1.6209e-12,
          8.0608e-12, 1.0167e-11],
         [2.7653e-12, 1.0000e+00, 1.1835e-12, 5.0947e-12, 3.2645e-13,
          1.0120e-12, 1.4372e-12],
         [8.4566e-12, 4.8933e-12, 1.0000e+00, 6.2634e-12, 9.4545e-12,
          9.6342e-12, 2.4954e-11],
         [7.4076e-13, 1.2119e-11, 3.6034e-12, 1.0000e+00, 3.3581e-12,
          2.0611e-12, 7.7563e-12],
         [4.0713e-14, 4.2792e-14, 2.9974e-13, 1.8505e-13, 1.0000e+00,
          1.0654e-13, 3.4142e-13],
         [3.9973e-12, 2.6189e-12, 6.0303e-12, 2.2424e-12, 2.1033e-12,
          1.0000e+00, 1.2171e-11],
         [2.6199e-12, 1.9328e-12, 8.1167e-12, 4.3852e-12, 3.5027e-12,
          6.3248e-12, 1.0000e+00]]], grad_fn=<SoftmaxBackward0>)


In [54]:
# 이어텐션의 문제점. -> 같은 embedding vector의 부호가 같기 때문에 softmax 값이 1이 나오게 되고 attention을 진행하는 의미가 없음.

In [192]:
def scaled_dot_product_attention(query, key, value, mask=None, pad_mask=None):
  dim_k = query.size(-1)
  scores = torch.bmm(query, key.transpose(1, 2)) / sqrt(dim_k)
  if mask is not None:
    scores = scores.masked_fill(mask==0, float(-1e10))
  if pad_mask is not None:
    scores = scores.masked_fill(pad_mask==0, float(-1e10))
  weights = torch.exp(F.log_softmax(scores, dim=-1))
  return weights.bmm(value) # (batch_size, seq_size, seq_size), (batch_size, seq_size, embedding_dim) -> weighted sum with respect to attention vector

In [149]:
def make_padding_mask(q, k, padding_idx):
    # q,k의 size = (batch_size, seq_len)
    _, q_seq_len = q.size()
    _, k_seq_len = k.size()

    q = q.ne(padding_idx)  # padding token을 0, 나머지를 1로 만들어줌
    q = q.unsqueeze(1).unsqueeze(3) # (batch_size, 1, q_seq_len, 1)
    q = q.repeat(1,1,1,k_seq_len)   # (batch_size, 1, q_seq_len, k_seq_len)

    k = k.ne(padding_idx)
    k = k.unsqueeze(1).unsqueeze(2) # (batch_size, 1, 1, k_seq_len)
    k = k.repeat(1,1,q_seq_len,1)   # (batch_size, 1, q_seq_len, k_seq_len)

    # and 연산
    # (batch_size, 1, q_seq_len, k_seq_len)
    mask = q & k

    return mask.int().squeeze(dim=1)

In [57]:
# multi-head-attention
class AttentionHead(nn.Module):
  def __init__(self, embed_dim, head_dim, mask=False, padding_idx=None):
    super(AttentionHead, self).__init__()
    self.q = nn.Linear(embed_dim, head_dim)
    self.k = nn.Linear(embed_dim, head_dim)
    self.v = nn.Linear(embed_dim, head_dim)
    self.mask = mask
    self.padding_idx= padding_idx

  def forward(self, hidden_state, padding_mask=None):

    if self.mask:
      mask_matrix = torch.tril(torch.ones(hidden_state.size(1), hidden_state.size(1))).unsqueeze(0)
      mask_tensor = mask_matrix.repeat(hidden_state.size(0), 1, 1)
      mask_tensor.require_grads=False
    else:
      mask_tensor = None
    attn_outputs = scaled_dot_product_attention(
        self.q(hidden_state), self.k(hidden_state), self.v(hidden_state), mask_tensor, padding_mask
    )

    return attn_outputs


In [58]:
class MultiHeadAttention(nn.Module):
  def __init__(self, config, mask=False):
    super(MultiHeadAttention, self).__init__()
    embed_dim = config.hidden_size
    num_heads = config.num_attention_heads
    head_dim = embed_dim // num_heads
    padding_idx = config.pad_token_id
    self.mask = mask
    self.heads = nn.ModuleList(
        [AttentionHead(embed_dim, head_dim, mask, padding_idx) for _ in range(num_heads)]
    )
    self.output_linear= nn.Linear(embed_dim, embed_dim)

  def forward(self, hidden_state, padding_mask=None):
    x = torch.cat([h(hidden_state, padding_mask) for h in self.heads], dim=-1)
    x = self.output_linear(x)
    return x

In [59]:
multihead_attn = MultiHeadAttention(config)
attn_output = multihead_attn(input_embeds)
attn_output.size()

torch.Size([1, 7, 768])

In [60]:
# Self-Attention Visualization

from bertviz import head_view
from transformers import AutoModel

model = AutoModel.from_pretrained(model_ckpt, output_attentions=True)

sentence_a = "time flies like an arrow"
sentence_b = "fruit flies like a banana"

viz_inputs = tokenizer(sentence_a, sentence_b, return_tensors="pt")
attention = model(**viz_inputs).attentions
sentence_b_start = (viz_inputs.token_type_ids == 0).sum(dim=1)
tokens = tokenizer.convert_ids_to_tokens(viz_inputs.input_ids[0])

head_view(attention, tokens, sentence_b_start, heads=[1])

Output hidden; open in https://colab.research.google.com to view.

In [61]:
class FeedForward(nn.Module):
  def __init__(self, config):
    super().__init__()
    self.linear_1 = nn.Linear(config.hidden_size, config.intermediate_size)
    self.linear_2 = nn.Linear(config.intermediate_size, config.hidden_size)
    self.gelu = nn.GELU()
    self.dropout = nn.Dropout(config.hidden_dropout_prob)

  def forward(self, x):
    x = self.linear_1(x)
    x = self.gelu(x)
    x = self.linear_2(x)
    x = self.dropout(x)
    return x

In [62]:
feed_forward = FeedForward(config)
ff_outputs = feed_forward(attn_output)
ff_outputs.size()

torch.Size([1, 7, 768])

In [63]:
class TransformerEncoderLayer(nn.Module):
  def __init__(self, config):
    super().__init__()
    self.layer_norm_1 = nn.LayerNorm(config.hidden_size)
    self.layer_norm_2 = nn.LayerNorm(config.hidden_size)
    self.attention = MultiHeadAttention(config)
    self.feed_forward = FeedForward(config)
    self.padding_idx = config.pad_token_id


  def forward(self, x, padding_mask=None):
    hidden_state = self.layer_norm_1(x)
    x = x + self.attention(hidden_state, padding_mask)
    x = x + self.feed_forward(self.layer_norm_2(x))
    return x

In [64]:
encoder_layer = TransformerEncoderLayer(config)
input_embeds.shape, encoder_layer(input_embeds).size()

(torch.Size([1, 7, 768]), torch.Size([1, 7, 768]))

In [65]:
class Embeddings(nn.Module):
  def __init__(self, config):
    super().__init__()
    self.token_embeddings = nn.Embedding(config.vocab_size,
                                         config.hidden_size,
                                         padding_idx=0)
    #위치도 니가 알아서 찾아봐라 식.
    self.position_embeddings = nn.Embedding(config.max_position_embeddings,
                                            config.hidden_size)
    self.layer_norm = nn.LayerNorm(config.hidden_size, eps=1e-12)
    self.dropout = nn.Dropout()

  def forward(self, input_ids):
    seq_length = input_ids.size(1)
    position_ids = torch.arange(seq_length, dtype=torch.long).unsqueeze(0)

    token_embeddings = self.token_embeddings(input_ids)
    position_embeddings = self.position_embeddings(position_ids)

    # token embedding과 position embedding을 단순히 더하는 연산인 것임.
    embeddings = token_embeddings + position_embeddings
    embeddings = self.layer_norm(embeddings)
    embeddings = self.dropout(embeddings)
    return embeddings

In [108]:
embedding_layer = Embeddings(config)
embedding_layer(inputs.input_ids).size()

torch.Size([1, 30, 768])

# Encoder의 구현

- Embeddings Layer(token_embedding + position embedding +layer Normalization + Dropout)
- Transformer Encoder Layer(layerNorm + MultiHeadAttention + LayerNorm + Feed_ForwardNet)
- Transformer Encoder Layer를 겹으로 쌓아서 구성

In [171]:
class TransformerEncoder(nn.Module):
  def __init__(self, config):
    super().__init__()
    self.embeddings = Embeddings(config)
    self.layers = nn.ModuleList(
        [TransformerEncoderLayer(config)
        for _ in range(config.num_hidden_layers)]
    )
    self.padding_idx = config.pad_token_id
    self.padding_mask = None

  def forward(self, x):
    self.padding_mask = make_padding_mask(x, x, self.padding_idx)
    x = self.embeddings(x)
    for i, layer in enumerate(self.layers):
      #print(f"{i + 1}번째 Layer")
      x = layer(x, self.padding_mask)
    return x

In [161]:
encoder = TransformerEncoder(config)
output = encoder(inputs.input_ids)

tensor([[[1, 1, 1, 1, 1, 1, 1, 0, 0, 0],
         [1, 1, 1, 1, 1, 1, 1, 0, 0, 0],
         [1, 1, 1, 1, 1, 1, 1, 0, 0, 0],
         [1, 1, 1, 1, 1, 1, 1, 0, 0, 0],
         [1, 1, 1, 1, 1, 1, 1, 0, 0, 0],
         [1, 1, 1, 1, 1, 1, 1, 0, 0, 0],
         [1, 1, 1, 1, 1, 1, 1, 0, 0, 0],
         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]], dtype=torch.int32)


In [162]:
output.shape

torch.Size([1, 10, 768])

In [163]:
print(inputs.input_ids)

tensor([[  101,  2051, 10029,  2066,  2019,  8612,   102,     0,     0,     0]])


In [29]:
class TransformerForSequenceClassification(nn.Module):
  def __init__(self, config):
    super().__init__()
    self.encoder = TransformerEncoder(config)
    self.dropout = nn.Dropout(config.hidden_dropout_prob)
    self.classifier = nn.Linear(config.hidden_size, config.num_labels)

  def forward(self, x):
    x = self.encoder(x)[:, 0, :]
    x = self.dropout(x)
    x = self.classifier(x)
    return x

In [34]:
config.num_labels = 3
encoder_classifier = TransformerForSequenceClassification(config)
output = encoder_classifier(inputs.input_ids)

In [38]:
print(output)
output = F.softmax(output, dim =-1) # 정규화 되지 않은 logit을 반환

tensor([[ 3.1588,  0.7398, -1.3600]], grad_fn=<AddmmBackward0>)


In [39]:
output

tensor([[0.9092, 0.0809, 0.0099]], grad_fn=<SoftmaxBackward0>)

# 3.3 Decoder
- Masked Self-attention Layer
- Encoder-Decoder attention layer

In [167]:
seq_len = inputs.input_ids.size(-1)
mask = torch.tril(torch.ones(seq_len, seq_len).unsqueeze(0))
mask[0]

tensor([[1., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [1., 1., 0., 0., 0., 0., 0., 0., 0., 0.],
        [1., 1., 1., 0., 0., 0., 0., 0., 0., 0.],
        [1., 1., 1., 1., 0., 0., 0., 0., 0., 0.],
        [1., 1., 1., 1., 1., 0., 0., 0., 0., 0.],
        [1., 1., 1., 1., 1., 1., 0., 0., 0., 0.],
        [1., 1., 1., 1., 1., 1., 1., 0., 0., 0.],
        [1., 1., 1., 1., 1., 1., 1., 1., 0., 0.],
        [1., 1., 1., 1., 1., 1., 1., 1., 1., 0.],
        [1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]])

In [169]:
mask.masked_fill(mask==0, float(-1e10))

tensor([[[ 1.0000e+00, -1.0000e+10, -1.0000e+10, -1.0000e+10, -1.0000e+10,
          -1.0000e+10, -1.0000e+10, -1.0000e+10, -1.0000e+10, -1.0000e+10],
         [ 1.0000e+00,  1.0000e+00, -1.0000e+10, -1.0000e+10, -1.0000e+10,
          -1.0000e+10, -1.0000e+10, -1.0000e+10, -1.0000e+10, -1.0000e+10],
         [ 1.0000e+00,  1.0000e+00,  1.0000e+00, -1.0000e+10, -1.0000e+10,
          -1.0000e+10, -1.0000e+10, -1.0000e+10, -1.0000e+10, -1.0000e+10],
         [ 1.0000e+00,  1.0000e+00,  1.0000e+00,  1.0000e+00, -1.0000e+10,
          -1.0000e+10, -1.0000e+10, -1.0000e+10, -1.0000e+10, -1.0000e+10],
         [ 1.0000e+00,  1.0000e+00,  1.0000e+00,  1.0000e+00,  1.0000e+00,
          -1.0000e+10, -1.0000e+10, -1.0000e+10, -1.0000e+10, -1.0000e+10],
         [ 1.0000e+00,  1.0000e+00,  1.0000e+00,  1.0000e+00,  1.0000e+00,
           1.0000e+00, -1.0000e+10, -1.0000e+10, -1.0000e+10, -1.0000e+10],
         [ 1.0000e+00,  1.0000e+00,  1.0000e+00,  1.0000e+00,  1.0000e+00,
           1.0000e+

## Encoder-Decoder 전용 Attention

In [173]:
# multi-head-attention
class AttentionEncDecHead(nn.Module):
  def __init__(self, embed_dim, head_dim, padding_idx=None, mask=False):
    super().__init__()
    self.q = nn.Linear(embed_dim, head_dim)
    self.k = nn.Linear(embed_dim, head_dim)
    self.v = nn.Linear(embed_dim, head_dim)
    self.mask = mask
    self.padding_idx=padding_idx

  def forward(self, dec_hidden_state, enc_hidden_state, padding_mask=None):
    if self.mask:
      mask_matrix = torch.tril(torch.ones(enc_hidden_state.size(1), enc_hidden_state.size(1))).unsqueeze(0)
      mask_tensor = mask_matrix.repeat(enc_hidden_state.size(0), 1, 1)
      mask_tensor.require_grads=False
    else:
      mask_tensor = None

    attn_outputs = scaled_dot_product_attention(
        self.q(dec_hidden_state), self.k(enc_hidden_state), self.v(enc_hidden_state), mask_tensor, padding_mask
    )
    return attn_outputs


In [174]:
class MultiHeadEncDecAttention(nn.Module):
  def __init__(self, config, mask=False):
    super().__init__()
    embed_dim = config.hidden_size
    num_heads = config.num_attention_heads
    head_dim = embed_dim // num_heads
    self.padding_idx = config.pad_token_id
    self.mask = mask
    self.heads = nn.ModuleList(
        [AttentionEncDecHead(embed_dim, head_dim, self.mask, self.padding_idx) for _ in range(num_heads)]
    )
    self.output_linear= nn.Linear(embed_dim, embed_dim)

  def forward(self, dec_hidden_state, enc_hidden_state, padding_mask):
    x = torch.cat([h(dec_hidden_state, enc_hidden_state, padding_mask) for h in self.heads], dim=-1)
    x = self.output_linear(x)
    return x

## Decoder Architecture
##### 여기서 부터는 직접하란다.. 이게 말이되냐..

In [175]:
class TransformerDecoderLayer(nn.Module):
  def __init__(self, config):
    super().__init__()
    self.layer_norm_1 = nn.LayerNorm(config.hidden_size)
    self.layer_norm_2 = nn.LayerNorm(config.hidden_size)
    self.layer_norm_3 = nn.LayerNorm(config.hidden_size)
    self.layer_norm_4 = nn.LayerNorm(config.hidden_size)
    self.masked_attention = MultiHeadAttention(config, mask=True)
    self.enc_dec_attention = MultiHeadEncDecAttention(config, mask=True)
    self.feed_forward = FeedForward(config)

  def forward(self, x, enc_outputs, attn_padding_mask=None, enc_dec_attn_padding_mask=None):
    hidden_state = self.layer_norm_1(x)
    x = x + self.masked_attention(hidden_state, attn_padding_mask)
    x = x + self.enc_dec_attention(self.layer_norm_2(x), self.layer_norm_3(enc_outputs), enc_dec_attn_padding_mask)
    x = x + self.feed_forward(self.layer_norm_4(x))
    return x

In [187]:
class TransformerDecoder(nn.Module):
  def __init__(self, config):
    super().__init__()
    self.embeddings = Embeddings(config)
    self.layers = nn.ModuleList(
        [TransformerDecoderLayer(config)
        for _ in range(config.num_hidden_layers)]
    )
    self.padding_idx = config.pad_token_id

  def forward(self, x_target, x_enc, enc_ids):
    #이 패딩마스크는 self-attention에만 적용됨.
    self.attn_padding_mask = make_padding_mask(x_target, x_target, self.padding_idx)

    # enc-dec attention의 경우 padding_mask/look_ahead mask를 병합하는 추가연산을 필요로함.
    self.enc_dec_padding_mask = make_padding_mask(x_target, enc_ids, self.padding_idx)

    #ids를 이용해 mask를 만든후에 embedding 진행
    x_target = self.embeddings(x_target)
    for layer in self.layers:
      x_target = layer(x_target, x_enc, self.attn_padding_mask, self.enc_dec_padding_mask)
    return x_target

In [188]:
# @title Encoder output
x_src_ids = inputs.input_ids
x_tgt_ids = inputs.input_ids
encoder_model = TransformerEncoder(config)
enc_output = encoder(inputs.input_ids)

In [209]:
# @title Decoder output
decoder_model = TransformerDecoder(config)
decoder_output = decoder_model(x_tgt_ids, enc_output, x_src_ids)
print(decoder_output.shape)

torch.Size([1, 10, 768])
torch.Size([1, 10, 768])


In [210]:
print(config)
print(inputs.input_ids)

BertConfig {
  "_name_or_path": "bert-base-uncased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_length": 15,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.38.2",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

tensor([[  101,  2051, 10029,  2066,  2019,  8612,   102,     0,     0,     0]])


In [211]:
import copy

class Transformer(nn.Module):
  def __init__(self, config, apply_softmax):
    super().__init__()
    self.encoder = TransformerEncoder(config)
    self.decoder = TransformerDecoder(config)

    hidden_size = config.hidden_size
    vocab_size = config.vocab_size
    self.embed_to_tokens = nn.Linear(hidden_size, vocab_size)
    self.apply_softmax=apply_softmax

  def forward(self, x_enc, x_tgt):
    enc_ids = copy.deepcopy(x_enc)
    enc_outputs = self.encoder(x_enc)

    #enc_id가 담긴 입력을 Decoder에 넘겨줘야함.
    outputs = self.decoder(x_tgt, enc_outputs, enc_ids)
    outputs = self.embed_to_tokens(outputs)

    if self.apply_softmax:
      outputs = F.softmax(outputs, dim=-1)
    return outputs

In [193]:
def get_src_tgt_input(source_text, target_text, config):
  source_inputs = tokenizer(source_text, padding=True, max_length=config.max_length, return_tensors="pt", truncation=True)
  target_inputs = tokenizer(source_text, padding=True, max_length=config.max_length, return_tensors="pt", truncation=True)
  source_inputs.input_ids = get_padded_sequence(source_inputs.input_ids, config.max_length)
  target_inputs.input_ids = get_padded_sequence(target_inputs.input_ids, config.max_length)
  return source_inputs, target_inputs

In [212]:
source_text = "I like apples."
target_text = "I like apples, too."
config.max_length = 15
src_inputs, tgt_inputs = get_src_tgt_input(source_text, target_text, config)

print(src_inputs.keys())
print(tgt_inputs.keys())

dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])
dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])


In [224]:
# @title model 생성
torch.manual_seed(0) # seed를 고정하여 재현성 조정.
final_model = Transformer(config, apply_softmax=True)

In [225]:
outputs =final_model(src_inputs.input_ids, tgt_inputs.input_ids)
print(outputs.shape)

torch.Size([1, 15, 768])
torch.Size([1, 15, 30522])


In [228]:
# @title Predicted Sequence 출력(학습이 안되어 이상한게 나옴.)

predicted_indices = torch.argmax(outputs, dim=-1)
print(predicted_indices)
predicted_tokens = tokenizer.convert_ids_to_tokens(predicted_indices.squeeze().tolist())

print("추론된 토큰 시퀀스:")
print(predicted_tokens)

tensor([[29064, 19550, 22943, 14759, 16859, 28962, 18437, 15701,  3078, 20809,
         19667, 17010, 27580, 15508,  9812]])
추론된 토큰 시퀀스:
['##tsk', 'runoff', '##roid', 'drowning', 'knob', 'pebbles', 'muse', 'shouts', 'primary', 'congregational', 'darlington', 'tilt', '##cliff', 'worries', 'dimension']
