In [1]:
import torch.nn as nn
import torch
import torch.nn.functional as F
import numpy as np
import pandas as pd
import nltk
import matplotlib.pyplot as plt

from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from torch.utils.data import TensorDataset, DataLoader, Subset
from tensorflow.keras.utils import pad_sequences

from torch.utils.data import TensorDataset, DataLoader, Subset
from tqdm import tqdm

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


tqdm.pandas()

# 데이터 Load

In [2]:
from datasets import load_dataset

dataset = load_dataset("bentrevett/multi30k")

In [3]:
from transformers import AutoModel, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-en-de")



In [4]:
dataset2 = dataset.map(lambda e: tokenizer(e['en'], padding= False), batched=True)
dataset3 = dataset.map(lambda e: tokenizer(e['de'], padding= False), batched=True)

In [5]:
58101 # 이를 start token으로 지정.a

58101

In [6]:
# np.unique([len(i) for i in dataset2['train']['input_ids']]), np.unique([len(i) for i in dataset3['train']['input_ids']])

In [7]:
def pad_sequences_en(sequences):
    max_len = 128
    padded_sequences = []
    for sequence in sequences:
        if len(sequence) >= max_len:
            padded_sequence = sequence[:max_len]  # 최대 길이까지 잘라냄
        else:
            padded_sequence = sequence + [58100] * (max_len - len(sequence))  # 패딩 추가
        padded_sequences.append(padded_sequence)
    return padded_sequences

In [8]:
def pad_sequences_de(sequences):
    max_len = 128
    padded_sequences = []
    for sequence in sequences:
        if len(sequence) >= max_len:
            padded_sequence = [58101] + sequence[:max_len-1]  # 최대 길이까지 잘라냄
        else:
            padded_sequence = [58101] + sequence + [58100] * (max_len - len(sequence) - 1)  # 패딩 추가
        padded_sequences.append(padded_sequence)
    return padded_sequences

In [9]:
padded_batch = pad_sequences(dataset2['train']['input_ids'])

In [10]:
train_en, train_ge = torch.tensor(pad_sequences_en(dataset2['train']['input_ids'])), torch.tensor(pad_sequences_de(dataset3['train']['input_ids']))
valid_en, valid_ge = torch.tensor(pad_sequences_en(dataset2['validation']['input_ids'])), torch.tensor(pad_sequences_de(dataset3['validation']['input_ids']))
test_en, test_ge = torch.tensor(pad_sequences_en(dataset2['test']['input_ids'])), torch.tensor(pad_sequences_de(dataset3['test']['input_ids']))

In [11]:
train_ge[0]

tensor([58101,   589,  7554,  7861, 22012,    95,   197,  5192,    18, 17694,
           34,   731,  7199,    82,    49,  4407,    15,     5,     9,   394,
         1258,  1578,  5154,   526,    45, 14243,  3351,     3,     0, 58100,
        58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100,
        58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100,
        58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100,
        58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100,
        58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100,
        58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100,
        58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100,
        58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100,
        58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100,
        58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100])

In [12]:
set([len(i) for i in train_en]), set([len(i) for i in train_ge])

({128}, {128})

In [13]:
class Dataset(Dataset):
    
    def __init__(self, inputs, output):

        self.inputs = inputs
        self.output = output

    def __getitem__(self, idx):

        inputs = self.inputs[idx]
        output = self.output[idx]
        
        return inputs, output

    def __len__(self):
        return len(self.output)

In [14]:
train_dataset = Dataset(train_en, train_ge)
valid_dataset = Dataset(valid_en, valid_ge)
test_dataset = Dataset(test_en, test_ge)

In [15]:



train_loader = DataLoader(train_dataset, batch_size = 64, shuffle = False, drop_last = True)
valid_loader = DataLoader(valid_dataset, batch_size = 64, shuffle = False, drop_last = True)
test_loader = DataLoader(test_dataset, batch_size = 64, shuffle = False, drop_last = True)

In [16]:
len(valid_loader)

15

# Model

In [17]:
import math

class Embeddings(nn.Module):


    def __init__(self, vocab_size, d_model, max_len = 128):
        super(Embeddings, self).__init__()
        self.d_model = d_model
        self.dropout = nn.Dropout(0.1)
        self.embed = nn.Embedding(vocab_size + 1, d_model)
        self.pe = self.create_positinal_encoding(max_len, self.d_model)
        self.dropout = nn.Dropout(0.1)

    def create_positinal_encoding(self, max_len, d_model):
        pe = torch.zeros(max_len, d_model).to(device)
        for pos in range(max_len):  
            for i in range(0, d_model, 2):  
                pe[pos, i] = math.sin(pos / (10000 ** ((2 * i)/d_model)))
                pe[pos, i + 1] = math.cos(pos / (10000 ** ((2 * (i + 1))/d_model)))
        pe = pe.unsqueeze(0)  
        return pe

    def forward(self, encoded_words):
        
        embedding = self.embed(encoded_words) * torch.sqrt(torch.tensor(self.d_model)).to(device)
        embedding += self.pe[:, :embedding.size(1)]   
        embedding = self.dropout(embedding)
        return embedding

In [18]:
# class Attention(nn.Module):
    
#     def __init__(self, embedding_size = 512):
        
#         self.data = data
#         self.embedding_size= embedding_size
#         self.weight_Q = nn.Linear(embedding_size, embedding_size)
#         self.weight_K = nn.Linear(embedding_size, embedding_size)
#         self.weight_V = nn.Linear(embedding_size, embedding_size)
#         self.softmax = nn.Softmax()
        
#     def forward(self, data):
        
#         Q = self.weight_Q(data)
#         K = self.weight_K(data)
#         V = self.weight_V(data)
#         score = torch.matmul(Q,K.T) / torch.sqrt(self.embedding_size)
#         value = self.softmax(score) * V
#         return value
         

In [19]:
def create_masks(inputs, outputs_input, outputs_target):
    
    def subsequent_mask(size):
        mask = torch.triu(torch.ones(size, size)).transpose(0, 1).type(dtype = torch.uint8)
        return mask.unsqueeze(0) # 상삼각행렬 생성 -> 행과 열을 뒤 바꾸어 하삼각행렬로 바꿈. (밑에가 다 0)
    
    inputs_mask = inputs != 58100
    inputs_mask = inputs_mask.to(device)
    inputs_mask = inputs_mask.unsqueeze(1).unsqueeze(1) # 각  input에 대해서 상삼각행렬에 대응하도록 설정.
    
    outputs_input_mask = outputs_input != 58100
    outputs_input_mask = outputs_input_mask.unsqueeze(1) 
    outputs_input_mask = outputs_input_mask & subsequent_mask(outputs_input.size(-1)).type_as(outputs_input_mask.data)
    outputs_input_mask = outputs_input_mask.unsqueeze(1)
    # masking을 해줌으로서, 

    
    outputs_target_mask = outputs_target != 58100
    
    return inputs_mask, outputs_input_mask, outputs_target_mask
        
        

In [20]:
train_en

tensor([[ 4386,  1296,     2,  ..., 58100, 58100, 58100],
        [15036,  1135,     5,  ..., 58100, 58100, 58100],
        [   93,   839,  4040,  ..., 58100, 58100, 58100],
        ...,
        [ 4386,  8722,  2013,  ..., 58100, 58100, 58100],
        [  282, 17525,   175,  ..., 58100, 58100, 58100],
        [   93,   175,     5,  ..., 58100, 58100, 58100]])

In [21]:
class MultiHeadAttention(nn.Module):
    
    def __init__(self, heads, d_model):
        super(MultiHeadAttention, self).__init__()
        self.d_k = d_model // heads
        self.heads = heads
        self.dropout = nn.Dropout(0.1)
        self.query = nn.Linear(d_model, d_model)
        self.key  = nn.Linear(d_model, d_model)
        self.value = nn.Linear(d_model, d_model)
        self.concat = nn.Linear(d_model, d_model)
        
    def forward(self, query, key, value, mask):
        
        query = self.query(query)
        key = self.key(key)
        value = self.value(value)
        
        query = query.view(query.shape[0], -1, self.heads, self.d_k).permute(0, 2, 1, 3)
        key = key.view(key.shape[0], -1, self.heads, self.d_k).permute(0, 2, 1, 3)
        value = value.view(value.shape[0], -1, self.heads, self.d_k).permute(0, 2, 1, 3)
        

        scores = torch.matmul(query, key.permute(0 ,1 ,3, 2)) / math.sqrt(query.size(-1))
        
        # print(query.shape, key.shape, value.shape, mask.shape, scores.shape)

        
        scores = scores.masked_fill(mask == 0, -1e9) # masking 된 것에 매우 작은 수 부여 -> softmax 계산시 -inf 로 계산되어짐.
        weights = F.softmax(scores, dim = -1) # attention score 계산
        context = torch.matmul(weights, value)  # attention value 계산
        context = context.permute(0,2,1,3).contiguous().view(context.shape[0], -1, self.heads * self.d_k)
        
        interacted = self.concat(context)
        
        return interacted
        
        

In [22]:
class FeedForward(nn.Module):
    
    def __init__(self, d_model, middle_dim = 2048):
        super(FeedForward, self).__init__()
        
        self.fc1 = nn.Linear(d_model, middle_dim)
        self.fc2 = nn.Linear(middle_dim, d_model)
        self.dropout = nn.Dropout(0.1)
        
    def forward(self,x):
        out = F.relu(self.fc1(x))
        out = self.fc2(self.dropout(out))
        return out        

In [23]:
class EncoderLayer(nn.Module):
    
    def __init__(self, d_model, heads):
        super(EncoderLayer, self).__init__()
        self.layernorm = nn.LayerNorm(d_model)
        self.self_multihead = MultiHeadAttention(heads, d_model)
        self.feed_forward = FeedForward(d_model)
        self.dropout = nn.Dropout(0.1)
        
    def forward(self, embeddings, mask):
        interacted = self.dropout(self.self_multihead(embeddings, embeddings, embeddings, mask))
        interacted = self.layernorm(interacted + embeddings)
        feed_forward_out = self.dropout(self.feed_forward(interacted))
        encoded = self.layernorm(feed_forward_out + interacted)
        return encoded
        

In [24]:
class DecoderLayer(nn.Module):
    
    def __init__(self, d_model, heads):
        super(DecoderLayer, self).__init__()
        self.layernorm = nn.LayerNorm(d_model)
        self.self_multihead = MultiHeadAttention(heads, d_model)
        self.src_multihead = MultiHeadAttention(heads, d_model)
        self.feed_forward = FeedForward(d_model)
        self.dropout = nn.Dropout(0.1)
        
    def forward(self, embeddings, encoded, src_mask, target_mask):
        query = self.dropout(self.self_multihead(embeddings, embeddings, embeddings, target_mask))
        query = self.layernorm(query + embeddings)
        interacted = self.dropout(self.src_multihead(query, encoded, encoded, src_mask))
        interacted = self.layernorm(interacted + query)
        feed_forward_out = self.dropout(self.feed_forward(interacted))
        decoded = self.layernorm(feed_forward_out + interacted)
        return decoded

In [25]:
class P_Transformer(nn.Module):
    
    def __init__(self, d_model, heads, num_layers, vocab_size):
        super(Transformer, self).__init__()
        
        self.d_model = d_model
        self.vocab_size = vocab_size + 1
        self.embed = Embeddings(self.vocab_size, d_model)
        self.encoder = nn.ModuleList([EncoderLayer(d_model, heads) for _ in range(num_layers)])
        self.decoder = nn.ModuleList([DecoderLayer(d_model, heads) for _ in range(num_layers)])
        self.logit = nn.Linear(d_model, self.vocab_size)
        
    def encode(self, src_words, src_mask):
        src_embeddings = self.embed(src_words)
        encoded_layers = []
        for layer in self.encoder:
            src_embeddings = layer(src_embeddings, src_mask)
            encoded_layers.append(src_embeddings)
            
        return encoded_layers

    def decode(self, target_words, target_mask, src_embeddings, src_mask):
        tgt_embeddings = self.embed(target_words)
        for i, layer in enumerate(self.decoder):
            tgt_embeddings = layer(tgt_embeddings, src_embeddings[i], src_mask, target_mask)
            decoded_layers.append(tgt_embeddings)
        return tgt_embeddings

    def forward(self, src_words, src_mask, target_words, target_mask):
        encoded_layers = self.encode(src_words, src_mask)
        decoded = self.decode(target_words, target_mask, encoded_layers, src_mask)
        out = F.log_softmax(self.logit(decoded), dim = 2)
        return out

In [26]:
class Transformer(nn.Module):
    
    def __init__(self, d_model, heads, num_layers, vocab_size):
        super(Transformer, self).__init__()
        
        self.d_model = d_model
        self.vocab_size = vocab_size + 1
        self.embed = Embeddings(self.vocab_size, d_model)
        self.encoder = nn.ModuleList([EncoderLayer(d_model, heads) for _ in range(num_layers)])
        self.decoder = nn.ModuleList([DecoderLayer(d_model, heads) for _ in range(num_layers)])
        self.logit = nn.Linear(d_model, self.vocab_size)
        
    def encode(self, src_words, src_mask):
        src_embeddings = self.embed(src_words)
        
        for layer in self.encoder:
            src_embeddings = layer(src_embeddings, src_mask)
            
        return src_embeddings

    def decode(self, target_words, target_mask, src_embeddings, src_mask):
        tgt_embeddings = self.embed(target_words)
        for layer in self.decoder:
            tgt_embeddings = layer(tgt_embeddings, src_embeddings, src_mask, target_mask)
        return tgt_embeddings

    def forward(self, src_words, src_mask, target_words, target_mask):
        encoded = self.encode(src_words, src_mask)
        decoded = self.decode(target_words, target_mask, encoded, src_mask)
        out = F.log_softmax(self.logit(decoded), dim = 2)
        return out
            

In [27]:
class AdamWarmup:
    
    def __init__(self, model_size, warmup_steps, optimizer):
        
        self.model_size = model_size
        self.warmup_steps = warmup_steps
        self.optimizer = optimizer
        self.current_step = 0
        self.lr = 0
    
    def get_lr(self):
        return self.model_size ** (-0.5) * min(self.current_step ** (-0.5), self.current_step * self.warmup_steps ** (-1.5))
    
    def step(self):
        
        self.current_step += 1
        lr = self.get_lr()
        for param_group in self.optimizer.param_groups:
            param_group['lr'] = lr
            
        self.lr = lr
        self.optimizer.step()

In [28]:
import torch.nn.functional as F

class LossWithLS(nn.Module):
    
    def __init__(self, size, smooth):
        super(LossWithLS, self).__init__()
        self.smooth = smooth
        self.size = size

    def forward(self, prediction, target, mask):
        prediction = prediction.view(-1, prediction.size(-1))   # (batch_size * max_words, vocab_size)
        target = target.contiguous().view(-1)   # (batch_size * max_words)
        mask = mask.float()
        mask = mask.view(-1)       # (batch_size * max_words)
        
        # Smoothed one-hot labels
        labels = torch.full_like(prediction, self.smooth / (self.size - 1))
        labels.scatter_(1, target.unsqueeze(1), 1 - self.smooth)
        
        # Apply mask
        masked_prediction = prediction * mask.unsqueeze(1)
        masked_labels = labels * mask.unsqueeze(1)

        
        # Calculate negative log likelihood loss
        loss = F.nll_loss(masked_prediction, target, reduction='none')
        loss *= mask
        
        # Normalize the loss
        loss = loss.sum() / mask.sum()
        
        return loss

# Training

In [29]:
128 / 8

16.0

In [30]:
d_model = 64
heads = 8
num_layers = 6
num_layers = 6
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
epochs = 10
vocab_len = len(tokenizer.get_vocab())

transformer = Transformer(d_model = d_model , heads = heads, num_layers = num_layers, vocab_size = vocab_len)
transformer = transformer.to(device)
adam_optimizer = torch.optim.Adam(transformer.parameters())
transformer_optimizer = AdamWarmup(model_size = d_model, warmup_steps = 4000, optimizer = adam_optimizer)
criterion = LossWithLS(vocab_len, 0.1)

In [31]:
next(iter(train_loader.dataset))[1].shape

torch.Size([128])

In [32]:
import wandb


wandb.init(
    # set the wandb project where this run will be logged
    project="Transformer_Hard_Coding",

    # track hyperparameters and run metadata
    config={
    "loss": 'KLDivergence',
    "architecture": "Transformer",
    "optimizer" : 'AdamWarmup',
    'layer' : 6,
    'heads' : 8,
    'd_model' : 64   
    }
)

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33myoon303b[0m ([33mku_software[0m). Use [1m`wandb login --relogin`[0m to force relogin


In [33]:


import torch
import torch.nn.functional as F
import random

def generate_next_words(transformer, sequence, encoded, question_mask, beam_width):
    
    start_token = 58101
    
    if not sequence:
        return [(start_token, 0)]
    
    size = len(sequence)
    target_mask = torch.triu(torch.ones(size, size)).transpose(0, 1).type(dtype=torch.uint8)
    target_mask = target_mask.to(device).unsqueeze(0).unsqueeze(0)

    words = torch.LongTensor([sequence]).to(device)

    decoded = transformer.decode(words, target_mask, encoded, question_mask)
    predictions = transformer.logit(decoded[:, -1])

    # Apply log_softmax
    log_probs = F.log_softmax(predictions, dim = -1)

    # Get top-k words
    top_k_probs, top_k_indices = torch.topk(log_probs, beam_width, dim=-1)
    top_k_probs = top_k_probs.squeeze().tolist()
    top_k_indices = top_k_indices.squeeze().tolist()

    next_words = [(word, prob) for word, prob in zip(top_k_indices, top_k_probs)]
    return next_words


def beam_search(transformer, question, question_mask, max_len, dict, beam_width=5):
    transformer.eval()
    start_token = 58101
    end_token = 0

    encoded = transformer.encode(question, question_mask)

    beams = [([], 0)] 

    for step in range(max_len):
        candidates = []

        for seq, score in beams:
            if seq and seq[-1] == end_token:
                candidates.append((seq, score))
                continue
            
            next_words = generate_next_words(transformer, seq, encoded, question_mask, beam_width)

            for word, log_prob in next_words:
                candidates.append((seq + [word], score + log_prob))

        # Select top-k candidates
        candidates.sort(key=lambda x: x[1], reverse=True)
        beams = candidates[:beam_width]

        # Check if all beams have ended
        if all(seq[-1] == end_token for seq, _ in beams):
            break

    best_seq, _ = max(beams, key=lambda x: x[1])
    return best_seq

In [34]:
bb = {v:k for k,v in tokenizer.get_vocab().items()} 
bb[58101] = '<start>'

In [35]:
def train(train_loader, transformer, criterion, epoch):
    
    transformer.train()
    sum_loss, valid_sum_loss = 0, 0
    count, valid_count = 0, 0
    
    for i, (inputs, output) in enumerate(train_loader):
        
        samples = inputs.shape[0]
        
        inputs = inputs.to(device)
        output = output.to(device)
        
        output_in = output[:,:-1]
        output_target = output[:,1:]
        inputs_mask, output_in_mask, output_target_mask = create_masks(inputs, output_in, output_target)
        
        out = transformer(inputs, inputs_mask, output_in, output_in_mask)
        
        loss = criterion(out, output_target, output_target_mask)
        transformer_optimizer.optimizer.zero_grad()
        loss.backward()
        transformer_optimizer.step()
        
        sum_loss += loss.item() * samples
        count += samples
        
        if i % 5 == 0:
            print("Epoch [{}][{}/{}]\tTrain Loss: {:.3f}".format(epoch, i, len(train_loader), sum_loss/count))
            wandb.log({"Training loss" : sum_loss/count})
    
    transformer.eval()
    with torch.no_grad():
        for i, (inputs, output) in enumerate(valid_loader):
            
            samples = inputs.shape[0]
            
            inputs = inputs.to(device)
            output = output.to(device)
            
            output_in = output[:,:-1]
            output_target = output[:,1:]
            
            inputs_mask, output_in_mask, output_target_mask = create_masks(inputs, output_in, output_target)
            out = transformer(inputs, inputs_mask, output_in, output_in_mask)
            
            loss = criterion(out, output_target, output_target_mask)
            
            valid_sum_loss += loss.item() * samples
            valid_count += samples
        
            if i % 5   == 0:
                print("Epoch [{}][{}/{}]\t\t\tValid Loss: {:.3f}".format(epoch, i, len(valid_loader), valid_sum_loss/valid_count))
                wandb.log({"Validation loss" :  valid_sum_loss/valid_count })
                

    max_len = 128
    A = random.randint(1, 64)
    B = next(iter(test_loader))
    
    enc_qus = B[0][A]
    real_qus = B[1][A]
    question = torch.LongTensor(enc_qus).to(device).unsqueeze(0)
    question_mask = (question!=58100).to(device).unsqueeze(1).unsqueeze(1)
    sentence = beam_search(transformer, question, question_mask, int(max_len), dict)
    
    candidate = [bb[i] for i in sentence if (i != 0) and (i != 58100) and (i != 58101)]
    real = [bb[i.tolist()] for i in real_qus if (i.tolist() != 0) and (i.tolist() != 58100)  and (i != 58101)]
    print()
    print('Candidate :'  + ' '.join(candidate))
    print('Candidate_labeled : ', [bb[i] for i in sentence])
    print('Real :' + ' '.join(real))


In [36]:
torch.Tensor((64, 128, 128)) * math.sqrt(128)

tensor([ 724.0773, 1448.1547, 1448.1547])

In [44]:
train_dataset = Dataset(train_en, train_ge)
valid_dataset = Dataset(valid_en, valid_ge)
test_dataset = Dataset(test_en, test_ge)


train_loader = DataLoader(train_dataset, batch_size = 64, shuffle = True, drop_last = True)
valid_loader = DataLoader(valid_dataset, batch_size = 64, shuffle = True, drop_last = True)
test_loader = DataLoader(test_dataset, batch_size = 64, shuffle = False, drop_last = True)

In [45]:
len(train_en)

29000

In [46]:

random.randint(1, 15)

7

In [47]:
from datetime import datetime


for epoch in range(epochs):
    
    start_time = datetime.now() 
    train(train_loader, transformer, criterion, epoch)
    
    state = {'epoch': epoch, 'transformer': transformer, 'transformer_optimizer': transformer_optimizer}
    torch.save(state, 'checkpoint_new' + str(epoch+10) + '.pth.tar')
    
    time_elapsed = datetime.now() - start_time 
    print('#######   Time elapsed (hh:mm:ss.ms) {} #######'.format(time_elapsed))
    print()

Epoch [0][0/453]	Train Loss: 2.834
Epoch [0][5/453]	Train Loss: 2.886
Epoch [0][10/453]	Train Loss: 2.854
Epoch [0][15/453]	Train Loss: 2.859
Epoch [0][20/453]	Train Loss: 2.857
Epoch [0][25/453]	Train Loss: 2.849
Epoch [0][30/453]	Train Loss: 2.839
Epoch [0][35/453]	Train Loss: 2.838
Epoch [0][40/453]	Train Loss: 2.843
Epoch [0][45/453]	Train Loss: 2.841
Epoch [0][50/453]	Train Loss: 2.837
Epoch [0][55/453]	Train Loss: 2.837
Epoch [0][60/453]	Train Loss: 2.833
Epoch [0][65/453]	Train Loss: 2.828
Epoch [0][70/453]	Train Loss: 2.825
Epoch [0][75/453]	Train Loss: 2.822
Epoch [0][80/453]	Train Loss: 2.825
Epoch [0][85/453]	Train Loss: 2.818
Epoch [0][90/453]	Train Loss: 2.813
Epoch [0][95/453]	Train Loss: 2.811
Epoch [0][100/453]	Train Loss: 2.810
Epoch [0][105/453]	Train Loss: 2.805
Epoch [0][110/453]	Train Loss: 2.800
Epoch [0][115/453]	Train Loss: 2.798
Epoch [0][120/453]	Train Loss: 2.795
Epoch [0][125/453]	Train Loss: 2.794
Epoch [0][130/453]	Train Loss: 2.788
Epoch [0][135/453]	Trai

# Evaluate

In [29]:
d_model = 64
heads = 8
num_layers = 6
num_layers = 6
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
epochs = 10
vocab_len = len(tokenizer.get_vocab())

transformer = Transformer(d_model = d_model , heads = heads, num_layers = num_layers, vocab_size = vocab_len)
transformer = transformer.to(device)
# adam_optimizer = torch.optim.Adam(transformer.parameters())
# transformer_optimizer = AdamWarmup(model_size = d_model, warmup_steps = 4000, optimizer = adam_optimizer)
criterion = LossWithLS(vocab_len, 0.1)

In [30]:
import torch
from copy import deepcopy

# 저장된 파일을 불러올 때
checkpoint = torch.load('checkpoint_cross1.pth.tar')

# 불러온 checkpoint에서 모델 상태나 다른 필요한 요소들을 추출할 수 있습니다.
transformer = deepcopy(checkpoint['transformer'])

# 모델을 evaluation 모드로 설정 (필요에 따라)


In [31]:
device

device(type='cuda')

In [30]:
dict = tokenizer.get_vocab()

In [31]:
len(train_loader.dataset)

29000

In [32]:
len(test_loader.dataset)

1000

In [33]:
bb = {v:k for k,v in tokenizer.get_vocab().items()} 

In [34]:
dict['<start>'] = 58101

In [35]:
start_token = dict['<start>']

In [36]:
def evaluate(transformer, question, question_mask, max_len, dict):

    transformer.eval()

    bb = {v:k for k,v in tokenizer.get_vocab().items()} 
    start_token = 58101
    encoded = transformer.encode(question, question_mask)
    words = torch.LongTensor([[start_token]]).to(device)

    for step in range(max_len - 1):
        size = words.shape[1]
        target_mask = torch.triu(torch.ones(size, size)).transpose(0, 1).type(dtype=torch.uint8)
        target_mask = target_mask.to(device).unsqueeze(0).unsqueeze(0)
        decoded = transformer.decode(words, target_mask, encoded, question_mask)
        predictions = transformer.logit(decoded[:, -1])
    
        _, next_word = torch.max(predictions[:,:-1], dim = 1)
        print(predictions)       
        next_word = next_word.item()
  
        words = torch.cat([words, torch.LongTensor([[next_word]]).to(device)], dim = 1) 


    if words.dim() == 2:
        words = words.squeeze(0)
        words = words.tolist()


    return words

In [37]:
bb[18]

'e'

In [38]:
next(iter(train_loader))[0][0]

tensor([ 4386,   238,    48,  7288,    32,    14,  6813, 21507,     4,  5542,
            3,     0, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100,
        58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100,
        58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100,
        58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100,
        58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100,
        58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100,
        58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100,
        58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100,
        58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100,
        58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100,
        58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100,
        58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100])

In [39]:

def generate_next_words(transformer, sequence, encoded, question_mask, beam_width):
    if not sequence:
        return [(start_token, 0)]
    
    size = len(sequence)
    target_mask = torch.triu(torch.ones(size, size)).transpose(0, 1).type(dtype=torch.uint8)
    target_mask = target_mask.to(device).unsqueeze(0).unsqueeze(0)

    words = torch.LongTensor([sequence]).to(device)

    decoded = transformer.decode(words, target_mask, encoded, question_mask)
    predictions = transformer.logit(decoded[:, -1])

    # Apply log_softmax
    log_probs = predictions

    # Get top-k words
    top_k_probs, top_k_indices = torch.topk(log_probs, beam_width, dim=-1)
    top_k_probs = top_k_probs.squeeze().tolist()
    top_k_indices = top_k_indices.squeeze().tolist()

    next_words = [(word, prob) for word, prob in zip(top_k_indices, top_k_probs)]
    return next_words



In [40]:
import torch
import torch.nn.functional as F

def beam_search(transformer, question, question_mask, max_len, dict, beam_width=5):
    transformer.eval()
    start_token = 58101
    end_token = 58100

    encoded = transformer.encode(question, question_mask)

    beams = [([], 0)] 

    for step in range(max_len):
        candidates = []

        for seq, score in beams:
            if seq and seq[-1] == end_token:
                candidates.append((seq, score))
                continue
            
            next_words = generate_next_words(transformer, seq, encoded, question_mask, beam_width)

            for word, log_prob in next_words:
                candidates.append((seq + [word], score + log_prob))

        # Select top-k candidates
        candidates.sort(key=lambda x: x[1], reverse=True)
        beams = candidates[:beam_width]

        # Check if all beams have ended
        if all(seq[-1] == end_token for seq, _ in beams):
            break

    best_seq, _ = max(beams, key=lambda x: x[1])
    return best_seq



In [41]:
max_len = 128
enc_qus = [  282,   892,   175,    19, 16058,    54,  7288,     5,     4,  3602,
            3,     0, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100,
        58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100,
        58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100,
        58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100,
        58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100,
        58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100,
        58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100,
        58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100,
        58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100,
        58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100,
        58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100,
        58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100]
question = torch.LongTensor(enc_qus).to(device).unsqueeze(0)
question_mask = (question!=58100).to(device).unsqueeze(1).unsqueeze(1)
sentence = beam_search(transformer, question, question_mask, int(max_len), dict)
print(sentence)

[58101, 38841, 7858, 28207, 48521, 5163, 7422, 48126, 52802, 51823, 48159, 36367, 19682, 25825, 18527, 4851, 51726, 46066, 141, 35711, 52357, 18822, 54645, 26260, 36463, 25933, 41653, 2056, 296, 44171, 45069, 22527, 19578, 26183, 711, 2724, 1884, 35284, 48126, 15398, 40737, 19682, 25825, 14295, 141, 35711, 52357, 14263, 44054, 53184, 48159, 12456, 31693, 18020, 47823, 48433, 27052, 32682, 36504, 42070, 35261, 48063, 42124, 1777, 53783, 1693, 26183, 711, 2724, 32425, 31765, 49900, 1884, 35284, 48126, 15398, 40737, 19682, 25825, 14295, 141, 35711, 52357, 18822, 14080, 46013, 8900, 33172, 9990, 14295, 141, 35711, 52357, 18822, 14080, 46013, 8900, 33172, 9990, 14295, 141, 35711, 52357, 18822, 14080, 46013, 8900, 33172, 9990, 14295, 141, 35711, 52357, 23123, 27392, 22994, 50631, 25825, 14295, 141, 35711, 52357, 23123, 27392, 22994, 50631, 45864, 32778]


In [62]:
question_mask

tensor([[[[ True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
            True,  True, False, False, False, False, False, False, False, False,
           False, False, False, False, False, False, False, False, False, False,
           False, False, False, False, False, False, False, False, False, False,
           False, False, False, False, False, False, False, False, False, False,
           False, False, False, False, False, False, False, False, False, False,
           False, False, False, False, False, False, False, False, False, False,
           False, False, False, False, False, False, False, False, False, False,
           False, False, False, False, False, False, False, False, False, False,
           False, False, False, False, False, False, False, False, False, False,
           False, False, False, False, False, False, False, False, False, False,
           False, False, False, False, False, False, False, False, False, False,
           False, False, Fal

In [None]:
sentence

''

# 끝

In [None]:
feg = []
for ww in A.detach().clone().cpu().numpy():
    feg_small = []
    for www in ww:
        if www == 0:
            break
        else:
            feg_small.append(bb[www])
        
    feg.append(feg_small)

In [None]:
feg = []
for ww in A.detach().clone().cpu().numpy():
    feg_small = []
    for www in ww:
        if www == 0:
            break
        else:
            feg_small.append(bb[www])
        
    feg.append(feg_small)

In [None]:
torch.argmax(torch.exp(out.detach().clone()),dim = -1).shape

torch.Size([64, 128])

In [39]:
fff = [tokenizer.tokenize(i) for i in dataset3['test']['de']]

In [53]:
i == j

True

In [49]:
from nltk.translate.bleu_score import corpus_bleu

scores = []
for i,j in tqdm(zip(feg, fff)):
    score1 = corpus_bleu(i, j)
    scores.append(score1)


The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
960it [00:01, 658.74it/s]
