In [1]:
import torch.nn as nn
import torch
import torch.nn.functional as F
import numpy as np
import pandas as pd
import nltk
import matplotlib.pyplot as plt

from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from torch.utils.data import TensorDataset, DataLoader, Subset
from tensorflow.keras.utils import pad_sequences

from torch.utils.data import TensorDataset, DataLoader, Subset
from tqdm import tqdm

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


tqdm.pandas()

# 데이터 Load

In [2]:
from datasets import load_dataset

dataset = load_dataset("bentrevett/multi30k")

In [3]:
from transformers import AutoModel, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-en-de")



In [4]:
dataset2 = dataset.map(lambda e: tokenizer(e['en'], padding= False), batched=True)
dataset3 = dataset.map(lambda e: tokenizer(e['de'], padding= False), batched=True)

In [5]:
58101 # 이를 start token으로 지정.a

58101

In [6]:
# np.unique([len(i) for i in dataset2['train']['input_ids']]), np.unique([len(i) for i in dataset3['train']['input_ids']])

In [7]:
def pad_sequences_en(sequences):
    max_len = 128
    padded_sequences = []
    for sequence in sequences:
        if len(sequence) >= max_len:
            padded_sequence = sequence[:max_len]  # 최대 길이까지 잘라냄
        else:
            padded_sequence = sequence + [58100] * (max_len - len(sequence))  # 패딩 추가
        padded_sequences.append(padded_sequence)
    return padded_sequences

In [8]:
def pad_sequences_de(sequences):
    max_len = 128
    padded_sequences = []
    for sequence in sequences:
        if len(sequence) >= max_len:
            padded_sequence = [58101] + sequence[:max_len-1]  # 최대 길이까지 잘라냄
        else:
            padded_sequence = [58101] + sequence + [58100] * (max_len - len(sequence) - 1)  # 패딩 추가
        padded_sequences.append(padded_sequence)
    return padded_sequences

In [9]:
padded_batch = pad_sequences(dataset2['train']['input_ids'])

In [10]:
train_en, train_ge = torch.tensor(pad_sequences_en(dataset2['train']['input_ids'])), torch.tensor(pad_sequences_de(dataset3['train']['input_ids']))
valid_en, valid_ge = torch.tensor(pad_sequences_en(dataset2['validation']['input_ids'])), torch.tensor(pad_sequences_de(dataset3['validation']['input_ids']))
test_en, test_ge = torch.tensor(pad_sequences_en(dataset2['test']['input_ids'])), torch.tensor(pad_sequences_de(dataset3['test']['input_ids']))

In [11]:
train_ge[0]

tensor([58101,   589,  7554,  7861, 22012,    95,   197,  5192,    18, 17694,
           34,   731,  7199,    82,    49,  4407,    15,     5,     9,   394,
         1258,  1578,  5154,   526,    45, 14243,  3351,     3,     0, 58100,
        58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100,
        58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100,
        58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100,
        58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100,
        58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100,
        58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100,
        58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100,
        58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100,
        58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100,
        58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100])

In [12]:
set([len(i) for i in train_en]), set([len(i) for i in train_ge])

({128}, {128})

In [13]:
class Dataset(Dataset):
    
    def __init__(self, inputs, output):

        self.inputs = inputs
        self.output = output

    def __getitem__(self, idx):

        inputs = self.inputs[idx]
        output = self.output[idx]
        
        return inputs, output

    def __len__(self):
        return len(self.output)

In [14]:
train_dataset = Dataset(train_en, train_ge)
valid_dataset = Dataset(valid_en, valid_ge)
test_dataset = Dataset(test_en, test_ge)

In [15]:



train_loader = DataLoader(train_dataset, batch_size = 64, shuffle = False, drop_last = True)
valid_loader = DataLoader(valid_dataset, batch_size = 64, shuffle = False, drop_last = True)
test_loader = DataLoader(test_dataset, batch_size = 64, shuffle = False, drop_last = True)

In [16]:
len(valid_loader)

15

# Model

In [17]:
import math

class Embeddings(nn.Module):


    def __init__(self, vocab_size, d_model, max_len = 128):
        super(Embeddings, self).__init__()
        self.d_model = d_model
        self.dropout = nn.Dropout(0.1)
        self.embed = nn.Embedding(vocab_size + 1, d_model)
        self.pe = self.create_positinal_encoding(max_len, self.d_model)
        self.dropout = nn.Dropout(0.1)

    def create_positinal_encoding(self, max_len, d_model):
        pe = torch.zeros(max_len, d_model).to(device)
        for pos in range(max_len):  
            for i in range(0, d_model, 2):  
                pe[pos, i] = math.sin(pos / (10000 ** ((2 * i)/d_model)))
                pe[pos, i + 1] = math.cos(pos / (10000 ** ((2 * (i + 1))/d_model)))
        pe = pe.unsqueeze(0)  
        return pe

    def forward(self, encoded_words):
        
        embedding = self.embed(encoded_words) * torch.sqrt(torch.tensor(self.d_model)).to(device)
        embedding += self.pe[:, :embedding.size(1)]   
        embedding = self.dropout(embedding)
        return embedding

In [18]:
# class Attention(nn.Module):
    
#     def __init__(self, embedding_size = 512):
        
#         self.data = data
#         self.embedding_size= embedding_size
#         self.weight_Q = nn.Linear(embedding_size, embedding_size)
#         self.weight_K = nn.Linear(embedding_size, embedding_size)
#         self.weight_V = nn.Linear(embedding_size, embedding_size)
#         self.softmax = nn.Softmax()
        
#     def forward(self, data):
        
#         Q = self.weight_Q(data)
#         K = self.weight_K(data)
#         V = self.weight_V(data)
#         score = torch.matmul(Q,K.T) / torch.sqrt(self.embedding_size)
#         value = self.softmax(score) * V
#         return value
         

In [19]:
def create_masks(inputs, outputs_input, outputs_target):
    
    def subsequent_mask(size):
        mask = torch.triu(torch.ones(size, size)).transpose(0, 1).type(dtype = torch.uint8)
        return mask.unsqueeze(0) # 상삼각행렬 생성 -> 행과 열을 뒤 바꾸어 하삼각행렬로 바꿈. (밑에가 다 0)
    
    inputs_mask = inputs != 58100
    inputs_mask = inputs_mask.to(device)
    inputs_mask = inputs_mask.unsqueeze(1).unsqueeze(1) # 각  input에 대해서 상삼각행렬에 대응하도록 설정.
    
    outputs_input_mask = outputs_input != 58100
    outputs_input_mask = outputs_input_mask.unsqueeze(1) 
    outputs_input_mask = outputs_input_mask & subsequent_mask(outputs_input.size(-1)).type_as(outputs_input_mask.data)
    outputs_input_mask = outputs_input_mask.unsqueeze(1)
    # masking을 해줌으로서, 

    
    outputs_target_mask = outputs_target != 58100
    
    return inputs_mask, outputs_input_mask, outputs_target_mask
        
        

In [20]:
train_en

tensor([[ 4386,  1296,     2,  ..., 58100, 58100, 58100],
        [15036,  1135,     5,  ..., 58100, 58100, 58100],
        [   93,   839,  4040,  ..., 58100, 58100, 58100],
        ...,
        [ 4386,  8722,  2013,  ..., 58100, 58100, 58100],
        [  282, 17525,   175,  ..., 58100, 58100, 58100],
        [   93,   175,     5,  ..., 58100, 58100, 58100]])

In [21]:
class MultiHeadAttention(nn.Module):
    
    def __init__(self, heads, d_model):
        super(MultiHeadAttention, self).__init__()
        self.d_k = d_model // heads
        self.heads = heads
        self.dropout = nn.Dropout(0.1)
        self.query = nn.Linear(d_model, d_model)
        self.key  = nn.Linear(d_model, d_model)
        self.value = nn.Linear(d_model, d_model)
        self.concat = nn.Linear(d_model, d_model)
        
    def forward(self, query, key, value, mask):
        
        query = self.query(query)
        key = self.key(key)
        value = self.value(value)
        
        query = query.view(query.shape[0], -1, self.heads, self.d_k).permute(0, 2, 1, 3)
        key = key.view(key.shape[0], -1, self.heads, self.d_k).permute(0, 2, 1, 3)
        value = value.view(value.shape[0], -1, self.heads, self.d_k).permute(0, 2, 1, 3)
        

        scores = torch.matmul(query, key.permute(0 ,1 ,3, 2)) / math.sqrt(query.size(-1))
        
        # print(query.shape, key.shape, value.shape, mask.shape, scores.shape)

        
        scores = scores.masked_fill(mask == 0, -1e9) # masking 된 것에 매우 작은 수 부여 -> softmax 계산시 -inf 로 계산되어짐.
        weights = F.softmax(scores, dim = -1) # attention score 계산
        context = torch.matmul(weights, value)  # attention value 계산
        context = context.permute(0,2,1,3).contiguous().view(context.shape[0], -1, self.heads * self.d_k)
        
        interacted = self.concat(context)
        
        return interacted
        
        

In [22]:
class FeedForward(nn.Module):
    
    def __init__(self, d_model, middle_dim = 2048):
        super(FeedForward, self).__init__()
        
        self.fc1 = nn.Linear(d_model, middle_dim)
        self.fc2 = nn.Linear(middle_dim, d_model)
        self.dropout = nn.Dropout(0.1)
        
    def forward(self,x):
        out = F.relu(self.fc1(x))
        out = self.fc2(self.dropout(out))
        return out        

In [23]:
class EncoderLayer(nn.Module):
    
    def __init__(self, d_model, heads):
        super(EncoderLayer, self).__init__()
        self.layernorm = nn.LayerNorm(d_model)
        self.self_multihead = MultiHeadAttention(heads, d_model)
        self.feed_forward = FeedForward(d_model)
        self.dropout = nn.Dropout(0.1)
        
    def forward(self, embeddings, mask):
        interacted = self.dropout(self.self_multihead(embeddings, embeddings, embeddings, mask))
        interacted = self.layernorm(interacted + embeddings)
        feed_forward_out = self.dropout(self.feed_forward(interacted))
        encoded = self.layernorm(feed_forward_out + interacted)
        return encoded
        

In [24]:
class DecoderLayer(nn.Module):
    
    def __init__(self, d_model, heads):
        super(DecoderLayer, self).__init__()
        self.layernorm = nn.LayerNorm(d_model)
        self.self_multihead = MultiHeadAttention(heads, d_model)
        self.src_multihead = MultiHeadAttention(heads, d_model)
        self.feed_forward = FeedForward(d_model)
        self.dropout = nn.Dropout(0.1)
        
    def forward(self, embeddings, encoded, src_mask, target_mask):
        query = self.dropout(self.self_multihead(embeddings, embeddings, embeddings, target_mask))
        query = self.layernorm(query + embeddings)
        interacted = self.dropout(self.src_multihead(query, encoded, encoded, src_mask))
        interacted = self.layernorm(interacted + query)
        feed_forward_out = self.dropout(self.feed_forward(interacted))
        decoded = self.layernorm(feed_forward_out + interacted)
        return decoded

In [25]:
class P_Transformer(nn.Module):
    
    def __init__(self, d_model, heads, num_layers, vocab_size):
        super(Transformer, self).__init__()
        
        self.d_model = d_model
        self.vocab_size = vocab_size + 1
        self.embed = Embeddings(self.vocab_size, d_model)
        self.encoder = nn.ModuleList([EncoderLayer(d_model, heads) for _ in range(num_layers)])
        self.decoder = nn.ModuleList([DecoderLayer(d_model, heads) for _ in range(num_layers)])
        self.logit = nn.Linear(d_model, self.vocab_size)
        
    def encode(self, src_words, src_mask):
        src_embeddings = self.embed(src_words)
        encoded_layers = []
        for layer in self.encoder:
            src_embeddings = layer(src_embeddings, src_mask)
            encoded_layers.append(src_embeddings)
            
        return encoded_layers

    def decode(self, target_words, target_mask, src_embeddings, src_mask):
        tgt_embeddings = self.embed(target_words)
        for i, layer in enumerate(self.decoder):
            tgt_embeddings = layer(tgt_embeddings, src_embeddings[i], src_mask, target_mask)
            decoded_layers.append(tgt_embeddings)
        return tgt_embeddings

    def forward(self, src_words, src_mask, target_words, target_mask):
        encoded_layers = self.encode(src_words, src_mask)
        decoded = self.decode(target_words, target_mask, encoded_layers, src_mask)
        out = F.log_softmax(self.logit(decoded), dim = 2)
        return out

In [26]:
class Transformer(nn.Module):
    
    def __init__(self, d_model, heads, num_layers, vocab_size):
        super(Transformer, self).__init__()
        
        self.d_model = d_model
        self.vocab_size = vocab_size + 1
        self.embed = Embeddings(self.vocab_size, d_model)
        self.encoder = nn.ModuleList([EncoderLayer(d_model, heads) for _ in range(num_layers)])
        self.decoder = nn.ModuleList([DecoderLayer(d_model, heads) for _ in range(num_layers)])
        self.logit = nn.Linear(d_model, self.vocab_size)
        
    def encode(self, src_words, src_mask):
        src_embeddings = self.embed(src_words)
        
        for layer in self.encoder:
            src_embeddings = layer(src_embeddings, src_mask)
            
        return src_embeddings

    def decode(self, target_words, target_mask, src_embeddings, src_mask):
        tgt_embeddings = self.embed(target_words)
        for layer in self.decoder:
            tgt_embeddings = layer(tgt_embeddings, src_embeddings, src_mask, target_mask)
        return tgt_embeddings

    def forward(self, src_words, src_mask, target_words, target_mask):
        encoded = self.encode(src_words, src_mask)
        decoded = self.decode(target_words, target_mask, encoded, src_mask)
        out = F.log_softmax(self.logit(decoded), dim = 2)
        return out
            

In [27]:
class AdamWarmup:
    
    def __init__(self, model_size, warmup_steps, optimizer):
        
        self.model_size = model_size
        self.warmup_steps = warmup_steps
        self.optimizer = optimizer
        self.current_step = 0
        self.lr = 0
    
    def get_lr(self):
        return self.model_size ** (-0.5) * min(self.current_step ** (-0.5), self.current_step * self.warmup_steps ** (-1.5))
    
    def step(self):
        
        self.current_step += 1
        lr = self.get_lr()
        for param_group in self.optimizer.param_groups:
            param_group['lr'] = lr
            
        self.lr = lr
        self.optimizer.step()

In [28]:
import torch.nn.functional as F

class LossWithLS(nn.Module):
    
    def __init__(self, size, smooth):
        super(LossWithLS, self).__init__()
        self.smooth = smooth
        self.size = size

    def forward(self, prediction, target, mask):
        prediction = prediction.view(-1, prediction.size(-1))   # (batch_size * max_words, vocab_size)
        target = target.contiguous().view(-1)   # (batch_size * max_words)
        mask = mask.float()
        mask = mask.view(-1)       # (batch_size * max_words)
        
        # Smoothed one-hot labels
        labels = torch.full_like(prediction, self.smooth / (self.size - 1))
        labels.scatter_(1, target.unsqueeze(1), 1 - self.smooth)
        
        # Apply mask
        masked_prediction = prediction * mask.unsqueeze(1)
        masked_labels = labels * mask.unsqueeze(1)

        
        # Calculate negative log likelihood loss
        loss = F.nll_loss(masked_prediction, target, reduction='none')
        loss *= mask
        
        # Normalize the loss
        loss = loss.sum() / mask.sum()
        
        return loss

# Training

In [29]:
128 / 8

16.0

In [30]:
d_model = 64
heads = 8
num_layers = 6
num_layers = 6
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
epochs = 10
vocab_len = len(tokenizer.get_vocab())

transformer = Transformer(d_model = d_model , heads = heads, num_layers = num_layers, vocab_size = vocab_len)
transformer = transformer.to(device)
adam_optimizer = torch.optim.Adam(transformer.parameters())
transformer_optimizer = AdamWarmup(model_size = d_model, warmup_steps = 4000, optimizer = adam_optimizer)
criterion = LossWithLS(vocab_len, 0.1)

In [31]:
next(iter(train_loader.dataset))[1].shape

torch.Size([128])

In [32]:


import torch
import torch.nn.functional as F
import random

def generate_next_words(transformer, sequence, encoded, question_mask, beam_width):
    
    start_token = 58101
    
    if not sequence:
        return [(start_token, 0)]
    
    size = len(sequence)
    target_mask = torch.triu(torch.ones(size, size)).transpose(0, 1).type(dtype=torch.uint8)
    target_mask = target_mask.to(device).unsqueeze(0).unsqueeze(0)

    words = torch.LongTensor([sequence]).to(device)

    decoded = transformer.decode(words, target_mask, encoded, question_mask)
    predictions = transformer.logit(decoded[:, -1])

    # Apply log_softmax
    log_probs = F.log_softmax(predictions, dim = -1)

    # Get top-k words
    top_k_probs, top_k_indices = torch.topk(log_probs, beam_width, dim=-1)
    top_k_probs = top_k_probs.squeeze().tolist()
    top_k_indices = top_k_indices.squeeze().tolist()

    next_words = [(word, prob) for word, prob in zip(top_k_indices, top_k_probs)]
    return next_words


def beam_search(transformer, question, question_mask, max_len, dict, beam_width=5):
    transformer.eval()
    start_token = 58101
    end_token = 0

    encoded = transformer.encode(question, question_mask)

    beams = [([], 0)] 

    for step in range(max_len):
        candidates = []

        for seq, score in beams:
            if seq and seq[-1] == end_token:
                candidates.append((seq, score))
                continue
            
            next_words = generate_next_words(transformer, seq, encoded, question_mask, beam_width)

            for word, log_prob in next_words:
                candidates.append((seq + [word], score + log_prob))

        # Select top-k candidates
        candidates.sort(key=lambda x: x[1], reverse=True)
        beams = candidates[:beam_width]

        # Check if all beams have ended
        if all(seq[-1] == end_token for seq, _ in beams):
            break

    best_seq, _ = max(beams, key=lambda x: x[1])
    return best_seq

In [32]:
import wandb


wandb.init(
    # set the wandb project where this run will be logged
    project="Transformer_Hard_Coding",

    # track hyperparameters and run metadata
    config={
    "loss": 'KLDivergence',
    "architecture": "Transformer",
    "optimizer" : 'AdamWarmup',
    'layer' : 6,
    'heads' : 8,
    'd_model' : 64   
    }
)

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33myoon303b[0m ([33mku_software[0m). Use [1m`wandb login --relogin`[0m to force relogin


In [34]:
bb = {v:k for k,v in tokenizer.get_vocab().items()} 
bb[58101] = '<start>'

In [35]:
def train(train_loader, transformer, criterion, epoch):
    
    transformer.train()
    sum_loss, valid_sum_loss = 0, 0
    count, valid_count = 0, 0
    
    for i, (inputs, output) in enumerate(train_loader):
        
        samples = inputs.shape[0]
        
        inputs = inputs.to(device)
        output = output.to(device)
        
        output_in = output[:,:-1]
        output_target = output[:,1:]
        inputs_mask, output_in_mask, output_target_mask = create_masks(inputs, output_in, output_target)
        
        out = transformer(inputs, inputs_mask, output_in, output_in_mask)
        
        loss = criterion(out, output_target, output_target_mask)
        transformer_optimizer.optimizer.zero_grad()
        loss.backward()
        transformer_optimizer.step()
        
        sum_loss += loss.item() * samples
        count += samples
        
        if i % 5 == 0:
            print("Epoch [{}][{}/{}]\tTrain Loss: {:.3f}".format(epoch, i, len(train_loader), sum_loss/count))
            wandb.log({"Training loss" : sum_loss/count})
    
    transformer.eval()
    with torch.no_grad():
        for i, (inputs, output) in enumerate(valid_loader):
            
            samples = inputs.shape[0]
            
            inputs = inputs.to(device)
            output = output.to(device)
            
            output_in = output[:,:-1]
            output_target = output[:,1:]
            
            inputs_mask, output_in_mask, output_target_mask = create_masks(inputs, output_in, output_target)
            out = transformer(inputs, inputs_mask, output_in, output_in_mask)
            
            loss = criterion(out, output_target, output_target_mask)
            
            valid_sum_loss += loss.item() * samples
            valid_count += samples
        
            if i % 5   == 0:
                print("Epoch [{}][{}/{}]\t\t\tValid Loss: {:.3f}".format(epoch, i, len(valid_loader), valid_sum_loss/valid_count))
                wandb.log({"Validation loss" :  valid_sum_loss/valid_count })
                

    max_len = 128
    A = random.randint(1, 64)
    B = next(iter(test_loader))
    
    enc_qus = B[0][A]
    real_qus = B[1][A]
    question = torch.LongTensor(enc_qus).to(device).unsqueeze(0)
    question_mask = (question!=58100).to(device).unsqueeze(1).unsqueeze(1)
    sentence = beam_search(transformer, question, question_mask, int(max_len), dict)
    
    candidate = [bb[i] for i in sentence if (i != 0) and (i != 58100) and (i != 58101)]
    real = [bb[i.tolist()] for i in real_qus if (i.tolist() != 0) and (i.tolist() != 58100)  and (i != 58101)]
    print()
    print('Candidate :'  + ' '.join(candidate))
    print('Candidate_labeled : ', [bb[i] for i in sentence])
    print('Real :' + ' '.join(real))


In [36]:
torch.Tensor((64, 128, 128)) * math.sqrt(128)

tensor([ 724.0773, 1448.1547, 1448.1547])

In [44]:
train_dataset = Dataset(train_en, train_ge)
valid_dataset = Dataset(valid_en, valid_ge)
test_dataset = Dataset(test_en, test_ge)


train_loader = DataLoader(train_dataset, batch_size = 64, shuffle = True, drop_last = True)
valid_loader = DataLoader(valid_dataset, batch_size = 64, shuffle = True, drop_last = True)
test_loader = DataLoader(test_dataset, batch_size = 64, shuffle = False, drop_last = True)

In [45]:
len(train_en)

29000

In [46]:

random.randint(1, 15)

7

In [48]:
from datetime import datetime


for epoch in range(epochs):
        
    start_time = datetime.now() 
    train(train_loader, transformer, criterion, epoch)
    
    state = {'epoch': epoch, 'transformer': transformer, 'transformer_optimizer': transformer_optimizer}
    torch.save(state, 'checkpoint_new' + str(epoch+20) + '.pth.tar')
    
    time_elapsed = datetime.now() - start_time 
    print('#######   Time elapsed (hh:mm:ss.ms) {} #######'.format(time_elapsed))
    print()

Epoch [0][0/453]	Train Loss: 1.562
Epoch [0][5/453]	Train Loss: 1.429
Epoch [0][10/453]	Train Loss: 1.419
Epoch [0][15/453]	Train Loss: 1.412
Epoch [0][20/453]	Train Loss: 1.405
Epoch [0][25/453]	Train Loss: 1.406
Epoch [0][30/453]	Train Loss: 1.405
Epoch [0][35/453]	Train Loss: 1.408
Epoch [0][40/453]	Train Loss: 1.415
Epoch [0][45/453]	Train Loss: 1.412
Epoch [0][50/453]	Train Loss: 1.409
Epoch [0][55/453]	Train Loss: 1.410
Epoch [0][60/453]	Train Loss: 1.409
Epoch [0][65/453]	Train Loss: 1.415
Epoch [0][70/453]	Train Loss: 1.418
Epoch [0][75/453]	Train Loss: 1.420
Epoch [0][80/453]	Train Loss: 1.420
Epoch [0][85/453]	Train Loss: 1.420
Epoch [0][90/453]	Train Loss: 1.421
Epoch [0][95/453]	Train Loss: 1.420
Epoch [0][100/453]	Train Loss: 1.426
Epoch [0][105/453]	Train Loss: 1.428
Epoch [0][110/453]	Train Loss: 1.425
Epoch [0][115/453]	Train Loss: 1.427
Epoch [0][120/453]	Train Loss: 1.427
Epoch [0][125/453]	Train Loss: 1.427
Epoch [0][130/453]	Train Loss: 1.430
Epoch [0][135/453]	Trai

KeyboardInterrupt: 

# BLEU Score

In [33]:
import torch
from copy import deepcopy

# 저장된 파일을 불러올 때
checkpoint = torch.load('checkpoint_new23.pth.tar')

# 불러온 checkpoint에서 모델 상태나 다른 필요한 요소들을 추출할 수 있습니다.
transformer = deepcopy(checkpoint['transformer'])

# 모델을 evaluation 모드로 설정 (필요에 따라)


In [34]:
bb = {v:k for k,v in tokenizer.get_vocab().items()} 
bb[58101] = '<start>'

In [35]:
next(iter(test_loader))[0].shape, next(iter(test_loader))[1].shape

(torch.Size([64, 128]), torch.Size([64, 128]))

In [36]:
test_en

tensor([[   93,   175,     5,  ..., 58100, 58100, 58100],
        [   93, 13835, 21368,  ..., 58100, 58100, 58100],
        [   93,  4040,     5,  ..., 58100, 58100, 58100],
        ...,
        [ 2358,   734,    48,  ..., 58100, 58100, 58100],
        [  282,  6060,   175,  ..., 58100, 58100, 58100],
        [   93,  4040,    67,  ..., 58100, 58100, 58100]])

In [41]:
from tqdm import tqdm
import nltk.translate.bleu_score as bleu

candidate = []
real = []

max_len = 128

for a,b in zip(tqdm(test_en),test_ge):
    enc_qus = a
    real_qus = b
    question = torch.LongTensor(enc_qus).to(device).unsqueeze(0)
    
    question.to(device)
    real_qus.to(device)
    transformer.to(device)
    
    question_mask = (question!=58100).to(device).unsqueeze(1).unsqueeze(1)
    sentence = beam_search(transformer, question, question_mask, int(max_len), dict)
    
    c = [bb[i] for i in sentence if (i != 0) and (i != 58100) and (i != 58101)]
    r = [bb[i.tolist()] for i in real_qus if (i.tolist() != 0) and (i.tolist() != 58100)  and (i != 58101)]
    
    candidate.append(c)
    real.append(r)
    
    data = 0
    
    for i,j in zip(real,candidate):
        data += bleu.sentence_bleu([i],j,weights=(0.25, 0.25, 0.25, 0.25), smoothing_function=nltk.translate.bleu_score.SmoothingFunction().method1)
        
        
    if len(real) % 10  == 0:
        print('BLEU -4 score : ', data / len(real))        


  1%|          | 10/1000 [00:17<28:59,  1.76s/it]

BLEU -4 score :  0.3972977617472836


  2%|▏         | 20/1000 [00:34<27:21,  1.68s/it]

BLEU -4 score :  0.3077524478751904


  3%|▎         | 30/1000 [00:52<31:23,  1.94s/it]

BLEU -4 score :  0.3299985023676016


  4%|▍         | 40/1000 [01:08<26:52,  1.68s/it]

BLEU -4 score :  0.3424956239810124


  5%|▌         | 50/1000 [01:23<23:48,  1.50s/it]

BLEU -4 score :  0.3496635901763184


  6%|▌         | 60/1000 [01:40<27:00,  1.72s/it]

BLEU -4 score :  0.3477268563532785


  7%|▋         | 70/1000 [01:58<24:06,  1.56s/it]

BLEU -4 score :  0.3405755491416081


  8%|▊         | 80/1000 [02:15<29:22,  1.92s/it]

BLEU -4 score :  0.33876084907308224


  9%|▉         | 90/1000 [02:33<31:07,  2.05s/it]

BLEU -4 score :  0.33864606430720645


 10%|█         | 100/1000 [02:51<28:58,  1.93s/it]

BLEU -4 score :  0.3337108175544898


 11%|█         | 110/1000 [03:07<24:24,  1.65s/it]

BLEU -4 score :  0.3308155217487101


 12%|█▏        | 120/1000 [03:23<21:00,  1.43s/it]

BLEU -4 score :  0.3305355339547221


 13%|█▎        | 130/1000 [03:36<21:14,  1.46s/it]

BLEU -4 score :  0.3277446525082295


 14%|█▍        | 140/1000 [03:52<21:24,  1.49s/it]

BLEU -4 score :  0.3245753749507594


 15%|█▌        | 150/1000 [04:11<25:45,  1.82s/it]

BLEU -4 score :  0.3305214710270147


 16%|█▌        | 160/1000 [04:27<22:07,  1.58s/it]

BLEU -4 score :  0.3306757317347729


 17%|█▋        | 170/1000 [04:44<22:31,  1.63s/it]

BLEU -4 score :  0.3357128571619224


 18%|█▊        | 180/1000 [05:00<22:00,  1.61s/it]

BLEU -4 score :  0.34421669357851997


 19%|█▉        | 190/1000 [05:16<21:48,  1.62s/it]

BLEU -4 score :  0.34200811359222777


 20%|██        | 200/1000 [05:33<24:20,  1.83s/it]

BLEU -4 score :  0.3423341133461381


 21%|██        | 210/1000 [05:50<23:49,  1.81s/it]

BLEU -4 score :  0.34602861610634233


 22%|██▏       | 220/1000 [06:05<20:58,  1.61s/it]

BLEU -4 score :  0.34373244329757124


 23%|██▎       | 230/1000 [06:22<22:24,  1.75s/it]

BLEU -4 score :  0.3436402076568826


 24%|██▍       | 240/1000 [06:40<21:49,  1.72s/it]

BLEU -4 score :  0.342276682622346


 25%|██▌       | 250/1000 [06:57<20:55,  1.67s/it]

BLEU -4 score :  0.3405453475793308


 26%|██▌       | 260/1000 [07:14<19:57,  1.62s/it]

BLEU -4 score :  0.33985943517549877


 27%|██▋       | 270/1000 [07:32<27:07,  2.23s/it]

BLEU -4 score :  0.3448753942120355


 28%|██▊       | 280/1000 [07:51<21:30,  1.79s/it]

BLEU -4 score :  0.34826665586438477


 29%|██▉       | 290/1000 [08:06<17:50,  1.51s/it]

BLEU -4 score :  0.35174698554960765


 30%|███       | 300/1000 [08:21<17:30,  1.50s/it]

BLEU -4 score :  0.3527599872490309


 31%|███       | 310/1000 [08:39<22:54,  1.99s/it]

BLEU -4 score :  0.3500456506031758


 32%|███▏      | 320/1000 [08:58<17:47,  1.57s/it]

BLEU -4 score :  0.3504514059427123


 33%|███▎      | 330/1000 [09:14<16:08,  1.45s/it]

BLEU -4 score :  0.3515879039624161


 34%|███▍      | 340/1000 [09:32<16:10,  1.47s/it]

BLEU -4 score :  0.34821355501933604


 35%|███▌      | 350/1000 [09:52<21:50,  2.02s/it]

BLEU -4 score :  0.3478043613985935


 36%|███▌      | 360/1000 [10:10<19:13,  1.80s/it]

BLEU -4 score :  0.3486124243884541


 37%|███▋      | 370/1000 [10:28<20:32,  1.96s/it]

BLEU -4 score :  0.3494201129986728


 38%|███▊      | 380/1000 [10:48<24:21,  2.36s/it]

BLEU -4 score :  0.34647476148148976


 39%|███▉      | 390/1000 [11:06<17:58,  1.77s/it]

BLEU -4 score :  0.3435768600019465


 40%|████      | 400/1000 [11:23<14:07,  1.41s/it]

BLEU -4 score :  0.345605544774616


 41%|████      | 410/1000 [11:40<17:33,  1.79s/it]

BLEU -4 score :  0.3445902648008178


 42%|████▏     | 420/1000 [11:57<15:51,  1.64s/it]

BLEU -4 score :  0.34642679297150414


 43%|████▎     | 430/1000 [12:17<20:18,  2.14s/it]

BLEU -4 score :  0.34513623321652354


 44%|████▍     | 440/1000 [12:36<18:32,  1.99s/it]

BLEU -4 score :  0.3477885684117134


 45%|████▌     | 450/1000 [12:53<17:24,  1.90s/it]

BLEU -4 score :  0.3502066429228246


 46%|████▌     | 460/1000 [13:10<16:44,  1.86s/it]

BLEU -4 score :  0.3525536428189053


 47%|████▋     | 470/1000 [13:26<13:22,  1.51s/it]

BLEU -4 score :  0.35071318476964775


 48%|████▊     | 480/1000 [13:43<15:26,  1.78s/it]

BLEU -4 score :  0.3517505230577197


 49%|████▉     | 490/1000 [13:59<13:15,  1.56s/it]

BLEU -4 score :  0.35112426694410614


 50%|█████     | 500/1000 [14:16<15:03,  1.81s/it]

BLEU -4 score :  0.3518546159160246


 51%|█████     | 510/1000 [14:33<18:41,  2.29s/it]

BLEU -4 score :  0.3506893775773984


 52%|█████▏    | 520/1000 [14:47<11:39,  1.46s/it]

BLEU -4 score :  0.3509191273373832


 53%|█████▎    | 530/1000 [15:04<13:38,  1.74s/it]

BLEU -4 score :  0.352084461438211


 54%|█████▍    | 540/1000 [15:25<18:57,  2.47s/it]

BLEU -4 score :  0.35038608868336557


 55%|█████▌    | 550/1000 [15:42<12:26,  1.66s/it]

BLEU -4 score :  0.3509250279815551


 56%|█████▌    | 560/1000 [15:58<12:02,  1.64s/it]

BLEU -4 score :  0.3529906837422662


 57%|█████▋    | 570/1000 [16:18<14:08,  1.97s/it]

BLEU -4 score :  0.35220636413205486


 58%|█████▊    | 580/1000 [16:34<12:13,  1.75s/it]

BLEU -4 score :  0.3522836038643244


 59%|█████▉    | 590/1000 [16:51<11:48,  1.73s/it]

BLEU -4 score :  0.35131230095675975


 60%|██████    | 600/1000 [17:08<12:05,  1.81s/it]

BLEU -4 score :  0.3531890215207289


 61%|██████    | 610/1000 [17:27<11:07,  1.71s/it]

BLEU -4 score :  0.35331303289960364


 62%|██████▏   | 620/1000 [17:46<12:40,  2.00s/it]

BLEU -4 score :  0.3532256775638248


 63%|██████▎   | 630/1000 [18:06<12:56,  2.10s/it]

BLEU -4 score :  0.3516632590952345


 64%|██████▍   | 640/1000 [18:24<10:52,  1.81s/it]

BLEU -4 score :  0.35158769954709


 65%|██████▌   | 650/1000 [18:44<11:02,  1.89s/it]

BLEU -4 score :  0.3500273878924654


 66%|██████▌   | 660/1000 [19:03<09:55,  1.75s/it]

BLEU -4 score :  0.34964479122656833


 67%|██████▋   | 670/1000 [19:20<10:24,  1.89s/it]

BLEU -4 score :  0.3515148117366053


 68%|██████▊   | 680/1000 [19:40<11:33,  2.17s/it]

BLEU -4 score :  0.35027415702698567


 69%|██████▉   | 690/1000 [20:01<09:19,  1.81s/it]

BLEU -4 score :  0.34895182275511283


 70%|███████   | 700/1000 [20:20<10:48,  2.16s/it]

BLEU -4 score :  0.34756092096335395


 71%|███████   | 710/1000 [20:38<09:13,  1.91s/it]

BLEU -4 score :  0.34747778050569267


 72%|███████▏  | 720/1000 [20:55<06:34,  1.41s/it]

BLEU -4 score :  0.34841461726939355


 73%|███████▎  | 730/1000 [21:12<08:37,  1.92s/it]

BLEU -4 score :  0.34877053358750065


 74%|███████▍  | 740/1000 [21:29<07:40,  1.77s/it]

BLEU -4 score :  0.346996876064863


 75%|███████▌  | 750/1000 [21:49<08:46,  2.11s/it]

BLEU -4 score :  0.3473672361284632


 76%|███████▌  | 760/1000 [22:06<06:53,  1.72s/it]

BLEU -4 score :  0.3479083718594225


 77%|███████▋  | 770/1000 [22:25<08:17,  2.16s/it]

BLEU -4 score :  0.34630307797342724


 78%|███████▊  | 780/1000 [22:42<06:19,  1.73s/it]

BLEU -4 score :  0.3487236668965047


 79%|███████▉  | 790/1000 [23:04<07:51,  2.25s/it]

BLEU -4 score :  0.34769799386475186


 80%|████████  | 800/1000 [23:21<05:40,  1.70s/it]

BLEU -4 score :  0.34781251199225954


 81%|████████  | 810/1000 [23:39<05:29,  1.73s/it]

BLEU -4 score :  0.3473315547589149


 82%|████████▏ | 820/1000 [23:59<05:16,  1.76s/it]

BLEU -4 score :  0.3484494180454765


 83%|████████▎ | 830/1000 [24:20<07:01,  2.48s/it]

BLEU -4 score :  0.3485686720539806


 84%|████████▍ | 840/1000 [24:40<05:58,  2.24s/it]

BLEU -4 score :  0.3487054756269906


 85%|████████▌ | 850/1000 [24:59<05:37,  2.25s/it]

BLEU -4 score :  0.34934174126064865


 86%|████████▌ | 860/1000 [25:14<03:26,  1.47s/it]

BLEU -4 score :  0.3515561674974034


 87%|████████▋ | 870/1000 [25:33<04:32,  2.09s/it]

BLEU -4 score :  0.3518512128277992


 88%|████████▊ | 880/1000 [25:53<03:49,  1.91s/it]

BLEU -4 score :  0.3511139052801695


 89%|████████▉ | 890/1000 [26:19<04:38,  2.53s/it]

BLEU -4 score :  0.35147962438347685


 90%|█████████ | 900/1000 [26:40<03:26,  2.07s/it]

BLEU -4 score :  0.3505446968151889


 91%|█████████ | 910/1000 [27:01<03:48,  2.54s/it]

BLEU -4 score :  0.34954335518693097


 92%|█████████▏| 920/1000 [27:20<02:55,  2.19s/it]

BLEU -4 score :  0.34958911891527567


 93%|█████████▎| 930/1000 [27:42<02:41,  2.31s/it]

BLEU -4 score :  0.34842576993128316


 94%|█████████▍| 940/1000 [28:00<01:47,  1.79s/it]

BLEU -4 score :  0.34679061993870525


 95%|█████████▌| 950/1000 [28:17<01:21,  1.64s/it]

BLEU -4 score :  0.34789597239182035


 96%|█████████▌| 960/1000 [28:40<01:52,  2.81s/it]

BLEU -4 score :  0.3475617186309749


 97%|█████████▋| 970/1000 [29:01<01:11,  2.39s/it]

BLEU -4 score :  0.3472591574948545


 98%|█████████▊| 980/1000 [29:23<00:51,  2.57s/it]

BLEU -4 score :  0.3458723620371342


 99%|█████████▉| 990/1000 [29:45<00:22,  2.21s/it]

BLEU -4 score :  0.34540253252336567


100%|██████████| 1000/1000 [30:04<00:00,  1.80s/it]

BLEU -4 score :  0.34443188787549256





In [42]:
print('Final BLEU -4 score : ', data / len(real))   

Final BLEU -4 score :  0.34443188787549256


# 끝