In [4]:
#  pip freeze > '/kaggle/working/requirement.txt'


Note: you may need to restart the kernel to use updated packages.


# install required dependencies

In [1]:
pip install tqdm spacy 

[33mDEPRECATION: omegaconf 2.0.6 has a non-standard dependency specifier PyYAML>=5.1.*. pip 24.1 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of omegaconf or contact the author to suggest that they release a version with a conforming dependency specifiers. Discussion can be found at https://github.com/pypa/pip/issues/12063[0m[33m
[0mNote: you may need to restart the kernel to use updated packages.


In [2]:
pip install indic-nlp-library

[33mDEPRECATION: omegaconf 2.0.6 has a non-standard dependency specifier PyYAML>=5.1.*. pip 24.1 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of omegaconf or contact the author to suggest that they release a version with a conforming dependency specifiers. Discussion can be found at https://github.com/pypa/pip/issues/12063[0m[33m
[0mNote: you may need to restart the kernel to use updated packages.


# IMPORT REQUIRED LIBRARIES

In [1]:
import math
import random
import time
from torch import nn, optim
from torch.nn import functional as F
from torch.optim import Adam
import torch
from tqdm import tqdm 
import spacy
import re
import os
from IPython.display import clear_output, display
from indicnlp.tokenize import indic_tokenize
import matplotlib.pyplot as plt
import numpy as np
from collections import Counter
from torch.utils.data import Dataset,DataLoader
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if torch.cuda.is_available():
    print("Runnign on GPU")
else:
    print("No GPU")


No GPU


# EMBEDDING PART

In [2]:
class TokenEmbedding(nn.Embedding):
    """
    Token Embedding using torch.nn
    they will dense representation of word using weighted matrix
    """

    def __init__(self, vocab_size, d_model):
        """
        class for token embedding that included positional information

        :param vocab_size: size of vocabulary
        :param d_model: dimensions of model
        """
        super().__init__(vocab_size, d_model, padding_idx=1)

In [3]:
class PositionalEncoding(nn.Module):
    """
    compute sinusoid encoding.
    """

    def __init__(self, d_model, max_len, device):
        """
        constructor of sinusoid encoding class

        :param d_model: dimension of model
        :param max_len: max sequence length
        :param device: hardware device setting
        """
        super(PositionalEncoding, self).__init__()

        # same size with input matrix (for adding with input matrix)
        self.encoding = torch.zeros(max_len, d_model, device=device)
        self.encoding.requires_grad = False  # we don't need to compute gradient

        pos = torch.arange(0, max_len, device=device)
        pos = pos.float().unsqueeze(dim=1)
        # 1D => 2D unsqueeze to represent word's position

        _2i = torch.arange(0, d_model, step=2, device=device).float()
        # 'i' means index of d_model (e.g. embedding size = 50, 'i' = [0,50])
        # "step=2" means 'i' multiplied with two (same with 2 * i)

        self.encoding[:, 0::2] = torch.sin(pos / (10000 ** (_2i / d_model)))
        self.encoding[:, 1::2] = torch.cos(pos / (10000 ** (_2i / d_model)))
        # compute positional encoding to consider positional information of words

    def forward(self, x):
        # self.encoding
        # [max_len = 512, d_model = 512]

        batch_size, seq_len = x.size()
        # [batch_size = 128, seq_len = 30]

        return self.encoding[:seq_len, :]
        # [seq_len = 30, d_model = 512]
        # it will add with tok_emb : [128, 30, 512]


In [4]:
class TransformerEmbedding(nn.Module):
    """
    token embedding + positional encoding (sinusoid)
    positional encoding can give positional information to network
    """

    def __init__(self, vocab_size, d_model, max_len, drop_prob, device):
        """
        class for word embedding that included positional information

        :param vocab_size: size of vocabulary
        :param d_model: dimensions of model
        """
        
        super(TransformerEmbedding, self).__init__()
        self.tok_emb = TokenEmbedding(vocab_size, d_model)
        self.pos_emb = PositionalEncoding(d_model, max_len, device)
        self.drop_out = nn.Dropout(p=drop_prob)
        self.device=device
        

    def forward(self, x):
        x = x.long().to(self.device)
        tok_emb = self.tok_emb(x)
        pos_emb = self.pos_emb(x)
        return self.drop_out(tok_emb + pos_emb)

# MODEL BLOCKS

In [5]:
class Encoder(nn.Module):

    def __init__(self, enc_voc_size, max_len, d_model, ffn_hidden, n_head, n_layers, drop_prob, device):
        super().__init__()
        self.emb = TransformerEmbedding(d_model=d_model,
                                        max_len=max_len,
                                        vocab_size=enc_voc_size,
                                        drop_prob=drop_prob,
                                        device=device)

        self.layers = nn.ModuleList([EncoderLayer(d_model=d_model,
                                                  ffn_hidden=ffn_hidden,
                                                  n_head=n_head,
                                                  drop_prob=drop_prob)
                                     for _ in range(n_layers)])

    def forward(self, x, src_mask):
        x = self.emb(x)

        for layer in self.layers:
            x = layer(x, src_mask)

        return x

In [6]:
class Decoder(nn.Module):
    def __init__(self, dec_voc_size, max_len, d_model, ffn_hidden, n_head, n_layers, drop_prob, device):
        super().__init__()
        self.emb = TransformerEmbedding(d_model=d_model,
                                        drop_prob=drop_prob,
                                        max_len=max_len,
                                        vocab_size=dec_voc_size,
                                        device=device)

        self.layers = nn.ModuleList([DecoderLayer(d_model=d_model,
                                                  ffn_hidden=ffn_hidden,
                                                  n_head=n_head,
                                                  drop_prob=drop_prob)
                                     for _ in range(n_layers)])

        self.linear = nn.Linear(d_model, dec_voc_size)

    def forward(self, trg, enc_src, trg_mask, src_mask):
        trg = self.emb(trg)

        for layer in self.layers:
            trg = layer(trg, enc_src, trg_mask, src_mask)

        # pass to LM head
        output = self.linear(trg)
        return output

In [7]:
class Transformer(nn.Module):

    def __init__(self, src_pad_idx, trg_pad_idx, trg_sos_idx, enc_voc_size, dec_voc_size, d_model, n_head, max_len,
                 ffn_hidden, n_layers, drop_prob, device):
        super().__init__()
        self.src_pad_idx = src_pad_idx
        self.trg_pad_idx = trg_pad_idx
        self.trg_sos_idx = trg_sos_idx
        self.device = device
        self.encoder = Encoder(d_model=d_model,
                               n_head=n_head,
                               max_len=max_len,
                               ffn_hidden=ffn_hidden,
                               enc_voc_size=enc_voc_size,
                               drop_prob=drop_prob,
                               n_layers=n_layers,
                               device=device)

        self.decoder = Decoder(d_model=d_model,
                               n_head=n_head,
                               max_len=max_len,
                               ffn_hidden=ffn_hidden,
                               dec_voc_size=dec_voc_size,
                               drop_prob=drop_prob,
                               n_layers=n_layers,
                               device=device)

    def forward(self, src, trg):
        src_mask = self.make_src_mask(src)
        trg_mask = self.make_trg_mask(trg)
        enc_src = self.encoder(src, src_mask)
        output = self.decoder(trg, enc_src, trg_mask, src_mask)
        trg_max_len = trg.size(1)
        output = F.pad(output, (0, max_len - trg_max_len), value=0)
        
        return output

    def make_src_mask(self, src):
        src_mask = (src != self.src_pad_idx).unsqueeze(1).unsqueeze(2)
        return src_mask

    def make_trg_mask(self, trg):
        trg_pad_mask = (trg != self.trg_pad_idx).unsqueeze(1).unsqueeze(3)
        trg_len = trg.shape[1]
        trg_sub_mask = torch.tril(torch.ones(trg_len, trg_len)).type(torch.ByteTensor).to(trg.device)
        trg_mask = trg_pad_mask & trg_sub_mask
        return trg_mask

In [8]:
class ScaleDotProductAttention(nn.Module):
    """
    compute scale dot product attention

    Query : given sentence that we focused on (decoder)
    Key : every sentence to check relationship with Qeury(encoder)
    Value : every sentence same with Key (encoder)
    """

    def __init__(self):
        super(ScaleDotProductAttention, self).__init__()
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, q, k, v, mask=None, e=1e-12):
        # input is 4 dimension tensor
        # [batch_size, head, length, d_tensor]
        batch_size, head, length, d_tensor = k.size()

        # 1. dot product Query with Key^T to compute similarity
        k_t = k.transpose(2, 3)  # transpose
        score = (q @ k_t) / math.sqrt(d_tensor)  # scaled dot product

        # 2. apply masking (opt)
        if mask is not None:
            mask=mask.to(q.device)
            score = score.masked_fill(mask == 0, -10000)

        # 3. pass them softmax to make [0, 1] range
        score = self.softmax(score)

        # 4. multiply with Value
        v = score @ v

        return v, score

In [9]:
class MultiHeadAttention(nn.Module):

    def __init__(self, d_model, n_head):
        super(MultiHeadAttention, self).__init__()
        self.n_head = n_head
        self.attention = ScaleDotProductAttention()
        self.w_q = nn.Linear(d_model, d_model)
        self.w_k = nn.Linear(d_model, d_model)
        self.w_v = nn.Linear(d_model, d_model)
        self.w_concat = nn.Linear(d_model, d_model)

    def forward(self, q, k, v, mask=None):
        # 1. dot product with weight matrices
        q, k, v = self.w_q(q), self.w_k(k), self.w_v(v)

        # 2. split tensor by number of heads
        q, k, v = self.split(q), self.split(k), self.split(v)

        # 3. do scale dot product to compute similarity
        out, attention = self.attention(q, k, v, mask=mask)

        # 4. concat and pass to linear layer
        out = self.concat(out)
        out = self.w_concat(out)

        # 5. visualize attention map
        # TODO : we should implement visualization

        return out

    def split(self, tensor):
        """
        split tensor by number of head

        :param tensor: [batch_size, length, d_model]
        :return: [batch_size, head, length, d_tensor]
        """
        batch_size, length, d_model = tensor.size()

        d_tensor = d_model // self.n_head
        tensor = tensor.view(batch_size, length, self.n_head, d_tensor).transpose(1, 2)
        # it is similar with group convolution (split by number of heads)

        return tensor

    def concat(self, tensor):
        """
        inverse function of self.split(tensor : torch.Tensor)

        :param tensor: [batch_size, head, length, d_tensor]
        :return: [batch_size, length, d_model]
        """
        batch_size, head, length, d_tensor = tensor.size()
        d_model = head * d_tensor

        tensor = tensor.transpose(1, 2).contiguous().view(batch_size, length, d_model)
        return tensor


In [10]:
class EncoderLayer(nn.Module):

    def __init__(self, d_model, ffn_hidden, n_head, drop_prob):
        super(EncoderLayer, self).__init__()
        self.attention = MultiHeadAttention(d_model=d_model, n_head=n_head)
        self.norm1 = LayerNorm(d_model=d_model)
        self.dropout1 = nn.Dropout(p=drop_prob)

        self.ffn = PositionwiseFeedForward(d_model=d_model, hidden=ffn_hidden, drop_prob=drop_prob)
        self.norm2 = LayerNorm(d_model=d_model)
        self.dropout2 = nn.Dropout(p=drop_prob)

    def forward(self, x, src_mask):
        # 1. compute self attention
        _x = x
        x = self.attention(q=x, k=x, v=x, mask=src_mask)
        
        # 2. add and norm
        x = self.dropout1(x)
        x = self.norm1(x + _x)
        
        # 3. positionwise feed forward network
        _x = x
        x = self.ffn(x)
      
        # 4. add and norm
        x = self.dropout2(x)
        x = self.norm2(x + _x)
        return x


In [11]:
class DecoderLayer(nn.Module):

    def __init__(self, d_model, ffn_hidden, n_head, drop_prob):
        super(DecoderLayer, self).__init__()
        self.self_attention = MultiHeadAttention(d_model=d_model, n_head=n_head)
        self.norm1 = LayerNorm(d_model=d_model)
        self.dropout1 = nn.Dropout(p=drop_prob)

        self.enc_dec_attention = MultiHeadAttention(d_model=d_model, n_head=n_head)
        self.norm2 = LayerNorm(d_model=d_model)
        self.dropout2 = nn.Dropout(p=drop_prob)

        self.ffn = PositionwiseFeedForward(d_model=d_model, hidden=ffn_hidden, drop_prob=drop_prob)
        self.norm3 = LayerNorm(d_model=d_model)
        self.dropout3 = nn.Dropout(p=drop_prob)

    def forward(self, dec, enc, trg_mask, src_mask):
        # 1. compute self attention
        _x = dec
        x = self.self_attention(q=dec, k=dec, v=dec, mask=trg_mask)
        
        # 2. add and norm
        x = self.dropout1(x)
        x = self.norm1(x + _x)

        if enc is not None:
            # 3. compute encoder - decoder attention
            _x = x
            x = self.enc_dec_attention(q=x, k=enc, v=enc, mask=src_mask)
            
            # 4. add and norm
            x = self.dropout2(x)
            x = self.norm2(x + _x)

        # 5. positionwise feed forward network
        _x = x
        x = self.ffn(x)
        
        # 6. add and norm
        x = self.dropout3(x)
        x = self.norm3(x + _x)
        return x

In [12]:
class LayerNorm(nn.Module):
    def __init__(self, d_model, eps=1e-12):
        super(LayerNorm, self).__init__()
        self.gamma = nn.Parameter(torch.ones(d_model))
        self.beta = nn.Parameter(torch.zeros(d_model))
        self.eps = eps

    def forward(self, x):
        mean = x.mean(-1, keepdim=True)
        var = x.var(-1, unbiased=False, keepdim=True)
        # '-1' means last dimension. 

        out = (x - mean) / torch.sqrt(var + self.eps)
        out = self.gamma * out + self.beta
        return out


In [13]:
class PositionwiseFeedForward(nn.Module):

    def __init__(self, d_model, hidden, drop_prob=0.1):
        super(PositionwiseFeedForward, self).__init__()
        self.linear1 = nn.Linear(d_model, hidden)
        self.linear2 = nn.Linear(hidden, d_model)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=drop_prob)

    def forward(self, x):
        x = self.linear1(x)
        x = self.relu(x)
        x = self.dropout(x)
        x = self.linear2(x)
        return x

In [14]:
def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

# DATA LOADING , TOKENIZATION, VOCAB_BUILDING, TOKEN_TO_INDEX

In [15]:


def load_tokenizers():
    return indic_tokenize, spacy.load('en_core_web_sm')

def tokenize_ne(text: str, tokenizer):
        return [tok for tok in tokenizer.trivial_tokenize(text)]



def tokenize_en(text:str,tokenizer):
    return [tok.text for tok in tokenizer.tokenizer(text)]





class CustomDataset(Dataset):
    def __init__(self, source: str, target: str):
        self.nepali_root = source
        self.english_root = target
        self.current=0
        self.max_src_len=0
        self.max_trg_len=0
        self.tokenizers =load_tokenizers()
        self.src_vocab=set()
        self.trg_vocab=set()
        self.src_vocab.update(['<sos>', '<eos>', '<pad>','<unk>']) 
        self.trg_vocab.update(['<sos>', '<eos>', '<pad>','<unk>'])
        self.total_sentences = 0
        self.data = []

        
        with open(self.nepali_root, 'r') as nepali, open(self.english_root, 'r') as english:
            for nep, eng in zip(nepali, english):
                tokenized_nepali = tokenize_ne(nep, self.tokenizers[0])
                tokenized_eng = tokenize_en(eng, self.tokenizers[1])
                self.total_sentences += 1
                self.max_src_len = max(self.max_src_len, len(tokenized_nepali))
                self.max_trg_len = max(self.max_trg_len, len(tokenized_eng))
                self.src_vocab.update(tokenized_nepali)
                self.trg_vocab.update(tokenized_eng)
                self.data.append((tokenized_nepali, tokenized_eng))
#                 random.shuffle(self.data)

        
        self.src_vocab_dict={word: i for i, word in enumerate(self.src_vocab)}
        self.trg_vocab_dict={word: i for i, word in enumerate(self.trg_vocab)}
        

        self.trg_pad_idx = self.trg_vocab_dict['<pad>']
        self.trg_sos_idx = self.trg_vocab_dict['<sos>']
        self.trg_eos_idx = self.trg_vocab_dict['<eos>']
        self.src_pad_idx = self.src_vocab_dict['<pad>']
        self.src_sos_idx = self.src_vocab_dict['<sos>']   
        self.src_eos_idx = self.src_vocab_dict['<eos>']
        self.src_unk_idx = self.src_vocab_dict['<unk>'] 
        self.trg_unk_idx = self.trg_vocab_dict['<unk>']
        self.enc_voc_size = len(self.src_vocab)
        self.dec_voc_size= len(self.trg_vocab)
    
        train_ratio = 0.8
        val_ratio = 0.1
        data_len = len(self.data)
        train_size = int(data_len * train_ratio)
        val_size = int(data_len * val_ratio)
        test_size = data_len - train_size - val_size


        self.train_data = self.data[:train_size]
        self.val_data = self.data[train_size:train_size+val_size]
        self.test_data = self.data[train_size+val_size:]
  


    def __len__(self):
        if self.train_data:
            return len(self.train_data)
        elif self.val_data:
            return len(self.val_data)
        elif self.test_data:
            return len(self.test_data)
        else:
            raise ValueError("No data available!")


    def __getitem__(self, idx):
        src, trg = self.train_data[idx]
        src = [self.src_vocab_dict[token] if token in self.src_vocab_dict else self.src_vocab_dict['<unk>'] for token in src]
        trg = [self.trg_vocab_dict[token] if token in self.trg_vocab_dict else self.trg_vocab_dict['<unk>'] for token in trg]
        return [src, trg]   
       


    def __iter__(self):
        return self    



    def __next__(self):
        if self.current < len(self.train_data):
            self.current += 1
            return self.__getitem__(self.current)

        raise StopIteration 


    def printv(self):
        print(self.src_vocab_dict)

           

def custom_collate(batch, src_sos_idx, src_eos_idx, trg_sos_idx, trg_eos_idx, src_pad_idx, trg_pad_idx,src_vocab_dict:dict,trg_vocab_dict:dict):
    src_batch, trg_batch = zip(*batch)
#     print(src_batch,trg_batch)
    # print("Batch Sizes (Before Padding):", [len(src) for src in src_batch], [len(trg) for trg in trg_batch])
    src_batch = [[src_vocab_dict[token] if token in src_vocab_dict else src_vocab_dict['<unk>'] for token in src] for src in src_batch]
    trg_batch = [[trg_vocab_dict[token] if token in trg_vocab_dict else trg_vocab_dict['<unk>'] for token in trg] for trg in trg_batch]
    # Pad sequences to the fixed length max_len
    padded_src = [torch.cat([torch.tensor([src_sos_idx]), torch.tensor(src), torch.tensor([src_eos_idx]), torch.full((max_len - len(src) - 2,), src_pad_idx, dtype=torch.long)]) for src in src_batch]
    padded_trg = [torch.cat([torch.tensor([trg_sos_idx]), torch.tensor(trg), torch.tensor([trg_eos_idx]), torch.full((max_len - len(trg) - 2,), trg_pad_idx, dtype=torch.long)]) for trg in trg_batch]
    
    # Stack the padded sequences
    padded_src = torch.stack(padded_src)
    padded_trg = torch.stack(padded_trg)
    # print("Batch Sizes (Before Padding):", [len(src) for src in padded_src], [len(trg) for trg in padded_trg])
    # print("Batch Sizes (After Padding):", padded_src.shape, padded_trg.shape)
    return [padded_src, padded_trg]

         


dataset = CustomDataset('/Users/romankasichhwa/Desktop/complete/500_only/nep.txt', '/Users/romankasichhwa/Desktop/complete/500_only/eng.txt')

In [16]:

print (dataset.max_src_len, dataset.max_trg_len)

21 27


In [17]:
#batch_size = 128
batch_size = 16

max_len = 30
d_model = 512
n_layers = 6
n_heads = 8
ffn_hidden = 1024
drop_prob = 0.4

# optimizer parameter setting
init_lr = 1e-5
factor = 0.9
adam_eps = 5e-9
patience = 10
warmup = 100
epoch = 100  #1000
clip = 1.0
weight_decay = 5e-4
inf = float('inf')

In [18]:
import os
import shutil

# to add /kaggle/working/result directory

In [19]:
# import os

# result_directory = '/kaggle/working/result'
# os.makedirs(result_directory, exist_ok=True)

In [20]:
#to remove files from kaggle/working/result directory
# import os

# result_directory = '/kaggle/working/result'

# # List all files in the result directory
# files_in_result_directory = os.listdir(result_directory)

# # Iterate through the files and remove them
# for file_name in files_in_result_directory:
#     file_path = os.path.join(result_directory, file_name)
#     try:
#         if os.path.isfile(file_path):
#             os.remove(file_path)
#             print(f"Removed: {file_path}")
#     except Exception as e:
#         print(f"Error: {e}")

# print("Files in /kaggle/working/result directory removed.")


# to delete the modesl from kaggle/working except kaggle/working/result

In [21]:
# root_dir = "/kaggle/working"
# excluded_dir = "result"

# # Get all files and directories in the root directory
# all_items = os.listdir(root_dir)

# # Filter out the excluded directory
# items_to_delete = [item for item in all_items if item != excluded_dir]

# # Delete each item (file or directory)
# for item in items_to_delete:
#     item_path = os.path.join(root_dir, item)
#     if os.path.isdir(item_path):
#         shutil.rmtree(item_path)  # Use shutil.rmtree for directories
#     else:
#         os.remove(item_path)

# print(f"Successfully deleted all items except '{excluded_dir}' in {root_dir}.")

In [22]:
def bleu_stats(hypothesis, reference):
    """Compute statistics for BLEU."""
    stats = []
    stats.append(len(hypothesis))
    stats.append(len(reference))
    for n in range(1, 5):
        s_ngrams = Counter(
            [tuple(hypothesis[i:i + n]) for i in range(len(hypothesis) + 1 - n)]
        )
        r_ngrams = Counter(
            [tuple(reference[i:i + n]) for i in range(len(reference) + 1 - n)]
        )

        stats.append(max([sum((s_ngrams & r_ngrams).values()), 0]))
        stats.append(max([len(hypothesis) + 1 - n, 0]))
    return stats


def bleu(stats):
    """Compute BLEU given n-gram statistics."""
    if len(list(filter(lambda x: x == 0, stats))) > 0:
        return 0
    (c, r) = stats[:2]
    log_bleu_prec = sum(
        [math.log(float(x) / y) for x, y in zip(stats[2::2], stats[3::2])]
    ) / 4.
    return math.exp(min([0, 1 - float(r) / c]) + log_bleu_prec)


def get_bleu(hypotheses, reference):
    """Get validation BLEU score for dev set."""
    stats = np.array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])
    for hyp, ref in zip(hypotheses, reference):
        stats += np.array(bleu_stats(hyp, ref))
    return 100 * bleu(stats)


def idx_to_word(x, vocab):
    words = []
    for i in x:
        word = next((word for word, index in vocab.items() if index == i), None)
        if word is not None and '<' not in word:
            words.append(word)
    words = " ".join(words)
    return words

In [23]:
# dataset = DataLoader(dataset, batch_size=16, shuffle=True, collate_fn=lambda batch: custom_collate(batch, dataset.src_sos_idx, dataset.src_eos_idx, dataset.trg_sos_idx, dataset.trg_eos_idx, dataset.src_pad_idx, dataset.trg_pad_idx))
train_iter = DataLoader(dataset.train_data, batch_size=8, shuffle=True, collate_fn=lambda batch: custom_collate(batch, dataset.src_sos_idx, dataset.src_eos_idx, dataset.trg_sos_idx, dataset.trg_eos_idx, dataset.src_pad_idx, dataset.trg_pad_idx,dataset.src_vocab_dict,dataset.trg_vocab_dict))
test_iter = DataLoader(dataset.test_data, batch_size=8, shuffle=False, collate_fn=lambda batch: custom_collate(batch, dataset.src_sos_idx, dataset.src_eos_idx, dataset.trg_sos_idx, dataset.trg_eos_idx, dataset.src_pad_idx, dataset.trg_pad_idx,dataset.src_vocab_dict,dataset.trg_vocab_dict))
valid_iter = DataLoader(dataset.val_data, batch_size=8, shuffle=False, collate_fn=lambda batch: custom_collate(batch, dataset.src_sos_idx, dataset.src_eos_idx, dataset.trg_sos_idx, dataset.trg_eos_idx, dataset.src_pad_idx, dataset.trg_pad_idx,dataset.src_vocab_dict,dataset.trg_vocab_dict))


def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)


def initialize_weights(m):
    if hasattr(m, 'weight') and m.weight.dim() > 1:
        nn.init.kaiming_uniform_(m.weight.data)


model = Transformer(src_pad_idx=dataset.src_pad_idx,
                    trg_pad_idx=dataset.trg_pad_idx,
                    trg_sos_idx=dataset.trg_sos_idx,
                    d_model=d_model,
                    enc_voc_size=dataset.enc_voc_size,
                    dec_voc_size=dataset.dec_voc_size,
                    max_len=max_len,
                    ffn_hidden=ffn_hidden,
                    n_head=n_heads,
                    n_layers=n_layers,
                    drop_prob=drop_prob,
                    device=device).to(device)

print(f'The model has {count_parameters(model):,} trainable parameters')
model.apply(initialize_weights)
optimizer = Adam(params=model.parameters(),
                 lr=init_lr,
                 weight_decay=weight_decay,
                 eps=adam_eps)

scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer=optimizer,
                                                 verbose=True,
                                                 factor=factor,
                                                 patience=patience)

criterion = nn.CrossEntropyLoss(ignore_index=dataset.src_pad_idx)



def train(model, iterator, optimizer, criterion, clip):
    model.train()
    epoch_loss = 0
    correct_predictions = 0
    total_samples = 0

    iterator = tqdm(iterator, total=len(iterator), desc='Training')
    
    for i, batch in enumerate(iterator):
        src, trg = batch
        src, trg = src.to(device), trg.to(device)

        optimizer.zero_grad()
        output = model(src, trg[:, :-1])
        output_reshape = output.contiguous().view(-1, output.shape[-1])
        trg = trg[:, 1:].contiguous().view(-1)

        loss = criterion(output_reshape, trg)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()

        epoch_loss += loss.item()

        # Compute accuracy
        predicted = output_reshape.argmax(dim=1)
        correct_predictions += (predicted == trg).sum().item()
        total_samples += trg.size(0)

        accuracy = correct_predictions / total_samples

        # Update the progress bar description
#         iterator.set_description(f'Training Loss: {loss.item():.4f} | Accuracy: {accuracy:.4f}')

    return epoch_loss / len(iterator), accuracy




def evaluate(model, iterator, criterion, target_vocab, device):
    model.eval()
    epoch_loss = 0
    total_correct = 0
    total_samples = 0
    accuracies = []
    references = []  # List to store reference sentences
    hypotheses = []  # List to store predicted sentences

    iterator = tqdm(iterator, total=len(iterator), desc='Evaluating')
    
    with torch.no_grad():
        for i, (src_batch, trg_batch) in enumerate(iterator):
            src = src_batch
            trg = trg_batch
            src, trg = src.to(device), trg.to(device)

            # Forward pass
            output = model(src, trg[:, :-1])
            output_reshape = output.contiguous().view(-1, output.shape[-1])
            output_reshape = output_reshape.to(device)
            trg = trg[:, 1:].contiguous().view(-1)
            trg = trg.to(output.device)

            # Calculate loss
            loss = criterion(output_reshape, trg)
            epoch_loss += loss.item()

            # Calculate accuracy
            predicted = output_reshape.argmax(dim=1)
            correct = (predicted == trg).sum().item()
            total_correct += correct
            total_samples += trg.size(0)
            accuracy = total_correct / total_samples
            accuracies.append(accuracy)

            # Convert indices to words for BLEU calculation
            trg_words = [idx_to_word(sentence, target_vocab) for sentence in trg_batch.T]
            output_words = [idx_to_word(sentence, target_vocab) for sentence in output.argmax(dim=2).T]


            references.extend(trg_words)
            hypotheses.extend(output_words)

            # print(f"trg size: {trg.size()}\n")
            # print(f"output size: {output.size()}\n")

    # Calculate BLEU score
    bleu = get_bleu(hypotheses, references)
    
    return epoch_loss / len(iterator), accuracies, bleu
        
        
        


def run(total_epoch, best_loss):
    train_losses, test_losses, bleus, train_accuracies, val_accuracies = [], [], [], [],[]
    for step in range(total_epoch):
        start_time = time.time()
        train_loss,train_accuracy = train(model, train_iter, optimizer, criterion, clip)
        valid_loss, bleu, val_accuracy = evaluate(model, valid_iter, criterion, dataset.trg_vocab_dict, device)
        end_time = time.time()

        if step > warmup:
            scheduler.step(valid_loss)

        train_losses.append(train_loss)
        test_losses.append(valid_loss)
        bleus.append(bleu)
        train_accuracies.append(train_accuracy)  # Append train accuracy to the list
        val_accuracies.append(val_accuracy)  # Append accuracy to the list
        epoch_mins, epoch_secs = epoch_time(start_time, end_time)

        if valid_loss < best_loss:
            best_loss = valid_loss
            # Save the model if needed
            # torch.save(model.state_dict(), '/kaggle/working/model-{0}.pt'.format(valid_loss))

        # result_directory = '/kaggle/working/result'
        # os.makedirs(result_directory, exist_ok=True)
        
        # Save metrics to separate files
        with open('/Users/romankasichhwa/Desktop/project/transformer/saved/transformer-base/train.txt', 'w') as f:
            f.write(str(train_losses))

        with open('/Users/romankasichhwa/Desktop/project/transformer/saved/transformer-base/bleu.txt', 'w') as f:
            f.write(str(bleus))

        with open('/Users/romankasichhwa/Desktop/project/transformer/saved/transformer-base/test_loss.txt', 'w') as f:
            f.write(str(test_losses))

        with open('/Users/romankasichhwa/Desktop/project/transformer/saved/transformer-base/train_accuracies.txt', 'w') as f:
            f.write(str(train_accuracies))

        with open('/Users/romankasichhwa/Desktop/project/transformer/saved/transformer-base/val_accuracies.txt', 'w') as f:
            f.write(str(val_accuracies))

        print(f'Epoch: {step + 1} | Time: {epoch_mins}m {epoch_secs}s')
        print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f} | Train Accuracy: {train_accuracies[-1]*100:.3f} %')
        print(f'\tVal Loss: {valid_loss:.3f} |  Val PPL: {math.exp(valid_loss):7.3f} | Val Accuracy: {val_accuracies[-1]*100:.3f} %')
        print(f'\tBLEU Score: {bleus[-1][-1]:.3f}')


        
        

    torch.save(model.state_dict(), '/Users/romankasichhwa/Desktop/project/transformer/saved/model-1.pt'.format(valid_loss))

if __name__ == '__main__':
    run(total_epoch=epoch, best_loss=inf)
    draw(mode='loss',total_epoch=epoch, live_update=True)
    draw(mode='bleu',total_epoch=epoch, live_update=True)
    draw(mode='accuracy',total_epoch=epoch, live_update=True)

The model has 34,473,713 trainable parameters


  _torch_pytree._register_pytree_node(
Training:   0%|          | 0/50 [00:00<?, ?it/s]

Batch Sizes (Before Padding): [19, 9, 14, 14, 17, 17, 8, 10] [23, 11, 14, 19, 22, 17, 11, 8]


Training:   2%|▏         | 1/50 [00:00<00:29,  1.65it/s]

Batch Sizes (Before Padding): [11, 17, 9, 8, 11, 15, 16, 12] [14, 25, 10, 9, 11, 20, 21, 11]


Training:   4%|▍         | 2/50 [00:01<00:24,  1.99it/s]

Batch Sizes (Before Padding): [10, 7, 16, 9, 13, 14, 11, 9] [12, 8, 19, 10, 21, 20, 11, 10]


Training:   6%|▌         | 3/50 [00:01<00:20,  2.27it/s]

Batch Sizes (Before Padding): [7, 17, 10, 4, 7, 9, 14, 10] [8, 16, 10, 9, 9, 10, 16, 12]


Training:   8%|▊         | 4/50 [00:01<00:19,  2.42it/s]

Batch Sizes (Before Padding): [15, 11, 8, 13, 10, 11, 12, 16] [27, 14, 12, 19, 9, 11, 13, 17]


Training:  10%|█         | 5/50 [00:02<00:19,  2.29it/s]

Batch Sizes (Before Padding): [11, 13, 12, 12, 11, 15, 11, 9] [16, 16, 13, 18, 12, 21, 15, 9]


Training:  12%|█▏        | 6/50 [00:02<00:18,  2.40it/s]

Batch Sizes (Before Padding): [11, 14, 12, 9, 9, 9, 12, 12] [12, 17, 14, 10, 11, 11, 14, 15]


Training:  14%|█▍        | 7/50 [00:03<00:17,  2.48it/s]

Batch Sizes (Before Padding): [9, 11, 9, 16, 12, 12, 14, 12] [10, 13, 9, 22, 19, 21, 20, 18]


Training:  16%|█▌        | 8/50 [00:03<00:16,  2.56it/s]

Batch Sizes (Before Padding): [14, 8, 19, 10, 10, 14, 12, 7] [16, 8, 23, 12, 14, 16, 14, 6]


Training:  18%|█▊        | 9/50 [00:03<00:18,  2.27it/s]

Batch Sizes (Before Padding): [15, 15, 13, 10, 13, 15, 15, 10] [19, 14, 14, 11, 11, 16, 18, 11]


Training:  20%|██        | 10/50 [00:04<00:23,  1.68it/s]

Batch Sizes (Before Padding): [16, 10, 9, 17, 11, 11, 11, 19] [17, 16, 15, 19, 10, 13, 15, 18]


Training:  22%|██▏       | 11/50 [00:05<00:21,  1.80it/s]

Batch Sizes (Before Padding): [14, 12, 15, 11, 15, 8, 1, 16] [16, 16, 19, 18, 19, 9, 2, 19]


Training:  24%|██▍       | 12/50 [00:05<00:21,  1.73it/s]

Batch Sizes (Before Padding): [13, 9, 10, 5, 2, 16, 12, 11] [16, 7, 8, 4, 3, 17, 13, 17]


Training:  26%|██▌       | 13/50 [00:06<00:19,  1.88it/s]

Batch Sizes (Before Padding): [12, 13, 10, 16, 10, 10, 16, 17] [17, 12, 9, 16, 11, 8, 20, 20]


Training:  28%|██▊       | 14/50 [00:06<00:17,  2.03it/s]

Batch Sizes (Before Padding): [14, 8, 6, 11, 11, 7, 17, 14] [17, 11, 6, 12, 11, 9, 22, 17]


Training:  30%|███       | 15/50 [00:07<00:16,  2.07it/s]

Batch Sizes (Before Padding): [11, 9, 8, 7, 14, 10, 11, 8] [15, 10, 8, 7, 13, 10, 11, 8]


Training:  32%|███▏      | 16/50 [00:07<00:16,  2.06it/s]

Batch Sizes (Before Padding): [11, 14, 10, 10, 18, 14, 9, 10] [14, 19, 10, 15, 17, 16, 10, 15]


Training:  34%|███▍      | 17/50 [00:08<00:16,  2.05it/s]

Batch Sizes (Before Padding): [14, 11, 10, 7, 14, 10, 9, 13] [18, 11, 9, 11, 18, 13, 10, 22]


Training:  36%|███▌      | 18/50 [00:08<00:17,  1.87it/s]

Batch Sizes (Before Padding): [10, 18, 14, 15, 15, 8, 10, 10] [9, 16, 14, 26, 12, 9, 10, 10]


Training:  38%|███▊      | 19/50 [00:09<00:15,  2.01it/s]

Batch Sizes (Before Padding): [8, 8, 10, 12, 11, 12, 8, 17] [10, 10, 11, 13, 10, 16, 10, 20]


Training:  40%|████      | 20/50 [00:09<00:15,  1.91it/s]

Batch Sizes (Before Padding): [13, 15, 17, 9, 11, 10, 8, 16] [13, 19, 19, 11, 13, 12, 9, 17]


Training:  42%|████▏     | 21/50 [00:10<00:14,  1.99it/s]

Batch Sizes (Before Padding): [12, 14, 12, 18, 16, 15, 11, 13] [15, 17, 20, 21, 19, 22, 11, 10]


Training:  44%|████▍     | 22/50 [00:10<00:13,  2.01it/s]

Batch Sizes (Before Padding): [7, 14, 3, 15, 15, 17, 9, 9] [10, 16, 4, 15, 18, 18, 10, 10]


Training:  46%|████▌     | 23/50 [00:11<00:12,  2.13it/s]

Batch Sizes (Before Padding): [10, 12, 5, 12, 10, 18, 16, 17] [17, 11, 4, 13, 8, 22, 20, 14]


Training:  48%|████▊     | 24/50 [00:11<00:11,  2.22it/s]

Batch Sizes (Before Padding): [18, 12, 11, 8, 15, 14, 11, 12] [18, 13, 16, 10, 13, 18, 15, 14]


Training:  50%|█████     | 25/50 [00:12<00:11,  2.24it/s]

Batch Sizes (Before Padding): [12, 10, 11, 7, 17, 7, 9, 11] [14, 17, 12, 7, 20, 9, 10, 9]


Training:  52%|█████▏    | 26/50 [00:12<00:10,  2.29it/s]

Batch Sizes (Before Padding): [12, 10, 12, 5, 13, 11, 19, 9] [15, 11, 13, 6, 17, 18, 18, 12]


Training:  54%|█████▍    | 27/50 [00:12<00:10,  2.22it/s]

Batch Sizes (Before Padding): [17, 15, 11, 8, 10, 10, 14, 14] [23, 17, 9, 9, 13, 10, 18, 19]


Training:  56%|█████▌    | 28/50 [00:13<00:10,  2.19it/s]

Batch Sizes (Before Padding): [13, 9, 7, 17, 11, 15, 15, 9] [19, 12, 11, 16, 9, 16, 17, 11]


Training:  58%|█████▊    | 29/50 [00:13<00:09,  2.21it/s]

Batch Sizes (Before Padding): [14, 11, 18, 15, 12, 10, 11, 19] [18, 12, 15, 13, 19, 13, 13, 19]


Training:  60%|██████    | 30/50 [00:14<00:09,  2.15it/s]

Batch Sizes (Before Padding): [17, 17, 13, 12, 10, 13, 10, 17] [21, 22, 16, 13, 16, 16, 12, 17]


Training:  62%|██████▏   | 31/50 [00:14<00:08,  2.24it/s]

Batch Sizes (Before Padding): [10, 10, 14, 10, 7, 13, 9, 7] [12, 15, 18, 12, 9, 17, 12, 8]


Training:  64%|██████▍   | 32/50 [00:15<00:07,  2.25it/s]

Batch Sizes (Before Padding): [13, 7, 12, 17, 15, 8, 12, 11] [13, 9, 15, 15, 18, 10, 18, 9]


Training:  66%|██████▌   | 33/50 [00:15<00:07,  2.36it/s]

Batch Sizes (Before Padding): [11, 15, 10, 16, 11, 11, 16, 16] [13, 15, 10, 17, 17, 13, 20, 19]


Training:  68%|██████▊   | 34/50 [00:16<00:07,  2.25it/s]

Batch Sizes (Before Padding): [12, 9, 12, 13, 6, 15, 13, 9] [15, 12, 13, 19, 8, 18, 16, 12]


Training:  70%|███████   | 35/50 [00:16<00:07,  2.08it/s]

Batch Sizes (Before Padding): [9, 12, 9, 3, 11, 16, 9, 10] [11, 13, 11, 4, 11, 20, 10, 11]


Training:  72%|███████▏  | 36/50 [00:17<00:06,  2.10it/s]

Batch Sizes (Before Padding): [8, 11, 9, 16, 11, 17, 14, 10] [9, 15, 11, 15, 10, 18, 14, 11]


Training:  74%|███████▍  | 37/50 [00:17<00:05,  2.18it/s]

Batch Sizes (Before Padding): [19, 15, 17, 10, 9, 9, 13, 10] [16, 11, 21, 10, 11, 10, 13, 12]


Training:  76%|███████▌  | 38/50 [00:17<00:05,  2.23it/s]

Batch Sizes (Before Padding): [13, 14, 12, 10, 10, 13, 14, 10] [14, 16, 15, 8, 9, 20, 18, 8]


Training:  78%|███████▊  | 39/50 [00:18<00:04,  2.28it/s]

Batch Sizes (Before Padding): [19, 16, 11, 10, 13, 10, 14, 15] [18, 20, 14, 8, 11, 10, 18, 22]


Training:  80%|████████  | 40/50 [00:18<00:04,  2.27it/s]

Batch Sizes (Before Padding): [10, 11, 14, 10, 10, 10, 14, 11] [10, 14, 20, 12, 17, 8, 16, 12]


Training:  82%|████████▏ | 41/50 [00:19<00:04,  2.22it/s]

Batch Sizes (Before Padding): [14, 16, 16, 9, 9, 10, 13, 14] [19, 21, 20, 10, 9, 8, 16, 16]


Training:  84%|████████▍ | 42/50 [00:19<00:03,  2.15it/s]

Batch Sizes (Before Padding): [12, 18, 8, 16, 18, 10, 15, 13] [11, 17, 9, 20, 21, 15, 17, 13]


Training:  86%|████████▌ | 43/50 [00:20<00:03,  2.17it/s]

Batch Sizes (Before Padding): [9, 12, 16, 17, 18, 8, 8, 8] [10, 12, 13, 22, 19, 13, 8, 9]


Training:  88%|████████▊ | 44/50 [00:20<00:02,  2.21it/s]

Batch Sizes (Before Padding): [13, 15, 8, 14, 12, 12, 18, 15] [13, 19, 11, 23, 10, 13, 22, 17]


Training:  90%|█████████ | 45/50 [00:21<00:02,  2.18it/s]

Batch Sizes (Before Padding): [13, 13, 14, 10, 14, 13, 8, 12] [14, 15, 12, 17, 17, 16, 8, 18]


Training:  92%|█████████▏| 46/50 [00:21<00:01,  2.16it/s]

Batch Sizes (Before Padding): [8, 13, 9, 11, 13, 12, 16, 10] [9, 14, 11, 16, 12, 13, 17, 14]


Training:  94%|█████████▍| 47/50 [00:22<00:01,  2.07it/s]

Batch Sizes (Before Padding): [11, 15, 10, 3, 13, 15, 8, 1] [17, 17, 12, 4, 17, 21, 9, 4]


Training:  96%|█████████▌| 48/50 [00:22<00:00,  2.04it/s]

Batch Sizes (Before Padding): [9, 8, 18, 15, 11, 12, 12, 10] [9, 9, 16, 16, 14, 14, 13, 10]


Training:  98%|█████████▊| 49/50 [00:23<00:00,  2.16it/s]

Batch Sizes (Before Padding): [9, 13, 14, 14, 14, 14, 15] [11, 17, 14, 14, 16, 17, 21]


Training: 100%|██████████| 50/50 [00:23<00:00,  2.14it/s]
Evaluating:   0%|          | 0/7 [00:00<?, ?it/s]

Batch Sizes (Before Padding): [16, 14, 16, 16, 16, 16, 11, 16] [20, 18, 20, 20, 20, 19, 15, 22]


Evaluating:  14%|█▍        | 1/7 [00:01<00:06,  1.02s/it]

Batch Sizes (Before Padding): [14, 11, 16, 14, 16, 17, 12, 16] [20, 14, 20, 18, 20, 16, 13, 18]


Evaluating:  29%|██▊       | 2/7 [00:02<00:04,  1.00it/s]

Batch Sizes (Before Padding): [15, 17, 10, 11, 9, 13, 17, 14] [16, 21, 12, 12, 10, 13, 15, 11]


Evaluating:  43%|████▎     | 3/7 [00:03<00:04,  1.01s/it]

Batch Sizes (Before Padding): [13, 9, 10, 17, 18, 9, 10, 13] [15, 11, 12, 19, 18, 11, 11, 14]


Evaluating:  57%|█████▋    | 4/7 [00:04<00:03,  1.00s/it]

Batch Sizes (Before Padding): [11, 8, 16, 13, 16, 12, 12, 8] [13, 10, 14, 12, 18, 12, 12, 8]


Evaluating:  71%|███████▏  | 5/7 [00:05<00:02,  1.00s/it]

Batch Sizes (Before Padding): [13, 13, 13, 12, 13, 17, 14, 12] [21, 16, 14, 18, 23, 18, 14, 13]


Evaluating: 100%|██████████| 7/7 [00:06<00:00,  1.13it/s]


Batch Sizes (Before Padding): [11] [19]
Epoch: 1 | Time: 0m 29s
	Train Loss: 6.466 | Train PPL: 642.692 | Train Accuracy: 22.919 %
	Val Loss: 9.169 |  Val PPL: 9593.474 | Val Accuracy: 0.000 %
	BLEU Score: 0.424


Training:   0%|          | 0/50 [00:00<?, ?it/s]

Batch Sizes (Before Padding): [7, 14, 8, 12, 10, 18, 13, 15] [11, 18, 9, 16, 12, 21, 10, 19]


Training:   2%|▏         | 1/50 [00:00<00:28,  1.71it/s]

Batch Sizes (Before Padding): [16, 12, 15, 13, 16, 16, 10, 8] [17, 14, 21, 13, 13, 20, 11, 9]


Training:   4%|▍         | 2/50 [00:01<00:25,  1.90it/s]

Batch Sizes (Before Padding): [11, 13, 9, 12, 14, 17, 6, 8] [11, 13, 11, 18, 14, 18, 8, 10]


Training:   6%|▌         | 3/50 [00:01<00:23,  1.99it/s]

Batch Sizes (Before Padding): [13, 5, 15, 15, 11, 12, 10, 9] [22, 4, 17, 21, 12, 15, 9, 9]


Training:   8%|▊         | 4/50 [00:01<00:22,  2.08it/s]

Batch Sizes (Before Padding): [12, 13, 10, 3, 9, 7, 16, 14] [11, 19, 10, 4, 9, 11, 21, 16]


Training:  10%|█         | 5/50 [00:02<00:23,  1.94it/s]

Batch Sizes (Before Padding): [17, 11, 14, 15, 11, 9, 12, 13] [19, 18, 19, 13, 15, 10, 14, 13]


Training:  12%|█▏        | 6/50 [00:02<00:21,  2.06it/s]

Batch Sizes (Before Padding): [10, 10, 17, 15, 10, 14, 14, 7] [15, 12, 15, 13, 11, 23, 19, 8]


Training:  14%|█▍        | 7/50 [00:03<00:21,  2.00it/s]

Batch Sizes (Before Padding): [12, 11, 12, 10, 17, 10, 18, 15] [15, 10, 15, 8, 22, 12, 21, 17]


Training:  16%|█▌        | 8/50 [00:04<00:21,  1.95it/s]

Batch Sizes (Before Padding): [9, 10, 7, 10, 11, 7, 10, 14] [10, 14, 9, 9, 10, 9, 8, 20]


Training:  18%|█▊        | 9/50 [00:04<00:20,  1.97it/s]

Batch Sizes (Before Padding): [14, 8, 15, 10, 1, 13, 10, 5] [18, 8, 22, 13, 4, 16, 10, 4]


Training:  20%|██        | 10/50 [00:05<00:19,  2.01it/s]

Batch Sizes (Before Padding): [12, 9, 12, 8, 15, 9, 13, 9] [13, 9, 17, 9, 20, 7, 14, 10]


Training:  22%|██▏       | 11/50 [00:05<00:19,  1.97it/s]

Batch Sizes (Before Padding): [10, 10, 10, 14, 10, 12, 14, 14] [13, 16, 8, 16, 15, 13, 16, 17]


Training:  24%|██▍       | 12/50 [00:06<00:19,  1.99it/s]

Batch Sizes (Before Padding): [9, 11, 17, 10, 15, 12, 7, 10] [9, 11, 21, 12, 17, 11, 9, 11]


Training:  26%|██▌       | 13/50 [00:06<00:18,  1.99it/s]

Batch Sizes (Before Padding): [7, 15, 11, 8, 11, 12, 10, 16] [6, 27, 17, 11, 12, 15, 8, 15]


Training:  28%|██▊       | 14/50 [00:07<00:17,  2.06it/s]

Batch Sizes (Before Padding): [9, 10, 11, 8, 12, 13, 15, 13] [12, 12, 14, 9, 13, 20, 15, 17]


Training:  30%|███       | 15/50 [00:07<00:17,  2.06it/s]

Batch Sizes (Before Padding): [10, 15, 14, 15, 17, 17, 13, 9] [8, 19, 18, 26, 22, 22, 14, 11]


Training:  32%|███▏      | 16/50 [00:08<00:19,  1.77it/s]

Batch Sizes (Before Padding): [18, 8, 11, 14, 16, 13, 11, 9] [17, 11, 15, 18, 17, 11, 13, 12]


Training:  34%|███▍      | 17/50 [00:08<00:18,  1.79it/s]

Batch Sizes (Before Padding): [9, 12, 16, 10, 16, 16, 13, 17] [10, 12, 20, 10, 20, 19, 16, 22]


Training:  36%|███▌      | 18/50 [00:09<00:20,  1.59it/s]

Batch Sizes (Before Padding): [11, 14, 14, 16, 17, 17, 14, 12] [14, 19, 16, 19, 18, 17, 14, 14]


Training:  38%|███▊      | 19/50 [00:10<00:20,  1.50it/s]

Batch Sizes (Before Padding): [13, 16, 9, 13, 14, 18, 10, 18] [13, 20, 12, 16, 17, 22, 10, 16]


Training:  40%|████      | 20/50 [00:10<00:19,  1.54it/s]

Batch Sizes (Before Padding): [10, 10, 10, 14, 7, 8, 10, 10] [8, 14, 17, 20, 7, 13, 9, 15]


Training:  42%|████▏     | 21/50 [00:11<00:18,  1.59it/s]

Batch Sizes (Before Padding): [9, 11, 11, 19, 12, 12, 10, 16] [11, 14, 9, 18, 13, 13, 12, 21]


Training:  44%|████▍     | 22/50 [00:12<00:18,  1.55it/s]

Batch Sizes (Before Padding): [11, 12, 18, 14, 12, 17, 13, 15] [16, 13, 17, 13, 13, 20, 12, 12]


Training:  46%|████▌     | 23/50 [00:12<00:15,  1.70it/s]

Batch Sizes (Before Padding): [10, 8, 8, 9, 11, 11, 12, 10] [10, 8, 8, 10, 12, 13, 13, 12]


Training:  48%|████▊     | 24/50 [00:13<00:14,  1.82it/s]

Batch Sizes (Before Padding): [11, 9, 8, 8, 14, 8, 13, 10] [17, 12, 10, 9, 14, 8, 13, 12]


Training:  50%|█████     | 25/50 [00:13<00:12,  1.97it/s]

Batch Sizes (Before Padding): [11, 14, 15, 8, 15, 14, 19, 14] [12, 14, 18, 10, 14, 17, 23, 16]


Training:  52%|█████▏    | 26/50 [00:13<00:11,  2.07it/s]

Batch Sizes (Before Padding): [9, 10, 9, 16, 13, 9, 15, 15] [10, 10, 10, 17, 15, 10, 17, 19]


Training:  54%|█████▍    | 27/50 [00:14<00:10,  2.18it/s]

Batch Sizes (Before Padding): [10, 10, 10, 8, 15, 14, 16, 7] [11, 10, 9, 8, 22, 17, 19, 7]


Training:  56%|█████▌    | 28/50 [00:14<00:10,  2.18it/s]

Batch Sizes (Before Padding): [13, 11, 14, 10, 11, 12, 11, 14] [17, 9, 16, 12, 14, 18, 10, 14]


Training:  58%|█████▊    | 29/50 [00:15<00:09,  2.18it/s]

Batch Sizes (Before Padding): [10, 10, 12, 18, 13, 16, 15, 11] [8, 10, 14, 19, 12, 22, 18, 14]


Training:  60%|██████    | 30/50 [00:15<00:09,  2.22it/s]

Batch Sizes (Before Padding): [11, 11, 13, 11, 12, 13, 12, 8] [15, 13, 16, 11, 18, 16, 19, 9]


Training:  62%|██████▏   | 31/50 [00:16<00:08,  2.16it/s]

Batch Sizes (Before Padding): [14, 12, 9, 15, 15, 9, 14, 9] [16, 15, 11, 15, 21, 10, 17, 10]


Training:  64%|██████▍   | 32/50 [00:16<00:08,  2.16it/s]

Batch Sizes (Before Padding): [17, 13, 19, 16, 12, 10, 14, 10] [20, 19, 19, 20, 13, 12, 20, 13]


Training:  66%|██████▌   | 33/50 [00:17<00:07,  2.15it/s]

Batch Sizes (Before Padding): [14, 17, 18, 16, 12, 14, 12, 8] [18, 25, 15, 20, 15, 12, 18, 10]


Training:  68%|██████▊   | 34/50 [00:17<00:08,  1.93it/s]

Batch Sizes (Before Padding): [10, 13, 11, 9, 10, 10, 7, 7] [17, 17, 18, 11, 17, 17, 8, 9]


Training:  70%|███████   | 35/50 [00:18<00:07,  2.03it/s]

Batch Sizes (Before Padding): [10, 9, 10, 11, 15, 14, 18, 11] [8, 10, 10, 15, 19, 16, 18, 14]


Training:  72%|███████▏  | 36/50 [00:18<00:07,  1.97it/s]

Batch Sizes (Before Padding): [12, 9, 16, 9, 12, 12, 9, 8] [20, 11, 19, 15, 21, 13, 10, 9]


Training:  74%|███████▍  | 37/50 [00:19<00:06,  1.97it/s]

Batch Sizes (Before Padding): [11, 10, 19, 3, 11, 12, 10, 8] [13, 9, 18, 4, 13, 16, 11, 12]


Training:  76%|███████▌  | 38/50 [00:19<00:05,  2.04it/s]

Batch Sizes (Before Padding): [17, 15, 11, 13, 11, 9, 11, 15] [21, 19, 17, 16, 16, 10, 12, 11]


Training:  78%|███████▊  | 39/50 [00:20<00:05,  2.17it/s]

Batch Sizes (Before Padding): [18, 6, 10, 10, 12, 15, 16, 7] [22, 6, 8, 11, 14, 18, 17, 10]


Training:  80%|████████  | 40/50 [00:20<00:04,  2.12it/s]

Batch Sizes (Before Padding): [14, 15, 16, 9, 16, 8, 11, 17] [16, 16, 17, 11, 20, 10, 11, 23]


Training:  82%|████████▏ | 41/50 [00:21<00:04,  2.11it/s]

Batch Sizes (Before Padding): [8, 10, 9, 4, 11, 9, 13, 17] [9, 16, 11, 9, 11, 10, 11, 17]


Training:  84%|████████▍ | 42/50 [00:21<00:03,  2.02it/s]

Batch Sizes (Before Padding): [14, 7, 15, 9, 12, 11, 11, 17] [19, 8, 16, 11, 13, 11, 12, 16]


Training:  86%|████████▌ | 43/50 [00:22<00:03,  1.95it/s]

Batch Sizes (Before Padding): [11, 12, 9, 12, 10, 3, 8, 9] [9, 14, 10, 11, 15, 4, 9, 11]


Training:  88%|████████▊ | 44/50 [00:22<00:03,  1.86it/s]

Batch Sizes (Before Padding): [13, 9, 19, 12, 16, 13, 10, 17] [21, 11, 18, 13, 17, 19, 10, 20]


Training:  90%|█████████ | 45/50 [00:23<00:02,  1.99it/s]

Batch Sizes (Before Padding): [10, 10, 13, 12, 1, 8, 15, 11] [11, 10, 14, 10, 2, 9, 17, 11]


Training:  92%|█████████▏| 46/50 [00:23<00:02,  1.97it/s]

Batch Sizes (Before Padding): [11, 14, 17, 5, 13, 19, 14, 11] [11, 17, 19, 6, 14, 16, 18, 9]


Training:  94%|█████████▍| 47/50 [00:24<00:01,  1.97it/s]

Batch Sizes (Before Padding): [14, 18, 13, 11, 13, 9, 12, 15] [18, 16, 17, 16, 16, 10, 13, 18]


Training:  96%|█████████▌| 48/50 [00:24<00:01,  1.88it/s]

Batch Sizes (Before Padding): [15, 9, 2, 8, 14, 11, 17, 16] [16, 12, 3, 11, 16, 13, 14, 16]


Training:  98%|█████████▊| 49/50 [00:25<00:00,  2.01it/s]

Batch Sizes (Before Padding): [17, 8, 7, 14, 19, 12, 11] [16, 9, 9, 18, 23, 19, 15]


Training: 100%|██████████| 50/50 [00:25<00:00,  1.94it/s]
Evaluating:   0%|          | 0/7 [00:00<?, ?it/s]

Batch Sizes (Before Padding): [16, 14, 16, 16, 16, 16, 11, 16] [20, 18, 20, 20, 20, 19, 15, 22]


Evaluating:  14%|█▍        | 1/7 [00:01<00:06,  1.08s/it]

Batch Sizes (Before Padding): [14, 11, 16, 14, 16, 17, 12, 16] [20, 14, 20, 18, 20, 16, 13, 18]


Evaluating:  29%|██▊       | 2/7 [00:02<00:05,  1.13s/it]

Batch Sizes (Before Padding): [15, 17, 10, 11, 9, 13, 17, 14] [16, 21, 12, 12, 10, 13, 15, 11]


Evaluating:  43%|████▎     | 3/7 [00:03<00:04,  1.12s/it]

Batch Sizes (Before Padding): [13, 9, 10, 17, 18, 9, 10, 13] [15, 11, 12, 19, 18, 11, 11, 14]


Evaluating:  57%|█████▋    | 4/7 [00:04<00:03,  1.13s/it]

Batch Sizes (Before Padding): [11, 8, 16, 13, 16, 12, 12, 8] [13, 10, 14, 12, 18, 12, 12, 8]


Evaluating:  71%|███████▏  | 5/7 [00:05<00:02,  1.11s/it]

Batch Sizes (Before Padding): [13, 13, 13, 12, 13, 17, 14, 12] [21, 16, 14, 18, 23, 18, 14, 13]


Evaluating: 100%|██████████| 7/7 [00:06<00:00,  1.04it/s]


Batch Sizes (Before Padding): [11] [19]
Epoch: 2 | Time: 0m 32s
	Train Loss: 4.947 | Train PPL: 140.719 | Train Accuracy: 48.241 %
	Val Loss: 8.669 |  Val PPL: 5821.110 | Val Accuracy: 0.000 %
	BLEU Score: 0.424


Training:   0%|          | 0/50 [00:00<?, ?it/s]

Batch Sizes (Before Padding): [10, 12, 16, 9, 12, 13, 13, 12] [10, 20, 17, 11, 14, 19, 14, 13]


Training:   2%|▏         | 1/50 [00:00<00:32,  1.52it/s]

Batch Sizes (Before Padding): [7, 15, 8, 14, 12, 9, 17, 10] [9, 21, 10, 20, 13, 9, 20, 8]


Training:   4%|▍         | 2/50 [00:01<00:24,  1.98it/s]

Batch Sizes (Before Padding): [17, 11, 9, 16, 11, 14, 18, 1] [14, 13, 10, 20, 13, 18, 15, 4]


Training:   6%|▌         | 3/50 [00:01<00:23,  2.04it/s]

Batch Sizes (Before Padding): [9, 10, 10, 8, 14, 14, 10, 12] [9, 15, 10, 10, 19, 12, 11, 15]


Training:   8%|▊         | 4/50 [00:01<00:21,  2.13it/s]

Batch Sizes (Before Padding): [11, 8, 8, 13, 10, 8, 15, 17] [13, 9, 10, 19, 12, 8, 16, 16]


Training:  10%|█         | 5/50 [00:02<00:22,  2.01it/s]

Batch Sizes (Before Padding): [10, 17, 14, 17, 8, 11, 12, 16] [11, 22, 17, 22, 9, 11, 13, 20]


Training:  12%|█▏        | 6/50 [00:02<00:20,  2.12it/s]

Batch Sizes (Before Padding): [9, 10, 7, 9, 9, 15, 9, 19] [7, 10, 9, 11, 15, 17, 10, 18]


Training:  14%|█▍        | 7/50 [00:03<00:20,  2.08it/s]

Batch Sizes (Before Padding): [3, 13, 13, 12, 6, 11, 13, 7] [4, 17, 16, 15, 8, 17, 17, 11]


Training:  16%|█▌        | 8/50 [00:03<00:19,  2.12it/s]

Batch Sizes (Before Padding): [11, 18, 13, 3, 14, 9, 17, 11] [9, 19, 12, 4, 19, 12, 19, 15]


Training:  18%|█▊        | 9/50 [00:04<00:18,  2.17it/s]

Batch Sizes (Before Padding): [11, 11, 7, 7, 15, 14, 14, 15] [11, 11, 9, 9, 15, 14, 17, 19]


Training:  20%|██        | 10/50 [00:04<00:18,  2.21it/s]

Batch Sizes (Before Padding): [14, 17, 9, 10, 15, 10, 15, 13] [16, 22, 10, 11, 17, 9, 11, 15]


Training:  22%|██▏       | 11/50 [00:05<00:18,  2.14it/s]

Batch Sizes (Before Padding): [17, 12, 11, 7, 16, 11, 9, 11] [20, 13, 14, 10, 20, 9, 10, 12]


Training:  24%|██▍       | 12/50 [00:05<00:18,  2.06it/s]

Batch Sizes (Before Padding): [9, 9, 15, 12, 8, 12, 5, 15] [10, 11, 27, 16, 9, 13, 4, 19]


Training:  26%|██▌       | 13/50 [00:06<00:22,  1.66it/s]

Batch Sizes (Before Padding): [7, 9, 8, 8, 15, 13, 16, 11] [8, 11, 11, 10, 18, 13, 22, 10]


Training:  28%|██▊       | 14/50 [00:07<00:21,  1.69it/s]

Batch Sizes (Before Padding): [15, 11, 17, 11, 14, 14, 17, 9] [13, 16, 18, 16, 17, 14, 21, 10]


Training:  30%|███       | 15/50 [00:07<00:19,  1.84it/s]

Batch Sizes (Before Padding): [9, 10, 8, 16, 12, 13, 12, 7] [12, 17, 9, 17, 18, 10, 11, 11]


Training:  32%|███▏      | 16/50 [00:08<00:17,  1.95it/s]

Batch Sizes (Before Padding): [11, 16, 11, 8, 9, 10, 8, 9] [14, 21, 14, 9, 10, 12, 11, 10]


Training:  34%|███▍      | 17/50 [00:08<00:15,  2.10it/s]

Batch Sizes (Before Padding): [11, 13, 12, 10, 14, 2, 11, 8] [15, 17, 17, 13, 16, 3, 13, 9]


Training:  36%|███▌      | 18/50 [00:08<00:15,  2.09it/s]

Batch Sizes (Before Padding): [11, 13, 18, 12, 6, 12, 12, 10] [9, 13, 22, 13, 6, 19, 18, 15]


Training:  38%|███▊      | 19/50 [00:09<00:14,  2.13it/s]

Batch Sizes (Before Padding): [8, 7, 10, 14, 11, 9, 14, 15] [9, 7, 10, 16, 10, 10, 16, 15]


Training:  40%|████      | 20/50 [00:09<00:14,  2.08it/s]

Batch Sizes (Before Padding): [15, 3, 15, 14, 15, 11, 12, 13] [14, 4, 19, 23, 18, 15, 15, 16]


Training:  42%|████▏     | 21/50 [00:10<00:13,  2.08it/s]

Batch Sizes (Before Padding): [16, 19, 15, 16, 15, 8, 8, 14] [19, 18, 17, 13, 12, 9, 9, 18]


Training:  44%|████▍     | 22/50 [00:10<00:13,  2.05it/s]

Batch Sizes (Before Padding): [12, 10, 12, 13, 15, 9, 9, 10] [14, 11, 15, 13, 17, 11, 11, 8]


Training:  46%|████▌     | 23/50 [00:11<00:15,  1.79it/s]

Batch Sizes (Before Padding): [12, 10, 11, 13, 11, 9, 10, 9] [18, 12, 15, 13, 17, 10, 10, 9]


Training:  48%|████▊     | 24/50 [00:12<00:14,  1.77it/s]

Batch Sizes (Before Padding): [18, 10, 16, 11, 10, 10, 9, 10] [17, 12, 19, 13, 9, 12, 10, 10]


Training:  50%|█████     | 25/50 [00:12<00:13,  1.91it/s]

Batch Sizes (Before Padding): [18, 13, 14, 14, 7, 15, 11, 17] [18, 22, 14, 18, 7, 22, 18, 15]


Training:  52%|█████▏    | 26/50 [00:13<00:11,  2.07it/s]

Batch Sizes (Before Padding): [8, 16, 8, 11, 10, 13, 12, 7] [13, 17, 9, 11, 8, 16, 14, 8]


Training:  54%|█████▍    | 27/50 [00:13<00:10,  2.13it/s]

Batch Sizes (Before Padding): [10, 19, 10, 15, 12, 11, 14, 11] [11, 16, 10, 17, 11, 14, 14, 14]


Training:  56%|█████▌    | 28/50 [00:13<00:10,  2.12it/s]

Batch Sizes (Before Padding): [14, 19, 17, 14, 9, 14, 5, 14] [16, 23, 23, 17, 10, 18, 6, 17]


Training:  58%|█████▊    | 29/50 [00:14<00:09,  2.18it/s]

Batch Sizes (Before Padding): [10, 10, 16, 9, 10, 10, 9, 12] [8, 8, 17, 12, 8, 16, 11, 13]


Training:  60%|██████    | 30/50 [00:14<00:08,  2.28it/s]

Batch Sizes (Before Padding): [10, 12, 17, 14, 16, 19, 12, 12] [10, 13, 25, 19, 17, 19, 15, 15]


Training:  62%|██████▏   | 31/50 [00:15<00:08,  2.26it/s]

Batch Sizes (Before Padding): [9, 10, 10, 10, 13, 12, 18, 13] [11, 8, 16, 17, 16, 13, 22, 14]


Training:  64%|██████▍   | 32/50 [00:15<00:08,  2.21it/s]

Batch Sizes (Before Padding): [13, 8, 5, 4, 9, 8, 14, 9] [11, 12, 4, 9, 10, 8, 17, 11]


Training:  66%|██████▌   | 33/50 [00:16<00:08,  1.99it/s]

Batch Sizes (Before Padding): [14, 10, 10, 10, 16, 10, 9, 16] [16, 12, 12, 13, 17, 17, 12, 20]


Training:  68%|██████▊   | 34/50 [00:16<00:08,  1.92it/s]

Batch Sizes (Before Padding): [12, 7, 12, 12, 10, 11, 13, 16] [11, 6, 13, 21, 17, 16, 17, 20]


Training:  70%|███████   | 35/50 [00:17<00:08,  1.86it/s]

Batch Sizes (Before Padding): [17, 19, 12, 10, 15, 17, 14, 15] [17, 18, 12, 11, 19, 16, 14, 19]


Training:  72%|███████▏  | 36/50 [00:18<00:08,  1.75it/s]

Batch Sizes (Before Padding): [12, 9, 18, 13, 14, 12, 18, 14] [19, 10, 21, 16, 16, 10, 16, 16]


Training:  74%|███████▍  | 37/50 [00:18<00:07,  1.82it/s]

Batch Sizes (Before Padding): [10, 16, 14, 10, 15, 9, 10, 18] [8, 19, 20, 10, 13, 11, 10, 16]


Training:  76%|███████▌  | 38/50 [00:19<00:06,  1.91it/s]

Batch Sizes (Before Padding): [13, 10, 16, 8, 9, 10, 11, 18] [13, 15, 21, 8, 10, 14, 12, 21]


Training:  78%|███████▊  | 39/50 [00:19<00:06,  1.78it/s]

Batch Sizes (Before Padding): [12, 12, 7, 9, 10, 11, 11, 8] [16, 14, 8, 10, 10, 11, 12, 8]


Training:  80%|████████  | 40/50 [00:20<00:05,  1.96it/s]

Batch Sizes (Before Padding): [10, 13, 11, 17, 10, 13, 7, 19] [12, 12, 14, 17, 14, 19, 9, 23]


Training:  82%|████████▏ | 41/50 [00:20<00:04,  1.84it/s]

Batch Sizes (Before Padding): [11, 17, 13, 17, 14, 12, 14, 1] [15, 22, 16, 21, 20, 14, 18, 2]


Training:  84%|████████▍ | 42/50 [00:21<00:04,  1.89it/s]

Batch Sizes (Before Padding): [14, 10, 14, 9, 8, 11, 14, 16] [16, 13, 13, 12, 11, 13, 19, 20]


Training:  86%|████████▌ | 43/50 [00:21<00:03,  1.93it/s]

Batch Sizes (Before Padding): [16, 12, 15, 12, 10, 13, 15, 9] [16, 13, 18, 13, 12, 14, 26, 9]


Training:  88%|████████▊ | 44/50 [00:22<00:03,  1.91it/s]

Batch Sizes (Before Padding): [10, 11, 11, 11, 11, 8, 15, 10] [9, 11, 12, 12, 17, 10, 20, 9]


Training:  90%|█████████ | 45/50 [00:22<00:02,  1.99it/s]

Batch Sizes (Before Padding): [13, 11, 13, 15, 16, 13, 16, 13] [14, 12, 21, 22, 19, 16, 15, 20]


Training:  92%|█████████▏| 46/50 [00:23<00:02,  1.52it/s]

Batch Sizes (Before Padding): [8, 17, 12, 11, 8, 12, 12, 9] [8, 19, 13, 9, 9, 14, 18, 11]


Training:  94%|█████████▍| 47/50 [00:24<00:02,  1.44it/s]

Batch Sizes (Before Padding): [11, 17, 11, 18, 10, 11, 14, 16] [18, 18, 11, 17, 9, 10, 18, 20]


Training:  96%|█████████▌| 48/50 [00:25<00:01,  1.49it/s]

Batch Sizes (Before Padding): [15, 14, 15, 15, 15, 17, 13, 14] [18, 18, 21, 16, 21, 20, 11, 18]


Training:  98%|█████████▊| 49/50 [00:25<00:00,  1.54it/s]

Batch Sizes (Before Padding): [10, 14, 11, 10, 10, 10, 15] [11, 16, 11, 12, 8, 15, 16]


Training: 100%|██████████| 50/50 [00:26<00:00,  1.91it/s]
Evaluating:   0%|          | 0/7 [00:00<?, ?it/s]

Batch Sizes (Before Padding): [16, 14, 16, 16, 16, 16, 11, 16] [20, 18, 20, 20, 20, 19, 15, 22]


Evaluating:  14%|█▍        | 1/7 [00:01<00:06,  1.11s/it]

Batch Sizes (Before Padding): [14, 11, 16, 14, 16, 17, 12, 16] [20, 14, 20, 18, 20, 16, 13, 18]


Evaluating:  29%|██▊       | 2/7 [00:02<00:05,  1.03s/it]

Batch Sizes (Before Padding): [15, 17, 10, 11, 9, 13, 17, 14] [16, 21, 12, 12, 10, 13, 15, 11]


Evaluating:  43%|████▎     | 3/7 [00:03<00:04,  1.01s/it]

Batch Sizes (Before Padding): [13, 9, 10, 17, 18, 9, 10, 13] [15, 11, 12, 19, 18, 11, 11, 14]


Evaluating:  57%|█████▋    | 4/7 [00:04<00:02,  1.00it/s]

Batch Sizes (Before Padding): [11, 8, 16, 13, 16, 12, 12, 8] [13, 10, 14, 12, 18, 12, 12, 8]


Evaluating:  71%|███████▏  | 5/7 [00:05<00:02,  1.02s/it]

Batch Sizes (Before Padding): [13, 13, 13, 12, 13, 17, 14, 12] [21, 16, 14, 18, 23, 18, 14, 13]


Evaluating: 100%|██████████| 7/7 [00:06<00:00,  1.11it/s]


Batch Sizes (Before Padding): [11] [19]
Epoch: 3 | Time: 0m 32s
	Train Loss: 4.720 | Train PPL: 112.211 | Train Accuracy: 48.250 %
	Val Loss: 7.846 |  Val PPL: 2555.870 | Val Accuracy: 0.000 %
	BLEU Score: 0.424


Training:   0%|          | 0/50 [00:00<?, ?it/s]

Batch Sizes (Before Padding): [8, 2, 7, 10, 11, 15, 15, 11] [9, 3, 9, 13, 17, 17, 13, 15]


Training:   0%|          | 0/50 [00:00<?, ?it/s]


KeyboardInterrupt: 

In [59]:
def draw(mode, live_update=False):
    plt.figure(figsize=(10, 6))

    for draw_epoch in range(1, total_epoch + 1):  # Change the loop variable name to avoid conflict
        if mode == 'loss':
            train = read('/kaggle/working/result/train_loss.txt')[:draw_epoch]
            valid = read('/kaggle/working/result/test_loss.txt')[:draw_epoch]
            plt.plot(train, 'r', label='train')
            plt.plot(valid, 'b', label='validation')
            plt.title('Train/Validation Loss vs. Epoch')
            plt.legend(loc='upper right')

        elif mode == 'bleu':
            bleu = read('/kaggle/working/result/bleu.txt')[:draw_epoch]
            plt.plot(bleu, 'b', label='BLEU score')
            plt.title('BLEU Score vs. Epoch')
            plt.legend(loc='lower right')

        elif mode == 'accuracy':
            accuracy = read('/kaggle/working/result/accuracies.txt')[:draw_epoch]
            plt.plot(accuracy, 'g', label='accuracy')
            plt.title('Accuracy vs. Epoch')
            plt.legend(loc='lower right')

        plt.xlabel('Epoch')
        plt.ylabel(mode.capitalize())
        plt.grid(True, which='both', axis='both')

        if live_update:
            clear_output(wait=True)
            # display(plt.gcf())
            time.sleep(1)  # Optional: Pause for a moment between updates
        else:
            plt.show()

In [57]:
# test_iter=DataLoader(dataset, batch_size=16, shuffle=True, collate_fn=lambda batch: custom_collate(batch, dataset.src_sos_idx, dataset.src_eos_idx, dataset.trg_sos_idx, dataset.trg_eos_idx, dataset.src_pad_idx, dataset.trg_pad_idx))

def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)


model = Transformer(src_pad_idx=dataset.src_pad_idx,
                    trg_pad_idx=dataset.trg_pad_idx,
                    trg_sos_idx=dataset.trg_sos_idx,
                    d_model=d_model,
                    enc_voc_size=dataset.enc_voc_size,
                    dec_voc_size=dataset.dec_voc_size,
                    max_len=max_len,
                    ffn_hidden=ffn_hidden,
                    n_head=n_heads,
                    n_layers=n_layers,
                    drop_prob=drop_prob,
                    device=device).to(device)

print(f'The model has {count_parameters(model):,} trainable parameters')


def test_model(num_examples):
    iterator = test_iter
    model.load_state_dict(torch.load("/Users/romankasichhwa/Desktop/project/transformer/saved/model-1.pt"))

    with torch.no_grad():
        batch_bleu = []
        for i, batch in enumerate(iterator):
            src,trg = batch
            
            output = model(src, trg[:, :-1])

            total_bleu = []
            for j in range(num_examples):
                try:
                    src_words = idx_to_word(src[j], dataset.src_vocab_dict)
                    trg_words = idx_to_word(trg[j], dataset.trg_vocab_dict)
                    output_words = output[j].max(dim=1)[1]
                    output_words = idx_to_word(output_words, dataset.trg_vocab)

                    print('source :', src_words)
                    print('target :', trg_words)
                    print('predicted :', output_words)
                    print()
                    bleu = get_bleu(hypotheses=output_words.split(), reference=trg_words.split())
                    total_bleu.append(bleu)
                except:
                    pass
            try:
                total_bleu = sum(total_bleu) / len(total_bleu)
                print('BLEU SCORE = {}'.format(total_bleu))
            except ZeroDivisionError:
                print('No BLEU scores were calculated')
#             total_bleu = sum(total_bleu) / len(total_bleu)
#             print('BLEU SCORE = {}'.format(total_bleu))
            batch_bleu.append(total_bleu)

        batch_bleu = sum(batch_bleu) / len(batch_bleu)
        print('TOTAL BLEU SCORE = {}'.format(batch_bleu))


if __name__ == '__main__':
    test_model(num_examples=batch_size)


The model has 34,473,713 trainable parameters
No BLEU scores were calculated
No BLEU scores were calculated
No BLEU scores were calculated
No BLEU scores were calculated
No BLEU scores were calculated
No BLEU scores were calculated
No BLEU scores were calculated


TypeError: unsupported operand type(s) for +: 'int' and 'list'