# install required dependencies**


In [1]:
!pip install tqdm spacy 



In [2]:
!pip install indic-nlp-library

Collecting indic-nlp-library
  Downloading indic_nlp_library-0.92-py3-none-any.whl.metadata (5.7 kB)
Collecting sphinx-argparse (from indic-nlp-library)
  Downloading sphinx_argparse-0.4.0-py3-none-any.whl (12 kB)
Collecting morfessor (from indic-nlp-library)
  Downloading Morfessor-2.0.6-py3-none-any.whl (35 kB)
Collecting sphinx>=1.2.0 (from sphinx-argparse->indic-nlp-library)
  Downloading sphinx-7.2.6-py3-none-any.whl.metadata (5.9 kB)
Collecting sphinxcontrib-applehelp (from sphinx>=1.2.0->sphinx-argparse->indic-nlp-library)
  Downloading sphinxcontrib_applehelp-1.0.8-py3-none-any.whl.metadata (2.3 kB)
Collecting sphinxcontrib-devhelp (from sphinx>=1.2.0->sphinx-argparse->indic-nlp-library)
  Downloading sphinxcontrib_devhelp-1.0.6-py3-none-any.whl.metadata (2.3 kB)
Collecting sphinxcontrib-jsmath (from sphinx>=1.2.0->sphinx-argparse->indic-nlp-library)
  Downloading sphinxcontrib_jsmath-1.0.1-py2.py3-none-any.whl (5.1 kB)
Collecting sphinxcontrib-htmlhelp>=2.0.0 (from sphinx>=1.2

# IMPORT REQUIRED LIBRARIES

In [4]:
import math
import random
import time
from torch import nn, optim
from torch.optim import Adam
import torch
from tqdm import tqdm 
import spacy
import re
import os
from indicnlp.tokenize import indic_tokenize
import matplotlib.pyplot as plt
import numpy as np
from collections import Counter
from torch.utils.data import Dataset,DataLoader
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if torch.cuda.is_available():
    print("Runnign on GPU")
else:
    print("No GPU")


Runnign on GPU


# EMBEDDING PART

In [5]:
class TokenEmbedding(nn.Embedding):
    """
    Token Embedding using torch.nn
    they will dense representation of word using weighted matrix
    """

    def __init__(self, vocab_size, d_model):
        """
        class for token embedding that included positional information

        :param vocab_size: size of vocabulary
        :param d_model: dimensions of model
        """
        super().__init__(vocab_size, d_model, padding_idx=1)

In [6]:
class PositionalEncoding(nn.Module):
    """
    compute sinusoid encoding.
    """

    def __init__(self, d_model, max_len, device):
        """
        constructor of sinusoid encoding class

        :param d_model: dimension of model
        :param max_len: max sequence length
        :param device: hardware device setting
        """
        super(PositionalEncoding, self).__init__()

        # same size with input matrix (for adding with input matrix)
        self.encoding = torch.zeros(max_len, d_model, device=device)
        self.encoding.requires_grad = False  # we don't need to compute gradient

        pos = torch.arange(0, max_len, device=device)
        pos = pos.float().unsqueeze(dim=1)
        # 1D => 2D unsqueeze to represent word's position

        _2i = torch.arange(0, d_model, step=2, device=device).float()
        # 'i' means index of d_model (e.g. embedding size = 50, 'i' = [0,50])
        # "step=2" means 'i' multiplied with two (same with 2 * i)

        self.encoding[:, 0::2] = torch.sin(pos / (10000 ** (_2i / d_model)))
        self.encoding[:, 1::2] = torch.cos(pos / (10000 ** (_2i / d_model)))
        # compute positional encoding to consider positional information of words

    def forward(self, x):
        # self.encoding
        # [max_len = 512, d_model = 512]

        batch_size, seq_len = x.size()
        # [batch_size = 128, seq_len = 30]

        return self.encoding[:seq_len, :]
        # [seq_len = 30, d_model = 512]
        # it will add with tok_emb : [128, 30, 512]


In [7]:
class TransformerEmbedding(nn.Module):
    """
    token embedding + positional encoding (sinusoid)
    positional encoding can give positional information to network
    """

    def __init__(self, vocab_size, d_model, max_len, drop_prob, device):
        """
        class for word embedding that included positional information

        :param vocab_size: size of vocabulary
        :param d_model: dimensions of model
        """
        
        super(TransformerEmbedding, self).__init__()
        self.tok_emb = TokenEmbedding(vocab_size, d_model)
        self.pos_emb = PositionalEncoding(d_model, max_len, device)
        self.drop_out = nn.Dropout(p=drop_prob)
        self.device=device
        

    def forward(self, x):
        x = x.long().to(self.device)
        tok_emb = self.tok_emb(x)
        pos_emb = self.pos_emb(x)
        return self.drop_out(tok_emb + pos_emb)

# MODEL BLOCKS

In [8]:
class Encoder(nn.Module):

    def __init__(self, enc_voc_size, max_len, d_model, ffn_hidden, n_head, n_layers, drop_prob, device):
        super().__init__()
        self.emb = TransformerEmbedding(d_model=d_model,
                                        max_len=max_len,
                                        vocab_size=enc_voc_size,
                                        drop_prob=drop_prob,
                                        device=device)

        self.layers = nn.ModuleList([EncoderLayer(d_model=d_model,
                                                  ffn_hidden=ffn_hidden,
                                                  n_head=n_head,
                                                  drop_prob=drop_prob)
                                     for _ in range(n_layers)])

    def forward(self, x, src_mask):
        x = self.emb(x)

        for layer in self.layers:
            x = layer(x, src_mask)

        return x

In [9]:
class Decoder(nn.Module):
    def __init__(self, dec_voc_size, max_len, d_model, ffn_hidden, n_head, n_layers, drop_prob, device):
        super().__init__()
        self.emb = TransformerEmbedding(d_model=d_model,
                                        drop_prob=drop_prob,
                                        max_len=max_len,
                                        vocab_size=dec_voc_size,
                                        device=device)

        self.layers = nn.ModuleList([DecoderLayer(d_model=d_model,
                                                  ffn_hidden=ffn_hidden,
                                                  n_head=n_head,
                                                  drop_prob=drop_prob)
                                     for _ in range(n_layers)])

        self.linear = nn.Linear(d_model, dec_voc_size)

    def forward(self, trg, enc_src, trg_mask, src_mask):
        trg = self.emb(trg)

        for layer in self.layers:
            trg = layer(trg, enc_src, trg_mask, src_mask)

        # pass to LM head
        output = self.linear(trg)
        return output

In [10]:
class Transformer(nn.Module):

    def __init__(self, src_pad_idx, trg_pad_idx, trg_sos_idx, enc_voc_size, dec_voc_size, d_model, n_head, max_len,
                 ffn_hidden, n_layers, drop_prob, device):
        super().__init__()
        self.src_pad_idx = src_pad_idx
        self.trg_pad_idx = trg_pad_idx
        self.trg_sos_idx = trg_sos_idx
        self.device = device
        self.encoder = Encoder(d_model=d_model,
                               n_head=n_head,
                               max_len=max_len,
                               ffn_hidden=ffn_hidden,
                               enc_voc_size=enc_voc_size,
                               drop_prob=drop_prob,
                               n_layers=n_layers,
                               device=device)

        self.decoder = Decoder(d_model=d_model,
                               n_head=n_head,
                               max_len=max_len,
                               ffn_hidden=ffn_hidden,
                               dec_voc_size=dec_voc_size,
                               drop_prob=drop_prob,
                               n_layers=n_layers,
                               device=device)

    def forward(self, src, trg):
        src_mask = self.make_src_mask(src)
        trg_mask = self.make_trg_mask(trg)
        enc_src = self.encoder(src, src_mask)
        output = self.decoder(trg, enc_src, trg_mask, src_mask)
        return output

    def make_src_mask(self, src):
        src_mask = (src != self.src_pad_idx).unsqueeze(1).unsqueeze(2)
        return src_mask

    def make_trg_mask(self, trg):
        trg_pad_mask = (trg != self.trg_pad_idx).unsqueeze(1).unsqueeze(3)
        trg_len = trg.shape[1]
        trg_sub_mask = torch.tril(torch.ones(trg_len, trg_len)).type(torch.ByteTensor).to(trg.device)
        trg_mask = trg_pad_mask & trg_sub_mask
        return trg_mask

In [11]:
class ScaleDotProductAttention(nn.Module):
    """
    compute scale dot product attention

    Query : given sentence that we focused on (decoder)
    Key : every sentence to check relationship with Qeury(encoder)
    Value : every sentence same with Key (encoder)
    """

    def __init__(self):
        super(ScaleDotProductAttention, self).__init__()
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, q, k, v, mask=None, e=1e-12):
        # input is 4 dimension tensor
        # [batch_size, head, length, d_tensor]
        batch_size, head, length, d_tensor = k.size()

        # 1. dot product Query with Key^T to compute similarity
        k_t = k.transpose(2, 3)  # transpose
        score = (q @ k_t) / math.sqrt(d_tensor)  # scaled dot product

        # 2. apply masking (opt)
        if mask is not None:
            mask=mask.to(q.device)
            score = score.masked_fill(mask == 0, -10000)

        # 3. pass them softmax to make [0, 1] range
        score = self.softmax(score)

        # 4. multiply with Value
        v = score @ v

        return v, score

In [12]:
class MultiHeadAttention(nn.Module):

    def __init__(self, d_model, n_head):
        super(MultiHeadAttention, self).__init__()
        self.n_head = n_head
        self.attention = ScaleDotProductAttention()
        self.w_q = nn.Linear(d_model, d_model)
        self.w_k = nn.Linear(d_model, d_model)
        self.w_v = nn.Linear(d_model, d_model)
        self.w_concat = nn.Linear(d_model, d_model)

    def forward(self, q, k, v, mask=None):
        # 1. dot product with weight matrices
        q, k, v = self.w_q(q), self.w_k(k), self.w_v(v)

        # 2. split tensor by number of heads
        q, k, v = self.split(q), self.split(k), self.split(v)

        # 3. do scale dot product to compute similarity
        out, attention = self.attention(q, k, v, mask=mask)

        # 4. concat and pass to linear layer
        out = self.concat(out)
        out = self.w_concat(out)

        # 5. visualize attention map
        # TODO : we should implement visualization

        return out

    def split(self, tensor):
        """
        split tensor by number of head

        :param tensor: [batch_size, length, d_model]
        :return: [batch_size, head, length, d_tensor]
        """
        batch_size, length, d_model = tensor.size()

        d_tensor = d_model // self.n_head
        tensor = tensor.view(batch_size, length, self.n_head, d_tensor).transpose(1, 2)
        # it is similar with group convolution (split by number of heads)

        return tensor

    def concat(self, tensor):
        """
        inverse function of self.split(tensor : torch.Tensor)

        :param tensor: [batch_size, head, length, d_tensor]
        :return: [batch_size, length, d_model]
        """
        batch_size, head, length, d_tensor = tensor.size()
        d_model = head * d_tensor

        tensor = tensor.transpose(1, 2).contiguous().view(batch_size, length, d_model)
        return tensor


In [13]:
class EncoderLayer(nn.Module):

    def __init__(self, d_model, ffn_hidden, n_head, drop_prob):
        super(EncoderLayer, self).__init__()
        self.attention = MultiHeadAttention(d_model=d_model, n_head=n_head)
        self.norm1 = LayerNorm(d_model=d_model)
        self.dropout1 = nn.Dropout(p=drop_prob)

        self.ffn = PositionwiseFeedForward(d_model=d_model, hidden=ffn_hidden, drop_prob=drop_prob)
        self.norm2 = LayerNorm(d_model=d_model)
        self.dropout2 = nn.Dropout(p=drop_prob)

    def forward(self, x, src_mask):
        # 1. compute self attention
        _x = x
        x = self.attention(q=x, k=x, v=x, mask=src_mask)
        
        # 2. add and norm
        x = self.dropout1(x)
        x = self.norm1(x + _x)
        
        # 3. positionwise feed forward network
        _x = x
        x = self.ffn(x)
      
        # 4. add and norm
        x = self.dropout2(x)
        x = self.norm2(x + _x)
        return x


In [14]:
class DecoderLayer(nn.Module):

    def __init__(self, d_model, ffn_hidden, n_head, drop_prob):
        super(DecoderLayer, self).__init__()
        self.self_attention = MultiHeadAttention(d_model=d_model, n_head=n_head)
        self.norm1 = LayerNorm(d_model=d_model)
        self.dropout1 = nn.Dropout(p=drop_prob)

        self.enc_dec_attention = MultiHeadAttention(d_model=d_model, n_head=n_head)
        self.norm2 = LayerNorm(d_model=d_model)
        self.dropout2 = nn.Dropout(p=drop_prob)

        self.ffn = PositionwiseFeedForward(d_model=d_model, hidden=ffn_hidden, drop_prob=drop_prob)
        self.norm3 = LayerNorm(d_model=d_model)
        self.dropout3 = nn.Dropout(p=drop_prob)

    def forward(self, dec, enc, trg_mask, src_mask):
        # 1. compute self attention
        _x = dec
        x = self.self_attention(q=dec, k=dec, v=dec, mask=trg_mask)
        
        # 2. add and norm
        x = self.dropout1(x)
        x = self.norm1(x + _x)

        if enc is not None:
            # 3. compute encoder - decoder attention
            _x = x
            x = self.enc_dec_attention(q=x, k=enc, v=enc, mask=src_mask)
            
            # 4. add and norm
            x = self.dropout2(x)
            x = self.norm2(x + _x)

        # 5. positionwise feed forward network
        _x = x
        x = self.ffn(x)
        
        # 6. add and norm
        x = self.dropout3(x)
        x = self.norm3(x + _x)
        return x

In [15]:
class LayerNorm(nn.Module):
    def __init__(self, d_model, eps=1e-12):
        super(LayerNorm, self).__init__()
        self.gamma = nn.Parameter(torch.ones(d_model))
        self.beta = nn.Parameter(torch.zeros(d_model))
        self.eps = eps

    def forward(self, x):
        mean = x.mean(-1, keepdim=True)
        var = x.var(-1, unbiased=False, keepdim=True)
        # '-1' means last dimension. 

        out = (x - mean) / torch.sqrt(var + self.eps)
        out = self.gamma * out + self.beta
        return out


In [16]:
class PositionwiseFeedForward(nn.Module):

    def __init__(self, d_model, hidden, drop_prob=0.1):
        super(PositionwiseFeedForward, self).__init__()
        self.linear1 = nn.Linear(d_model, hidden)
        self.linear2 = nn.Linear(hidden, d_model)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=drop_prob)

    def forward(self, x):
        x = self.linear1(x)
        x = self.relu(x)
        x = self.dropout(x)
        x = self.linear2(x)
        return x

In [17]:
def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [18]:
def bleu_stats(hypothesis, reference):
    """Compute statistics for BLEU."""
    stats = []
    stats.append(len(hypothesis))
    stats.append(len(reference))
    for n in range(1, 5):
        s_ngrams = Counter(
            [tuple(hypothesis[i:i + n]) for i in range(len(hypothesis) + 1 - n)]
        )
        r_ngrams = Counter(
            [tuple(reference[i:i + n]) for i in range(len(reference) + 1 - n)]
        )

        stats.append(max([sum((s_ngrams & r_ngrams).values()), 0]))
        stats.append(max([len(hypothesis) + 1 - n, 0]))
    return stats


def bleu(stats):
    """Compute BLEU given n-gram statistics."""
    if len(list(filter(lambda x: x == 0, stats))) > 0:
        return 0
    (c, r) = stats[:2]
    log_bleu_prec = sum(
        [math.log(float(x) / y) for x, y in zip(stats[2::2], stats[3::2])]
    ) / 4.
    return math.exp(min([0, 1 - float(r) / c]) + log_bleu_prec)


def get_bleu(hypotheses, reference):
    """Get validation BLEU score for dev set."""
    stats = np.array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])
    for hyp, ref in zip(hypotheses, reference):
        stats += np.array(bleu_stats(hyp, ref))
    return 100 * bleu(stats)


def idx_to_word(x, vocab):
    words = []
    for i in x:
        word = vocab.itos[i]
        if '<' not in word:
            words.append(word)
    words = " ".join(words)
    return words


# DATA LOADING , TOKENIZATION, VOCAB_BUILDING, TOKEN_TO_INDEX

In [None]:


def load_tokenizers():
    return indic_tokenize, spacy.load('en_core_web_sm')

def tokenize_ne(text: str, tokenizer):
        return [tok for tok in tokenizer.trivial_tokenize(text)]



def tokenize_en(text:str,tokenizer):
    return [tok.text for tok in tokenizer.tokenizer(text)]





class CustomDataset(Dataset):
    def __init__(self, source: str, target: str):
        self.nepali_root = source
        self.english_root = target
        self.current=0
        self.max_src_len=0
        self.max_trg_len=0
        self.tokenizers =load_tokenizers()
        self.src_vocab=set()
        self.trg_vocab=set()
        self.src_vocab.update(['<sos>', '<eos>', '<pad>','<unk>']) 
        self.trg_vocab.update(['<sos>', '<eos>', '<pad>','<unk>'])
        self.total_sentences = 0
        self.data = []

        
        with open(self.nepali_root, 'r') as nepali, open(self.english_root, 'r') as english:
            for nep, eng in zip(nepali, english):
                tokenized_nepali = tokenize_ne(nep, self.tokenizers[0])
                tokenized_eng = tokenize_en(eng, self.tokenizers[1])
                self.total_sentences += 1
                self.max_src_len = max(self.max_src_len, len(tokenized_nepali))
                self.max_trg_len = max(self.max_trg_len, len(tokenized_eng))
                self.src_vocab.update(tokenized_nepali)
                self.trg_vocab.update(tokenized_eng)
                self.data.append((tokenized_nepali, tokenized_eng))
                random.shuffle(self.data)

        
        self.src_vocab_dict={word: i for i, word in enumerate(self.src_vocab)}
        self.trg_vocab_dict={word: i for i, word in enumerate(self.trg_vocab)}
        

        self.trg_pad_idx = self.trg_vocab_dict['<pad>']
        self.trg_sos_idx = self.trg_vocab_dict['<sos>']
        self.trg_eos_idx = self.trg_vocab_dict['<eos>']
        self.src_pad_idx = self.src_vocab_dict['<pad>']
        self.src_sos_idx = self.src_vocab_dict['<sos>']   
        self.src_eos_idx = self.src_vocab_dict['<eos>']
        self.src_unk_idx = self.src_vocab_dict['<unk>'] 
        self.trg_unk_idx = self.trg_vocab_dict['<unk>']
        self.enc_voc_size = len(self.src_vocab)
        self.dec_voc_size= len(self.trg_vocab)
        
        random.shuffle(self.data)
        train_ratio = 0.8
        val_ratio = 0.1
        data_len = len(self.data)
        train_size = int(data_len * train_ratio)
        val_size = int(data_len * val_ratio)
        test_size = data_len - train_size - val_size


        self.train_data = self.data[:train_size]
        self.val_data = self.data[train_size:train_size+val_size]
        self.test_data = self.data[train_size+val_size:]
  


    def __len__(self):
        if self.train_data:
            return len(self.train_data)
        elif self.val_data:
            return len(self.val_data)
        elif self.test_data:
            return len(self.test_data)
        else:
            raise ValueError("No data available!")


    def __getitem__(self, idx):
        src, trg = self.train_data[idx]
        src = [self.src_vocab_dict[token] if token in self.src_vocab_dict else self.src_vocab_dict['<unk>'] for token in src]
        trg = [self.trg_vocab_dict[token] if token in self.trg_vocab_dict else self.trg_vocab_dict['<unk>'] for token in trg]
        return [src, trg]   
       


    def __iter__(self):
        return self    



    def __next__(self):
        if self.current < len(self.train_data):
            self.current += 1
            return self.__getitem__(self.current)

        raise StopIteration 


    def printv(self):
        print(self.src_vocab_dict)

           

def custom_collate(batch, src_sos_idx, src_eos_idx, trg_sos_idx, trg_eos_idx, src_pad_idx, trg_pad_idx,src_vocab_dict:dict,trg_vocab_dict:dict):
    src_batch, trg_batch = zip(*batch)
#     print(src_batch,trg_batch)
    # print("Batch Sizes (Before Padding):", [len(src) for src in src_batch], [len(trg) for trg in trg_batch])
    src_batch = [[src_vocab_dict[token] if token in src_vocab_dict else src_vocab_dict['<unk>'] for token in src] for src in src_batch]
    trg_batch = [[trg_vocab_dict[token] if token in trg_vocab_dict else trg_vocab_dict['<unk>'] for token in trg] for trg in trg_batch]
    # Pad sequences to the fixed length max_len
    padded_src = [torch.cat([torch.tensor([src_sos_idx]), torch.tensor(src), torch.tensor([src_eos_idx]), torch.full((max_len - len(src) - 2,), src_pad_idx, dtype=torch.long)]) for src in src_batch]
    padded_trg = [torch.cat([torch.tensor([trg_sos_idx]), torch.tensor(trg), torch.tensor([trg_eos_idx]), torch.full((max_len - len(trg) - 2,), trg_pad_idx, dtype=torch.long)]) for trg in trg_batch]
    
    # Stack the padded sequences
    padded_src = torch.stack(padded_src)
    padded_trg = torch.stack(padded_trg)
    # print("Batch Sizes (Before Padding):", [len(src) for src in padded_src], [len(trg) for trg in padded_trg])
    # print("Batch Sizes (After Padding):", padded_src.shape, padded_trg.shape)
    return [padded_src, padded_trg]

         


dataset = CustomDataset('/kaggle/input/150kdata/Dataset.ne', '/kaggle/input/150kdata/Dataset.en')

In [None]:

print (dataset.max_src_len, dataset.max_trg_len)

In [22]:
batch_size = 128
max_len = 45
d_model = 512
n_layers = 6
n_heads = 8
ffn_hidden = 1024
drop_prob = 0.1

# optimizer parameter setting
init_lr = 1e-5
factor = 0.9
adam_eps = 5e-9
patience = 10
warmup = 100
epoch = 100  #1000
clip = 1.0
weight_decay = 5e-4
inf = float('inf')

In [23]:
import os
import shutil

# to delete the modesl from kaggle/working except kaggle/working/result

In [24]:
# root_dir = "/kaggle/working"
# excluded_dir = "result"

# # Get all files and directories in the root directory
# all_items = os.listdir(root_dir)

# # Filter out the excluded directory
# items_to_delete = [item for item in all_items if item != excluded_dir]

# # Delete each item (file or directory)
# for item in items_to_delete:
#     item_path = os.path.join(root_dir, item)
#     if os.path.isdir(item_path):
#         shutil.rmtree(item_path)  # Use shutil.rmtree for directories
#     else:
#         os.remove(item_path)

# print(f"Successfully deleted all items except '{excluded_dir}' in {root_dir}.")

Successfully deleted all items except 'result' in /kaggle/working.


In [None]:
# dataset = DataLoader(dataset, batch_size=16, shuffle=True, collate_fn=lambda batch: custom_collate(batch, dataset.src_sos_idx, dataset.src_eos_idx, dataset.trg_sos_idx, dataset.trg_eos_idx, dataset.src_pad_idx, dataset.trg_pad_idx))
train_iter = DataLoader(dataset.train_data, batch_size=16, shuffle=True, collate_fn=lambda batch: custom_collate(batch, dataset.src_sos_idx, dataset.src_eos_idx, dataset.trg_sos_idx, dataset.trg_eos_idx, dataset.src_pad_idx, dataset.trg_pad_idx,dataset.src_vocab_dict,dataset.trg_vocab_dict))
test_iter = DataLoader(dataset.test_data, batch_size=16, shuffle=False, collate_fn=lambda batch: custom_collate(batch, dataset.src_sos_idx, dataset.src_eos_idx, dataset.trg_sos_idx, dataset.trg_eos_idx, dataset.src_pad_idx, dataset.trg_pad_idx,dataset.src_vocab_dict,dataset.trg_vocab_dict))
valid_iter = DataLoader(dataset.val_data, batch_size=16, shuffle=False, collate_fn=lambda batch: custom_collate(batch, dataset.src_sos_idx, dataset.src_eos_idx, dataset.trg_sos_idx, dataset.trg_eos_idx, dataset.src_pad_idx, dataset.trg_pad_idx,dataset.src_vocab_dict,dataset.trg_vocab_dict))


def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)


def initialize_weights(m):
    if hasattr(m, 'weight') and m.weight.dim() > 1:
        nn.init.kaiming_uniform_(m.weight.data)


model = Transformer(src_pad_idx=dataset.src_pad_idx,
                    trg_pad_idx=dataset.trg_pad_idx,
                    trg_sos_idx=dataset.trg_sos_idx,
                    d_model=d_model,
                    enc_voc_size=dataset.enc_voc_size,
                    dec_voc_size=dataset.dec_voc_size,
                    max_len=max_len,
                    ffn_hidden=ffn_hidden,
                    n_head=n_heads,
                    n_layers=n_layers,
                    drop_prob=drop_prob,
                    device=device).to(device)

print(f'The model has {count_parameters(model):,} trainable parameters')
model.apply(initialize_weights)
optimizer = Adam(params=model.parameters(),
                 lr=init_lr,
                 weight_decay=weight_decay,
                 eps=adam_eps)

scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer=optimizer,
                                                 verbose=True,
                                                 factor=factor,
                                                 patience=patience)

criterion = nn.CrossEntropyLoss(ignore_index=dataset.src_pad_idx)


def train(model, iterator, optimizer, criterion, clip):
    model.train()
    epoch_loss = 0
    iterator = tqdm(iterator, total=len(iterator), desc='Training')
    for i, batch in enumerate(iterator):
        # src = batch.src
        src,trg=batch
        src, trg = src.to(device), trg.to(device)
        # print(src.shape)

        optimizer.zero_grad()
        output = model(src, trg[:, :-1])
        output_reshape = output.contiguous().view(-1, output.shape[-1])
        trg = trg[:, 1:].contiguous().view(-1)

        loss = criterion(output_reshape, trg)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()

        epoch_loss += loss.item()
#         print('step :', round((i / len(iterator)) * 100, 2), '% , loss :', loss.item())

    return epoch_loss / len(iterator)


def evaluate(model, iterator, criterion):
    model.eval()
    epoch_loss = 0
    batch_bleu = []
    iterator = tqdm(iterator, total=len(iterator), desc='Evaluating')
    with torch.no_grad():
        for i, (src_batch, trg_batch) in enumerate(iterator):
            src = src_batch
            trg = trg_batch
            src, trg = src.to(device), trg.to(device)
            output = model(src, trg[:, :-1])
            output_reshape = output.contiguous().view(-1, output.shape[-1])
            output_reshape=output_reshape.to(device)
            trg = trg[:, 1:].contiguous().view(-1)
            trg = trg.to(output.device)
            loss = criterion(output_reshape, trg)
            epoch_loss += loss.item()

            total_bleu = []
            for j in range(batch_size):
                try:
                    trg_words = idx_to_word(batch.trg[j], loader.target.vocab)
                    output_words = output[j].max(dim=1)[1]
                    output_words = idx_to_word(output_words, loader.target.vocab)
                    bleu = get_bleu(hypotheses=output_words.split(), reference=trg_words.split())
                    total_bleu.append(bleu)
                except:
                    pass
            if total_bleu:  
                batch_bleu = sum(batch_bleu) / len(batch_bleu)
                return epoch_loss / len(iterator), batch_bleu
            else:
                print("Warning: No BLEU scores were calculated!")
                return epoch_loss / len(iterator), 0.0


def run(total_epoch, best_loss):
    train_losses, test_losses, bleus = [], [], []
    for step in range(total_epoch):
        start_time = time.time()
        train_loss = train(model, train_iter, optimizer, criterion, clip)
        valid_loss, bleu = evaluate(model, valid_iter, criterion)
        end_time = time.time()

        if step > warmup:
            scheduler.step(valid_loss)

        train_losses.append(train_loss)
        test_losses.append(valid_loss)
        bleus.append(bleu)
        epoch_mins, epoch_secs = epoch_time(start_time, end_time)

        if valid_loss < best_loss:
            best_loss = valid_loss
#             torch.save(model.state_dict(), '/kaggle/working/model-{0}.pt'.format(valid_loss))
            
        result_directory = '/kaggle/working/result'
        os.makedirs(result_directory, exist_ok=True)
        f = open('/kaggle/working/result/train_loss.txt', 'w')
        f.write(str(train_losses))
        f.close()

        f = open('/kaggle/working/result/bleu.txt', 'w')
        f.write(str(bleus))
        f.close()

        f = open('/kaggle/working/result/test_loss.txt', 'w')
        f.write(str(test_losses))
        f.close()

        print(f'Epoch: {step + 1} | Time: {epoch_mins}m {epoch_secs}s')
        print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
        print(f'\tVal Loss: {valid_loss:.3f} |  Val PPL: {math.exp(valid_loss):7.3f}')
        print(f'\tBLEU Score: {bleu:.3f}')
        
    torch.save(model.state_dict(), '/kaggle/working/model-1.pt'.format(valid_loss))


if __name__ == '__main__':
    run(total_epoch=epoch, best_loss=inf)


The model has 97,240,673 trainable parameters


Training: 100%|██████████| 801/801 [01:36<00:00,  8.34it/s]
Evaluating:   0%|          | 0/100 [00:00<?, ?it/s]


Epoch: 1 | Time: 1m 36s
	Train Loss: 2.947 | Train PPL:  19.050
	Val Loss: 0.023 |  Val PPL:   1.023
	BLEU Score: 0.000


Training: 100%|██████████| 801/801 [01:35<00:00,  8.43it/s]
Evaluating:   0%|          | 0/100 [00:00<?, ?it/s]


Epoch: 2 | Time: 1m 35s
	Train Loss: 2.445 | Train PPL:  11.526
	Val Loss: 0.021 |  Val PPL:   1.022
	BLEU Score: 0.000


Training: 100%|██████████| 801/801 [01:35<00:00,  8.41it/s]
Evaluating:   0%|          | 0/100 [00:00<?, ?it/s]


Epoch: 3 | Time: 1m 35s
	Train Loss: 2.366 | Train PPL:  10.658
	Val Loss: 0.021 |  Val PPL:   1.021
	BLEU Score: 0.000


Training: 100%|██████████| 801/801 [01:34<00:00,  8.43it/s]
Evaluating:   0%|          | 0/100 [00:00<?, ?it/s]


Epoch: 4 | Time: 1m 35s
	Train Loss: 2.327 | Train PPL:  10.246
	Val Loss: 0.021 |  Val PPL:   1.021
	BLEU Score: 0.000


Training: 100%|██████████| 801/801 [01:35<00:00,  8.43it/s]
Evaluating:   0%|          | 0/100 [00:00<?, ?it/s]


Epoch: 5 | Time: 1m 35s
	Train Loss: 2.293 | Train PPL:   9.901
	Val Loss: 0.020 |  Val PPL:   1.021
	BLEU Score: 0.000


Training: 100%|██████████| 801/801 [01:35<00:00,  8.41it/s]
Evaluating:   0%|          | 0/100 [00:00<?, ?it/s]


Epoch: 6 | Time: 1m 35s
	Train Loss: 2.247 | Train PPL:   9.464
	Val Loss: 0.020 |  Val PPL:   1.020
	BLEU Score: 0.000


Training: 100%|██████████| 801/801 [01:34<00:00,  8.43it/s]
Evaluating:   0%|          | 0/100 [00:00<?, ?it/s]


Epoch: 7 | Time: 1m 35s
	Train Loss: 2.120 | Train PPL:   8.331
	Val Loss: 0.019 |  Val PPL:   1.019
	BLEU Score: 0.000


Training: 100%|██████████| 801/801 [01:35<00:00,  8.43it/s]
Evaluating:   0%|          | 0/100 [00:00<?, ?it/s]


Epoch: 8 | Time: 1m 35s
	Train Loss: 2.076 | Train PPL:   7.974
	Val Loss: 0.019 |  Val PPL:   1.019
	BLEU Score: 0.000


Training: 100%|██████████| 801/801 [01:35<00:00,  8.41it/s]
Evaluating:   0%|          | 0/100 [00:00<?, ?it/s]


Epoch: 9 | Time: 1m 35s
	Train Loss: 2.061 | Train PPL:   7.851
	Val Loss: 0.018 |  Val PPL:   1.019
	BLEU Score: 0.000


Training: 100%|██████████| 801/801 [01:35<00:00,  8.42it/s]
Evaluating:   0%|          | 0/100 [00:00<?, ?it/s]


Epoch: 10 | Time: 1m 35s
	Train Loss: 2.051 | Train PPL:   7.774
	Val Loss: 0.018 |  Val PPL:   1.019
	BLEU Score: 0.000


Training: 100%|██████████| 801/801 [01:35<00:00,  8.42it/s]
Evaluating:   0%|          | 0/100 [00:00<?, ?it/s]


Epoch: 11 | Time: 1m 35s
	Train Loss: 2.043 | Train PPL:   7.710
	Val Loss: 0.018 |  Val PPL:   1.018
	BLEU Score: 0.000


Training: 100%|██████████| 801/801 [01:35<00:00,  8.41it/s]
Evaluating:   0%|          | 0/100 [00:00<?, ?it/s]


Epoch: 12 | Time: 1m 35s
	Train Loss: 2.031 | Train PPL:   7.622
	Val Loss: 0.018 |  Val PPL:   1.018
	BLEU Score: 0.000


Training: 100%|██████████| 801/801 [01:35<00:00,  8.43it/s]
Evaluating:   0%|          | 0/100 [00:00<?, ?it/s]


Epoch: 13 | Time: 1m 35s
	Train Loss: 2.008 | Train PPL:   7.445
	Val Loss: 0.018 |  Val PPL:   1.018
	BLEU Score: 0.000


Training: 100%|██████████| 801/801 [01:34<00:00,  8.43it/s]
Evaluating:   0%|          | 0/100 [00:00<?, ?it/s]


Epoch: 14 | Time: 1m 35s
	Train Loss: 1.977 | Train PPL:   7.219
	Val Loss: 0.018 |  Val PPL:   1.018
	BLEU Score: 0.000


Training: 100%|██████████| 801/801 [01:35<00:00,  8.41it/s]
Evaluating:   0%|          | 0/100 [00:00<?, ?it/s]


Epoch: 15 | Time: 1m 35s
	Train Loss: 1.952 | Train PPL:   7.040
	Val Loss: 0.017 |  Val PPL:   1.018
	BLEU Score: 0.000


Training: 100%|██████████| 801/801 [01:35<00:00,  8.43it/s]
Evaluating:   0%|          | 0/100 [00:00<?, ?it/s]


Epoch: 16 | Time: 1m 35s
	Train Loss: 1.932 | Train PPL:   6.905
	Val Loss: 0.017 |  Val PPL:   1.017
	BLEU Score: 0.000


Training: 100%|██████████| 801/801 [01:35<00:00,  8.42it/s]
Evaluating:   0%|          | 0/100 [00:00<?, ?it/s]


Epoch: 17 | Time: 1m 35s
	Train Loss: 1.917 | Train PPL:   6.797
	Val Loss: 0.017 |  Val PPL:   1.017
	BLEU Score: 0.000


Training: 100%|██████████| 801/801 [01:35<00:00,  8.41it/s]
Evaluating:   0%|          | 0/100 [00:00<?, ?it/s]


Epoch: 18 | Time: 1m 35s
	Train Loss: 1.902 | Train PPL:   6.702
	Val Loss: 0.017 |  Val PPL:   1.017
	BLEU Score: 0.000


Training: 100%|██████████| 801/801 [01:35<00:00,  8.43it/s]
Evaluating:   0%|          | 0/100 [00:00<?, ?it/s]


Epoch: 19 | Time: 1m 35s
	Train Loss: 1.888 | Train PPL:   6.607
	Val Loss: 0.017 |  Val PPL:   1.017
	BLEU Score: 0.000


Training: 100%|██████████| 801/801 [01:35<00:00,  8.42it/s]
Evaluating:   0%|          | 0/100 [00:00<?, ?it/s]


Epoch: 20 | Time: 1m 35s
	Train Loss: 1.877 | Train PPL:   6.535
	Val Loss: 0.017 |  Val PPL:   1.017
	BLEU Score: 0.000


Training: 100%|██████████| 801/801 [01:35<00:00,  8.41it/s]
Evaluating:   0%|          | 0/100 [00:00<?, ?it/s]


Epoch: 21 | Time: 1m 35s
	Train Loss: 1.865 | Train PPL:   6.455
	Val Loss: 0.017 |  Val PPL:   1.017
	BLEU Score: 0.000


Training: 100%|██████████| 801/801 [01:35<00:00,  8.43it/s]
Evaluating:   0%|          | 0/100 [00:00<?, ?it/s]


Epoch: 22 | Time: 1m 35s
	Train Loss: 1.855 | Train PPL:   6.390
	Val Loss: 0.017 |  Val PPL:   1.017
	BLEU Score: 0.000


Training: 100%|██████████| 801/801 [01:35<00:00,  8.40it/s]
Evaluating:   0%|          | 0/100 [00:00<?, ?it/s]


Epoch: 23 | Time: 1m 35s
	Train Loss: 1.846 | Train PPL:   6.332
	Val Loss: 0.017 |  Val PPL:   1.017
	BLEU Score: 0.000


Training: 100%|██████████| 801/801 [01:35<00:00,  8.39it/s]
Evaluating:   0%|          | 0/100 [00:00<?, ?it/s]


Epoch: 24 | Time: 1m 35s
	Train Loss: 1.839 | Train PPL:   6.289
	Val Loss: 0.017 |  Val PPL:   1.017
	BLEU Score: 0.000


Training: 100%|██████████| 801/801 [01:35<00:00,  8.41it/s]
Evaluating:   0%|          | 0/100 [00:00<?, ?it/s]


Epoch: 25 | Time: 1m 35s
	Train Loss: 1.832 | Train PPL:   6.247
	Val Loss: 0.016 |  Val PPL:   1.017
	BLEU Score: 0.000


Training: 100%|██████████| 801/801 [01:35<00:00,  8.42it/s]
Evaluating:   0%|          | 0/100 [00:00<?, ?it/s]


Epoch: 26 | Time: 1m 35s
	Train Loss: 1.825 | Train PPL:   6.200
	Val Loss: 0.016 |  Val PPL:   1.017
	BLEU Score: 0.000


Training: 100%|██████████| 801/801 [01:35<00:00,  8.40it/s]
Evaluating:   0%|          | 0/100 [00:00<?, ?it/s]


Epoch: 27 | Time: 1m 35s
	Train Loss: 1.819 | Train PPL:   6.169
	Val Loss: 0.016 |  Val PPL:   1.017
	BLEU Score: 0.000


Training: 100%|██████████| 801/801 [01:35<00:00,  8.42it/s]
Evaluating:   0%|          | 0/100 [00:00<?, ?it/s]


Epoch: 28 | Time: 1m 35s
	Train Loss: 1.813 | Train PPL:   6.129
	Val Loss: 0.016 |  Val PPL:   1.016
	BLEU Score: 0.000


Training: 100%|██████████| 801/801 [01:35<00:00,  8.42it/s]
Evaluating:   0%|          | 0/100 [00:00<?, ?it/s]


Epoch: 29 | Time: 1m 35s
	Train Loss: 1.808 | Train PPL:   6.099
	Val Loss: 0.016 |  Val PPL:   1.016
	BLEU Score: 0.000


Training: 100%|██████████| 801/801 [01:35<00:00,  8.40it/s]
Evaluating:   0%|          | 0/100 [00:00<?, ?it/s]


Epoch: 30 | Time: 1m 35s
	Train Loss: 1.801 | Train PPL:   6.056
	Val Loss: 0.016 |  Val PPL:   1.016
	BLEU Score: 0.000


Training: 100%|██████████| 801/801 [01:35<00:00,  8.41it/s]
Evaluating:   0%|          | 0/100 [00:00<?, ?it/s]


Epoch: 31 | Time: 1m 35s
	Train Loss: 1.796 | Train PPL:   6.027
	Val Loss: 0.016 |  Val PPL:   1.016
	BLEU Score: 0.000


Training: 100%|██████████| 801/801 [01:35<00:00,  8.41it/s]
Evaluating:   0%|          | 0/100 [00:00<?, ?it/s]


Epoch: 32 | Time: 1m 35s
	Train Loss: 1.793 | Train PPL:   6.008
	Val Loss: 0.016 |  Val PPL:   1.016
	BLEU Score: 0.000


Training: 100%|██████████| 801/801 [01:35<00:00,  8.40it/s]
Evaluating:   0%|          | 0/100 [00:00<?, ?it/s]


Epoch: 33 | Time: 1m 35s
	Train Loss: 1.789 | Train PPL:   5.985
	Val Loss: 0.016 |  Val PPL:   1.016
	BLEU Score: 0.000


Training: 100%|██████████| 801/801 [01:35<00:00,  8.41it/s]
Evaluating:   0%|          | 0/100 [00:00<?, ?it/s]


Epoch: 34 | Time: 1m 35s
	Train Loss: 1.785 | Train PPL:   5.962
	Val Loss: 0.016 |  Val PPL:   1.016
	BLEU Score: 0.000


Training: 100%|██████████| 801/801 [01:35<00:00,  8.42it/s]
Evaluating:   0%|          | 0/100 [00:00<?, ?it/s]


Epoch: 35 | Time: 1m 35s
	Train Loss: 1.781 | Train PPL:   5.937
	Val Loss: 0.016 |  Val PPL:   1.016
	BLEU Score: 0.000


Training:  60%|█████▉    | 478/801 [00:57<00:45,  7.17it/s]

In [None]:

def read(name):
    f = open(name, 'r')
    file = f.read()
    file = re.sub('\\[', '', file)
    file = re.sub('\\]', '', file)
    f.close()

    return [float(i) for idx, i in enumerate(file.split(','))]


def draw(mode):
    if mode == 'loss':
        train = read('/kaggle/working/result/train_loss.txt')
        test = read('/kaggle/working/result/test_loss.txt')
        plt.plot(train, 'r', label='train')
        plt.plot(test, 'b', label='validation')
        plt.legend(loc='lower left')


    elif mode == 'bleu':
        bleu = read('/kaggle/working/result/bleu.txt')
        plt.plot(bleu, 'b', label='bleu score')
        plt.legend(loc='lower right')

    plt.xlabel('epoch')
    plt.ylabel(mode)
    plt.title('training result')
    plt.grid(True, which='both', axis='both')
    plt.show()


if __name__ == '__main__':
    draw(mode='loss')
    draw(mode='bleu')


In [None]:
test_iter=DataLoader(dataset, batch_size=16, shuffle=True, collate_fn=lambda batch: custom_collate(batch, dataset.src_sos_idx, dataset.src_eos_idx, dataset.trg_sos_idx, dataset.trg_eos_idx, dataset.src_pad_idx, dataset.trg_pad_idx))

def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)


model = Transformer(src_pad_idx=dataset.src_pad_idx,
                    trg_pad_idx=dataset.trg_pad_idx,
                    trg_sos_idx=dataset.trg_sos_idx,
                    d_model=d_model,
                    enc_voc_size=dataset.enc_voc_size,
                    dec_voc_size=dataset.dec_voc_size,
                    max_len=max_len,
                    ffn_hidden=ffn_hidden,
                    n_head=n_heads,
                    n_layers=n_layers,
                    drop_prob=0.00,
                    device=device).to(device)

print(f'The model has {count_parameters(model):,} trainable parameters')


def test_model(num_examples):
    iterator = test_iter
    model.load_state_dict(torch.load("./saved/model-saved.pt"))

    with torch.no_grad():
        batch_bleu = []
        for i, batch in enumerate(iterator):
            src = batch.src
            trg = batch.trg
            output = model(src, trg[:, :-1])

            total_bleu = []
            for j in range(num_examples):
                try:
                    src_words = idx_to_word(src[j], loader.source.vocab)
                    trg_words = idx_to_word(trg[j], loader.target.vocab)
                    output_words = output[j].max(dim=1)[1]
                    output_words = idx_to_word(output_words, loader.target.vocab)

                    print('source :', src_words)
                    print('target :', trg_words)
                    print('predicted :', output_words)
                    print()
                    bleu = get_bleu(hypotheses=output_words.split(), reference=trg_words.split())
                    total_bleu.append(bleu)
                except:
                    pass

            total_bleu = sum(total_bleu) / len(total_bleu)
            print('BLEU SCORE = {}'.format(total_bleu))
            batch_bleu.append(total_bleu)

        batch_bleu = sum(batch_bleu) / len(batch_bleu)
        print('TOTAL BLEU SCORE = {}'.format(batch_bleu))


if __name__ == '__main__':
    test_model(num_examples=batch_size)
