In [1]:
import torch
import torch.nn as nn
import math
from typing import Any
from torch.utils.data import Dataset,random_split,DataLoader
if torch.cuda.is_available():
    torch.backends.cudnn.deterministic=True

In [2]:
print(torch.cuda.is_available())

True


In [3]:
# embeddings

class Inputembedding(nn.Module):
    def __init__(self,d_model:int,vocab_size:int):
        super().__init__()
        self.d_model=d_model
        self.vocab_size=vocab_size
        self.embedding=nn.Embedding(vocab_size,d_model)
    def forward(self,x):
        return self.embedding(x)*math.sqrt(self.d_model)

In [4]:
class PositionalEncoding(nn.Module):
    def __init__(self,d_model:int,max_seq_len:int,dropout:float):
        super().__init__()
        self.d_model=d_model
        self.max_seq_len=max_seq_len
        self.dropout=nn.Dropout(dropout)
        #positional encoding filled with  zeroes
        pe=torch.zeros(max_seq_len,d_model)
        # creating a position
        position=torch.arange(0,max_seq_len,dtype=torch.float).unsqueeze(1)
        dividend_term=torch.exp(torch.arange(0,d_model,2).float()*(-math.log(10000)/d_model))
        #applying sine to even indices
        pe[:,0::2]=torch.sin(position*dividend_term)
        #applying cosine to odd indices
        pe[:,1::2]=torch.cos(position*dividend_term)
        #apply one dimension more for the batch_size
        pe=pe.unsqueeze(0)
        self.register_buffer("pe",pe)
        print(pe.shape)
    def forward(self,x):
        x=x+(self.pe[:,:x.shape[1],:]) # all bach size 0 to maxseqlen-1,dimension 
        return self.dropout(x)



In [5]:
class LayerNormalization(nn.Module):
    def __init__(self,eps:float=10**-6):
        super().__init__()
        self.eps=eps
        self.alpha=nn.Parameter(torch.ones(1))
        self.bias=nn.Parameter(torch.zeros(1))
    
    def forward(self,x):
        mean=x.mean(dim=-1,keepdim=True)
        std=x.std(dim=-1,keepdim=True)
        return self.alpha*(x-mean)/(std+self.eps)+self.bias



In [6]:
class FeedForwardNeuralNetwork(nn.Module):
    def __init__(self,d_model:int,d_ff:int,dropout:float):
        super().__init__()
        self.firstlayer=nn.Linear(d_model,d_ff)
        self.dropout=nn.Dropout(dropout)
        self.secondlayer=nn.Linear(d_ff,d_model)
    
    def forward(self,x):
        return self.secondlayer(self.dropout(torch.relu(self.firstlayer(x))))

In [7]:
class MultiHeadAttention(nn.Module):
    def __init__(self,d_model:int,num_heads:int,dropout:float):
        super().__init__()
        self.d_model=d_model
        self.num_heads=num_heads
        assert d_model%num_heads==0,'Dimension of model should be divisible by no of heads'
        self.d_k=d_model//num_heads

        # for the  weight metrices
        self.w_q=nn.Linear(d_model,d_model)# Weighted query
        self.w_k=nn.Linear(d_model,d_model)# weighted key
        self.w_v=nn.Linear(d_model,d_model)# weighted value
        self.w_o=nn.Linear(d_model,d_model) #weight of the concatenated layer
        self.dropout=nn.Dropout(dropout)  #last dropoutlayer
    @staticmethod
    def attention(query,key,value,mask,dropout=nn.Dropout):
        d_k=query.shape[-1]
        attention_scores=(query@key.transpose(-2,-1))/math.sqrt(d_k)
        if mask is not None:
            attention_scores.masked_fill_(mask==0,-1e9)# In-place: mask out positions with a large negative value to ignore them in softmax
        attention_scores=attention_scores.softmax(dim=-1) # applied at the last dimension that is max_selenght
        if dropout is not None:
            attention_scores=dropout(attention_scores)
        return (attention_scores@value) ,attention_scores

    def forward(self,q,k,v,mask):
        query=self.w_q(q)
        key=self.w_k(k)
        value=self.w_v(v)
        query=query.view(query.shape[0],query.shape[1],self.num_heads,self.d_k).transpose(1,2) #batchsize sequencelength number of head,d_k #transpose chai aaba independently head lai train garxam so batchsize,oofheads,max_seq_len,d_k hunxa
        key=key.view(key.shape[0],key.shape[1],self.num_heads,self.d_k).transpose(1,2)
        value=value.view(value.shape[0],value.shape[1],self.num_heads,self.d_k).transpose(1,2)
        #obtain output and attention scores
        x,self.attention_scores=MultiHeadAttention.attention(query,key,value,mask,self.dropout)
        # create  a concatenated matrix
        x=x.transpose(1,2).contiguous().view(x.shape[0],-1,self.num_heads*self.d_k)#
        return self.w_o(x)
    


In [8]:
class ResidualConnection(nn.Module):
    def __init__(self, dropout: float) -> None:
        super().__init__()
        # we use a dropout layer to prevent overfitting
        self.dropout=nn.Dropout(dropout)
        # we use a normalization layer
        self.norm=LayerNormalization()
        
    def forward(self, x, sublayer):
        # we normalize the input and add it to the original input x`. This creates the residual connection process
        return x+self.dropout(sublayer(self.norm(x)))

In [9]:
# Building Encoder Block
class EncoderBlock(nn.Module):
    def __init__(self,self_attention_block:MultiHeadAttention,ffn:FeedForwardNeuralNetwork,dropout:float):
        super().__init__()
        self.self_attention_block=self_attention_block
        self.ffn=ffn
        self.residual_connections=nn.ModuleList([ResidualConnection(dropout) for _ in range(2)])
    def forward(self,x,src_mask):
        x=self.residual_connections[0](x,lambda x : self.self_attention_block(x,x,x,src_mask)) 
        x=self.residual_connections[1](x,self.ffn) # x+x.self.ffn(x)
        # output tensor with applying feedforward selfattention feedforward 
        return x

class Encoder(nn.Module):
    def __init__(self, layers: nn.ModuleList)-> None:
        super().__init__()
        self.layers=layers # storing the EncoderBlocks
        # layer for the normalization of the output of the encoder layers
        self.norm=LayerNormalization()
    
    def forward(self, x, mask):
        # Iterating over each EncoderBlock stored in self.layers
        for layer in self.layers:
            # Applying each EncoderBlock to the input tensor x
            x=layer(x, mask)
        return self.norm(x) # Normalizing output After running all n layers



In [10]:
# Decoderblock #it takes multihead attention and crossattention
class DecoderBlock(nn.Module):
    def __init__(self,self_attention_block:MultiHeadAttention,crossattentionblock:MultiHeadAttention,ffn:FeedForwardNeuralNetwork,dropout:float):
        super().__init__()
        self.self_attention_block=self_attention_block
        self.cross_attention_block=crossattentionblock
        self.ffn=ffn
        self.residual_connection=nn.ModuleList([ResidualConnection(dropout) for _ in range(3)])
    
    def forward(self,x,encoderoutput,src_mask,tgt_mask):
        x=self.residual_connection[0](x,lambda x :self.self_attention_block(x,x,x,tgt_mask))
        x=self.residual_connection[1](x,lambda x : self.cross_attention_block(x,encoderoutput,encoderoutput,src_mask))
        x=self.residual_connection[2](x,self.ffn)
        return x


class Decoder(nn.Module):
    def __init__(self,layers:nn.ModuleList):
        super().__init__()
        self.layers=layers
        self.norm=LayerNormalization()
    
    def forward(self,x,encoder_output,src_mask,tgt_mask):
        for layer in self.layers:
            x=layer(x,encoder_output,src_mask,tgt_mask)
        return self.norm(x)



In [11]:
# Projection layer

class ProjectionLayer(nn.Module):
    def __init__(self,d_model:int,vocab_size:int):
        super().__init__()
        self.projection=nn.Linear(d_model,vocab_size)
    def forward(self,x):
        return torch.log_softmax(self.projection(x),dim=-1)

In [12]:
# The Transformer Architecture
# Contains all the Encoder Decoder Embeddings
class Transformer(nn.Module):
    def __init__(self,encoder:Encoder,decoder:Decoder,src_embeding:Inputembedding,tgt_embedding:Inputembedding,src_position:PositionalEncoding,tgt_position:PositionalEncoding,projection_layer:ProjectionLayer) -> None:
        super().__init__()
        self.encoder=encoder
        self.decoder=decoder
        self.src_embedding=src_embeding
        self.tgt_embedding=tgt_embedding
        self.src_position=src_position
        self.tgt_position=tgt_position
        self.projection_layer=projection_layer
    
    def encode(self,source,src_mask):
        #applying embedding to the input source language
        source=self.src_embedding(source)
        #applying positionalencoding to the source language
        source=self.src_position(source)
        return self.encoder(source,src_mask)

    def decode(self,encoder_output,src_mask,target,tgt_mask):
        target=self.tgt_embedding(target)
        target=self.tgt_position(target)
        return self.decoder(target,encoder_output,src_mask,tgt_mask)
        
    #applying projection with softmax
    def project(self,x):
        return self.projection_layer(x)


In [13]:
def build_transformer(src_vocab_size:int,tgt_vocab_size:int,src_seq_len:int,tgt_seq_len:int,d_model:int=512,N:int=6,h:int=8,dropout:float=0.2,d_ff:int=2048) -> Transformer:
    # Creating Embedding Layers
    src_embed=Inputembedding(d_model,src_vocab_size)
    tgt_embed=Inputembedding(d_model,tgt_vocab_size)
    #Creating Positional Encoding Layers
    src_pos=PositionalEncoding(d_model,src_seq_len,dropout)
    tgt_pos=PositionalEncoding(d_model,tgt_seq_len,dropout)

    # Encoders Blocks
    encoder_blocks=[]
    for _ in range (N) :
        encoder_self_attention_block=MultiHeadAttention(d_model,h,dropout)
        feed_forward_block=FeedForwardNeuralNetwork(d_model,d_ff,dropout)
        # one layer encoder block
        encoder_block=EncoderBlock(encoder_self_attention_block,feed_forward_block,dropout)
        encoder_blocks.append(encoder_block)
    
    # creating decoder blocks
    decoder_blocks=[]
    for _ in range(N):
        decoder_self_attention_block=MultiHeadAttention(d_model,h,dropout)
        decoder_cross_attention_block=MultiHeadAttention(d_model,h,dropout)
        feed_forward_block=FeedForwardNeuralNetwork(d_model,d_ff,dropout)

        # decoder block
        decoder_block=DecoderBlock(decoder_self_attention_block,decoder_cross_attention_block,feed_forward_block,dropout)
        decoder_blocks.append(decoder_block)
    
    # Encoder and decoder 
    encoder=Encoder(nn.ModuleList(encoder_blocks))
    decoder=Decoder(nn.ModuleList(decoder_blocks))

    #projection layer
    projection_layer=ProjectionLayer(d_model,tgt_vocab_size)
    # Fulltransforer
    transformer=Transformer(encoder,decoder,src_embed,tgt_embed,src_pos,tgt_pos,projection_layer)
    #initializing all the parameters
    for p in transformer.parameters():
        if p.dim()>1:
            nn.init.xavier_uniform_(p)
    
    return transformer


In [14]:
def get_all_sentences(ds,lang):
    for pair in ds :
        yield pair['translation'][lang]

In [15]:
from pathlib import Path
from tokenizers import Tokenizer
from tokenizers.models import WordLevel
from tokenizers.trainers import WordLevelTrainer
from tokenizers.pre_tokenizers import Whitespace
from datasets import load_dataset 
def build_tokenizer(config,ds,lang):
    # A filepath for tokenizer 
    tokenizer_path=Path(config['tokenizer_file'].format(lang)) #The string 'models/tokenizer_{}.json' becomes 'models/tokenizer_ne.json' (since lang is 'ne').
    # check the path of the tokenizer 
    if not Path.exists(tokenizer_path):
        tokenizer=Tokenizer(WordLevel(unk_token='[UNK]'))
        tokenizer.pre_tokenizer=Whitespace() # we will spilt the text into tokens based ont hte whitespace

        # creating a trainer for the new tokenizer 
        trainer=WordLevelTrainer(special_tokens=['[UNK]','[PAD]','[SOS]','[EOS]'])
        tokenizer.train_from_iterator(get_all_sentences(ds,lang),trainer=trainer)
        tokenizer.save(str(tokenizer_path))
    else:
        tokenizer=Tokenizer.from_file(str(tokenizer_path))
    return tokenizer


  from .autonotebook import tqdm as notebook_tqdm


In [16]:
def casual_mask(size):
    mask=torch.triu(torch.ones(1,size,size),diagonal=1).type(torch.int)
    return mask==0

In [17]:
from torch.utils.data import Dataset 
class BilingualDataset(Dataset):
    def __init__(self,ds,tokenizer_src,tokenizer_tgt,src_lang,tgt_lang,max_seq_len)-> None:
        super().__init__()
        self.seq_len=max_seq_len
        self.ds=ds
        self.tokenizer_src=tokenizer_src
        self.tokenizer_tgt=tokenizer_tgt
        self.src_lang=src_lang
        self.tgt_lang=tgt_lang

        # special tokens numerical value
        self.sos_token=torch.tensor([tokenizer_tgt.token_to_id('[SOS]')],dtype=torch.int64)
        self.eos_token=torch.tensor([tokenizer_tgt.token_to_id('[EOS]')], dtype=torch.int64)
        self.pad_token=torch.tensor([tokenizer_tgt.token_to_id('[PAD]')], dtype=torch.int64)

    # Return the length of the sentences
    def __len__(self):
        return len(self.ds)

    def __getitem__(self,index:Any)-> Any:
        src_target_pair=self.ds[index]
        src_text=src_target_pair['translation'][self.src_lang]
        tgt_text=src_target_pair['translation'][self.tgt_lang]

        #tokenizationgthe source and target text 
        enc_input_tokens=self.tokenizer_src.encode(src_text).ids
        dec_input_tokens=self.tokenizer_tgt.encode(tgt_text).ids

        # sentence ma hamlai aktiota pad token chainxa 
        enc_num_padding_tokens=self.seq_len-len(enc_input_tokens) -2 # -2 for eos and sos

        #target tokens 
        dec_num_padding_tokens=self.seq_len-len(dec_input_tokens)-1 # euta chai for sos

        if enc_num_padding_tokens<0 or dec_num_padding_tokens<0:
            raise ValueError("Sentences seem to be long")#yedi maxtokens 10 xa aani tokens 9 ota xa bhaney eos ra sos nai bhayena jaha -1 aauxa tei bahyera
        
        #suruma sos tokens last ma eos token ani padding tokens
        encoder_input=torch.cat(
            [
                self.sos_token,
                torch.tensor(enc_input_tokens,dtype=torch.int64),
                self.eos_token,
                torch.tensor([self.pad_token]*enc_num_padding_tokens,dtype=torch.int64)#padding tokens add gareko jastai list[0]*5 huda list[0,0,0,0,0]

            ]
        )

        #building decoder input tensor
        decoder_input=torch.cat([
            self.sos_token, # inserting the '[SOS]' token
            torch.tensor(dec_input_tokens, dtype=torch.int64), # indersting the tokenized target text
            torch.tensor([self.pad_token] * dec_num_padding_tokens, dtype=torch.int64) # adding padding tokens
        ])

        # yo bhaneko label target yo sanga comaper garera loss nikalxa
        # creating a label tensor, the expected output for training the model
        label=torch.cat([
            torch.tensor(dec_input_tokens, dtype=torch.int64), # inserting the tokenized targate text
            self.eos_token, # inserting the '[EOS]' token
            torch.tensor([self.pad_token] * dec_num_padding_tokens, dtype=torch.int64) # adding padding tokens
        ])
        # Ensuring that the length of each tensor above is equal to the defined `seq_len`
        assert encoder_input.size(0)==self.seq_len,'Encoder input doesnt match with sequencelength'
        assert decoder_input.size(0)==self.seq_len,'Edecoder input doesnt match with sequencelength'
        assert label.size(0)==self.seq_len,'label  doesnt match with sequencelength'

        return {
            'encoder_input':encoder_input,
            'decoder_input':decoder_input,
            'encoder_mask': (encoder_input!=self.pad_token).unsqueeze(0).unsqueeze(0).int(),
            'decoder_mask': (decoder_input!=self.pad_token).unsqueeze(0).unsqueeze(0).int() & casual_mask(decoder_input.size(0)),
            'label':label,
            'src_text': src_text,
            'tgt_text': tgt_text
        }
    




In [18]:
def read_text_files(src_file,tgt_file):
    with open(src_file,'r',encoding='utf-8') as src_f ,open(tgt_file,"r",encoding='utf-8') as tgt_f :
        src_lines=src_f.readlines()
        tgt_lines=tgt_f.readlines()
    
    assert len(src_lines) ==len(tgt_lines) ,"Source and target files must have the same number of lines and lengths"
    dataset=[{'translation':{'src':src.strip(),'tgt':tgt.strip()}} for src,tgt in zip(src_lines,tgt_lines)]
    return dataset

In [19]:
def get_ds(config):
    #read dataset form text file

    ds_raw=read_text_files(config['src_file'],config['tgt_file'])
    # building and loading tokenizer for source and target file
    tokenizer_src=build_tokenizer(config,ds_raw,config['lang_src'])
    tokenizer_tgt=build_tokenizer(config,ds_raw,config['lang_tgt'])

    #splitting the dataset for training and validation
    train_ds_size=int(0.9 * len (ds_raw))
    val_ds_size=len(ds_raw) -train_ds_size
    train_ds_raw,val_ds_raw=random_split(ds_raw,[train_ds_size,val_ds_size])

    #processing dataset with bilingualdataset 
    train_ds=BilingualDataset(train_ds_raw,tokenizer_src,tokenizer_tgt,config['lang_src'],config['lang_tgt'],config['seq_len'])
    val_ds=BilingualDataset(val_ds_raw,tokenizer_src,tokenizer_tgt,config['lang_src'],config['lang_tgt'],config['seq_len'])

    #finding the maximum length in the dataset 
    max_len_src=0
    max_len_tgt=0
    for pair in ds_raw:
        src_ids=tokenizer_src.encode(pair['translation'][config['lang_src']]).ids
        tgt_ids=tokenizer_tgt.encode(pair['translation'][config['lang_tgt']]).ids

        max_len_src=max(max_len_src,len(src_ids))
        max_len_tgt=max(max_len_tgt,len(tgt_ids))

    print(f"MAx Length of source Sentence: {max_len_src}")
    print(f"MAx Length of target Sentence: {max_len_tgt}")

    train_dataloader=DataLoader(train_ds,batch_size=config['batch_size'],shuffle=True)
    val_dataloader=DataLoader(val_ds,batch_size=1,shuffle=True)

    return train_dataloader,val_dataloader,tokenizer_src,tokenizer_tgt


In [85]:
#greedy decode for inferenceing

def greedy_decode(model,source,source_mask,tokenizer_src,tokenizer_tgt,max_len,device):
    #retrieving the indices from the start and end of sequences
    sos_idx=tokenizer_tgt.token_to_id('[SOS]')
    eos_idx=tokenizer_tgt.token_to_id('[EOS]')

    # computing the output of the encoder 
    encoder_output=model.encode(source,source_mask)
    decoder_input=torch.empty(1,1).fill_(sos_idx).type_as(source).to(device) #tensor type is like source
    while True:
        if decoder_input.size(1)==max_len:
            break
        #building a mask for decoder input 
        decoder_mask=casual_mask(decoder_input.size(1)).type_as(source_mask).to(device)
        #calculating the output of the decoder
        out=model.decode(encoder_output,source_mask,decoder_input,decoder_mask)
        prob=model.project(out[:,-1])

        # Select token with the highest probability 
        _,next_word=torch.max(prob,dim=1)
        decoder_input=torch.cat([decoder_input,torch.empty(1,1).type_as(source).fill_(next_word.item()).to(device)],dim=1)
        if next_word==eos_idx:
            break

    # sequence of tokens generated by the decoder
    return decoder_input.squeeze(0)
def run_validation(model, validation_ds, tokenizer_src, tokenizer_tgt, max_len, device, print_msg, global_state, writer, num_examples=2):
    model.eval()
    count=0 # initializing counter to keep track of how many examples have been processed
    
    console_width=80 # fixed width for printed messages
    
    # creating evaluation loop
    with torch.no_grad(): # ensuring that no gradients are computed during this process
        for batch in validation_ds:
            count+=1
            encoder_input=batch['encoder_input'].to(device)
            encoder_mask=batch['encoder_mask'].to(device)
            
            # ensuring that the batch_size of the validation set is 1
            assert encoder_input.size(0)==1, 'Batch size must be 1 for validation.'
            
            # applying the `greedy_decode` functio to get the model's output of the source text of the input batch
            model_out=greedy_decode(model, encoder_input, encoder_mask, tokenizer_src, tokenizer_tgt, max_len, device)
            
            # retraeving source and target texts from the batch
            source_text=batch['src_text'][0]
            target_text=batch['tgt_text'][0] # true translation
            # token_ids = model_out.argmax(dim=-1).squeeze().tolist() # Convert tensor to a list of token IDs
            # model_out_text = tokenizer_tgt.decode(token_ids) 
            model_out_text=tokenizer_tgt.decode(model_out.detach().cpu().numpy()) # decoded, human-readable model ouptut
            
            # printing results
            print_msg('-'*console_width)
            print_msg(f'SOURCE: {source_text}')
            print_msg(f'TARGET: {target_text}')
            print_msg(f'PREDICTED: {model_out_text}')
            
            # After two examples, we break the loop
            if count >= num_examples:
                break

In [76]:

from nltk.translate.bleu_score import sentence_bleu
import torch
from tqdm import tqdm

# This function will be used to compute BLEU score between reference and predicted translations
def compute_bleu(reference, candidate):
    """
    Compute BLEU score between reference and candidate translations.
    
    Args:
    reference (list of str): The ground truth translation split into tokens.
    candidate (list of str): The predicted translation split into tokens.
    
    Returns:
    float: BLEU score.
    """
    return sentence_bleu([reference], candidate)
def calculate_bleu_for_validation(model, val_dataloader, tokenizer_src, tokenizer_tgt, max_len, device):
    model.eval()  # Set the model to evaluation mode
    total_bleu_score = 0
    total_examples = 0
    example_printed = False 

    with torch.no_grad():  # No gradients needed during validation
        for batch in tqdm(val_dataloader, desc="Calculating BLEU for validation"):
            # Get source (input) and reference (target) texts
            encoder_input = batch['encoder_input'].to(device)
            encoder_mask = batch['encoder_mask'].to(device)
            target_texts = batch['tgt_text']  # Ground truth translations (list of strings)

            # Predict translations using the greedy decoding function
            model_output = greedy_decode(model, encoder_input, encoder_mask, tokenizer_src, tokenizer_tgt, max_len, device)

            # Decode predicted token IDs to text
            predicted_text = tokenizer_tgt.decode(model_output.tolist(), skip_special_tokens=True)
            
            # Iterate over each example in the batch
            for i, target_text in enumerate(target_texts):
                reference = target_text.split()  # Tokenize the reference (true) sentence
                candidate = predicted_text.split()  # Tokenize the predicted sentence
                
                # Calculate BLEU score for this example
                bleu_score = compute_bleu(reference, candidate)
                total_bleu_score += bleu_score
                total_examples += 1
                if not example_printed:
                    print("\n--- Sample Validation Output ---")
                    print(f"Real: {target_text}")
                    print(f"Predicted: {predicted_text}")
                    print(f"BLEU score for this example: {bleu_score:.4f}")
                    example_printed = True
                
    
    # Calculate and return the average BLEU score across all validation examples
    avg_bleu_score = total_bleu_score / total_examples if total_examples > 0 else 0
    return avg_bleu_score




In [21]:
# we pass as parameters the config dictionary, the length of the vocabulary of the source language and the target language
def get_model(config, vocab_src_len, vocab_tgt_len):
    # loading model using the `build_transformer` function
    # we will use the lengths of the source language and atarget language vocabularies, the `seq_len`, and the dimensionality of embeddings
    model=build_transformer(vocab_src_len, vocab_tgt_len, config['seq_len'], config['seq_len'], config['d_model'])
    return model

In [118]:
# define settings for building and training the transfomer model
def get_config():
    return {
        'batch_size': 4,
        'num_epochs': 50,
        'lr': 10**-4,
        'seq_len': 150,
        'd_model': 512,  # Dimensions of the embeddings in the transformer. 512 like in the paper
        'lang_src': 'src',  # Use 'src' as the source language identifier
        'lang_tgt': 'tgt',  # Use 'tgt' as the target language identifier
        'src_file': './mytestdata/eng.txt',  # Path to your English text file
        'tgt_file': './mytestdata/nep.txt',  # Path to your Nepali text file
        'model_folder': 'weights',
        'model_basename': 'tmodel_',
        'preload': 'epoch_3',
        'tokenizer_file': 'tokenizer_{0}.json',
        'experiment_name': 'runs/tmodel',
       
        
    }


# function to construct the path for saving and retrieving model weights
def get_weights_file_path(config, epoch: str):
    model_folder=config['model_folder'] # extracting model folder from the config
    model_basename=config['model_basename'] # extracting the base name for model files
    model_filename=f'{model_basename}{epoch}.pt'
    return str(Path('.')/model_folder/model_filename)


In [119]:
import torch
import torch.nn as nn
from torch.utils.tensorboard import SummaryWriter
from pathlib import Path
from tqdm import tqdm
import os  # Needed for file deletion

def train_model(config):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f'Using device {device}')
    
    # Creating model directory to store weights
    Path(config['model_folder']).mkdir(parents=True, exist_ok=True)
    
    # Ensure the experiment directory exists
    experiment_path = Path(config['experiment_name'])
    experiment_path.mkdir(parents=True, exist_ok=True)
    
    train_dataloader, val_dataloader, tokenizer_src, tokenizer_tgt = get_ds(config)
    
    model = get_model(config, tokenizer_src.get_vocab_size(), tokenizer_tgt.get_vocab_size()).to(device)
 
    # Tensorboard
    writer = SummaryWriter(config['experiment_name'])
    
    optimizer = torch.optim.Adam(model.parameters(), lr=config['lr'], eps=1e-9)
    
    initial_epoch = 0
    global_step = 0
    
    # Check if there is a pre-trained model to load
    if config['preload']:
        model_filename = get_weights_file_path(config, config['preload'])
        print(f'Preloading model {model_filename}')
        
        state = torch.load(model_filename)
        
        # Sets epoch to the saved in the state plus one, to resume from where it stopped
        initial_epoch = state['epoch'] + 1
        optimizer.load_state_dict(state['optimizer_state_dict'])
        global_step = state['global_step']

    # Initialize loss function
    loss_fn = nn.CrossEntropyLoss(ignore_index=tokenizer_src.token_to_id('[PAD]'), label_smoothing=0.1).to(device)
    
    previous_model_filename = None  # Variable to track the last saved model file
    
    for epoch in range(initial_epoch, config['num_epochs']):
        batch_iterator = tqdm(train_dataloader, desc=f'Processing epoch {epoch:02d}')
        
        for i, batch in enumerate(batch_iterator):
            model.train()
            
            # Loading input data and masks onto the GPU
            encoder_input = batch['encoder_input'].to(device)
            decoder_input = batch['decoder_input'].to(device)
            encoder_mask = batch['encoder_mask'].to(device)
            decoder_mask = batch['decoder_mask'].to(device)
            
            # Running tensors through the transformer
            encoder_output = model.encode(encoder_input, encoder_mask)
            decoder_output = model.decode(encoder_output, encoder_mask, decoder_input, decoder_mask)
            proj_output = model.project(decoder_output)
            
            # Loading the target labels onto the GPU
            label = batch['label'].to(device)
            
            # Computing loss between model's output and true labels
            loss = loss_fn(proj_output.view(-1, tokenizer_tgt.get_vocab_size()), label.view(-1))
            
            # Updating progress bar
            batch_iterator.set_postfix({f'loss': f'{loss.item():6.3f}'})
            
            writer.add_scalar('train loss', loss.item(), global_step)
            writer.flush()
            
            # Performing backpropagation
            loss.backward()
            
            optimizer.step()
            
            # Clearing the gradients to prepare for the next batch
            optimizer.zero_grad()
            
            global_step += 1  # Updating global step count
            
            # Display results every 100 iterations
            if global_step % 100 == 0:
                print(f'Iteration {global_step}: loss = {loss.item():6.3f}')
                writer.add_scalar('iteration loss', loss.item(), global_step)
                writer.flush()
        
        # Running validation at the end of each epoch
        run_validation(model, val_dataloader, tokenizer_src, tokenizer_tgt, config['seq_len'], device, lambda msg: batch_iterator.write(msg), global_step, writer)
        
        # Save model after every epoch
        model_filename = get_weights_file_path(config, f'epoch_{epoch+1}')
        
        # Writing current model state to the model_filename
        torch.save({
            'epoch': epoch,  # Current epoch
            'model_state_dict': model.state_dict(),  # Current model state
            'optimizer_state_dict': optimizer.state_dict(),  # Current optimizer state
            'global_step': global_step  # Current global step
        }, model_filename)
        
        print(f'Saved model for epoch {epoch+1}: {model_filename}')
        
        # Delete the model from the previous epoch
        if previous_model_filename and os.path.exists(previous_model_filename):
            os.remove(previous_model_filename)
            print(f'Deleted previous model: {previous_model_filename}')
        
        # Update the previous model filename to the current one
        previous_model_filename = model_filename

    print("\nTraining complete. Calculating BLEU score on validation data...")
    avg_bleu_score = calculate_bleu_for_validation(model, val_dataloader, tokenizer_src, tokenizer_tgt, config['seq_len'], device)
    print(f"\nFinal Average BLEU score on validation data: {avg_bleu_score:.4f}")



MAx Length of source Sentence: 14
MAx Length of target Sentence: 11


In [115]:
import torch
torch.cuda.empty_cache()


In [120]:
import warnings
if __name__=='__main__':
    warnings.filterwarnings('ignore')
    config=get_config() #retrieving config settings
    train_model(config) # training model with config arguments

Using device cuda
MAx Length of source Sentence: 14
MAx Length of target Sentence: 11
torch.Size([1, 150, 512])
torch.Size([1, 150, 512])
Preloading model weights/tmodel_epoch_3.pt


Processing epoch 03: 100%|██████████| 23/23 [00:03<00:00,  6.06it/s, loss=6.189]


--------------------------------------------------------------------------------
SOURCE: Nike was (Pokhardande brother.
TARGET: नाइके थिए (पोखरडाँडे दाइ।
PREDICTED: 
--------------------------------------------------------------------------------
SOURCE: How 0?
TARGET: ० कसरी?
PREDICTED: 
Saved model for epoch 4: weights/tmodel_epoch_4.pt


Processing epoch 04:  35%|███▍      | 8/23 [00:01<00:02,  5.67it/s, loss=5.334]

Iteration 100: loss =  5.653


Processing epoch 04: 100%|██████████| 23/23 [00:03<00:00,  6.41it/s, loss=5.552]


--------------------------------------------------------------------------------
SOURCE: Again, only the voice goes to the FM.
TARGET: फेरि एफएममा आवाज मात्र जान्छ।
PREDICTED: ,
--------------------------------------------------------------------------------
SOURCE: Nike was (Pokhardande brother.
TARGET: नाइके थिए (पोखरडाँडे दाइ।
PREDICTED: यो
Saved model for epoch 5: weights/tmodel_epoch_5.pt
Deleted previous model: weights/tmodel_epoch_4.pt


Processing epoch 05: 100%|██████████| 23/23 [00:03<00:00,  6.38it/s, loss=5.738]


--------------------------------------------------------------------------------
SOURCE: Even so far, our politics is running in the old style.
TARGET: अहिलेसम्म पनि हाम्रो राजनीति पुरानै शैलीमा चलिरहेको छ।
PREDICTED: यो यो ।
--------------------------------------------------------------------------------
SOURCE: And, no one has enough documents to open the identity.
TARGET: अनि, कसैसँग पनि परिचय खुल्ने पर्याप्त कागजात हुँदैनन्।
PREDICTED: यो यो ।
Saved model for epoch 6: weights/tmodel_epoch_6.pt
Deleted previous model: weights/tmodel_epoch_5.pt


Processing epoch 06: 100%|██████████| 23/23 [00:03<00:00,  6.38it/s, loss=5.506]


--------------------------------------------------------------------------------
SOURCE: This place has 191 houses.
TARGET: यो ठाउँमा ८९६ वटा घर छन्।
PREDICTED: यो यो यो छ ।
--------------------------------------------------------------------------------
SOURCE: What is fun.
TARGET: क्या मज्जाको छ।
PREDICTED: यो यो छ
Saved model for epoch 7: weights/tmodel_epoch_7.pt
Deleted previous model: weights/tmodel_epoch_6.pt


Processing epoch 07: 100%|██████████| 23/23 [00:03<00:00,  6.34it/s, loss=4.202]


--------------------------------------------------------------------------------
SOURCE: This place has 191 houses.
TARGET: यो ठाउँमा ८९६ वटा घर छन्।
PREDICTED: यो , ।
--------------------------------------------------------------------------------
SOURCE: How 0?
TARGET: ० कसरी?
PREDICTED: अनि ?
Saved model for epoch 8: weights/tmodel_epoch_8.pt
Deleted previous model: weights/tmodel_epoch_7.pt


Processing epoch 08:  74%|███████▍  | 17/23 [00:02<00:00,  6.25it/s, loss=3.850]

Iteration 200: loss =  3.871


Processing epoch 08: 100%|██████████| 23/23 [00:03<00:00,  6.36it/s, loss=4.204]


--------------------------------------------------------------------------------
SOURCE: Nike was (Pokhardande brother.
TARGET: नाइके थिए (पोखरडाँडे दाइ।
PREDICTED: यसको पहिलो भएको थियो ।
--------------------------------------------------------------------------------
SOURCE: 102. Strike to resign
TARGET: १०२. राजीनामा दिएर गरिने हडताल
PREDICTED: यसको अदालत अदालत अदालत ।
Saved model for epoch 9: weights/tmodel_epoch_9.pt
Deleted previous model: weights/tmodel_epoch_8.pt


Processing epoch 09: 100%|██████████| 23/23 [00:03<00:00,  6.36it/s, loss=3.093]


--------------------------------------------------------------------------------
SOURCE: Again, only the voice goes to the FM.
TARGET: फेरि एफएममा आवाज मात्र जान्छ।
PREDICTED: ऐनमा , त्यति ।
--------------------------------------------------------------------------------
SOURCE: And, no one has enough documents to open the identity.
TARGET: अनि, कसैसँग पनि परिचय खुल्ने पर्याप्त कागजात हुँदैनन्।
PREDICTED: ऐनमा , आफन्छहरू ।
Saved model for epoch 10: weights/tmodel_epoch_10.pt
Deleted previous model: weights/tmodel_epoch_9.pt


Processing epoch 10: 100%|██████████| 23/23 [00:03<00:00,  6.29it/s, loss=3.175]


--------------------------------------------------------------------------------
SOURCE: Even so far, our politics is running in the old style.
TARGET: अहिलेसम्म पनि हाम्रो राजनीति पुरानै शैलीमा चलिरहेको छ।
PREDICTED: यो सङ्ग्रहालयको मात्र छ
--------------------------------------------------------------------------------
SOURCE: And, no one has enough documents to open the identity.
TARGET: अनि, कसैसँग पनि परिचय खुल्ने पर्याप्त कागजात हुँदैनन्।
PREDICTED: यो गोंगबु मात्र छ
Saved model for epoch 11: weights/tmodel_epoch_11.pt
Deleted previous model: weights/tmodel_epoch_10.pt


Processing epoch 11: 100%|██████████| 23/23 [00:03<00:00,  6.35it/s, loss=2.172]


--------------------------------------------------------------------------------
SOURCE: Tasili An Azer is located in the mixture of Algeria
TARGET: तससिली एन अज्जेर अल्जेरियाको मिश्रितमा अवस्थित छ
PREDICTED: अहिलेको चीन गोंगबु समस्या अझ बढी हुन्छ
--------------------------------------------------------------------------------
SOURCE: Even so far, our politics is running in the old style.
TARGET: अहिलेसम्म पनि हाम्रो राजनीति पुरानै शैलीमा चलिरहेको छ।
PREDICTED: यो सङ्ग्रहालयको स्थापना . २०५२ काम छ ।
Saved model for epoch 12: weights/tmodel_epoch_12.pt
Deleted previous model: weights/tmodel_epoch_11.pt


Processing epoch 12: 100%|██████████| 23/23 [00:03<00:00,  6.30it/s, loss=2.428]


--------------------------------------------------------------------------------
SOURCE: How 0?
TARGET: ० कसरी?
PREDICTED: कति मान्छे ?
--------------------------------------------------------------------------------
SOURCE: This place has 191 houses.
TARGET: यो ठाउँमा ८९६ वटा घर छन्।
PREDICTED: यो यथार्थ होइन ।
Saved model for epoch 13: weights/tmodel_epoch_13.pt
Deleted previous model: weights/tmodel_epoch_12.pt


Processing epoch 13:   9%|▊         | 2/23 [00:00<00:03,  6.61it/s, loss=1.901]

Iteration 300: loss =  2.338


Processing epoch 13: 100%|██████████| 23/23 [00:03<00:00,  6.27it/s, loss=2.317]


--------------------------------------------------------------------------------
SOURCE: 102. Strike to resign
TARGET: १०२. राजीनामा दिएर गरिने हडताल
PREDICTED: कुनै ।
--------------------------------------------------------------------------------
SOURCE: And, no one has enough documents to open the identity.
TARGET: अनि, कसैसँग पनि परिचय खुल्ने पर्याप्त कागजात हुँदैनन्।
PREDICTED: तर , परिस्थिति अलि भिन्न ।
Saved model for epoch 14: weights/tmodel_epoch_14.pt
Deleted previous model: weights/tmodel_epoch_13.pt


Processing epoch 14: 100%|██████████| 23/23 [00:03<00:00,  6.36it/s, loss=1.769]


--------------------------------------------------------------------------------
SOURCE: Nike was (Pokhardande brother.
TARGET: नाइके थिए (पोखरडाँडे दाइ।
PREDICTED: कुनै साँझ् खाली जाँदैन ।
--------------------------------------------------------------------------------
SOURCE: And, no one has enough documents to open the identity.
TARGET: अनि, कसैसँग पनि परिचय खुल्ने पर्याप्त कागजात हुँदैनन्।
PREDICTED: कुनै अपरिचित विदेशी धुनले वातावरण ।
Saved model for epoch 15: weights/tmodel_epoch_15.pt
Deleted previous model: weights/tmodel_epoch_14.pt


Processing epoch 15: 100%|██████████| 23/23 [00:03<00:00,  6.34it/s, loss=2.051]


--------------------------------------------------------------------------------
SOURCE: What is fun.
TARGET: क्या मज्जाको छ।
PREDICTED: यो यथार्थ होइन ।
--------------------------------------------------------------------------------
SOURCE: Nike was (Pokhardande brother.
TARGET: नाइके थिए (पोखरडाँडे दाइ।
PREDICTED: यो सङ्ग्रहालयको स्थापना वि . २०५२ ।
Saved model for epoch 16: weights/tmodel_epoch_16.pt
Deleted previous model: weights/tmodel_epoch_15.pt


Processing epoch 16: 100%|██████████| 23/23 [00:03<00:00,  6.37it/s, loss=1.604]


--------------------------------------------------------------------------------
SOURCE: What is fun.
TARGET: क्या मज्जाको छ।
PREDICTED: यो यथार्थ होइन ।
--------------------------------------------------------------------------------
SOURCE: How 0?
TARGET: ० कसरी?
PREDICTED: कति मान्छे मारामार ?
Saved model for epoch 17: weights/tmodel_epoch_17.pt
Deleted previous model: weights/tmodel_epoch_16.pt


Processing epoch 17:  43%|████▎     | 10/23 [00:01<00:02,  6.26it/s, loss=1.429]

Iteration 400: loss =  1.470


Processing epoch 17: 100%|██████████| 23/23 [00:03<00:00,  6.24it/s, loss=1.528]


--------------------------------------------------------------------------------
SOURCE: Nike was (Pokhardande brother.
TARGET: नाइके थिए (पोखरडाँडे दाइ।
PREDICTED: यो सङ्ग्रहालयको स्थापना वि . स . २०५२ सालमा भएको थियो
--------------------------------------------------------------------------------
SOURCE: Tasili An Azer is located in the mixture of Algeria
TARGET: तससिली एन अज्जेर अल्जेरियाको मिश्रितमा अवस्थित छ
PREDICTED: ऐनमा यस्ता कुरा समेटिनेछन्
Saved model for epoch 18: weights/tmodel_epoch_18.pt
Deleted previous model: weights/tmodel_epoch_17.pt


Processing epoch 18: 100%|██████████| 23/23 [00:03<00:00,  6.34it/s, loss=1.341]


--------------------------------------------------------------------------------
SOURCE: Even so far, our politics is running in the old style.
TARGET: अहिलेसम्म पनि हाम्रो राजनीति पुरानै शैलीमा चलिरहेको छ।
PREDICTED: यो सङ्ग्रहालयको स्थापना वि . २०५२ सालमा भएको थियो
--------------------------------------------------------------------------------
SOURCE: Nike was (Pokhardande brother.
TARGET: नाइके थिए (पोखरडाँडे दाइ।
PREDICTED: यो वास्तविकता हो ।
Saved model for epoch 19: weights/tmodel_epoch_19.pt
Deleted previous model: weights/tmodel_epoch_18.pt


Processing epoch 19: 100%|██████████| 23/23 [00:03<00:00,  6.33it/s, loss=1.376]


--------------------------------------------------------------------------------
SOURCE: 102. Strike to resign
TARGET: १०२. राजीनामा दिएर गरिने हडताल
PREDICTED: कुनै साँझ् खाली जाँदैन ।
--------------------------------------------------------------------------------
SOURCE: And, no one has enough documents to open the identity.
TARGET: अनि, कसैसँग पनि परिचय खुल्ने पर्याप्त कागजात हुँदैनन्।
PREDICTED: कुनै अपरिचित विदेशी धुनले वातावरण मुखर थियो ।
Saved model for epoch 20: weights/tmodel_epoch_20.pt
Deleted previous model: weights/tmodel_epoch_19.pt


Processing epoch 20: 100%|██████████| 23/23 [00:03<00:00,  6.16it/s, loss=1.513]


--------------------------------------------------------------------------------
SOURCE: Again, only the voice goes to the FM.
TARGET: फेरि एफएममा आवाज मात्र जान्छ।
PREDICTED: कुनै साँझ् खाली जाँदैन ।
--------------------------------------------------------------------------------
SOURCE: What is fun.
TARGET: क्या मज्जाको छ।
PREDICTED: बंगलादेशमा रमाइलो हुने पक्कापक्की भयो ।
Saved model for epoch 21: weights/tmodel_epoch_21.pt
Deleted previous model: weights/tmodel_epoch_20.pt


Processing epoch 21:  78%|███████▊  | 18/23 [00:02<00:00,  6.07it/s, loss=1.309]

Iteration 500: loss =  1.276


Processing epoch 21: 100%|██████████| 23/23 [00:03<00:00,  6.19it/s, loss=1.278]


--------------------------------------------------------------------------------
SOURCE: Again, only the voice goes to the FM.
TARGET: फेरि एफएममा आवाज मात्र जान्छ।
PREDICTED: कुनै साँझ् खाली जाँदैन ।
--------------------------------------------------------------------------------
SOURCE: What is fun.
TARGET: क्या मज्जाको छ।
PREDICTED: यो यथार्थ ।
Saved model for epoch 22: weights/tmodel_epoch_22.pt
Deleted previous model: weights/tmodel_epoch_21.pt


Processing epoch 22: 100%|██████████| 23/23 [00:03<00:00,  6.29it/s, loss=1.255]


--------------------------------------------------------------------------------
SOURCE: And, no one has enough documents to open the identity.
TARGET: अनि, कसैसँग पनि परिचय खुल्ने पर्याप्त कागजात हुँदैनन्।
PREDICTED: मसला लस्सादार भएमा इँट लगाउँने काम सजिलो हुन्छ ।
--------------------------------------------------------------------------------
SOURCE: This place has 191 houses.
TARGET: यो ठाउँमा ८९६ वटा घर छन्।
PREDICTED: यो पत्रिकाको सम्पादन कार्य लक्ष्मण ' ।
Saved model for epoch 23: weights/tmodel_epoch_23.pt
Deleted previous model: weights/tmodel_epoch_22.pt


Processing epoch 23: 100%|██████████| 23/23 [00:03<00:00,  6.14it/s, loss=1.136]


--------------------------------------------------------------------------------
SOURCE: 102. Strike to resign
TARGET: १०२. राजीनामा दिएर गरिने हडताल
PREDICTED: यो सङ्ग्रहालयको स्थापना वि . २०५२ सालमा ।
--------------------------------------------------------------------------------
SOURCE: Nike was (Pokhardande brother.
TARGET: नाइके थिए (पोखरडाँडे दाइ।
PREDICTED: यो वास्तविकता हो भनी स्वीकार गरें ।
Saved model for epoch 24: weights/tmodel_epoch_24.pt
Deleted previous model: weights/tmodel_epoch_23.pt


Processing epoch 24: 100%|██████████| 23/23 [00:03<00:00,  6.32it/s, loss=1.174]


--------------------------------------------------------------------------------
SOURCE: How 0?
TARGET: ० कसरी?
PREDICTED: कति मान्छे मारामार गर्ने ?
--------------------------------------------------------------------------------
SOURCE: What is fun.
TARGET: क्या मज्जाको छ।
PREDICTED: यो यथार्थ होइन ।
Saved model for epoch 25: weights/tmodel_epoch_25.pt
Deleted previous model: weights/tmodel_epoch_24.pt


Processing epoch 25: 100%|██████████| 23/23 [00:03<00:00,  6.21it/s, loss=1.147]


--------------------------------------------------------------------------------
SOURCE: And, no one has enough documents to open the identity.
TARGET: अनि, कसैसँग पनि परिचय खुल्ने पर्याप्त कागजात हुँदैनन्।
PREDICTED: कुनै अपरिचित विदेशी धुनले वातावरण मुखर थियो ।
--------------------------------------------------------------------------------
SOURCE: How 0?
TARGET: ० कसरी?
PREDICTED: कति मान्छे मारामार गर्ने ?
Saved model for epoch 26: weights/tmodel_epoch_26.pt
Deleted previous model: weights/tmodel_epoch_25.pt


Processing epoch 26:  13%|█▎        | 3/23 [00:00<00:03,  6.56it/s, loss=1.169]

Iteration 600: loss =  1.194


Processing epoch 26: 100%|██████████| 23/23 [00:03<00:00,  6.24it/s, loss=1.145]


--------------------------------------------------------------------------------
SOURCE: Tasili An Azer is located in the mixture of Algeria
TARGET: तससिली एन अज्जेर अल्जेरियाको मिश्रितमा अवस्थित छ
PREDICTED: षडानन्द नगरपालिकाको केन्द्रभने दिङ्ला बजार रहको छ
--------------------------------------------------------------------------------
SOURCE: Nike was (Pokhardande brother.
TARGET: नाइके थिए (पोखरडाँडे दाइ।
PREDICTED: यो वास्तविकता हो भनी स्वीकार गरें ।
Saved model for epoch 27: weights/tmodel_epoch_27.pt
Deleted previous model: weights/tmodel_epoch_26.pt


Processing epoch 27: 100%|██████████| 23/23 [00:03<00:00,  6.18it/s, loss=1.158]


--------------------------------------------------------------------------------
SOURCE: Again, only the voice goes to the FM.
TARGET: फेरि एफएममा आवाज मात्र जान्छ।
PREDICTED: कुनै साँझ् खाली जाँदैन ।
--------------------------------------------------------------------------------
SOURCE: 102. Strike to resign
TARGET: १०२. राजीनामा दिएर गरिने हडताल
PREDICTED: यो सङ्ग्रहालयको स्थापना ।


In [117]:

%load_ext tensorboard
%reload_ext tensorboard

# Step 2: Start TensorBoard in the notebook and point it to the log directory
%tensorboard --logdir runs/tmodel


The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard


Reusing TensorBoard on port 6007 (pid 59779), started 0:39:11 ago. (Use '!kill 59779' to kill it.)

In [59]:


# This function will iterate over the validation data and compute BLEU score


In [None]:
list1=[1,2,3]
list2=[3,4,5]
zipped=zip(list1,list2)
print(list(zipped))

[(1, 3), (2, 4), (3, 5)]


In [None]:
# import torch
# import torch.nn as nn
# import math

# class MultiHeadAttention(nn.Module):
#     def __init__(self, d_model: int, num_heads: int, dropout: float):
#         super().__init__()
#         self.d_model = d_model
#         self.num_heads = num_heads
#         assert d_model % num_heads == 0, 'Dimension of model should be divisible by the number of heads'
#         self.d_k = d_model // num_heads

#         # Linear layers for query, key, value, and the output
#         self.w_q = nn.Linear(d_model, d_model)  # Weighted query
#         self.w_k = nn.Linear(d_model, d_model)  # Weighted key
#         self.w_v = nn.Linear(d_model, d_model)  # Weighted value
#         self.w_o = nn.Linear(d_model, d_model)  # Output projection layer
#         self.dropout = nn.Dropout(dropout)  # Dropout layer
    
#     @staticmethod
#     def attention(query, key, value, mask=None, dropout=None):
#         d_k = query.shape[-1]
#         attention_scores = (query @ key.transpose(-2, -1)) / math.sqrt(d_k)  # Scaled dot-product attention

#         # Mask needs to match the size of the attention_scores
#         if mask is not None:
#             attention_scores = attention_scores.masked_fill(mask == 0, -1e9)  # Mask out padded positions
        
#         attention_scores = attention_scores.softmax(dim=-1)  # Apply softmax over the last dimension
        
#         if dropout is not None:
#             attention_scores = dropout(attention_scores)
        
#         return attention_scores @ value, attention_scores

#     def forward(self, q, k, v, mask=None):
#         query = self.w_q(q)
#         key = self.w_k(k)
#         value = self.w_v(v)
        
#         # Split into num_heads and reshape
#         query = query.view(query.shape[0], query.shape[1], self.num_heads, self.d_k).transpose(1, 2)
#         key = key.view(key.shape[0], key.shape[1], self.num_heads, self.d_k).transpose(1, 2)
#         value = value.view(value.shape[0], value.shape[1], self.num_heads, self.d_k).transpose(1, 2)
        
#         # Expand the mask to [batch_size, 1, 1, sequence_length] so it can be broadcasted
#         if mask is not None:
#             mask = mask.unsqueeze(1).unsqueeze(2)  # Add dimensions for num_heads and sequence_length
        
#         # Obtain the output and attention scores
#         x, attention_scores = MultiHeadAttention.attention(query, key, value, mask, self.dropout)
        
#         # Concatenate and project back to d_model
#         x = x.transpose(1, 2).contiguous().view(x.shape[0], -1, self.num_heads * self.d_k)
        
#         # Pass through the output linear layer
#         return self.w_o(x)


In [None]:
# Define some dummy inputs
batch_size = 2
sequence_length = 200
d_model = 512
num_heads = 8
dropout = 0.1

# Create random tensors for q, k, v with shape [batch_size, sequence_length, d_model]
q = torch.randn(batch_size, sequence_length, d_model)
k = torch.randn(batch_size, sequence_length, d_model)
v = torch.randn(batch_size, sequence_length, d_model)

# Optionally, create a mask (shape: [batch_size, sequence_length])
mask = torch.ones(batch_size, sequence_length).bool()  # No masking here, all ones

# Initialize the multi-head attention module
mha = MultiHeadAttention(d_model=d_model, num_heads=num_heads, dropout=dropout)

# Forward pass through the module
# output = mha(q, k, v, mask)

# # Print the output shape and output tensor
# print("Output shape:", output.shape)  # Should print torch.Size([2, 5, 8])
# print("Output tensor:", output)


In [None]:
# mha.forward(q,k,v,mask)

RuntimeError: The size of tensor a (200) must match the size of tensor b (2) at non-singleton dimension 2

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f'Using device: {device}')

# Get model
model = get_model(config, tokenizer_src.get_vocab_size(), tokenizer_tgt.get_vocab_size())

# Check if model is a valid PyTorch module
print(type(model))  # Ensure it's <class 'torch.nn.Module'>

# Move model to device
model = model.to(device)


Using device: cpu


NameError: name 'tokenizer_src' is not defined

In [None]:
# x=torch.randint(0,3,(2,200,512),dtype=float)

In [None]:
# ffn = FeedForwardNeuralNetwork(512, 2048, 0.2)

# # Input tensor x with shape [batch_size, sequence_length, d_model]
# x = torch.randint(0, 3, (2, 200, 512), dtype=torch.float)  # Use dtype=torch.float
# output = ffn.forward(x)
# # print(output.shape) 

torch.Size([2, 200, 512])


In [None]:
# x

tensor([[-0.7426, -0.1935, -0.9282],
        [-0.9697, -1.4313,  0.5497]])

In [None]:
# x = torch.randint(0, vocab_size, (batch_size, sequence_length)).float()


In [None]:
# import torch

# pop = torch.tensor([
#     [1, 2, 3, 4, 5, 6],
#     [7, 8, 9, 10, 11, 12],
#     [13, 14, 15, 16, 17, 18],
#     [19, 20, 21, 22, 23, 24]
# ])


In [None]:
# pop[:,1::2]

tensor([[ 2,  4,  6],
        [ 8, 10, 12],
        [14, 16, 18],
        [20, 22, 24]])