In [1]:
import os

In [6]:
import torch
import torch.nn as nn
import math
from typing import Any
# When a CUDA device is visible, ask cuDNN to use deterministic algorithms.
if torch.cuda.is_available():
    torch.backends.cudnn.deterministic=True
from translator.logging import logger

In [16]:
from translator.constants import *
from translator.utils.common import read_yaml,create_directories

In [3]:
# Sanity check: report whether PyTorch can see a CUDA device in this kernel.
print(torch.cuda.is_available())

True


In [23]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class ModelTrainingConfig:
    model_path:Path
    experiment_path:Path
    max_seq_len:int
    d_model:int
    lr:float
    preload:str|None=None
    model_basename:str
    num_epochs:int






In [25]:
class ConfigurationManager:
    """Reads the YAML config/params files and builds per-stage config objects."""

    def __init__(self,config_file_path=CONFIG_FILE_PATH,params_file_path=PARAMS_FILE_PATH):
        """Load both YAML files and create the artifacts root directory.

        Args:
            config_file_path: location of the main configuration YAML.
            params_file_path: location of the hyper-parameter YAML.
        """
        # presumably read_yaml returns an attribute-accessible mapping (it is
        # used as `self.config.artifacts_root` below) — confirm in utils.common
        self.config=read_yaml(config_file_path)
        self.params=read_yaml(params_file_path)
        create_directories([self.config.artifacts_root])

    def get_model_training_config(self)-> ModelTrainingConfig:
        """Build the training configuration from `data_training` / `modeltrainer`.

        Returns:
            ModelTrainingConfig populated from the two YAML sections.
        """
        config=self.config.data_training
        params=self.params.modeltrainer
        # The stage root is created here but not forwarded: the trainer never
        # reads it, and passing an undeclared keyword to the dataclass raises
        # TypeError.
        create_directories([config.root_dir])
        return ModelTrainingConfig(
            # The trainer calls .mkdir() on both paths, so hand over Path objects
            # instead of the raw YAML values.
            model_path=Path(config.model_path),
            # NOTE(review): the key is spelled `experimnet_path`; presumably it
            # matches the YAML file — confirm before renaming either side.
            experiment_path=Path(config.experimnet_path),
            max_seq_len=params.max_seq_len,
            d_model=params.d_model,
            lr=params.learning_rate,
            preload=config.preload,
            model_basename=config.model_basename,
            num_epochs=params.num_epochs,
        )

In [5]:
#Model 
# embeddings

class Inputembedding(nn.Module):
    """Token-id to vector lookup, scaled by sqrt(d_model)."""

    def __init__(self,d_model:int,vocab_size:int):
        """
        Args:
            d_model: width of each embedding vector.
            vocab_size: number of rows in the embedding table.
        """
        super().__init__()
        self.d_model=d_model
        self.vocab_size=vocab_size
        self.embedding=nn.Embedding(num_embeddings=vocab_size,embedding_dim=d_model)

    def forward(self,x):
        """Look up the ids in `x` and multiply the vectors by sqrt(d_model)."""
        token_vectors=self.embedding(x)
        scale=math.sqrt(self.d_model)
        return token_vectors*scale

class PositionalEncoding(nn.Module):
    """Adds fixed sinusoidal position information to embeddings, then dropout."""

    def __init__(self,d_model:int,max_seq_len:int,dropout:float):
        """
        Args:
            d_model: width of each position vector (odd widths are supported).
            max_seq_len: number of positions pre-computed in the table.
            dropout: dropout probability applied after the addition.
        """
        super().__init__()
        self.d_model=d_model
        self.max_seq_len=max_seq_len
        self.dropout=nn.Dropout(dropout)
        # table of shape (max_seq_len, d_model), filled below
        pe=torch.zeros(max_seq_len,d_model)
        # column vector of positions: (max_seq_len, 1)
        position=torch.arange(0,max_seq_len,dtype=torch.float).unsqueeze(1)
        # one frequency per even column index: 10000^(-2i/d_model)
        dividend_term=torch.exp(torch.arange(0,d_model,2).float()*(-math.log(10000)/d_model))
        angles=position*dividend_term
        # sine on even columns
        pe[:,0::2]=torch.sin(angles)
        # cosine on odd columns; for odd d_model there is one fewer odd column
        # than even columns, so drop the last frequency to keep shapes aligned
        pe[:,1::2]=torch.cos(angles[:,:d_model//2])
        # leading axis of size 1 so the table broadcasts over the batch
        pe=pe.unsqueeze(0)
        # buffer: follows the module across devices and is stored in
        # state_dict, but is not a trainable parameter
        self.register_buffer("pe",pe)

    def forward(self,x):
        """Add the first `x.shape[1]` position vectors to `x` and apply dropout."""
        x=x+self.pe[:,:x.shape[1],:]
        return self.dropout(x)
class LayerNormalization(nn.Module):
    """Normalizes over the last axis with one learnable scalar gain and bias."""

    def __init__(self,eps:float=10**-6):
        """
        Args:
            eps: constant added to the standard deviation to avoid division by zero.
        """
        super().__init__()
        self.eps=eps
        self.alpha=nn.Parameter(torch.ones(1))   # multiplicative gain
        self.bias=nn.Parameter(torch.zeros(1))   # additive shift

    def forward(self,x):
        """Return alpha * (x - mean) / (std + eps) + bias along the last axis."""
        mu=x.mean(dim=-1,keepdim=True)
        sigma=x.std(dim=-1,keepdim=True)
        # same evaluation order as a single left-to-right expression
        scaled=self.alpha*(x-mu)
        return scaled/(sigma+self.eps)+self.bias

class FeedForwardNeuralNetwork(nn.Module):
    """Two linear layers with ReLU and dropout in between."""

    def __init__(self,d_model:int,d_ff:int,dropout:float):
        """
        Args:
            d_model: input and output width.
            d_ff: width of the hidden layer.
            dropout: dropout probability applied to the hidden activations.
        """
        super().__init__()
        self.firstlayer=nn.Linear(in_features=d_model,out_features=d_ff)
        self.dropout=nn.Dropout(dropout)
        self.secondlayer=nn.Linear(in_features=d_ff,out_features=d_model)

    def forward(self,x):
        """Apply firstlayer -> ReLU -> dropout -> secondlayer."""
        hidden=torch.relu(self.firstlayer(x))
        hidden=self.dropout(hidden)
        return self.secondlayer(hidden)

class MultiHeadAttention(nn.Module):
    """Multi-head scaled dot-product attention with an output projection."""

    def __init__(self,d_model:int,num_heads:int,dropout:float):
        """
        Args:
            d_model: model width; must be divisible by `num_heads`.
            num_heads: number of attention heads.
            dropout: dropout probability applied to the attention weights.
        """
        super().__init__()
        self.d_model=d_model
        self.num_heads=num_heads
        assert d_model%num_heads==0,'Dimension of model should be divisible by no of heads'
        self.d_k=d_model//num_heads   # width handled by each head

        # projection matrices
        self.w_q=nn.Linear(d_model,d_model)  # query projection
        self.w_k=nn.Linear(d_model,d_model)  # key projection
        self.w_v=nn.Linear(d_model,d_model)  # value projection
        self.w_o=nn.Linear(d_model,d_model)  # projection of the concatenated heads
        self.dropout=nn.Dropout(dropout)     # applied to the attention weights

    @staticmethod
    def attention(query,key,value,mask,dropout=None):
        """Scaled dot-product attention.

        Args:
            query, key, value: tensors whose last axis is the per-head width.
            mask: optional tensor broadcastable to the score matrix; positions
                where `mask == 0` are excluded from the softmax.
            dropout: optional dropout *module instance* applied to the weights.
                Defaults to None (no dropout). The previous default was the
                `nn.Dropout` class itself, which crashed when it was called
                with the score tensor.

        Returns:
            (weighted values, attention weights)
        """
        d_k=query.shape[-1]
        attention_scores=(query@key.transpose(-2,-1))/math.sqrt(d_k)
        if mask is not None:
            # in-place: a large negative score becomes ~0 after softmax
            attention_scores.masked_fill_(mask==0,-1e9)
        attention_scores=attention_scores.softmax(dim=-1)  # normalize over the key axis
        if dropout is not None:
            attention_scores=dropout(attention_scores)
        return (attention_scores@value),attention_scores

    def forward(self,q,k,v,mask):
        """Project q/k/v, attend per head, merge the heads and project the result.

        The attention weights of the last call are kept on
        `self.attention_scores`.
        """
        query=self.w_q(q)
        key=self.w_k(k)
        value=self.w_v(v)
        # (batch, seq, d_model) -> (batch, seq, heads, d_k) -> (batch, heads, seq, d_k)
        # so that every head attends independently over the sequence
        query=query.view(query.shape[0],query.shape[1],self.num_heads,self.d_k).transpose(1,2)
        key=key.view(key.shape[0],key.shape[1],self.num_heads,self.d_k).transpose(1,2)
        value=value.view(value.shape[0],value.shape[1],self.num_heads,self.d_k).transpose(1,2)
        # weighted values and the attention weights
        x,self.attention_scores=MultiHeadAttention.attention(query,key,value,mask,self.dropout)
        # concatenate the heads: (batch, heads, seq, d_k) -> (batch, seq, heads*d_k)
        x=x.transpose(1,2).contiguous().view(x.shape[0],-1,self.num_heads*self.d_k)
        return self.w_o(x)
    
class ResidualConnection(nn.Module):
    """Pre-norm skip connection: x + dropout(sublayer(norm(x)))."""

    def __init__(self, dropout: float) -> None:
        """
        Args:
            dropout: dropout probability applied to the sublayer output.
        """
        super().__init__()
        self.dropout=nn.Dropout(dropout)   # regularizes the sublayer branch
        self.norm=LayerNormalization()     # applied before the sublayer

    def forward(self, x, sublayer):
        """Normalize `x`, run `sublayer` on it, apply dropout and add `x` back."""
        normalized=self.norm(x)
        branch=self.dropout(sublayer(normalized))
        return x+branch

# Building Encoder Block
class EncoderBlock(nn.Module):
    """One encoder layer: self-attention then feed-forward, each in a residual wrapper."""

    def __init__(self,self_attention_block:MultiHeadAttention,ffn:FeedForwardNeuralNetwork,dropout:float):
        """
        Args:
            self_attention_block: attention module used with q = k = v.
            ffn: position-wise feed-forward module.
            dropout: dropout probability for both residual wrappers.
        """
        super().__init__()
        self.self_attention_block=self_attention_block
        self.ffn=ffn
        self.residual_connections=nn.ModuleList([ResidualConnection(dropout) for _ in range(2)])

    def forward(self,x,src_mask):
        """Run self-attention (masked by `src_mask`) followed by the feed-forward net."""
        def attend(normed):
            # query, key and value are all the normalized input
            return self.self_attention_block(normed,normed,normed,src_mask)

        attention_residual,ffn_residual=self.residual_connections
        x=attention_residual(x,attend)
        return ffn_residual(x,self.ffn)

class Encoder(nn.Module):
    """Stack of encoder blocks followed by a final layer normalization."""

    def __init__(self, layers: nn.ModuleList)-> None:
        """
        Args:
            layers: the encoder blocks, applied in order.
        """
        super().__init__()
        self.layers=layers
        self.norm=LayerNormalization()   # normalizes the output of the last block

    def forward(self, x, mask):
        """Feed `x` through every block with the same `mask`, then normalize."""
        hidden=x
        for block in self.layers:
            hidden=block(hidden, mask)
        return self.norm(hidden)

# Decoder block: masked self-attention, cross-attention over the encoder output, then a feed-forward network
class DecoderBlock(nn.Module):
    """One decoder layer: self-attention, cross-attention, feed-forward."""

    def __init__(self,self_attention_block:MultiHeadAttention,crossattentionblock:MultiHeadAttention,ffn:FeedForwardNeuralNetwork,dropout:float):
        """
        Args:
            self_attention_block: attention over the decoder input itself.
            crossattentionblock: attention whose keys/values are the encoder output.
            ffn: position-wise feed-forward module.
            dropout: dropout probability for the three residual wrappers.
        """
        super().__init__()
        self.self_attention_block=self_attention_block
        self.cross_attention_block=crossattentionblock
        self.ffn=ffn
        self.residual_connection=nn.ModuleList([ResidualConnection(dropout) for _ in range(3)])

    def forward(self,x,encoderoutput,src_mask,tgt_mask):
        """Apply the three sublayers in order, each inside its residual wrapper."""
        def self_attend(normed):
            return self.self_attention_block(normed,normed,normed,tgt_mask)

        def cross_attend(normed):
            # queries come from the decoder, keys and values from the encoder
            return self.cross_attention_block(normed,encoderoutput,encoderoutput,src_mask)

        sublayers=(self_attend,cross_attend,self.ffn)
        for residual,sublayer in zip(self.residual_connection,sublayers):
            x=residual(x,sublayer)
        return x


class Decoder(nn.Module):
    """Stack of decoder blocks followed by a final layer normalization."""

    def __init__(self,layers:nn.ModuleList):
        """
        Args:
            layers: the decoder blocks, applied in order.
        """
        super().__init__()
        self.layers=layers
        self.norm=LayerNormalization()

    def forward(self,x,encoder_output,src_mask,tgt_mask):
        """Feed `x` through every block with the shared encoder output and masks."""
        hidden=x
        for block in self.layers:
            hidden=block(hidden,encoder_output,src_mask,tgt_mask)
        return self.norm(hidden)
# Projection layer

class ProjectionLayer(nn.Module):
    """Maps model-width vectors to log-probabilities over the vocabulary."""

    def __init__(self,d_model:int,vocab_size:int):
        """
        Args:
            d_model: width of the incoming vectors.
            vocab_size: number of output classes.
        """
        super().__init__()
        self.projection=nn.Linear(in_features=d_model,out_features=vocab_size)

    def forward(self,x):
        """Return log_softmax over the last axis of the projected input."""
        logits=self.projection(x)
        return torch.log_softmax(logits,dim=-1)

# The Transformer Architecture
# Contains all the Encoder Decoder Embeddings
class Transformer(nn.Module):
    """Bundles embeddings, positional encodings, encoder, decoder and projection."""

    def __init__(self,encoder:Encoder,decoder:Decoder,src_embeding:Inputembedding,tgt_embedding:Inputembedding,src_position:PositionalEncoding,tgt_position:PositionalEncoding,projection_layer:ProjectionLayer) -> None:
        """Store the already-constructed sub-modules; nothing is built here."""
        super().__init__()
        self.encoder=encoder
        self.decoder=decoder
        self.src_embedding=src_embeding
        self.tgt_embedding=tgt_embedding
        self.src_position=src_position
        self.tgt_position=tgt_position
        self.projection_layer=projection_layer

    def encode(self,source,src_mask):
        """Embed the source ids, add positions and run the encoder stack."""
        embedded=self.src_position(self.src_embedding(source))
        return self.encoder(embedded,src_mask)

    def decode(self,encoder_output,src_mask,target,tgt_mask):
        """Embed the target ids, add positions and run the decoder stack."""
        embedded=self.tgt_position(self.tgt_embedding(target))
        return self.decoder(embedded,encoder_output,src_mask,tgt_mask)

    def project(self,x):
        """Apply the projection layer to decoder output `x`."""
        return self.projection_layer(x)
def build_transformer(src_vocab_size:int,tgt_vocab_size:int,src_seq_len:int,tgt_seq_len:int,d_model:int=512,N:int=6,h:int=8,dropout:float=0.2,d_ff:int=2048) -> Transformer:
    """Assemble a Transformer and Xavier-initialize its weight matrices.

    Args:
        src_vocab_size / tgt_vocab_size: vocabulary sizes of the two languages.
        src_seq_len / tgt_seq_len: lengths of the positional-encoding tables.
        d_model: model width.
        N: number of encoder blocks and of decoder blocks.
        h: attention heads per attention module.
        dropout: dropout probability used throughout.
        d_ff: hidden width of the feed-forward networks.

    Returns:
        The assembled Transformer.
    """
    # embeddings first, then positional encodings
    src_embed=Inputembedding(d_model,src_vocab_size)
    tgt_embed=Inputembedding(d_model,tgt_vocab_size)
    src_pos=PositionalEncoding(d_model,src_seq_len,dropout)
    tgt_pos=PositionalEncoding(d_model,tgt_seq_len,dropout)

    # N encoder blocks: self-attention + feed-forward
    encoder_blocks=[
        EncoderBlock(
            MultiHeadAttention(d_model,h,dropout),
            FeedForwardNeuralNetwork(d_model,d_ff,dropout),
            dropout,
        )
        for _ in range(N)
    ]

    # N decoder blocks: self-attention + cross-attention + feed-forward
    decoder_blocks=[
        DecoderBlock(
            MultiHeadAttention(d_model,h,dropout),
            MultiHeadAttention(d_model,h,dropout),
            FeedForwardNeuralNetwork(d_model,d_ff,dropout),
            dropout,
        )
        for _ in range(N)
    ]

    transformer=Transformer(
        Encoder(nn.ModuleList(encoder_blocks)),
        Decoder(nn.ModuleList(decoder_blocks)),
        src_embed,
        tgt_embed,
        src_pos,
        tgt_pos,
        ProjectionLayer(d_model,tgt_vocab_size),
    )

    # re-initialize every parameter with more than one dimension
    for weight in filter(lambda tensor: tensor.dim()>1, transformer.parameters()):
        nn.init.xavier_uniform_(weight)

    return transformer



In [28]:
from translator.constants import *
from translator.utils.common import read_yaml,create_directories
from translator.pipeline.stage_03_datatransformation import DataTransformationPipeline
from torch.utils.tensorboard import SummaryWriter

from tqdm import tqdm

In [None]:
class train_model:
    """Training driver: builds the transformer, runs the epoch loop, checkpoints.

    NOTE(review): `build_transformer`, `run_validation`,
    `calculate_bleu_for_validation`, `DataTransformationPipeline`,
    `SummaryWriter`, `tqdm` and `logger` are looked up in the notebook
    namespace when `trainmodel` runs, so the cells defining/importing them
    must be executed first. `calculate_bleu_for_validation` is not defined in
    the visible part of the notebook — confirm it exists before a long run.
    """

    def __init__(self,config:"ModelTrainingConfig"):
        """
        Args:
            config: training settings (paths, sizes, learning rate, epochs).
        """
        self.config=config

    def get_weights_file_path(self,epoch:str):
        """Return the checkpoint path `./<model_path>/<model_basename><epoch>.pt` as str."""
        model_folder=self.config.model_path        # folder that holds the checkpoints
        model_basename=self.config.model_basename  # filename prefix
        model_filename=f'{model_basename}{epoch}.pt'
        return str(Path('.')/model_folder/model_filename)

    def get_model(self,vocab_src_len,vocab_tgt_len):
        """Build a transformer using `max_seq_len` for both sides and `d_model` from config."""
        return build_transformer(vocab_src_len, vocab_tgt_len, self.config.max_seq_len, self.config.max_seq_len, self.config.d_model)

    def trainmodel(self):
        """Train for `config.num_epochs` epochs, validating and checkpointing each epoch.

        When `config.preload` is set, model weights, optimizer state, epoch
        counter and global step are restored from that checkpoint first.
        """
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        logger.info(f"Using device{device}")
        # Path(...) so that plain-string paths coming from YAML also work
        Path(self.config.model_path).mkdir(parents=True,exist_ok=True)
        Path(self.config.experiment_path).mkdir(parents=True,exist_ok=True)
        # NOTE(review): called on the class, not an instance — assumes `main`
        # is a static/class method returning (train_dl, val_dl, tok_src, tok_tgt).
        train_dataloader,val_dataloader,tokenizer_src,tokenizer_tgt=DataTransformationPipeline.main()
        model = self.get_model(tokenizer_src.get_vocab_size(), tokenizer_tgt.get_vocab_size()).to(device)
        optimizer = torch.optim.Adam(model.parameters(), lr=self.config.lr, eps=1e-9)
        # Created unconditionally: the training loop logs to it whether or not
        # a checkpoint is preloaded.
        writer = SummaryWriter(str(self.config.experiment_path))
        initial_epoch = 0
        global_step = 0

        # Resume from a checkpoint when one is configured
        if self.config.preload:
            model_filename = self.get_weights_file_path(self.config.preload)
            logger.info(f"preloading model {model_filename}")
            print(f'Preloading model {model_filename}')
            # torch.load unpickles the file: only load checkpoints you produced.
            # map_location lets a GPU checkpoint be resumed on a CPU-only host.
            state = torch.load(model_filename, map_location=device)
            # Restore the weights as well as the optimizer; otherwise training
            # would resume the optimizer on a freshly initialized model.
            model.load_state_dict(state['model_state_dict'])
            optimizer.load_state_dict(state['optimizer_state_dict'])
            # continue with the epoch after the saved one
            initial_epoch = state['epoch'] + 1
            global_step = state['global_step']

        # Labels are target-language ids, so the padding id to ignore must come
        # from the target tokenizer.
        loss_fn = nn.CrossEntropyLoss(ignore_index=tokenizer_tgt.token_to_id('[PAD]'), label_smoothing=0.1).to(device)
        previous_model_filename = None  # last checkpoint written by this run

        try:
            for epoch in range(initial_epoch, self.config.num_epochs):
                # run_validation switches the model to eval mode, so re-enable
                # training mode at the start of every epoch
                model.train()
                batch_iterator = tqdm(train_dataloader, desc=f'Processing epoch {epoch:02d}')
                for batch in batch_iterator:
                    # move inputs and masks to the training device
                    encoder_input = batch['encoder_input'].to(device)
                    decoder_input = batch['decoder_input'].to(device)
                    encoder_mask = batch['encoder_mask'].to(device)
                    decoder_mask = batch['decoder_mask'].to(device)

                    # forward pass
                    encoder_output = model.encode(encoder_input, encoder_mask)
                    decoder_output = model.decode(encoder_output, encoder_mask, decoder_input, decoder_mask)
                    proj_output = model.project(decoder_output)

                    # flatten to (tokens, vocab) vs (tokens,) for the loss
                    label = batch['label'].to(device)
                    loss = loss_fn(proj_output.view(-1, tokenizer_tgt.get_vocab_size()), label.view(-1))

                    batch_iterator.set_postfix({'loss': f'{loss.item():6.3f}'})
                    writer.add_scalar('train loss', loss.item(), global_step)
                    writer.flush()

                    # backward pass and parameter update
                    loss.backward()
                    optimizer.step()
                    optimizer.zero_grad()
                    global_step += 1
                    if global_step % 100 == 0:
                        print(f'Iteration {global_step}: loss = {loss.item():6.3f}')
                        writer.add_scalar('iteration loss', loss.item(), global_step)
                        writer.flush()

                # print a few sample translations after every epoch
                run_validation(model, val_dataloader, tokenizer_src, tokenizer_tgt, self.config.max_seq_len, device, lambda msg: batch_iterator.write(msg), global_step, writer)

                # checkpoint after every epoch
                model_filename = self.get_weights_file_path(f'epoch_{epoch+1}')
                torch.save({
                    'epoch': epoch,
                    'model_state_dict': model.state_dict(),
                    'optimizer_state_dict': optimizer.state_dict(),
                    'global_step': global_step
                }, model_filename)
                print(f'Saved model for epoch {epoch+1}: {model_filename}')

                # keep only the newest checkpoint written by this run
                if previous_model_filename and os.path.exists(previous_model_filename):
                    os.remove(previous_model_filename)
                    print(f'Deleted previous model: {previous_model_filename}')
                previous_model_filename = model_filename

            print("\nTraining complete. Calculating BLEU score on validation data...")
            avg_bleu_score = calculate_bleu_for_validation(model, val_dataloader, tokenizer_src, tokenizer_tgt, self.config.max_seq_len, device)
            print(f"\nFinal Average BLEU score on validation data: {avg_bleu_score:.4f}")
        finally:
            # release the TensorBoard event file even if training is interrupted
            writer.close()
                
        



    

In [30]:
from translator.utils.common import casual_mask

ImportError: cannot import name 'casual_mask' from 'translator.utils.common' (/media/puzan/NewVolume/MyFolder/src/translator/utils/common.py)

In [None]:
class Inference:
    """Greedy decoding and a small qualitative validation loop for a trained model."""

    def __init__(self,config:"ModelTrainingConfig") :
        """
        Args:
            config: training settings; only `max_seq_len` is read here.
        """
        self.config=config

    @staticmethod
    def _causal_mask(size):
        """Lower-triangular (1, size, size) mask: position i may attend to j <= i.

        Built locally because the attention code masks positions where the
        mask equals 0, and the `casual_mask` import fails in this notebook.
        """
        return torch.tril(torch.ones((1,size,size)))

    def greedy_decode(self,model,source,source_mask,tokenizer_src,tokenizer_tgt,device):
        """Generate target ids one at a time, always taking the most likely token.

        Decoding stops after `[EOS]` is produced or when the sequence reaches
        `config.max_seq_len` tokens.

        Returns:
            1-D tensor of generated ids, starting with the `[SOS]` id.
        """
        sos_idx=tokenizer_tgt.token_to_id('[SOS]')
        eos_idx=tokenizer_tgt.token_to_id('[EOS]')

        # the encoder output is computed once and reused at every step
        encoder_output=model.encode(source,source_mask)
        # start with [SOS]; same dtype as the source ids
        decoder_input=torch.full((1,1),sos_idx,dtype=source.dtype,device=device)
        while decoder_input.size(1)<self.config.max_seq_len:
            # hide future positions from the decoder
            decoder_mask=self._causal_mask(decoder_input.size(1)).type_as(source_mask).to(device)
            out=model.decode(encoder_output,source_mask,decoder_input,decoder_mask)
            # only the last position predicts the next token
            prob=model.project(out[:,-1])
            _,next_word=torch.max(prob,dim=1)
            next_id=next_word.item()
            decoder_input=torch.cat([decoder_input,torch.full((1,1),next_id,dtype=source.dtype,device=device)],dim=1)
            if next_id==eos_idx:
                break
        # drop the batch axis
        return decoder_input.squeeze(0)

    def run_validation(self,model,validation_ds,tokenizer_src,tokenizer_tgt,device,print_msg, global_state, writer, num_examples=2):
        """Decode the first `num_examples` validation batches and print source/target/prediction.

        Args:
            validation_ds: iterable of batches with keys `encoder_input`,
                `encoder_mask`, `src_text` and `tgt_text`; batch size must be 1.
            print_msg: callable that receives each output line.
            global_state, writer: accepted for signature compatibility; unused.
        """
        model.eval()
        console_width=80  # width of the separator line
        with torch.no_grad():
            for count,batch in enumerate(validation_ds,start=1):
                encoder_input=batch['encoder_input'].to(device)
                encoder_mask=batch['encoder_mask'].to(device)
                assert encoder_input.size(0)==1, 'Batch size must be 1 for validation.'
                model_out=self.greedy_decode(model,encoder_input,encoder_mask,tokenizer_src,tokenizer_tgt,device)

                source_text=batch['src_text'][0]
                target_text=batch['tgt_text'][0]
                model_out_text=tokenizer_tgt.decode(model_out.detach().cpu().numpy())

                print_msg('-'*console_width)
                print_msg(f'SOURCE: {source_text}')
                print_msg(f'TARGET: {target_text}')
                print_msg(f'PREDICTED: {model_out_text}')

                if count>=num_examples:
                    break


        

In [None]:
# Greedy decoding for inference

def greedy_decode(model,source,source_mask,tokenizer_src,tokenizer_tgt,max_len,device):
    """Generate target ids one at a time, always taking the most likely token.

    Args:
        model: object exposing `encode`, `decode` and `project`.
        source: source token ids for a single sentence (batch size 1).
        source_mask: mask for the source ids; also fixes the decoder-mask dtype.
        tokenizer_src: unused; kept for signature compatibility.
        tokenizer_tgt: tokenizer providing the `[SOS]` and `[EOS]` ids.
        max_len: maximum number of tokens in the generated sequence.
        device: device on which new tensors are created.

    Returns:
        1-D tensor of generated ids, starting with the `[SOS]` id.
    """
    # ids that open and close a target sequence
    sos_idx=tokenizer_tgt.token_to_id('[SOS]')
    eos_idx=tokenizer_tgt.token_to_id('[EOS]')

    # the encoder output is computed once and reused at every step
    encoder_output=model.encode(source,source_mask)
    # start with [SOS]; same dtype as the source ids
    decoder_input=torch.full((1,1),sos_idx,dtype=source.dtype,device=device)
    # `<` rather than `== max_len` so a max_len below 1 cannot loop forever
    while decoder_input.size(1)<max_len:
        size=decoder_input.size(1)
        # Causal mask built inline (the `casual_mask` import fails in this
        # notebook): lower-triangular ones, so position i sees only j <= i;
        # the attention code masks entries equal to 0.
        decoder_mask=torch.tril(torch.ones((1,size,size))).type_as(source_mask).to(device)
        out=model.decode(encoder_output,source_mask,decoder_input,decoder_mask)
        # only the last position predicts the next token
        prob=model.project(out[:,-1])

        # greedy choice: index of the highest score
        _,next_word=torch.max(prob,dim=1)
        next_id=next_word.item()
        decoder_input=torch.cat([decoder_input,torch.full((1,1),next_id,dtype=source.dtype,device=device)],dim=1)
        if next_id==eos_idx:
            break

    # drop the batch axis
    return decoder_input.squeeze(0)
def run_validation(model, validation_ds, tokenizer_src, tokenizer_tgt, max_len, device, print_msg, global_state, writer, num_examples=2):
    """Greedy-decode a few validation batches and print source, target and prediction.

    Args:
        model: model to evaluate; switched to eval mode here.
        validation_ds: iterable of batches with keys `encoder_input`,
            `encoder_mask`, `src_text` and `tgt_text`; batch size must be 1.
        tokenizer_src, tokenizer_tgt: forwarded to `greedy_decode`; the target
            tokenizer also turns the predicted ids back into text.
        max_len: maximum length of a generated sequence.
        device: device the batch tensors are moved to.
        print_msg: callable that receives each output line.
        global_state, writer: accepted but not used by this function.
        num_examples: stop once this many batches have been shown.
    """
    model.eval()
    separator='-'*80  # fixed-width rule printed before every example

    # no gradients are needed while decoding
    with torch.no_grad():
        for seen,batch in enumerate(validation_ds,start=1):
            encoder_input=batch['encoder_input'].to(device)
            encoder_mask=batch['encoder_mask'].to(device)

            # greedy decoding handles one sentence at a time
            assert encoder_input.size(0)==1, 'Batch size must be 1 for validation.'

            predicted_ids=greedy_decode(model, encoder_input, encoder_mask, tokenizer_src, tokenizer_tgt, max_len, device)

            source_text=batch['src_text'][0]
            target_text=batch['tgt_text'][0]  # reference translation
            predicted_text=tokenizer_tgt.decode(predicted_ids.detach().cpu().numpy())

            report=(
                separator,
                f'SOURCE: {source_text}',
                f'TARGET: {target_text}',
                f'PREDICTED: {predicted_text}',
            )
            for line in report:
                print_msg(line)

            # enough examples shown
            if seen>=num_examples:
                break