In [12]:
import os
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import numpy as np
import logging
from torch.utils.data import Dataset, DataLoader
from bpe_tokenizer import BPETokenizer
import tqdm
from torch.cuda.amp import GradScaler, autocast
import pandas as pd
import json
from typing import Dict, Any
import torchmetrics
import torchinfo
import mlflow

In [3]:
# Notebook-wide logger that writes DEBUG-and-above messages to the console.
logger = logging.getLogger("TA-EN NMT")
logger.setLevel(logging.DEBUG)
# logging.getLogger returns the same object on every call, so re-running this
# cell used to stack one more StreamHandler each time and every message was
# printed once per run. Drop handlers left over from earlier runs first.
while logger.handlers:
    logger.removeHandler(logger.handlers[0])
console_handler = logging.StreamHandler()
console_handler.setLevel(logging.DEBUG)
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
console_handler.setFormatter(formatter)
logger.addHandler(console_handler)

In [4]:
# Load the target (English) and source (Tamil) BPE vocabularies.
# The encoding is passed explicitly: JSON is UTF-8 by specification, while
# open() otherwise falls back to the platform default encoding, which can fail
# or garble non-ASCII (e.g. Tamil) entries.
with open("trg_bpe1.vocab.json", "r", encoding="utf-8") as f:
    english_bpe = json.load(f)
with open("src_bpe1.vocab.json", "r", encoding="utf-8") as f:
    tamil_bpe = json.load(f)


In [16]:
# Number of entries in the loaded Tamil (source) and English (target) vocab files.
len(tamil_bpe), len(english_bpe)

(21401, 13465)

In [None]:

class TranslationDataset(Dataset):
    """Tamil/English sentence pairs read from CSV files and encoded with BPE.

    Each item is a 4-tuple ``(tamil_tokens, english_tokens, src_pad_mask,
    trg_pad_mask)``. Within one pair the shorter sequence is padded with
    ``PAD_ID`` up to the length of the longer one, so the four tensors of a
    sample share the same length. Different samples can still have different
    lengths, so a ``DataLoader`` with ``batch_size > 1`` needs a ``collate_fn``
    that pads across samples.
    """

    # Token id used for padding (the same value the padding always used).
    PAD_ID = 0

    def __init__(self, datasets, src_bpe_path, trg_bpe_path):
        '''
        datasets: list[str] -> csv files to load; each needs a "ta" and an "en" column
        src_bpe_path: str -> source BPE model path without the ".vocab.json" suffix
                             (pass "path/to/src_bpe1" for path/to/src_bpe1.vocab.json)
        trg_bpe_path: str -> target BPE model path without the ".vocab.json" suffix
                             (pass "path/to/trg_bpe1" for path/to/trg_bpe1.vocab.json)
        '''
        # Read every file once and concatenate in a single call instead of
        # growing the frame one file at a time.
        frames = [pd.read_csv(path) for path in datasets]
        corpus = pd.concat(frames, ignore_index=True)
        self.BPE_tokenizer_ta = BPETokenizer.load(src_bpe_path, "ta")
        self.BPE_tokenizer_en = BPETokenizer.load(trg_bpe_path, "en")
        self.tamil = corpus["ta"].tolist()
        self.english = corpus["en"].tolist()
        assert len(self.tamil) == len(self.english), "Tamil and English sentences are not of the same length"

    def __len__(self):
        """Number of sentence pairs."""
        return len(self.tamil)

    @staticmethod
    def _pad_to(token_ids, width, pad_id):
        """Right-pad ``token_ids`` to ``width``; return (LongTensor, bool pad mask)."""
        n_real = len(token_ids)
        padded = torch.tensor(token_ids + [pad_id] * (width - n_real), dtype=torch.long)
        # The mask is positional (True on the appended slots) rather than
        # `padded == pad_id`, so it only ever marks padding this method added.
        pad_mask = torch.arange(width) >= n_real
        return padded, pad_mask

    def __getitem__(self, idx):
        """Encode pair ``idx`` and pad the shorter side to the longer one.

        Returns:
            tamil_tokens:   LongTensor of shape (L,)
            english_tokens: LongTensor of shape (L,)
            src_pad_mask:   BoolTensor of shape (L,), True where Tamil is padding
            trg_pad_mask:   BoolTensor of shape (L,), True where English is padding
        with ``L = max(len(tamil ids), len(english ids))``.

        Tensors and masks are returned in every case. Previously the result was
        a mix of lists, tensors and ``None`` depending on which side was
        longer, and the mask comparison paired a (pad_len,) tensor with an (L,)
        tensor, which fails unless the two sizes happen to broadcast.
        """
        # list(...) copies, so the tokenizer's own return value is never mutated.
        tamil_ids = list(self.BPE_tokenizer_ta.encode(self.tamil[idx]))
        english_ids = list(self.BPE_tokenizer_en.encode(self.english[idx]))
        width = max(len(tamil_ids), len(english_ids))

        tamil_tokens, src_pad_mask = self._pad_to(tamil_ids, width, self.PAD_ID)
        english_tokens, trg_pad_mask = self._pad_to(english_ids, width, self.PAD_ID)
        return tamil_tokens, english_tokens, src_pad_mask, trg_pad_mask

# Combine both parallel-corpus CSV files into one dataset and wrap it in a
# shuffled loader that yields 32 samples per batch.
# NOTE(review): the training cell further down rebuilds `dataset` and `loader`.
dataset = TranslationDataset(
    datasets=[
        "en-ta//pmindia.v1.ta-en 39k.csv",
        "en-ta//general_en_ta 87k.csv",
    ],
    src_bpe_path="src_bpe1",
    trg_bpe_path="trg_bpe1",
)
loader = DataLoader(dataset=dataset, shuffle=True, batch_size=32)

In [16]:
class Transformer(nn.Module):
    """Encoder-decoder Transformer for sequence-to-sequence translation.

    All inputs are batch-first: ``src`` and ``tgt`` are integer token-id
    tensors of shape (batch, seq_len) and the output has shape
    (batch, tgt_len, tgt_vocab_size).

    Args:
        src_vocab_size: number of rows in the source embedding table.
        tgt_vocab_size: number of rows in the target embedding table and size
            of the output projection.
        d_model: embedding / hidden width.
        n_heads: attention heads per layer.
        n_layers: number of encoder layers and of decoder layers.
        d_ff: width of the feed-forward sub-layer.
        dropout: dropout probability inside the layers.
        max_len: longest sequence the learned positional table supports.
    """

    def __init__(self, src_vocab_size, tgt_vocab_size, d_model=512, n_heads=8, n_layers=6, d_ff=2048, dropout=0.1, max_len=10000):
        super().__init__()
        # batch_first=True so the layers agree with the (batch, seq) layout
        # that callers slice with english[:, :-1] / english[:, 1:].
        self.encoder = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(d_model, n_heads, d_ff, dropout, batch_first=True),
            num_layers=n_layers
        )
        self.decoder = nn.TransformerDecoder(
            nn.TransformerDecoderLayer(d_model, n_heads, d_ff, dropout, batch_first=True),
            num_layers=n_layers
        )
        self.src_emb = nn.Embedding(src_vocab_size, d_model)
        self.tgt_emb = nn.Embedding(tgt_vocab_size, d_model)
        # Learned positional table shared by encoder and decoder, initialised to zero.
        self.pos_enc = nn.Parameter(torch.zeros(max_len, d_model))
        self.fc_out = nn.Linear(d_model, tgt_vocab_size)
        self.d_model = d_model
        # The extra `nn.Transformer()` that used to be created here was never
        # used in forward(); it only allocated a second full-size default model.

    def forward(self, src, tgt, src_mask=None, tgt_mask=None):
        """Return logits of shape (batch, tgt_len, tgt_vocab_size).

        Args:
            src: (batch, src_len) source token ids.
            tgt: (batch, tgt_len) decoder-input token ids.
            src_mask: optional (batch, src_len) bool tensor, True at source
                padding positions; applied in encoder self-attention and in
                decoder cross-attention.
            tgt_mask: optional (tgt_len, tgt_len) attention mask for the
                decoder. When omitted, a causal mask is used so that position
                i cannot attend to positions after i.

        Raises:
            ValueError: if a sequence is longer than the positional table.
        """
        src_len, tgt_len = src.size(1), tgt.size(1)
        if max(src_len, tgt_len) > self.pos_enc.size(0):
            raise ValueError(
                f"Sequence length {max(src_len, tgt_len)} exceeds the positional table size {self.pos_enc.size(0)}"
            )
        if tgt_mask is None:
            # True above the diagonal marks the future positions to hide.
            tgt_mask = torch.triu(
                torch.ones(tgt_len, tgt_len, dtype=torch.bool, device=tgt.device),
                diagonal=1,
            )
        # pos_enc[:len] is (len, d_model) and broadcasts over the batch dimension.
        src = self.src_emb(src) + self.pos_enc[:src_len]
        tgt = self.tgt_emb(tgt) + self.pos_enc[:tgt_len]
        memory = self.encoder(src, src_key_padding_mask=src_mask)
        output = self.decoder(tgt, memory, tgt_mask=tgt_mask, memory_key_padding_mask=src_mask)
        return self.fc_out(output)

# Example usage (later in training)
# Vocabulary sizes are taken from the loaded vocab files rather than typed in
# by hand, so the embedding tables stay in sync if a vocabulary is regenerated.
model = Transformer(src_vocab_size=len(tamil_bpe), tgt_vocab_size=len(english_bpe))
if torch.cuda.is_available():
    model = model.cuda()  # move to the GPU when one is present



In [None]:
class NMTTrainer:
    """Runs teacher-forced training and validation epochs for a translation model.

    The model is called as ``model(src, tgt_in, src_mask=..., tgt_mask=...)``
    with batch-first token-id tensors and must return logits of shape
    (batch, tgt_len, vocab).

    Each batch from the loaders must be a sequence whose first two elements are
    the source and target token ids, each shaped (batch, seq_len). Any further
    elements are ignored; the padding mask is rebuilt from ``pad_idx``.

    Args:
        model: the network to train.
        train_loader: iterable of training batches.
        val_loader: iterable of validation batches.
        device: torch.device or device string the batches are moved to.
        pad_idx: token id that marks padding; ignored by the loss and masked
            out of attention over the source.
    """

    def __init__(self, model, train_loader, val_loader, device, pad_idx=0):
        self.model = model
        self.train_loader = train_loader
        self.val_loader = val_loader
        self.device = torch.device(device)
        self.pad_idx = pad_idx
        self.optimizer = None
        self.criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)
        # Mixed precision is only switched on for CUDA; on CPU the scaler and
        # autocast context are created disabled and act as pass-throughs.
        self.use_amp = self.device.type == "cuda"
        self.scaler = GradScaler(enabled=self.use_amp)
        self.metrics = {
            'loss': [],      # average training loss, one entry per epoch
            'val_loss': [],  # average validation loss, one entry per evaluate() call
            'bleu': []
        }

    def set_optimizer(self, optimizer_name, **kwargs):
        """Create the optimizer ('adam' or 'sgd'); kwargs go to its constructor.

        Raises:
            ValueError: for any other optimizer name.
        """
        if optimizer_name == 'adam':
            self.optimizer = optim.Adam(self.model.parameters(), **kwargs)
        elif optimizer_name == 'sgd':
            self.optimizer = optim.SGD(self.model.parameters(), **kwargs)
        else:
            raise ValueError(f"Unsupported optimizer: {optimizer_name}")

    def _batch_loss(self, batch):
        """Run the model on one batch and return the token-level cross-entropy."""
        # Index instead of unpacking so batches with extra elements (e.g. masks)
        # are accepted; as_tensor avoids re-copying inputs that are already tensors.
        tamil = torch.as_tensor(batch[0], dtype=torch.long).to(self.device)
        english = torch.as_tensor(batch[1], dtype=torch.long).to(self.device)
        # Teacher forcing: the decoder reads tokens [0, T-1) and predicts [1, T).
        decoder_in, expected = english[:, :-1], english[:, 1:]
        steps = decoder_in.size(1)
        # Without a causal mask the decoder could read the token it has to predict.
        causal_mask = torch.triu(
            torch.ones(steps, steps, dtype=torch.bool, device=self.device), diagonal=1
        )
        src_pad_mask = tamil.eq(self.pad_idx)
        output = self.model(tamil, decoder_in, src_mask=src_pad_mask, tgt_mask=causal_mask)
        return self.criterion(output.reshape(-1, output.size(-1)), expected.reshape(-1))

    def train_epoch(self):
        """Train for one pass over train_loader; record and return the average loss.

        Raises:
            RuntimeError: if set_optimizer() has not been called.
        """
        if self.optimizer is None:
            raise RuntimeError("No optimizer configured; call set_optimizer() before training.")
        self.model.train()
        total_loss = 0.0
        for batch in self.train_loader:
            self.optimizer.zero_grad()
            with autocast(enabled=self.use_amp):
                loss = self._batch_loss(batch)
            self.scaler.scale(loss).backward()
            self.scaler.step(self.optimizer)
            self.scaler.update()
            total_loss += loss.item()
        # max(..., 1) keeps an empty loader from raising ZeroDivisionError.
        avg_loss = total_loss / max(len(self.train_loader), 1)
        self.metrics['loss'].append(avg_loss)
        logger.info(f"Training Loss: {avg_loss}")
        return avg_loss

    def evaluate(self):
        """Compute the average loss over val_loader without updating weights.

        The result is stored under metrics['val_loss'] so that it is no longer
        interleaved with the training losses in metrics['loss'].
        """
        self.model.eval()
        total_loss = 0.0
        with torch.no_grad():
            for batch in self.val_loader:
                total_loss += self._batch_loss(batch).item()
        avg_loss = total_loss / max(len(self.val_loader), 1)
        self.metrics['val_loss'].append(avg_loss)
        logger.info(f"Validation Loss: {avg_loss}")
        return avg_loss

    def calculate_bleu(self, references, hypotheses):
        """Placeholder: records and logs a BLEU score of 0.0; the arguments are not used yet."""
        # Placeholder for BLEU score calculation
        bleu_score = 0.0
        self.metrics['bleu'].append(bleu_score)
        logger.info(f"BLEU Score: {bleu_score}")

    def train(self, num_epochs):
        """Run num_epochs rounds of train_epoch() followed by evaluate()."""
        for epoch in tqdm.tqdm(range(num_epochs)):
            logger.info(f"Epoch {epoch + 1}/{num_epochs}")
            self.train_epoch()
            self.evaluate()
            # Assuming you have a method to get references and hypotheses
            # references, hypotheses = self.get_references_and_hypotheses()
            # self.calculate_bleu(references, hypotheses)

    def update_parameters(self, new_params):
        """Overwrite existing model attributes by name.

        new_params maps attribute names to new values. Names the model does not
        already have are skipped with a warning rather than created.
        """
        for param, value in new_params.items():
            if hasattr(self.model, param):
                setattr(self.model, param, value)
            else:
                logger.warning(f"Parameter {param} not found in model.")

# Example usage
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
dataset = TranslationDataset(["en-ta//pmindia.v1.ta-en 39k.csv", "en-ta//general_en_ta 87k.csv"], "src_bpe1", "trg_bpe1")


def pad_collate(samples, pad_id=0):
    """Stack variable-length samples into (src, tgt) LongTensors of shape (batch, max_len).

    Only the first two elements of each sample (source ids, target ids) are
    used. Shorter sequences are right-padded with pad_id. The default collation
    cannot stack sequences of different lengths, which is why this is needed.
    """
    src = [torch.as_tensor(sample[0], dtype=torch.long) for sample in samples]
    tgt = [torch.as_tensor(sample[1], dtype=torch.long) for sample in samples]
    src = nn.utils.rnn.pad_sequence(src, batch_first=True, padding_value=pad_id)
    tgt = nn.utils.rnn.pad_sequence(tgt, batch_first=True, padding_value=pad_id)
    return src, tgt


loader = DataLoader(dataset, batch_size=32, shuffle=True, collate_fn=pad_collate)
# Vocabulary sizes come from the loaded vocab files instead of hard-coded numbers.
model = Transformer(src_vocab_size=len(tamil_bpe), tgt_vocab_size=len(english_bpe)).to(device)
# NOTE(review): the same loader is used for training and validation, so the
# reported validation loss is measured on the training data.
trainer = NMTTrainer(model, loader, loader, device)  # Assuming same loader for train/val for simplicity
trainer.set_optimizer('adam', lr=0.0001)
trainer.train(num_epochs=10)