In [1]:
import pandas as pd
import numpy as np
import os
import re,gc,random
import torch
from torch.utils.data import Dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer, DataCollatorForSeq2Seq



class CONFIG:
    """Static settings (paths, model id, hyperparameters) for the fine-tuning run."""

    MODEL_NAME = "facebook/nllb-200-distilled-600M"   # checkpoint id passed to from_pretrained
    TRAIN_PATH = "/kaggle/input/deep-past-initiative-machine-translation/train.csv"
    SEED = 42                                          # used for the shuffle before the train/val split
    MODEL = "NLLB-DISTIL-600M"                         # display name only (printed by create_display)
    EPOCHS = 15
    MAX_LEN = 128                                      # tokenizer truncation length for source and target
    OUTPUT = '/kaggle/working/'
    BATCH_SIZE = 8
    GRAD_ACCUM = 2                                     # gradient accumulation steps
    # NOTE(review): the source column is a transliteration; confirm that
    # "arb_Arab" is the intended stand-in language code for it.
    SRC_LANG = "arb_Arab"
    TRGET_LANG = "eng_Latn"                            # (sic) name kept because callers read this attribute
    DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # Querying the device name without a CUDA device raises, which would abort
    # at class-definition time; fall back to a placeholder on CPU-only hosts.
    GPU = torch.cuda.get_device_name(0) if torch.cuda.is_available() else "N/A"

    def create_display(self) -> None:
        """Print a banner summarising the device and the main hyperparameters."""
        rule = "=" * 70

        print(f"{rule}")
        print("TRAINING PIPELINE RUNNING....")
        print(f"{rule}")
        print(f"DEVICE: {self.DEVICE}")
        print(f"GPU:{self.GPU}")
        print(f"MODEL:{self.MODEL}")
        print(f"BATCH SIZE:{self.BATCH_SIZE}")
        print(f"EPOCHS:{self.EPOCHS}")
        print(f"{rule}\n")


class ALIGNER:
    """Split multi-sentence training rows into sentence-level pairs.

    A row is split only when its translation has more than one sentence and
    the number of sentences equals the number of non-empty source lines;
    otherwise the row is kept whole.
    """

    # Split after sentence-final punctuation followed by whitespace.
    _SENTENCE_BOUNDARY = re.compile(r'(?<=[.!?])\s+')
    _COLUMNS = ['transliteration', 'translation']

    def __init__(self, csv_path: str):
        self.csv_path = csv_path

    def simple_sentence_aligner(self, csv_path=None):
        """Read the CSV and return a DataFrame of aligned (source, target) pairs.

        Args:
            csv_path: CSV file with 'transliteration' and 'translation'
                columns. Defaults to the path given to the constructor.

        Returns:
            DataFrame with exactly the columns 'transliteration' and
            'translation' (present even when no rows survive).
        """
        path = self.csv_path if csv_path is None else csv_path
        # Rows missing either side cannot form a training pair; without this,
        # str(NaN) would inject the literal text "nan" into the data.
        df = pd.read_csv(path).dropna(subset=self._COLUMNS)
        aligned_data = []

        for raw_src, raw_tgt in zip(df['transliteration'], df['translation']):
            src = str(raw_src)
            tgt = str(raw_tgt)

            pieces = (p.strip() for p in self._SENTENCE_BOUNDARY.split(tgt))
            tgt_sents = [p for p in pieces if p]
            src_lines = [ln.strip() for ln in src.split('\n') if ln.strip()]

            if len(tgt_sents) > 1 and len(tgt_sents) == len(src_lines):
                # One source line per target sentence: pair them positionally,
                # dropping pairs where either side is 3 characters or shorter.
                aligned_data.extend(
                    {'transliteration': s, 'translation': t}
                    for s, t in zip(src_lines, tgt_sents)
                    if len(s) > 3 and len(t) > 3
                )
            else:
                aligned_data.append({'transliteration': src, 'translation': tgt})

        # Explicit columns keep the schema stable for callers on empty output.
        return pd.DataFrame(aligned_data, columns=self._COLUMNS)


class DS(Dataset):
    """Map-style dataset that tokenizes one (src, tgt) pair per item.

    Subclasses torch's Dataset so it is recognised as a map-style dataset by
    DataLoader-based tooling. Padding is not applied here.
    """

    def __init__(self, df, tok, max_len):
        """
        Args:
            df: DataFrame with 'src' and 'tgt' columns.
            tok: Tokenizer callable accepting text / text_target, max_length
                and truncation keyword arguments.
            max_len: Truncation length applied to both source and target.
        """
        self.s = df["src"].tolist()
        self.t = df["tgt"].tolist()
        self.tok = tok
        self.max_len = max_len

    def __len__(self):
        return len(self.s)

    def __getitem__(self, i):
        """Return input_ids / attention_mask for the source and labels for the target."""
        source = self.tok(self.s[i], max_length=self.max_len, truncation=True)
        target = self.tok(text_target=self.t[i], max_length=self.max_len, truncation=True)
        return {
            "input_ids": source["input_ids"],
            "attention_mask": source["attention_mask"],
            "labels": target["input_ids"],
        }

class NLLBTRAINER():
    """Thin wrapper that configures and runs a Seq2SeqTrainer, then saves the result."""

    def __init__(self,model,tokenizer,val_ds,train_ds,batch_size,grad_accum,epoches,output_dir):
        """
        Args:
            model: Seq2seq model to fine-tune.
            tokenizer: Tokenizer used by the collator and saved with the model.
            val_ds: Dataset evaluated at the end of every epoch.
            train_ds: Training dataset.
            batch_size: Per-device batch size for both training and evaluation.
            grad_accum: Gradient accumulation steps.
            epoches: Number of training epochs.
            output_dir: Directory for trainer output and the saved model/tokenizer.
        """
        self.model = model
        self.tokenizer = tokenizer
        self.val_ds = val_ds
        self.train_ds = train_ds
        self.batch_size = batch_size
        self.grad_accum = grad_accum
        self.epoches = epoches
        self.output_dir = output_dir

    def train(self):
        """Fine-tune the model and write model + tokenizer to ``output_dir``."""
        import inspect  # local import: only needed for the keyword-compat check below

        # Pads each batch dynamically (inputs and labels).
        collator = DataCollatorForSeq2Seq(
            self.tokenizer,
            model=self.model,
            padding=True)

        # NOTE(review): the run log captured with this notebook shows the
        # validation loss bottoming out around epoch 4 and rising afterwards.
        # With save_strategy="no" the weights saved below are the last-epoch
        # ones; consider checkpointing the best epoch or early stopping.
        args = Seq2SeqTrainingArguments(
            output_dir=self.output_dir,
            eval_strategy="epoch",
            save_strategy="no",
            learning_rate=3e-4,
            per_device_train_batch_size=self.batch_size,
            per_device_eval_batch_size=self.batch_size,
            gradient_accumulation_steps=self.grad_accum,
            num_train_epochs=self.epoches,
            warmup_ratio=0.1,
            # fp16 mixed precision needs a GPU; hard-coding True fails on CPU-only hosts.
            fp16=torch.cuda.is_available(),
            logging_steps=50,
            report_to="none",
            remove_unused_columns=False,
        )

        trainer_kwargs = dict(
            model=self.model,
            args=args,
            train_dataset=self.train_ds,
            eval_dataset=self.val_ds,
            data_collator=collator,
        )
        # Newer transformers releases deprecate `tokenizer=` in favour of
        # `processing_class=`; use whichever the installed version accepts.
        accepted = inspect.signature(Seq2SeqTrainer.__init__).parameters
        tokenizer_kwarg = "processing_class" if "processing_class" in accepted else "tokenizer"
        trainer_kwargs[tokenizer_kwarg] = self.tokenizer

        trainer = Seq2SeqTrainer(**trainer_kwargs)

        trainer.train()
        self.model.save_pretrained(self.output_dir)
        self.tokenizer.save_pretrained(self.output_dir)
        
    

def clean(t):
    """Return *t* as a string with whitespace runs collapsed and ends trimmed.

    Missing values (as judged by ``pd.isna``) become the empty string.
    """
    if pd.isna(t):
        return ""
    collapsed = re.sub(r"\s+", " ", str(t))
    return collapsed.strip()

def main ():
    """Run the full pipeline: load and align data, split, tokenize, fine-tune, save."""
    config = CONFIG()
    config.create_display()

    aligner = ALIGNER(config.TRAIN_PATH)
    train_df = aligner.simple_sentence_aligner(config.TRAIN_PATH)
    train_df["src"] = train_df["transliteration"].apply(clean)
    train_df["tgt"] = train_df["translation"].apply(clean)
    # Drop very short sources (10 characters or fewer).
    train_df = train_df[train_df["src"].str.len() > 10]
    train_df = train_df.sample(frac=1, random_state=config.SEED).reset_index(drop=True)

    if len(train_df) < 2:
        raise ValueError(
            f"Need at least 2 usable rows to build train/validation splits, got {len(train_df)}"
        )
    # Hold out 150 rows for validation, but never more than half the data,
    # so a small input cannot leave the training split empty.
    val_size = min(150, len(train_df) // 2)
    val_df = train_df.iloc[:val_size]
    train_df = train_df.iloc[val_size:]

    tokenizer = AutoTokenizer.from_pretrained(config.MODEL_NAME, src_lang=config.SRC_LANG, tgt_lang=config.TRGET_LANG)
    model = AutoModelForSeq2SeqLM.from_pretrained(config.MODEL_NAME)
    # Record the target-language token as the forced first generated token so
    # the saved checkpoint generates in the target language by default.
    # (Previously this id was computed but never used.)
    model.generation_config.forced_bos_token_id = tokenizer.convert_tokens_to_ids(config.TRGET_LANG)
    model.to(config.DEVICE)
    train_ds = DS(train_df,tokenizer,config.MAX_LEN)
    val_ds = DS(val_df,tokenizer,config.MAX_LEN)

    trainer=NLLBTRAINER(model,tokenizer,val_ds,train_ds,config.BATCH_SIZE,config.GRAD_ACCUM,config.EPOCHS,config.OUTPUT)
    trainer.train()

    print(f"\n{'='*70}")
    print(" All training completed!")
    print(f"{'='*70}\n")

# Run the training pipeline only when executed as a script / notebook cell,
# not when this module is imported.
if __name__ == "__main__":
    main()


2025-12-24 09:57:22.816087: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1766570242.987836      55 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1766570243.043525      55 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1766570243.449737      55 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1766570243.449774      55 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1766570243.449777      55 computation_placer.cc:177] computation placer alr

TRAINING PIPELINE RUNNING....
DEVICE: cuda
GPU:Tesla P100-PCIE-16GB
MODEL:NLLB-DISTIL-600M
BATCH SIZE:8
EPOCHS:15



tokenizer_config.json:   0%|          | 0.00/564 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/4.85M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.3M [00:00<?, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/846 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.46G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.46G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/189 [00:00<?, ?B/s]

  trainer = Seq2SeqTrainer(


Epoch,Training Loss,Validation Loss
1,3.7807,2.156007
2,1.883,1.767273
3,1.4252,1.639188
4,1.1597,1.596382
5,0.9274,1.627087
6,0.7416,1.720291
7,0.6081,1.794332
8,0.4886,1.919731
9,0.3994,1.990794
10,0.3,2.052968





 All training completed!

