In [None]:
!pip install sacremoses

In [None]:
import nltk
# Fetch the 'punkt_tab' tokenizer resource. Presumably this is what
# nltk.word_tokenize (used later to tokenize text for BLEU) needs at runtime --
# confirm against the installed NLTK version.
nltk.download('punkt_tab')

In [None]:
from datasets import Dataset

# Loading the data
def load_parallel_data(romanian_path, romani_path):
    """Read two line-aligned text files into a Hugging Face ``Dataset``.

    Line *i* of ``romanian_path`` is paired with line *i* of ``romani_path``.
    Surrounding whitespace is stripped from every line.

    Args:
        romanian_path: Path to the UTF-8 file holding the Romanian sentences.
        romani_path: Path to the UTF-8 file holding the Romani sentences.

    Returns:
        A ``Dataset`` with one column, ``"translation"``, whose entries are
        dicts of the form ``{"ro": ..., "roma": ...}``.

    Raises:
        AssertionError: If the two files do not have the same number of lines.
    """
    def _read_lines(path):
        # Read the whole file eagerly; the handle is closed on exit.
        with open(path, 'r', encoding='utf-8') as handle:
            return handle.readlines()

    romanian_lines = _read_lines(romanian_path)
    romani_lines = _read_lines(romani_path)

    assert len(romanian_lines) == len(romani_lines), "Mismatched number of lines!"

    pairs = []
    for ro_line, roma_line in zip(romanian_lines, romani_lines):
        pairs.append({"ro": ro_line.strip(), "roma": roma_line.strip()})

    return Dataset.from_dict({"translation": pairs})


# One seed for the shuffle and for both splits, so the train / validation /
# test membership is identical on every run. Without a seed,
# Dataset.train_test_split reshuffles randomly each time it is called.
SPLIT_SEED = 42

huggingface_dataset = load_parallel_data('/kaggle/input/sptop-dataset/romanian.txt', '/kaggle/input/sptop-dataset/romani.txt')
huggingface_dataset_shuffled = huggingface_dataset.shuffle(seed=SPLIT_SEED)

# 80% train, 20% held out.
train_test_split = huggingface_dataset_shuffled.train_test_split(test_size=0.2, seed=SPLIT_SEED)
train_dataset = train_test_split["train"]

# Split the held-out 20% in half: 10% validation, 10% test.
test_eval_split = train_test_split["test"].train_test_split(test_size=0.5, seed=SPLIT_SEED)

eval_dataset = test_eval_split['train']
test_dataset = test_eval_split['test']

In [None]:
import pandas as pd

# Turn each split's list of {"ro": ..., "roma": ...} records into a DataFrame
# with one column per language.
train_df, val_df, test_df = (
    pd.DataFrame(split['translation'])
    for split in (train_dataset, eval_dataset, test_dataset)
)

In [None]:
import re
import sys
import unicodedata
from sacremoses import MosesPunctNormalizer

# Punctuation normaliser shared by preproc().
mpn = MosesPunctNormalizer(lang="en")
# Replace each (pattern, replacement) pair with (compiled pattern, replacement),
# presumably so the patterns are not recompiled on every normalize() call.
mpn.substitutions = [
    (re.compile(pattern), replacement)
    for pattern, replacement in mpn.substitutions
]

def get_non_printing_char_replacer(replace_by: str = " "):
    """Build a function that replaces every Unicode "Other" character.

    The translation table covers all code points whose general category is one
    of C, Cc, Cf, Cs, Co or Cn (the equivalent of ``\\p{C}`` in Perl; see
    https://www.unicode.org/reports/tr44/#General_Category_Values).

    Args:
        replace_by: String substituted for each matching character.

    Returns:
        A callable ``line -> str`` that applies the substitution.
    """
    other_categories = ("C", "Cc", "Cf", "Cs", "Co", "Cn")
    # Scans the full code-point range once; the table is then reused by the
    # returned closure.
    translation_table = dict.fromkeys(
        (
            code_point
            for code_point in range(sys.maxunicode + 1)
            if unicodedata.category(chr(code_point)) in other_categories
        ),
        replace_by,
    )

    def replace_non_printing_char(line) -> str:
        return line.translate(translation_table)

    return replace_non_printing_char

# Build the replacer once at cell execution: constructing its translation table
# scans every Unicode code point, so it should not be rebuilt per call.
replace_nonprint = get_non_printing_char_replacer(" ")

def preproc(text):
    """Normalise a sentence before tokenization.

    Applies, in order: Moses punctuation normalisation (``mpn``), replacement
    of non-printing characters with a space (``replace_nonprint``), and Unicode
    NFKC normalisation.

    Args:
        text: The raw sentence.

    Returns:
        The cleaned sentence.
    """
    punct_normalised = mpn.normalize(text)
    printable_only = replace_nonprint(punct_normalised)
    # NFKC folds compatibility characters, e.g. 𝓕𝔯𝔞𝔫𝔠𝔢𝔰𝔠𝔞 becomes Francesca.
    return unicodedata.normalize("NFKC", printable_only)

In [None]:
from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction
import nltk
from tqdm import tqdm
import numpy as np

def translate_and_evaluate(input_sentences, correct_translations, 
                          src_lang, tgt_lang, a=32, b=3, 
                          max_input_length=1024, num_beams=4, batch_size=16):
    """Translate sentences in batches and score them with corpus-level BLEU.

    Uses the notebook-level ``model`` and ``tokenizer``. The model is switched
    to eval mode for generation and put back into training mode afterwards if
    that is how it was found, so calling this between training epochs does not
    silently disable dropout for the rest of training.

    Args:
        input_sentences: Source sentences (list, tuple, pandas Series, ...).
        correct_translations: Reference translations, aligned with
            ``input_sentences``.
        src_lang: Language code set as the tokenizer's source language.
        tgt_lang: Language code set as the tokenizer's target language and
            forced as the first generated token.
        a: Constant term of the generation length budget.
        b: Per-input-token term of the generation length budget; each batch
            may generate up to ``a + b * padded_input_length`` new tokens.
        max_input_length: Inputs are truncated to this many tokens.
        num_beams: Beam width for generation.
        batch_size: Number of sentences translated per forward pass.

    Returns:
        A tuple ``(translations, bleu_score)``: the decoded translations in
        input order and the smoothed corpus BLEU against the references.
        For empty input, returns ``([], 0.0)``.

    Raises:
        AssertionError: If the two inputs differ in length.
    """
    # Materialise as plain lists so any iterable of strings is accepted and
    # slicing below is always positional.
    input_sentences = list(input_sentences)
    correct_translations = list(correct_translations)

    # Ensure we have same number of inputs and references
    assert len(input_sentences) == len(correct_translations), \
        "Input sentences and correct translations must be same length"

    # Nothing to translate or score.
    if not input_sentences:
        return [], 0.0

    translations = []
    num_batches = int(np.ceil(len(input_sentences) / batch_size))

    # Language settings are constant for the whole call, so set them once.
    tokenizer.src_lang = src_lang
    tokenizer.tgt_lang = tgt_lang
    forced_bos_token_id = tokenizer.convert_tokens_to_ids(tgt_lang)

    # Remember the current mode so it can be restored after generation.
    was_training = model.training
    model.eval()
    try:
        for batch_idx in tqdm(range(num_batches), desc="Translating batches"):
            start = batch_idx * batch_size
            batch_inputs = input_sentences[start:start + batch_size]

            inputs = tokenizer(batch_inputs, return_tensors="pt", padding=True,
                               truncation=True,
                               max_length=max_input_length).to(model.device)

            generated_tokens = model.generate(
                **inputs,
                forced_bos_token_id=forced_bos_token_id,
                max_new_tokens=int(a + b * inputs.input_ids.shape[1]),
                num_beams=num_beams,
            )

            translations.extend(
                tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
            )
    finally:
        # Restore training mode even if generation raised.
        if was_training:
            model.train()

    # corpus_bleu expects a list of reference lists per hypothesis; there is
    # exactly one reference per sentence here.
    references = [[nltk.word_tokenize(ref)] for ref in correct_translations]
    hypotheses = [nltk.word_tokenize(hyp) for hyp in translations]

    # Use smoothing function to avoid zero scores
    smoothing = SmoothingFunction().method1
    bleu_score = corpus_bleu(references, hypotheses, smoothing_function=smoothing)

    return translations, bleu_score

In [None]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
model_name = "facebook/nllb-200-distilled-600M"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# The notebook tags text with the language codes below. Any code the pretrained
# vocabulary does not contain would otherwise resolve to the unknown-token id
# (or be rejected outright, depending on the transformers version), leaving the
# two translation directions without distinct language tags. Register such
# codes as special tokens so each one gets its own id and embedding.
# NOTE(review): NLLB's stock code for Romanian appears to be "ron_Latn";
# confirm whether "ro_Latn" is intended as a fresh tag.
language_codes = ["ro_Latn", "roma_Latn"]
missing_codes = [
    code for code in language_codes
    if tokenizer.convert_tokens_to_ids(code) == tokenizer.unk_token_id
]
if missing_codes:
    tokenizer.add_tokens(missing_codes, special_tokens=True)
    # Only ever grow the embedding matrix; never shrink a model whose
    # vocabulary is already at least as large as the tokenizer's.
    if len(tokenizer) > model.get_input_embeddings().num_embeddings:
        model.resize_token_embeddings(len(tokenizer))

In [None]:
from transformers.optimization import Adafactor
from transformers import get_constant_schedule_with_warmup
# Move the model to the GPU before building the optimizer over its parameters.
model.cuda()

# Adafactor with a fixed learning rate: relative-step and parameter-scaling
# modes are both switched off, so `lr` is used as given.
optimizer = Adafactor(
    [param for param in model.parameters() if param.requires_grad],
    lr=1e-4,
    relative_step=False,
    scale_parameter=False,
    weight_decay=1e-3,
    clip_threshold=1.0,
)

# Warm up over the first 1000 optimizer steps, then hold the rate constant.
scheduler = get_constant_schedule_with_warmup(
    optimizer,
    num_warmup_steps=1000,
)

In [None]:
import gc
import torch

def cleanup():
    """Best-effort release of GPU memory.

    Runs Python's garbage collector first, then asks PyTorch to empty its
    CUDA cache.
    """
    gc.collect()
    torch.cuda.empty_cache()

In [None]:
# Training hyperparameters: sentences per step, token cap applied when
# tokenizing training batches, and number of passes over the training set.
batch_size, max_length, num_epochs = 16, 128, 30

# Per-epoch history, appended to by the training loop.
avg_losses, bleu_scores = [], []

# Where the best checkpoint (by validation BLEU) is written.
MODEL_SAVE_PATH = '/kaggle/working/nllb-ro-roma-v2'

# (DataFrame column, tokenizer language code) for each side of the corpus.
LANGS = [
    ('ro', 'ro_Latn'),
    ('roma', 'roma_Latn'),
]

In [None]:
from tqdm.notebook import trange
import numpy as np
import random

model.train()
optimizer.zero_grad(set_to_none=True)
# Start below any attainable BLEU so the first evaluated epoch always writes a
# checkpoint, even if its score is exactly 0; later cells reload from
# MODEL_SAVE_PATH and would fail if nothing had been saved.
best_bleu_score = float("-inf")

for epoch in range(num_epochs):
    # Validation at the end of the previous epoch runs the model in eval mode;
    # make sure dropout is active again before training resumes.
    model.train()

    # Shuffle dataset indices at start of each epoch
    data_len = len(train_df)
    shuffled_indices = np.random.permutation(data_len)
    num_batches = (data_len + batch_size - 1) // batch_size  # Include partial batch

    epoch_losses = []
    cleanup()

    # Epoch progress bar with batch-level updates
    tq = trange(num_batches, desc=f'Epoch {epoch+1}/{num_epochs}')
    for batch_idx in tq:
        # Randomly pick the translation direction for this batch
        (l1, long1), (l2, long2) = random.sample(LANGS, 2)

        # Rows for this batch, fetched in one positional lookup
        start = batch_idx * batch_size
        batch_rows = train_df.iloc[shuffled_indices[start:start + batch_size]]
        xx = [preproc(text) for text in batch_rows[l1]]
        yy = [preproc(text) for text in batch_rows[l2]]

        try:
            # Source side
            tokenizer.src_lang = long1
            x = tokenizer(xx, return_tensors='pt', padding=True,
                          truncation=True, max_length=max_length).to(model.device)

            # Target side is encoded with its own language code as prefix
            tokenizer.src_lang = long2
            y = tokenizer(yy, return_tensors='pt', padding=True,
                          truncation=True, max_length=max_length).to(model.device)
            # -100 marks padding positions so the loss ignores them
            y.input_ids[y.input_ids == tokenizer.pad_token_id] = -100

            # Forward pass and training
            loss = model(**x, labels=y.input_ids).loss
            loss.backward()
            # Read the scalar once; each .item() call copies it back from the device
            batch_loss = loss.item()
            epoch_losses.append(batch_loss)

            optimizer.step()
            optimizer.zero_grad(set_to_none=True)
            scheduler.step()

            # Update progress bar with current batch loss
            tq.set_postfix(loss=batch_loss)

        except RuntimeError as e:
            optimizer.zero_grad(set_to_none=True)
            # Drop references to this batch's tensors so cleanup() can
            # actually release their memory.
            x = y = loss = None
            cleanup()
            print(f'Error in epoch {epoch+1}, batch {batch_idx}: {str(e)}')
            continue

    # After completing all batches in epoch. If every batch failed there is
    # nothing to average; record NaN instead of triggering a numpy warning.
    avg_epoch_loss = np.mean(epoch_losses) if epoch_losses else float("nan")
    avg_losses.append(avg_epoch_loss)
    print(f'Epoch {epoch+1} completed. Average loss: {avg_epoch_loss:.4f}')

    # Compute validation BLEU score
    _, bleu_score = translate_and_evaluate(
        val_df['ro'], val_df['roma'], 'ro_Latn', 'roma_Latn'
    )
    bleu_scores.append(bleu_score)
    print(f'BLEU score: {bleu_score:.4f}')

    if bleu_score > best_bleu_score:
        # Keep only the best checkpoint by validation BLEU
        model.save_pretrained(MODEL_SAVE_PATH)
        tokenizer.save_pretrained(MODEL_SAVE_PATH)
        print("Saving model...")

        best_bleu_score = bleu_score

cleanup()

In [None]:
# Persist the per-epoch average training loss, one value per line.
with open("/kaggle/working/losses.txt", "w") as f:
    f.writelines(f"{loss}\n" for loss in avg_losses)

In [None]:
# Persist the per-epoch validation BLEU, one value per line.
with open("/kaggle/working/bleu_scores.txt", "w") as f:
    f.writelines(f"{score}\n" for score in bleu_scores)

In [None]:
# Drop the in-memory model and tokenizer, then try to reclaim GPU memory
# before the saved checkpoint is loaded.
del model, tokenizer
cleanup()

In [None]:
# Reload the checkpoint the training loop saved at its best validation BLEU.
tokenizer = AutoTokenizer.from_pretrained(MODEL_SAVE_PATH)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_SAVE_PATH)

In [None]:
# Move the reloaded checkpoint to the GPU before the final test evaluation.
model.cuda()

In [None]:
# Score the reloaded checkpoint on the held-out test split (Romanian -> Romani);
# the translations themselves are discarded.
_, test_bleu = translate_and_evaluate(
    test_df['ro'],
    test_df['roma'],
    src_lang='ro_Latn',
    tgt_lang='roma_Latn',
)

# Record the final score, rounded to four decimals.
with open("/kaggle/working/test_bleu.txt", "w") as f:
    f.write(f"{test_bleu:.4f}\n")