In [1]:
from transformers import MBart50TokenizerFast
import json
import torch
from datasets import Dataset

In [2]:
text_json = "Subtitle_Dataset/aligned_subtitles.json"
tokenized_file = "Subtitle_Dataset/tokenized_subtitles.pt"

In [3]:
# Load tokenizer
model_name = "facebook/mbart-large-50-many-to-many-mmt"
tokenizer = MBart50TokenizerFast.from_pretrained(model_name)

In [4]:
# Set language codes
SRC_LANG = "en_XX"
TGT_LANG = "si_LK"
tokenizer.src_lang = SRC_LANG
tokenizer.tgt_lang = TGT_LANG

In [5]:

# Load your cleaned JSON data
with open(text_json, "r", encoding="utf-8") as f:
    data = json.load(f)

# Prepare inputs
sources = [item["en"] for item in data]
targets = [item["si"] for item in data]

# ✅ New way: tokenize source + target in one call
tokenized_data = tokenizer(
    sources,
    text_target=targets,
    max_length=128,
    padding="max_length",
    truncation=True,
    return_tensors="pt"
)

torch.save(tokenized_data, tokenized_file)

print("✅ Tokenization complete. Tensors saved to 'tokenized_subtitles.pt'")


✅ Tokenization complete. Tensors saved to 'tokenized_subtitles.pt'


In [6]:
# Load cleaned data
with open(text_json, "r", encoding="utf-8") as f:
    data = json.load(f)

# Convert to HuggingFace Dataset
dataset = Dataset.from_list(data)

# Optional: Split train/val
dataset = dataset.train_test_split(test_size=0.1)


In [7]:
from transformers import MBart50TokenizerFast

tokenizer = MBart50TokenizerFast.from_pretrained("facebook/mbart-large-50-many-to-many-mmt")
tokenizer.src_lang = "en_XX"
tokenizer.tgt_lang = "si_LK"

def preprocess(example):
    model_inputs = tokenizer(
        example["en"],
        max_length=128,
        truncation=True,
        padding="max_length"
    )

    labels = tokenizer(
        example["si"],
        max_length=128,
        truncation=True,
        padding="max_length"
    )["input_ids"]

    model_inputs["labels"] = labels
    return model_inputs

# Tokenize dataset
tokenized_dataset = dataset.map(preprocess, batched=True)


Map:   0%|          | 0/4059 [00:00<?, ? examples/s]

Map:   0%|          | 0/452 [00:00<?, ? examples/s]

In [8]:
from transformers import MBartForConditionalGeneration

model = MBartForConditionalGeneration.from_pretrained("facebook/mbart-large-50-many-to-many-mmt")
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir="./checkpoints-mbart50-en-si",
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    save_strategy="epoch",
    save_total_limit=2,
    gradient_accumulation_steps=2,
    fp16=True  # if using GPU
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer
)

# 🏁 Start training
trainer.train()


  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,1.8398,0.992283
2,0.917,0.969998
3,0.8353,0.970196




TrainOutput(global_step=1524, training_loss=1.191796876001233, metrics={'train_runtime': 10783.9569, 'train_samples_per_second': 1.129, 'train_steps_per_second': 0.141, 'total_flos': 3298642398019584.0, 'train_loss': 1.191796876001233, 'epoch': 3.0})

In [9]:
model.save_pretrained("mbart50-en-si")
tokenizer.save_pretrained("mbart50-en-si")


('mbart50-en-si/tokenizer_config.json',
 'mbart50-en-si/special_tokens_map.json',
 'mbart50-en-si/sentencepiece.bpe.model',
 'mbart50-en-si/added_tokens.json',
 'mbart50-en-si/tokenizer.json')

In [10]:
from transformers import MBartForConditionalGeneration, MBart50TokenizerFast

model = MBartForConditionalGeneration.from_pretrained("mbart50-en-si")
tokenizer = MBart50TokenizerFast.from_pretrained("mbart50-en-si")

model.eval()


MBartForConditionalGeneration(
  (model): MBartModel(
    (shared): MBartScaledWordEmbedding(250054, 1024, padding_idx=1)
    (encoder): MBartEncoder(
      (embed_tokens): MBartScaledWordEmbedding(250054, 1024, padding_idx=1)
      (embed_positions): MBartLearnedPositionalEmbedding(1026, 1024)
      (layers): ModuleList(
        (0-11): 12 x MBartEncoderLayer(
          (self_attn): MBartAttention(
            (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (activation_fn): ReLU()
          (fc1): Linear(in_features=1024, out_features=4096, bias=True)
          (fc2): Linear(in_features=4096, out_features=1024, bias=True)
        

In [13]:
# Example format
# [{"en": "...", "si": "..."}, ...]
import random

with open(text_json, "r", encoding="utf-8") as f:
    eval_data = json.load(f)

eval_sample = random.sample(eval_data, 50)

In [14]:
from tqdm import tqdm

references = []
predictions = []

for example in tqdm(eval_sample):
    input_text = example["en"]
    reference = example["si"]

    # Tokenize input
    inputs = tokenizer(input_text, return_tensors="pt", padding=True, truncation=True).to(model.device)
    
    # Generate translation
    generated_tokens = model.generate(
        **inputs,
        forced_bos_token_id=tokenizer.lang_code_to_id["si_LK"],  # Sinhala
        max_length=128,
        num_beams=5
    )

    translated_text = tokenizer.decode(generated_tokens[0], skip_special_tokens=True)

    references.append([reference.split()])  # List of references
    predictions.append(translated_text.split())


100%|██████████| 50/50 [07:50<00:00,  9.41s/it]


In [15]:
from nltk.translate.bleu_score import corpus_bleu

bleu_score = corpus_bleu(references, predictions)
print(f"🔵 BLEU score: {bleu_score * 100:.2f}")


🔵 BLEU score: 2.24


In [20]:

# for example in tqdm(eval_sample):
input_text = "Lord Stark!"
# reference = example["si"]

# Tokenize input
inputs = tokenizer(input_text, return_tensors="pt", padding=True, truncation=True).to(model.device)

# Generate translation
generated_tokens = model.generate(
    **inputs,
    forced_bos_token_id=tokenizer.lang_code_to_id["si_LK"],  # Sinhala
    max_length=128,
    num_beams=5
)

translated_text = tokenizer.decode(generated_tokens[0], skip_special_tokens=True)

# references.append([reference.split()])  # List of references
# predictions.append(translated_text.split())

print(f"Input: {input_text}")
print(f"Translated: {translated_text}")

Input: Lord Stark!
Translated: ස්ටාක් උතුමාණෙනි!


In [1]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, BitsAndBytesConfig

bnb_config = BitsAndBytesConfig(
    load_in_8bit=True,
    llm_int8_threshold=6.0
)

model = AutoModelForSeq2SeqLM.from_pretrained(
    "mbart50-en-si",  # your fine-tuned model dir
    device_map="auto",
    quantization_config=bnb_config
)

tokenizer = AutoTokenizer.from_pretrained("mbart50-en-si")


In [32]:
import bitsandbytes as bnb
print(bnb.__version__)

0.46.1
