In [1]:
from transformers import MBart50TokenizerFast
import json
import torch
from datasets import Dataset

In [2]:
text_json = "Subtitle_Dataset/aligned_subtitles.json"
tokenized_file = "Subtitle_Dataset/tokenized_subtitles.pt"

In [3]:
# Load tokenizer
model_name = "facebook/mbart-large-50-many-to-many-mmt"
tokenizer = MBart50TokenizerFast.from_pretrained(model_name)

In [4]:
# Set language codes
SRC_LANG = "en_XX"
TGT_LANG = "si_LK"
tokenizer.src_lang = SRC_LANG
tokenizer.tgt_lang = TGT_LANG

In [5]:

# Load your cleaned JSON data
with open(text_json, "r", encoding="utf-8") as f:
    data = json.load(f)

# Prepare inputs
sources = [item["en"] for item in data]
targets = [item["si"] for item in data]

# ✅ New way: tokenize source + target in one call
tokenized_data = tokenizer(
    sources,
    text_target=targets,
    max_length=128,
    padding="max_length",
    truncation=True,
    return_tensors="pt"
)

torch.save(tokenized_data, tokenized_file)

print("✅ Tokenization complete. Tensors saved to 'tokenized_subtitles.pt'")


✅ Tokenization complete. Tensors saved to 'tokenized_subtitles.pt'


In [6]:
# Load cleaned data
with open(text_json, "r", encoding="utf-8") as f:
    data = json.load(f)

# Convert to HuggingFace Dataset
dataset = Dataset.from_list(data)

# Optional: Split train/val
dataset = dataset.train_test_split(test_size=0.1)


In [7]:
from transformers import MBart50TokenizerFast

tokenizer = MBart50TokenizerFast.from_pretrained("facebook/mbart-large-50-many-to-many-mmt")
tokenizer.src_lang = "en_XX"
tokenizer.tgt_lang = "si_LK"

def preprocess(example):
    model_inputs = tokenizer(
        example["en"],
        max_length=128,
        truncation=True,
        padding="max_length"
    )

    labels = tokenizer(
        example["si"],
        max_length=128,
        truncation=True,
        padding="max_length"
    )["input_ids"]

    model_inputs["labels"] = labels
    return model_inputs

# Tokenize dataset
tokenized_dataset = dataset.map(preprocess, batched=True)


Map:   0%|          | 0/4059 [00:00<?, ? examples/s]

Map:   0%|          | 0/452 [00:00<?, ? examples/s]

In [8]:
from transformers import MBartForConditionalGeneration

model = MBartForConditionalGeneration.from_pretrained("facebook/mbart-large-50-many-to-many-mmt")
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir="./checkpoints-mbart50-en-si",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    save_strategy="epoch",
    save_total_limit=2,
    predict_with_generate=True,
    fp16=True  # if using GPU
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer
)

# 🏁 Start training
trainer.train()


TypeError: TrainingArguments.__init__() got an unexpected keyword argument 'evaluation_strategy'