In [1]:
from datasets import load_dataset
from transformers import MarianTokenizer, MarianMTModel
import torch
from torch.utils.data import DataLoader
from torch.optim import AdamW
from evaluate import load


In [2]:

# 1. Load dataset (WMT16 English-Romanian)
dataset = load_dataset("wmt16", "ro-en", split="train[:200]")  # Small subset for testing


README.md:   0%|          | 0.00/11.1k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/108M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/362k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/342k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/610320 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1999 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1999 [00:00<?, ? examples/s]

In [3]:

# 2. Load tokenizer and model for English to Romanian
model_name = "Helsinki-NLP/opus-mt-en-ro"
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)


tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/789k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/817k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.38k [00:00<?, ?B/s]



pytorch_model.bin:   0%|          | 0.00/301M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/301M [00:00<?, ?B/s]

In [4]:

# 3. Tokenization function
def tokenize(example):
    inputs = tokenizer(example["translation"]["en"], padding="max_length", truncation=True, max_length=128)
    with tokenizer.as_target_tokenizer():
        targets = tokenizer(example["translation"]["ro"], padding="max_length", truncation=True, max_length=128)
    inputs["labels"] = targets["input_ids"]
    return inputs

tokenized_dataset = dataset.map(tokenize)


Map:   0%|          | 0/200 [00:00<?, ? examples/s]



In [5]:

# 4. PyTorch-compatible dataset class
class TranslationDataset(torch.utils.data.Dataset):
    def __init__(self, hf_dataset):
        self.input_ids = hf_dataset["input_ids"]
        self.attention_mask = hf_dataset["attention_mask"]
        self.labels = hf_dataset["labels"]

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return {
            "input_ids": torch.tensor(self.input_ids[idx]),
            "attention_mask": torch.tensor(self.attention_mask[idx]),
            "labels": torch.tensor(self.labels[idx]),
        }

train_dataset = TranslationDataset(tokenized_dataset)
dataloader = DataLoader(train_dataset, batch_size=2, shuffle=True)


In [6]:

# 5. Training setup
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
optimizer = AdamW(model.parameters(), lr=5e-5)

model.train()
for epoch in range(1):
    for batch in dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
    print(f"✅ Epoch complete | Loss: {loss.item():.4f}")


✅ Epoch complete | Loss: 0.0739


In [7]:

# 6. Save model
model.save_pretrained("./finetuned-en-ro")
tokenizer.save_pretrained("./finetuned-en-ro")
print("✅ Model saved to ./finetuned-en-ro")


✅ Model saved to ./finetuned-en-ro




In [8]:

# 7. BLEU evaluation
bleu = load("bleu")
model.eval()
predictions, references = [], []
sample_dataset = tokenized_dataset.select(range(50))

with torch.no_grad():
    for example in sample_dataset:
        input_ids = torch.tensor(example["input_ids"]).unsqueeze(0).to(device)
        output_ids = model.generate(input_ids, max_length=128)[0]
        pred = tokenizer.decode(output_ids, skip_special_tokens=True)
        ref = tokenizer.decode(example["labels"], skip_special_tokens=True)
        predictions.append(pred.strip())
        references.append([ref.strip()])

score = bleu.compute(predictions=predictions, references=references)
print(f"✅ BLEU score on 50 examples: {score['bleu']:.4f}")


✅ BLEU score on 50 examples: 0.5729
