In [1]:
from datasets import load_dataset
from transformers import MarianTokenizer, MarianMTModel
import torch
from torch.utils.data import DataLoader
from torch.optim import AdamW
from evaluate import load


In [2]:

# 1. Load dataset
#dataset = load_dataset("wmt14", "de-en", split="train[:1%]")  # small subset for dev
dataset = load_dataset("wmt14", "de-en", split="train[:200]")  # just 200 samples


In [3]:

# 2. Load tokenizer and model
model_name = "Helsinki-NLP/opus-mt-en-de"
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)




In [4]:

# 3. Tokenize
def tokenize(example):
    inputs = tokenizer(example["translation"]["en"], padding="max_length", truncation=True, max_length=128)
    with tokenizer.as_target_tokenizer():
        targets = tokenizer(example["translation"]["de"], padding="max_length", truncation=True, max_length=128)
    inputs["labels"] = targets["input_ids"]
    return inputs

tokenized_dataset = dataset.map(tokenize)


In [5]:

# 4. Torch-compatible dataset
class TranslationDataset(torch.utils.data.Dataset):
    def __init__(self, hf_dataset):
        self.input_ids = hf_dataset["input_ids"]
        self.attention_mask = hf_dataset["attention_mask"]
        self.labels = hf_dataset["labels"]

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return {
            "input_ids": torch.tensor(self.input_ids[idx]),
            "attention_mask": torch.tensor(self.attention_mask[idx]),
            "labels": torch.tensor(self.labels[idx]),
        }

train_dataset = TranslationDataset(tokenized_dataset)
dataloader = DataLoader(train_dataset, batch_size=2, shuffle=True)


In [6]:

# 5. Training loop
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
optimizer = AdamW(model.parameters(), lr=5e-5)

model.train()
for epoch in range(1):
    for batch in dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
    print(f" Epoch complete | Loss: {loss.item():.4f}")


 Epoch complete | Loss: 0.3315


In [7]:

# 6. Save model
model.save_pretrained("./simple-finetuned-en-de")
tokenizer.save_pretrained("./simple-finetuned-en-de")
print(" Model saved to ./simple-finetuned-en-de")


 Model saved to ./simple-finetuned-en-de




In [8]:

# 7. Evaluate BLEU on a few examples
bleu = load("bleu")
model.eval()
predictions, references = [], []
sample_dataset = tokenized_dataset.select(range(100))

with torch.no_grad():
    for example in sample_dataset:
        input_ids = torch.tensor(example["input_ids"]).unsqueeze(0).to(device)
        output_ids = model.generate(input_ids, max_length=128)[0]
        pred = tokenizer.decode(output_ids, skip_special_tokens=True)
        ref = tokenizer.decode(example["labels"], skip_special_tokens=True)
        predictions.append(pred.strip())
        references.append([ref.strip()])

score = bleu.compute(predictions=predictions, references=references)
print(f" BLEU score on 100 examples: {score['bleu']:.4f}")


 BLEU score on 100 examples: 0.3236
