In [18]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Seq2SeqTrainer, Seq2SeqTrainingArguments, DataCollatorForSeq2Seq
from datasets import load_dataset
import torch

In [19]:
from datasets import load_dataset

# Load the dataset
dataset = load_dataset("opus_books", "en-fr")

# Original train dataset
train_dataset = dataset["train"]

# Create a validation split (e.g., 5% of the train set)
split = train_dataset.train_test_split(test_size=0.05, seed=42)
train_dataset = split['train']
val_dataset = split['test']

print(f"Train size: {len(train_dataset)}")
print(f"Validation size: {len(val_dataset)}")


Train size: 120730
Validation size: 6355


In [20]:
train_dataset = train_dataset.select(range(2000))  # first 2000 examples
val_dataset = val_dataset.select(range(200))

In [None]:
MODEL_NAME = "Helsinki-NLP/opus-mt-en-fr"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)




In [22]:
# ===== 3. Tokenization =====
def preprocess(batch):
    # Columns in opus_books: 'translation' dict containing 'en' and 'de'
    inputs = tokenizer([t["en"] for t in batch["translation"]],
                       truncation=True, padding="max_length", max_length=64)
    targets = tokenizer([t["fr"] for t in batch["translation"]],
                        truncation=True, padding="max_length", max_length=64)
    batch["input_ids"] = inputs.input_ids
    batch["attention_mask"] = inputs.attention_mask
    batch["labels"] = targets.input_ids
    return batch

train_tokenized = train_dataset.map(preprocess, batched=True)
val_tokenized = val_dataset.map(preprocess, batched=True)


In [23]:
# ===== 4. Training Arguments =====
args = Seq2SeqTrainingArguments(
    output_dir="./opus-mt-en-fr-finetuned",
    eval_steps=50,
    save_steps=100,
    logging_steps=20,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    learning_rate=5e-5,
    weight_decay=0.01,
    save_total_limit=2,
    num_train_epochs=1,
    predict_with_generate=True,
    fp16=torch.cuda.is_available(),
)



In [24]:
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [25]:
# ===== 5. Trainer =====
trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    train_dataset=train_tokenized,
    eval_dataset=val_tokenized,
    tokenizer=tokenizer,
    data_collator=data_collator,
)


  trainer = Seq2SeqTrainer(


In [26]:
trainer.train()

The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': None}.


Step,Training Loss
20,6.0065
40,3.5291
60,4.496
80,3.7591
100,3.2765
120,3.886
140,2.2313
160,3.7565
180,3.7365
200,3.6577




TrainOutput(global_step=2000, training_loss=2.7274599876403807, metrics={'train_runtime': 426.1516, 'train_samples_per_second': 4.693, 'train_steps_per_second': 4.693, 'total_flos': 33898364928000.0, 'train_loss': 2.7274599876403807, 'epoch': 1.0})

In [27]:
trainer.save_model("./opus-mt-en-de-finetuned")
tokenizer.save_pretrained("./opus-mt-en-de-finetuned")


('./opus-mt-en-de-finetuned/tokenizer_config.json',
 './opus-mt-en-de-finetuned/special_tokens_map.json',
 './opus-mt-en-de-finetuned/vocab.json',
 './opus-mt-en-de-finetuned/source.spm',
 './opus-mt-en-de-finetuned/target.spm',
 './opus-mt-en-de-finetuned/added_tokens.json')

In [28]:
model.to("cuda" if torch.cuda.is_available() else "cpu")

examples = [
    "Hello, How are you",
    "I love pizza",
    "Virat Kohli is the best cricketer of all time",
]

for text in examples:
    inputs = tokenizer(text, return_tensors="pt").to(model.device)
    output_tokens = model.generate(**inputs, num_beams=4, max_length=64)
    translated_text = tokenizer.decode(output_tokens[0], skip_special_tokens=True)
    print(f"Input: {text}")
    print(f"Translation: {translated_text}")
    print("-" * 50)


Input: Good morning! How are you?
Translation: Mais! À-moi-vous?
--------------------------------------------------
Input: I love programming.
Translation: Je lover réproduir.
--------------------------------------------------
Input: Where is the nearest railway station?
Translation: Une s'était la traître?
--------------------------------------------------
