In [None]:
import json
import torch
from datasets import Dataset, DatasetDict
from transformers import (
    AutoTokenizer, 
    AutoModelForSeq2SeqLM, 
    DataCollatorForSeq2Seq, 
    Seq2SeqTrainingArguments, 
    Seq2SeqTrainer
)
from pathlib import Path
import os
import gdown
import dotenv
dotenv.load_dotenv()

False

In [51]:
# only run on colab
%pip install sacremoses



In [53]:
!git clone https://github.com/TanaseVictorFlavian/Ro-Md-En-MT-Analysis.git

Cloning into 'Ro-Md-En-MT-Analysis'...
remote: Enumerating objects: 17839, done.[K
remote: Counting objects: 100% (17839/17839), done.[K
remote: Compressing objects: 100% (17822/17822), done.[K
remote: Total 17839 (delta 8), reused 17835 (delta 6), pack-reused 0 (from 0)[K
Receiving objects: 100% (17839/17839), 4.46 MiB | 18.96 MiB/s, done.
Resolving deltas: 100% (8/8), done.


In [None]:
# Loading Data
parallel_corpus_dir = Path.cwd() / "parallel_corpus"

if parallel_corpus_dir.exists() is False:
    # fallback for using colab extension in vscode
    parallel_corpus_dir = Path("/content/Ro-Md-En-MT-Analysis/parallel_corpus")

json_files = list(parallel_corpus_dir.glob("*.json"))
data_list = []

for index, file_path in enumerate(json_files):
    try:
        content = file_path.read_text(encoding='utf-8')
        if index == 1000:
            break
        data_list.append(json.loads(content))
    except Exception as e:
        print(f"Error reading {file_path}: {e}")

In [58]:
raw_dataset = Dataset.from_list(data_list)
split_datasets = raw_dataset.train_test_split(test_size=0.2, seed=42)

dataset = DatasetDict({
    'train': split_datasets['train'],
    'dev': split_datasets['test']
})

In [59]:
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

Using device: cuda


In [61]:
model_checkpoint = "Helsinki-NLP/opus-mt-ROMANCE-en"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint).to(device)

In [62]:
def preprocess_function(examples):
    inputs = [ex for ex in examples["source"]]
    targets = [ex for ex in examples["target"]]
    
    model_inputs = tokenizer(inputs, max_length=128, truncation=True)

    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, max_length=128, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_datasets = dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/800 [00:00<?, ? examples/s]



Map:   0%|          | 0/200 [00:00<?, ? examples/s]

In [63]:
args = Seq2SeqTrainingArguments(
    output_dir="./opus-mt-finetuned",
    eval_strategy="epoch",      
    save_strategy="epoch", 
    load_best_model_at_end=True,      
    metric_for_best_model="eval_loss",
    greater_is_better=False,         
    save_total_limit=1,            
    learning_rate=2e-5,               
    per_device_train_batch_size=32,   
    per_device_eval_batch_size=32,
    weight_decay=0.01,
    num_train_epochs=10,                
    fp16=torch.cuda.is_available(),    
    report_to="none",            
    logging_strategy="epoch",     
    logging_steps=1,                 
    logging_dir=None,
)

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)
trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["dev"],
    data_collator=data_collator,
    tokenizer=tokenizer,
)

print("Starting training...")
trainer.train()
trainer.save_model("./final_model")
tokenizer.save_pretrained("./final_model")
print("Training complete. Model saved to './final_model'")

  trainer = Seq2SeqTrainer(
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': None}.


Starting training...


Epoch,Training Loss,Validation Loss
1,0.8602,0.790198
2,0.651,0.768445
3,0.5412,0.757784
4,0.4614,0.759584
5,0.4095,0.760808
6,0.3635,0.764995
7,0.3272,0.766895
8,0.3075,0.769391
9,0.2941,0.770834
10,0.2849,0.771506


There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.encoder.embed_positions.weight', 'model.decoder.embed_tokens.weight', 'model.decoder.embed_positions.weight', 'lm_head.weight'].


Training complete. Model saved to './final_model'


In [80]:
!zip -r model_output.zip /content/final_model


  adding: content/final_model/ (stored 0%)
  adding: content/final_model/special_tokens_map.json (deflated 35%)
  adding: content/final_model/tokenizer_config.json (deflated 62%)
  adding: content/final_model/training_args.bin (deflated 53%)
  adding: content/final_model/config.json (deflated 63%)
  adding: content/final_model/source.spm (deflated 49%)
  adding: content/final_model/target.spm (deflated 50%)
  adding: content/final_model/vocab.json (deflated 70%)
  adding: content/final_model/model.safetensors (deflated 7%)
  adding: content/final_model/generation_config.json (deflated 40%)


In [None]:
input_text = "Merg la piață să iau pepene." 
input_ids = tokenizer(input_text, return_tensors="pt").input_ids.to(device)

outputs = model.generate(input_ids)
decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)

print(f"Source: {input_text}")
print(f"Translation: {decoded}")