In [8]:
import json
import torch
from datasets import Dataset, DatasetDict)
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    EarlyStoppingCallback
)
from peft import get_peft_model, LoraConfig, TaskType
from pathlib import Path
import os
import gdown
import dotenv
dotenv.load_dotenv()

False

In [2]:
# only run on colab
%pip install sacremoses

Collecting sacremoses
  Downloading sacremoses-0.1.1-py3-none-any.whl.metadata (8.3 kB)
Downloading sacremoses-0.1.1-py3-none-any.whl (897 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m897.5/897.5 kB[0m [31m18.1 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25hInstalling collected packages: sacremoses
Successfully installed sacremoses-0.1.1


In [3]:
!git clone https://github.com/TanaseVictorFlavian/Ro-Md-En-MT-Analysis.git

Cloning into 'Ro-Md-En-MT-Analysis'...
remote: Enumerating objects: 30044, done.[K
remote: Counting objects: 100% (30044/30044), done.[K
remote: Compressing objects: 100% (30017/30017), done.[K
remote: Total 30044 (delta 19), reused 30037 (delta 14), pack-reused 0 (from 0)[K
Receiving objects: 100% (30044/30044), 6.71 MiB | 11.18 MiB/s, done.
Resolving deltas: 100% (19/19), done.
Updating files: 100% (30012/30012), done.


In [4]:
# Loading Data
parallel_corpus_dir = Path.cwd() / "parallel_corpus"

if parallel_corpus_dir.exists() is False:
    # fallback for using colab extension in vscode
    parallel_corpus_dir = Path("/content/Ro-Md-En-MT-Analysis/parallel_corpus")

json_files = list(parallel_corpus_dir.glob("*.json"))
data_list = []

for file_path in json_files:
    try:
        content = file_path.read_text(encoding='utf-8')
        data_list.append(json.loads(content))
    except Exception as e:
        print(f"Error reading {file_path}: {e}")

print(len(json_files)) 

30000


In [5]:
raw_dataset = Dataset.from_list(data_list).shuffle(seed=2002) 
split_datasets = raw_dataset.train_test_split(test_size=0.2, seed=42) 

dataset = DatasetDict({
    'train': split_datasets['train'],
    'dev': split_datasets['test']
})

In [None]:
"""
raw_subset = raw_dataset.select(range(0, 1000))
split_subset = raw_subset.train_test_split(test_size=0.2, seed=42)

subset = DatasetDict({
    "train":split_subset['train'],
    "test":split_subset['test']
})
"""

In [9]:
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

Using device: cpu


In [None]:
model_checkpoint = "Helsinki-NLP/opus-mt-ROMANCE-en"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint).to(device)

In [None]:
def preprocess_function(examples):
    inputs = examples["source"]
    targets = examples["target"]
    
    model_inputs = tokenizer(
        inputs, 
        text_target=targets, 
        max_length=128, 
        truncation=True
    )
    
    return model_inputs

tokenized_datasets = dataset.map(preprocess_function, batched=True)

In [None]:
peft_config = LoraConfig(
    task_type=TaskType.SEQ_2_SEQ_LM, 
    inference_mode=False, 
    r=16,           
    lora_alpha=32,   
    lora_dropout=0.1,
    target_modules=["q_proj", "v_proj"] 
)   

In [None]:
args = Seq2SeqTrainingArguments(
    output_dir="./opus-lora-finetuned",
    learning_rate=3e-4,          
    per_device_train_batch_size=32, 
    num_train_epochs=10,       
    
    eval_strategy="epoch",
    save_strategy="epoch",
    per_device_eval_batch_size=32,
    save_total_limit=1,
    predict_with_generate=True,  
    fp16=True,
    load_best_model_at_end=True,
    report_to="none",
)

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)
trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["dev"],
    data_collator=data_collator,
    processing_class=tokenizer,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
)

print("Start training...")
trainer.train()
trainer.save_model("./final_model")
tokenizer.save_pretrained("./final_model")

  trainer = Seq2SeqTrainer(
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': None}.


Starting training...


Epoch,Training Loss,Validation Loss
1,0.8602,0.790198
2,0.651,0.768445
3,0.5412,0.757784
4,0.4614,0.759584
5,0.4095,0.760808
6,0.3635,0.764995
7,0.3272,0.766895
8,0.3075,0.769391
9,0.2941,0.770834
10,0.2849,0.771506


There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.encoder.embed_positions.weight', 'model.decoder.embed_tokens.weight', 'model.decoder.embed_positions.weight', 'lm_head.weight'].


Training complete. Model saved to './final_model'


In [None]:
input_text = "Merg la piață să iau pepene." 
input_ids = tokenizer(input_text, return_tensors="pt").input_ids.to(device)

outputs = model.generate(input_ids=input_ids)
decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)

print(f"Source: {input_text}")
print(f"Translation: {decoded}")