In [None]:
import json
import torch
from datasets import Dataset, DatasetDict, load_from_disk
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    EarlyStoppingCallback
)
from peft import get_peft_model, LoraConfig, TaskType
from pathlib import Path
import os
import dotenv
import gc

# Load environment variables from a local .env file, if one is present (python-dotenv).
# NOTE(review): none of the cells shown here read an environment variable afterwards — confirm this is still needed.
dotenv.load_dotenv()

True

In [2]:
# only run on colab
%pip install sacremoses

Collecting sacremoses
  Downloading sacremoses-0.1.1-py3-none-any.whl.metadata (8.3 kB)
Downloading sacremoses-0.1.1-py3-none-any.whl (897 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m897.5/897.5 kB[0m [31m18.1 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25hInstalling collected packages: sacremoses
Successfully installed sacremoses-0.1.1


In [3]:
!git clone https://github.com/TanaseVictorFlavian/Ro-Md-En-MT-Analysis.git

Cloning into 'Ro-Md-En-MT-Analysis'...
remote: Enumerating objects: 30044, done.[K
remote: Counting objects: 100% (30044/30044), done.[K
remote: Compressing objects: 100% (30017/30017), done.[K
remote: Total 30044 (delta 19), reused 30037 (delta 14), pack-reused 0 (from 0)[K
Receiving objects: 100% (30044/30044), 6.71 MiB | 11.18 MiB/s, done.
Resolving deltas: 100% (19/19), done.
Updating files: 100% (30012/30012), done.


In [None]:
# Loading Data
# Prefer a parallel_corpus directory in the current working directory.
parallel_corpus_dir = Path.cwd() / "parallel_corpus"

if not parallel_corpus_dir.exists():
    # fallback for using colab extension in vscode
    parallel_corpus_dir = Path("/content/Ro-Md-En-MT-Analysis/parallel_corpus")

# glob() yields files in filesystem order; sorting keeps data_list in the same
# order on every machine and every run.
json_files = sorted(parallel_corpus_dir.glob("*.json"))
data_list = []

for file_path in json_files:
    try:
        content = file_path.read_text(encoding='utf-8')
        data_list.append(json.loads(content))
    except (OSError, ValueError) as e:
        # OSError: unreadable file; ValueError: invalid UTF-8 or malformed JSON
        # (UnicodeDecodeError and json.JSONDecodeError are both ValueError subclasses).
        # Best effort: report the file and keep loading the rest.
        print(f"Error reading {file_path}: {e}")

# Number of JSON files discovered (files that failed to parse are still counted).
print(len(json_files))

In [12]:
# Read the saved dataset from the "split_dataset" directory, then regroup its
# "<split>_ro" / "<split>_md" entries into one train/dev DatasetDict per suffix.
dataset = load_from_disk("split_dataset")

ro_dataset = DatasetDict(
    {split: dataset[f"{split}_ro"] for split in ("train", "dev")}
)

md_dataset = DatasetDict(
    {split: dataset[f"{split}_md"] for split in ("train", "dev")}
)

In [9]:
# Use the GPU when PyTorch reports CUDA as available; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

print(f"Using device: {device}")

Using device: cpu


In [None]:
# Hub identifier of the pretrained checkpoint; train_translation_model loads its
# model weights from this same name, so tokenizer and model stay in sync.
model_checkpoint = "Helsinki-NLP/opus-mt-ROMANCE-en"
# Shared tokenizer: used by preprocess_function and saved next to each trained model.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [None]:
def preprocess_function(examples):
    """Tokenize one batch of sentence pairs with the notebook-level ``tokenizer``.

    Args:
        examples: Mapping with a "source" entry (model inputs) and a "target"
            entry (passed to the tokenizer as ``text_target``).

    Returns:
        Whatever the tokenizer returns for the pair, with truncation enabled
        and ``max_length`` set to 128.
    """
    return tokenizer(
        examples["source"],
        text_target=examples["target"],
        max_length=128,
        truncation=True,
    )

# Apply preprocess_function to every split, in batched mode so it receives
# lists of sentences rather than one example at a time.
tokenized_ro = ro_dataset.map(function=preprocess_function, batched=True)
tokenized_md = md_dataset.map(function=preprocess_function, batched=True)

In [None]:
def train_translation_model(train_data, eval_data, model_name_suffix):
    """
    Fine-tune the pretrained translation checkpoint with LoRA on one language split.

    A fresh copy of ``model_checkpoint`` is loaded, wrapped with LoRA adapters on
    the ``q_proj`` / ``v_proj`` modules and trained with ``Seq2SeqTrainer``.
    Evaluation, checkpointing and logging happen once per epoch. Training stops
    early after 3 evaluations without improvement, and ``load_best_model_at_end``
    is enabled with ``eval_loss`` as the selection metric.

    Args:
        train_data: Tokenized training split handed to the trainer.
        eval_data: Tokenized evaluation split handed to the trainer.
        model_name_suffix: Tag used in the checkpoint directory
            (``./opus-lora-finetuned-<suffix>``) and in the final output
            directory (``./final_model_<suffix>``).

    Returns:
        None. The trained model and the tokenizer are saved to the final directory.
    """
    # fp16 mixed precision needs a CUDA device; requesting it on a CPU-only
    # runtime is not supported by the Trainer, so enable it only when available.
    use_fp16 = torch.cuda.is_available()

    model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint).to(device)

    peft_config = LoraConfig(
        task_type=TaskType.SEQ_2_SEQ_LM,
        inference_mode=False,
        r=16,
        lora_alpha=32,
        lora_dropout=0.1,
        target_modules=["q_proj", "v_proj"],
    )

    model = get_peft_model(model, peft_config)

    output_dir = f"./opus-lora-finetuned-{model_name_suffix}"
    final_dir = f"./final_model_{model_name_suffix}"

    args = Seq2SeqTrainingArguments(
        output_dir=output_dir,
        learning_rate=3e-4,
        per_device_train_batch_size=32,
        num_train_epochs=10,
        eval_strategy="epoch",
        save_strategy="epoch",
        logging_strategy="epoch",
        per_device_eval_batch_size=32,
        save_total_limit=1,
        predict_with_generate=True,
        fp16=use_fp16,
        load_best_model_at_end=True,
        metric_for_best_model="eval_loss",
        report_to="none",
    )

    data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

    trainer = Seq2SeqTrainer(
        model=model,
        args=args,
        train_dataset=train_data,
        eval_dataset=eval_data,
        data_collator=data_collator,
        processing_class=tokenizer,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
    )

    try:
        print(f"--- Starting training for: {model_name_suffix} ---")
        trainer.train()
        trainer.save_model(final_dir)
        tokenizer.save_pretrained(final_dir)
        print(f"Model saved to {final_dir}")
    finally:
        # Drop the local references even when training raises: a notebook keeps
        # the failing frame alive through the stored traceback, which would
        # otherwise pin the model in memory before the next run.
        del model
        del trainer
        gc.collect()
        torch.cuda.empty_cache()

In [None]:
# Fine-tune on the "ro" split; outputs are tagged with the "ro" suffix.
train_translation_model(
    train_data=tokenized_ro["train"],
    eval_data=tokenized_ro["dev"],
    model_name_suffix="ro",
)

In [None]:
# Extra cleanup between the two training runs. train_translation_model already
# makes these same two calls before it returns, so this cell is a safety net only.
gc.collect()
torch.cuda.empty_cache()

In [None]:
# Fine-tune on the "md" split; outputs are tagged with the "md" suffix.
train_translation_model(
    train_data=tokenized_md["train"],
    eval_data=tokenized_md["dev"],
    model_name_suffix="md",
)

In [None]:
# Disabled translation smoke test, kept inside a string literal so it does not execute.
# NOTE(review): in the cells shown, `model` is only bound inside train_translation_model
# (and deleted there), so this snippet would have to load a saved model first
# (e.g. from one of the ./final_model_<suffix> directories) before it could run.
"""
input_text = "Merg la piață să iau pepene." 
input_ids = tokenizer(input_text, return_tensors="pt").input_ids.to(device)

outputs = model.generate(input_ids=input_ids)
decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)

print(f"Source: {input_text}")
print(f"Translation: {decoded}")
"""