In [None]:
import torch 
from pathlib import Path
from datasets import Dataset, DatasetDict, load_from_disk
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from peft import PeftModel


MODEL_NAME = "Helsinki-NLP/opus-mt-ROMANCE-en"
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
BATCH_SIZE = 32  # Adjust based on your GPU VRAM

# --- Load Resources ---
print(f"Loading model on {DEVICE}...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME).to(DEVICE)

# Build the adapter location with pathlib instead of a hand-written
# "models\model_opus_ro" literal: the backslash form relies on the invalid
# escape sequence "\m" and only resolves to a directory on Windows.
adapter_path = str(Path("models") / "model_opus_ro")
model = PeftModel.from_pretrained(model, adapter_path)
# Inference only: make sure dropout and similar training-time layers are off.
model.eval()

# --- Smoke test: translate a single sentence ---
input_text = "Salut, lume!"
inputs = tokenizer(input_text, return_tensors="pt").to(DEVICE)
with torch.no_grad():  # no gradients are needed for generation
    outputs = model.generate(**inputs, max_new_tokens=60)

# Decode (Turn numbers back into words)
translation = tokenizer.decode(outputs[0], skip_special_tokens=True)

print(f"Input:  {input_text}")
print(f"Output: {translation}")

Loading model on cuda...
Input:  Salut, lume!
Output: Hello, world!


In [None]:
from pathlib import Path

import evaluate
import numpy as np
import torch
from datasets import load_from_disk
from peft import PeftModel
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline, DataCollatorForSeq2Seq

# --- Configuration ---
MODEL_NAME = "Helsinki-NLP/opus-mt-ROMANCE-en"
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
BATCH_SIZE = 32  # Adjust based on your GPU VRAM

# --- Load Resources ---
print(f"Loading model on {DEVICE}...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME).to(DEVICE)

# Build the adapter location with pathlib instead of a hand-written
# "models\model_opus_ro" literal: the backslash form relies on the invalid
# escape sequence "\m" and only resolves to a directory on Windows.
adapter_path = str(Path("models") / "model_opus_ro")
model = PeftModel.from_pretrained(model, adapter_path)
# Evaluation only: make sure dropout and similar training-time layers are off.
model.eval()

# Load Metrics
metric_chrf = evaluate.load("chrf")
metric_ter = evaluate.load("ter")
metric_bertscore = evaluate.load("bertscore")
# NER Pipeline for English (since output is English)
ner_pipe = pipeline(
    "ner",
    model="dslim/bert-base-NER",
    aggregation_strategy="simple",
    device=0 if DEVICE == "cuda" else -1,  # pipeline convention: GPU index, or -1 for CPU
)

# Load Data
base_dir = Path.cwd()
# NOTE(review): the dataset path is resolved relative to the current working
# directory, so the notebook must be started from the project root.
dataset = load_from_disk("split_dataset")

def calculate_perplexity(model, tokenizer, sources, references, batch_size=None, max_length=128):
    """Compute the perplexity of the references given the sources.

    The model is run in teacher-forcing mode on (source, reference) pairs and
    the cross-entropy loss is aggregated over the whole corpus.

    Args:
        model: Seq2seq model; called as ``model(**batch)`` and expected to
            return an object exposing a scalar ``.loss``.
        tokenizer: Tokenizer matching ``model``; must support the
            ``text_target`` argument.
        sources: Sequence of source-language strings.
        references: Sequence of reference translations, aligned with
            ``sources``.
        batch_size: Number of pairs per forward pass. Defaults to the
            module-level ``BATCH_SIZE``.
        max_length: Truncation length (in tokens) for both sides.

    Returns:
        Tuple ``(perplexity, batch_perplexity_variance)``. ``perplexity`` is
        ``exp`` of the token-weighted mean loss over the whole corpus;
        the second value is the variance of the per-batch perplexities.
        Both are NaN when there is nothing to score.

    Raises:
        ValueError: If ``sources`` and ``references`` differ in length.
    """
    if len(sources) != len(references):
        raise ValueError(
            f"sources and references must be aligned: got {len(sources)} sources "
            f"and {len(references)} references"
        )
    if len(sources) == 0:
        # Nothing to score; avoid touching the model or averaging an empty list.
        return float("nan"), float("nan")

    if batch_size is None:
        batch_size = BATCH_SIZE

    model.eval()
    # Follow the model's actual placement instead of assuming a global device.
    device = next(model.parameters()).device

    # Use the same collator as training to guarantee identical behavior
    data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, padding=True)
    batch_losses = []
    batch_token_counts = []

    for start in range(0, len(sources), batch_size):
        batch_src = list(sources[start:start + batch_size])
        batch_ref = list(references[start:start + batch_size])

        # `text_target` tokenizes the references with the target-side
        # vocabulary and returns them under "labels"; it replaces the
        # deprecated `as_target_tokenizer()` context manager.
        encoded = tokenizer(
            batch_src,
            text_target=batch_ref,
            truncation=True,
            max_length=max_length,
        )
        features = [
            {"input_ids": input_ids, "attention_mask": attention_mask, "labels": label_ids}
            for input_ids, attention_mask, label_ids in zip(
                encoded["input_ids"], encoded["attention_mask"], encoded["labels"]
            )
        ]

        batch = data_collator(features)
        # The collator pads labels with -100 (its default label pad id), which
        # the loss ignores; count only the tokens that contribute to the loss.
        n_tokens = int((batch["labels"] != -100).sum().item())
        batch = {k: v.to(device) for k, v in batch.items()}

        with torch.no_grad():
            outputs = model(**batch)

        batch_losses.append(outputs.loss.item())
        batch_token_counts.append(n_tokens)

    if sum(batch_token_counts) == 0:
        return float("nan"), float("nan")

    # Each batch loss is already a mean over that batch's target tokens, so
    # weight it by the token count: an unweighted mean would over-weight the
    # (smaller) final batch and batches made of short sentences.
    mean_loss = np.average(batch_losses, weights=batch_token_counts)
    final_ppl = float(np.exp(mean_loss))

    # Spread of the per-batch perplexities (a variance, not a std deviation).
    ppl_var = float(np.var(np.exp(batch_losses)))

    return final_ppl, ppl_var

def calculate_ner_recall(references, predictions):
    """
    Average per-sentence named-entity recall of the predictions.

    For every reference/prediction pair, the entity surface forms found by
    the module-level `ner_pipe` are lowercased and compared as sets; the
    sentence score is the share of reference entities that also appear in the
    prediction. Sentences whose reference contains no entity are left out of
    the average. Returns 0.0 when no sentence could be scored.
    """
    def _surface_forms(entities):
        # Lowercased entity text, so matching ignores capitalisation.
        return {entity['word'].lower() for entity in entities}

    sentence_recalls = []

    # Walk over the corpus in BATCH_SIZE-sized windows.
    batch_starts = range(0, len(references), BATCH_SIZE)
    for start in tqdm(batch_starts, desc="Calculating NER"):
        stop = start + BATCH_SIZE

        # Tag the references first, then the predictions.
        gold_batch = ner_pipe(references[start:stop])
        hyp_batch = ner_pipe(predictions[start:stop])

        for gold_entities, hyp_entities in zip(gold_batch, hyp_batch):
            gold_words = _surface_forms(gold_entities)
            hyp_words = _surface_forms(hyp_entities)

            # Recall is undefined without reference entities; skip the pair.
            if not gold_words:
                continue

            found = gold_words & hyp_words
            sentence_recalls.append(len(found) / len(gold_words))

    if not sentence_recalls:
        return 0.0
    return np.mean(sentence_recalls)

def run_evaluation(split_name, data):
    """Translate one dataset split and print translation and linguistic metrics.

    Uses the module-level ``model``, ``tokenizer``, metric objects and
    ``BATCH_SIZE``/``DEVICE`` configuration.

    Args:
        split_name: Label used as a prefix in the printed report.
        data: Dataset split exposing ``column_names`` and column access by
            name. The source column is the first of ``'ro'``, ``'md'``,
            ``'source'`` that exists; the reference column is the first of
            ``'en'``, ``'target'``.

    Raises:
        KeyError: If no source or no reference column can be found.
    """
    print(f"\n--- Processing {split_name} ---")

    # 1. Extract Source and References
    # ASSUMPTION: Dataset has 'ro'/'md' column or 'source', and 'en' or 'target'
    # Adjust these keys based on your actual dataset structure!
    columns = data.column_names
    src_key = next((key for key in ('ro', 'md', 'source') if key in columns), None)
    ref_key = next((key for key in ('en', 'target') if key in columns), None)
    if src_key is None or ref_key is None:
        raise KeyError(
            f"Could not find source/reference columns in {list(columns)}; "
            "expected one of 'ro'/'md'/'source' and one of 'en'/'target'"
        )

    # Materialise the columns as plain lists so that slicing and the metric
    # libraries always receive ordinary Python sequences of strings.
    sources = list(data[src_key])
    references = list(data[ref_key])

    if not sources:
        print(f"[{split_name}] Split is empty, nothing to evaluate.")
        return

    # 2. Generate Translations (Hypotheses)
    print("Generating translations...")
    hypotheses = []
    for i in tqdm(range(0, len(sources), BATCH_SIZE)):
        batch = sources[i:i+BATCH_SIZE]
        inputs = tokenizer(batch, return_tensors="pt", padding=True, truncation=True).to(DEVICE)

        with torch.no_grad():
            generated_ids = model.generate(**inputs, max_new_tokens=128)

        decoded = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
        hypotheses.extend(decoded)

    # --- PIPELINE 1: Translation Metrics ---
    print("Computing Translation Metrics (ChrF++, TER, BERTScore)...")

    score_chrf = metric_chrf.compute(predictions=hypotheses, references=references, word_order=2) # word_order=2 enables chrf++
    score_ter = metric_ter.compute(predictions=hypotheses, references=references)
    score_bert = metric_bertscore.compute(predictions=hypotheses, references=references, lang="en")

    print(f"[{split_name}] ChrF++: {score_chrf['score']:.2f}")
    print(f"[{split_name}] TER: {score_ter['score']:.2f}")
    print(f"[{split_name}] BERTScore F1: {np.mean(score_bert['f1']):.4f}")

    # --- PIPELINE 2: Linguistic Analysis ---
    print("Computing Linguistic Metrics...")

    # Perplexity (Reference given Source)
    ppl_mean, ppl_var = calculate_perplexity(model, tokenizer, sources, references)
    # calculate_perplexity reports a variance; "±" conventionally denotes a
    # standard deviation, so convert before printing.
    ppl_std = np.sqrt(ppl_var)

    # Sentence Length (of Translations), measured in whitespace-separated tokens
    lens = [len(x.split()) for x in hypotheses]
    len_mean = np.mean(lens)
    len_std = np.std(lens)

    # NER Recall
    ner_rec = calculate_ner_recall(references, hypotheses)

    print(f"[{split_name}] Perplexity: {ppl_mean:.2f} ± {ppl_std:.2f}")
    print(f"[{split_name}] Sentence Len: {len_mean:.2f} ± {len_std:.2f}")
    print(f"[{split_name}] NER Recall: {ner_rec:.4f}")

# --- Run ---
# Report every split under the key it is actually loaded from, so the printed
# label can never disagree with the evaluated data (the previous labels said
# "dev_*" while the "train_*" splits were evaluated).
# NOTE(review): if the dev splits were the intended target, change the names
# below to the corresponding keys of `dataset` — confirm which splits exist.
for split_name in ("train_ro", "train_md"):
    run_evaluation(split_name, dataset[split_name])

Loading model on cuda...


Some weights of the model checkpoint at dslim/bert-base-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cuda:0



--- Processing dev_ro ---
Generating translations...


100%|██████████| 422/422 [05:53<00:00,  1.19it/s]


Computing Translation Metrics (ChrF++, TER, BERTScore)...


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[dev_ro] ChrF++: 78.19
[dev_ro] TER: 25.32
[dev_ro] BERTScore F1: 0.9725
Computing Linguistic Metrics...


Calculating NER: 100%|██████████| 422/422 [03:21<00:00,  2.09it/s]


[dev_ro] Perplexity: 3252.28 ± 3588585.80
[dev_ro] Sentence Len: 13.59 ± 25.76
[dev_ro] NER Recall: 0.7324

--- Processing dev_md ---
Generating translations...


100%|██████████| 422/422 [06:29<00:00,  1.08it/s]


Computing Translation Metrics (ChrF++, TER, BERTScore)...
[dev_md] ChrF++: 73.99
[dev_md] TER: 31.55
[dev_md] BERTScore F1: 0.9662
Computing Linguistic Metrics...


Calculating NER: 100%|██████████| 422/422 [06:53<00:00,  1.02it/s]

[dev_md] Perplexity: 3201.16 ± 4007790.54
[dev_md] Sentence Len: 13.93 ± 25.18
[dev_md] NER Recall: 0.6358



