In [1]:
# ==============================================================================
# 1. SETUP AND INSTALLATIONS
# ==============================================================================
# Installs the evaluation stack through the %pip magic.
# transformers, accelerate and peft are pinned to exact versions; bitsandbytes
# only has a lower bound; the metric packages (evaluate, rouge_score,
# bert_score, sacrebleu, unbabel-comet) are left unpinned.
%pip install --upgrade \
    "transformers==4.38.2" \
    "accelerate==0.28.0" \
    "peft==0.9.0" \
    "bitsandbytes>=0.41.3" \
    "evaluate" \
    "rouge_score" \
    "bert_score" \
    "sacrebleu" \
    "unbabel-comet"

# After this cell finishes, you MUST restart the kernel for the updates to take effect.

Collecting transformers==4.38.2
  Using cached transformers-4.38.2-py3-none-any.whl.metadata (130 kB)
Collecting accelerate==0.28.0
  Using cached accelerate-0.28.0-py3-none-any.whl.metadata (18 kB)
Collecting peft==0.9.0
  Using cached peft-0.9.0-py3-none-any.whl.metadata (13 kB)
Collecting tokenizers<0.19,>=0.14 (from transformers==4.38.2)
  Using cached tokenizers-0.15.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Using cached transformers-4.38.2-py3-none-any.whl (8.5 MB)
Using cached accelerate-0.28.0-py3-none-any.whl (290 kB)
Using cached peft-0.9.0-py3-none-any.whl (190 kB)
Using cached tokenizers-0.15.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.6 MB)
Installing collected packages: tokenizers, transformers, accelerate, peft
  Attempting uninstall: tokenizers
    Found existing installation: tokenizers 0.21.4
    Uninstalling tokenizers-0.21.4:
      Successfully uninstalled tokenizers-0.21.4
  Attempting uninstall: transformers

In [2]:
import os

# Upgrade all the necessary libraries to ensure compatibility.
#
# The install is run through the kernel's own interpreter (`sys.executable -m pip`)
# so the packages land in the environment this notebook actually imports from,
# and `check=True` makes a failed install raise instead of silently printing the
# success message below.
#
# NOTE(review): this is an unpinned `--upgrade`, so it replaces any exact
# versions of transformers / peft / accelerate installed earlier with the latest
# releases. Confirm which set of versions the evaluation is meant to run on.
import subprocess
import sys

pip_upgrade_cmd = [
    sys.executable, "-m", "pip", "install", "--upgrade", "--quiet",
    "transformers", "peft", "accelerate", "bitsandbytes",
]
subprocess.run(pip_upgrade_cmd, check=True)

print("Environment setup complete. Please restart the kernel now.")

Environment setup complete. Please restart the kernel now.


[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m


In [2]:
# ==============================================================================
# 2. CONFIGURATION AND MODEL LOADING
# ==============================================================================
import os
import torch
import evaluate
import pandas as pd
from datasets import load_dataset
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, BitsAndBytesConfig
from peft import PeftModel
from tqdm import tqdm


# --- Configuration ---
class EvalConfig:
    """Static settings for the evaluation run: model, adapter, dataset, generation."""

    # --- Model ---
    # Identifier of the base model.
    MODEL_ID: str = "CohereLabs/aya-101"
    # Location of the trained LoRA adapter.
    # VERIFY this path matches the output of your training script.
    ADAPTER_PATH: str = "./results_aya_101/final_checkpoint"

    # --- Dataset ---
    DATASET_NAME: str = "RobbedoesHF/dutch-definitions"
    DATASET_TEST_SPLIT: str = "test"

    # --- Generation ---
    # Token limit applied when tokenizing the prompts.
    MAX_SOURCE_LENGTH: int = 256
    # Upper bound on newly generated tokens per sample.
    MAX_TARGET_LENGTH: int = 384
    # Number of test samples processed per generation step.
    BATCH_SIZE: int = 16
    
config = EvalConfig()

# --- Model and Tokenizer Loading ---
print("Loading base model and tokenizer...")

# 4-bit NF4 quantization with double quantization; compute dtype is bfloat16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# Base seq2seq model, quantized at load time; device placement is automatic.
base_model_kwargs = {
    "quantization_config": bnb_config,
    "device_map": "auto",
}
model = AutoModelForSeq2SeqLM.from_pretrained(config.MODEL_ID, **base_model_kwargs)

# Tokenizer is taken from the base model id (no trust_remote_code is passed).
tokenizer = AutoTokenizer.from_pretrained(config.MODEL_ID)

print(f"Loading LoRA adapter from: {config.ADAPTER_PATH}")
# Wrap the quantized base model with the adapter weights stored at ADAPTER_PATH.
model = PeftModel.from_pretrained(model, config.ADAPTER_PATH)

# Put the wrapped model into evaluation mode before generating.
model.eval()
print("Model loaded successfully.")

Loading base model and tokenizer...


config.json:   0%|          | 0.00/836 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/50.6k [00:00<?, ?B/s]

Fetching 11 files:   0%|          | 0/11 [00:00<?, ?it/s]

model-00004-of-00011.safetensors:   0%|          | 0.00/4.90G [00:00<?, ?B/s]

model-00008-of-00011.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00005-of-00011.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00006-of-00011.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00003-of-00011.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00002-of-00011.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00007-of-00011.safetensors:   0%|          | 0.00/4.87G [00:00<?, ?B/s]

model-00001-of-00011.safetensors:   0%|          | 0.00/4.94G [00:00<?, ?B/s]

model-00009-of-00011.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00010-of-00011.safetensors:   0%|          | 0.00/2.99G [00:00<?, ?B/s]

model-00011-of-00011.safetensors:   0%|          | 0.00/4.10G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/11 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/142 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/833 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/16.3M [00:00<?, ?B/s]

Loading LoRA adapter from: ./results_aya_101/final_checkpoint
Model loaded successfully.


In [3]:
# ==============================================================================
# 3. GENERATE PREDICTIONS FOR THE TEST SET
# ==============================================================================
print("Loading test dataset and preparing for generation...")
# Only the split named in the config is loaded.
dataset = load_dataset(path=config.DATASET_NAME, split=config.DATASET_TEST_SPLIT)

# Accumulator for the per-sample records (source / prediction / reference).
results = list()

# This function creates the simple prompt string used during AYA 101 fine-tuning.
def create_inference_prompt(lemma, short_def):
    """
    Build the single prompt string sent to the model for one dictionary entry.

    Args:
        lemma: The word whose definition should be expanded.
        short_def: The short definition to expand.

    Returns:
        The Dutch system prompt, a separator, and the Dutch instruction with
        `lemma` and `short_def` each wrapped in single quotes.
    """
    # Role-playing system prompt (Dutch).
    persona = "Je bent een expert-lexicograaf die definities schrijft voor een Nederlands woordenboek."

    # NOTE(review): this separator is the four literal characters backslash-n
    # backslash-n, NOT two newlines. Confirm the training prompts used the same
    # literal text before changing it.
    separator = "\\n\\n"

    # Task-specific instruction (Dutch).
    instruction = f"Breid de volgende korte definitie voor het woord '{lemma}' uit tot een volledige definitie: '{short_def}'"

    return "".join((persona, separator, instruction))

print(f"Generating predictions for {len(dataset)} samples...")

# Send inputs to the device the model reports instead of assuming "cuda":
# the base model is loaded with device_map="auto", so its placement is decided
# at load time and a hard-coded device string can disagree with it.
input_device = model.device

# Process the dataset in batches for efficiency
for i in tqdm(range(0, len(dataset), config.BATCH_SIZE)):
    # Slicing yields the columns of this batch, indexed by column name below.
    batch = dataset[i:i+config.BATCH_SIZE]

    # Create the input prompts
    prompts = [
        create_inference_prompt(lemma, short_def)
        for lemma, short_def in zip(batch['Lemma'], batch['DefinitionShort'])
    ]

    # The 'source' for COMET is the full input prompt
    sources = prompts

    # Tokenize the batch of prompts (padded to the longest, truncated to the
    # configured source length).
    inputs = tokenizer(
        prompts,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=config.MAX_SOURCE_LENGTH,
    ).to(input_device)

    # Generate outputs with beam search; no gradients are needed for inference.
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=config.MAX_TARGET_LENGTH,
            num_beams=4,
            early_stopping=True,
            pad_token_id=tokenizer.eos_token_id
        )

    # Decode the generated sequences
    predictions = tokenizer.batch_decode(outputs, skip_special_tokens=True)

    # Store one record per sample, pairing each prediction with its prompt and
    # its reference definition.
    results.extend(
        {"source": source, "prediction": prediction, "reference": reference}
        for source, prediction, reference in zip(sources, predictions, batch['DefinitionFull'])
    )

# Convert results to a pandas DataFrame for easier handling
results_df = pd.DataFrame(results)
print("\nGeneration complete. Example result:")
print(results_df.head())

Loading test dataset and preparing for generation...


README.md:   0%|          | 0.00/723 [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/2.80M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/381k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/379k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/27880 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3494 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/3450 [00:00<?, ? examples/s]

Generating predictions for 3450 samples...


100%|██████████| 216/216 [20:19<00:00,  5.65s/it] 


Generation complete. Example result:
                                              source  \
0  Je bent een expert-lexicograaf die definities ...   
1  Je bent een expert-lexicograaf die definities ...   
2  Je bent een expert-lexicograaf die definities ...   
3  Je bent een expert-lexicograaf die definities ...   
4  Je bent een expert-lexicograaf die definities ...   

                                          prediction  \
0  maatschappij waarin iedereen minimaal 1,5 mete...   
1  centrale waarop de politie en de hulpdiensten ...   
2  bankbiljet dat de waarde van 200 euro vertegen...   
3                   monitor die 3D-beelden weergeeft   
4  autootje met de status van een bromfiets en ee...   

                                           reference  
0  maatschappij waarin mensen die niet tot hetzel...  
1  alarmcentrale die bereikbaar is onder het tele...  
2  bankbiljet dat de waarde van 200 euro vertegen...  
3       monitor die driedimensionaal beeld weergeeft  
4  motorvoert




In [5]:
# ==============================================================================
# 4. CALCULATE AND DISPLAY METRICS
# ==============================================================================
print("Calculating evaluation metrics...")

# Pull the three parallel columns out of the results frame as plain lists.
predictions, references, sources = (
    results_df[column].tolist() for column in ("prediction", "reference", "source")
)

# --- ROUGE ---
rouge = evaluate.load('rouge')
rouge_results = rouge.compute(references=references, predictions=predictions)
print("\n--- ROUGE Scores ---")
print(rouge_results)

# --- BLEU ---
bleu = evaluate.load('bleu')
# BLEU takes a list of reference lists, so each single reference is wrapped.
wrapped_references = [[ref] for ref in references]
bleu_results = bleu.compute(predictions=predictions, references=wrapped_references)
print("\n--- BLEU Score ---")
print(bleu_results)

# --- BERTScore ---
bertscore = evaluate.load("bertscore")
bertscore_results = bertscore.compute(predictions=predictions, references=references, lang="nl")


def _mean(values):
    """Arithmetic mean of a non-empty sequence of numbers."""
    return sum(values) / len(values)


# Per-sample scores are kept in bertscore_results; only the averages are printed.
avg_precision = _mean(bertscore_results['precision'])
avg_recall = _mean(bertscore_results['recall'])
avg_f1 = _mean(bertscore_results['f1'])

print("\n--- BERTScore ---")
for label, value in (
    ("Average Precision", avg_precision),
    ("Average Recall", avg_recall),
    ("Average F1", avg_f1),
):
    print(f"{label:>20}: {value:.4f}")

# --- COMET ---
print("\n--- COMET Scores ---")
print("Loading COMET models (this may take a while)...")
comet_22 = evaluate.load('comet', 'Unbabel/wmt22-comet-da')
comet_kiwi = evaluate.load('comet', 'Unbabel/wmt22-cometkiwi-da')
xcomet = evaluate.load('comet', 'Unbabel/XCOMET-XL')

# All three COMET variants are scored on the same inputs.
comet_inputs = {
    "predictions": predictions,
    "references": references,
    "sources": sources,
}
comet_22_results = comet_22.compute(**comet_inputs)
comet_kiwi_results = comet_kiwi.compute(**comet_inputs)
xcomet_results = xcomet.compute(**comet_inputs)

for header, comet_output in (
    ("\n--- COMET-22 Score ---", comet_22_results),
    ("\n--- COMETkiwi Score ---", comet_kiwi_results),
    ("\n--- XCOMET Score ---", xcomet_results),
):
    print(header)
    print(f"{'score':>20}: {comet_output['mean_score']:.4f}")


# ==============================================================================
# 5. SAVE RESULTS TO DISK
# ==============================================================================
print("\nSaving results to disk...")

# --- Save the summary scores to a text file ---
summary_path = "./evaluation_summary_aya-101.txt"

# Assemble the report first, then write it in one go; the text mirrors what the
# metrics section prints.
summary_lines = [
    "--- ROUGE Scores ---\n",
    str(rouge_results) + "\n",
    "\n--- BLEU Score ---\n",
    str(bleu_results) + "\n",
    "\n--- BERTScore ---\n",
    f"{'Average Precision':>20}: {avg_precision:.4f}\n",
    f"{'Average Recall':>20}: {avg_recall:.4f}\n",
    f"{'Average F1':>20}: {avg_f1:.4f}\n",
    "\n--- COMET-22 Score ---\n",
    f"{'score':>20}: {comet_22_results['mean_score']:.4f}\n",
    "\n--- COMETkiwi Score ---\n",
    f"{'score':>20}: {comet_kiwi_results['mean_score']:.4f}\n",
    "\n--- XCOMET Score ---\n",
    f"{'score':>20}: {xcomet_results['mean_score']:.4f}\n",
]

with open(summary_path, "w") as f:
    f.writelines(summary_lines)

print(f"Summary of scores saved to: {summary_path}")
print("\nEvaluation finished.")

Calculating evaluation metrics...

--- ROUGE Scores ---
{'rouge1': 0.40284882839281466, 'rouge2': 0.2373084287696915, 'rougeL': 0.3780132098671784, 'rougeLsum': 0.3784015137868909}

--- BLEU Score ---
{'bleu': 0.10628725926459942, 'precisions': [0.510846345939766, 0.28280793634807133, 0.18753085013479137, 0.14034321645342998], 'brevity_penalty': 0.4280255563172645, 'length_ratio': 0.5409579922201787, 'translation_length': 33237, 'reference_length': 61441}


  return forward_call(*args, **kwargs)



--- BERTScore ---
   Average Precision: 0.8075
      Average Recall: 0.7561
          Average F1: 0.7796

--- COMET Scores ---
Loading COMET models (this may take a while)...


Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

INFO:pytorch_lightning.utilities.migration.utils:Lightning automatically upgraded your loaded checkpoint from v1.8.3.post1 to v2.5.2. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint ../root/.cache/huggingface/hub/models--Unbabel--wmt22-comet-da/snapshots/2760a223ac957f30acfb18c8aa649b01cf1d75f2/checkpoints/model.ckpt`


Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

.gitattributes:   0%|          | 0.00/1.48k [00:00<?, ?B/s]

checkpoints/model.ckpt:   0%|          | 0.00/2.26G [00:00<?, ?B/s]

hparams.yaml:   0%|          | 0.00/716 [00:00<?, ?B/s]

INFO:pytorch_lightning.utilities.migration.utils:Lightning automatically upgraded your loaded checkpoint from v1.8.2 to v2.5.2. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint ../root/.cache/huggingface/hub/models--Unbabel--wmt22-cometkiwi-da/snapshots/1ad785194e391eebc6c53e2d0776cada8f83179a/checkpoints/model.ckpt`


sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/513 [00:00<?, ?B/s]

Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

.gitattributes:   0%|          | 0.00/1.52k [00:00<?, ?B/s]

LICENSE:   0%|          | 0.00/20.8k [00:00<?, ?B/s]

README.md:   0%|          | 0.00/4.55k [00:00<?, ?B/s]

checkpoints/model.ckpt:   0%|          | 0.00/13.9G [00:00<?, ?B/s]

hparams.yaml:   0%|          | 0.00/988 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/405 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/238 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/659 [00:00<?, ?B/s]

INFO:pytorch_lightning.utilities.rank_zero:💡 Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.utilities.rank_zero:You are using a CUDA device ('NVIDIA A100 80GB PCIe') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
huggingface/tokeniz


--- COMET-22 Score ---
               score: 0.6536

--- COMETkiwi Score ---
               score: 0.6165

--- XCOMET Score ---
               score: 0.6298

Saving results to disk...
Summary of scores saved to: ./evaluation_summary_aya-101.txt

Evaluation finished.


In [6]:
# ==============================================================================
# 6. CREATE AND SAVE DETAILED PER-ENTRY RESULTS TSV
# ==============================================================================
print("--- Creating a detailed results file with per-entry scores ---")

try:
    # Convert the original test set to a pandas DataFrame
    final_results_df = dataset.to_pandas()

    # Per-entry columns to append, in output order. Predictions are converted to
    # a plain list so they are attached by position: assigning a Series would
    # align on index and silently fill NaN wherever the two frames disagree.
    per_entry_columns = {
        'model_prediction': results_df['prediction'].tolist(),
        # BERTScore (Precision, Recall, and F1)
        'bertscore_precision': bertscore_results['precision'],
        'bertscore_recall': bertscore_results['recall'],
        'bertscore_f1': bertscore_results['f1'],
        # COMET-22, CometKiwi and XCOMET scores
        'comet22_score': comet_22_results['scores'],
        'cometkiwi_score': comet_kiwi_results['scores'],
        'xcomet_score': xcomet_results['scores'],
    }

    # Every column must have exactly one value per test example; otherwise the
    # scores would be attached to the wrong rows (e.g. after a partial run).
    expected_rows = len(final_results_df)
    for column_name, column_values in per_entry_columns.items():
        if len(column_values) != expected_rows:
            raise ValueError(
                f"Column '{column_name}' has {len(column_values)} values "
                f"but the test set has {expected_rows} rows."
            )
        final_results_df[column_name] = column_values

    # --- Save to TSV File ---
    tsv_path = "./evaluation_results_per_entry_aya-101.tsv"
    final_results_df.to_csv(tsv_path, sep='\t', index=False)

    print(f"\nSuccessfully created and saved the detailed results file.")
    print(f"File saved to: {tsv_path}")

    print("\n--- Data Preview ---")
    print("Note: BLEU and ROUGE are corpus-level metrics and are not included in this per-entry file.")
    print(final_results_df.head())

except Exception as e:
    # Keep the short message, but re-raise so the failure (and its traceback)
    # is visible and a "Run All" does not appear to succeed without the file.
    print(f"\nAn error occurred while creating the TSV file: {e}")
    raise

--- Creating a detailed results file with per-entry scores ---

Successfully created and saved the detailed results file.
File saved to: ./evaluation_results_per_entry_aya-101.tsv

--- Data Preview ---
Note: BLEU and ROUGE are corpus-level metrics and are not included in this per-entry file.
                   Lemma          POS MeaningNumber  LemmaID  MeaningID  \
0  1,5 metermaatschappij  substantief           1.0   909355     909359   
1           112-centrale  substantief           1.0      313        314   
2         200 eurobiljet  substantief           1.0   495566     495570   
3             3D-monitor  substantief           1.0   232019     232022   
4      45 kilometerwagen  substantief           1.0   871087     871098   

                                      DefinitionFull  \
0  maatschappij waarin mensen die niet tot hetzel...   
1  alarmcentrale die bereikbaar is onder het tele...   
2  bankbiljet dat de waarde van 200 euro vertegen...   
3       monitor die driedimensio