In [1]:
# ==============================================================================
# 1. SETUP AND INSTALLATIONS
# ==============================================================================
%pip install --upgrade \
    "peft>=0.11.1" \
    "bitsandbytes>=0.43.1" \
    "transformers>=4.42.3" \
    "accelerate>=0.31.0" \
    "evaluate" \
    "rouge_score" \
    "bert_score" \
    "sacrebleu" \
    "unbabel-comet"

# After this cell finishes, you MUST restart the kernel for the updates to take effect.

Collecting peft>=0.11.1
  Downloading peft-0.16.0-py3-none-any.whl.metadata (14 kB)
Collecting transformers>=4.42.3
  Downloading transformers-4.54.1-py3-none-any.whl.metadata (41 kB)
Collecting accelerate>=0.31.0
  Downloading accelerate-1.9.0-py3-none-any.whl.metadata (19 kB)
Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting bert_score
  Downloading bert_score-0.3.13-py3-none-any.whl.metadata (15 kB)
Collecting sacrebleu
  Downloading sacrebleu-2.5.1-py3-none-any.whl.metadata (51 kB)
Collecting unbabel-comet
  Downloading unbabel_comet-2.2.6-py3-none-any.whl.metadata (19 kB)
Collecting tokenizers<0.22,>=0.21 (from transformers>=4.42.3)
  Downloading tokenizers-0.21.4-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Collecting absl-py (from rouge_score)
  Downloading absl_py-2.3.1-py3-none-any.whl.metadata (3.3 kB)
Collecting nltk (from rouge_score)
  Downloading nltk-3.9.1-py

In [1]:
# ==============================================================================
# 2. CONFIGURATION AND MODEL LOADING
# ==============================================================================
import os
import torch
import evaluate
import pandas as pd
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import PeftModel
from tqdm import tqdm

# --- Configuration ---
class EvalConfig:
    # Path to the base model
    MODEL_ID = "bramvanroy/geitje-7b-ultra"
    
    # Path to your trained LoRA adapter.
    # VERIFY this path matches the output of your training script.
    ADAPTER_PATH = "./results_geitje_7b_ultra/final_checkpoint"
    
    # Dataset configuration
    DATASET_NAME = "RobbedoesHF/dutch-definitions"
    DATASET_TEST_SPLIT = "test"
    
    # Generation parameters
    MAX_SEQ_LENGTH = 512
    BATCH_SIZE = 32  # Adjust based on your GPU memory

config = EvalConfig()

# --- Model and Tokenizer Loading ---
print("Loading base model and tokenizer...")

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

model = AutoModelForCausalLM.from_pretrained(
    config.MODEL_ID,
    quantization_config=bnb_config,
    device_map="auto",
    attn_implementation="flash_attention_2",
)
tokenizer = AutoTokenizer.from_pretrained(config.MODEL_ID)

if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

print(f"Loading LoRA adapter from: {config.ADAPTER_PATH}")
model = PeftModel.from_pretrained(model, config.ADAPTER_PATH)

model.eval()
print("Model loaded successfully.")

Loading base model and tokenizer...


config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 3 files:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.94G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.54G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/551 [00:00<?, ?B/s]

Loading LoRA adapter from: ./results_geitje_7b_ultra/final_checkpoint
Model loaded successfully.


In [3]:
# ==============================================================================
# 3. GENERATE PREDICTIONS FOR THE TEST SET
# ==============================================================================
print("Loading test dataset and preparing for generation...")
dataset = load_dataset(config.DATASET_NAME, split=config.DATASET_TEST_SPLIT)

# We will store the results here
results = []

# Helper function to create the correct prompt format for inference
def create_inference_prompt(lemma, short_def):
    chat = [
        {
            "role": "system",
            "content": "Je bent een expert-lexicograaf die definities schrijft voor een Nederlands woordenboek."
        },
        {
            "role": "user",
            "content": (
                f"Breid de volgende korte definitie voor het woord "
                f"'{lemma}' uit tot een volledige definitie: "
                f"'{short_def}'"
            )
        },
    ]
    return tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)


print(f"Generating predictions for {len(dataset)} samples...")
# Process the dataset in batches for efficiency
for i in tqdm(range(0, len(dataset), config.BATCH_SIZE)):
    batch = dataset[i:i+config.BATCH_SIZE]
    
    # Create the input prompts by correctly iterating through the batch columns
    prompts = [
        create_inference_prompt(lemma, short_def)
        for lemma, short_def in zip(batch['Lemma'], batch['DefinitionShort'])
    ]
    
    # The 'source' for COMET is the user instruction part
    sources = [p.split("<|user|>")[1].split("<|assistant|>")[0].strip() for p in prompts]
    
    # Tokenize the batch of prompts
    inputs = tokenizer(prompts, return_tensors="pt", padding=True, truncation=True, max_length=config.MAX_SEQ_LENGTH).to("cuda")
    
    # Generate outputs
    with torch.no_grad():
        outputs = model.generate(
            **inputs, 
            max_new_tokens=config.MAX_SEQ_LENGTH, 
            num_beams=4, 
            early_stopping=True,
            pad_token_id=tokenizer.eos_token_id
        )
    
    # Decode the generated tokens
    decoded_outputs = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    
    # For causal models, the output includes the prompt, so we must remove it.
    predictions = [decoded.split("<|assistant|>")[1].strip() for decoded in decoded_outputs]
    
    # Store the results
    for j in range(len(predictions)):
        results.append({
            "source": sources[j],
            "prediction": predictions[j],
            "reference": batch['DefinitionFull'][j]
        })

# Convert results to a pandas DataFrame for easier handling
results_df = pd.DataFrame(results)
print("\nGeneration complete. Example result:")
print(results_df.head())

Loading test dataset and preparing for generation...
Generating predictions for 3450 samples...


100%|██████████| 108/108 [37:19<00:00, 20.74s/it] 


Generation complete. Example result:
                                              source  \
0  Breid de volgende korte definitie voor het woo...   
1  Breid de volgende korte definitie voor het woo...   
2  Breid de volgende korte definitie voor het woo...   
3  Breid de volgende korte definitie voor het woo...   
4  Breid de volgende korte definitie voor het woo...   

                                          prediction  \
0  maatschappij waarin mensen die niet tot hetzel...   
1  alarmcentrale die bereikbaar is onder het tele...   
2              bankbiljet met de waarde van 200 euro   
3  monitor waarop driedimensionale beelden worden...   
4  auto met de officiële status van een bromfiets...   

                                           reference  
0  maatschappij waarin mensen die niet tot hetzel...  
1  alarmcentrale die bereikbaar is onder het tele...  
2  bankbiljet dat de waarde van 200 euro vertegen...  
3       monitor die driedimensionaal beeld weergeeft  
4  motorvoert




In [4]:
# ==============================================================================
# 4. CALCULATE AND DISPLAY METRICS
# ==============================================================================
print("Calculating evaluation metrics...")

# Extract lists of predictions, references, and sources from the DataFrame
predictions = results_df["prediction"].tolist()
references = results_df["reference"].tolist()
sources = results_df["source"].tolist()

# --- ROUGE ---
rouge = evaluate.load('rouge')
rouge_results = rouge.compute(predictions=predictions, references=references)
print("\n--- ROUGE Scores ---")
print(rouge_results)

# --- BLEU ---
bleu = evaluate.load('bleu')
# Note: BLEU expects references to be a list of lists
bleu_results = bleu.compute(predictions=predictions, references=[[ref] for ref in references])
print("\n--- BLEU Score ---")
print(bleu_results)

# --- BERTScore ---
bertscore = evaluate.load("bertscore")
bertscore_results = bertscore.compute(predictions=predictions, references=references, lang="nl")

avg_precision = sum(bertscore_results['precision']) / len(bertscore_results['precision'])
avg_recall = sum(bertscore_results['recall']) / len(bertscore_results['recall'])
avg_f1 = sum(bertscore_results['f1']) / len(bertscore_results['f1'])

print(f"\n--- BERTScore ---")
print(f"{'Average Precision':>20}: {avg_precision:.4f}")
print(f"{'Average Recall':>20}: {avg_recall:.4f}")
print(f"{'Average F1':>20}: {avg_f1:.4f}")

# --- COMET ---
print("\n--- COMET Scores ---")
print("Loading COMET models (this may take a while)...")
comet_22 = evaluate.load('comet', 'Unbabel/wmt22-comet-da')
comet_kiwi = evaluate.load('comet', 'Unbabel/wmt22-cometkiwi-da')
xcomet = evaluate.load('comet', 'Unbabel/XCOMET-XL')

comet_22_results = comet_22.compute(predictions=predictions, references=references, sources=sources)
comet_kiwi_results = comet_kiwi.compute(predictions=predictions, references=references, sources=sources)
xcomet_results = xcomet.compute(predictions=predictions, references=references, sources=sources)

print(f"\n--- COMET-22 Score ---")
print(f"{'score':>20}: {comet_22_results['mean_score']:.4f}")
print(f"\n--- COMETkiwi Score ---")
print(f"{'score':>20}: {comet_kiwi_results['mean_score']:.4f}")
print(f"\n--- XCOMET Score ---")
print(f"{'score':>20}: {xcomet_results['mean_score']:.4f}")


# ==============================================================================
# 5. SAVE RESULTS TO DISK
# ==============================================================================
print("\nSaving results to disk...")

# --- Save the summary scores to a text file ---
summary_path = "./evaluation_summary_geitje.txt"
with open(summary_path, "w") as f:
    f.write("--- ROUGE Scores ---\n")
    f.write(str(rouge_results) + "\n")
    f.write("\n--- BLEU Score ---\n")
    f.write(str(bleu_results) + "\n")
    f.write("\n--- BERTScore ---\n")
    f.write(f"{'Average Precision':>20}: {avg_precision:.4f}\n")
    f.write(f"{'Average Recall':>20}: {avg_recall:.4f}\n")
    f.write(f"{'Average F1':>20}: {avg_f1:.4f}\n")
    f.write("\n--- COMET-22 Score ---\n")
    f.write(f"{'score':>20}: {comet_22_results['mean_score']:.4f}\n")
    f.write("\n--- COMETkiwi Score ---\n")
    f.write(f"{'score':>20}: {comet_kiwi_results['mean_score']:.4f}\n")
    f.write("\n--- XCOMET Score ---\n")
    f.write(f"{'score':>20}: {xcomet_results['mean_score']:.4f}\n")

print(f"Summary of scores saved to: {summary_path}")
print("\nEvaluation finished.")

Calculating evaluation metrics...


Downloading builder script: 0.00B [00:00, ?B/s]


--- ROUGE Scores ---
{'rouge1': 0.4123357238171513, 'rouge2': 0.25161913131468583, 'rougeL': 0.38714679587966194, 'rougeLsum': 0.3870685910643292}


Downloading builder script: 0.00B [00:00, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading extra modules: 0.00B [00:00, ?B/s]


--- BLEU Score ---
{'bleu': 0.14475841666935474, 'precisions': [0.4588880566361418, 0.2604317684582334, 0.18081651588958478, 0.14115071315882674], 'brevity_penalty': 0.6159753773328359, 'length_ratio': 0.6736055728259631, 'translation_length': 41387, 'reference_length': 61441}


Downloading builder script: 0.00B [00:00, ?B/s]

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

  return forward_call(*args, **kwargs)



--- BERTScore ---
   Average Precision: 0.8003
      Average Recall: 0.7623
          Average F1: 0.7795

--- COMET Scores ---
Loading COMET models (this may take a while)...


Downloading builder script: 0.00B [00:00, ?B/s]

Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

checkpoints/model.ckpt:   0%|          | 0.00/2.32G [00:00<?, ?B/s]

hparams.yaml:   0%|          | 0.00/567 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

LICENSE: 0.00B [00:00, ?B/s]

.gitattributes: 0.00B [00:00, ?B/s]

INFO:pytorch_lightning.utilities.migration.utils:Lightning automatically upgraded your loaded checkpoint from v1.8.3.post1 to v2.5.2. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint ../root/.cache/huggingface/hub/models--Unbabel--wmt22-comet-da/snapshots/2760a223ac957f30acfb18c8aa649b01cf1d75f2/checkpoints/model.ckpt`


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/616 [00:00<?, ?B/s]

/usr/local/lib/python3.10/dist-packages/pytorch_lightning/core/saving.py:195: Found keys that are not in the model state dict but in the checkpoint: ['encoder.model.embeddings.position_ids']


Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

LICENSE:   0%|          | 0.00/20.8k [00:00<?, ?B/s]

README.md:   0%|          | 0.00/4.10k [00:00<?, ?B/s]

.gitattributes:   0%|          | 0.00/1.48k [00:00<?, ?B/s]

checkpoints/model.ckpt:   0%|          | 0.00/2.26G [00:00<?, ?B/s]

hparams.yaml:   0%|          | 0.00/716 [00:00<?, ?B/s]

INFO:pytorch_lightning.utilities.migration.utils:Lightning automatically upgraded your loaded checkpoint from v1.8.2 to v2.5.2. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint ../root/.cache/huggingface/hub/models--Unbabel--wmt22-cometkiwi-da/snapshots/1ad785194e391eebc6c53e2d0776cada8f83179a/checkpoints/model.ckpt`


sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/513 [00:00<?, ?B/s]

Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

checkpoints/model.ckpt:   0%|          | 0.00/13.9G [00:00<?, ?B/s]

.gitattributes:   0%|          | 0.00/1.52k [00:00<?, ?B/s]

README.md:   0%|          | 0.00/4.55k [00:00<?, ?B/s]

LICENSE:   0%|          | 0.00/20.8k [00:00<?, ?B/s]

hparams.yaml:   0%|          | 0.00/988 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/405 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/238 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/659 [00:00<?, ?B/s]

INFO:pytorch_lightning.utilities.rank_zero:💡 Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.utilities.rank_zero:You are using a CUDA device ('NVIDIA A100 80GB PCIe') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
huggingface/tokeniz


--- COMET-22 Score ---
               score: 0.6661

--- COMETkiwi Score ---
               score: 0.6462

--- XCOMET Score ---
               score: 0.7354

Saving results to disk...
Summary of scores saved to: ./evaluation_summary_geitje.txt

Evaluation finished.


In [5]:
# ==============================================================================
# 6. CREATE AND SAVE DETAILED PER-ENTRY RESULTS TSV
# ==============================================================================
print("--- Creating a detailed results file with per-entry scores ---")

try:
    # Convert the original test set to a pandas DataFrame
    final_results_df = dataset.to_pandas()

    # Add the model's predictions from the generation step
    final_results_df['model_prediction'] = results_df['prediction']

    # --- Add Per-Entry Metric Scores ---
    # Add BERTScore (Precision, Recall, and F1)
    final_results_df['bertscore_precision'] = bertscore_results['precision']
    final_results_df['bertscore_recall'] = bertscore_results['recall']
    final_results_df['bertscore_f1'] = bertscore_results['f1']
    
    # Add COMET-22, CometKiwi and XCOMET scores
    final_results_df['comet22_score'] = comet_22_results['scores']
    final_results_df['cometkiwi_score'] = comet_kiwi_results['scores']
    final_results_df['xcomet_score'] = xcomet_results['scores']

    # --- Save to TSV File ---
    tsv_path = "./evaluation_results_per_entry_geitje.tsv"
    final_results_df.to_csv(tsv_path, sep='\t', index=False)

    print(f"\nSuccessfully created and saved the detailed results file.")
    print(f"File saved to: {tsv_path}")
    
    print("\n--- Data Preview ---")
    print("Note: BLEU and ROUGE are corpus-level metrics and are not included in this per-entry file.")
    print(final_results_df.head())

except Exception as e:
    print(f"\nAn error occurred while creating the TSV file: {e}")

--- Creating a detailed results file with per-entry scores ---

Successfully created and saved the detailed results file.
File saved to: ./evaluation_results_per_entry_geitje.tsv

--- Data Preview ---
Note: BLEU and ROUGE are corpus-level metrics and are not included in this per-entry file.
                   Lemma          POS MeaningNumber  LemmaID  MeaningID  \
0  1,5 metermaatschappij  substantief           1.0   909355     909359   
1           112-centrale  substantief           1.0      313        314   
2         200 eurobiljet  substantief           1.0   495566     495570   
3             3D-monitor  substantief           1.0   232019     232022   
4      45 kilometerwagen  substantief           1.0   871087     871098   

                                      DefinitionFull  \
0  maatschappij waarin mensen die niet tot hetzel...   
1  alarmcentrale die bereikbaar is onder het tele...   
2  bankbiljet dat de waarde van 200 euro vertegen...   
3       monitor die driedimension