In [1]:
import json
import os
import gc
import torch
import pandas as pd
from tqdm.auto import tqdm
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoModelForSeq2SeqLM, BitsAndBytesConfig, logging
import evaluate

# --- Basic Setup ---
# Suppress verbose logging for cleaner output
logging.set_verbosity_error()

# Define the models to be evaluated
MODEL_IDS = [
    "google/mt5-xl"
]

# Load the shared dataset (only the held-out test split is evaluated)
dataset = load_dataset("RobbedoesHF/dutch-definitions", split="test")

# Load the few-shot examples from the JSONL file.
# Each non-empty line is one JSON object; the code below reads its 'lemma' key and,
# for the first record, its 'few_shot_example_1' key.
try:
    with open('prompts_alpha_0_8.jsonl', 'r', encoding='utf-8') as f:
        # Skip blank lines (e.g. a trailing newline at the end of the file):
        # json.loads raises on an empty string.
        few_shot_examples = [json.loads(line) for line in f if line.strip()]
    # Per-lemma lookup used for few-shot prompts
    few_shot_dict = {example['lemma']: example for example in few_shot_examples}
    # The first example of the first record doubles as the global one-shot example.
    # Fall back to an empty dict instead of raising when the file is empty or the
    # first record has no 'few_shot_example_1' entry.
    one_shot_example = few_shot_examples[0].get('few_shot_example_1', {}) if few_shot_examples else {}
    if few_shot_examples:
        print("Successfully loaded few-shot examples.")
    else:
        print("Warning: 'prompts_alpha_0_8.jsonl' contains no examples. Few-shot and one-shot evaluations will not work.")
except FileNotFoundError:
    print("Warning: 'prompts_alpha_0_8.jsonl' not found. Few-shot and one-shot evaluations will not work.")
    few_shot_dict = {}
    one_shot_example = {}

README.md:   0%|          | 0.00/723 [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/2.80M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/381k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/379k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/27880 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3494 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/3450 [00:00<?, ? examples/s]

Successfully loaded few-shot examples.


In [4]:
# --- Prompt Creation Function ---
def _format_example(example):
    """Render one example dict as a 'Woord / Korte definitie / Lange definitie' block (no trailing newline)."""
    return (
        f"Woord: {example['lemma']}\n"
        f"Korte definitie: {example['short_definition']}\n"
        f"Lange definitie: {example['long_definition']}"
    )


def _build_examples_block(shot_type, lemma, one_shot_example, few_shot_dict):
    """Return the in-context examples text, terminated by a blank line, or '' when no example is available."""
    if shot_type == 'one_shot':
        if not one_shot_example:
            return ""
        return f"Voorbeeld:\n{_format_example(one_shot_example)}\n\n"

    if shot_type == 'few_shot':
        # few_shot_dict defaults to None in create_prompt, so guard before calling .get().
        lemma_examples = (few_shot_dict or {}).get(lemma) or {}
        rendered = [
            f"{_format_example(lemma_examples[key])}\n\n"
            for key in (f'few_shot_example_{i}' for i in range(1, 4))
            if lemma_examples.get(key)
        ]
        # Omit the "Voorbeelden:" header entirely when there is nothing to list under it.
        return ("Voorbeelden:\n" + "".join(rendered)) if rendered else ""

    return ""


def create_prompt(model_id, tokenizer, shot_type, lemma, gloss, one_shot_example=None, few_shot_dict=None):
    """Build the prompt string for one dataset entry.

    Args:
        model_id: Hugging Face model id; selects the prompt layout.
        tokenizer: Only used for "bramvanroy/geitje-7b-ultra", where its
            apply_chat_template() renders the chat. May be None for other models.
        shot_type: 'zero_shot', 'one_shot' or 'few_shot'.
        lemma: The word to define.
        gloss: The short definition to be expanded.
        one_shot_example: Dict with 'lemma', 'short_definition' and
            'long_definition' keys, used for 'one_shot'.
        few_shot_dict: Mapping lemma -> dict holding up to three examples under
            'few_shot_example_1' .. 'few_shot_example_3', used for 'few_shot'.

    Returns:
        The prompt string. When 'one_shot' / 'few_shot' is requested but no
        example is available (None, empty, or no entry for the lemma), the
        model's zero-shot prompt is returned instead of raising or emitting an
        empty examples header. An unknown model_id yields "". An unknown
        shot_type yields "" for aya-101 / mt5-xl and an empty user turn for the
        chat-formatted models.
    """
    system_prompt = "Je bent een expert-lexicograaf die definities schrijft voor een Nederlands woordenboek."
    base_instruction = f"Breid de volgende korte definitie voor het woord '{lemma}' uit tot een volledige definitie: '{gloss}'"

    known_shot_type = shot_type in ('zero_shot', 'one_shot', 'few_shot')
    examples_block = _build_examples_block(shot_type, lemma, one_shot_example, few_shot_dict)

    # --- Plain-text prompts for the encoder-decoder models (aya-101, mt5-xl) ---
    if model_id in ["CohereLabs/aya-101", "google/mt5-xl"]:
        if not known_shot_type:
            return ""
        if not examples_block:
            # Zero-shot (or fallback to it): mt5-xl receives the bare instruction,
            # aya-101 additionally receives the system prompt.
            if model_id == "google/mt5-xl":
                return base_instruction
            return f"{system_prompt}\n\n{base_instruction}"
        return f"{system_prompt}\n\n{examples_block}{base_instruction}"

    # --- Prompts for aya-expanse-8b (hand-written chat tokens) ---
    elif model_id == "CohereLabs/aya-expanse-8b":
        # The whole prompt, system text included, goes into a single user turn.
        user_content = f"{system_prompt}\n\n{examples_block}{base_instruction}" if known_shot_type else ""
        return f"<|START_OF_TURN_TOKEN|><|USER_TOKEN|>{user_content}<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>"

    # --- Prompts for Geitje (system message + single user turn via the tokenizer's chat template) ---
    elif model_id == "bramvanroy/geitje-7b-ultra":
        user_content = f"{examples_block}{base_instruction}" if known_shot_type else ""
        chat = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_content}
        ]
        # add_generation_prompt=True appends the marker after which the assistant starts generating.
        return tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)

    return ""

In [3]:
# Load the actual dataset
# NOTE(review): the setup cell already loads this split into `dataset`; the reload here
# only matters when this cell is run on its own.
try:
    dataset = load_dataset("RobbedoesHF/dutch-definitions", split="test")
    # Take the first entry from the dataset as our real example
    real_entry = dataset[0]
    sample_lemma = real_entry['Lemma']
    sample_gloss = real_entry['DefinitionShort']
    print(f"Using real data for lemma: '{sample_lemma}'")
except Exception as e:
    # Best effort: the preview still works offline with a hand-written sample.
    print(f"Could not load dataset. Using fallback sample data. Error: {e}")
    sample_lemma = "evaluatie"
    sample_gloss = "het proces van beoordelen"

# Load the few-shot examples and find the ones corresponding to our real entry
try:
    with open('prompts_alpha_0_8.jsonl', 'r', encoding='utf-8') as f:
        # Skip blank lines: json.loads raises on an empty string.
        all_few_shot_examples = [json.loads(line) for line in f if line.strip()]

    few_shot_dict_full = {example['lemma']: example for example in all_few_shot_examples}

    # Get the specific few-shot examples for our chosen lemma
    sample_few_shot_dict = {sample_lemma: few_shot_dict_full.get(sample_lemma, {})}

    # Use the first of these examples for the one-shot prompt
    sample_one_shot_example = sample_few_shot_dict[sample_lemma].get('few_shot_example_1', {})

    if not sample_one_shot_example:
        print("Warning: Could not find specific few-shot examples for the first dataset entry. Prompts will fall back to zero-shot.")

except Exception as e:
    print(f"Could not load few-shot file. Using empty examples. Error: {e}")
    sample_one_shot_example = {}
    sample_few_shot_dict = {}

# Which shot types actually have examples for the sample lemma. Shot types without
# examples are previewed with the zero-shot prompt, as the warning above announces,
# instead of passing an empty example dict to create_prompt.
sample_lemma_examples = sample_few_shot_dict.get(sample_lemma, {})
examples_available = {
    "zero_shot": True,
    "one_shot": bool(sample_one_shot_example),
    "few_shot": any(sample_lemma_examples.get(f'few_shot_example_{i}') for i in range(1, 4)),
}

# --- Generate and Print Examples ---
for model_id in MODEL_IDS:
    print(f"\n{'='*80}")
    print(f"PROMPT EXAMPLES FOR MODEL: {model_id}")
    print(f"{'='*80}\n")

    # We need to load the tokenizer for each model, as it's required for Geitje's prompt creation
    try:
        tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token
    except Exception as e:
        print(f"Could not load tokenizer for {model_id}. Skipping. Error: {e}")
        continue

    for shot_type in ["zero_shot", "one_shot", "few_shot"]:
        effective_shot_type = shot_type if examples_available[shot_type] else "zero_shot"

        prompt = create_prompt(
            model_id=model_id,
            tokenizer=tokenizer,
            shot_type=effective_shot_type,
            lemma=sample_lemma,
            gloss=sample_gloss,
            one_shot_example=sample_one_shot_example,
            few_shot_dict=sample_few_shot_dict
        )

        print(f"--- {shot_type.upper()} PROMPT ---")
        if effective_shot_type != shot_type:
            print("(No examples available for this lemma; showing the zero-shot prompt instead.)")
        print(prompt)
        print(f"--- END {shot_type.upper()} PROMPT ---\n")


Using real data for lemma: '1,5 metermaatschappij'

PROMPT EXAMPLES FOR MODEL: bramvanroy/geitje-7b-ultra

--- ZERO_SHOT PROMPT ---
<|system|>
Je bent een expert-lexicograaf die definities schrijft voor een Nederlands woordenboek.</s>
<|user|>
Breid de volgende korte definitie voor het woord '1,5 metermaatschappij' uit tot een volledige definitie: 'maatschappij waarin fysieke afstand nodig is'</s>
<|assistant|>

--- END ZERO_SHOT PROMPT ---

--- ONE_SHOT PROMPT ---
<|system|>
Je bent een expert-lexicograaf die definities schrijft voor een Nederlands woordenboek.</s>
<|user|>
Voorbeeld:
Woord: anderhalvemetermaatschappij
Korte definitie: maatschappij waarin fysieke afstand nodig is
Lange definitie: maatschappij waarin mensen die niet tot hetzelfde huishouden behoren in de publieke ruimte een fysieke afstand van minimaal anderhalve meter tot elkaar moeten bewaren om verspreiding van een virus te voorkomen

Breid de volgende korte definitie voor het woord '1,5 metermaatschappij' uit tot e



--- ZERO_SHOT PROMPT ---
Je bent een expert-lexicograaf die definities schrijft voor een Nederlands woordenboek.

Breid de volgende korte definitie voor het woord '1,5 metermaatschappij' uit tot een volledige definitie: 'maatschappij waarin fysieke afstand nodig is'
--- END ZERO_SHOT PROMPT ---

--- ONE_SHOT PROMPT ---
Je bent een expert-lexicograaf die definities schrijft voor een Nederlands woordenboek.

Voorbeeld:
Woord: anderhalvemetermaatschappij
Korte definitie: maatschappij waarin fysieke afstand nodig is
Lange definitie: maatschappij waarin mensen die niet tot hetzelfde huishouden behoren in de publieke ruimte een fysieke afstand van minimaal anderhalve meter tot elkaar moeten bewaren om verspreiding van een virus te voorkomen

Breid de volgende korte definitie voor het woord '1,5 metermaatschappij' uit tot een volledige definitie: 'maatschappij waarin fysieke afstand nodig is'
--- END ONE_SHOT PROMPT ---

--- FEW_SHOT PROMPT ---
Je bent een expert-lexicograaf die definities sc

In [5]:
# --- Main Evaluation Loop ---
# For every model in MODEL_IDS: load it 4-bit quantized, generate a definition for each
# test entry, compute ROUGE / BLEU / BERTScore / COMET, and write a summary text file
# plus a per-entry TSV to the working directory.

# Encoder-decoder checkpoints: generate() returns only the generated tokens.
SEQ2SEQ_MODEL_IDS = ("google/mt5-xl", "CohereLabs/aya-101")
# Decoder-only checkpoints: generate() returns the prompt tokens followed by the continuation.
CAUSAL_MODEL_IDS = ("bramvanroy/geitje-7b-ultra", "CohereLabs/aya-expanse-8b")

# Per-model (MAX_SOURCE_LENGTH, MAX_TARGET_LENGTH, BATCH_SIZE).
GENERATION_SETTINGS = {
    "bramvanroy/geitje-7b-ultra": (512, 512, 32),
    "CohereLabs/aya-expanse-8b": (512, 512, 32),
    "CohereLabs/aya-101": (384, 384, 16),
    "google/mt5-xl": (512, 512, 32),
}

for model_id in MODEL_IDS:
    print(f"\n{'='*20}\nEVALUATING MODEL: {model_id}\n{'='*20}")

    # --- 1. Set Model-Specific Configuration ---
    # Refuse unknown ids explicitly; otherwise the settings and model of a previous
    # iteration would be silently reused (or a NameError raised on the first iteration).
    if model_id not in GENERATION_SETTINGS:
        print(f"Skipping {model_id}: no evaluation configuration is defined for this model.")
        continue
    MAX_SOURCE_LENGTH, MAX_TARGET_LENGTH, BATCH_SIZE = GENERATION_SETTINGS[model_id]
    is_causal_lm = model_id in CAUSAL_MODEL_IDS

    # NOTE(review): the output recorded in this notebook shows google/mt5-xl answering
    # zero-shot prompts with <extra_id_N> sentinel tokens. Presumably the checkpoint is
    # pre-trained only and needs fine-tuning before its scores are meaningful - confirm.

    # --- 2. Load Model and Tokenizer ---
    print(f"Loading {model_id}...")

    # 4-bit NF4 quantization for every model; double quantization only for aya-101.
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16,
        bnb_4bit_use_double_quant=(model_id == "CohereLabs/aya-101"),
    )

    # Load the model based on its type
    if model_id in SEQ2SEQ_MODEL_IDS:
        model = AutoModelForSeq2SeqLM.from_pretrained(
            model_id,
            quantization_config=bnb_config,
            device_map="auto",
        )
    elif model_id == "CohereLabs/aya-expanse-8b":
        model = AutoModelForCausalLM.from_pretrained(
            model_id,
            quantization_config=bnb_config,
            device_map="auto",
            attn_implementation="sdpa",
            trust_remote_code=True,
        )
    elif model_id == "bramvanroy/geitje-7b-ultra":
        model = AutoModelForCausalLM.from_pretrained(
            model_id,
            quantization_config=bnb_config,
            device_map="auto",
            trust_remote_code=True,
            attn_implementation="flash_attention_2",
        )

    tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    if is_causal_lm:
        # Batched generation with decoder-only models needs left padding so that every
        # prompt ends at the same position; the prompt is sliced off by length below.
        tokenizer.padding_side = "left"

    # Set the model to evaluation mode
    model.eval()

    print("Model and tokenizer loaded successfully.")

    # --- 3. Run Evaluation for each Shot Type ---
    for shot_type in ["zero_shot"]:
        if shot_type != "zero_shot" and not few_shot_dict:
            print(f"Skipping {shot_type} because few-shot examples were not found.")
            continue

        # --- 3a. Generation Loop ---
        results = []
        print(f"\nGenerating predictions for {len(dataset)} samples using {model_id} ({shot_type})...")

        for i in tqdm(range(0, len(dataset), BATCH_SIZE)):
            # Slicing the dataset yields a dict of column name -> list of values.
            batch = dataset[i:i+BATCH_SIZE]
            prompts = [
                create_prompt(model_id, tokenizer, shot_type, lemma, gloss, one_shot_example, few_shot_dict)
                for lemma, gloss in zip(batch['Lemma'], batch['DefinitionShort'])
            ]

            # Define sources for COMET metric based on the model.
            # NOTE(review): the seq2seq models use the full prompt as COMET source while
            # the chat models use only the bare instruction - confirm this is intended.
            if model_id in SEQ2SEQ_MODEL_IDS:
                sources = prompts
            else:
                sources = [
                    f"Breid de volgende korte definitie voor het woord '{lemma}' uit tot een volledige definitie: '{short_def}'"
                    for lemma, short_def in zip(batch['Lemma'], batch['DefinitionShort'])
                ]

            # Send the inputs to wherever device_map="auto" placed the model instead of
            # assuming "cuda".
            inputs = tokenizer(
                prompts,
                return_tensors="pt",
                padding=True,
                truncation=True,
                max_length=MAX_SOURCE_LENGTH,
            ).to(model.device)

            # A single beam-search generation per batch for every model type.
            with torch.no_grad():
                outputs = model.generate(
                    **inputs,
                    max_new_tokens=MAX_TARGET_LENGTH,
                    num_beams=4,
                    early_stopping=True,
                    pad_token_id=tokenizer.eos_token_id
                )

            # Decode predictions based on model type
            if is_causal_lm:
                # Everything up to the (left-padded) prompt length is the prompt itself,
                # so slice it away before decoding. Slicing token ids works even though
                # skip_special_tokens=True removes the chat markers from the decoded text,
                # which a string split on those markers cannot survive.
                prompt_len = inputs["input_ids"].shape[1]
                generated_only = outputs[:, prompt_len:]
                predictions = [
                    text.strip()
                    for text in tokenizer.batch_decode(generated_only, skip_special_tokens=True)
                ]
            else:
                # For Seq2Seq models (like google/mt5-xl), the output is the prediction.
                predictions = tokenizer.batch_decode(outputs, skip_special_tokens=True)

            # *** PRINT FIRST BATCH RESULTS ***
            if i == 0:
                print("\n--- First Batch Generation Results ---")
                for k in range(len(predictions)):
                    print(f"\n--- Sample {k+1} ---")
                    print(f"SOURCE: {batch['DefinitionShort'][k]}")
                    print(f"PREDICTION: {predictions[k]}")
                    print(f"REFERENCE: {batch['DefinitionFull'][k]}")
                print("\n--- End of First Batch ---")

            for j in range(len(predictions)):
                results.append({
                    "source": sources[j],
                    "prediction": predictions[j],
                    "reference": batch['DefinitionFull'][j]
                })

        results_df = pd.DataFrame(results)
        print("Generation complete.")


        # --- 3b. Metric Calculation and Saving ---
        print(f"Calculating and saving metrics for {shot_type}...")
        # ==============================================================================
        # 4. CALCULATE AND DISPLAY METRICS
        # ==============================================================================

        model_name_map = {
            "CohereLabs/aya-101": "aya-101",
            "google/mt5-xl": "mt5-xl",
            "bramvanroy/geitje-7b-ultra": "Geitje",
            "CohereLabs/aya-expanse-8b": "aya-23"
        }
        short_model_name = model_name_map.get(model_id, "unknown_model")

        # Extract lists of predictions, references, and sources from the DataFrame
        predictions = results_df["prediction"].tolist()
        references = results_df["reference"].tolist()
        sources = results_df["source"].tolist()

        # Checkpoint: write the raw predictions before the metric computations start,
        # so they are on disk even if a metric fails. .copy() makes the slice an
        # independent frame, so adding columns does not trigger SettingWithCopyWarning.
        final_results_df = dataset.to_pandas().iloc[:len(results_df)].copy()

        # Add the model's predictions from the generation step
        final_results_df['model_prediction'] = results_df['prediction'].values
        # --- Save to TSV File ---
        tsv_path = f"./evaluation_results_per_entry_{short_model_name}_{shot_type}.tsv"
        final_results_df.to_csv(tsv_path, sep='\t', index=False)

        # --- ROUGE ---
        rouge = evaluate.load('rouge')
        rouge_results = rouge.compute(predictions=predictions, references=references)
        print("\n--- ROUGE Scores ---")
        print(rouge_results)

        # --- BLEU ---
        bleu = evaluate.load('bleu')
        # Note: BLEU expects references to be a list of lists
        bleu_results = bleu.compute(predictions=predictions, references=[[ref] for ref in references])
        print("\n--- BLEU Score ---")
        print(bleu_results)

        # --- BERTScore ---
        bertscore = evaluate.load("bertscore")
        bertscore_results = bertscore.compute(predictions=predictions, references=references, lang="nl")

        avg_precision = sum(bertscore_results['precision']) / len(bertscore_results['precision'])
        avg_recall = sum(bertscore_results['recall']) / len(bertscore_results['recall'])
        avg_f1 = sum(bertscore_results['f1']) / len(bertscore_results['f1'])

        print(f"\n--- BERTScore ---")
        print(f"{'Average Precision':>20}: {avg_precision:.4f}")
        print(f"{'Average Recall':>20}: {avg_recall:.4f}")
        print(f"{'Average F1':>20}: {avg_f1:.4f}")

        # --- COMET ---
        print("\n--- COMET Scores ---")
        print("Loading COMET models (this may take a while)...")
        comet_22 = evaluate.load('comet', 'Unbabel/wmt22-comet-da')
        comet_kiwi = evaluate.load('comet', 'Unbabel/wmt22-cometkiwi-da')
        xcomet = evaluate.load('comet', 'Unbabel/XCOMET-XL')

        comet_22_results = comet_22.compute(predictions=predictions, references=references, sources=sources)
        comet_kiwi_results = comet_kiwi.compute(predictions=predictions, references=references, sources=sources)
        xcomet_results = xcomet.compute(predictions=predictions, references=references, sources=sources)

        print(f"\n--- COMET-22 Score ---")
        print(f"{'score':>20}: {comet_22_results['mean_score']:.4f}")
        print(f"\n--- COMETkiwi Score ---")
        print(f"{'score':>20}: {comet_kiwi_results['mean_score']:.4f}")
        print(f"\n--- XCOMET Score ---")
        print(f"{'score':>20}: {xcomet_results['mean_score']:.4f}")


        # ==============================================================================
        # 5. SAVE RESULTS TO DISK
        # ==============================================================================
        print("\nSaving results to disk...")

        # --- Save the summary scores to a text file ---
        summary_path = f"./evaluation_summary_{short_model_name}_{shot_type}.txt"
        # Explicit encoding so the file does not depend on the platform default.
        with open(summary_path, "w", encoding="utf-8") as f:
            f.write("--- ROUGE Scores ---\n")
            f.write(str(rouge_results) + "\n")
            f.write("\n--- BLEU Score ---\n")
            f.write(str(bleu_results) + "\n")
            f.write("\n--- BERTScore ---\n")
            f.write(f"{'Average Precision':>20}: {avg_precision:.4f}\n")
            f.write(f"{'Average Recall':>20}: {avg_recall:.4f}\n")
            f.write(f"{'Average F1':>20}: {avg_f1:.4f}\n")
            f.write("\n--- COMET-22 Score ---\n")
            f.write(f"{'score':>20}: {comet_22_results['mean_score']:.4f}\n")
            f.write("\n--- COMETkiwi Score ---\n")
            f.write(f"{'score':>20}: {comet_kiwi_results['mean_score']:.4f}\n")
            f.write("\n--- XCOMET Score ---\n")
            f.write(f"{'score':>20}: {xcomet_results['mean_score']:.4f}\n")

        print(f"Summary of scores saved to: {summary_path}")

        # ==============================================================================
        # 6. CREATE AND SAVE DETAILED PER-ENTRY RESULTS TSV
        # ==============================================================================
        print("--- Creating a detailed results file with per-entry scores ---")

        try:
            # Convert the original test set to a pandas DataFrame
            # We need to slice the dataset to match the entries in results_df
            # This is a safe way to handle it, though results_df should cover the whole dataset
            final_results_df = dataset.to_pandas().iloc[:len(results_df)].copy()

            # Add the model's predictions from the generation step
            final_results_df['model_prediction'] = results_df['prediction'].values

            # --- Add Per-Entry Metric Scores ---
            # Add BERTScore (Precision, Recall, and F1)
            final_results_df['bertscore_precision'] = bertscore_results['precision']
            final_results_df['bertscore_recall'] = bertscore_results['recall']
            final_results_df['bertscore_f1'] = bertscore_results['f1']

            # Add COMET-22, CometKiwi and XCOMET scores
            final_results_df['comet22_score'] = comet_22_results['scores']
            final_results_df['cometkiwi_score'] = comet_kiwi_results['scores']
            final_results_df['xcomet_score'] = xcomet_results['scores']

            # --- Save to TSV File ---
            # Same path as the checkpoint above: the enriched frame overwrites it.
            tsv_path = f"./evaluation_results_per_entry_{short_model_name}_{shot_type}.tsv"
            final_results_df.to_csv(tsv_path, sep='\t', index=False)

            print(f"\nSuccessfully created and saved the detailed results file.")
            print(f"File saved to: {tsv_path}")

            print("\n--- Data Preview ---")
            print("Note: BLEU and ROUGE are corpus-level metrics and are not included in this per-entry file.")
            print(final_results_df.head())

        except Exception as e:
            print(f"\nAn error occurred while creating the TSV file: {e}")

    # --- 4. Clear Memory for Next Model ---
    print(f"\nClearing memory for model {model_id}...")
    del model
    del tokenizer
    gc.collect()
    torch.cuda.empty_cache()

print("\n\nAll models have been evaluated.")


EVALUATING MODEL: google/mt5-xl
Loading google/mt5-xl...
Model and tokenizer loaded successfully.

Generating predictions for 3450 samples using google/mt5-xl (zero_shot)...


  0%|          | 0/108 [00:00<?, ?it/s]


--- First Batch Generation Results ---

--- Sample 1 ---
SOURCE: maatschappij waarin fysieke afstand nodig is
PREDICTION: <extra_id_0>'. '1,5 metermaatschappij' '1,5 metermaatschappij' '1,5 metermaatschappij' '1,5 metermaatschappij' '1,5 metermaatschappij' '1,5 metermaatschappij' '1,5 metermaatschappij' ' <extra_id_8>' ' <extra_id_9>' ' <extra_id_10>' ' <extra_id_11>' ' <extra_id_12>' ' <extra_id_13>' ' <extra_id_14>' ' <extra_id_15>' ' <extra_id_16>' ' <extra_id_17>' ' <extra_id_18>'  <extra_id_19>' ' <extra_id_25>' ' <extra_id_26>' ' <extra_id_31>' ' <extra_id_32>' ' <extra_id_19>' ' <extra_id_17>' ' ' ' ' ' ' ' ' ' ' ' ' ' '
REFERENCE: maatschappij waarin mensen die niet tot hetzelfde huishouden behoren in de publieke ruimte een fysieke afstand van minimaal anderhalve meter tot elkaar moeten bewaren om verspreiding van een virus te voorkomen

--- Sample 2 ---
SOURCE: alarmcentrale
PREDICTION: <extra_id_0> 'alarmcentrale' 'alarmcentrale' 'alarmcentrale' 'alarmcentrale' '112-centrale

KeyboardInterrupt: 