In [4]:
import pandas as pd
import os

# ==============================================================================
# Cell 1: Pre-processing and Cleaning
# ==============================================================================

def _extract_long_definition(prediction, filename):
    """
    Isolates the long definition from one raw model prediction.

    The marker that introduces the long definition depends on which model
    produced the file, which is inferred from substrings of the file name.

    Args:
        prediction: The raw cell value from the 'model_prediction' column.
        filename (str): Base name of the file the prediction came from.

    Returns:
        tuple: (text, counter_key) where text is the extracted definition
        (or the unchanged prediction when no marker applies) and counter_key
        names the tally the caller should increment.
    """
    # A missing prediction must stay empty; str(nan) would otherwise write
    # the literal text "nan" into the output as if the model had said it.
    if pd.isna(prediction):
        return "", "not_adapted"

    prediction_str = str(prediction)

    if "aya-101" in filename or "Geitje" in filename:
        key = "aya_101_adapted" if "aya-101" in filename else "geitje_adapted"
        candidates = (("Lange definitie: ", key),)
    elif "aya-23" in filename:
        # Order matters: the "Volledige" marker takes precedence when both occur.
        candidates = (
            ("**Volledige definitie:** ", "aya_volledige_adapted"),
            ("**Lange definitie:** ", "aya_lange_adapted"),
        )
    else:
        candidates = ()

    for marker, key in candidates:
        # Keep everything after the FIRST occurrence of the marker.
        _, found, tail = prediction_str.partition(marker)
        if found:
            return tail.strip(), key

    return prediction_str, "not_adapted"


def process_evaluation_files(file_paths, output_dir="processed_results"):
    """
    Processes evaluation files to remove metric columns and extract long definitions.

    For each input TSV the previously computed metric columns are dropped
    (when present), a 'model_long_definition' column is derived from
    'model_prediction' (when present), and the result is written to
    ``output_dir`` as ``processed_<original file name>``. Errors are reported
    per file and do not stop the remaining files from being processed.

    Args:
        file_paths (list): A list of paths to the input TSV files.
        output_dir (str): The directory to save the processed files.
    """
    if not os.path.exists(output_dir):
        # exist_ok guards against the directory appearing between check and create.
        os.makedirs(output_dir, exist_ok=True)
        print(f"Created output directory: {output_dir}")

    # Columns to be removed if they exist
    metric_columns_to_drop = [
        'bertscore_precision', 'bertscore_recall', 'bertscore_f1',
        'comet22_score', 'cometkiwi_score', 'xcomet_score'
    ]

    for file_path in file_paths:
        try:
            base_name = os.path.basename(file_path)
            print(f"\n--- Processing file: {base_name} ---")
            df = pd.read_csv(file_path, sep='\t')

            # 1. Eliminate metric columns
            existing_metrics = [col for col in metric_columns_to_drop if col in df.columns]
            if existing_metrics:
                df = df.drop(columns=existing_metrics)
                print(f"Removed metric columns: {', '.join(existing_metrics)}")
            else:
                print("No metric columns found to remove.")

            # 2. Extract long definitions
            if 'model_prediction' in df.columns:
                # Tallies are per file, so they are reset on every iteration.
                counters = {
                    "geitje_adapted": 0,
                    "aya_101_adapted": 0,
                    "aya_volledige_adapted": 0,
                    "aya_lange_adapted": 0,
                    "not_adapted": 0,
                }

                extracted = [
                    _extract_long_definition(prediction, base_name)
                    for prediction in df['model_prediction']
                ]
                for _, key in extracted:
                    counters[key] += 1

                df['model_long_definition'] = [text for text, _ in extracted]
                print("Created 'model_long_definition' column.")

                # Print the counts for the current file
                if "Geitje" in base_name:
                    print(f"  - Adapted for 'Lange definitie: ': {counters['geitje_adapted']} times.")
                if "aya-101" in base_name:
                    print(f"  - Adapted for 'Lange definitie: ': {counters['aya_101_adapted']} times.")
                if "aya-23" in base_name:
                    print(f"  - Adapted for '**Volledige definitie:** ': {counters['aya_volledige_adapted']} times.")
                    print(f"  - Adapted for '**Lange definitie:** ': {counters['aya_lange_adapted']} times.")
                print(f"  - Predictions left unchanged: {counters['not_adapted']} times.")

            else:
                print("Column 'model_prediction' not found.")

            # Save the processed dataframe
            output_filename = f"processed_{base_name}"
            output_path = os.path.join(output_dir, output_filename)
            df.to_csv(output_path, sep='\t', index=False)
            print(f"Successfully saved processed file to: {output_path}")

        except FileNotFoundError:
            print(f"Error: The file was not found at {file_path}")
        except Exception as e:
            # Best-effort batch run: report the failure and move on to the next file.
            print(f"An error occurred while processing {file_path}: {e}")

if __name__ == "__main__":
    # Cell 1 execution: clean the per-entry evaluation TSVs listed below.
    # Extend this list with any further files to process; files that carry
    # no metric columns are accepted as well.
    files_to_process = [
        "evaluation_results_per_entry_Geitje_few_shot.tsv",
        "evaluation_results_per_entry_aya-23_few_shot.tsv",
        "evaluation_results_per_entry_aya-101_few_shot.tsv",
        # "path/to/your/file_without_metrics.tsv"  # example: a file without metrics
    ]
    process_evaluation_files(file_paths=files_to_process)



--- Processing file: evaluation_results_per_entry_Geitje_few_shot.tsv ---
Removed metric columns: bertscore_precision, bertscore_recall, bertscore_f1, comet22_score, cometkiwi_score, xcomet_score
Created 'model_long_definition' column.
  - Adapted for 'Lange definitie: ': 684 times.
  - Predictions left unchanged: 2766 times.
Successfully saved processed file to: processed_results/processed_evaluation_results_per_entry_Geitje_few_shot.tsv

--- Processing file: evaluation_results_per_entry_aya-23_few_shot.tsv ---
Removed metric columns: bertscore_precision, bertscore_recall, bertscore_f1, comet22_score, cometkiwi_score, xcomet_score
Created 'model_long_definition' column.
  - Adapted for '**Volledige definitie:** ': 1238 times.
  - Adapted for '**Lange definitie:** ': 1966 times.
  - Predictions left unchanged: 246 times.
Successfully saved processed file to: processed_results/processed_evaluation_results_per_entry_aya-23_few_shot.tsv

--- Processing file: evaluation_results_per_entry_

In [None]:
import evaluate

# ==============================================================================
# Cell 2: Metric Calculation on Processed Files
# ==============================================================================

def _text_column(df, column):
    """
    Returns one dataframe column as a list of plain strings.

    Cells that pandas parsed as missing (empty cells, or text such as "nan",
    "NA" or "null" in the TSV) come back as float NaN, which the string-based
    metrics cannot handle. They are replaced with empty strings, and the
    number of replacements is reported so the substitution is not silent.

    Args:
        df (pandas.DataFrame): The dataframe to read from.
        column (str): Name of the column to convert.

    Returns:
        list: One string per row of the dataframe.

    Raises:
        KeyError: If the column is not present in the dataframe.
    """
    series = df[column]
    missing = int(series.isna().sum())
    if missing:
        print(f"Warning: {missing} missing value(s) in '{column}' replaced with empty strings.")
    return ["" if pd.isna(value) else str(value) for value in series]


def calculate_metrics_on_processed(processed_dir="processed_results", metrics_output_dir="metrics_results"):
    """
    Calculates metrics on the processed evaluation files.

    Every ``processed_*.tsv`` file in ``processed_dir`` is scored with ROUGE,
    BLEU, BERTScore and three COMET models. For each file a text summary and
    a per-entry TSV with the row-level scores are written to
    ``metrics_output_dir``. A failure in one file is reported and does not
    stop the remaining files.

    Args:
        processed_dir (str): Directory containing the processed TSV files.
        metrics_output_dir (str): Directory to save the metrics results.
    """
    if not os.path.exists(metrics_output_dir):
        # exist_ok guards against the directory appearing between check and create.
        os.makedirs(metrics_output_dir, exist_ok=True)
        print(f"\nCreated metrics output directory: {metrics_output_dir}")

    # A missing input directory means the first cell has not been run yet;
    # report that the same way as an empty directory instead of crashing.
    try:
        directory_entries = os.listdir(processed_dir)
    except FileNotFoundError:
        directory_entries = []

    # Sorted so files are processed in a stable order (os.listdir order is arbitrary).
    processed_files = sorted(
        f for f in directory_entries if f.startswith("processed_") and f.endswith(".tsv")
    )

    if not processed_files:
        print(f"No processed files found in '{processed_dir}'. Please run the first cell.")
        return

    # Load all metric models once
    print("\nLoading evaluation models...")
    rouge = evaluate.load('rouge')
    bleu = evaluate.load('bleu')
    bertscore = evaluate.load("bertscore")
    comet_22 = evaluate.load('comet', 'Unbabel/wmt22-comet-da')
    comet_kiwi = evaluate.load('comet', 'Unbabel/wmt22-cometkiwi-da')
    xcomet = evaluate.load('comet', 'Unbabel/XCOMET-XL')
    print("Evaluation models loaded.")

    for filename in processed_files:
        try:
            file_path = os.path.join(processed_dir, filename)
            print(f"\n--- Calculating metrics for: {filename} ---")
            df = pd.read_csv(file_path, sep='\t')

            # Define predictions, references, and sources
            predictions = _text_column(df, "model_long_definition")
            references = _text_column(df, "DefinitionFull")
            sources = _text_column(df, "DefinitionShort")  # Using short definition as source for COMET

            # --- ROUGE ---
            rouge_results = rouge.compute(predictions=predictions, references=references)
            print("\n--- ROUGE Scores ---")
            print(rouge_results)

            # --- BLEU ---
            # Each prediction has exactly one reference, wrapped in its own list.
            bleu_results = bleu.compute(predictions=predictions, references=[[ref] for ref in references])
            print("\n--- BLEU Score ---")
            print(bleu_results)

            # --- BERTScore ---
            bertscore_results = bertscore.compute(predictions=predictions, references=references, lang="nl")
            avg_precision = sum(bertscore_results['precision']) / len(bertscore_results['precision'])
            avg_recall = sum(bertscore_results['recall']) / len(bertscore_results['recall'])
            avg_f1 = sum(bertscore_results['f1']) / len(bertscore_results['f1'])
            print(f"\n--- BERTScore ---")
            print(f"{'Average Precision':>20}: {avg_precision:.4f}")
            print(f"{'Average Recall':>20}: {avg_recall:.4f}")
            print(f"{'Average F1':>20}: {avg_f1:.4f}")

            # --- COMET ---
            print("\n--- COMET Scores ---")
            comet_22_results = comet_22.compute(predictions=predictions, references=references, sources=sources)
            comet_kiwi_results = comet_kiwi.compute(predictions=predictions, references=references, sources=sources)
            xcomet_results = xcomet.compute(predictions=predictions, references=references, sources=sources)
            print(f"{'COMET-22':>20}: {comet_22_results['mean_score']:.4f}")
            print(f"{'COMETkiwi':>20}: {comet_kiwi_results['mean_score']:.4f}")
            print(f"{'XCOMET':>20}: {xcomet_results['mean_score']:.4f}")

            # --- Save Results ---
            # Extract model name and shot type from filename for saving
            # (first "_"-separated token is the model, the rest is the shot type).
            parts = filename.replace("processed_evaluation_results_per_entry_", "").replace(".tsv", "").split("_")
            short_model_name = parts[0]
            shot_type = "_".join(parts[1:])

            # Save summary
            summary_path = os.path.join(metrics_output_dir, f"metrics_summary_{short_model_name}_{shot_type}.txt")
            # Explicit encoding so the summary does not depend on the platform default.
            with open(summary_path, "w", encoding="utf-8") as f:
                f.write(f"Metrics Summary for {short_model_name} ({shot_type})\n\n")
                f.write("--- ROUGE Scores ---\n" + str(rouge_results) + "\n")
                f.write("\n--- BLEU Score ---\n" + str(bleu_results) + "\n")
                f.write("\n--- BERTScore ---\n")
                f.write(f"{'Average Precision':>20}: {avg_precision:.4f}\n")
                f.write(f"{'Average Recall':>20}: {avg_recall:.4f}\n")
                f.write(f"{'Average F1':>20}: {avg_f1:.4f}\n")
                f.write("\n--- COMET Scores ---\n")
                f.write(f"{'COMET-22':>20}: {comet_22_results['mean_score']:.4f}\n")
                f.write(f"{'COMETkiwi':>20}: {comet_kiwi_results['mean_score']:.4f}\n")
                f.write(f"{'XCOMET':>20}: {xcomet_results['mean_score']:.4f}\n")
            print(f"\nSummary of scores saved to: {summary_path}")

            # Save detailed per-entry TSV
            df['bertscore_precision'] = bertscore_results['precision']
            df['bertscore_recall'] = bertscore_results['recall']
            df['bertscore_f1'] = bertscore_results['f1']
            df['comet22_score'] = comet_22_results['scores']
            df['cometkiwi_score'] = comet_kiwi_results['scores']
            df['xcomet_score'] = xcomet_results['scores']
            detailed_path = os.path.join(metrics_output_dir, f"metrics_detailed_{short_model_name}_{shot_type}.tsv")
            df.to_csv(detailed_path, sep='\t', index=False)
            print(f"Detailed metrics saved to: {detailed_path}")

        except Exception as e:
            # Best-effort batch run: report the failure and continue with the next file.
            print(f"An error occurred while calculating metrics for {filename}: {e}")
            
if __name__ == "__main__":
    # Cell 2 execution: score every processed file using the default
    # input and output directories.
    calculate_metrics_on_processed()