In [13]:
import pandas as pd
import json
from IPython.display import display, HTML

# --- Configuration ---
# Set the file paths for your detailed metric files
FINE_TUNED_FILE = 'evaluation_results_per_entry_AYA_23_SFT.tsv'
FEW_SHOT_FILE = 'metrics_detailed_aya-101_few_shot.tsv'
TEXT_OUTPUT_FILE = 'qualitative_analysis_output.txt'
JSONL_OUTPUT_FILE = 'qualitative_analysis_output.jsonl'

# Number of top definitions to select for analysis
N_SAMPLES = 10

def load_and_prepare_data(file_path, model_type):
    """
    Loads, cleans, and scores a model's result file.
    """
    try:
        df = pd.read_csv(file_path, sep='\t')
    except FileNotFoundError:
        print(f"Error: File not found at '{file_path}'. Please check the file path.")
        return None

    # Standardize column names
    if model_type == 'fine-tuned':
        df = df.rename(columns={'model_prediction': 'cleaned_prediction'})
        df['raw_output'] = df['cleaned_prediction']
    elif model_type == 'few-shot':
        df = df.rename(columns={
            'model_prediction': 'raw_output',
            'model_long_definition': 'cleaned_prediction'
        })
    
    # Ensure score columns are numeric and create combined score
    score_cols = ['comet22_score', 'xcomet_score']
    for col in score_cols:
        df[col] = pd.to_numeric(df[col], errors='coerce')
    df.dropna(subset=score_cols, inplace=True)
    df['combined_score'] = df['comet22_score'] + df['xcomet_score']
    
    return df

def get_final_comparison_list(sft_df, fs_df, n_samples):
    """
    Creates a final list of N*2 unique items for comparison, handling overlaps
    and tracking which model triggered each sample's inclusion.
    """
    # --- Stratum 1 Logic: Filter and Rank ---
    sft_decision_pool = sft_df[
        sft_df['cleaned_prediction'].str.strip().str.lower() != sft_df['DefinitionFull'].str.strip().str.lower()
    ].copy().sort_values(by='combined_score', ascending=False)
    
    fs_decision_pool = fs_df[
        fs_df['cleaned_prediction'].str.strip().str.lower() != fs_df['DefinitionFull'].str.strip().str.lower()
    ].copy().sort_values(by='combined_score', ascending=False)

    # --- Manual Exclusion List ---
    lemmas_to_exclude = [
        'er glijdt een schaduw over iemands gezicht',
        'homohaat'
    ]
    print(f"Excluding {len(lemmas_to_exclude)} specific lemmas from the 'Aya 101 (Few-Shot)' pool.")
    fs_decision_pool = fs_decision_pool[~fs_decision_pool['Lemma'].isin(lemmas_to_exclude)]

    sft_top = sft_decision_pool.head(n_samples + 1)
    fs_top = fs_decision_pool.head(n_samples + 1)
    
    # --- Overlap Check and Resolution ---
    sft_top_dict = {
        tuple(row[['Lemma', 'DefinitionShort']]): row['combined_score']
        for _, row in sft_top.head(n_samples).iterrows()
    }
    fs_top_dict = {
        tuple(row[['Lemma', 'DefinitionShort']]): row['combined_score']
        for _, row in fs_top.head(n_samples).iterrows()
    }

    sft_top_ids = set(sft_top_dict.keys())
    fs_top_ids = set(fs_top_dict.keys())
    
    overlap = sft_top_ids.intersection(fs_top_ids)
    print("--- OVERLAP ANALYSIS ---")
    print(f"Number of overlapping items in the top {n_samples} lists: {len(overlap)}")
    if len(overlap) > 0:
        print("Overlapping items:", overlap)

    final_item_ids = {}
    for item_id, score in sft_top_dict.items():
        final_item_ids[item_id] = ("Aya 23 (Fine-Tuned)", score)
    for item_id, score in fs_top_dict.items():
        if item_id not in final_item_ids:
            final_item_ids[item_id] = ("Aya 101 (Few-Shot)", score)

    if len(final_item_ids) < n_samples * 2 and len(sft_top) > n_samples and len(fs_top) > n_samples:
        sft_11th_item = sft_top.iloc[n_samples]
        fs_11th_item = fs_top.iloc[n_samples]
        
        sft_11th_id = tuple(sft_11th_item[['Lemma', 'DefinitionShort']])
        fs_11th_id = tuple(fs_11th_item[['Lemma', 'DefinitionShort']])
        
        print("\nResolving overlap to get more unique samples...")
        print(f"  Fine-Tuned 11th score: {sft_11th_item['combined_score']:.4f}")
        print(f"  Few-Shot 11th score:   {fs_11th_item['combined_score']:.4f}")
        
        if sft_11th_item['combined_score'] >= fs_11th_item['combined_score']:
            if sft_11th_id not in final_item_ids:
                final_item_ids[sft_11th_id] = ("Aya 23 (Fine-Tuned)", sft_11th_item['combined_score'])
                print("  -> Added 11th best from Fine-Tuned model.")
        else:
            if fs_11th_id not in final_item_ids:
                final_item_ids[fs_11th_id] = ("Aya 101 (Few-Shot)", fs_11th_item['combined_score'])
                print("  -> Added 11th best from Few-Shot model.")

    print(f"Total unique items for analysis: {len(final_item_ids)}")
    print("-" * 26 + "\n")
    
    # --- Build Final Comparison Data ---
    sft_indexed = sft_df.set_index(['Lemma', 'DefinitionShort']).sort_index()
    fs_indexed = fs_df.set_index(['Lemma', 'DefinitionShort']).sort_index()
    
    final_results = []
    for (lemma, short_def), (decision_model, trigger_score) in final_item_ids.items():
        try:
            sft_row = sft_indexed.loc[(lemma, short_def)]
            fs_row = fs_indexed.loc[(lemma, short_def)]
            if isinstance(sft_row, pd.DataFrame): sft_row = sft_row.iloc[0]
            if isinstance(fs_row, pd.DataFrame): fs_row = fs_row.iloc[0]
            
            final_results.append({
                'lemma': lemma,
                'short_def': short_def,
                'decision_model': decision_model,
                'trigger_score': trigger_score,
                'sft_row': sft_row,
                'fs_row': fs_row
            })
        except KeyError:
            print(f"Warning: Could not find match for Lemma='{lemma}' in both files. Skipping.")
            
    return final_results, len(final_item_ids), len(overlap)


def display_and_save_samples(results, sft_model_name, fs_model_name, text_file_handle, jsonl_file_handle):
    """Displays the results and saves them to the provided file handles."""
    
    def write_to_text_file(text):
        """Helper function to print to console and write to text file."""
        print(text)
        text_file_handle.write(text + '\n')

    if not results:
        write_to_text_file(f"No comparative samples to display.")
        return
        
    display(HTML(f"<h2>{len(results)} Unique Samples for Stratum 2 Analysis</h2>"))
    text_file_handle.write(f"--- {len(results)} UNIQUE SAMPLES FOR ANALYSIS (sorted by trigger score) ---\n\n")

    for i, item in enumerate(results):
        sft_row, fs_row = item['sft_row'], item['fs_row']
        lemma, short_def = item['lemma'], item['short_def']
        decision_model = item['decision_model']
        trigger_score = item['trigger_score']
        
        # --- Create JSON object for this sample ---
        json_record = {
            "rank": i + 1,
            "lemma": lemma,
            "source_definition": short_def,
            "reference_definition": sft_row['DefinitionFull'],
            "triggered_by": {
                "model": decision_model,
                "combined_score": round(trigger_score, 4)
            },
            "fine_tuned_output": {
                "model_name": sft_model_name,
                "prediction": sft_row['cleaned_prediction'],
                "comet22_score": round(sft_row['comet22_score'], 4),
                "xcomet_score": round(sft_row['xcomet_score'], 4)
            },
            "few_shot_output": {
                "model_name": fs_model_name,
                "prediction_cleaned": fs_row['cleaned_prediction'],
                "prediction_raw": fs_row['raw_output'],
                "comet22_score": round(fs_row['comet22_score'], 4),
                "xcomet_score": round(fs_row['xcomet_score'], 4)
            }
        }
        jsonl_file_handle.write(json.dumps(json_record) + '\n')

        # --- Write to human-readable text file ---
        write_to_text_file("=" * 80)
        write_to_text_file(f"COMPARISON SAMPLE #{i + 1} (Overall Rank)")
        write_to_text_file("-" * 80)
        write_to_text_file(f"Triggered by High Score from: {decision_model} (Score: {trigger_score:.4f})")
        write_to_text_file(f"Lemma: {lemma}")
        write_to_text_file(f"Source (Short Definition): {short_def}")
        write_to_text_file(f"Reference (Full Definition): {sft_row['DefinitionFull']}")
        write_to_text_file("-" * 80)
        
        write_to_text_file(f"Output from {sft_model_name}:")
        write_to_text_file(f"  Scores: COMET-22 = {sft_row['comet22_score']:.4f}, XCOMET = {sft_row['xcomet_score']:.4f}")
        write_to_text_file(f"  Prediction (Cleaned): {sft_row['cleaned_prediction']}\n")
        
        write_to_text_file(f"Output from {fs_model_name}:")
        write_to_text_file(f"  Scores: COMET-22 = {fs_row['comet22_score']:.4f}, XCOMET = {fs_row['xcomet_score']:.4f}")
        if 'raw_output' in fs_row and fs_row['raw_output'] != fs_row['cleaned_prediction']:
             write_to_text_file(f"  Raw Output: {fs_row['raw_output']}")
        write_to_text_file(f"  Prediction (Cleaned): {fs_row['cleaned_prediction']}\n")
    write_to_text_file("\n\n")

# --- Main Execution ---
sft_df_full = load_and_prepare_data(FINE_TUNED_FILE, 'fine-tuned')
fs_df_full = load_and_prepare_data(FEW_SHOT_FILE, 'few-shot')

if sft_df_full is not None and fs_df_full is not None:
    final_results, total_samples, overlap_count = get_final_comparison_list(sft_df_full, fs_df_full, N_SAMPLES)
    
    final_results.sort(key=lambda x: x['trigger_score'], reverse=True)
    
    with open(TEXT_OUTPUT_FILE, 'w', encoding='utf-8') as txt_f, \
         open(JSONL_OUTPUT_FILE, 'w', encoding='utf-8') as jsonl_f:
        
        summary_text = (
            "--- QUALITATIVE ANALYSIS SAMPLES ---\n\n"
            "--- OVERLAP ANALYSIS SUMMARY ---\n"
            f"Initial top {N_SAMPLES} lists had {overlap_count} overlapping item(s).\n"
            f"Final unique items for analysis: {total_samples}\n"
            "------------------------------\n\n"
        )
        txt_f.write(summary_text)
        
        display_and_save_samples(
            final_results, 
            "Aya 23 (Fine-Tuned)", 
            "Aya 101 (Few-Shot)", 
            txt_f, 
            jsonl_f
        )

    print(f"\n✅ Analysis complete. Output saved to '{TEXT_OUTPUT_FILE}' and '{JSONL_OUTPUT_FILE}'")


Excluding 2 specific lemmas from the 'Aya 101 (Few-Shot)' pool.
--- OVERLAP ANALYSIS ---
Number of overlapping items in the top 10 lists: 1
Overlapping items: {('eenoudergezin', 'gezin met één ouder')}

Resolving overlap to get more unique samples...
  Fine-Tuned 11th score: 1.9343
  Few-Shot 11th score:   1.9556
  -> Added 11th best from Few-Shot model.
Total unique items for analysis: 20
--------------------------



COMPARISON SAMPLE #1 (Overall Rank)
--------------------------------------------------------------------------------
Triggered by High Score from: Aya 101 (Few-Shot) (Score: 1.9766)
Lemma: liefhebber
Source (Short Definition): iemand die veel van iets of iemand houdt
Reference (Full Definition): iemand die iets of iemand heel leuk of mooi vindt
--------------------------------------------------------------------------------
Output from Aya 23 (Fine-Tuned):
  Scores: COMET-22 = 0.8541, XCOMET = 0.9623
  Prediction (Cleaned): iemand die veel van iets of iemand houdt

Output from Aya 101 (Few-Shot):
  Scores: COMET-22 = 0.9766, XCOMET = 1.0000
  Raw Output: Lange definitie: iemand die iets of iemand heel leuk of mooi vindt.
  Prediction (Cleaned): iemand die iets of iemand heel leuk of mooi vindt.

COMPARISON SAMPLE #2 (Overall Rank)
--------------------------------------------------------------------------------
Triggered by High Score from: Aya 101 (Few-Shot) (Score: 1.9740)
Lemma: lich

In [24]:
import pandas as pd
import json
from IPython.display import display, HTML

# --- Configuration ---
# Define paths for all model result and prompt files.
FINE_TUNED_FILE = 'evaluation_results_per_entry_AYA_23_SFT.tsv'
FEW_SHOT_FILE = 'metrics_detailed_aya-101_few_shot.tsv'
PROMPTS_FILE = 'prompts_alpha_0_8.jsonl'
TEXT_OUTPUT_FILE = 'stratum_3_analysis_output.txt'
JSONL_OUTPUT_FILE = 'stratum_3_analysis_output.jsonl'


# Define the Genus Proxima groups and the specific lemmas you want to analyze.
# This structure allows for precise selection.
STRATUM_3_SELECTION = {
    "liefhebber": [], # Will be auto-populated, then the word 'liefhebber' will be excluded.
    "ologie": [      # A filled list means: only use these specific lemmas for this group.
        'hematologie',
        'nefrologie',
        'pneumologie'
    ]
}


def load_few_shot_prompts(prompts_path):
    """Loads the few-shot examples from the JSONL file into a lookup dictionary."""
    prompts_lookup = {}
    try:
        with open(prompts_path, 'r', encoding='utf-8') as f:
            for line in f:
                record = json.loads(line)
                key = (record['lemma'], record['short_definition'])
                examples = [
                    record.get('few_shot_example_1'),
                    record.get('few_shot_example_2'),
                    record.get('few_shot_example_3')
                ]
                prompts_lookup[key] = [ex for ex in examples if ex is not None]
        print(f"Successfully loaded {len(prompts_lookup)} few-shot prompts.")
    except FileNotFoundError:
        print(f"Warning: Prompts file not found at '{prompts_path}'. Few-shot examples will not be displayed.")
    except Exception as e:
        print(f"An error occurred loading prompts file: {e}")
    return prompts_lookup


def load_and_prepare_data(sft_path, fs_path):
    """Loads and merges the full data from both model result files."""
    # Load Fine-Tuned Data
    sft_df = pd.read_csv(sft_path, sep='\t')
    sft_df = sft_df.rename(columns={
        'model_prediction': 'sft_prediction',
        'comet22_score': 'sft_comet22_score',
        'xcomet_score': 'sft_xcomet_score'
    })
    sft_df['sft_combined_score'] = pd.to_numeric(sft_df['sft_comet22_score'], errors='coerce') + \
                                   pd.to_numeric(sft_df['sft_xcomet_score'], errors='coerce')
    sft_df = sft_df[['Lemma', 'DefinitionShort', 'DefinitionFull', 'sft_prediction', 'sft_comet22_score', 'sft_xcomet_score', 'sft_combined_score']]

    # Load Few-Shot Data
    fs_df = pd.read_csv(fs_path, sep='\t')
    fs_df = fs_df.rename(columns={
        'model_long_definition': 'fs_prediction', 
        'model_prediction': 'fs_raw_prediction',
        'comet22_score': 'fs_comet22_score',
        'xcomet_score': 'fs_xcomet_score'
    })
    fs_df['fs_combined_score'] = pd.to_numeric(fs_df['fs_comet22_score'], errors='coerce') + \
                                 pd.to_numeric(fs_df['fs_xcomet_score'], errors='coerce')
    fs_df = fs_df[['Lemma', 'DefinitionShort', 'fs_prediction', 'fs_raw_prediction', 'fs_comet22_score', 'fs_xcomet_score', 'fs_combined_score']]

    # Merge the two dataframes
    merged_df = pd.merge(sft_df, fs_df, on=['Lemma', 'DefinitionShort'], how='inner')
    merged_df.drop_duplicates(subset=['Lemma', 'DefinitionShort'], inplace=True)
    
    return merged_df

def display_and_save_stratum_3(selection, data_df, prompts_data, txt_handle, jsonl_handle):
    """
    Finds, displays, and saves the detailed comparisons for the selected Stratum 3 groups.
    """
    def write_to_text_file(text):
        print(text)
        txt_handle.write(text + '\n')

    display(HTML("<h2>Detailed Comparisons for Selected Stratum 3 Groups</h2>"))
    txt_handle.write("--- DETAILED COMPARISONS FOR SELECTED STRATUM 3 GROUPS ---\n\n")

    for suffix, group_lemmas in selection.items():
        write_to_text_file("=" * 80)
        write_to_text_file(f"Genus Proxima: '{suffix}'")
        write_to_text_file("=" * 80)
        
        group_df = data_df[data_df['Lemma'].isin(group_lemmas)].sort_values(by='Lemma')
        
        if group_df.empty:
            write_to_text_file("No matching lemmas found in the test set for this group.\n")
            continue

        for _, row in group_df.iterrows():
            lookup_key = (row['Lemma'], row['DefinitionShort'])
            examples = prompts_data.get(lookup_key, [])

            # --- Create JSON object for this sample ---
            json_record = {
                "genus_proxima": suffix,
                "lemma": row['Lemma'],
                "source_definition": row['DefinitionShort'],
                "reference_definition": row['DefinitionFull'],
                "fine_tuned_output": {
                    "model_name": "Aya 23 (Fine-Tuned)",
                    "prediction": row['sft_prediction'],
                    "comet22_score": round(row['sft_comet22_score'], 4) if pd.notna(row['sft_comet22_score']) else None,
                    "xcomet_score": round(row['sft_xcomet_score'], 4) if pd.notna(row['sft_xcomet_score']) else None,
                    "combined_score": round(row['sft_combined_score'], 4) if pd.notna(row['sft_combined_score']) else None
                },
                "few_shot_output": {
                    "model_name": "Aya 101 (Few-Shot)",
                    "prediction_cleaned": row['fs_prediction'],
                    "prediction_raw": row['fs_raw_prediction'],
                    "comet22_score": round(row['fs_comet22_score'], 4) if pd.notna(row['fs_comet22_score']) else None,
                    "xcomet_score": round(row['fs_xcomet_score'], 4) if pd.notna(row['fs_xcomet_score']) else None,
                    "combined_score": round(row['fs_combined_score'], 4) if pd.notna(row['fs_combined_score']) else None,
                    "examples_used": [
                        {
                            "lemma": ex.get('lemma'),
                            "short_definition": ex.get('short_definition'),
                            "long_definition": ex.get('long_definition')
                        } for ex in examples
                    ]
                }
            }
            jsonl_handle.write(json.dumps(json_record) + '\n')

            # --- Write to human-readable text file ---
            write_to_text_file("-" * 60)
            write_to_text_file(f"Lemma: {row['Lemma']}")
            write_to_text_file(f"Source (Short Definition): {row['DefinitionShort']}")
            write_to_text_file(f"Reference (Full Definition): {row['DefinitionFull']}")
            write_to_text_file("-" * 60)

            sft_score_str = f"COMET-22: {row['sft_comet22_score']:.4f}, XCOMET: {row['sft_xcomet_score']:.4f}" if pd.notna(row['sft_combined_score']) else "N/A"
            write_to_text_file(f"Output from Aya 23 (Fine-Tuned) (Scores: {sft_score_str}):")
            write_to_text_file(f"  -> {row['sft_prediction']}\n")

            fs_score_str = f"COMET-22: {row['fs_comet22_score']:.4f}, XCOMET: {row['fs_xcomet_score']:.4f}" if pd.notna(row['fs_combined_score']) else "N/A"
            write_to_text_file(f"Output from Aya 101 (Few-Shot) (Scores: {fs_score_str}):")
            if row['fs_raw_prediction'] != row['fs_prediction']:
                write_to_text_file(f"  Raw: {row['fs_raw_prediction']}")
            write_to_text_file(f"  -> {row['fs_prediction']}\n")

            if examples:
                write_to_text_file("--- Few-Shot Examples Used for This Entry ---")
                for idx, ex in enumerate(examples):
                    write_to_text_file(f"  Example {idx+1}:")
                    write_to_text_file(f"    Lemma: {ex.get('lemma')}")
                    write_to_text_file(f"    Short: {ex.get('short_definition')}")
                    write_to_text_file(f"    Long:  {ex.get('long_definition')}")
                write_to_text_file("-" * 45)
        write_to_text_file("\n")


# --- Main Execution ---
print("--- Stratum 3: Detailed View of Selected Genus Proxima Groups ---")
try:
    # Load all data
    full_data_df = load_and_prepare_data(FINE_TUNED_FILE, FEW_SHOT_FILE)
    prompts_data = load_few_shot_prompts(PROMPTS_FILE)
    print(f"Successfully loaded {len(full_data_df)} unique entries for lookup.")

    # Find all lemmas that match the auto-detect groups (where the list is empty)
    for suffix, lemmas in STRATUM_3_SELECTION.items():
        if not lemmas:
            matching_lemmas = full_data_df[full_data_df['Lemma'].str.endswith(suffix)]['Lemma'].unique().tolist()
            STRATUM_3_SELECTION[suffix] = matching_lemmas
            print(f"Found {len(matching_lemmas)} lemmas for Genus Proxima '{suffix}'.")

    # Apply Manual Exclusions
    if 'liefhebber' in STRATUM_3_SELECTION:
        print("Excluding the lemma 'liefhebber' from its own group.")
        STRATUM_3_SELECTION['liefhebber'] = [l for l in STRATUM_3_SELECTION['liefhebber'] if l != 'liefhebber']

    # Open files and generate output
    with open(TEXT_OUTPUT_FILE, 'w', encoding='utf-8') as txt_f, \
         open(JSONL_OUTPUT_FILE, 'w', encoding='utf-8') as jsonl_f:
        
        display_and_save_stratum_3(STRATUM_3_SELECTION, full_data_df, prompts_data, txt_f, jsonl_f)

    print(f"\n✅ Analysis complete. Output saved to '{TEXT_OUTPUT_FILE}' and '{JSONL_OUTPUT_FILE}'")

except FileNotFoundError:
    print(f"Error: Could not find one of the required files.")
    print(f"Please ensure all .tsv and .jsonl files are in the same directory.")
except Exception as e:
    print(f"An unexpected error occurred: {e}")


--- Stratum 3: Detailed View of Selected Genus Proxima Groups ---
Successfully loaded 3446 few-shot prompts.
Successfully loaded 3446 unique entries for lookup.
Found 5 lemmas for Genus Proxima 'liefhebber'.
Excluding the lemma 'liefhebber' from its own group.


Genus Proxima: 'liefhebber'
------------------------------------------------------------
Lemma: operaliefhebber
Source (Short Definition): liefhebber van operamuziek
Reference (Full Definition): iemand die veel van het spelen van en/of het luisteren naar operamuziek houdt
------------------------------------------------------------
Output from Aya 23 (Fine-Tuned) (Scores: COMET-22: 0.8390, XCOMET: 0.7690):
  -> iemand die veel van het zingen van opera en het bijbehorende muziekgenre houdt

Output from Aya 101 (Few-Shot) (Scores: COMET-22: 0.8667, XCOMET: 0.9942):
  Raw: Lange definitie: iemand die van operamuziek houdt
  -> iemand die van operamuziek houdt

--- Few-Shot Examples Used for This Entry ---
  Example 1:
    Lemma: operacriticus
    Short: iemand die opera's recenseert
    Long:  iemand die zich, voor zijn beroep of uit liefhebberij, bezighoudt met het kritisch bespreken van opera's en opera in het algemeen, vooral in de media
  Example 2:
    Lemma: operadocent
    Short: d

In [19]:
import pandas as pd
from collections import defaultdict
from IPython.display import display, HTML

# --- Configuration ---
# Define paths for both model result files to access their scores.
FINE_TUNED_FILE = 'evaluation_results_per_entry_AYA_23_SFT.tsv'
FEW_SHOT_FILE = 'metrics_detailed_aya-101_few_shot.tsv'

# The minimum length of the common ending to be considered a valid genus proximum.
MIN_SUFFIX_LENGTH = 5

# The maximum number of words allowed in a lemma.
MAX_WORDS_IN_LEMMA = 2

# The minimum number of lemmas a group must have to be displayed.
MIN_GROUP_SIZE = 2


def load_and_prepare_data(sft_path, fs_path):
    """Loads and merges the full data from both model result files."""
    # Load Fine-Tuned Data
    sft_df = pd.read_csv(sft_path, sep='\t')
    sft_df = sft_df.rename(columns={'model_prediction': 'sft_prediction'})
    sft_df['sft_combined_score'] = pd.to_numeric(sft_df['comet22_score'], errors='coerce') + \
                                   pd.to_numeric(sft_df['xcomet_score'], errors='coerce')
    sft_df = sft_df[['Lemma', 'DefinitionShort', 'DefinitionFull', 'sft_prediction', 'sft_combined_score']]

    # Load Few-Shot Data
    fs_df = pd.read_csv(fs_path, sep='\t')
    fs_df = fs_df.rename(columns={'model_long_definition': 'fs_prediction'})
    fs_df['fs_combined_score'] = pd.to_numeric(fs_df['comet22_score'], errors='coerce') + \
                                 pd.to_numeric(fs_df['xcomet_score'], errors='coerce')
    fs_df = fs_df[['Lemma', 'DefinitionShort', 'fs_prediction', 'fs_combined_score']]

    # Merge the two dataframes
    merged_df = pd.merge(sft_df, fs_df, on=['Lemma', 'DefinitionShort'], how='inner')
    
    # Drop any unexpected duplicate rows for the same (Lemma, DefinitionShort) pair.
    merged_df.drop_duplicates(subset=['Lemma', 'DefinitionShort'], inplace=True)
    
    return merged_df


def find_common_suffix_groups(lemmas, min_length=5):
    """
    Finds groups of lemmas that share a common suffix of at least min_length.
    """
    print(f"Searching for common suffixes of at least {min_length} characters...")
    groups = defaultdict(list)
    lemma_set = set(lemmas)
    
    for lemma1 in lemmas:
        for lemma2 in lemma_set:
            if lemma1 >= lemma2:
                continue
            
            i = 0
            while i < len(lemma1) and i < len(lemma2) and lemma1[-(i+1)] == lemma2[-(i+1)]:
                i += 1
            
            if i >= min_length:
                common_suffix = lemma1[-i:]
                if lemma1 not in groups[common_suffix]:
                    groups[common_suffix].append(lemma1)
                if lemma2 not in groups[common_suffix]:
                    groups[common_suffix].append(lemma2)
                    
    for suffix in groups:
        groups[suffix].sort()
        
    return groups

# --- Main Execution ---
print("--- Stratum 3: Finding Lexical Pairs with Shared Genus Proxima ---")
try:
    # Load and prepare data from both models
    merged_df = load_and_prepare_data(FINE_TUNED_FILE, FEW_SHOT_FILE)
    print(f"Successfully loaded and merged scores for {len(merged_df)} unique common test entries.")

    # --- Apply Filters ---
    # 1. Filter out entries where BOTH models produced a perfect match
    merged_df['sft_is_perfect'] = merged_df['sft_prediction'].str.strip().str.lower() == merged_df['DefinitionFull'].str.strip().str.lower()
    merged_df['fs_is_perfect'] = merged_df['fs_prediction'].str.strip().str.lower() == merged_df['DefinitionFull'].str.strip().str.lower()
    
    initial_rows = len(merged_df)
    filtered_df = merged_df[~(merged_df['sft_is_perfect'] & merged_df['fs_is_perfect'])].copy()
    print(f"Filtered out {initial_rows - len(filtered_df)} entries where both models had a perfect match.")

    # 2. Filter by word count in the lemma
    initial_rows = len(filtered_df)
    filtered_df['word_count'] = filtered_df['Lemma'].str.split().str.len()
    filtered_df = filtered_df[filtered_df['word_count'] <= MAX_WORDS_IN_LEMMA].copy()
    print(f"Filtered out {initial_rows - len(filtered_df)} entries with lemmas longer than {MAX_WORDS_IN_LEMMA} words.")
    
    # Find groups based on the filtered list of unique lemmas
    unique_lemmas_filtered = filtered_df['Lemma'].unique().tolist()
    genus_groups = find_common_suffix_groups(unique_lemmas_filtered, min_length=MIN_SUFFIX_LENGTH)
    
    # --- Prepare groups for sorting ---
    groups_with_scores = []
    for suffix, group_lemmas in genus_groups.items():
        if len(group_lemmas) >= MIN_GROUP_SIZE:
            max_score_in_group = 0
            # Find the highest score among all entries for lemmas in this group
            for lemma in group_lemmas:
                lemma_entries = filtered_df[filtered_df['Lemma'] == lemma]
                if not lemma_entries.empty:
                    # Get the max score for this lemma across its senses and models
                    max_lemma_score = max(lemma_entries['sft_combined_score'].max(), lemma_entries['fs_combined_score'].max())
                    if max_lemma_score > max_score_in_group:
                        max_score_in_group = max_lemma_score
            
            groups_with_scores.append((suffix, group_lemmas, max_score_in_group))

    # Sort the groups based on the highest score found within each group, in descending order
    groups_with_scores.sort(key=lambda x: x[2], reverse=True)
    
    # --- Display Results ---
    print(f"\nFound {len(groups_with_scores)} potential genus proxima groups matching all criteria.")
    display(HTML("<h3>Potential Pairs/Groups for Stratum 3 Analysis (sorted by score):</h3>"))

    if not groups_with_scores:
        print(f"No groups found with at least {MIN_GROUP_SIZE} members after filtering. Try lowering MIN_SUFFIX_LENGTH or adjusting MAX_WORDS_IN_LEMMA.")
    else:
        for suffix, group_lemmas, max_score in groups_with_scores:
            print(f"Genus Proxima: '{suffix}' (Highest score in group: {max_score:.2f})")
            
            # Display each lemma in the group with its scores for all its senses
            for lemma in group_lemmas:
                lemma_entries = filtered_df[filtered_df['Lemma'] == lemma]
                print(f"  - {lemma}:")
                for _, row in lemma_entries.iterrows():
                    sft_score_str = f"{row['sft_combined_score']:.2f}"
                    fs_score_str = f"{row['fs_combined_score']:.2f}"
                    print(f"    - Short Def: '{row['DefinitionShort'][:40]}...' (SFT: {sft_score_str}, FS: {fs_score_str})")

            print("-" * 60)

except FileNotFoundError:
    print(f"Error: Could not find one of the required files.")
    print(f"Please ensure '{FINE_TUNED_FILE}' and '{FEW_SHOT_FILE}' are in the same directory.")
except Exception as e:
    print(f"An unexpected error occurred: {e}")



--- Stratum 3: Finding Lexical Pairs with Shared Genus Proxima ---
Successfully loaded and merged scores for 3446 unique common test entries.
Filtered out 118 entries where both models had a perfect match.
Filtered out 160 entries with lemmas longer than 2 words.
Searching for common suffixes of at least 5 characters...

Found 406 potential genus proxima groups matching all criteria.


Genus Proxima: 'maker' (Highest score in group: 1.99)
  - doelpuntenmaker:
    - Short Def: 'iemand die scoort in een wedstrijd...' (SFT: 1.59, FS: 1.69)
    - Short Def: 'iemand die vaak doelpunten maakt...' (SFT: 1.84, FS: 1.60)
  - fietsenmaker:
    - Short Def: 'iemand die fietsen maakt fietsenfabriek ...' (SFT: 1.70, FS: 1.70)
    - Short Def: 'fietsenfabriek...' (SFT: 1.92, FS: 1.84)
    - Short Def: 'werkplaats of winkel...' (SFT: 1.53, FS: 1.70)
  - gelijkmaker:
    - Short Def: 'gelijkmakend doelpunt gelijkmakend punt ...' (SFT: 1.27, FS: 1.28)
    - Short Def: 'iets wat mensen gelijk maakt...' (SFT: 1.72, FS: 1.46)
    - Short Def: 'gelijkmakend punt in basketbal, handbal ...' (SFT: 1.73, FS: 1.60)
    - Short Def: 'gelegenheid tot gelijkmaken...' (SFT: 1.43, FS: 1.15)
  - grafmaker:
    - Short Def: 'iemand die graven maakt...' (SFT: 1.73, FS: 1.73)
  - instrumentmaker:
    - Short Def: 'iemand die werktuigen maakt...' (SFT: 1.85, FS: 1.73)
  - softwaremaker:
    - Short Def