In [1]:
import pandas as pd
import os

# Preparation

In [2]:
def get_all_paths_exclude_checkpoints(folder_path):
    """
    Get all file paths from the given folder, excluding paths with `.ipynb_checkpoints`.
    
    Parameters:
        folder_path (str): Path to the folder.
    
    Returns:
        List of file paths excluding `.ipynb_checkpoints`.
    """
    all_paths = []
    for root, dirs, files in os.walk(folder_path):
        if ".ipynb_checkpoints" in root:
            continue  # Skip directories containing `.ipynb_checkpoints`
        for file in files:
            file_path = os.path.join(root, file)
            if ".ipynb_checkpoints" not in file_path:
                all_paths.append(file_path)
                
    all_paths = {path: path.split('/')[-1].split('.')[0] for path in all_paths}

    return all_paths

# Example usage
folder_path = "../ready_for_evaluation"
paths = get_all_paths_exclude_checkpoints(folder_path)
paths = dict(sorted(paths.items()))
# print("Paths:", paths)
paths

{'../ready_for_evaluation/claude-3-opus/deu.csv': 'deu',
 '../ready_for_evaluation/claude-3-opus/fra.csv': 'fra',
 '../ready_for_evaluation/claude-3-opus/it.csv': 'it',
 '../ready_for_evaluation/claude-3-opus/spa.csv': 'spa',
 '../ready_for_evaluation/claude-3.5-haiku/deu.csv': 'deu',
 '../ready_for_evaluation/claude-3.5-haiku/fra.csv': 'fra',
 '../ready_for_evaluation/claude-3.5-haiku/it.csv': 'it',
 '../ready_for_evaluation/claude-3.5-haiku/spa.csv': 'spa',
 '../ready_for_evaluation/claude-3.5-sonnet/deu.csv': 'deu',
 '../ready_for_evaluation/claude-3.5-sonnet/fra.csv': 'fra',
 '../ready_for_evaluation/claude-3.5-sonnet/it.csv': 'it',
 '../ready_for_evaluation/claude-3.5-sonnet/spa.csv': 'spa',
 '../ready_for_evaluation/gemini-1.5-flash/deu.csv': 'deu',
 '../ready_for_evaluation/gemini-1.5-flash/fra.csv': 'fra',
 '../ready_for_evaluation/gemini-1.5-flash/it.csv': 'it',
 '../ready_for_evaluation/gemini-1.5-flash/spa.csv': 'spa',
 '../ready_for_evaluation/gemini-1.5-pro/deu.csv': 'deu'

In [3]:
def recheck_and_reassign(path, lang_prefix):
    """
    Rechecks if lan_m == lan_f in the DataFrame.
    If equal, moves the value to lan_n and clears lan_m and lan_f.
    
    Parameters:
        path (path: Path to the input DataFrame with columns <lang_prefix>_m, <lang_prefix>_f, <lang_prefix>_n.
        lang_prefix (str): Language prefix (e.g., 'deu', 'spa', 'fra', 'it').
    
    Returns:
        pd.DataFrame: Updated DataFrame with reassignments applied.
        
    """
    df = pd.read_csv(path)
    
    # Define column names based on the language prefix
    col_m = f"{lang_prefix}_m"
    col_f = f"{lang_prefix}_f"
    col_n = f"{lang_prefix}_n"

    # Ensure the required columns exist
    if col_m not in df.columns or col_f not in df.columns or col_n not in df.columns:
        raise ValueError(f"Required columns {col_m}, {col_f}, and {col_n} not found in DataFrame.")
    
    # Find rows where masculine and feminine forms are the same
    condition = df[col_m] == df[col_f]
    
    # Update neuter column with the matching values
    df.loc[condition, col_n] = df.loc[condition, col_m]
    
    # Clear masculine and feminine columns where they are the same
    df.loc[condition, col_m] = ""
    df.loc[condition, col_f] = ""
    
    # nan_rows = df[df[[col_m, col_f, col_n]].isna().all(axis=1)]
    df = df.applymap(lambda x: ' '.join(str(x).split()) if isinstance(x, str) else x)
    df.to_csv(path, index=False)
    
    

    # return df, nan_rows


In [4]:
for k, v in paths.items():
    recheck_and_reassign(k, v)

# Accuracy

We will calculate accuracy for `masculine`, `feminine` and `neuter` forms seperately. Then we will check how many outputs provide both gender forms if applicable.

In [5]:
def accuracy(row, lang_prefix: str, gender_prefix: str):
    # Collect true values 
    true_values = {row[f"{lang_prefix}_{gender_prefix}"]}
    true_values = {val.lower() for val in true_values if pd.notna(val)} # Remove NaN
    
    
    # Collect predicted values from extracted
    predicted_values = eval(row['extracted'].lower()) if pd.notna(row['extracted']) else []
    
    # Check if any predicted value matches the true values
    return any(pred in true_values for pred in predicted_values)

In [6]:
gender_prefixes = ['m', 'f', 'n']
for path, lang in paths.items():
    print(path)
    df = pd.read_csv(path)
    
    for gender_prefix in gender_prefixes:
        df[f'is_{gender_prefix}'] = df.apply(lambda row: accuracy(row, lang, gender_prefix), axis=1)
        
    save = 'accuracy/' + '/'.join(path.split('/')[2:])
    directory = os.path.dirname(save)  # Extract the directory path
    os.makedirs(directory, exist_ok=True)  # Create the directories if they don't exist
    
    df.to_csv(save, index=False)

../ready_for_evaluation/claude-3-opus/deu.csv
../ready_for_evaluation/claude-3-opus/fra.csv
../ready_for_evaluation/claude-3-opus/it.csv
../ready_for_evaluation/claude-3-opus/spa.csv
../ready_for_evaluation/claude-3.5-haiku/deu.csv
../ready_for_evaluation/claude-3.5-haiku/fra.csv
../ready_for_evaluation/claude-3.5-haiku/it.csv
../ready_for_evaluation/claude-3.5-haiku/spa.csv
../ready_for_evaluation/claude-3.5-sonnet/deu.csv
../ready_for_evaluation/claude-3.5-sonnet/fra.csv
../ready_for_evaluation/claude-3.5-sonnet/it.csv
../ready_for_evaluation/claude-3.5-sonnet/spa.csv
../ready_for_evaluation/gemini-1.5-flash/deu.csv
../ready_for_evaluation/gemini-1.5-flash/fra.csv
../ready_for_evaluation/gemini-1.5-flash/it.csv
../ready_for_evaluation/gemini-1.5-flash/spa.csv
../ready_for_evaluation/gemini-1.5-pro/deu.csv
../ready_for_evaluation/gemini-1.5-pro/fra.csv
../ready_for_evaluation/gemini-1.5-pro/it.csv
../ready_for_evaluation/gemini-1.5-pro/spa.csv
../ready_for_evaluation/gpt-3.5-turbo/deu

In [10]:
folder_path = "accuracy"
paths = get_all_paths_exclude_checkpoints(folder_path)
paths = dict(sorted(paths.items()))
# print("Paths:", paths)
keys_to_exclude = ['accuracy/accuracy_Transpose.csv', 'accuracy/accuracy_table.csv']

# Delete specified keys from the dictionary
for key in keys_to_exclude:
    paths.pop(key, None)
paths

{'accuracy/claude-3-opus/deu.csv': 'deu',
 'accuracy/claude-3-opus/fra.csv': 'fra',
 'accuracy/claude-3-opus/it.csv': 'it',
 'accuracy/claude-3-opus/spa.csv': 'spa',
 'accuracy/claude-3.5-haiku/deu.csv': 'deu',
 'accuracy/claude-3.5-haiku/fra.csv': 'fra',
 'accuracy/claude-3.5-haiku/it.csv': 'it',
 'accuracy/claude-3.5-haiku/spa.csv': 'spa',
 'accuracy/claude-3.5-sonnet/deu.csv': 'deu',
 'accuracy/claude-3.5-sonnet/fra.csv': 'fra',
 'accuracy/claude-3.5-sonnet/it.csv': 'it',
 'accuracy/claude-3.5-sonnet/spa.csv': 'spa',
 'accuracy/gemini-1.5-flash/deu.csv': 'deu',
 'accuracy/gemini-1.5-flash/fra.csv': 'fra',
 'accuracy/gemini-1.5-flash/it.csv': 'it',
 'accuracy/gemini-1.5-flash/spa.csv': 'spa',
 'accuracy/gemini-1.5-pro/deu.csv': 'deu',
 'accuracy/gemini-1.5-pro/fra.csv': 'fra',
 'accuracy/gemini-1.5-pro/it.csv': 'it',
 'accuracy/gemini-1.5-pro/spa.csv': 'spa',
 'accuracy/gpt-3.5-turbo/deu.csv': 'deu',
 'accuracy/gpt-3.5-turbo/fra.csv': 'fra',
 'accuracy/gpt-3.5-turbo/it.csv': 'it',
 '

In [11]:
models = list(sorted(set([k.split('/')[1] for k, _ in paths.items()])))

In [12]:
accuracy = {'language': ['deu', 'fra', 'it', 'spa', 'total'], 
            'models': ['claude-3-opus',
                       'claude-3.5-haiku',
                       'claude-3.5-sonnet',
                       'gemini-1.5-flash',
                       'gemini-1.5-pro',
                       'gpt-3.5-turbo',
                       'gpt-4',
                       'gpt-4-turbo',
                       'gpt-4o',
                       'gpt-4o-mini', 
                       'total']
           }
            
accuracy = pd.DataFrame(index=accuracy['language'], columns=accuracy['models'])
accuracy
# Values will be in format acc_m/acc_f/acc_n in %
# version 2 total accuracy in % ----> Cons cannot see gender distribution

Unnamed: 0,claude-3-opus,claude-3.5-haiku,claude-3.5-sonnet,gemini-1.5-flash,gemini-1.5-pro,gpt-3.5-turbo,gpt-4,gpt-4-turbo,gpt-4o,gpt-4o-mini,total
deu,,,,,,,,,,,
fra,,,,,,,,,,,
it,,,,,,,,,,,
spa,,,,,,,,,,,
total,,,,,,,,,,,


In [13]:
# gender_prefixes = ['m', 'f', 'n']

save = {model: {lang: {gender: {'total': 0, 'correct': 0} for gender in gender_prefixes} for lang in ['deu', 'fra', 'it', 'spa']} for model in models}

for path, lang in paths.items():
    temp = []
    df = pd.read_csv(path)
    
    for gender in gender_prefixes:
        total = df[f'{lang}_{gender}'].notna().sum()
        correct = df[f'is_{gender}'].sum()
        # print(gender, total, correct)
        
        save[path.split('/')[1]][lang][gender]['total'] = total
        save[path.split('/')[1]][lang][gender]['correct'] = correct
        
        if total == 0:
            temp.append('-')
        else:
            # correct = df[f'is_{gender}'].sum()
            temp.append(f"{correct / total:.2}")
    
    
    accuracy.loc[lang, path.split('/')[1]] = '/'.join(temp)

In [14]:
# total by model
for model, lang_counts in save.items():
    temp = []
    for gender in gender_prefixes:
        total, correct = 0, 0
        for _, gender_counts in lang_counts.items():
            total += gender_counts[gender]['total']
            correct += gender_counts[gender]['correct']
            
        if total == 0:
            temp.append('-')
        else:
            temp.append(f"{correct / total:.2}")

    accuracy.loc['total', model] = '/'.join(temp)
    
# total by language

for lang in ['deu', 'fra', 'it', 'spa']:
    temp = []
    
    for gender in gender_prefixes:
        total, correct = 0, 0
        for model in models:
            total += save[model][lang][gender]['total']
            correct += save[model][lang][gender]['correct']
            
        if total == 0:
            temp.append('-')
        else:
            temp.append(f"{correct / total:.2}")
            
    accuracy.loc[lang, 'total'] = '/'.join(temp)

In [15]:
accuracy

Unnamed: 0,claude-3-opus,claude-3.5-haiku,claude-3.5-sonnet,gemini-1.5-flash,gemini-1.5-pro,gpt-3.5-turbo,gpt-4,gpt-4-turbo,gpt-4o,gpt-4o-mini,total
deu,0.83/0.76/0.5,0.78/0.66/0.5,0.88/0.87/1.0,0.72/0.56/0.0,0.81/0.75/1.0,0.76/0.07/0.5,0.73/0.016/0.0,0.8/0.26/1.0,0.84/0.71/1.0,0.7/0.63/0.5,0.79/0.53/0.6
fra,0.89/0.86/0.91,0.86/0.85/0.74,0.88/0.87/1.0,0.81/0.79/0.83,0.89/0.86/0.91,0.82/0.19/0.83,0.86/0.0059/0.83,0.89/0.26/0.87,0.86/0.81/0.83,0.85/0.73/0.87,0.86/0.62/0.86
it,0.88/0.83/0.75,0.86/0.78/0.75,0.9/0.86/0.78,0.82/0.74/0.82,0.9/0.8/0.85,0.62/0.46/0.54,0.83/0.0/0.74,0.86/0.66/0.78,0.88/0.83/0.82,0.85/0.75/0.82,0.84/0.67/0.76
spa,0.94/0.68/0.8,0.89/0.63/0.61,0.92/0.66/0.78,0.88/0.72/0.67,0.92/0.61/0.73,0.89/0.6/0.67,0.86/0.0/0.59,0.86/0.36/0.57,0.92/0.62/0.73,0.9/0.6/0.65,0.9/0.55/0.68
total,0.88/0.79/0.79,0.84/0.73/0.7,0.89/0.82/0.82,0.8/0.69/0.76,0.87/0.76/0.82,0.78/0.3/0.63,0.81/0.0064/0.69,0.85/0.36/0.73,0.87/0.74/0.79,0.82/0.67/0.76,


In [16]:
accuracy.to_csv('accuracy/accuracy_table.csv', index=True)

In [17]:
transpose = accuracy.T

transpose.to_csv('accuracy/accuracy_Transpose.csv', index=True)

# F score

In [27]:
from sklearn.metrics import precision_score, recall_score, f1_score

def f_score(predicted_labels, true_labels):
    precision = precision_score(true_labels, predicted_labels)
    recall = recall_score(true_labels, predicted_labels)
    f1 = f1_score(true_labels, predicted_labels)
    
    return {'precision': precision, 'recall': recall, 'f1': f1}


In [30]:
save = {model: {lang: {gender: None for gender in gender_prefixes} for lang in ['deu', 'fra', 'it', 'spa']} for model in models}

for path, lang in paths.items():
    df = pd.read_csv(path)
    for gender in gender_prefixes:
        predicted_labels = df[f"is_{gender}"].tolist()
        true_labels = df[f"{lang}_{gender}"].notna().tolist()
        
        # print(len(predicted_labels), len(true_labels))
        
        model = path.split('/')[1]
        save[model][lang][gender] = f_score(predicted_labels, true_labels)
        

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
