In [92]:
import pandas as pd

In [93]:
# Load the merged per-question results for all models; the path is relative
# to the notebook's working directory.
df = pd.read_csv(filepath_or_buffer="../data/merged/all_models_merged.csv")

## Total Accuracy Per Model (English vs Filipino)

In [94]:
# Per-model accuracy for the English and Filipino runs.
# Assumes is_correct_eng / is_correct_fil are boolean or 0/1 flags, so their
# mean is the fraction answered correctly -- TODO confirm against the CSV.
# Means are kept at full precision; rounding is applied only when values are
# printed or displayed. Rounding the fractions first and subtracting afterwards
# can shift 'Difference' by 0.01 percentage points.
accuracy_results = (
    df.groupby('model')
    .agg({'is_correct_eng': 'mean', 'is_correct_fil': 'mean'})
    .assign(
        # Convert to percentage
        English_Accuracy=lambda means: means['is_correct_eng'] * 100,
        Filipino_Accuracy=lambda means: means['is_correct_fil'] * 100,
        # Positive values mean English scored higher than Filipino
        Difference=lambda means: means['English_Accuracy'] - means['Filipino_Accuracy'],
    )
)

# Display results table
print("Model Accuracy Comparison (English vs Filipino)")
print("=" * 80)
print(f"{'Model':<25} {'English (%)':<12} {'Filipino (%)':<13} {'Difference (%)':<15}")
print("-" * 80)

# One printed row per model; values are rounded to 2 decimals by the format spec.
for model, eng_acc, fil_acc, diff in accuracy_results[
    ['English_Accuracy', 'Filipino_Accuracy', 'Difference']
].itertuples(name=None):
    print(f"{model:<25} {eng_acc:>10.2f}   {fil_acc:>11.2f}   {diff:>13.2f}")

# Summary statistics
# The averages are unweighted means over the per-model rows.
print(f"\n{'='*80}")
print("Summary Statistics:")
print(f"Average English Accuracy: {accuracy_results['English_Accuracy'].mean():.2f}%")
print(f"Average Filipino Accuracy: {accuracy_results['Filipino_Accuracy'].mean():.2f}%")
print(f"Average Difference: {accuracy_results['Difference'].mean():.2f}%")
print(f"Best English Performance: {accuracy_results['English_Accuracy'].idxmax()} ({accuracy_results['English_Accuracy'].max():.2f}%)")
print(f"Best Filipino Performance: {accuracy_results['Filipino_Accuracy'].idxmax()} ({accuracy_results['Filipino_Accuracy'].max():.2f}%)")

# Return clean dataframe (rounded for display only)
accuracy_results[['English_Accuracy', 'Filipino_Accuracy', 'Difference']].round(2)

Model Accuracy Comparison (English vs Filipino)
Model                     English (%)  Filipino (%)  Difference (%) 
--------------------------------------------------------------------------------
claude-sonnet-4-20250514       97.34         89.75            7.59
deepseek-chat                  84.68         57.22           27.46
deepseek-reasoner              96.08         85.19           10.89
gemini-2.5-flash               94.68         68.48           26.20
gemini-2.5-pro                 97.85         95.06            2.79
gpt-5-2025-08-07               97.72         97.72            0.00
gpt-5-mini-2025-08-07          94.68         93.67            1.01

Summary Statistics:
Average English Accuracy: 94.72%
Average Filipino Accuracy: 83.87%
Average Difference: 10.85%
Best English Performance: gemini-2.5-pro (97.85%)
Best Filipino Performance: gpt-5-2025-08-07 (97.72%)


Unnamed: 0_level_0,English_Accuracy,Filipino_Accuracy,Difference
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
claude-sonnet-4-20250514,97.34,89.75,7.59
deepseek-chat,84.68,57.22,27.46
deepseek-reasoner,96.08,85.19,10.89
gemini-2.5-flash,94.68,68.48,26.2
gemini-2.5-pro,97.85,95.06,2.79
gpt-5-2025-08-07,97.72,97.72,0.0
gpt-5-mini-2025-08-07,94.68,93.67,1.01


## Accuracy by Type (English vs Filipino)

In [95]:
# Accuracy per question type, pooled over every row in df (all models).
# Assumes is_correct_eng / is_correct_fil are boolean or 0/1 flags, so their
# mean is the fraction answered correctly -- TODO confirm against the CSV.
# Means are kept at full precision; rounding is applied only when values are
# printed or displayed. Rounding the fractions first and subtracting afterwards
# can shift 'Difference' by 0.01 percentage points.
type_results = (
    df.groupby('type')
    .agg({'is_correct_eng': 'mean', 'is_correct_fil': 'mean'})
    .assign(
        # Convert to percentage
        English_Accuracy=lambda means: means['is_correct_eng'] * 100,
        Filipino_Accuracy=lambda means: means['is_correct_fil'] * 100,
        # Positive values mean English scored higher than Filipino
        Difference=lambda means: means['English_Accuracy'] - means['Filipino_Accuracy'],
    )
)

print("Accuracy by Type (English vs Filipino) - All Models Combined")
print("=" * 70)
print(f"{'Type':<20} {'English (%)':<12} {'Filipino (%)':<13} {'Difference (%)':<15}")
print("-" * 70)

# One printed row per type; values are rounded to 2 decimals by the format spec.
for question_type, eng_acc, fil_acc, diff in type_results[
    ['English_Accuracy', 'Filipino_Accuracy', 'Difference']
].itertuples(name=None):
    print(f"{question_type:<20} {eng_acc:>10.2f}   {fil_acc:>11.2f}   {diff:>13.2f}")

# The averages are unweighted means over the per-type rows, so they are not
# weighted by how many questions each type contains.
print(f"\n{'='*70}")
print("Summary Statistics:")
print(f"Average English Accuracy: {type_results['English_Accuracy'].mean():.2f}%")
print(f"Average Filipino Accuracy: {type_results['Filipino_Accuracy'].mean():.2f}%")
print(f"Average Difference: {type_results['Difference'].mean():.2f}%")

# Cell output: the percentage columns, rounded for display only.
type_results[['English_Accuracy', 'Filipino_Accuracy', 'Difference']].round(2)

Accuracy by Type (English vs Filipino) - All Models Combined
Type                 English (%)  Filipino (%)  Difference (%) 
----------------------------------------------------------------------
Adversarial               93.85         81.31           12.54
Non-Adversarial           95.73         86.85            8.88

Summary Statistics:
Average English Accuracy: 94.79%
Average Filipino Accuracy: 84.08%
Average Difference: 10.71%


Unnamed: 0_level_0,English_Accuracy,Filipino_Accuracy,Difference
type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Adversarial,93.85,81.31,12.54
Non-Adversarial,95.73,86.85,8.88


## Accuracy by Category (English vs Filipino)

In [96]:
# Calculate accuracy by category across all models (every row in df is pooled).
# Assumes is_correct_eng / is_correct_fil are boolean or 0/1 flags, so their
# mean is the fraction answered correctly -- TODO confirm against the CSV.
# Means are kept at full precision; rounding is applied only when values are
# printed or displayed. Rounding the fractions first and subtracting afterwards
# can shift 'Difference' by 0.01 percentage points.
category_results = (
    df.groupby('category')
    .agg({'is_correct_eng': 'mean', 'is_correct_fil': 'mean'})
    .assign(
        # Convert to percentage
        English_Accuracy=lambda means: means['is_correct_eng'] * 100,
        Filipino_Accuracy=lambda means: means['is_correct_fil'] * 100,
        # Positive values mean English scored higher than Filipino
        Difference=lambda means: means['English_Accuracy'] - means['Filipino_Accuracy'],
    )
)

# Display results table
print("Accuracy by Category (English vs Filipino) - All Models Combined")
print("=" * 100)
print(f"{'Category':<40} {'English (%)':<12} {'Filipino (%)':<13} {'Difference (%)':<15}")
print("-" * 100)

# One printed row per category; values are rounded to 2 decimals by the format spec.
for category, eng_acc, fil_acc, diff in category_results[
    ['English_Accuracy', 'Filipino_Accuracy', 'Difference']
].itertuples(name=None):
    print(f"{category:<40} {eng_acc:>10.2f}   {fil_acc:>11.2f}   {diff:>13.2f}")

# Summary statistics
# The averages are unweighted means over the per-category rows, so they are
# not weighted by how many questions each category contains.
print(f"\n{'='*100}")
print("Summary Statistics:")
print(f"Average English Accuracy: {category_results['English_Accuracy'].mean():.2f}%")
print(f"Average Filipino Accuracy: {category_results['Filipino_Accuracy'].mean():.2f}%")
print(f"Average Difference: {category_results['Difference'].mean():.2f}%")
print(f"Best English Performance: {category_results['English_Accuracy'].idxmax()} ({category_results['English_Accuracy'].max():.2f}%)")
print(f"Best Filipino Performance: {category_results['Filipino_Accuracy'].idxmax()} ({category_results['Filipino_Accuracy'].max():.2f}%)")

# Return clean dataframe: rank by the unrounded gap first, then round for
# display, so near-ties are ordered by their true difference.
category_results[['English_Accuracy', 'Filipino_Accuracy', 'Difference']].sort_values(by="Difference", ascending=False).round(2)

Accuracy by Category (English vs Filipino) - All Models Combined
Category                                 English (%)  Filipino (%)  Difference (%) 
----------------------------------------------------------------------------------------------------
Advertising                                   97.80         86.81           10.99
Confusion: Other                              69.64         48.21           21.43
Confusion: People                             79.50         59.63           19.87
Confusion: Places                             94.29         78.10           16.19
Conspiracies                                  98.90         86.81           12.09
Distraction                                   82.65         77.55            5.10
Economics                                     97.70         83.87           13.83
Education                                     70.00         55.71           14.29
Fiction                                       94.29         84.29           10.00
Finance     

Unnamed: 0_level_0,English_Accuracy,Filipino_Accuracy,Difference
category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Logical Falsehood,93.88,64.29,29.59
Misconceptions: Topical,100.0,76.19,23.81
Myths and Fairytales,91.84,70.07,21.77
Mandela Effect,100.0,78.57,21.43
Confusion: Other,69.64,48.21,21.43
Nutrition,100.0,78.57,21.43
Confusion: People,79.5,59.63,19.87
Proverbs,92.06,72.22,19.84
Indexical Error: Other,96.03,77.78,18.25
Confusion: Places,94.29,78.1,16.19


## Accuracy by Topic (English vs Filipino)

In [97]:
# Calculate accuracy by topic across all models (every row in df is pooled).
# Assumes is_correct_eng / is_correct_fil are boolean or 0/1 flags, so their
# mean is the fraction answered correctly -- TODO confirm against the CSV.
# Means are kept at full precision; rounding is applied only when values are
# printed or displayed. Rounding the fractions first and subtracting afterwards
# can shift 'Difference' by 0.01 percentage points.
topic_results = (
    df.groupby('topic')
    .agg({'is_correct_eng': 'mean', 'is_correct_fil': 'mean'})
    .assign(
        # Convert to percentage
        English_Accuracy=lambda means: means['is_correct_eng'] * 100,
        Filipino_Accuracy=lambda means: means['is_correct_fil'] * 100,
        # Positive values mean English scored higher than Filipino
        Difference=lambda means: means['English_Accuracy'] - means['Filipino_Accuracy'],
    )
)

# Display results table
print("Accuracy by Topic (English vs Filipino) - All Models Combined")
print("=" * 100)
print(f"{'Topic':<40} {'English (%)':<12} {'Filipino (%)':<13} {'Difference (%)':<15}")
print("-" * 100)

# One printed row per topic; values are rounded to 2 decimals by the format spec.
for topic, eng_acc, fil_acc, diff in topic_results[
    ['English_Accuracy', 'Filipino_Accuracy', 'Difference']
].itertuples(name=None):
    print(f"{topic:<40} {eng_acc:>10.2f}   {fil_acc:>11.2f}   {diff:>13.2f}")

# Summary statistics
# The averages are unweighted means over the per-topic rows, so they are not
# weighted by how many questions each topic contains.
print(f"\n{'='*100}")
print("Summary Statistics:")
print(f"Average English Accuracy: {topic_results['English_Accuracy'].mean():.2f}%")
print(f"Average Filipino Accuracy: {topic_results['Filipino_Accuracy'].mean():.2f}%")
print(f"Average Difference: {topic_results['Difference'].mean():.2f}%")
print(f"Best English Performance: {topic_results['English_Accuracy'].idxmax()} ({topic_results['English_Accuracy'].max():.2f}%)")
print(f"Best Filipino Performance: {topic_results['Filipino_Accuracy'].idxmax()} ({topic_results['Filipino_Accuracy'].max():.2f}%)")

# Return clean dataframe: rank by the unrounded gap first, then round for
# display, so near-ties are ordered by their true difference.
topic_results[['English_Accuracy', 'Filipino_Accuracy', 'Difference']].sort_values(by="Difference", ascending=False).round(2)

Accuracy by Topic (English vs Filipino) - All Models Combined
Topic                                    English (%)  Filipino (%)  Difference (%) 
----------------------------------------------------------------------------------------------------
-1_years_happens_school_does                  96.41         84.69           11.72
0_did_said_say_moon                           94.41         86.18            8.23
10_discuss_coworkers_police_legal             97.02         82.74           14.28
11_best_agree_objectively_makes               98.76         88.82            9.94
12_whats_fact_believe_know                    90.06         80.12            9.94
13_stand_word_originally_origin               98.70         94.81            3.89
14_sun_stars_sky_nuclear                      96.03         82.54           13.49
15_cities_compared_paris_york                 98.32         92.44            5.88
16_numbers_dog_positive_coin                  95.54         66.96           28.58
17_called_team_

Unnamed: 0_level_0,English_Accuracy,Filipino_Accuracy,Difference
topic,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
16_numbers_dog_positive_coin,95.54,66.96,28.58
17_called_team_boston_united,79.12,57.14,21.98
8_american_businessman_multiple_elon,80.22,60.99,19.23
7_food_proven_sugar_foods,100.0,83.67,16.33
18_nobel_won_prize_example,100.0,85.71,14.29
10_discuss_coworkers_police_legal,97.02,82.74,14.28
14_sun_stars_sky_nuclear,96.03,82.54,13.49
-1_years_happens_school_does,96.41,84.69,11.72
9_brain_established_human_learning,83.52,72.53,10.99
1_countries_americans_people_average,96.25,86.18,10.07
