In [78]:
import pandas as pd

In [79]:
df = pd.read_csv("../data/merged/all_models_merged.csv")

## Total Accuracy Per Model (English vs Filipino)

In [80]:
# Calculate accuracy for each model
accuracy_results = df.groupby('model').agg({
    'is_correct_eng': 'mean',
    'is_correct_fil': 'mean'
}).round(4)

# Convert to percentage
accuracy_results['English_Accuracy'] = accuracy_results['is_correct_eng'] * 100
accuracy_results['Filipino_Accuracy'] = accuracy_results['is_correct_fil'] * 100

# Calculate difference
accuracy_results['Difference'] = accuracy_results['English_Accuracy'] - accuracy_results['Filipino_Accuracy']

# Display results table
print("Model Accuracy Comparison (English vs Filipino)")
print("=" * 80)
print(f"{'Model':<25} {'English (%)':<12} {'Filipino (%)':<13} {'Difference (%)':<15}")
print("-" * 80)

for model in accuracy_results.index:
    eng_acc = accuracy_results.loc[model, 'English_Accuracy']
    fil_acc = accuracy_results.loc[model, 'Filipino_Accuracy']
    diff = accuracy_results.loc[model, 'Difference']
    print(f"{model:<25} {eng_acc:>10.2f}   {fil_acc:>11.2f}   {diff:>13.2f}")

# Summary statistics
print(f"\n{'='*80}")
print("Summary Statistics:")
print(f"Average English Accuracy: {accuracy_results['English_Accuracy'].mean():.2f}%")
print(f"Average Filipino Accuracy: {accuracy_results['Filipino_Accuracy'].mean():.2f}%")
print(f"Average Difference: {accuracy_results['Difference'].mean():.2f}%")
print(f"Best English Performance: {accuracy_results['English_Accuracy'].idxmax()} ({accuracy_results['English_Accuracy'].max():.2f}%)")
print(f"Best Filipino Performance: {accuracy_results['Filipino_Accuracy'].idxmax()} ({accuracy_results['Filipino_Accuracy'].max():.2f}%)")

# Return clean dataframe
accuracy_results[['English_Accuracy', 'Filipino_Accuracy', 'Difference']].round(2)

Model Accuracy Comparison (English vs Filipino)
Model                     English (%)  Filipino (%)  Difference (%) 
--------------------------------------------------------------------------------
claude-sonnet-4-20250514       97.34         89.75            7.59
deepseek-chat                  84.68         57.22           27.46
deepseek-reasoner              96.08         85.19           10.89
gemini-2.5-flash               94.68         68.48           26.20
gemini-2.5-pro                 97.85         95.06            2.79
gpt-5-2025-08-07               97.72         97.72            0.00
gpt-5-mini-2025-08-07          94.68         93.67            1.01

Summary Statistics:
Average English Accuracy: 94.72%
Average Filipino Accuracy: 83.87%
Average Difference: 10.85%
Best English Performance: gemini-2.5-pro (97.85%)
Best Filipino Performance: gpt-5-2025-08-07 (97.72%)


Unnamed: 0_level_0,English_Accuracy,Filipino_Accuracy,Difference
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
claude-sonnet-4-20250514,97.34,89.75,7.59
deepseek-chat,84.68,57.22,27.46
deepseek-reasoner,96.08,85.19,10.89
gemini-2.5-flash,94.68,68.48,26.2
gemini-2.5-pro,97.85,95.06,2.79
gpt-5-2025-08-07,97.72,97.72,0.0
gpt-5-mini-2025-08-07,94.68,93.67,1.01


## Type

In [81]:
type_results = df.groupby('type').agg({
    'is_correct_eng': 'mean',
    'is_correct_fil': 'mean'
}).round(4)

type_results['English_Accuracy'] = type_results['is_correct_eng'] * 100
type_results['Filipino_Accuracy'] = type_results['is_correct_fil'] * 100

type_results['Difference'] = type_results['English_Accuracy'] - type_results['Filipino_Accuracy']

print("Accuracy by Type (English vs Filipino) - All Models Combined")
print("=" * 70)
print(f"{'Type':<20} {'English (%)':<12} {'Filipino (%)':<13} {'Difference (%)':<15}")
print("-" * 70)

for question_type in type_results.index:
    eng_acc = type_results.loc[question_type, 'English_Accuracy']
    fil_acc = type_results.loc[question_type, 'Filipino_Accuracy']
    diff = type_results.loc[question_type, 'Difference']
    print(f"{question_type:<20} {eng_acc:>10.2f}   {fil_acc:>11.2f}   {diff:>13.2f}")

print(f"\n{'='*70}")
print("Summary Statistics:")
print(f"Average English Accuracy: {type_results['English_Accuracy'].mean():.2f}%")
print(f"Average Filipino Accuracy: {type_results['Filipino_Accuracy'].mean():.2f}%")
print(f"Average Difference: {type_results['Difference'].mean():.2f}%")

type_results[['English_Accuracy', 'Filipino_Accuracy', 'Difference']].round(2)

Accuracy by Type (English vs Filipino) - All Models Combined
Type                 English (%)  Filipino (%)  Difference (%) 
----------------------------------------------------------------------
Adversarial               93.85         81.31           12.54
Non-Adversarial           95.73         86.85            8.88

Summary Statistics:
Average English Accuracy: 94.79%
Average Filipino Accuracy: 84.08%
Average Difference: 10.71%


Unnamed: 0_level_0,English_Accuracy,Filipino_Accuracy,Difference
type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Adversarial,93.85,81.31,12.54
Non-Adversarial,95.73,86.85,8.88


## Category

In [82]:
# Accuracy by topic (excluding placeholder topic)
print("Accuracy by Topic (excluding placeholder)")
print("=" * 50)

skip_topics = {'-1_years_happens_school_does'}
filtered_df = df[~df['topic'].isin(skip_topics)].copy()

# Sanity check
if filtered_df.empty:
    raise ValueError("All rows were filtered out by skip_topics; check topic labels.")

results = []
for topic in filtered_df['topic'].unique():
    topic_data = filtered_df[filtered_df['topic'] == topic]

    eng_acc = topic_data['is_correct_eng'].mean() * 100
    fil_acc = topic_data['is_correct_fil'].mean() * 100
    diff = eng_acc - fil_acc

    results.append({
        'topic': topic,
        'n': len(topic_data),
        'eng_acc': eng_acc,
        'fil_acc': fil_acc,
        'diff_pp': diff
    })

topic_results = pd.DataFrame(results).sort_values(by='diff_pp', ascending=False)
print(topic_results)

topK = topic_results.nlargest(3, 'diff_pp')
botK = topic_results.nsmallest(3, 'diff_pp')

print("\nTop 3 topics by (English − Filipino) pp difference:")
print(topK[['topic', 'n', 'eng_acc', 'fil_acc', 'diff_pp']])

print("\nBottom 3 topics by (English − Filipino) pp difference:")
print(botK[['topic', 'n', 'eng_acc', 'fil_acc', 'diff_pp']])

topic_results

Accuracy by Topic (excluding placeholder)
                                   topic    n     eng_acc    fil_acc  \
9           16_numbers_dog_positive_coin  112   95.535714  66.964286   
17          17_called_team_boston_united   91   79.120879  57.142857   
16  8_american_businessman_multiple_elon  182   80.219780  60.989011   
2              7_food_proven_sugar_foods  196  100.000000  83.673469   
12     10_discuss_coworkers_police_legal  168   97.023810  82.738095   
18            18_nobel_won_prize_example   84  100.000000  85.714286   
5               14_sun_stars_sky_nuclear  126   96.031746  82.539683   
4     9_brain_established_human_learning  182   83.516484  72.527473   
11  1_countries_americans_people_average  427   96.252927  86.182670   
10            12_whats_fact_believe_know  161   90.062112  80.124224   
8        11_best_agree_objectively_makes  161   98.757764  88.819876   
7       2_happens_effects_mirror_suspect  343   94.169096  84.256560   
13             4_banne

Unnamed: 0,topic,n,eng_acc,fil_acc,diff_pp
9,16_numbers_dog_positive_coin,112,95.535714,66.964286,28.571429
17,17_called_team_boston_united,91,79.120879,57.142857,21.978022
16,8_american_businessman_multiple_elon,182,80.21978,60.989011,19.230769
2,7_food_proven_sugar_foods,196,100.0,83.673469,16.326531
12,10_discuss_coworkers_police_legal,168,97.02381,82.738095,14.285714
18,18_nobel_won_prize_example,84,100.0,85.714286,14.285714
5,14_sun_stars_sky_nuclear,126,96.031746,82.539683,13.492063
4,9_brain_established_human_learning,182,83.516484,72.527473,10.989011
11,1_countries_americans_people_average,427,96.252927,86.18267,10.070258
10,12_whats_fact_believe_know,161,90.062112,80.124224,9.937888


In [83]:
category_results.shape

(37, 5)

## Topic

In [84]:
# Calculate accuracy by topic across all models
topic_results = df.groupby('topic').agg({
    'is_correct_eng': 'mean',
    'is_correct_fil': 'mean'
}).round(4)

# Convert to percentage
topic_results['English_Accuracy'] = topic_results['is_correct_eng'] * 100
topic_results['Filipino_Accuracy'] = topic_results['is_correct_fil'] * 100

# Calculate difference
topic_results['Difference'] = topic_results['English_Accuracy'] - topic_results['Filipino_Accuracy']

# Display results table
print("Accuracy by Topic (English vs Filipino) - All Models Combined")
print("=" * 100)
print(f"{'Topic':<40} {'English (%)':<12} {'Filipino (%)':<13} {'Difference (%)':<15}")
print("-" * 100)

for topic in topic_results.index:
    eng_acc = topic_results.loc[topic, 'English_Accuracy']
    fil_acc = topic_results.loc[topic, 'Filipino_Accuracy']
    diff = topic_results.loc[topic, 'Difference']
    print(f"{topic:<40} {eng_acc:>10.2f}   {fil_acc:>11.2f}   {diff:>13.2f}")

# Summary statistics
print(f"\n{'='*100}")
print("Summary Statistics:")
print(f"Average English Accuracy: {topic_results['English_Accuracy'].mean():.2f}%")
print(f"Average Filipino Accuracy: {topic_results['Filipino_Accuracy'].mean():.2f}%")
print(f"Average Difference: {topic_results['Difference'].mean():.2f}%")
print(f"Best English Performance: {topic_results['English_Accuracy'].idxmax()} ({topic_results['English_Accuracy'].max():.2f}%)")
print(f"Best Filipino Performance: {topic_results['Filipino_Accuracy'].idxmax()} ({topic_results['Filipino_Accuracy'].max():.2f}%)")

# Return clean dataframe
topic_results[['English_Accuracy', 'Filipino_Accuracy', 'Difference']].round(2).sort_values(by="Difference", ascending=False)       

Accuracy by Topic (English vs Filipino) - All Models Combined
Topic                                    English (%)  Filipino (%)  Difference (%) 
----------------------------------------------------------------------------------------------------
-1_years_happens_school_does                  96.41         84.69           11.72
0_did_said_say_moon                           94.41         86.18            8.23
10_discuss_coworkers_police_legal             97.02         82.74           14.28
11_best_agree_objectively_makes               98.76         88.82            9.94
12_whats_fact_believe_know                    90.06         80.12            9.94
13_stand_word_originally_origin               98.70         94.81            3.89
14_sun_stars_sky_nuclear                      96.03         82.54           13.49
15_cities_compared_paris_york                 98.32         92.44            5.88
16_numbers_dog_positive_coin                  95.54         66.96           28.58
17_called_team_

Unnamed: 0_level_0,English_Accuracy,Filipino_Accuracy,Difference
topic,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
16_numbers_dog_positive_coin,95.54,66.96,28.58
17_called_team_boston_united,79.12,57.14,21.98
8_american_businessman_multiple_elon,80.22,60.99,19.23
7_food_proven_sugar_foods,100.0,83.67,16.33
18_nobel_won_prize_example,100.0,85.71,14.29
10_discuss_coworkers_police_legal,97.02,82.74,14.28
14_sun_stars_sky_nuclear,96.03,82.54,13.49
-1_years_happens_school_does,96.41,84.69,11.72
9_brain_established_human_learning,83.52,72.53,10.99
1_countries_americans_people_average,96.25,86.18,10.07
