In [1]:
import pandas as pd

def compute_gpt_metrics(filepath, prefix):
    df = pd.read_csv(filepath)
    en, fa, it = f"{prefix}_EN", f"{prefix}_FA", f"{prefix}_IT"

    df["Full_Agreement"] = (df[en] == df[fa]) & (df[en] == df[it])

    def classify_disagreement(row):
        choices = {row[en], row[fa], row[it]}
        if len(choices) == 1:
            return "All same"
        elif len(choices) == 3:
            return "All different"
        elif row[en] == row[it] and row[fa] != row[en]:
            return "FA diverged"
        elif row[en] == row[fa] and row[it] != row[en]:
            return "IT diverged"
        elif row[fa] == row[it] and row[en] != row[fa]:
            return "EN diverged"
        return "Other"

    df["Disagreement_Type"] = df.apply(classify_disagreement, axis=1)
    fa_freq = df[fa].value_counts(normalize=True) * 100
    disagreement_counts = df["Disagreement_Type"].value_counts()
    agreement_pct = df["Full_Agreement"].mean() * 100

    return {
        "Agreement (%)": round(agreement_pct, 2),
        "All Different": disagreement_counts.get("All different", 0),
        "FA Diverged": disagreement_counts.get("FA diverged", 0),
        "IT Diverged": disagreement_counts.get("IT diverged", 0),
        "EN Diverged": disagreement_counts.get("EN diverged", 0),
        "A_FA (%)": round(fa_freq.get("A", 0), 2),
        "B_FA (%)": round(fa_freq.get("B", 0), 2),
        "C_FA (%)": round(fa_freq.get("C", 0), 2),
        "D_FA (%)": round(fa_freq.get("D", 0), 2),
    }

# Apply to both files
original_results = compute_gpt_metrics("GPT4o_Merged_Multilingual.csv", "GPT4o")
shuffled_results = compute_gpt_metrics("GPT4o_Shuffled_Merged_Multilingual.csv", "GPT4o")

# Combine into one table
df_comparison = pd.DataFrame(
    [original_results, shuffled_results],
    index=["GPT-4o Original", "GPT-4o Shuffled"]
)

# Display
print(df_comparison)


                 Agreement (%)  All Different  FA Diverged  IT Diverged  \
GPT-4o Original           50.0              7           11            5   
GPT-4o Shuffled           50.0              4           12            9   

                 EN Diverged  A_FA (%)  B_FA (%)  C_FA (%)  D_FA (%)  
GPT-4o Original            7     16.67     36.67     28.33     18.33  
GPT-4o Shuffled            5     20.00     35.00     35.00     10.00  


In [2]:

import pandas as pd

def compute_metrics(path, prefix):
    df = pd.read_csv(path)
    en, fa, it = f"{prefix}_EN", f"{prefix}_FA", f"{prefix}_IT"

    df["Full_Agreement"] = (df[en] == df[fa]) & (df[en] == df[it])

    def classify_disagreement(row):
        choices = {row[en], row[fa], row[it]}
        if len(choices) == 1:
            return "All same"
        elif len(choices) == 3:
            return "All different"
        elif row[en] == row[it] and row[fa] != row[en]:
            return "FA diverged"
        elif row[en] == row[fa] and row[it] != row[en]:
            return "IT diverged"
        elif row[fa] == row[it] and row[en] != row[fa]:
            return "EN diverged"
        return "Other"

    df["Disagreement_Type"] = df.apply(classify_disagreement, axis=1)
    fa_freq = df[fa].value_counts(normalize=True) * 100
    disagreement_counts = df["Disagreement_Type"].value_counts()
    agreement_pct = df["Full_Agreement"].mean() * 100

    return {
        "Agreement (%)": round(agreement_pct, 2),
        "All Different": disagreement_counts.get("All different", 0),
        "FA Diverged": disagreement_counts.get("FA diverged", 0),
        "IT Diverged": disagreement_counts.get("IT diverged", 0),
        "EN Diverged": disagreement_counts.get("EN diverged", 0),
        "A_FA (%)": round(fa_freq.get("A", 0), 2),
        "B_FA (%)": round(fa_freq.get("B", 0), 2),
        "C_FA (%)": round(fa_freq.get("C", 0), 2),
        "D_FA (%)": round(fa_freq.get("D", 0), 2),
    }

# Run on original and shuffled
original = compute_metrics("GPT4o_Merged_Multilingual.csv", "GPT4o")
shuffled = compute_metrics("GPT4o_Shuffled_Merged_Multilingual.csv", "GPT4o")

# Combine into DataFrame
df_latex = pd.DataFrame(
    [original, shuffled],
    index=["GPT-4o (Original)", "GPT-4o (Shuffled)"]
).reset_index().rename(columns={"index": "Model"})

# Generate LaTeX table
latex_code = df_latex.to_latex(
    index=False,
    caption="Comparison of agreement and option distribution between original and shuffled GPT-4o prompts.",
    label="tab:gpt4o_shuffled_vs_original",
    column_format="lccccccccc"
)

# Print LaTeX code
print(latex_code)


\begin{table}
\caption{Comparison of agreement and option distribution between original and shuffled GPT-4o prompts.}
\label{tab:gpt4o_shuffled_vs_original}
\begin{tabular}{lccccccccc}
\toprule
Model & Agreement (%) & All Different & FA Diverged & IT Diverged & EN Diverged & A_FA (%) & B_FA (%) & C_FA (%) & D_FA (%) \\
\midrule
GPT-4o (Original) & 50.000000 & 7 & 11 & 5 & 7 & 16.670000 & 36.670000 & 28.330000 & 18.330000 \\
GPT-4o (Shuffled) & 50.000000 & 4 & 12 & 9 & 5 & 20.000000 & 35.000000 & 35.000000 & 10.000000 \\
\bottomrule
\end{tabular}
\end{table}



In [3]:
import pandas as pd
import numpy as np

def bootstrap_agreement_CI(filepath, prefix, num_bootstrap=1000, confidence=0.95):
    df = pd.read_csv(filepath)
    en, fa, it = f"{prefix}_EN", f"{prefix}_FA", f"{prefix}_IT"
    df["Full_Agreement"] = (df[en] == df[fa]) & (df[en] == df[it])

    # Bootstrapping
    bootstrapped_agreements = []
    for _ in range(num_bootstrap):
        sample = df.sample(frac=1.0, replace=True)
        agreement_pct = sample["Full_Agreement"].mean() * 100
        bootstrapped_agreements.append(agreement_pct)

    lower_bound = np.percentile(bootstrapped_agreements, (1 - confidence) / 2 * 100)
    upper_bound = np.percentile(bootstrapped_agreements, (1 + confidence) / 2 * 100)
    mean_agreement = np.mean(bootstrapped_agreements)

    print(f"Bootstrapped {int(confidence*100)}% CI for agreement:")
    print(f"Mean agreement: {mean_agreement:.2f}%")
    print(f"95% Confidence Interval: [{lower_bound:.2f}%, {upper_bound:.2f}%]")

# Example usage:
print("\nOriginal GPT-4o:")
bootstrap_agreement_CI("GPT4o_Merged_Multilingual.csv", "GPT4o")

print("\nShuffled GPT-4o:")
bootstrap_agreement_CI("GPT4o_Shuffled_Merged_Multilingual.csv", "GPT4o")



Original GPT-4o:
Bootstrapped 95% CI for agreement:
Mean agreement: 49.90%
95% Confidence Interval: [36.67%, 63.33%]

Shuffled GPT-4o:
Bootstrapped 95% CI for agreement:
Mean agreement: 49.88%
95% Confidence Interval: [38.33%, 61.67%]


In [1]:
import pandas as pd
import numpy as np

def bootstrap_disagreement_CI(filepath, prefix, num_bootstrap=1000, confidence=0.95):
    df = pd.read_csv(filepath)
    en, fa, it = f"{prefix}_EN", f"{prefix}_FA", f"{prefix}_IT"
    df["Full_Disagreement"] = (df[en] != df[fa]) & (df[en] != df[it]) & (df[fa] != df[it])

    # Bootstrapping
    bootstrapped_disagreements = []
    for _ in range(num_bootstrap):
        sample = df.sample(frac=1.0, replace=True)
        disagreement_pct = sample["Full_Disagreement"].mean() * 100
        bootstrapped_disagreements.append(disagreement_pct)

    lower_bound = np.percentile(bootstrapped_disagreements, (1 - confidence) / 2 * 100)
    upper_bound = np.percentile(bootstrapped_disagreements, (1 + confidence) / 2 * 100)
    mean_disagreement = np.mean(bootstrapped_disagreements)

    print(f"Bootstrapped {int(confidence*100)}% CI for full disagreement:")
    print(f"Mean disagreement: {mean_disagreement:.2f}%")
    print(f"95% Confidence Interval: [{lower_bound:.2f}%, {upper_bound:.2f}%]")

# Example usage:
print("\nOriginal GPT-4o:")
bootstrap_disagreement_CI("GPT4o_Merged_Multilingual.csv", "GPT4o")

print("\nShuffled GPT-4o:")
bootstrap_disagreement_CI("GPT4o_Shuffled_Merged_Multilingual.csv", "GPT4o")




Original GPT-4o:
Bootstrapped 95% CI for full disagreement:
Mean disagreement: 11.78%
95% Confidence Interval: [5.00%, 20.00%]

Shuffled GPT-4o:
Bootstrapped 95% CI for full disagreement:
Mean disagreement: 6.82%
95% Confidence Interval: [1.67%, 13.33%]
