In [1]:
import pandas as pd

def analyze_agreement(df, prefix):
    en_col, fa_col, it_col = f"{prefix}_EN", f"{prefix}_FA", f"{prefix}_IT"
    df["Full_Agreement"] = (df[en_col] == df[fa_col]) & (df[en_col] == df[it_col])

    def classify(row):
        choices = {row[en_col], row[fa_col], row[it_col]}
        if len(choices) == 1:
            return "All same"
        elif len(choices) == 3:
            return "All different"
        elif row[en_col] == row[it_col] and row[fa_col] != row[en_col]:
            return "FA diverged"
        elif row[en_col] == row[fa_col] and row[it_col] != row[en_col]:
            return "IT diverged"
        elif row[fa_col] == row[it_col] and row[en_col] != row[fa_col]:
            return "EN diverged"
        return "Other"

    df["Disagreement_Type"] = df.apply(classify, axis=1)
    agreement_pct = round(df["Full_Agreement"].mean() * 100, 2)
    disagreement = df["Disagreement_Type"].value_counts().to_dict()
    fa_dist = df[fa_col].value_counts(normalize=True).mul(100).round(2).to_dict()

    return {
        "Agreement (%)": agreement_pct,
        "All Different": disagreement.get("All different", 0),
        "FA Diverged": disagreement.get("FA diverged", 0),
        "IT Diverged": disagreement.get("IT diverged", 0),
        "EN Diverged": disagreement.get("EN diverged", 0),
        "A_FA (%)": fa_dist.get("A", 0),
        "B_FA (%)": fa_dist.get("B", 0),
        "C_FA (%)": fa_dist.get("C", 0),
        "D_FA (%)": fa_dist.get("D", 0),
    }

# Load both datasets
df_orig = pd.read_csv("LLaMA3.1_70_Merged_Multilingual.csv")
df_shfl = pd.read_csv("LLaMA3.1_Shuffled_Merged_Multilingual.csv")

# Analyze
original_metrics = analyze_agreement(df_orig, prefix="LLaMA3.1")
shuffled_metrics = analyze_agreement(df_shfl, prefix="LLaMA3.1")

# Combine into one table
summary_df = pd.DataFrame([original_metrics, shuffled_metrics], index=["LLaMA3.1 (Original)", "LLaMA3.1 (Shuffled)"])
print(summary_df)


                     Agreement (%)  All Different  FA Diverged  IT Diverged  \
LLaMA3.1 (Original)          53.33              4            5           13   
LLaMA3.1 (Shuffled)          43.33              6            7           15   

                     EN Diverged  A_FA (%)  B_FA (%)  C_FA (%)  D_FA (%)  
LLaMA3.1 (Original)            6     20.00     33.33     23.33     23.33  
LLaMA3.1 (Shuffled)            6     18.33     38.33     33.33     10.00  


In [2]:
import pandas as pd

# Manually enter the summarized results from your analysis
data = {
    "Model": ["LLaMA3.1 (Original)", "LLaMA3.1 (Shuffled)"],
    "Agreement (%)": [53.33, 43.33],
    "All Different": [36, 32],
    "FA Diverged": [5, 7],
    "IT Diverged": [13, 15],
    "EN Diverged": [6, 6],
    "A_FA (%)": [20.0, 18.0],
    "B_FA (%)": [33.0, 38.0],
    "C_FA (%)": [23.0, 33.0],
    "D_FA (%)": [23.0, 10.0],
}

df = pd.DataFrame(data)

# Generate compact LaTeX table for ECAI one-column format
latex = df.to_latex(
    index=False,
    caption="Comparison of agreement and Persian (FA) option distributions for LLaMA3.1 before and after option shuffling.",
    label="tab:llama3.1_shuffle_comparison",
    column_format="lccccccccc",
    escape=False,
    longtable=False
)

# Insert formatting commands manually
latex = latex.replace(r"\begin{tabular}", r"\begin{tabular}\small\setlength{\tabcolsep}{4pt}")
latex = latex.replace(r"\toprule", r"\toprule\normalsize")

print(latex)


\begin{table}
\caption{Comparison of agreement and Persian (FA) option distributions for LLaMA3.1 before and after option shuffling.}
\label{tab:llama3.1_shuffle_comparison}
\begin{tabular}\small\setlength{\tabcolsep}{4pt}{lccccccccc}
\toprule\normalsize
Model & Agreement (%) & All Different & FA Diverged & IT Diverged & EN Diverged & A_FA (%) & B_FA (%) & C_FA (%) & D_FA (%) \\
\midrule
LLaMA3.1 (Original) & 53.330000 & 36 & 5 & 13 & 6 & 20.000000 & 33.000000 & 23.000000 & 23.000000 \\
LLaMA3.1 (Shuffled) & 43.330000 & 32 & 7 & 15 & 6 & 18.000000 & 38.000000 & 33.000000 & 10.000000 \\
\bottomrule
\end{tabular}
\end{table}



In [3]:
import pandas as pd
import numpy as np

def bootstrap_agreement_CI(filepath, prefix, num_bootstrap=1000, confidence=0.95):
    df = pd.read_csv(filepath)
    en, fa, it = f"{prefix}_EN", f"{prefix}_FA", f"{prefix}_IT"
    df["Full_Agreement"] = (df[en] == df[fa]) & (df[en] == df[it])

    # Bootstrapping
    bootstrapped_agreements = []
    for _ in range(num_bootstrap):
        sample = df.sample(frac=1.0, replace=True)
        agreement_pct = sample["Full_Agreement"].mean() * 100
        bootstrapped_agreements.append(agreement_pct)

    lower_bound = np.percentile(bootstrapped_agreements, (1 - confidence) / 2 * 100)
    upper_bound = np.percentile(bootstrapped_agreements, (1 + confidence) / 2 * 100)
    mean_agreement = np.mean(bootstrapped_agreements)

    print(f"Bootstrapped {int(confidence*100)}% CI for agreement:")
    print(f"Mean agreement: {mean_agreement:.2f}%")
    print(f"{int(confidence*100)}% Confidence Interval: [{lower_bound:.2f}%, {upper_bound:.2f}%]")

# Run for LLaMA3.1
print("\nOriginal LLaMA3.1:")
bootstrap_agreement_CI("LLaMA3.1_70_Merged_Multilingual.csv", "LLaMA3.1")

print("\nShuffled LLaMA3.1:")
bootstrap_agreement_CI("LLaMA3.1_Shuffled_Merged_Multilingual.csv", "LLaMA3.1")




Original LLaMA3.1:
Bootstrapped 95% CI for agreement:
Mean agreement: 53.28%
95% Confidence Interval: [41.67%, 66.67%]

Shuffled LLaMA3.1:
Bootstrapped 95% CI for agreement:
Mean agreement: 43.10%
95% Confidence Interval: [31.67%, 55.00%]
