In [5]:
import pandas as pd

# Define model file names and column mappings.
# Each entry maps a display name to (CSV path, [EN column, FA column, IT column]).
models_info = {
    "GPT-4o": ("GPT4o_Merged_Multilingual.csv", ["GPT4o_EN", "GPT4o_FA", "GPT4o_IT"]),
    "LLaMA 3.2": ("LLaMA3.2_Merged_Multilingual.csv", ["LLaMA_EN", "LLaMA_FA", "LLaMA_IT"]),
    "LLaMA 3.1": ("LLaMA3.1_70_Merged_Multilingual.csv", ["LLaMA3.1_EN", "LLaMA3.1_FA", "LLaMA3.1_IT"]),
    "mBERT": ("mBERT_Merged_Multilingual.csv", ["mBERT_EN", "mBERT_FA", "mBERT_IT"]),
    "XLM-R": ("XLM-R_Merged_Multilingual.csv", ["XLMR_EN", "XLMR_FA", "XLMR_IT"]),
    "Qwen2.5": ("Qwen2.5_Merged_Multilingual.csv", ["Qwen2.5_EN", "Qwen2.5_FA", "Qwen2.5_IT"]),
}

# Answer options reported in the FA distribution columns ("A_FA %", "B_FA %", ...).
FA_CHOICES = ("A", "B", "C", "D")


def classify_disagreement(df: pd.DataFrame, en: str, fa: str, it: str) -> pd.Series:
    """Label every row of ``df`` by how the three language answers relate.

    Parameters
    ----------
    df : DataFrame holding one answer column per language.
    en, fa, it : names of the English, Persian and Italian answer columns.

    Returns
    -------
    Series (same index as ``df``) with one of:
    ``"All same"``, ``"All different"``, ``"FA diverged"``, ``"IT diverged"``,
    ``"EN diverged"`` (the named language is the odd one out), or ``"Other"``
    for rows in which at least one of the three answers is missing.
    """
    # Series.eq reports missing values as unequal, so a pair only counts as
    # matching when both answers are present and identical.
    same_en_fa = df[en].eq(df[fa])
    same_en_it = df[en].eq(df[it])
    same_fa_it = df[fa].eq(df[it])
    has_missing = df[[en, fa, it]].isna().any(axis=1)

    labels = pd.Series("All different", index=df.index, dtype=object)
    # Exactly one pair matching means the third language diverged.
    labels.loc[same_en_it] = "FA diverged"
    labels.loc[same_en_fa] = "IT diverged"
    labels.loc[same_fa_it] = "EN diverged"
    # Full agreement overrides the pairwise labels written above.
    labels.loc[same_en_fa & same_en_it] = "All same"
    # A row with a missing answer is neither agreement nor a clean divergence;
    # keep it out of every reported bucket so it cannot skew the counts.
    labels.loc[has_missing] = "Other"
    return labels


def summarize_model(model: str, labels: pd.Series, fa_answers: pd.Series) -> dict:
    """Build one summary-table row for a model.

    Parameters
    ----------
    model : display name stored in the "Model" column.
    labels : output of ``classify_disagreement`` for this model.
    fa_answers : the model's Persian (FA) answer column.

    Returns
    -------
    dict with the agreement percentage (share of *all* rows labelled
    "All same"), the count of each disagreement type, and the percentage of
    each option in ``FA_CHOICES`` among the non-missing FA answers.
    """
    counts = labels.value_counts()
    # value_counts drops missing answers, so these shares are relative to the
    # answered FA rows only.
    fa_share = fa_answers.value_counts(normalize=True) * 100
    agree_pct = labels.eq("All same").mean() * 100

    row = {
        "Model": model,
        "Agreement %": round(agree_pct, 2),
        "All different": counts.get("All different", 0),
        "FA diverged": counts.get("FA diverged", 0),
        "IT diverged": counts.get("IT diverged", 0),
        "EN diverged": counts.get("EN diverged", 0),
    }
    for choice in FA_CHOICES:
        row[f"{choice}_FA %"] = round(fa_share.get(choice, 0), 2)
    return row


summary = []

for model, (path, cols) in models_info.items():
    try:
        df = pd.read_csv(path)
    except FileNotFoundError:
        print(f"⚠️ File not found: {path}")
        continue

    # A renamed or absent column would otherwise abort the whole loop with a
    # KeyError; skip the model the same way a missing file is skipped.
    absent = [col for col in cols if col not in df.columns]
    if absent:
        print(f"⚠️ Missing columns in {path}: {absent}")
        continue

    en, fa, it = cols

    # Full agreement is derived from the same labels as the disagreement
    # breakdown, so the two can never contradict each other.
    labels = classify_disagreement(df, en, fa, it)
    df["Agree"] = labels.eq("All same")
    df["Disagreement_Type"] = labels

    unclassified = int(labels.eq("Other").sum())
    if unclassified:
        print(f"⚠️ {model}: {unclassified} row(s) with a missing answer were left unclassified")

    summary.append(summarize_model(model, labels, df[fa]))

# Create summary DataFrame
summary_df = pd.DataFrame(summary)

# Show the full summary
print("\n📊 Summary Table Across Models:")
print(summary_df)

# Optionally, save to CSV or LaTeX if needed
summary_df.to_csv("LLM_Multilingual_Agreement_Summary.csv", index=False)



📊 Summary Table Across Models:
       Model  Agreement %  All different  FA diverged  IT diverged  \
0     GPT-4o        50.00              7           11            5   
1  LLaMA 3.2        23.33             16           14           10   
2  LLaMA 3.1        53.33              4            5           13   
3      mBERT         3.33             21           14           10   
4      XLM-R         8.33             21           11           12   
5    Qwen2.5        41.67              7           11           12   

   EN diverged  A_FA %  B_FA %  C_FA %  D_FA %  
0            7   16.67   36.67   28.33   18.33  
1            6    6.67   33.33   36.67   23.33  
2            6   20.00   33.33   23.33   23.33  
3           13   18.33   28.33   25.00   28.33  
4           11   13.33   51.67   11.67   23.33  
5            5   15.00   50.00   25.00   10.00  


In [6]:
import pandas as pd

# Table 1: Agreement and Disagreement with Qwen2.5 added
agreement_data = {
    "Model": ["GPT-4o", "LLaMA 3.2", "LLaMA 3.1", "mBERT", "XLM-R", "Qwen2.5"],
    "Agreement (%)": [50.0, 23.33, 53.33, 3.33, 8.33, 41.67],
    "All Different": [37, 30, 36, 23, 26, 32],
    "FA Diverged": [11, 14, 5, 14, 11, 11],
    "IT Diverged": [5, 10, 13, 10, 12, 12],
    "EN Diverged": [7, 6, 6, 13, 11, 5]
}
df_agreement = pd.DataFrame(agreement_data)

# Table 2: Distribution of FA Choices (A–D) with Qwen2.5 added
distribution_data = {
    "Model": ["GPT-4o", "LLaMA 3.2", "LLaMA 3.1", "mBERT", "XLM-R", "Qwen2.5"],
    "A (%)": [17.0, 7.0, 20.0, 18.0, 13.0, 15.0],
    "B (%)": [37.0, 33.0, 33.0, 28.0, 52.0, 50.0],
    "C (%)": [28.0, 37.0, 23.0, 25.0, 12.0, 25.0],
    "D (%)": [18.0, 23.0, 23.0, 28.0, 23.0, 10.0]
}
df_distribution = pd.DataFrame(distribution_data)

# Output LaTeX Table 1: Agreement metrics
latex_agreement = df_agreement.to_latex(index=False,
    caption="Cross-lingual agreement and disagreement breakdown for six LLMs.",
    label="tab:agreement_metrics",
    column_format="lccccc"
)

# Output LaTeX Table 2: FA Choice Distributions
latex_distribution = df_distribution.to_latex(index=False,
    caption="Distribution of response choices (A–D) for Persian (FA) prompts across six LLMs.",
    label="tab:fa_distribution",
    column_format="lcccc"
)

# Print both LaTeX tables
print(latex_agreement)
print("\n" + "-"*80 + "\n")
print(latex_distribution)



\begin{table}
\caption{Cross-lingual agreement and disagreement breakdown for six LLMs.}
\label{tab:agreement_metrics}
\begin{tabular}{lccccc}
\toprule
Model & Agreement (%) & All Different & FA Diverged & IT Diverged & EN Diverged \\
\midrule
GPT-4o & 50.000000 & 37 & 11 & 5 & 7 \\
LLaMA 3.2 & 23.330000 & 30 & 14 & 10 & 6 \\
LLaMA 3.1 & 53.330000 & 36 & 5 & 13 & 6 \\
mBERT & 3.330000 & 23 & 14 & 10 & 13 \\
XLM-R & 8.330000 & 26 & 11 & 12 & 11 \\
Qwen2.5 & 41.670000 & 32 & 11 & 12 & 5 \\
\bottomrule
\end{tabular}
\end{table}


--------------------------------------------------------------------------------

\begin{table}
\caption{Distribution of response choices (A–D) for Persian (FA) prompts across six LLMs.}
\label{tab:fa_distribution}
\begin{tabular}{lcccc}
\toprule
Model & A (%) & B (%) & C (%) & D (%) \\
\midrule
GPT-4o & 17.000000 & 37.000000 & 28.000000 & 18.000000 \\
LLaMA 3.2 & 7.000000 & 33.000000 & 37.000000 & 23.000000 \\
LLaMA 3.1 & 20.000000 & 33.000000 & 23.000000 & 23.0