In [None]:
import pandas as pd
from scipy.stats import pearsonr
import matplotlib.pyplot as plt
import os

# Prompt categories; each one is looked up as "<category>_2.0_proportions.csv".
categories = ["STEM", "FAQ", "RIDDLES", "CREATIVE PROMPTS", "MULTIPLE", "NON"]

# One summary record per analysed category. Re-created on every run of this
# cell so re-executing it does not append duplicate rows.
results = []


def clean_length_columns(df):
    """Return the two analysed columns as numbers, dropping unusable rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains "total_words" and "english_only" columns.

    Returns
    -------
    pandas.DataFrame
        A new frame holding only those two columns, converted with
        ``pd.to_numeric(errors="coerce")`` (unparseable cells become NaN)
        and with every row containing a NaN removed. ``df`` is not modified.

    Raises
    ------
    KeyError
        If either column is missing from ``df``.
    """
    numeric = df[["total_words", "english_only"]].apply(pd.to_numeric, errors="coerce")
    return numeric.dropna()


def correlate_length_vs_english(df_clean):
    """Compute Pearson's r between "total_words" and "english_only".

    Parameters
    ----------
    df_clean : pandas.DataFrame
        Numeric, NaN-free frame as produced by ``clean_length_columns``.

    Returns
    -------
    tuple[float, float] | None
        ``(r, p_value)``, or ``None`` when fewer than two rows are available
        (``pearsonr`` raises ``ValueError`` for such input).
    """
    if len(df_clean) < 2:
        return None
    r, p = pearsonr(df_clean["total_words"], df_clean["english_only"])
    return float(r), float(p)


# processing for Gemini 2.0
for cat in categories:
    path = f"Project-statistical-eval/proportions2.0/{cat}_2.0_proportions.csv"
    if not os.path.exists(path):
        # Say why a category is absent from the summary instead of skipping silently.
        print(f"[skip] {cat}: file not found: {path}")
        continue

    df = pd.read_csv(path)
    df_clean = clean_length_columns(df)

    stats = correlate_length_vs_english(df_clean)
    if stats is None:
        print(f"[skip] {cat}: need at least 2 numeric rows, found {len(df_clean)}")
        continue

    pearson_r, pearson_p = stats
    # NOTE: rounding to 4 decimals means very small p-values are stored as 0.0;
    # read such an entry as "p < 0.00005", not as an exact zero.
    results.append({"Category": cat, "Pearson r": round(pearson_r, 4), "p-value": round(pearson_p, 4)})

    # save the figure for this category into the current working directory
    fig, ax = plt.subplots()
    ax.scatter(df_clean["total_words"], df_clean["english_only"], alpha=0.6)
    ax.set_xlabel("Total Words")
    ax.set_ylabel("English-Only Words")
    ax.set_title(f"{cat} (Gemini 2.0) — Prompt Length vs English Usage")
    ax.grid(True)
    fig.savefig(f"{cat}_2.0_scatter.png")
    plt.close(fig)  # release the figure; one is created per category

In [2]:
# Collect the per-category records into one summary table. The columns are
# named explicitly so the frame keeps the same layout even when `results`
# is empty (for example when none of the input files was found).
result_df = pd.DataFrame(results, columns=["Category", "Pearson r", "p-value"])
result_df

Unnamed: 0,Category,Pearson r,p-value
0,STEM,0.3194,0.0
1,FAQ,0.097,0.0202
2,RIDDLES,0.0675,0.1119
3,CREATIVE PROMPTS,-0.0425,0.2961
4,MULTIPLE,0.0294,0.7971
5,NON,0.2449,0.0
