In [None]:
# ==========================================
# SETUP BLOCK
# ==========================================

import os
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import chi2_contingency, spearmanr, fisher_exact

sys.path.append(os.path.abspath(".."))

from Helper_functions import (
    clean_up_subjects,
    calculate_true_false_score,
    calculate_internet_terms_understanding_score,
    group_internet_understanding,
    prepare_pair,
    order_crosstab,
)

from lists import (
    multiple_choice_questions,
    LIKERT_VALUE_MAPS,
    comparison_pairs_by_AI_questions,
    cross_tab_titles_and_colors,
)

from answer_categories import question_orders, COLUMN_ALIASES

# Plot defaults shared by every figure in the notebook.
sns.set(style="whitegrid")
plt.rcParams["figure.figsize"] = (10, 5)

# Read the Excel workbook, strip whitespace from the header names, then apply
# the project's column aliases.
DATA_FILE = os.path.join("..", "Data", "Fertige Tabelle.xlsx")
df = pd.read_excel(DATA_FILE)
df.columns = df.columns.astype(str).str.strip()
df = df.rename(columns=COLUMN_ALIASES)

# Clean each subject column that is actually present in the workbook.
subject_columns = ("Most used subjects", "Preferred Subjects", "Least preferred Subjects")
for col in subject_columns:
    if col not in df.columns:
        continue
    df = clean_up_subjects(df, col)

# The true/false score is only computed when all six item columns exist.
true_false_cols = [f"True/False_{i}" for i in range(1, 7)]
if set(true_false_cols).issubset(df.columns):
    df = calculate_true_false_score(df)

# The internet-terms score and its grouping need at least one matching column.
has_internet_terms = any(name.startswith("Internet terms_") for name in df.columns)
if has_internet_terms:
    df = group_internet_understanding(calculate_internet_terms_understanding_score(df))

print("Setup complete – DataFrame loaded and preprocessed")
print(f"Rows: {len(df)}, Columns: {len(df.columns)}")


In [None]:
# ==========================================
# BUILD COUNT COLUMNS FROM MULTI-CHOICE TEXT
# ==========================================

# count column based on number of answers given in MC question
def build_count_from_multichoice(df_in: pd.DataFrame, source_col: str, new_col: str) -> pd.DataFrame:
    """Return a copy of ``df_in`` with a column counting distinct multi-choice answers.

    Each cell of ``source_col`` is read as text in which individual answers are
    separated by ``,`` or ``;``. Square brackets and quote characters are
    removed first, so a stringified list such as ``"['a', 'b']"`` counts as two
    answers. Duplicate answers within one cell are counted once.

    Parameters
    ----------
    df_in : pd.DataFrame
        Input frame. It is not modified; the new column is added to a copy.
    source_col : str
        Name of the column holding the multi-choice answer text.
    new_col : str
        Name of the count column to create (nullable ``Int64`` dtype).

    Returns
    -------
    pd.DataFrame
        A copy of ``df_in`` including ``new_col``. If ``source_col`` is missing,
        a message is printed and ``df_in`` itself is returned unchanged.
    """
    if source_col not in df_in.columns:
        print(f"Source column '{source_col}' not in DataFrame; cannot build '{new_col}'.")
        return df_in

    def _count_items(x):
        # Missing cells and textual placeholders for "missing" stay missing.
        if pd.isna(x):
            return np.nan
        s = str(x).strip()
        if s == "" or s.lower() in {"nan", "none"}:
            return np.nan

        # Drop list/quote decoration, then treat ';' and ',' as the same separator.
        for ch in "[]'\"":
            s = s.replace(ch, "")
        answers = {part.strip() for part in s.replace(";", ",").split(",")}
        answers.discard("")

        # A cell consisting only of separators carries no answer at all.
        return len(answers) if answers else np.nan

    # Work on a copy so the caller's frame is left untouched and re-running the
    # cell always starts from the same input.
    df_out = df_in.copy()
    df_out[new_col] = df_out[source_col].apply(_count_items).astype("Int64")
    print(f"✅ Built count column '{new_col}' from '{source_col}'.")
    return df_out


# For each multi-choice question, derive its "(Count)" column and register the
# observed range of counts as the category order of that column.
for source_col in ("Reasons to use AI", "Purposes to use AI"):
    count_col = f"{source_col} (Count)"
    df = build_count_from_multichoice(df, source_col, count_col)

    try:
        min_c = int(df[count_col].min())
        max_c = int(df[count_col].max())
    except (KeyError, TypeError, ValueError) as exc:
        # KeyError: the source column was missing, so no count column exists.
        # TypeError / ValueError: every count is missing, so min/max cannot be
        # converted to int. Either way the order is left unset, but visibly so.
        print(f"No category order registered for '{count_col}': {exc!r}")
        continue

    question_orders[count_col] = list(range(min_c, max_c + 1))

In [None]:
# ==========================================
# BAR CHARTS (COUNTS, stacked)
# ==========================================

for base_question, compare_list in comparison_pairs_by_AI_questions.items():
    for compare_question in compare_list:

        # Flag which side is a multiple-choice question; prepare_pair receives both flags.
        left_is_multi = base_question in multiple_choice_questions
        right_is_multi = compare_question in multiple_choice_questions
        data = prepare_pair(df, base_question, compare_question, left_is_multi, right_is_multi)

        if data.empty:
            print(f"No overlapping data for '{base_question}' × '{compare_question}'. Skipping.")
            continue

        # Contingency table of counts; duplicate labels on either axis are summed together.
        ct = pd.crosstab(data[base_question], data[compare_question])
        if not ct.columns.is_unique:
            ct = ct.T.groupby(level=0).sum().T
        if not ct.index.is_unique:
            ct = ct.groupby(level=0).sum()

        ct = order_crosstab(ct, base_question, compare_question)
        if ct.empty:
            continue

        print(f"\n{compare_question} within each {base_question} (counts)")
        print(ct)

        # Entry layout: first element is the title, any further elements are bar colours.
        title_and_colors = cross_tab_titles_and_colors.get(
            (base_question, compare_question),
            [f"{compare_question} within each {base_question} (stacked counts)"]
        )
        plot_title = title_and_colors[0]

        # Fall back to a blue palette of 3 to 7 shades when no colours are configured.
        if len(title_and_colors) > 1:
            bar_colors = title_and_colors[1:]
        else:
            bar_colors = sns.color_palette("Blues", n_colors=max(3, min(7, ct.shape[1])))

        ax = ct.plot(kind="bar", stacked=True, figsize=(10, 6), color=bar_colors, width=0.9)
        ax.set_xlabel(base_question)
        ax.set_ylabel("Count")
        ax.set_title(plot_title)

        plt.xticks(rotation=45, ha="right")
        plt.legend(title=compare_question, bbox_to_anchor=(1.01, 1), loc="upper left")
        plt.tight_layout()
        plt.show()


In [None]:
# ==========================================
# SIGNIFICANCE TESTS for AI × AI crosstabs
# ==========================================

def cramers_v_corrected(chi2, ct):
    """Compute a bias-corrected Cramér's V for a contingency table.

    Parameters
    ----------
    chi2 : float
        Chi-square statistic of the table.
    ct : pd.DataFrame
        Contingency table of counts; only ``.values`` and ``.shape`` are used.

    Returns
    -------
    float
        The corrected effect size, or NaN when the table is empty, has a single
        observation, or the corrected smaller dimension is not positive.
    """
    total = ct.values.sum()
    if total == 0:
        return np.nan

    n_rows, n_cols = ct.shape
    if total > 1:
        shrink = total - 1
        # Subtract the expected bias from phi² and shrink both table dimensions.
        phi2_adj = max(0, chi2 / total - ((n_cols - 1) * (n_rows - 1)) / shrink)
        rows_adj = n_rows - ((n_rows - 1) ** 2) / shrink
        cols_adj = n_cols - ((n_cols - 1) ** 2) / shrink
    else:
        # With a single observation the correction terms are undefined.
        phi2_adj, rows_adj, cols_adj = np.nan, n_rows, n_cols

    smaller_dim = min(rows_adj - 1, cols_adj - 1)
    if smaller_dim > 0:
        return np.sqrt(phi2_adj / smaller_dim)
    return np.nan

# Column layout of the results table. Passing it to the DataFrame constructor
# keeps the frame sortable and exportable even when no pair was testable.
AI_AI_RESULT_COLUMNS = [
    "base_question",
    "compare_question",
    "n",
    "rows",
    "cols",
    "chi2",
    "dof",
    "p",
    "cramers_v",
    "min_expected",
    "prop_expected_lt5",
]

ai_ai_tests = []

for base_question, compare_list in comparison_pairs_by_AI_questions.items():
    for compare_question in compare_list:

        left_is_multi = base_question in multiple_choice_questions
        right_is_multi = compare_question in multiple_choice_questions

        data = prepare_pair(df, base_question, compare_question, left_is_multi, right_is_multi)
        if data.empty:
            continue

        ct = pd.crosstab(data[base_question], data[compare_question])

        # Collapse duplicate labels on either axis by summing their counts.
        if not ct.columns.is_unique:
            ct = ct.T.groupby(level=0).sum().T
        if not ct.index.is_unique:
            ct = ct.groupby(level=0).sum()

        ct = order_crosstab(ct, base_question, compare_question)

        # All-zero rows/columns would yield zero expected counts; drop them and
        # require at least a 2×2 table for the chi-square test.
        ct = ct.loc[ct.sum(axis=1) > 0, ct.sum(axis=0) > 0]
        if ct.shape[0] < 2 or ct.shape[1] < 2:
            continue

        chi2, p, dof, expected = chi2_contingency(ct.values, correction=False)
        expected_df = pd.DataFrame(expected, index=ct.index, columns=ct.columns)

        v = cramers_v_corrected(chi2, ct)
        # Diagnostics for the expected cell counts: the smallest expected
        # count and the share of cells with an expected count below 5.
        min_exp = float(expected_df.values.min())
        prop_lt5 = float((expected_df.values < 5).mean())
        n_total = int(ct.values.sum())

        ai_ai_tests.append({
            "base_question": base_question,
            "compare_question": compare_question,
            "n": n_total,
            "rows": ct.shape[0],
            "cols": ct.shape[1],
            "chi2": float(chi2),
            "dof": int(dof),
            "p": float(p),
            "cramers_v": float(v),
            "min_expected": min_exp,
            "prop_expected_lt5": prop_lt5,
        })

# Explicit columns: an empty result list would otherwise produce a frame
# without columns and make sort_values raise KeyError.
ai_ai_tests_df = (
    pd.DataFrame(ai_ai_tests, columns=AI_AI_RESULT_COLUMNS)
    .sort_values(["base_question", "compare_question"])
    .reset_index(drop=True)
)

print(f"AI×AI significance finished. Total tests: {len(ai_ai_tests_df)}")

EXPORT = True
EXPORT_PATH = os.path.join("..", "Data", "test_results", "ai_ai_chi_square_results.xlsx")
if EXPORT:
    # Create the output folder on first run so the export does not fail on a fresh checkout.
    os.makedirs(os.path.dirname(EXPORT_PATH), exist_ok=True)
    with pd.ExcelWriter(EXPORT_PATH, engine="xlsxwriter") as writer:
        ai_ai_tests_df.to_excel(writer, index=False, sheet_name="chi_square_results")
    print(f"Exported to: {EXPORT_PATH}")




In [None]:
# ==========================================
# SPEARMAN TREND TESTS (ordinal × ordinal)
# ==========================================

def map_to_numeric_flexible(series, varname):
    """Convert a column of answers to numbers using the first lookup that matches.

    Lookups are tried in this order, each against the stripped string form of
    the values:

    1. the entry for ``varname`` in ``LIKERT_VALUE_MAPS`` (label -> value),
    2. the position of each category in ``question_orders[varname]``.

    A lookup is accepted as soon as it maps at least one value; unmatched values
    become NaN. If neither lookup applies, the original series is parsed with
    ``pd.to_numeric(..., errors="coerce")``.
    """
    text = series.astype("string").str.strip()

    def _lookup_tables():
        # Generated lazily so the second table is only built if the first fails.
        if varname in LIKERT_VALUE_MAPS:
            yield {str(label): value for label, value in LIKERT_VALUE_MAPS[varname].items()}
        if varname in question_orders:
            yield {str(category): rank for rank, category in enumerate(question_orders[varname])}

    for table in _lookup_tables():
        mapped = text.map(table)
        if mapped.notna().any():
            return mapped.astype(float)

    return pd.to_numeric(series, errors="coerce")

def run_spearman_pair(df_in, x, y):
    """Run a Spearman rank correlation between two columns of ``df_in``.

    Parameters
    ----------
    df_in : pd.DataFrame
        Frame holding both variables.
    x, y : str
        Column names. A multiple-choice question is only accepted when its name
        contains ``"(Count)"``.

    Returns
    -------
    dict or None
        ``None`` when a column is missing, a raw multiple-choice column is
        requested, or no row has both values. Otherwise a dict with keys
        ``x``, ``y``, ``n``, ``rho``, ``p`` and ``direction``. ``direction`` is
        ``"positive"``, ``"negative"``, ``"zero"``, or ``"undefined"`` when rho
        is NaN.
    """
    if x not in df_in.columns or y not in df_in.columns:
        return None

    if (x in multiple_choice_questions and "(Count)" not in x) or (y in multiple_choice_questions and "(Count)" not in y):
        return None

    X = map_to_numeric_flexible(df_in[x], x)
    Y = map_to_numeric_flexible(df_in[y], y)
    # After dropping incomplete rows, cast to plain floats so nullable dtypes
    # (e.g. Int64 count columns) reach scipy as ordinary NumPy arrays.
    data = pd.DataFrame({"x": X, "y": Y}).dropna().astype(float)
    if data.empty:
        return None

    rho, p = spearmanr(data["x"].to_numpy(), data["y"].to_numpy())

    # rho is NaN when a variable is constant or there are too few observations;
    # NaN fails both comparisons below, so it must not fall through to "zero".
    if np.isnan(rho):
        direction = "undefined"
    elif rho > 0:
        direction = "positive"
    elif rho < 0:
        direction = "negative"
    else:
        direction = "zero"

    return {"x": x, "y": y, "n": int(len(data)), "rho": float(rho), "p": float(p),
            "direction": direction}

# Variable pairs for which a monotonic trend is tested.
spearman_pairs = [
    ("Use AI school and freetime", "Usefulness AI"),
    ("Use AI school and freetime", "Reliability AI"),
    ("Use AI school and freetime", "Mates using AI"),
    ("Use AI school and freetime", "Deal with AI"),
    ("Reliability AI", "Teachers preparing lessons"),
    ("Reliability AI", "Teachers giving grades"),
    ("Frequency use of AI_school", "Help of AI"),
    ("Frequency use of AI_school", "Reasons to use AI (Count)"),
    ("Frequency use of AI_school", "Purposes to use AI (Count)"),
]

# Keys of the dict returned by run_spearman_pair, in output order.
SPEARMAN_RESULT_COLUMNS = ["x", "y", "n", "rho", "p", "direction"]

rows = []
for x, y in spearman_pairs:
    out = run_spearman_pair(df, x, y)
    # run_spearman_pair returns None for pairs it cannot test.
    if out is not None:
        rows.append(out)

# Explicit columns: with no successful test the frame would otherwise have no
# columns and sort_values would raise KeyError.
spearman_summary = (
    pd.DataFrame(rows, columns=SPEARMAN_RESULT_COLUMNS)
    .sort_values(["x", "y"])
    .reset_index(drop=True)
)
print(f"Spearman tests completed: {len(spearman_summary)}")

EXPORT = True
EXPORT_PATH = os.path.join("..", "Data", "test_results", "ai_ai_spearman_trends.xlsx")
if EXPORT:
    # Create the output folder on first run so the export does not fail on a fresh checkout.
    os.makedirs(os.path.dirname(EXPORT_PATH), exist_ok=True)
    with pd.ExcelWriter(EXPORT_PATH, engine="xlsxwriter") as writer:
        spearman_summary.to_excel(writer, index=False, sheet_name="spearman_trends")
    print(f"Exported to: {EXPORT_PATH}")



In [None]:
# ==========================================
# PAIRWISE 2×2 TESTS (auto Fisher or Chi-square)
# ==========================================

from itertools import combinations

def test_2x2(a, b, c, d, alternative="two-sided", prefer="auto", yates=False):
    table = np.array([[int(a), int(b)], [int(c), int(d)]], dtype=int)

    row_sums = table.sum(axis=1)
    col_sums = table.sum(axis=0)
    if (row_sums[0] == 0) or (row_sums[1] == 0) or (col_sums[0] == 0) or (col_sums[1] == 0):
        return {"method": "skip", "p": np.nan, "chi2": np.nan, "dof": 1, "odds_ratio": np.nan}

    N = table.sum()
    expected = np.outer(row_sums, col_sums) / N
    min_exp = expected.min()

    method = "chi2"
    if prefer == "fisher" or (prefer == "auto" and (min_exp < 5 or (table == 0).any())):
        method = "fisher"

    if method == "fisher":
        orat, p = fisher_exact(table, alternative=alternative)
        return {"method": "fisher", "p": float(p), "chi2": np.nan, "dof": 1, "odds_ratio": float(orat) if np.isfinite(orat) else np.nan}

    chi2_val, p, dof, _ = chi2_contingency(table, correction=yates)

    a_, b_, c_, d_ = table.astype(float).ravel()
    if 0 in (a_, b_, c_, d_):
        a_, b_, c_, d_ = a_ + 0.5, b_ + 0.5, c_ + 0.5, d_ + 0.5
    odds_ratio = (a_ * d_) / (b_ * c_)

    return {"method": "chi2", "p": float(p), "chi2": float(chi2_val), "dof": int(dof), "odds_ratio": float(odds_ratio)}

def pairwise_2x2_for_pair(df_in, left, right, prefer="auto", yates=False):
    """Run pairwise 2×2 tests between the groups of ``right`` for each category of ``left``.

    The two variables are cross-tabulated (rows = ``left``, columns = ``right``).
    For every row category and every pair of columns, a 2×2 table is built from
    the count in that category versus the rest of each column's total, and
    passed to ``test_2x2``.

    Returns
    -------
    pd.DataFrame
        One row per (row category, column pair) with the test result and the
        underlying counts. Empty when there is no data or fewer than two
        non-empty columns.
    """
    pair_data = prepare_pair(
        df_in,
        left,
        right,
        left in multiple_choice_questions,
        right in multiple_choice_questions,
    )
    if pair_data.empty:
        return pd.DataFrame()

    table = pd.crosstab(pair_data[left], pair_data[right])
    # Sum together duplicate labels on either axis.
    if not table.columns.is_unique:
        table = table.T.groupby(level=0).sum().T
    if not table.index.is_unique:
        table = table.groupby(level=0).sum()

    table = order_crosstab(table, left, right)
    # Keep only rows and columns that contain at least one observation.
    table = table.loc[table.sum(axis=1) > 0, table.sum(axis=0) > 0]
    n_categories, n_groups = table.shape
    if n_categories < 1 or n_groups < 2:
        return pd.DataFrame()

    group_sizes = table.sum(axis=0)
    group_pairs = list(combinations(table.columns, 2))
    reported_stats = ("method", "p", "chi2", "dof", "odds_ratio")

    records = []
    for category in table.index:
        for first, second in group_pairs:
            hits_first = int(table.loc[category, first])
            hits_second = int(table.loc[category, second])
            outcome = test_2x2(
                hits_first,
                int(group_sizes[first] - hits_first),
                hits_second,
                int(group_sizes[second] - hits_second),
                prefer=prefer,
                yates=yates,
            )
            records.append({
                "left_var": left,
                "right_var": right,
                "row_cat": category,
                "g1": first,
                "g2": second,
                **{key: outcome[key] for key in reported_stats},
                "n_g1": int(group_sizes[first]),
                "n_g2": int(group_sizes[second]),
                "x_g1": hits_first,
                "x_g2": hits_second,
            })

    return pd.DataFrame(records)

# Variable pairs (left = row variable, right = grouping variable) for the post-hoc 2×2 tests.
pairwise_2x2_pairs = [
    ("Use AI school and freetime", "Concerns AI"),
    ("Used AI", "Reliability AI"),
]

# Columns produced by pairwise_2x2_for_pair; used so that an empty result
# still exports a sheet with a header row.
PAIRWISE_RESULT_COLUMNS = [
    "left_var", "right_var", "row_cat", "g1", "g2",
    "method", "p", "chi2", "dof", "odds_ratio",
    "n_g1", "n_g2", "x_g1", "x_g2",
]

all_2x2 = []
for left, right in pairwise_2x2_pairs:
    out = pairwise_2x2_for_pair(df, left, right, prefer="auto", yates=False)
    # Pairs without usable data come back as an empty frame and are left out.
    if not out.empty:
        all_2x2.append(out)

if all_2x2:
    pairwise_results = pd.concat(all_2x2, ignore_index=True)
else:
    pairwise_results = pd.DataFrame(columns=PAIRWISE_RESULT_COLUMNS)
print(f"Pairwise 2×2 tests completed: {len(pairwise_results)}")

EXPORT = True
EXPORT_PATH = os.path.join("..", "Data", "test_results", "ai_ai_posthoc_nominal_2x2.xlsx")
if EXPORT:
    # Create the output folder on first run so the export does not fail on a fresh checkout.
    os.makedirs(os.path.dirname(EXPORT_PATH), exist_ok=True)
    with pd.ExcelWriter(EXPORT_PATH, engine="xlsxwriter") as writer:
        pairwise_results.to_excel(writer, index=False, sheet_name="pairwise_2x2_tests")
    print(f"Exported to: {EXPORT_PATH}")
