In [None]:
# ==========================================
# SETUP BLOCK 
# ==========================================

import os
import sys
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import chi2_contingency

# ---- Imports from project files ----
sys.path.append(os.path.abspath(".."))
from Helper_funtions import (
    clean_up_subjects,
    calculate_true_false_score,
    calculate_Internet_terms_understanding_score,
    group_internet_understanding
)
from lists import (
    demographic_columns,
    multiple_choice_questions,
    single_choice_questions,
    likert_questions,
    likert_mapping,
    comparison_pairs_by_AI_questions,
    cross_tab_titles_and_colors

)
from answer_categories import question_orders

# ---- Plot defaults shared by every figure in this notebook ----
sns.set(style="whitegrid")
plt.rcParams["figure.figsize"] = (10, 5)

# ---- Read the survey workbook (path is relative to the notebook folder) ----
DATA_FILE = os.path.join("..", "Data", "Fertige Tabelle.xlsx")
df = pd.read_excel(DATA_FILE)
# Column headers are stripped once so later name lookups are whitespace-safe.
df.columns = df.columns.str.strip()

# ---- Multi-subject columns: pass each one that exists through the project helper ----
for col in ("Most used subjects", "Preferred Subjects", "Least preferred Subjects"):
    if col not in df.columns:
        continue
    df = clean_up_subjects(df, col)

# ---- Derived scores (only computed when their source columns are present) ----
# The true/false score is computed only if both of its source columns exist.
if {"True/False_1", "True/False_2"}.issubset(df.columns):
    df = calculate_true_false_score(df)

# Internet-terms score and its grouping need at least one "Internet terms_*" column.
has_internet_terms = any(name.startswith("Internet terms_") for name in df.columns)
if has_internet_terms:
    df = calculate_Internet_terms_understanding_score(df)
    df = group_internet_understanding(df)

print("✅ Setup complete – DataFrame loaded and preprocessed")
print(f"Rows: {len(df)}, Columns: {len(df.columns)}")

In [None]:
# ------- cross-tables with graphs, graphs with separated bars -------
#
# For every (demo, question) pair in `comparison_pairs_by_AI_questions`:
#   1. take the two columns and drop rows with missing values,
#   2. split/explode comma-separated multiple-choice answers,
#   3. print a row-normalised crosstab (each `demo` group sums to 100 %),
#   4. draw a grouped bar chart (x = answer, hue = `demo` group).
# A failure for one pair is reported and the loop continues with the next pair.

for demo, question_list in comparison_pairs_by_AI_questions.items():
    for question in question_list:
        try:
            relevant_cols = [demo, question]
            # .copy() gives an independent frame, so the column assignments
            # below never write into a view of `df`.
            data = df[relevant_cols].dropna().copy()

            # Handle exploding if any of the two is multiple choice
            for col in relevant_cols:
                if col in multiple_choice_questions:
                    data[col] = data[col].astype(str).str.split(",")
                    data = data.explode(col)
                    data[col] = data[col].str.strip()

            # Drop any remaining empty entries
            data = data.dropna()
            data = data[(data[demo].astype(str).str.strip() != "") & (data[question].astype(str).str.strip() != "")]

            # Nothing left to tabulate or plot for this pair.
            if data.empty:
                print(f"⚠️ No overlapping data for '{demo}' x '{question}'. Skipping.")
                continue

            # Crosstab normalized by row (percentages)
            cross = pd.crosstab(data[demo], data[question], normalize='index') * 100

            # Apply predefined order
            if question in question_orders:
                order = list(question_orders[question])
                # Categories from the predefined order that nobody chose still
                # get a (zero) column so every table/plot has the same layout.
                for answer in order:
                    if answer not in cross.columns:
                        cross[answer] = 0
                # Answers missing from the predefined order are appended rather
                # than dropped; dropping them would hide data and leave rows
                # that no longer sum to 100 %.
                unlisted = [answer for answer in cross.columns if answer not in order]
                cross = cross[order + unlisted]

            if pd.api.types.is_numeric_dtype(data[demo]):
                cross = cross.sort_index()

            print(f"\n📊 {question} grouped by {demo}")
            print(cross.round(1).to_string())

            # Plot
            plot_df = cross.reset_index().melt(id_vars=demo, var_name="Answer", value_name="Percentage")

            fig, ax = plt.subplots(figsize=(10, 6))
            try:
                sns.barplot(data=plot_df, x="Answer", y="Percentage", hue=demo, ax=ax)
                ax.set_title(f"{question} grouped by {demo}")
                plt.xticks(rotation=45, ha="right")
                fig.tight_layout()
                plt.show()
            finally:
                # One figure per pair: release it so figures do not pile up
                # in memory over the whole loop (also on plotting errors).
                plt.close(fig)

        except Exception as e:
            print(f"❌ Test failed for {question} x {demo}: {e}")


In [None]:
# ------ significance tests, tables, and stacked graphs with 100% bars -------

# Helper: align two columns on the same rows and explode when needed
def build_pair_dataframe(df_in: pd.DataFrame, left: str, right: str) -> pd.DataFrame:
    """
    Build a two-column frame [left, right] ready for cross-tabulation.

    Steps:
      - rows where either column is missing are dropped first, so both
        columns always originate from the same source rows;
      - a column listed in `multiple_choice_questions` is split on ','
        and exploded into one row per selected answer;
      - every value is converted to a stripped string;
      - rows where either value is an empty string are removed.

    The returned frame keeps the source row labels, so an exploded row
    label appears once per answer.
    """
    columns = (left, right)
    multi_cols = [name for name in columns if name in multiple_choice_questions]

    pair = df_in[[left, right]].dropna().copy()

    # Multi-choice cells become lists of raw answers; single-choice cells
    # become trimmed strings.
    for name in columns:
        as_text = pair[name].astype(str)
        if name in multi_cols:
            pair[name] = as_text.str.split(",")
        else:
            pair[name] = as_text.str.strip()

    # One explode per multi-choice column (left first, then right); when both
    # are multi-choice this yields every answer combination per source row.
    for name in multi_cols:
        pair = pair.explode(name)

    # Exploded answers can still carry surrounding blanks ("a, b" -> " b").
    for name in columns:
        pair[name] = pair[name].astype(str).str.strip()

    non_empty = pair[left].ne("") & pair[right].ne("")
    return pair[non_empty]

# Helper: apply ordering with question_orders or numeric
def order_rows_cols(ct: pd.DataFrame, rows_key: str, cols_key: str) -> pd.DataFrame:
    """
    Return `ct` with its columns and rows arranged for display.

    Each axis is handled independently:
      - if its key (`cols_key` for columns, `rows_key` for rows) is in
        `question_orders`, the labels listed there come first, in that order,
        followed by any labels of `ct` that are not listed (existing order);
      - otherwise, if every label parses as a number, the axis is sorted
        numerically;
      - otherwise the axis is left as it is.

    Columns keep their original labels when sorted numerically; a numerically
    sorted row index is replaced by its numeric values.

    The frame passed in is never modified; a re-ordered frame is returned.
    """
    # columns (answers of "right" question)
    if cols_key in question_orders:
        col_order = [v for v in question_orders[cols_key] if v in ct.columns]
        remaining_cols = [v for v in ct.columns if v not in col_order]
        ct = ct[col_order + remaining_cols]
    else:
        # numeric fallback for columns
        try:
            pd.to_numeric(ct.columns)  # raises when a label is not numeric
            ct = ct[sorted(ct.columns, key=float)]
        except (ValueError, TypeError):
            # non-numeric labels: keep the existing column order
            pass

    # rows (answers of "left" question)
    if rows_key in question_orders:
        row_order = [v for v in question_orders[rows_key] if v in ct.index]
        remaining_rows = [v for v in ct.index if v not in row_order]
        ct = ct.reindex(row_order + remaining_rows)
    else:
        # numeric fallback for rows
        try:
            numeric_index = pd.to_numeric(ct.index)
        except (ValueError, TypeError):
            # non-numeric labels: keep the existing row order
            pass
        else:
            # Work on a copy: assigning to `.index` directly would also
            # rewrite the index of the caller's DataFrame.
            ct = ct.copy()
            ct.index = numeric_index
            ct = ct.sort_index()

    return ct

# Main loop: for each AI base question, compare to each AI question
# For every (base_question, compare_question) pair:
#   1. build an aligned, exploded two-column frame,
#   2. cross-tabulate counts and order rows/columns,
#   3. run a chi-square test on the counts,
#   4. print counts and row percentages,
#   5. draw a 100 % stacked bar chart (one bar per base_question category).
# A failure for one pair is reported and the loop continues with the next pair.
for base_question, compare_list in comparison_pairs_by_AI_questions.items():
    for compare_question in compare_list:
        try:
            # Build pair-wise dataset (handles single/multiple and alignment)
            data = build_pair_dataframe(df, base_question, compare_question)
            if data.empty:
                print(f"⚠️ No overlapping data for '{base_question}' x '{compare_question}'. Skipping.")
                continue

            # Crosstab: rows = base question categories, cols = compare question categories
            ct = pd.crosstab(data[base_question], data[compare_question])

            # Fold duplicates (rare: if hidden whitespace produced duplicate labels)
            if not ct.columns.is_unique:
                ct = ct.T.groupby(level=0).sum().T
            if not ct.index.is_unique:
                ct = ct.groupby(level=0).sum()

            # Apply predefined ordering (or numeric as fallback)
            ct = order_rows_cols(ct, rows_key=base_question, cols_key=compare_question)

            # Normalize each row (each bar = 100%)
            ct_percent = (ct.div(ct.sum(axis=1), axis=0) * 100).fillna(0)

            # ---- Chi-square test on counts (NOT on percentages) ----
            # NOTE(review): for multiple-choice questions one respondent adds
            # several rows to `data`, so the counts are not independent
            # observations - interpret p-values for those pairs with care.
            # All-zero rows AND columns are removed: chi2_contingency raises
            # when an expected frequency is zero.
            ct_for_test = ct.loc[ct.sum(axis=1) > 0, ct.sum(axis=0) > 0]
            significant = None
            if ct_for_test.shape[0] >= 2 and ct_for_test.shape[1] >= 2:
                chi2, p, dof, expected = chi2_contingency(ct_for_test)
                significant = (p < 0.05)
                print(f"\n🔍 Chi²-test: {base_question} × {compare_question} | chi²={chi2:.3f}, df={dof}, p={p:.4f} → "
                      f"{'✅ significant' if significant else '❌ not significant'}")
            else:
                print(f"\nℹ️ Chi²-test skipped for '{base_question} × {compare_question}' (table too small).")

            # ---- Tabular printout (Counts & %) ----
            print(f"\n📊 {compare_question} within each {base_question} (rows sum to 100%)")
            print("Counts:\n", ct)
            print("\nPercent:\n", ct_percent.round(1))

            # ---- Plot: stacked 100% bar (X = base_question categories, stacks = compare_question answers) ----
            # Uses `cross_tab_titles_and_colors`, the mapping imported in the
            # setup cell; the previously referenced `..._ai` name was never
            # defined, so every pair failed with a NameError before plotting.
            # Entry layout: [title, color_1, color_2, ...]; colors are optional.
            title_and_colors = cross_tab_titles_and_colors.get(
                (base_question, compare_question),
                [f"{compare_question} within each {base_question} (100% stacked)"]
            )
            plot_title = title_and_colors[0]

            if len(title_and_colors) > 1:
                # explicit colors supplied
                colors = title_and_colors[1:]
            else:
                # default palette: one shade of blue per stacked answer
                colors = sns.color_palette("Blues", n_colors=len(ct_percent.columns))

            ax = ct_percent.plot(kind="bar", stacked=True, figsize=(10, 6), color=colors, width=0.9)
            try:
                ax.set_title(plot_title)
                ax.set_ylabel("Percentage (%)")
                ax.set_xlabel(base_question)
                ax.set_ylim(0, 100)
                plt.xticks(rotation=45, ha="right")
                ax.legend(title=compare_question, bbox_to_anchor=(1.01, 1), loc="upper left")
                plt.tight_layout()
                plt.show()
            finally:
                # One figure per pair: release it so figures do not pile up
                # in memory over the whole loop (also on plotting errors).
                plt.close(ax.figure)

        except Exception as e:
            print(f"❌ Failed for {base_question} x {compare_question}: {e}")
