In [None]:
# ==========================================
# SETUP BLOCK 
# ==========================================

import os
import sys
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import chi2_contingency

# ---- Imports from project files ----
sys.path.append(os.path.abspath(".."))
from Helper_funtions import (
    clean_up_subjects,
    calculate_true_false_score,
    calculate_Internet_terms_understanding_score,
    group_internet_understanding
)
from lists import (
    demographic_columns,
    multiple_choice_questions,
    single_choice_questions,
    likert_questions,
    likert_mapping,
    comparison_pairs_by_demo,
    cross_tab_titles_and_colors

)
from answer_categories import question_orders

# ---- Plot defaults used by the figures below ----
sns.set(style="whitegrid")
plt.rcParams["figure.figsize"] = (10, 5)

# ---- Read the Excel table and tidy its header ----
DATA_FILE = os.path.join("..", "Data", "Fertige Tabelle.xlsx")
df = pd.read_excel(DATA_FILE)
df.columns = df.columns.str.strip()  # remove stray whitespace around column names

# Run the subject clean-up helper on each multi-subject column that is present
for col in ("Most used subjects", "Preferred Subjects", "Least preferred Subjects"):
    if col not in df.columns:
        continue
    df = clean_up_subjects(df, col)

# Derived scores are computed only when their input columns exist
if {"True/False_1", "True/False_2"}.issubset(df.columns):
    df = calculate_true_false_score(df)

if any(name.startswith("Internet terms_") for name in df.columns):
    df = group_internet_understanding(
        calculate_Internet_terms_understanding_score(df)
    )

print("✅ Setup complete – DataFrame loaded and preprocessed")
print(f"Rows: {len(df)}, Columns: {len(df.columns)}")


In [None]:
# ----- original file with all possible crosstabulations -----

# Imports 
sys.path.append(os.path.abspath(".."))
from Helper_funtions import test_significance_single_choice, test_significance_multiple_choice, calculate_true_false_score


# Cross-tabulate every multiple-choice question with every demographic column.
for question in multiple_choice_questions:
    for demo in demographic_columns:
        try:
            print(f"\n Distribution of multiple answers for: {question} grouped by {demo}")

            # Keep rows where both values are present, then turn the
            # comma-separated answer string into one row per selected option.
            complete_rows = df[[demo, question]].dropna()
            option_lists = df[question].str.split(",")
            exploded = complete_rows.assign(**{question: option_lists}).explode(question)
            exploded[question] = exploded[question].str.strip()

            # Row-normalised shares (percent of all selections within each demo group)
            cross = 100 * pd.crosstab(exploded[demo], exploded[question], normalize='index')

            # Alphabetical column order; replace with a specific order if desired
            alphabetical = sorted(cross.columns)
            cross = cross[alphabetical]
            print(cross.round(1).to_string())

            # Significance test for every answer option
            results = test_significance_multiple_choice(exploded, question, demo)
            for res in results:
                if "Error" not in res:
                    print(f"   → Option: {res['Option']} → p-value: {res['p_value']:.4f} → {res['Significance']}")
                    continue
                print(f"   ❌ Option: {res['Option']} → ERROR: {res['Error']}")

        except Exception as e:
            print(f"❌ Failed for {question} x {demo}: {e}")

# Cross-tabulate every single-choice question with every demographic column
# and run a significance test for each pair.
for question in single_choice_questions:
    print(f"\n Distribution of responses for: {question}")

    for demo in demographic_columns:
        print(f"– Grouped by: {demo}")

        try:
            # Work on a copy with the answers rendered as text. Casting the shared
            # `df` in place would turn missing answers into the literal string
            # "nan" (which `dropna()` can no longer remove) and would silently
            # change the frame for every cell executed afterwards.
            answers = df[question].astype(str).where(df[question].notna())
            answered = df.assign(**{question: answers})
            data = answered[[demo, question]].dropna()

            # Table in percent, each demographic group sums to 100
            cross = pd.crosstab(data[demo], data[question], normalize='index') * 100

            # Use the predefined answer order; answers nobody chose get a 0 column
            if question in question_orders:
                order = question_orders[question]
                for col in order:
                    if col not in cross.columns:
                        cross[col] = 0
                cross = cross[order]

            # Print table
            print(cross.round(1).to_string())

            # Significance test (receives the full frame, missing answers still NaN)
            p = test_significance_single_choice(answered, question, demo)
            significance = "✅ significant" if p < 0.05 else "❌ not significant"
            print(f"   → p-value: {p:.4f} → {significance}")

        except Exception as e:
            print(f"❌ Test failed for {question} x {demo}: {e}")



In [None]:
# ------- selected cross-tabulations with separated (grouped) bar plots ------

# For each configured (demographic, question) pair: print the row-normalised
# crosstab and draw a grouped bar chart (one bar per demographic group and answer).
for demo, question_list in comparison_pairs_by_demo.items():
    for question in question_list:

        try:
            is_multi = question in multiple_choice_questions
            if not is_multi and question not in single_choice_questions:
                continue  # neither single- nor multiple-choice

            # Normalise the answers to text on a local copy, so the labels match
            # the string entries of `question_orders` without depending on an
            # earlier cell having cast the shared `df`.
            data = df[[demo, question]].dropna().copy()
            data[question] = data[question].astype(str)
            if is_multi:
                # "a, b" -> one row per selected option
                data[question] = data[question].str.split(",")
                data = data.explode(question)
            data[question] = data[question].str.strip()
            data = data[data[question] != ""]  # drop blank answers / empty options

            if data.empty:
                print(f"⚠️ No overlapping data for '{question}' x '{demo}'. Skipping.")
                continue

            # Crosstab normalized by row (percentages)
            cross = pd.crosstab(data[demo], data[question], normalize='index') * 100

            # Use predefined order if available; unseen answers get a 0 column
            if question in question_orders:
                order = question_orders[question]
                for col in order:
                    if col not in cross.columns:
                        cross[col] = 0
                cross = cross[order]

            # Optional: sort numeric demographic column
            if pd.api.types.is_numeric_dtype(df[demo]):
                cross = cross.sort_index()

            print(f"\n📊 {question} grouped by {demo}")
            print(cross.round(1).to_string())

            # Long format for seaborn: one row per (demo group, answer)
            plot_df = cross.reset_index().melt(id_vars=demo, var_name="Answer", value_name="Percentage")

            plt.figure(figsize=(10, 6))
            sns.barplot(data=plot_df, x="Answer", y="Percentage", hue=demo)

            plt.title(f"{question} grouped by {demo}")
            plt.xticks(rotation=45, ha="right")
            plt.tight_layout()
            plt.show()
        except Exception as e:
            print(f"❌ Test failed for {question} x {demo}: {e}")


In [None]:
# ------- selected cross-tabulations with stacked bar plots ------



# Demographic columns whose cells hold several comma-separated values
multiple_choice_demographics = {
    "Preferred Subjects",
    "Least preferred Subjects",
    "Most used subjects",
}

# ---------- Helper: split/explode a column if needed and drop empty values ----------
def prepare_column(df_in, col, is_multi):
    """Return a one-column DataFrame with the cleaned values of ``col``.

    Parameters
    ----------
    df_in : pandas.DataFrame
        Source frame; it is not modified.
    col : str
        Name of the column to extract.
    is_multi : bool
        If true, each cell is split on "," and exploded to one row per entry
        (the original row index is repeated for every entry).

    Returns
    -------
    pandas.DataFrame
        Column ``col`` as stripped strings, without missing or empty values.
    """
    # Cast to text before splitting: `.str.split` on a non-string cell yields NaN,
    # which would otherwise end up as the label "nan".
    values = df_in[col].dropna().astype(str)
    if is_multi:
        values = values.str.split(",").explode()
    values = values.str.strip()
    return values[values != ""].to_frame()

# ---------- Main loop: stacked 100% bars ----------

# For each configured (demographic, question) pair: print counts and row
# percentages, then draw a stacked bar chart of the counts (x = answers of the
# question, stack colours = demographic categories).
for demo, question_list in comparison_pairs_by_demo.items():
    for question in question_list:
        try:
            # --- Question side (its answers become the x-axis) ---
            q_is_multi = question in multiple_choice_questions
            q_df = prepare_column(df, question, q_is_multi)

            # --- Demographic side (its categories become the stack colours) ---
            d_is_multi = demo in multiple_choice_demographics
            d_df = prepare_column(df, demo, d_is_multi)

            # Pair both columns per respondent via the row index. An exploded
            # column repeats index labels, which `pd.concat(axis=1)` cannot
            # align; an index join handles repeated labels.
            data = q_df.join(d_df, how="inner").dropna()

            if data.empty:
                print(f"⚠️ No overlapping data for '{question}' x '{demo}'. Skipping.")
                continue

            # Crosstab: rows = answers of the question, columns = demographic categories
            ct = pd.crosstab(data[question], data[demo])

            # Row order: predefined list first, unknown answers appended
            if question in question_orders:
                x_order = [v for v in question_orders[question] if v in ct.index]
                remaining = [v for v in ct.index if v not in x_order]
                ct = ct.reindex(x_order + remaining)
            else:
                # Numeric sorting when every label parses as a number
                try:
                    ct.index = pd.to_numeric(ct.index)
                    ct = ct.sort_index()
                except (ValueError, TypeError):
                    pass  # non-numeric labels keep the crosstab's default order

            # Column order: predefined list first, unknown categories appended
            if demo in question_orders:
                d_order = [v for v in question_orders[demo] if v in ct.columns]
                d_remaining = [v for v in ct.columns if v not in d_order]
                ct = ct[d_order + d_remaining]
            else:
                # Numeric sorting when every label parses as a number
                try:
                    ct = ct[sorted(ct.columns, key=float)]
                except (ValueError, TypeError):
                    pass  # non-numeric labels keep the crosstab's default order

            # Share of each demographic category within an answer (rows sum to 100)
            ct_percent = ct.div(ct.sum(axis=1), axis=0) * 100

            # Tabular output (counts and percentages)
            print(f"\n📊 {question} – stacked by {demo} (row-normalized to 100%)")
            print("Counts:")
            print(ct)
            print("\nPercent:")
            print(ct_percent.round(1))

            # Optional per-pair config: [title, colour, colour, ...]
            title_and_colors = cross_tab_titles_and_colors.get(
                (demo, question),
                [f"{question} – distribution of {demo} within each answer"]
            )
            plot_title = title_and_colors[0]

            # Custom colours if configured, otherwise the default palette
            if len(title_and_colors) > 1:
                color_kwargs = {"color": title_and_colors[1:]}
            else:
                color_kwargs = {"colormap": "Set3"}

            # The chart shows the raw counts; percentages are only printed above
            ax = ct.plot(
                kind="bar",
                stacked=True,
                figsize=(10, 6),
                width=0.9,
                **color_kwargs
            )

            ax.set_title(plot_title)
            ax.set_ylabel("Count")
            ax.set_xlabel(question)
            plt.xticks(rotation=45, ha="right")
            plt.legend(title=demo, bbox_to_anchor=(1.01, 1), loc="upper left")
            plt.tight_layout()
            plt.show()

        except Exception as e:
            print(f"❌ Failed for {question} x {demo}: {e}")


In [None]:
# ------- selected cross-tabulations with stacked bar plots, each bar normalized to 100% and demographics on the x-axis ------

# --- Config: demographic columns that hold several comma-separated values ---
multiple_choice_demographics = {
    "Most used subjects",
    "Preferred Subjects",
    "Least preferred Subjects",
}

# Helper: pull one column out of a frame, exploding and cleaning its values
def prepare_column(df_in, col, is_multi):
    """Extract ``col`` as a one-column DataFrame of stripped, non-empty strings.

    With ``is_multi`` set, each cell is split on "," and exploded so that every
    entry gets its own row (the original index label is repeated). Missing
    values are dropped before conversion; ``df_in`` is left untouched.
    """
    cleaned = df_in[col].dropna().astype(str)
    if is_multi:
        # one row per comma-separated entry
        cleaned = cleaned.str.split(",").explode()
    cleaned = cleaned.str.strip()
    non_empty = cleaned != ""
    return cleaned[non_empty].to_frame()

# Main loop: stacked 100% bars (demo on X-axis)

# For each configured (demographic, question) pair: run a chi² test on the
# contingency table, print counts and percentages, and draw a stacked bar chart
# where every demographic category (x-axis) sums to 100 %.
for demo, question_list in comparison_pairs_by_demo.items():
    for question in question_list:
        try:
            # Prepare: question side
            q_is_multi = question in multiple_choice_questions
            q_df = prepare_column(df, question, q_is_multi)

            # Prepare: demographic side
            d_is_multi = demo in multiple_choice_demographics
            d_df = prepare_column(df, demo, d_is_multi)

            # Pair both columns per respondent via the row index. An exploded
            # column repeats index labels, which `pd.concat(axis=1)` cannot
            # align; an index join handles repeated labels.
            data = q_df.join(d_df, how="inner").dropna()
            if data.empty:
                print(f"⚠️ No overlapping data for '{question}' x '{demo}'. Skipping.")
                continue

            # Crosstab: rows = demo categories (X-axis), cols = question answers (stack colors)
            ct = pd.crosstab(data[demo], data[question])

            # Order question answers (columns): predefined list first, rest appended
            if question in question_orders:
                col_order = [v for v in question_orders[question] if v in ct.columns]
                remaining_cols = [v for v in ct.columns if v not in col_order]
                ct = ct[col_order + remaining_cols]
            else:
                try:
                    pd.to_numeric(ct.columns)  # raises unless every label is numeric
                    ct = ct[sorted(ct.columns, key=float)]
                except (ValueError, TypeError):
                    pass  # non-numeric labels keep the crosstab's default order

            # Order demo categories (rows): predefined list first, rest appended
            if demo in question_orders:
                row_order = [v for v in question_orders[demo] if v in ct.index]
                remaining_rows = [v for v in ct.index if v not in row_order]
                ct = ct.reindex(row_order + remaining_rows)
            else:
                try:
                    ct.index = pd.to_numeric(ct.index)
                    ct = ct.sort_index()
                except (ValueError, TypeError):
                    pass  # non-numeric labels keep the crosstab's default order

            # Normalize so each demo category sums to 100%
            ct_percent = ct.div(ct.sum(axis=1), axis=0) * 100

            # Chi² test of independence on the raw counts
            chi2, p, dof, expected = chi2_contingency(ct)

            print(f"\n🔍 Chi²-Test for {demo} x {question}")
            print(f"Chi² = {chi2:.3f}, df = {dof}, p-value = {p:.4f}")
            if p < 0.05:
                print("➡️ Significant at α = 0.05")
            else:
                print("➡️ Not significant")

            # Tabular output
            print(f"\n📊 {question} – distribution of {demo} (each {demo} sums to 100%)")
            print("Counts:")
            print(ct)
            print("\nPercent:")
            print(ct_percent.round(1))

            # Optional per-pair config: [title, colour, colour, ...]
            title_and_colors = cross_tab_titles_and_colors.get(
                (demo, question),
                [f"{question} – distribution of {demo} (each {demo} sums to 100%)"]
            )
            plot_title = title_and_colors[0]

            # Custom colours if configured, otherwise the default palette
            if len(title_and_colors) > 1:
                color_kwargs = {"color": title_and_colors[1:]}
            else:
                color_kwargs = {"colormap": "Set2"}

            ax = ct_percent.plot(
                kind="bar",
                stacked=True,
                figsize=(10, 6),
                width=0.9,
                **color_kwargs
            )

            ax.set_title(plot_title)
            ax.set_ylabel("Percentage (%)")
            ax.set_xlabel(demo)
            ax.set_ylim(0, 100)
            plt.xticks(rotation=45, ha="right")
            plt.legend(title=question, bbox_to_anchor=(1.01, 1), loc="upper left")
            plt.tight_layout()
            plt.show()

        except Exception as e:
            print(f"❌ Failed for {question} x {demo}: {e}")


In [None]:
# ------ selected cross-tabulations: stacked 100% bars with chi² significance tests ------

# --- Helper: Prepare column (explode if multiple choice) ---
def prepare_column(df_in, col):
    """Return a one-column DataFrame with the cleaned values of ``col``.

    Columns listed in ``multiple_choice_questions`` are split on "," and
    exploded to one row per entry (the original index label is repeated).
    All values are converted to stripped strings; missing and empty values
    are dropped. ``df_in`` itself is not modified.

    Note: this two-argument definition rebinds the name used by the
    three-argument helper in the earlier cells, and the loop below in this
    cell does not call it.
    """
    cleaned = df_in[col].dropna().astype(str)
    if col in multiple_choice_questions:
        # one row per comma-separated entry
        cleaned = cleaned.str.split(",").explode()
    cleaned = cleaned.str.strip()
    keep = cleaned != ""
    return cleaned[keep].to_frame()

# --- Main loop: stacked 100% bars with significance tests ---
# For each configured (demographic, question) pair: build the contingency table
# from one shared slice of `df`, run a chi² test, print counts and percentages,
# and draw a stacked bar chart where every demographic category sums to 100 %.
for demo, question_list in comparison_pairs_by_demo.items():
    for question in question_list:
        try:
            q_is_multi = question in multiple_choice_questions
            # NOTE(review): the earlier cells decide this with
            # `multiple_choice_demographics`; confirm that the multi-value
            # demographic columns are also listed in `multiple_choice_questions`.
            d_is_multi = demo in multiple_choice_questions

            # Take both columns from the SAME slice so the rows stay paired,
            # split every multiple-choice column, then explode demo before question.
            data = df[[demo, question]].dropna().copy()
            multi_columns = [
                column
                for column, is_multi in ((demo, d_is_multi), (question, q_is_multi))
                if is_multi
            ]
            for column in multi_columns:
                data[column] = data[column].astype(str).str.split(",")
            for column in multi_columns:
                data = data.explode(column)

            # Clean both sides and drop empty entries
            for column in (demo, question):
                data[column] = data[column].astype(str).str.strip()
            data = data[(data[demo] != "") & (data[question] != "")]

            if data.empty:
                print(f"⚠️ No overlapping data for '{question}' x '{demo}'. Skipping.")
                continue

            # Crosstab: rows = demo categories (X-axis), cols = question answers (stack colors).
            # Built once; rebuilding it afterwards would discard the collapsing below.
            ct = pd.crosstab(data[demo], data[question])

            # If (rarely) duplicate column/index labels exist, collapse them by summing
            if not ct.columns.is_unique:
                ct = ct.T.groupby(level=0).sum().T
            if not ct.index.is_unique:
                ct = ct.groupby(level=0).sum()

            # Order columns (answers): predefined list first, rest appended
            if question in question_orders:
                col_order = [v for v in question_orders[question] if v in ct.columns]
                remaining_cols = [v for v in ct.columns if v not in col_order]
                ct = ct[col_order + remaining_cols]
            else:
                try:
                    pd.to_numeric(ct.columns)  # raises unless every label is numeric
                    ct = ct[sorted(ct.columns, key=float)]
                except (ValueError, TypeError):
                    pass  # non-numeric labels keep the crosstab's default order

            # Order rows (demo categories): predefined list first, rest appended
            if demo in question_orders:
                row_order = [v for v in question_orders[demo] if v in ct.index]
                remaining_rows = [v for v in ct.index if v not in row_order]
                ct = ct.reindex(row_order + remaining_rows)
            else:
                try:
                    ct.index = pd.to_numeric(ct.index)
                    ct = ct.sort_index()
                except (ValueError, TypeError):
                    pass  # non-numeric labels keep the crosstab's default order

            # Normalize so each demo category sums to 100%
            ct_percent = ct.div(ct.sum(axis=1), axis=0) * 100

            # Chi² significance test on the raw counts
            chi2, p, dof, expected = chi2_contingency(ct)
            print(f"\n🔍 Chi²-Test for {demo} x {question}")
            print(f"Chi² = {chi2:.3f}, df = {dof}, p-value = {p:.4f}")
            if p < 0.05:
                print("➡️ Significant at α = 0.05")
            else:
                print("➡️ Not significant")

            # Tabular output
            print(f"\n📊 {question} – distribution of {demo} (each {demo} sums to 100%)")
            print("Counts:")
            print(ct)
            print("\nPercent:")
            print(ct_percent.round(1))

            # Optional per-pair config: [title, colour, colour, ...]
            title_and_colors = cross_tab_titles_and_colors.get(
                (demo, question),
                [f"{question} – distribution of {demo} (each {demo} sums to 100%)"]
            )
            plot_title = title_and_colors[0]

            # Custom colours if configured, otherwise the default palette
            if len(title_and_colors) > 1:
                color_kwargs = {"color": title_and_colors[1:]}
            else:
                color_kwargs = {"colormap": "Set2"}

            ax = ct_percent.plot(
                kind="bar",
                stacked=True,
                figsize=(10, 6),
                width=0.9,
                **color_kwargs
            )

            ax.set_title(plot_title)
            ax.set_ylabel("Percentage (%)")
            ax.set_xlabel(demo)
            ax.set_ylim(0, 100)
            plt.xticks(rotation=45, ha="right")
            plt.legend(title=question, bbox_to_anchor=(1.01, 1), loc="upper left")
            plt.tight_layout()
            plt.show()

        except Exception as e:
            print(f"❌ Failed for {question} x {demo}: {e}")
