In [None]:
# ==========================================
# SETUP BLOCK 
# ==========================================

import os
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import chi2_contingency

# ---- Imports from project files ----
sys.path.append(os.path.abspath(".."))
from Helper_funtions import (
    clean_up_subjects,
    calculate_true_false_score,
    calculate_Internet_terms_understanding_score,
    group_internet_understanding
)
from lists import (
    demographic_columns,
    multiple_choice_questions,
    single_choice_questions,
    likert_questions,
    likert_mapping,
    comparison_pairs_by_demo,
    cross_tab_titles_and_colors,
    nominal_posthoc_pairs_demo,
    ordinal_posthoc_pairs_demo
)
from answer_categories import question_orders

# ---- General plot style ----
# White-grid seaborn theme and a default figure size for the plots below.
sns.set(style="whitegrid")
plt.rcParams.update({"figure.figsize": (10, 5)})

# ---- Data loading ----
# Read the workbook and strip surrounding whitespace from the column labels.
DATA_FILE = os.path.join("..", "Data", "Fertige Tabelle.xlsx")
df = pd.read_excel(DATA_FILE)
df.columns = df.columns.str.strip()

# Clean up multi-subject columns (only those present in the workbook)
subject_columns = ("Most used subjects", "Preferred Subjects", "Least preferred Subjects")
for col in subject_columns:
    if col not in df.columns:
        continue
    df = clean_up_subjects(df, col)

# Calculate additional scores; each one is skipped when its source columns are missing
if {"True/False_1", "True/False_2"}.issubset(df.columns):
    df = calculate_true_false_score(df)

has_internet_terms = any(name.startswith("Internet terms_") for name in df.columns)
if has_internet_terms:
    df = calculate_Internet_terms_understanding_score(df)
    df = group_internet_understanding(df)

print("✅ Setup complete – DataFrame loaded and preprocessed")
print(f"Rows: {len(df)}, Columns: {len(df.columns)}")


In [None]:
# ------- selected cross-tabulations with stacked bar plots ------

# Demographic columns whose cells may hold several comma-separated values
multiple_choice_demographics = {
    "Preferred Subjects",
    "Least preferred Subjects",
    "Most used subjects",
}

# ---------- Helper: split/explode a column if needed + drop empty values ----------
def prepare_column(df_in, col, is_multi):
    """Return a one-column frame of cleaned, non-empty string values of ``col``.

    Parameters
    ----------
    df_in : pandas.DataFrame
        Source table.
    col : str
        Column to extract.
    is_multi : bool
        If True, each cell is split on "," and exploded into one row per
        value; the original index label is repeated for every value.

    Returns
    -------
    pandas.DataFrame
        Single column ``col`` with whitespace-stripped strings; missing and
        blank values are dropped. An empty frame with that column is returned
        when ``col`` does not exist in ``df_in``.
    """
    if col not in df_in.columns:
        return pd.DataFrame(columns=[col])

    tmp = df_in[[col]].dropna()
    if is_multi:
        # astype(str) first: the .str accessor raises on non-string columns
        tmp = tmp.assign(**{col: tmp[col].astype(str).str.split(",")}).explode(col)

    tmp[col] = tmp[col].astype(str).str.strip()
    return tmp[tmp[col] != ""]

# ---------- stacked bars (counts) with count / row-percent tables ----------
# For every (demographic, question) pair in comparison_pairs_by_demo: build the
# contingency table, print counts and row percentages, and draw a stacked bar
# chart. The chart shows absolute counts; the percentages are printed only.

for demo, question_list in comparison_pairs_by_demo.items():
    for question in question_list:
        try:

            q_is_multi = question in multiple_choice_questions
            q_df = prepare_column(df, question, q_is_multi)

            d_is_multi = demo in multiple_choice_demographics
            d_df = prepare_column(df, demo, d_is_multi)

            # Pair answers and groups on the shared row index. Exploded
            # multiple-choice columns repeat index labels, which
            # pd.concat(axis=1) cannot align (InvalidIndexError); an index join
            # handles them and yields one row per (answer, group) combination.
            data = q_df.join(d_df, how="inner").dropna()

            if data.empty:
                print(f"⚠️ No overlapping data for '{question}' x '{demo}'. Skipping.")
                continue

            ct = pd.crosstab(data[question], data[demo])

            # Row order: configured order first, then any remaining categories
            if question in question_orders:
                x_order = [v for v in question_orders[question] if v in ct.index]
                remaining = [v for v in ct.index if v not in x_order]
                ct = ct.reindex(x_order + remaining)
            else:
                # Numeric sorting when every label parses as a number;
                # otherwise the crosstab's own order is kept.
                try:
                    ct.index = pd.to_numeric(ct.index)
                    ct = ct.sort_index()
                except (ValueError, TypeError):
                    pass

            # Column order: same rule, applied to the demographic categories
            if demo in question_orders:
                d_order = [v for v in question_orders[demo] if v in ct.columns]
                d_remaining = [v for v in ct.columns if v not in d_order]
                ct = ct[d_order + d_remaining]
            else:
                try:
                    ct = ct[sorted(ct.columns, key=float)]
                except (ValueError, TypeError):
                    pass

            # Share of each demographic group within every answer (rows sum to 100)
            ct_percent = ct.div(ct.sum(axis=1), axis=0) * 100

            # table output
            print(f"\n📊 {question} – stacked by {demo} (row-normalized to 100%)")
            print("Counts:")
            print(ct)
            print("\nPercent:")
            print(ct_percent.round(1))

            # plot: first list entry is the title, any further entries are bar colors
            title_and_colors = cross_tab_titles_and_colors.get(
                (demo, question),
                [f"{question} – distribution of {demo} within each answer"]
            )

            plot_title = title_and_colors[0]

            if len(title_and_colors) > 1:
                # custom colors
                colors = title_and_colors[1:]
                ax = ct.plot(
                    kind="bar",
                    stacked=True,
                    figsize=(10, 6),
                    color=colors,
                    width=0.9
                )
            else:
                # default palette
                ax = ct.plot(
                    kind="bar",
                    stacked=True,
                    figsize=(10, 6),
                    colormap="Set3",
                    width=0.9
                )

            ax.set_title(plot_title)
            ax.set_ylabel("Count")
            ax.set_xlabel(question)
            plt.xticks(rotation=45, ha="right")
            plt.legend(title=demo, bbox_to_anchor=(1.01, 1), loc="upper left")
            plt.tight_layout()
            plt.show()

        except Exception as e:
            # Best effort: report the failing pair and continue with the next one
            print(f" Failed for {question} x {demo}: {e}")


In [None]:
# ==========================================
# CHI-SQUARE BLOCK (no plots)
# ==========================================


# Demographic columns whose cells may hold several comma-separated values
multiple_choice_demographics = {
    "Preferred Subjects",
    "Least preferred Subjects",
    "Most used subjects",
}

def prepare_column(df_in, col, is_multi):
    """Extract ``col`` as a one-column frame of stripped, non-empty strings.

    With ``is_multi`` the cells are split on "," and exploded to one row per
    value (index labels repeat). Missing values and blank strings are removed.
    An empty frame with just that column is returned if ``col`` is absent.
    """
    if col not in df_in.columns:
        return pd.DataFrame(columns=[col])

    answered = df_in[[col]].dropna()
    if is_multi:
        pieces = df_in[col].astype(str).str.split(",")
        # assign aligns on the index, so rows removed by dropna stay removed
        answered = answered.assign(**{col: pieces}).explode(col)

    answered[col] = answered[col].astype(str).str.strip()
    non_blank = answered[col] != ""
    return answered[non_blank]

def order_crosstab(ct, question, demo):
    """Return ``ct`` with rows and columns put into display order.

    Rows follow ``question_orders[question]`` and columns follow
    ``question_orders[demo]`` when such an entry exists; categories not listed
    there are appended in their current order. Without an entry the labels are
    sorted numerically if all of them parse as numbers, else left unchanged.
    """
    # --- rows (answer categories) ---
    if question not in question_orders:
        try:
            numeric_labels = pd.to_numeric(ct.index)
            ct = ct.iloc[np.argsort(numeric_labels)]
        except Exception:
            pass  # non-numeric labels: keep the existing row order
    else:
        listed_rows = [lab for lab in question_orders[question] if lab in ct.index]
        other_rows = [lab for lab in ct.index if lab not in listed_rows]
        ct = ct.reindex(listed_rows + other_rows)

    # --- columns (demographic categories) ---
    if demo not in question_orders:
        try:
            ct = ct[sorted(ct.columns, key=float)]
        except Exception:
            pass  # non-numeric labels: keep the existing column order
    else:
        listed_cols = [lab for lab in question_orders[demo] if lab in ct.columns]
        other_cols = [lab for lab in ct.columns if lab not in listed_cols]
        ct = ct[listed_cols + other_cols]

    return ct

def cramers_v(chi2, ct):
    """Bias-corrected Cramér's V for the contingency table ``ct``.

    ``chi2`` is the chi-square statistic of ``ct``. Returns NaN when the table
    has no observations or when the corrected denominator is not positive.
    """
    n = ct.values.sum()
    if n == 0:
        return np.nan

    r, k = ct.shape
    if n > 1:
        # shrink phi² and the table dimensions by the small-sample correction
        phi2corr = max(0, chi2 / n - ((k - 1)*(r - 1)) / (n - 1))
        rcorr = r - ((r - 1)**2) / (n - 1)
        kcorr = k - ((k - 1)**2) / (n - 1)
    else:
        # a single observation: no correction possible, result becomes NaN
        phi2corr, rcorr, kcorr = np.nan, r, k

    denom = min(rcorr - 1, kcorr - 1)
    if denom > 0:
        return np.sqrt(phi2corr / denom)
    return np.nan

def chi_square_with_checks(ct):
    """Run a chi-square test of independence on ``ct`` and report diagnostics.

    All-zero rows and columns are removed first. The returned dict holds
    ``ok``/``reason``, the test statistics (``chi2``, ``p``, ``dof``),
    ``cramers_v``, the smallest expected count, the share of expected counts
    below 5 and the table of expected counts. When fewer than two rows or
    columns remain, ``ok`` is False and the numeric fields are NaN.
    """
    # delete rows/cols with sum 0
    nonzero_rows = ct.sum(axis=1) > 0
    nonzero_cols = ct.sum(axis=0) > 0
    ct = ct.loc[nonzero_rows, nonzero_cols]

    too_small = ct.shape[0] < 2 or ct.shape[1] < 2
    if ct.empty or too_small:
        return dict(
            ok=False,
            reason="Contingency table too small or empty after filtering.",
            chi2=np.nan, p=np.nan, dof=np.nan,
            cramers_v=np.nan, min_expected=np.nan,
            prop_expected_lt5=np.nan, expected=pd.DataFrame(),
        )

    # Pearson chi-square without Yates' continuity correction
    chi2, p, dof, expected = chi2_contingency(ct.values, correction=False)
    expected_df = pd.DataFrame(expected, index=ct.index, columns=ct.columns)

    if expected_df.size:
        min_exp = expected_df.values.min()
        prop_lt5 = (expected_df.values < 5).mean()
    else:
        min_exp = prop_lt5 = np.nan

    effect_size = cramers_v(chi2, ct)

    return dict(
        ok=True,
        reason="",
        chi2=chi2,
        p=p,
        dof=dof,
        cramers_v=effect_size,
        min_expected=float(min_exp),
        prop_expected_lt5=float(prop_lt5),
        expected=expected_df,
    )

#  build tables + chi² results
# For every (demographic, question) pair: build the contingency table, store
# counts and row percentages in long (tidy) form, run the chi-square test and
# print a compact report.
# NOTE(review): exploded multiple-choice answers contribute several rows per
# respondent, so the observations are not independent for those pairs — confirm
# that the chi-square test is appropriate there.
all_counts_long = []      # long-format counts, one frame per pair
all_perc_long   = []      # long-format row percentages, one frame per pair
all_tests       = []      # per test stats row

for demo, question_list in comparison_pairs_by_demo.items():
    for question in question_list:
        try:
            q_is_multi = question in multiple_choice_questions
            d_is_multi = demo in multiple_choice_demographics

            q_df = prepare_column(df, question, q_is_multi)
            d_df = prepare_column(df, demo, d_is_multi)

            # Join on the respondent (row) index. Exploded multiple-choice
            # columns repeat index labels, which pd.concat(axis=1) cannot align
            # (InvalidIndexError); an index join handles them and yields one
            # row per (answer, group) combination.
            data = q_df.join(d_df, how="inner").dropna()
            if data.empty:
                print(f"⚠️ No overlapping data for '{question}' x '{demo}'. Skipping.")
                continue

            # Contingency table
            ct = pd.crosstab(data[question], data[demo])

            # Same category ordering as in the plotting cell
            ct = order_crosstab(ct, question, demo)

            if ct.empty:
                print(f"⚠️ Empty crosstab for '{question}' x '{demo}'. Skipping.")
                continue

            # Percent (row-normalized)
            ct_percent = ct.div(ct.sum(axis=1), axis=0) * 100

            # ---- store (tidy) ----
            counts_long = (
                ct
                .reset_index()
                .melt(id_vars=ct.index.name or "index", var_name="col_cat", value_name="count")
                .rename(columns={ct.index.name or "index": "row_cat"})
            )
            counts_long["demo"] = demo
            counts_long["question"] = question
            all_counts_long.append(counts_long)

            perc_long = (
                ct_percent
                .reset_index()
                .melt(id_vars=ct_percent.index.name or "index", var_name="col_cat", value_name="percent_row")
                .rename(columns={ct_percent.index.name or "index": "row_cat"})
            )
            perc_long["demo"] = demo
            perc_long["question"] = question
            all_perc_long.append(perc_long)

            # ---- chi-square test + diagnostics ----
            test = chi_square_with_checks(ct)
            all_tests.append({
                "demo": demo,
                "question": question,
                "n": int(ct.values.sum()),
                "rows": ct.shape[0],
                "cols": ct.shape[1],
                "ok": test["ok"],
                "reason": test["reason"],
                "chi2": test["chi2"],
                "dof": test["dof"],
                "p": test["p"],
                "cramers_v": test["cramers_v"],
                "min_expected": test["min_expected"],
                "prop_expected_lt5": test["prop_expected_lt5"]
            })

            # ---- console output (compact) ----
            print(f"\n📊 {question}  ×  {demo}")
            print("Counts:")
            print(ct)
            print("\nRow %:")
            print(ct_percent.round(1))

            if test["ok"]:
                print(f"\nχ²({test['dof']}) = {test['chi2']:.3f}, p = {test['p']:.4f},  Cramér’s V = {test['cramers_v']:.3f}")
                print(f"Assumptions: min(expected) = {test['min_expected']:.2f}, %cells<5 = {100*test['prop_expected_lt5']:.1f}%")
            else:
                print(f"\nChi-square not run: {test['reason']}")

        except Exception as e:
            # Best effort: report the failing pair and continue with the next one
            print(f"❌ Failed for {question} x {demo}: {e}")

# ---------- dataframe outputs ----------
# Combine the per-pair frames; fall back to empty frames with the expected
# columns when no pair produced a result.
crosstabs_counts_long = pd.concat(all_counts_long, ignore_index=True) if all_counts_long else pd.DataFrame(
    columns=["demo","question","row_cat","col_cat","count"]
)
crosstabs_perc_long = pd.concat(all_perc_long, ignore_index=True) if all_perc_long else pd.DataFrame(
    columns=["demo","question","row_cat","col_cat","percent_row"]
)
chi2_results = pd.DataFrame(all_tests, columns=[
    "demo","question","n","rows","cols","ok","reason","chi2","dof","p","cramers_v","min_expected","prop_expected_lt5"
]).sort_values(["demo","question"]).reset_index(drop=True)

print("\n Finished chi-square run.")
print(f"Results: {len(chi2_results)} tests.")

# ---------- excel-Export ----------
EXPORT = True
EXPORT_PATH = os.path.join("..", "Data/test_results", "crosstabs_and_chi2_results.xlsx")

if EXPORT:
    # ExcelWriter does not create missing folders; make sure the target exists
    # so the finished analysis is not lost at the very last step.
    os.makedirs(os.path.dirname(EXPORT_PATH), exist_ok=True)
    with pd.ExcelWriter(EXPORT_PATH, engine="xlsxwriter") as writer:
        crosstabs_counts_long.to_excel(writer, index=False, sheet_name="crosstabs_counts")
        crosstabs_perc_long.to_excel(writer, index=False, sheet_name="crosstabs_row_percent")
        chi2_results.to_excel(writer, index=False, sheet_name="chi2_results")
    print(f"💾 Exported to: {EXPORT_PATH}")


In [None]:
# ==========================================
# POST-HOC ANALYSES (2×2 & Spearman)
# ==========================================

import itertools
from scipy.stats import fisher_exact, spearmanr

# Demographic columns whose cells may hold several comma-separated values
multiple_choice_demographics = {
    "Preferred Subjects",
    "Least preferred Subjects",
    "Most used subjects",
}

# Helpers
def prepare_column(df_in, col, is_multi):
    """Return column ``col`` as a one-column frame of clean string values.

    Missing cells are dropped; with ``is_multi`` each cell is split on "," and
    exploded into one row per value (repeating the index label). Values are
    whitespace-stripped and blank strings removed. If ``col`` does not exist,
    an empty frame containing only that column is returned.
    """
    if col not in df_in.columns:
        return pd.DataFrame(columns=[col])

    values = df_in[[col]].dropna()
    if is_multi:
        split_cells = df_in[col].astype(str).str.split(",")
        # index alignment in assign keeps only the rows that survived dropna
        values = values.assign(**{col: split_cells}).explode(col)

    values[col] = values[col].astype(str).str.strip()
    keep = values[col] != ""
    return values[keep]

def order_crosstab(ct, question, group):
    """Reorder the rows and columns of crosstab ``ct`` for display.

    Rows use ``question_orders[question]`` and columns use
    ``question_orders[group]`` when available, with unlisted categories
    appended in their current order. Otherwise labels are sorted numerically
    if they all parse as numbers and are left untouched if they do not.
    """
    # order of row categories
    if question not in question_orders:
        try:
            row_values = pd.to_numeric(ct.index)
            ct = ct.iloc[np.argsort(row_values)]
        except Exception:
            pass  # labels are not numeric: keep the current order
    else:
        wanted_rows = [lab for lab in question_orders[question] if lab in ct.index]
        extra_rows = [lab for lab in ct.index if lab not in wanted_rows]
        ct = ct.reindex(wanted_rows + extra_rows)

    # order of group columns
    if group not in question_orders:
        try:
            ct = ct[sorted(ct.columns, key=float)]
        except Exception:
            pass  # labels are not numeric: keep the current order
    else:
        wanted_cols = [lab for lab in question_orders[group] if lab in ct.columns]
        extra_cols = [lab for lab in ct.columns if lab not in wanted_cols]
        ct = ct[wanted_cols + extra_cols]
    return ct

# ---------- 2×2-Test (Fisher exact for small counts, else Chi²) ----------
def test_2x2(a, b, c, d, alternative="two-sided", prefer="auto", yates=False):
    
    table = np.array([[int(a), int(b)], [int(c), int(d)]], dtype=int)
    row_sums = table.sum(axis=1)
    col_sums = table.sum(axis=0)
    if (row_sums[0] == 0) or (row_sums[1] == 0) or (col_sums[0] == 0) or (col_sums[1] == 0):
        return {
            "method": "skip", "reason": "degenerate margins (zero row/column total)",
            "p": np.nan, "chi2": np.nan, "dof": 1, "odds_ratio": np.nan, "min_expected": 0.0,
            "table": table,
        }

    N = table.sum()
    expected = np.outer(row_sums, col_sums) / N
    min_exp = expected.min()

    method = "chi2"
    if prefer == "fisher" or (prefer == "auto" and (min_exp < 5 or (table == 0).any())):
        method = "fisher"

    if method == "fisher":
        orat, p = fisher_exact(table, alternative=alternative)
        chi2_val, dof = np.nan, 1
        odds_ratio = float(orat) if np.isfinite(orat) else np.nan
    else:
        try:
            chi2_val, p, dof, _ = chi2_contingency(table, correction=yates)
        except Exception:
            # fallback on fisher
            orat, p = fisher_exact(table, alternative=alternative)
            chi2_val, dof = np.nan, 1
            odds_ratio = float(orat) if np.isfinite(orat) else np.nan
            return {
                "method": "fisher", "p": float(p), "chi2": float(chi2_val), "dof": int(dof),
                "odds_ratio": float(odds_ratio), "min_expected": float(min_exp), "table": table,
            }
        # Odds Ratio robust 
        a_, b_, c_, d_ = table.astype(float).ravel()
        if 0 in (a_, b_, c_, d_):
            a_, b_, c_, d_ = a_+0.5, b_+0.5, c_+0.5, d_+0.5
        odds_ratio = (a_ * d_) / (b_ * c_)

    return {
        "method": method, "p": float(p), "chi2": float(chi2_val), "dof": int(dof),
        "odds_ratio": float(odds_ratio), "min_expected": float(min_exp), "table": table,
    }

# ---------- Nominal × Nominal: pairwise 2×2-Tests  ----------
def posthoc_nominal(question, group, prefer="auto", yates=False, alternative="two-sided"):
    """Pairwise 2×2 post-hoc tests of ``question`` categories between groups.

    For every answer category and every pair of ``group`` categories, the
    count of that answer versus all other answers is compared between the two
    groups with ``test_2x2``.

    Parameters
    ----------
    question, group : column names in the global ``df``.
    prefer, yates, alternative : forwarded to ``test_2x2``.

    Returns
    -------
    pandas.DataFrame with one row per (answer category, group pair); empty
    (with the same columns) when there is no overlapping data or fewer than
    two groups.
    """
    result_columns = [
        "question", "group", "row_cat", "g1", "g2", "method", "p", "chi2", "dof",
        "odds_ratio", "min_expected", "n_g1", "n_g2", "x_g1", "x_g2", "note",
    ]

    # handling of MC-questions
    q_is_multi = question in multiple_choice_questions
    g_is_multi = (group in multiple_choice_demographics) or (group in multiple_choice_questions)

    q_df = prepare_column(df, question, q_is_multi)
    g_df = prepare_column(df, group, g_is_multi)

    # Join on the row index. Exploded multiple-choice columns repeat index
    # labels, which pd.concat(axis=1) cannot align (InvalidIndexError); an
    # index join handles them and yields one row per (answer, group) pair.
    data = q_df.join(g_df, how="inner").dropna()
    if data.empty:
        return pd.DataFrame(columns=result_columns)

    ct = pd.crosstab(data[question], data[group])
    # Defensive: merge repeated labels should the crosstab ever contain any
    if not ct.columns.is_unique: ct = ct.T.groupby(level=0).sum().T
    if not ct.index.is_unique:   ct = ct.groupby(level=0).sum()
    ct = order_crosstab(ct, question, group)

    # NOTE(review): for multiple-choice questions these totals count selected
    # answers, not respondents — confirm this is the intended denominator.
    totals = ct.sum(axis=0)
    results = []

    for row_cat in ct.index:
        for g1, g2 in itertools.combinations(ct.columns, 2):
            # a/c: answers in this category; b/d: all other answers of the group
            a = int(ct.loc[row_cat, g1]); n1 = int(totals[g1]); b = n1 - a
            c = int(ct.loc[row_cat, g2]); n2 = int(totals[g2]); d = n2 - c
            res = test_2x2(a, b, c, d, alternative=alternative, prefer=prefer, yates=yates)

            results.append({
                "question": question, "group": group, "row_cat": row_cat,
                "g1": g1, "g2": g2,
                "method": res["method"], "p": res["p"], "chi2": res["chi2"], "dof": res["dof"],
                "odds_ratio": res["odds_ratio"], "min_expected": res["min_expected"],
                "n_g1": n1, "n_g2": n2, "x_g1": a, "x_g2": c,
                "note": res.get("reason","")
            })

    return pd.DataFrame(results, columns=result_columns)

# ---------- Ordinal × Ordinal: Spearman-Trend  ----------
def map_to_numeric_flexible(series, question):
    """Convert ordinal answers to numbers, trying several sources in turn.

    1. ``likert_mapping[question]`` (keys compared as strings),
    2. the position of each answer in ``question_orders[question]``,
    3. one of the built-in answer scales, if every observed value belongs to it,
    4. plain numeric parsing (unparseable values become NaN).

    Steps 1 and 2 are only accepted when at least one value could be mapped.
    """
    labels = series.astype(str)

    def _mapped_or_none(lookup):
        # accept a lookup only if it translates at least one observed value
        coded = labels.map(lookup)
        return coded.astype(float) if coded.notna().any() else None

    # mapping
    if question in likert_mapping:
        coded = _mapped_or_none({str(key): val for key, val in likert_mapping[question].items()})
        if coded is not None:
            return coded

    # order
    if question in question_orders:
        coded = _mapped_or_none({str(cat): pos for pos, cat in enumerate(question_orders[question])})
        if coded is not None:
            return coded

    # Scales (answer labels as they appear in the survey, lowest level first)
    common_orders = [
        ["Nie","Selten","Manchmal","Oft","Sehr oft"],
        ["Nie","Seltener","Etwa 1 Mal pro Woche","Mehrmals pro Woche","Täglich"],
        ["Gar nicht","Eher wenig","Teils/teils","Eher gut","Sehr gut"],
        ["Gar nicht verlässlich","Wenig verlässlich","Unsicher / Ich habe keine Meinung",
         "Teils/teils","Eher verlässlich","Sehr verlässlich"],
        ["Kein Verständnis","Schlechtes Verständnis","Mittelmässiges Verständnis","Gutes Verständnis","Völliges Verständnis"],
        ["Stört mich sehr","Stört mich ein wenig","Neutral / Mir egal","Finde ich gut", "Finde ich sehr gut"]
    ]
    observed = set(labels.unique())
    for scale in common_orders:
        if observed <= set(scale):
            positions = {cat: pos for pos, cat in enumerate(scale)}
            return labels.map(positions).astype(float)

    # fallback numeric
    return pd.to_numeric(series, errors="coerce")

def make_ordered_group(series, name):
    """Return ``series`` as an ordered ``pandas.Categorical`` of string labels.

    Category order:
    - ``question_orders[name]`` when defined (only labels that occur), followed
      by any other observed labels in lexicographic order;
    - otherwise ascending numeric order when every label parses as a number;
    - otherwise lexicographic order.

    The categories are always the observed string labels themselves, so every
    value receives a valid code (e.g. float columns rendered as "13.0").
    """
    s = series.astype(str)
    uniq = sorted(s.unique())
    if name in question_orders:
        # dict.fromkeys removes repeated labels; Categorical rejects duplicates
        desired = list(dict.fromkeys(str(x) for x in question_orders[name]))
        present = set(uniq)
        cats = [c for c in desired if c in present] + [c for c in uniq if c not in desired]
    else:
        try:
            numeric_key = {c: float(c) for c in uniq}
            if any(v != v for v in numeric_key.values()):
                # "nan" parses as a float but cannot be ordered
                raise ValueError("label is not an orderable number")
            # Sort the original labels by value instead of re-formatting the
            # numbers: re-formatted labels ("13") would not match "13.0".
            cats = sorted(uniq, key=numeric_key.__getitem__)
        except ValueError:
            cats = uniq
    return pd.Categorical(s, categories=cats, ordered=True)

def posthoc_trend_spearman(question, group):
    """Spearman rank correlation between an ordinal question and an ordered group.

    ``question`` is converted with ``map_to_numeric_flexible`` and ``group``
    with ``make_ordered_group``; rows missing either value are ignored.

    Returns
    -------
    pandas.DataFrame with a single row (question, group, n, rho, p, direction),
    or an empty frame with these columns when the pair is a multiple-choice
    column (without "(Count)" in its name), a column is missing, or no usable
    data remains. ``direction`` is "undefined" when rho cannot be computed
    (e.g. constant input).
    """
    result_columns = ["question","group","n","rho","p","direction"]

    # Skip, if question is MC
    if (question in multiple_choice_questions and "(Count)" not in question) or \
       (group in multiple_choice_questions and "(Count)" not in group):
        print(f" Spearman skipped for {question} × {group}")
        return pd.DataFrame(columns=result_columns)

    if question not in df.columns or group not in df.columns:
        return pd.DataFrame(columns=result_columns)

    data = df[[question, group]].dropna().copy()
    if data.empty:
        return pd.DataFrame(columns=result_columns)

    x = map_to_numeric_flexible(data[question], question)
    g_cat = make_ordered_group(data[group], group)
    # Keep the data index: a fresh RangeIndex would be re-aligned against x by
    # the boolean mask below and pair values from different rows once dropna
    # has removed rows.
    codes = pd.Series(g_cat.codes, index=data.index, dtype=float)
    codes[codes < 0] = np.nan  # code -1 marks a value outside the categories

    valid = x.notna() & codes.notna()
    if not valid.any():
        return pd.DataFrame(columns=result_columns)

    rho, p = spearmanr(x[valid].values, codes[valid].values)
    if np.isnan(rho):
        direction = "undefined"
    else:
        direction = "positive" if rho > 0 else ("negative" if rho < 0 else "zero")

    return pd.DataFrame([{
        "question": question, "group": group, "n": int(valid.sum()),
        "rho": float(rho), "p": float(p), "direction": direction
    }])

# run all tests
# Pairwise 2×2 tests for every (question, group) pair in nominal_posthoc_pairs_demo
all_nominal = []
for (q, g) in nominal_posthoc_pairs_demo:
    try:
        df_res = posthoc_nominal(q, g, prefer="auto", yates=False, alternative="two-sided")
        if not df_res.empty:
            all_nominal.append(df_res)
            print(f"Nominal 2×2 computed for {q} × {g} ({len(df_res)} Tests).")
        else:
            print(f" No data for {q} × {g}.")
    except Exception as e:
        # Best effort: report the failing pair and continue with the next one
        print(f" Failed nominal 2×2 for {q} × {g}: {e}")

nominal_posthoc_results = (
    pd.concat(all_nominal, ignore_index=True) if all_nominal
    else pd.DataFrame(columns=["question","group","row_cat","g1","g2","method","p","chi2","dof","odds_ratio","min_expected","n_g1","n_g2","x_g1","x_g2","note"])
)

# Spearman trend for every (question, group) pair in ordinal_posthoc_pairs_demo
all_trend = []
for (q, g) in ordinal_posthoc_pairs_demo:
    try:
        trend = posthoc_trend_spearman(q, g)
        if not trend.empty:
            all_trend.append(trend)
            print(f"Spearman trend computed for {q} × {g}.")
        else:
            print(f" No data for Spearman {q} × {g}.")
    except Exception as e:
        # Best effort: report the failing pair and continue with the next one
        print(f" Failed Spearman for {q} × {g}: {e}")

trend_results = (
    pd.concat(all_trend, ignore_index=True) if all_trend
    else pd.DataFrame(columns=["question","group","n","rho","p","direction"])
)

print(f"Nominal 2×2 tests: {len(nominal_posthoc_results)} | Spearman trends: {len(trend_results)}")

# export
EXPORT = True
EXPORT_PATH = os.path.join("..", "Data/test_results", "posthoc_results.xlsx")
if EXPORT:
    # ExcelWriter does not create missing folders; make sure the target exists
    # so the finished analysis is not lost at the very last step.
    os.makedirs(os.path.dirname(EXPORT_PATH), exist_ok=True)
    with pd.ExcelWriter(EXPORT_PATH, engine="xlsxwriter") as writer:
        nominal_posthoc_results.to_excel(writer, index=False, sheet_name="nominal_pairwise_2x2")
        trend_results.to_excel(writer, index=False, sheet_name="spearman_trend")
    print(f" Exported to: {EXPORT_PATH}")


In [None]:
# Graph: AI USAGE BY AGE --------------------------

# Usage categories in stacking order (first entry sits at the bottom of each bar)
order = ["Daily", "Several times per week", "About once per week", "Rarely", "Never"]

# Ages shown along the x-axis, left to right
ages = [13, 14, 15, 16, 17, 18, 19]

# Hard-coded counts: one list per age, values in the order of `order`
counts_by_age = {
    13: [2, 7, 14, 9, 3],
    14: [1, 12, 14, 14, 4],
    15: [9, 11, 3, 4, 0],
    16: [10, 14, 4, 4, 3],
    17: [21, 15, 0, 0, 0],
    18: [9, 5, 3, 0, 0],
    19: [4, 4, 0, 0, 0],
}
counts = pd.DataFrame(counts_by_age, index=order).loc[order, ages]

# One shade of blue per category, darkest for the first one
colors = plt.cm.Blues([0.90, 0.75, 0.60, 0.45, 0.30])

# Stacked bars: one bar per age, segments by usage frequency
ax = counts.T.plot.bar(stacked=True, figsize=(10, 6), color=colors, width=0.9)

axis_font = {"fontsize": 14, "fontweight": "bold"}
ax.set_title("AI usage (school + free time) by age", fontsize=16, fontweight="bold")
ax.set_ylabel("Number of respondents", **axis_font)
ax.set_xlabel("Age", **axis_font)
plt.xticks(rotation=0, fontsize=12)
plt.yticks(fontsize=12)

# Legend in reversed order so it reads top-down like the stacked segments
handles, labels = ax.get_legend_handles_labels()
ax.legend(
    list(reversed(handles)), list(reversed(labels)),
    title="Usage frequency",
    loc="upper right",    # placed inside the plot area
    fontsize=12, title_fontsize=13,
    framealpha=0.9,       # semi-transparent legend box
)

plt.tight_layout()
plt.show()

In [None]:
# Graph: CONCERNS AI BY AGE -----------------

# Answer categories in stacking order (first entry sits at the bottom of each bar)
order = ["Yes", "No", "Never thought about it"]

# Ages shown along the x-axis, left to right
ages = [13, 14, 15, 16, 17, 18, 19]

# Hard-coded counts: one list per age, values in the order of `order`
data = {
    13: [6, 11, 18],
    14: [22, 10, 13],
    15: [9, 10, 8],
    16: [14, 15, 6],
    17: [17, 17, 2],
    18: [6, 8, 3],
    19: [5, 3, 0],
}
counts = pd.DataFrame(data, index=order).loc[order, ages]

# Fixed color per answer
colors = {
    "Yes": "green",
    "No": "red",
    "Never thought about it": "#4A90E2"
}
segment_colors = [colors[answer] for answer in order]

# Stacked bars: one bar per age, segments by answer
ax = counts.T.plot.bar(stacked=True, figsize=(10, 6), color=segment_colors, width=0.9)

# title and axis
axis_font = {"fontsize": 14, "fontweight": "bold"}
ax.set_title("Concerns about AI by age", fontsize=16, fontweight="bold")
ax.set_ylabel("Number of respondents", **axis_font)
ax.set_xlabel("Age", **axis_font)

plt.xticks(rotation=0, fontsize=12)
plt.yticks(fontsize=12)

# legend (same order as the stacked segments)
handles, labels = ax.get_legend_handles_labels()
ax.legend(
    handles, labels,
    title="Concerns",
    loc="upper right",
    fontsize=12, title_fontsize=13,
    framealpha=0.9,
)

plt.tight_layout()
plt.show()


In [None]:
# ----- graph: AI usage by hours per week spent for school

# Hard-coded counts: one list per hours group, values in the order of `index`
index = ["Never", "Rarely", "Sometimes", "Often", "Very often"]
data = {
    "0-1 hours/week":     [3, 8, 7, 11, 4],
    "2-5 hours/week":     [5, 25, 41, 19, 17],
    "More than 5 hours/week": [3, 5, 16, 15, 14]
}

# x-axis runs from most to least frequent usage;
# bars are stacked with the largest hours group at the bottom
order_x = ["Very often", "Often", "Sometimes", "Rarely", "Never"]
order_stack = ["More than 5 hours/week", "2-5 hours/week", "0-1 hours/week"]
counts = pd.DataFrame(data, index=index).loc[order_x, order_stack]

# Fixed color per hours group
colors = {
    "More than 5 hours/week": "#0c3d86",
    "2-5 hours/week": "#3181b6",
    "0-1 hours/week": "#76b6f1"
}
segment_colors = [colors[group] for group in order_stack]

# Stacked bars: one bar per usage frequency, segments by hours group
ax = counts.plot.bar(stacked=True, figsize=(10, 6), color=segment_colors, width=0.9)

axis_font = {"fontsize": 14, "fontweight": "bold"}
ax.set_title("AI usage frequency for education by hours per week spent for school in free time",
             fontsize=16, fontweight="bold")
ax.set_ylabel("Number of respondents", **axis_font)
ax.set_xlabel("Usage frequency", **axis_font)
plt.xticks(rotation=0, fontsize=12)
plt.yticks(fontsize=12)

# Legend listed from the lightest to the darkest color, independent of stacking order
handles, labels = ax.get_legend_handles_labels()
legend_display = ["0-1 hours/week", "2-5 hours/week", "More than 5 hours/week"]
lookup = dict(zip(labels, handles))
ax.legend(
    [lookup[name] for name in legend_display], legend_display,
    title="Hours per week", loc="upper right",
    fontsize=12, title_fontsize=13, framealpha=0.9,
)

plt.tight_layout()
plt.show()

In [None]:
# Graph: Most used subjects by gender 

import pandas as pd
import matplotlib.pyplot as plt

# Subjects in display order along the x-axis
subjects = [
    "History", "German", "Mathematics", "French", "Geography",
    "Biology", "Chemistry", "English", "Physics"
]

# Hard-coded percentages per gender, values in the order of `subjects`
data = {
    "Male":   [17.3, 14.7, 14.1, 11.5,  8.9,  6.3,  7.3,  5.8,  5.2],
    "Female": [19.8, 11.8, 11.4, 12.2, 11.0, 10.1,  7.6,  7.6,  4.6],
    "No answer": [12.5, 12.5, 18.8, 12.5, 18.8, 0.0, 6.2, 0.0, 18.8],
}

# Use a dedicated name: rebinding the global `df` here would replace the
# survey data that the analysis cells above read when they are re-run.
subject_share_by_gender = pd.DataFrame(data, index=subjects)

# Grouped bars: three bars (one per gender) for every subject
ax = subject_share_by_gender.plot(
    kind="bar",
    figsize=(11, 4.5),
    width=0.85,
    color=["#2b8cbe", "#de2d26", "#2ca25f"]  # male = blue, female = red, no answer = green
)

ax.set_title("Most used subjects – by gender", fontsize=14)
ax.set_ylabel("Percentage (%)", fontsize=12)
ax.set_xlabel("Subject", fontsize=12)
plt.xticks(rotation=45, ha="right", fontsize=11)
plt.yticks(fontsize=11)
plt.grid(False)
ax.legend(title="Gender", loc="upper right")

plt.tight_layout()
plt.show()