In [None]:
# ==========================================
# SETUP BLOCK - loads data, applies project helpers
# ==========================================

import os
import sys
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

sys.path.append(os.path.abspath(".."))

from Helper_functions import (
    clean_up_subjects,
    calculate_true_false_score,
    calculate_internet_terms_understanding_score,
    group_internet_understanding,
    analyze_distribution,
    analyze_subject_distribution
)

from answer_categories import COLUMN_ALIASES

# Global plot styling. The seaborn theme is applied first and the default
# figure size afterwards, so the explicit size is the last thing written.
sns.set(style="whitegrid")
plt.rcParams["figure.figsize"] = (10, 5)

# Read the survey workbook (path is relative to the notebook's directory).
DATA_FILE = os.path.join("..", "Data", "Fertige Tabelle.xlsx")
df = pd.read_excel(DATA_FILE)

# Headers are cast to str and stripped of surrounding whitespace before the
# project-wide aliases from COLUMN_ALIASES are applied.
df.columns = df.columns.astype(str).str.strip()
df = df.rename(columns=COLUMN_ALIASES)

# Pass every subject column that is present through clean_up_subjects.
for col in ("Most used subjects", "Preferred Subjects", "Least preferred Subjects"):
    if col not in df.columns:
        continue
    df = clean_up_subjects(df, col)

# The true/false score is only computed when all six item columns exist.
true_false_cols = [f"True/False_{item_no}" for item_no in range(1, 7)]
if set(true_false_cols).issubset(df.columns):
    df = calculate_true_false_score(df)

# The internet-terms score (and its grouping) needs at least one item column.
if any(name.startswith("Internet terms_") for name in df.columns):
    df = calculate_internet_terms_understanding_score(df)
    df = group_internet_understanding(df)

print("Setup complete – DataFrame loaded and preprocessed")
print(f"Rows: {len(df)}, Columns: {len(df.columns)}")


In [None]:
# ------- Distribution for demographics with graphs and tables -------

def show_distribution_with_plot(df, column, title=None, color="#1f77b4", sort_values=False):
    """Draw a bar chart of one column's percentage distribution and print its table.

    The function returns silently (no plot, no output) when ``column`` is not
    in ``df`` or when ``analyze_distribution`` gives back ``None`` or an empty
    frame.

    Args:
        df: DataFrame containing the data to summarise.
        column: Name of the column whose distribution is shown.
        title: Optional plot title; defaults to the display label derived
            from ``column``.
        color: Bar colour handed to the plot call.
        sort_values: If true, rows are ordered by "Percentage", largest first.
    """
    if column not in df.columns:
        return

    distribution = analyze_distribution(df, column, return_df=True)
    if distribution is None or distribution.empty:
        return

    if sort_values:
        distribution = distribution.sort_values("Percentage", ascending=False)

    # Columns with a hand-written display label; everything else just has
    # its underscores replaced by spaces.
    label_overrides = {
        "Educational Level parent_1": "Parental education – Father / Guardian",
        "Educational Level parent_2": "Parental education – Mother / Guardian",
        "CRT_points": "CRT Score",
    }
    axis_label = label_overrides.get(column, column.replace("_", " "))
    heading = title or axis_label

    ax = distribution.plot.bar(
        y="Percentage",
        legend=False,
        title=heading,
        color=color,
    )

    bold_label = {"fontsize": 14, "fontweight": "bold"}
    ax.set_ylabel("Percentage", **bold_label)
    ax.set_xlabel(axis_label, **bold_label)
    ax.set_title(heading, fontsize=16, fontweight="bold")

    tick_style = {"labelsize": 12, "width": 2}
    ax.tick_params(axis="x", rotation=45, **tick_style)
    ax.tick_params(axis="y", **tick_style)

    plt.tight_layout()
    plt.show()

    # Plain-text version of the plotted numbers, printed below the figure.
    print(f"\nTable for: {heading}")
    print(distribution.to_string())


# Work on a copy so the German answer labels in the original df stay untouched.
df_en = df.copy()

# German survey answers -> English display labels.
gender_map = {
    "Weiblich": "Female",
    "Männlich": "Male",
    "Keine Angabe": "No answer"
}

# Several German spellings of the same degree collapse onto one English label.
parent_level_map = {
    "Abschluss an einer Universität / ETH": "University / ETH",
    "Abschluss an einer Universität oder ETH": "University / ETH",
    "Abschluss an einer Hochschule / Fachhochschule": "University of applied sciences",
    "Abschluss an einer Fachhochschule / Hochschule": "University of applied sciences",
    "Berufsabschluss": "Vocational qualification",
    "Abschluss eines Doktorat / Professur": "Doctorate / Prof.",
    "Obligatorische Schule": "Mandatory school",
    "Matura": "Matura",
    "Ich weiss es nicht": "Don't know"
}

# Translate each column exactly once, skipping columns that are absent.
# The nullable "string" dtype is used on purpose: it keeps missing answers as
# <NA>. Do not add a second pass with a plain astype(str); that stringifies
# missing answers ("nan" / "<NA>"), so they would be treated as a regular
# answer category downstream.
_label_translations = {
    "Gender": gender_map,
    "Educational Level parent_1": parent_level_map,
    "Educational Level parent_2": parent_level_map,
}
for col, mapping in _label_translations.items():
    if col in df_en.columns:
        df_en[col] = df_en[col].astype("string").str.strip().replace(mapping)

# demographic analysis
# Translated columns are read from df_en; all other columns come from df.

show_distribution_with_plot(df_en, "Gender", "Gender Distribution")
show_distribution_with_plot(df, "Age", "Age Distribution")
show_distribution_with_plot(df, "Education Level", "Students’ Education Level")
show_distribution_with_plot(df, "Hours per week for school", "Hours per Week Distribution")

show_distribution_with_plot(df_en, "Educational Level parent_1", sort_values=True)
show_distribution_with_plot(df_en, "Educational Level parent_2", sort_values=True)

show_distribution_with_plot(df, "CRT_points", "CRT Score Distribution")

# ---- Subject Preferences ----

def show_subject_distribution_with_plot(df, column, title=None):
    """Draw a bar chart of subject mentions for one column and print its table.

    Returns silently when ``column`` is not in ``df`` or when
    ``analyze_subject_distribution`` gives back ``None`` or an empty frame.

    Args:
        df: DataFrame containing the data to summarise.
        column: Name of the subject column to analyse; also used as the
            x-axis label.
        title: Optional plot title; defaults to ``column``.
    """
    if column not in df.columns:
        return

    subject_table = analyze_subject_distribution(df, column, return_df=True)
    if subject_table is None or subject_table.empty:
        return

    heading = title or column

    # The plot call creates the figure; labels are set on the Axes it returns.
    ax = subject_table.plot(
        kind="bar",
        y="Percentage of respondents",
        legend=False,
        title=heading,
    )
    ax.set_ylabel("Percentage of respondents")
    ax.set_xlabel(column)
    plt.xticks(rotation=45, ha="right")
    plt.tight_layout()
    plt.show()

    # Plain-text version of the plotted numbers, printed below the figure.
    print(f"\nTable for: {heading}")
    print(subject_table.to_string())

# One chart plus table per subject column: favourites first, then least favourites.
for subject_column, chart_title in (
    ("Preferred Subjects", "Favorite Subjects"),
    ("Least preferred Subjects", "Least Favorite Subjects"),
):
    show_subject_distribution_with_plot(df, subject_column, chart_title)
