In [None]:
# Setup
import os
import sys
import pandas as pd

# -------- Distribution in table form ---------

# Add helper function path
sys.path.append(os.path.abspath(".."))
from Helper_funtions import clean_up_subjects, analyze_subject_distribution, analyze_distribution

# Load data
# Read the finished survey table and strip surrounding whitespace from the
# header names so the column lookups below match.
file_path = os.path.join("..", "Data", "Fertige Tabelle.xlsx")
df = pd.read_excel(file_path)
df.columns = df.columns.str.strip()

# ---- Demographic Analysis ----

parent_columns = ["Educational Level parent_1", "Educational Level parent_2"]

# (column, table title) pairs, listed in the order the tables are produced:
# gender, age, education level, weekly hours, both parents, CRT score.
demographic_tables = [
    ("Gender", "Gender Distribution"),
    ("Age", "Age Distribution"),
    ("Education Level", "Students’ Education Level"),
    ("Hours per week for school", "Hours per week distribution"),
    *[(parent_col, f"Parental Education – {parent_col}") for parent_col in parent_columns],
    ("CRT_points", "Distribution of CRT Score"),
]
for column_name, table_title in demographic_tables:
    analyze_distribution(df, column_name, table_title)


# ---- Subject Preferences ----

subject_columns = ["Preferred Subjects", "Least preferred Subjects"]
preferred_col, least_preferred_col = subject_columns

# Clean up subject columns before analyzing; each call returns the frame that
# is fed into the next one.
for subject_col in subject_columns:
    df = clean_up_subjects(df, subject_col)

# Favorite subjects
analyze_subject_distribution(df, preferred_col)

# Least favorite subjects
analyze_subject_distribution(df, least_preferred_col)


In [None]:
# Setup
import os
import sys
import pandas as pd
import matplotlib.pyplot as plt

# ------- Distribution for demographics with graphs and tables -------

# Add helper function path
sys.path.append(os.path.abspath(".."))
from Helper_funtions import clean_up_subjects, analyze_subject_distribution_changed, analyze_distribution_changed
from answer_categories import question_orders

# Load data
# Read the finished survey table, then derive `df` with whitespace-stripped
# header names instead of overwriting the columns of the loaded frame.
file_path = os.path.join("..", "Data", "Fertige Tabelle.xlsx")
raw_table = pd.read_excel(file_path)
df = raw_table.set_axis(raw_table.columns.str.strip(), axis="columns")

# Function to show both plot and table
def show_distribution_with_plot(df, column, title=None):
    """Draw a bar chart of a column's distribution and print the table behind it.

    Args:
        df: Survey DataFrame handed through to ``analyze_distribution_changed``.
        column: Name of the column to analyse; also used as the x-axis label.
        title: Optional heading for the chart and the printed table. Falls back
            to ``column`` when omitted or empty.

    Returns:
        None. Nothing is plotted or printed when the helper returns ``None``.
    """
    # presumably a DataFrame with a "Percentage" column - verify against the helper
    distribution = analyze_distribution_changed(df, column, return_df=True)
    if distribution is None:
        return

    heading = title or column

    axes = distribution.plot(kind="bar", y="Percentage", legend=False, title=heading)
    axes.set_ylabel("Percentage")
    axes.set_xlabel(column)
    plt.xticks(rotation=45, ha="right")
    plt.tight_layout()
    plt.show()

    # Also print table for easy Excel copy: once aligned for reading and once
    # tab-separated for pasting into a spreadsheet.
    print(f"\n🔹 Tabelle für: {heading}")
    print(distribution.to_string())
    print(distribution.to_csv(sep='\t', index=True))

# ---- Demographic Analysis ----

parent_columns = ["Educational Level parent_1", "Educational Level parent_2"]

# (column, chart title) pairs in the order the charts are rendered.
demographic_plots = [
    ("Gender", "Gender Distribution"),
    ("Age", "Age Distribution"),
    ("Education Level", "Students’ Education Level"),
    ("Hours per week for school", "Hours per Week Distribution"),
    *[(parent_col, f"Parental Education – {parent_col}") for parent_col in parent_columns],
    ("CRT_points", "CRT Score Distribution"),
]
for column_name, chart_title in demographic_plots:
    show_distribution_with_plot(df, column_name, chart_title)

# ---- Subject Preferences ----

# Clean up subject columns before analyzing; each call returns the frame that
# is passed to the next one.
subject_columns = ["Preferred Subjects", "Least preferred Subjects"]
for subject_col in subject_columns:
    df = clean_up_subjects(df, subject_col)

def show_subject_distribution_with_plot(df, column, title=None):
    """Draw a bar chart of subject mentions and print the table behind it.

    Args:
        df: Survey DataFrame handed through to
            ``analyze_subject_distribution_changed``.
        column: Name of the subject column to analyse; also the x-axis label.
        title: Optional heading for the chart and the printed table. Falls back
            to ``column`` when omitted or empty.

    Returns:
        None. Nothing is plotted or printed when the helper returns ``None``.
    """
    # presumably a DataFrame with a "Percentage of respondents" column - verify against the helper
    distribution = analyze_subject_distribution_changed(df, column, return_df=True)
    if distribution is None:
        return

    heading = title or column

    axes = distribution.plot(
        kind="bar",
        y="Percentage of respondents",
        legend=False,
        title=heading,
    )
    axes.set_ylabel("Percentage of respondents")
    axes.set_xlabel(column)
    plt.xticks(rotation=45, ha="right")
    plt.tight_layout()
    plt.show()

    # Print the table twice: aligned for reading, then tab-separated for pasting.
    print(f"\n🔹 Tabelle für: {heading}")
    print(distribution.to_string())
    print(distribution.to_csv(sep='\t', index=True))

# Chart and table for the favourite and the least favourite subjects.
subject_plots = (
    ("Preferred Subjects", "Favorite Subjects"),
    ("Least preferred Subjects", "Least Favorite Subjects"),
)
for subject_col, chart_title in subject_plots:
    show_subject_distribution_with_plot(df, subject_col, chart_title)


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import sys

# ------------ distribution of demographics of clusters, percentage is based on the category
# e.g., 20% of students in cluster 1 are 17 years old------------

# Setup 

sys.path.append(os.path.abspath(".."))
from answer_categories import question_orders
from lists import demographic_columns, multiple_choice_questions

# File path to clustered students
file_path = os.path.join("..", "Data", "Clustered_Students.xlsx")
# sheet_name=None loads every sheet; the result maps sheet name -> DataFrame.
xls = pd.read_excel(file_path, sheet_name=None)

# Define which of them are multiple-choice
# NOTE(review): this list is not read anywhere below; the loop decides via
# `multiple_choice_questions` instead. Confirm which of the two is intended.
multiple_choice_demographics = ["Used AI"]

# Plot style
sns.set(style="whitegrid")

for sheet_name, cluster_df in xls.items():
    print(f"\n📊 Demographic distribution in {sheet_name}:")

    for column in demographic_columns:
        if column not in cluster_df.columns:
            print(f"⚠️ Column '{column}' not found in {sheet_name}. Skipping.")
            continue

        if column in multiple_choice_questions:
            # Comma-separated answers: one row per selected option, trimmed.
            answers = cluster_df[[column]].dropna()
            answers = answers.assign(**{column: cluster_df[column].str.split(",")})
            answers = answers.explode(column)
            counts_series = answers[column].str.strip()
        else:
            # Single-choice answers are compared as strings.
            counts_series = cluster_df[column].dropna().astype(str)

        # Absolute counts and their share of all counted entries (in percent).
        # NOTE(review): for exploded multiple-choice columns the share is per
        # selected option, not per student - confirm this matches the y-label.
        tally_kwargs = {"dropna": False}
        absolute_counts = counts_series.value_counts(**tally_kwargs)
        relative_percent = counts_series.value_counts(normalize=True, **tally_kwargs) * 100

        summary_df = pd.DataFrame({
            "Count": absolute_counts,
            "Percentage": relative_percent.round(1)
        })

        # Use the predefined answer order when one exists; only categories
        # present in the data and listed in question_orders are kept.
        if column in question_orders:
            ordered_cats = [
                label
                for label in map(str, question_orders[column])
                if label in summary_df.index
            ]
            summary_df = summary_df.reindex(ordered_cats)

        print(f"\n{column} distribution in {sheet_name}:")
        print(summary_df)

        # One bar chart per column and cluster, fixed to a 0-100 % axis.
        fig, ax = plt.subplots(figsize=(8, 4))
        sns.barplot(
            x=summary_df.index,
            y=summary_df["Percentage"],
            palette="Set2",
            ax=ax,
        )
        ax.set_title(f"{column} in {sheet_name} (percent)")
        ax.set_xlabel(column)
        ax.set_ylabel("Percentage of students")
        plt.xticks(rotation=45)
        ax.set_ylim(0, 100)
        plt.tight_layout()
        plt.show()


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import sys
from IPython.display import display, HTML

# ------------ Distribution of demographics across clusters; percentage is based on the value,
# e.g., 20% of all 17-year-old students are in cluster 1 ------------

#  Setup 
sys.path.append(os.path.abspath(".."))
from answer_categories import question_orders
from lists import multiple_choice_questions, demographic_columns

# File paths
cluster_file = os.path.join("..", "Data", "Clustered_Students.xlsx")
full_data_file = os.path.join("..", "Data", "Fertige Tabelle.xlsx")

# Load full dataset (needed for category-level reference)
df_total = pd.read_excel(full_data_file)
df_total.columns = df_total.columns.str.strip()

# Load all clusters (each sheet = one cluster); the result maps sheet name -> DataFrame
xls = pd.read_excel(cluster_file, sheet_name=None)

# Set plot style
sns.set(style="whitegrid")


def _explode_choices(frame, column):
    """Return one trimmed answer per row for a comma-separated column.

    Rows with a missing value in ``column`` are dropped before splitting.
    """
    exploded = (
        frame[[column]].dropna()
        .assign(**{column: frame[column].str.split(",")})
        .explode(column)
    )
    return exploded[column].str.strip()


# --------- Loop through clusters ----------
for sheet_name, cluster_df in xls.items():
    print(f"\n📊 Inverted Demographic Analysis in {sheet_name}:")

    for column in demographic_columns:
        if column not in cluster_df.columns or column not in df_total.columns:
            print(f"⚠️ Column '{column}' not found. Skipping.")
            continue

        # --- Handle Multiple Choice ---
        if column in multiple_choice_questions:
            # Explode in both cluster and full data so options are counted individually
            cluster_series = _explode_choices(cluster_df, column)
            total_series = _explode_choices(df_total, column)
        else:
            # For single-choice: compare values as strings
            cluster_series = cluster_df[column].dropna().astype(str)
            total_series = df_total[column].dropna().astype(str)

        # --- Counts ---
        total_counts = total_series.value_counts()
        cluster_counts = cluster_series.value_counts()

        # --- Share of each category's total that falls into this cluster ---
        # Categories absent from the cluster align to NaN and are reported as 0.
        percent_in_cluster = (cluster_counts / total_counts * 100).round(1)
        percent_in_cluster = percent_in_cluster.fillna(0)

        # --- Combine into display table ---
        summary_df = pd.DataFrame({
            "Total": total_counts,
            f"{sheet_name} Count": cluster_counts,
            f"{sheet_name} % of Total": percent_in_cluster
        }).fillna(0)

        # --- Apply order if defined, otherwise sort purely numeric labels ---
        if column in question_orders:
            ordered = [str(val) for val in question_orders[column] if str(val) in summary_df.index]
            summary_df = summary_df.reindex(ordered)
        elif summary_df.index.to_series().str.isnumeric().all():
            summary_df = summary_df.sort_index(key=lambda x: x.astype(int))

        # --- Show the table once, with a caption and one decimal place ---
        # (The original rendered the identical styled table twice in a row.)
        styled_table = summary_df.style.set_caption(f"{column} – {sheet_name}").format(precision=1)
        display(HTML(styled_table.to_html()))

        # --- Plotting ---
        plt.figure(figsize=(8, 4))
        sns.barplot(
            x=summary_df.index,
            y=summary_df[f"{sheet_name} % of Total"],
            palette="Set2"
        )
        plt.title(f"{column} – % of total in {sheet_name}")
        plt.xlabel(column)
        plt.ylabel("Percent of total students")
        plt.xticks(rotation=45)
        plt.ylim(0, 100)
        plt.tight_layout()
        plt.show()


In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import chi2_contingency
import os
import sys

# ------ Significance tests for the analysis of the clusters ------

# Add helper path and import question_orders
sys.path.append(os.path.abspath(".."))
from answer_categories import question_orders
from lists import demographic_columns, multiple_choice_questions

# Load full dataset (for global totals) and strip whitespace from its headers
full_data_path = os.path.join("..", "Data", "Fertige Tabelle.xlsx")
full_df = pd.read_excel(full_data_path)
full_df.columns = full_df.columns.str.strip()

# Load clustered file (with sheet per cluster); result maps sheet name -> DataFrame
clustered_file = os.path.join("..", "Data", "Clustered_Students.xlsx")
xls = pd.read_excel(clustered_file, sheet_name=None)

# Combine all clusters into one DataFrame. Each sheet is copied and tagged with
# its sheet name in a "Cluster" column; the loaded sheets stay untouched.
cluster_dfs = [
    sheet.assign(Cluster=cluster_name)
    for cluster_name, sheet in xls.items()
]
df_clusters = pd.concat(cluster_dfs, ignore_index=True)

# ---------- Handle Exploding of Multiple-Choice Demographics ----------
def preprocess_for_demo_analysis(df, column, multiple_choice=None):
    """Explode a comma-separated multiple-choice column into one row per answer.

    Args:
        df: DataFrame containing ``column``.
        column: Name of the demographic column to prepare.
        multiple_choice: Optional collection of column names that hold
            comma-separated answers. When omitted, the notebook-level
            ``multiple_choice_questions`` list is used, as before.

    Returns:
        ``df`` itself when ``column`` is not a multiple-choice column.
        Otherwise a new DataFrame in which rows with a missing ``column`` are
        dropped, every answer has its own row (other columns repeated) and the
        answers are stripped of surrounding whitespace. The input is not modified.
    """
    if multiple_choice is None:
        multiple_choice = multiple_choice_questions

    if column not in multiple_choice:
        return df

    # dropna already returns a new frame, so only the kept rows are copied.
    exploded = df.dropna(subset=[column]).copy()
    # Cast to str first so non-string cells are split as text as well.
    exploded[column] = exploded[column].astype(str).str.split(",")
    exploded = exploded.explode(column)
    exploded[column] = exploded[column].str.strip()
    return exploded

# ---------- Loop through demographics ----------
def _ordered_categories(categories, col):
    """Return the observed categories of ``col`` in display order.

    Categories listed in ``question_orders[col]`` come first, in that order;
    observed categories missing from the list are appended, not dropped, so the
    contingency tables keep every observation. Without a predefined order the
    categories are sorted numerically when every label converts to float, and
    are otherwise left in their existing order.
    """
    observed = list(categories)
    if col in question_orders:
        # Match on the string form so numeric answers still line up with
        # string entries in the predefined order.
        by_label = {str(value): value for value in observed}
        listed = list(dict.fromkeys(
            by_label[str(item)] for item in question_orders[col] if str(item) in by_label
        ))
        unlisted = [value for value in observed if value not in listed]
        return listed + unlisted
    try:
        return sorted(observed, key=float)
    except (TypeError, ValueError):
        # Non-numeric labels: keep the order produced by the crosstab.
        return observed


def _counts_with_percent(counts, percents):
    """Format every cell as "<count> (<percent>%)" with one decimal place."""
    return counts.astype(str) + " (" + percents.round(1).astype(str) + "%)"


for col in demographic_columns:
    print(f"\n📊 {col} – Significance tests between clusters")

    if col not in df_clusters.columns or col not in full_df.columns:
        print(f"⚠️ Column '{col}' not found. Skipping.")
        continue

    # Prepare data (explodes multiple-choice columns) and drop missing values
    demo_df = preprocess_for_demo_analysis(df_clusters, col).dropna(subset=[col, "Cluster"])
    full_demo_df = preprocess_for_demo_analysis(full_df, col).dropna(subset=[col])

    # --- Variant 1: Percentage within cluster ---
    # Rows are clusters, columns are categories, so the category order has to
    # be applied to the columns (the original filtered the cluster rows, which
    # emptied the table for every column listed in question_orders).
    cross1 = pd.crosstab(demo_df["Cluster"], demo_df[col])
    category_order = _ordered_categories(cross1.columns, col)
    cross1 = cross1.loc[:, category_order]
    perc1 = cross1.div(cross1.sum(axis=1), axis=0) * 100

    print("\n🔸 Variant 1: % per cluster (how each cluster is composed)")
    display_table = _counts_with_percent(cross1, perc1)
    display(display_table)

    # --- Variant 2: % of total value per cluster ---
    # Rows are categories here; the denominator is the count in the full dataset.
    cross2 = pd.crosstab(demo_df[col], demo_df["Cluster"]).loc[category_order]
    total_counts = full_demo_df[col].value_counts().rename("Total")
    perc2 = cross2.div(total_counts.reindex(cross2.index), axis=0) * 100

    print("\n🔸 Variant 2: % of all students with this value that fall into a cluster")
    display_table2 = _counts_with_percent(cross2, perc2)
    display(display_table2)

    # --- Chi² test for both ---
    # cross2 is the transpose of cross1, so both tests are expected to report
    # the same p-value.
    # NOTE(review): exploded multiple-choice columns contribute several rows per
    # respondent, which conflicts with the independence assumption of the
    # chi² test - confirm whether these p-values should be reported.
    try:
        _stat1, p1, _dof1, _expected1 = chi2_contingency(cross1)
        print(f"\n📌 Chi² test Variant 1 (cluster composition): p = {p1:.4f}")
    except Exception as e:
        print(f"❌ Chi² failed for Variant 1: {e}")

    try:
        _stat2, p2, _dof2, _expected2 = chi2_contingency(cross2)
        print(f"📌 Chi² test Variant 2 (distribution of values): p = {p2:.4f}")
    except Exception as e:
        print(f"❌ Chi² failed for Variant 2: {e}")
