In [None]:
# ==========================================
# SETUP BLOCK
# ==========================================

import os
import sys
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

sys.path.append(os.path.abspath(".."))

from Helper_functions import (
    clean_up_subjects,
    calculate_true_false_score,
    calculate_internet_terms_understanding_score,
    group_internet_understanding,
)

from lists import (
    demographic_columns,
    multiple_choice_questions,
    likert_questions,
    LIKERT_VALUE_MAPS,
)

from answer_categories import question_orders, COLUMN_ALIASES

# Global plot styling applied to every figure created afterwards.
sns.set(style="whitegrid")
plt.rcParams["figure.figsize"] = (10, 5)

# Load the survey workbook, trim whitespace from the header names and apply
# the project's column aliases.
DATA_FILE = os.path.join("..", "Data", "Fertige Tabelle.xlsx")
df = pd.read_excel(DATA_FILE)
df.columns = df.columns.astype(str).str.strip()
df = df.rename(columns=COLUMN_ALIASES)

# Each preprocessing step below only runs when its input columns are present.
# The helpers come from Helper_functions; what exactly they add or change is
# not visible here (presumably cleaned subject lists and derived score
# columns -- confirm against the helper module).
for col in ["Most used subjects", "Preferred Subjects", "Least preferred Subjects"]:
    if col in df.columns:
        df = clean_up_subjects(df, col)

true_false_cols = [f"True/False_{i}" for i in range(1, 7)]
if all(c in df.columns for c in true_false_cols):
    df = calculate_true_false_score(df)

if any(c.startswith("Internet terms_") for c in df.columns):
    df = calculate_internet_terms_understanding_score(df)
    df = group_internet_understanding(df)

# The en dash is written as an escape so the message cannot be garbled again
# by a wrong file encoding (it previously showed up as mojibake).
print("Setup complete \u2013 DataFrame loaded and preprocessed")
print(f"Rows: {len(df)}, Columns: {len(df.columns)}")


In [None]:
# ==========================================
# CREATION OF CLUSTERS with PCA and KMEANS
# ==========================================

from pandas.api.types import CategoricalDtype

# Only respondents with an answer to every Likert question are analysed.
df_clean = df.dropna(subset=likert_questions).copy()

# Numeric encoding of the Likert answers, one column per question.
df_numeric = pd.DataFrame(index=df_clean.index)
for col in likert_questions:
    s = df_clean[col]

    if col not in LIKERT_VALUE_MAPS:
        # No label map configured for this question: coerce the raw values.
        df_numeric[col] = pd.to_numeric(s, errors="coerce")
        continue

    # Translate the trimmed answer text through the question's value map.
    mapped = s.astype("string").str.strip().map(LIKERT_VALUE_MAPS[col])
    df_numeric[col] = pd.to_numeric(mapped, errors="coerce")

# Rows with any value that could not be encoded are removed, and df_clean is
# cut down to the same rows so both frames keep matching indexes.
df_numeric = df_numeric.dropna()
df_clean = df_clean.loc[df_numeric.index].copy()

# Standardize -> project onto two principal components -> cluster in PC space.
scaler = StandardScaler()
scaled_data = scaler.fit_transform(df_numeric)

pca = PCA(n_components=2, random_state=42)
pca_result = pca.fit_transform(scaled_data)

kmeans = KMeans(n_clusters=3, n_init=10, random_state=42)
clusters = kmeans.fit_predict(pca_result)

# NOTE(review): this mapping hard-codes which KMeans label id becomes which
# letter; it presumably matches the current data with the fixed seed --
# re-check it whenever the data or the library version changes.
cluster_map = {0: "B", 1: "A", 2: "C"}
cat_type = CategoricalDtype(categories=["A", "B", "C"], ordered=True)

# Letter labels as an ordered categorical, built once and shared by both frames.
cluster_labels = (
    pd.Series(clusters, index=df_clean.index)
    .map(cluster_map)
    .astype(cat_type)
)

df_clean["Cluster"] = cluster_labels
df_plot = pd.DataFrame(pca_result, columns=["PC1", "PC2"], index=df_clean.index)
df_plot["Cluster"] = cluster_labels

# Scatter plot of the two principal components, coloured by cluster letter.
fig, ax = plt.subplots(figsize=(10, 7))
sns.scatterplot(
    data=df_plot,
    x="PC1",
    y="PC2",
    hue="Cluster",
    hue_order=["A", "B", "C"],
    palette="tab10",
    s=100,
    ax=ax,
)
ax.set_title("PCA of AI Attitudes (KMeans Clustering)", fontsize=20, fontweight="bold")
ax.set_xlabel("Principal Component 1", fontsize=16, fontweight="bold")
ax.set_ylabel("Principal Component 2", fontsize=16, fontweight="bold")

# Rebuild the legend with a title, then embolden its entries and heading.
leg = ax.legend(title="Cluster", loc="best")
for legend_text in leg.get_texts():
    legend_text.set_fontsize(12)
    legend_text.set_fontweight("bold")
legend_title = leg.get_title()
legend_title.set_fontsize(13)
legend_title.set_fontweight("bold")

ax.grid(True, alpha=0.3)
fig.tight_layout()
plt.show()

# Mean encoded answer per question and cluster, rows fixed to A, B, C and
# rounded to two decimals.
answers_with_cluster = df_numeric.assign(Cluster=df_clean["Cluster"])
per_cluster_mean = answers_with_cluster.groupby("Cluster", observed=True).mean()
cluster_means = per_cluster_mean.reindex(["A", "B", "C"]).round(2)

# Display names for the heatmap rows (internal column name -> label with
# questionnaire number).
row_rename_map = {
    "Use AI school and freetime": "Use AI School and Free Time (Q12)",
    "Frequency of use education": "Frequency of Use - Education (Q14.1)",
    "Frequency of use everyday life": "Frequency of Use - Everyday Life (Q14.2)",
    "Usefulness AI": "Usefulness of AI (Q16)",
    "Deal with AI": "Ability to work with AI (Q18)",
    "Understanding AI": "Understanding AI (Q19)",
    "Reliability AI": "Reliability of AI (Q22)",
    "Help of AI": "Helpfulness of AI (Q32)",
    "Mates using AI": "Classmates using AI (Q26)",
    "Teachers preparing lessons": "Teachers Preparing Lessons (Q24)",
    "Teachers giving grades": "Teachers Giving Grades (Q25)",
    "Internet Understanding (Grouped)": "Internet Understanding (Grouped) (Q20)",
    "True_False_Score": "Bias evaluation of AI tools - Score (Q21)",
}

# Questions become rows, clusters become columns.
heat_df = cluster_means.transpose().rename(index=row_rename_map)

# Move the "classmates" row directly below the "understanding" row, but only
# when both rows actually exist in the table.
name_understanding = row_rename_map.get("Understanding AI", "Understanding AI")
name_mates = row_rename_map.get("Mates using AI", "Mates using AI")
order = heat_df.index.tolist()
if name_mates in order and name_understanding in order:
    order.remove(name_mates)
    anchor_pos = order.index(name_understanding)
    order.insert(anchor_pos + 1, name_mates)
    heat_df = heat_df.loc[order]

# Heatmap of the per-cluster means: one row per question, one column per cluster.
fig, ax = plt.subplots(figsize=(14, 9))
ax = sns.heatmap(
    heat_df[["A", "B", "C"]],
    annot=True,
    fmt=".1f",
    cmap="YlGnBu",
    linewidths=0.5,
    annot_kws={"size": 12, "weight": "bold"},
    cbar=True,
    cbar_kws={"shrink": 0.9},
    ax=ax,
)
ax.set_title(
    "Cluster Profiles based on Likert-scale Answers",
    fontsize=20,
    fontweight="bold",
    pad=14,
)
ax.set_xlabel("Cluster", fontsize=16, fontweight="bold", labelpad=10)
ax.set_ylabel("Question", fontsize=16, fontweight="bold", labelpad=10)

# Re-apply the tick labels with bold, horizontal text.
ax.set_xticklabels(ax.get_xticklabels(), fontsize=14, fontweight="bold", rotation=0)
ax.set_yticklabels(ax.get_yticklabels(), fontsize=13, fontweight="bold", rotation=0)

# The colour bar belongs to the first collection drawn on the axes.
cbar = ax.collections[0].colorbar
cbar.ax.tick_params(labelsize=12)
cbar.set_label("Mean Likert Score", fontsize=14, fontweight="bold")

fig.tight_layout()
plt.show()


In [None]:
# ==========================================
# ANALYSIS OF THE SEPARATE CLUSTERS
# ==========================================

from IPython.display import display, HTML

# Both workbooks live in the Data folder next to the notebook's parent directory.
data_dir = os.path.join("..", "Data")
cluster_file = os.path.join(data_dir, "Clustered_Students.xlsx")
full_data_file = os.path.join(data_dir, "Fertige Tabelle.xlsx")

# Full survey table with trimmed header names and the project aliases applied.
df_total = pd.read_excel(full_data_file)
df_total = (
    df_total
    .set_axis(df_total.columns.astype(str).str.strip(), axis=1)
    .rename(columns=COLUMN_ALIASES)
)

# sheet_name=None loads every sheet into a dict keyed by sheet name.
xls = pd.read_excel(cluster_file, sheet_name=None)

def _to_str_clean(series):
    """Return the non-missing, non-blank values of ``series`` as trimmed strings.

    Missing entries are dropped first, the remaining values are converted to
    pandas' ``string`` dtype and stripped, and values that are empty after
    stripping are dropped as well. The surviving rows keep their index labels.
    """
    trimmed = series.dropna().astype("string").str.strip()
    blanks_as_na = trimmed.replace({"": pd.NA})
    return blanks_as_na.dropna()

def _explode_mc(series):
    """Split comma-separated multiple-choice answers into one row per option.

    Missing and blank answers are discarded, each remaining answer is split on
    ``","``, and every resulting option is trimmed; options that are empty
    after trimming are discarded too. Each option keeps the index label of the
    answer it came from, so labels repeat for multi-option answers.
    """

    def _strip_nonblank(values):
        # Trim as pandas strings and drop entries that end up empty.
        stripped = values.astype("string").str.strip()
        return stripped.replace({"": pd.NA}).dropna()

    answers = _strip_nonblank(series.dropna())
    options = answers.str.split(",").explode()
    return _strip_nonblank(options)

def _apply_order(df_in, col, orders=None):
    """Return ``df_in`` with its rows arranged in a sensible display order.

    Parameters
    ----------
    df_in : pandas.DataFrame
        Table whose index holds the answer categories.
    col : str
        Question / column name used to look up a configured order.
    orders : mapping, optional
        Mapping of column name -> ordered category values. Defaults to the
        module-level ``question_orders``.

    Returns
    -------
    pandas.DataFrame
        * If ``col`` has a configured order: a copy with a string index, the
          configured categories that are present first (in configured order),
          followed by all remaining categories in their original order.
        * Otherwise, if every index label looks like a number: the table
          sorted numerically by index.
        * Otherwise ``df_in`` itself, unchanged.
    """
    order_map = question_orders if orders is None else orders

    if col in order_map:
        out = df_in.copy()
        out.index = out.index.astype(str)
        present = out.index.tolist()
        present_set = set(present)

        # dict.fromkeys de-duplicates while keeping first-seen order; a label
        # listed twice in the configuration would otherwise be handed to
        # reindex twice and show up as a duplicated row.
        desired = dict.fromkeys(str(v) for v in order_map[col])
        want = [v for v in desired if v in present_set]
        want_set = set(want)
        rest = [v for v in present if v not in want_set]
        return out.reindex(want + rest)

    idx = df_in.index.astype(str)
    # Optional sign, digits, optional decimal part -> treat labels as numbers.
    if idx.str.fullmatch(r"-?\d+(\.\d+)?").all():
        return df_in.sort_index(key=lambda x: x.astype(float))

    return df_in

def _show_table(table, caption):
    """Display ``table`` as styled HTML with one-decimal formatting and a caption."""
    display(HTML(table.style.format(precision=1).set_caption(caption).to_html()))


def _plot_percent_bars(table, value_col, title, xlabel):
    """Draw one bar chart of ``table[value_col]`` against the table's index.

    The y-axis is fixed to 0-100 because the plotted columns are percentages.
    """
    plt.figure(figsize=(9, 4))
    sns.barplot(
        x=table.index,
        y=table[value_col],
        palette="Set2",
    )
    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel("Percent (%)")
    plt.xticks(rotation=45, ha="right")
    plt.ylim(0, 100)
    plt.tight_layout()
    plt.show()


# One pass per sheet of the cluster workbook; for every demographic column two
# views are produced: (1) how the cluster is composed, and (2) what share of
# each category's total respondents falls into this cluster.
for sheet_name, cluster_df in xls.items():
    # Normalize the sheet's headers the same way as the full data set.
    cluster_df = cluster_df.copy()
    cluster_df.columns = cluster_df.columns.astype(str).str.strip()
    cluster_df = cluster_df.rename(columns=COLUMN_ALIASES)

    print(f"\n================  {sheet_name}  ================")

    count_col = f"{sheet_name} Count"
    within_col = f"{sheet_name} % (within cluster)"

    for column in demographic_columns:
        # Skip columns that are missing on either side (messages are German).
        if column not in df_total.columns:
            print(f"Spalte '{column}' nicht im Gesamtdatensatz")
            continue
        if column not in cluster_df.columns:
            print(f"Spalte '{column}' nicht im Cluster-Sheet")
            continue

        share_col = f"% of {column} in {sheet_name}"

        # Multiple-choice answers are split into single options first.
        extract = _explode_mc if column in multiple_choice_questions else _to_str_clean
        cluster_series = extract(cluster_df[column])
        total_series = extract(df_total[column])

        total_counts = total_series.value_counts(dropna=False)
        cluster_counts = cluster_series.value_counts(dropna=False)

        # Cluster counts laid out on the categories of the full data set;
        # categories absent from the cluster count as 0.
        cluster_counts_aligned = (
            cluster_counts.reindex(total_counts.index).fillna(0).astype(int)
        )

        # ---- View 1: composition of the cluster ---------------------------
        # NOTE(review): for multiple-choice columns the denominator is the
        # number of selected options, not the number of students -- confirm
        # that this matches the "Percent of students" wording of the title.
        cluster_comp_pct = (
            (cluster_counts / cluster_counts.sum() * 100)
            .reindex(total_counts.index)
            .fillna(0)
            .round(1)
        )
        table_cluster_comp = _apply_order(
            pd.DataFrame({
                count_col: cluster_counts_aligned,
                within_col: cluster_comp_pct,
            }),
            column,
        )

        print(f"\n{column} - Distribution in {sheet_name}")
        _show_table(table_cluster_comp, f"{column} - {sheet_name}: Cluster composition")
        _plot_percent_bars(
            table_cluster_comp,
            within_col,
            f"{column} - {sheet_name}: Percent of students in cluster",
            column,
        )

        # ---- View 2: share of each category captured by the cluster -------
        percent_of_category_in_cluster = (
            (cluster_counts / total_counts * 100)
            .reindex(total_counts.index)
            .fillna(0)
            .round(1)
        )
        table_cat_capture = _apply_order(
            pd.DataFrame({
                "Total Count": total_counts.astype(int),
                count_col: cluster_counts_aligned,
                share_col: percent_of_category_in_cluster,
            }),
            column,
        )

        print(f"\n{column} - Percent of category in {sheet_name}")
        _show_table(table_cat_capture, f"{column} - Category share in {sheet_name}")
        _plot_percent_bars(
            table_cat_capture,
            share_col,
            f"{column} - Percent of the category in {sheet_name}",
            column,
        )
