In [None]:
# ==========================================
# SETUP BLOCK 
# ==========================================

import os
import sys
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# ---- Imports from project files ----
sys.path.append(os.path.abspath(".."))
from Helper_funtions import (
    clean_up_subjects,
    calculate_true_false_score,
    calculate_Internet_terms_understanding_score,
    group_internet_understanding,
    analyze_distribution,
    analyze_subject_distribution
)
from lists import (
    demographic_columns,
    multiple_choice_questions,
    single_choice_questions,
    likert_questions,
    likert_mapping
)
from answer_categories import question_orders

# ---- General plot style ----
# The seaborn theme is applied first; the default figure size is set afterwards.
sns.set(style="whitegrid")
plt.rcParams["figure.figsize"] = (10, 5)

# ---- Data loading ----
DATA_FILE = os.path.join("..", "Data", "Fertige Tabelle.xlsx")
df = pd.read_excel(DATA_FILE)
# Strip surrounding whitespace from the headers before any column lookup below.
df.columns = df.columns.str.strip()

# Clean up multi-subject columns (a column missing from the sheet is skipped)
for col in ["Most used subjects", "Preferred Subjects", "Least preferred Subjects"]:
    if col not in df.columns:
        continue
    df = clean_up_subjects(df, col)

# Calculate additional scores
# The True/False score is only computed when both of the first two items exist.
if {"True/False_1", "True/False_2"}.issubset(df.columns):
    df = calculate_true_false_score(df)

# The Internet-terms score and its grouping need at least one matching column.
if any(name.startswith("Internet terms_") for name in df.columns):
    df = calculate_Internet_terms_understanding_score(df)
    df = group_internet_understanding(df)

print("✅ Setup complete – DataFrame loaded and preprocessed")
print(f"Rows: {len(df)}, Columns: {len(df.columns)}")


In [None]:
# --------- Shows distribution in table form -----------

# Keep only the questions that are present in df and are not demographic columns.
non_demo_single = [
    question
    for question in single_choice_questions
    if question in df.columns and question not in demographic_columns
]
non_demo_multi = [
    question
    for question in multiple_choice_questions
    if question in df.columns and question not in demographic_columns
]

# ---- Single-Choice Questions: one distribution table per question ----
for column in non_demo_single:
    analyze_distribution(df, column, f"🔹 {column}")

# ---- Multiple-Choice Questions: one distribution table per question ----
for column in non_demo_multi:
    analyze_subject_distribution(df, column)



In [None]:
# --------- Shows distribution in table and graph form -----------


# --- plot functions ---
def plot_single_choice_distribution(df, column, title=None):
    """Draw a bar chart of the answer shares (in percent) for one single-choice column.

    Missing values and blank strings are ignored; every remaining answer is
    compared as a whitespace-stripped string. If ``column`` has an entry in
    ``question_orders``, the bars follow that order. Answers that occur in the
    data but are not listed there are appended after the ordered ones instead
    of being dropped, so the bars always add up to 100 %.

    Args:
        df: DataFrame that contains ``column``.
        column: Name of the column to plot.
        title: Plot title; falls back to ``column`` when empty or ``None``.

    Returns:
        None. The figure is rendered with ``plt.show()``.
    """
    data = df[column].dropna().astype(str).str.strip()
    data = data[data != ""]

    # An empty Series cannot be drawn as a bar chart; without this guard one
    # unanswered question would abort a loop over many questions.
    if data.empty:
        print(f"⚠️ No answers to plot for '{column}'")
        return

    percents = data.value_counts(normalize=True) * 100

    if column in question_orders:
        # Configured options first (only those that were actually chosen) ...
        listed = [opt for opt in question_orders[column] if opt in percents.index]
        # ... then any answer the configured order does not know about.
        unlisted = [opt for opt in percents.index if opt not in listed]
        percents = percents.reindex(listed + unlisted)

    # Explicit figure/axes so the chart never lands on a leftover current axes.
    fig, ax = plt.subplots()
    percents.plot.bar(ax=ax, color="skyblue", edgecolor="black")
    ax.set_title(title if title else column)
    ax.set_ylabel("Percentage of respondents (%)")
    ax.set_xlabel("")
    plt.setp(ax.get_xticklabels(), rotation=45, ha="right")
    fig.tight_layout()
    plt.show()


def plot_multiple_choice_distribution(df, column, title=None):
    """Draw a bar chart of how many respondents (in percent) picked each option.

    Each cell of ``column`` is treated as a comma-separated list of options.
    Missing values and blank cells are ignored. The percentage base is the
    number of respondents with a non-blank cell, so the bars can add up to
    more than 100 %.

    Args:
        df: DataFrame that contains ``column``.
        column: Name of the multiple-choice column to plot.
        title: Plot title; falls back to ``column`` when empty or ``None``.

    Returns:
        None. The figure is rendered with ``plt.show()``.
    """
    data = df[column].dropna().astype(str).str.strip()
    data = data[data != ""]
    num_respondents = data.shape[0]

    exploded = data.str.split(",").explode().str.strip()
    # A trailing or doubled comma ("A, B,") yields an empty token; without this
    # filter it would be plotted as its own blank bar.
    exploded = exploded[exploded != ""]

    # An empty Series cannot be drawn as a bar chart; without this guard one
    # unanswered question would abort a loop over many questions.
    if exploded.empty:
        print(f"⚠️ No answers to plot for '{column}'")
        return

    counts = exploded.value_counts()
    percents = (counts / num_respondents * 100).round(1)

    # Explicit figure/axes so the chart never lands on a leftover current axes.
    fig, ax = plt.subplots()
    percents.plot.bar(ax=ax, color="lightgreen", edgecolor="black")
    ax.set_title(title if title else column)
    ax.set_ylabel("Percentage of respondents (%)")
    ax.set_xlabel("")
    plt.setp(ax.get_xticklabels(), rotation=45, ha="right")
    fig.tight_layout()
    plt.show()


# Filter: only non-demographic questions that actually exist in df.
# Recomputed here so this cell does not depend on the table-only cell having run.
non_demo_single = [c for c in single_choice_questions if c not in demographic_columns and c in df.columns]
non_demo_multi  = [c for c in multiple_choice_questions if c not in demographic_columns and c in df.columns]

# ---- Single-Choice Questions: all tables first, then all plots ----
for column in non_demo_single:
    analyze_distribution(df, column, f"🔹 {column}")

for column in non_demo_single:
    plot_single_choice_distribution(df, column, f"🔹 {column}")

# ---- Multiple-Choice Questions: all tables first, then all plots ----
for column in non_demo_multi:
    analyze_subject_distribution(df, column)

for column in non_demo_multi:
    plot_multiple_choice_distribution(df, column, title=f"{column}")


# Optional: per-item plots for the True/False and Internet-terms columns.
# Kept as comments rather than a triple-quoted string: a bare string as the
# last expression of a cell is echoed by Jupyter as the cell's output.
# Uncomment to run.
#
# internet_columns = [col for col in df.columns if col.startswith("Internet terms_")]
# for column in internet_columns:
#     plot_single_choice_distribution(df, column, title=column)
#
# true_false_columns = [col for col in df.columns if col.startswith("True/False_")]
# for column in true_false_columns:
#     plot_single_choice_distribution(df, column, title=column)

In [None]:
# ----- Compares the frequency of use in everyday life with the frequency of use
# for education. Both items are part of the same question and share one answer
# scale, so their numeric difference can be compared directly. -----

# Mapping of the answer options to ordinal scores
usage_scale = {
    "Sehr oft": 5,
    "Oft": 4,
    "Manchmal": 3,
    "Selten": 2,
    "Nie": 1
}

col_school = "Frequency of use education"
col_everyday = "Frequency of use everyday life"

# Work on an explicit copy so the column assignments below never write into a
# view of df (avoids SettingWithCopyWarning).
valid = df[[col_school, col_everyday]].dropna().copy()

# Normalise whitespace once, so the blank filter and the scale lookup see the
# same text ("Oft " must map exactly like "Oft").
valid[col_school] = valid[col_school].astype(str).str.strip()
valid[col_everyday] = valid[col_everyday].astype(str).str.strip()

# Delete empty strings
valid = valid[(valid[col_school] != "") & (valid[col_everyday] != "")].copy()

# Change answers to numbers
valid["score_school"] = valid[col_school].map(usage_scale)
valid["score_everyday"] = valid[col_everyday].map(usage_scale)

# Answers outside the scale map to NaN; report them instead of letting them
# vanish silently from the counts.
unmapped = valid[["score_school", "score_everyday"]].isna().any(axis=1)
if unmapped.any():
    print(f"⚠️ {int(unmapped.sum())} row(s) with answers outside the usage scale were skipped")
valid = valid[~unmapped].copy()

# Calculate difference (positive = used more often for education than in everyday life)
valid["difference"] = (valid["score_school"] - valid["score_everyday"]).astype(int)

# Count results
result = valid["difference"].value_counts().sort_index()

# Change to dataframe
result_df = pd.DataFrame({
    "Differenz (school - everyday life)": result.index,
    "Anzahl Schüler:innen": result.values,
    "Prozent": (result.values / result.values.sum() * 100).round(1)
})

print("\n📊 Comparision usage everyday life vs. education:")
print(result_df)


In [None]:
# ----- Compares the frequency of use for education with the frequency of use for
# school. These are essentially the same question, but the two answer scales carry
# different connotations, so the results differ from one another. -----

# Mapping of the answer options of "Frequency of use education"
usage_scale_1 = {
    "Sehr oft": 5,
    "Oft": 4,
    "Manchmal": 3,
    "Selten": 2,
    "Nie": 1
}

# Mapping of the answer options of "Frequency use of AI_school"
usage_scale_2 = {
    "Immer": 5,
    "Häufig": 4,
    "Manchmal": 3,
    "Selten": 2,
    "Nie": 1
}

col_school = "Frequency of use education"
col_school_1 = "Frequency use of AI_school"

# Work on an explicit copy so the column assignments below never write into a
# view of df (avoids SettingWithCopyWarning).
valid = df[[col_school, col_school_1]].dropna().copy()

# Normalise whitespace once, so the blank filter and the scale lookups see the
# same text ("Oft " must map exactly like "Oft").
valid[col_school] = valid[col_school].astype(str).str.strip()
valid[col_school_1] = valid[col_school_1].astype(str).str.strip()

# Delete empty strings
valid = valid[(valid[col_school] != "") & (valid[col_school_1] != "")].copy()

# Change answers to numbers (each column uses its own scale)
valid["score_school"] = valid[col_school].map(usage_scale_1)
valid["score_school_1"] = valid[col_school_1].map(usage_scale_2)

# Answers outside the scales map to NaN; report them instead of letting them
# vanish silently from the counts.
unmapped = valid[["score_school", "score_school_1"]].isna().any(axis=1)
if unmapped.any():
    print(f"⚠️ {int(unmapped.sum())} row(s) with answers outside the usage scales were skipped")
valid = valid[~unmapped].copy()

# Calculate difference
valid["difference"] = (valid["score_school"] - valid["score_school_1"]).astype(int)
# Count results
result = valid["difference"].value_counts().sort_index()

# Change to dataframe
result_df = pd.DataFrame({
    "Differenz (school - school_1)": result.index,
    "Anzahl Schüler:innen": result.values,
    "Prozent": (result.values / result.values.sum() * 100).round(1)
})

print("\n📊 Comparision usage school_1 vs. education:")
print(result_df)
