In [None]:
# ==========================================
# SETUP BLOCK 
# ==========================================

import os
import sys
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# ---- Imports from project files ----
sys.path.append(os.path.abspath(".."))
from Helper_funtions import (
    clean_up_subjects,
    calculate_true_false_score,
    calculate_Internet_terms_understanding_score,
    group_internet_understanding,
    analyze_distribution, 
    analyze_subject_distribution
)
from lists import (
    demographic_columns,
    multiple_choice_questions,
    single_choice_questions,
    likert_questions,
    likert_mapping
)
from answer_categories import question_orders

# ---- General plot style ----
sns.set(style="whitegrid")
plt.rcParams["figure.figsize"] = (10, 5)

# ---- Data loading ----
DATA_FILE = os.path.join("..", "Data", "Fertige Tabelle.xlsx")
df = pd.read_excel(DATA_FILE)
df.columns = df.columns.str.strip()

# Clean up multi-subject columns
for col in ["Most used subjects", "Preferred Subjects", "Least preferred Subjects"]:
    if col in df.columns:
        df = clean_up_subjects(df, col)

# Calculate additional scores
if all(q in df.columns for q in ["True/False_1", "True/False_2"]):
    df = calculate_true_false_score(df)

if any(col.startswith("Internet terms_") for col in df.columns):
    df = calculate_Internet_terms_understanding_score(df)
    df = group_internet_understanding(df)

print("‚úÖ Setup complete ‚Äì DataFrame loaded and preprocessed")
print(f"Rows: {len(df)}, Columns: {len(df.columns)}")


In [None]:
# --------- Demographic Analysis, table form ---------

# Gender
analyze_distribution(df, "Gender", "Gender Distribution")

# Age
analyze_distribution(df, "Age", "Age Distribution")

# Education Level
analyze_distribution(df, "Education Level", "Students‚Äô Education Level")

# Hours per week
analyze_distribution(df, "Hours per week for school", "Hours per week distribution")

# Parents' Education Levels
for parent_col in ["Educational Level parent_1", "Educational Level parent_2"]:
    analyze_distribution(df, parent_col, f"Parental Education ‚Äì {parent_col}")

# CRT Score
analyze_distribution(df, "CRT_points", "Distribution of CRT Score")


# Favorite subjects
analyze_subject_distribution(df, "Preferred Subjects")

# Least favorite subjects
analyze_subject_distribution(df, "Least preferred Subjects")


In [None]:
# ------- Distribution for demographics with graphs and tables -------

# Add helper function path
sys.path.append(os.path.abspath(".."))
from Helper_funtions import analyze_subject_distribution_changed, analyze_distribution_changed

# Function to show both plot and table
def show_distribution_with_plot(df, column, title=None):
    result = analyze_distribution_changed(df, column, return_df=True)

    if result is not None:
        result.plot(kind="bar", y="Percentage", legend=False, title=title or column)
        plt.ylabel("Percentage")
        plt.xlabel(column)
        plt.xticks(rotation=45, ha="right")
        plt.tight_layout()
        plt.show()
        
        # print table
        print(f"\nüîπ Tabelle f√ºr: {title or column}")
        print(result.to_string())

# ---- Demographic Analysis ----

show_distribution_with_plot(df, "Gender", "Gender Distribution")
show_distribution_with_plot(df, "Age", "Age Distribution")
show_distribution_with_plot(df, "Education Level", "Students‚Äô Education Level")
show_distribution_with_plot(df, "Hours per week for school", "Hours per Week Distribution")

for parent_col in ["Educational Level parent_1", "Educational Level parent_2"]:
    show_distribution_with_plot(df, parent_col, f"Parental Education ‚Äì {parent_col}")

show_distribution_with_plot(df, "CRT_points", "CRT Score Distribution")

# ---- Subject Preferences ----

def show_subject_distribution_with_plot(df, column, title=None):
    result = analyze_subject_distribution_changed(df, column, return_df=True)

    if result is not None:
        result.plot(kind="bar", y="Percentage of respondents", legend=False, title=title or column)
        plt.ylabel("Percentage of respondents")
        plt.xlabel(column)
        plt.xticks(rotation=45, ha="right")
        plt.tight_layout()
        plt.show()
        
        print(f"\nüîπ Tabelle f√ºr: {title or column}")
        print(result.to_string())

show_subject_distribution_with_plot(df, "Preferred Subjects", "Favorite Subjects")
show_subject_distribution_with_plot(df, "Least preferred Subjects", "Least Favorite Subjects")


In [None]:
import matplotlib.pyplot as plt
import pandas as pd

# --- Counts (reale Verteilung) ---
counts = pd.DataFrame({
    "0-1 hours/week":        [3,  8,  7, 11,  4],
    "2-5 hours/week":        [5, 25, 41, 19, 17],
    "More than 5 hours/week":[3,  5, 16, 15, 14]
}, index=["Never", "Rarely", "Sometimes", "Often", "Very often"])

# Reihenfolgen festlegen
hours_order = ["0-1 hours/week", "2-5 hours/week", "More than 5 hours/week"]
freq_order  = ["Never", "Rarely", "Sometimes", "Often", "Very often"]
counts = counts[hours_order].loc[freq_order]

# Blaut√∂ne
colors = {
    "Never": "#c6dbef",
    "Rarely": "#9ecae1",
    "Sometimes": "#6baed6",
    "Often": "#3182bd",
    "Very often": "#08519c"
}

# --- Plot ---
fig, ax = plt.subplots(figsize=(10, 6))
bottom = pd.Series([0]*len(hours_order), index=hours_order, dtype=float)

for freq in freq_order:
    ax.bar(
        hours_order,
        counts.loc[freq, hours_order],
        bottom=bottom[hours_order],
        color=colors[freq],
        width=0.7,
        label=freq
    )
    bottom += counts.loc[freq, hours_order]

# Titel & Achsen
ax.set_title("AI usage frequency in education\nstacked by hours per week for school", fontsize=18)
ax.set_xlabel("Hours per week for school", fontsize=14)
ax.set_ylabel("Count", fontsize=14)
ax.tick_params(axis='both', labelsize=12)

# Legende umdrehen
handles, labels = ax.get_legend_handles_labels()
ax.legend(handles[::-1], labels[::-1], title="Frequency of use in education",
          loc="upper right", fontsize=10, title_fontsize=11)

plt.tight_layout()
plt.show()







In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Reihenfolge wie im Screenshot
subjects = [
    "History", "German", "Mathematics", "French", "Geography",
    "Biology", "Chemistry", "English", "Physics"
]

# Prozentwerte pro Fach (aus deiner Tabelle "Percent")
data = {
    "Male":   [17.3, 14.7, 14.1, 11.5,  8.9,  6.3,  7.3,  5.8,  5.2],
    "Female": [19.8, 11.8, 11.4, 12.2, 11.0, 10.1,  7.6,  7.6,  4.6],
    "No answer": [12.5, 12.5, 18.8, 12.5, 18.8, 0.0, 6.2, 0.0, 18.8],
}

df = pd.DataFrame(data, index=subjects)

# Plot: drei Balken pro Fach (vertikal)
ax = df.plot(
    kind="bar",
    figsize=(11, 4.5),
    width=0.85,
    color=["#2b8cbe", "#de2d26", "#2ca25f"]  # Male=blau, Female=rot, No answer=gr√ºn
)

ax.set_title("Most used subjects ‚Äì by gender", fontsize=14)
ax.set_ylabel("Percentage (%)", fontsize=12)
ax.set_xlabel("Subject", fontsize=12)
plt.xticks(rotation=45, ha="right", fontsize=11)
plt.yticks(fontsize=11)
plt.grid(False)
ax.legend(title="Gender", loc="upper right")

plt.tight_layout()
plt.show()











In [8]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# ==== Konfiguration (Spaltennamen & Reihenfolgen ggf. anpassen) ====
COL_USE = "Use AI school and freetime"   # Nutzungskategorie
COL_REL = "Reliability AI"               # Verl√§sslichkeit

# gew√ºnschte Reihenfolge der Nutzungskategorien
use_order = ["Daily", "Several times a week", "About once a week", "Rarely", "Never"]

# Mapping der Verl√§sslichkeit -> Punkte (inkl. m√∂gliche deutsche/englische Varianten)
score_map = {
    "Sehr verl√§sslich": 5, "Very reliable": 5,
    "Eher verl√§sslich": 4, "Rather reliable": 4, "Fairly reliable": 4,
    "Teils/teils": 3, "Mixed": 3, "Neither": 3, "Neutral": 3,
    "Wenig verl√§sslich": 2, "Not very reliable": 2,
    "Gar nicht verl√§sslich": 1, "Not reliable at all": 1,
    "Keine Meinung": 0, "Unsicher": 0,
    "Keine Meinung/Unsicher": 0, "Keine Meinung / Unsicher": 0,
    "No opinion": 0, "Unsure": 0, "No opinion/Unsure": 0, "No opinion / Unsure": 0
}

# ==== Crosstab (Counts) ====
ct = pd.crosstab(df[COL_USE], df[COL_REL]).reindex(use_order, fill_value=0)

# Warnung ausgeben, falls es Antwortkategorien gibt, die nicht im Mapping sind
unknown_cols = [c for c in ct.columns if c not in score_map]
if unknown_cols:
    print("‚ö†Ô∏è Nicht gemappte Antwortkategorien (werden mit 0 bewertet):", unknown_cols)

# Vektorisierte Punktematrix (Spaltenweise mit Scores multiplizieren)
scores = pd.Series({c: score_map.get(c, 0) for c in ct.columns})
weighted = ct.mul(scores, axis=1)

# Mean-Score je Nutzungskategorie (0..5, 0 nimmt 'Keine Meinung/Unsicher' mit)
mean_scores = (weighted.sum(axis=1) / ct.sum(axis=1)).rename(
    "Mean reliability score (0‚Äì5)"
)

# Ausgabe als Tabelle
print("\nüìä Mean-Score je Nutzungskategorie (0‚Äì5, 0=Keine Meinung/Unsicher):")
print(mean_scores.round(2).to_frame())

# (Optional) Balkendiagramm der Mean-Scores
ax = mean_scores.plot(
    kind="bar",
    figsize=(7, 4),
    width=0.8,
    color="cornflowerblue",
    title="Mean reliability score by usage category (0‚Äì5)"
)
ax.set_xlabel(COL_USE)
ax.set_ylabel("Mean score (0‚Äì5)")
plt.xticks(rotation=45, ha="right")
plt.ylim(0, 5)
plt.grid(False)
plt.tight_layout()
plt.show()

# ---- Falls du den Score OHNE 'Keine Meinung/Unsicher' berechnen willst (nur 1..5):
ct_pos = ct.drop(columns=[c for c in ct.columns if score_map.get(c, 0) == 0], errors="ignore")
scores_pos = pd.Series({c: score_map[c] for c in ct_pos.columns})
mean_scores_pos = (ct_pos.mul(scores_pos, axis=1).sum(axis=1) / ct_pos.sum(axis=1))
print("\n(ohne 0er) Mean-Score je Nutzungskategorie (1‚Äì5):")
print(mean_scores_pos.round(2).to_frame())


KeyError: 'Use AI school and freetime'