In [None]:
# ==========================================
# SETUP BLOCK - loads data, applies project helpers
# ==========================================

import os
import sys
import pandas as pd
import statsmodels.api as sm

sys.path.append(os.path.abspath(".."))

from Helper_functions import (
    clean_up_subjects,
    calculate_true_false_score,
    calculate_internet_terms_understanding_score,
    group_internet_understanding,
    apply_dummy_spec,
)

from answer_categories import COLUMN_ALIASES
from lists import LIKERT_VALUE_MAPS

# regression-specific imports
from regression_config import (
    BASE_PREDICTORS,
    GENDER_DUMMIES_FEMALE_REF,
    GENDER_VARS_FEMALE_REF,
    PRIVACY_DUMMIES_NO_REF,
    PRIVACY_VARS_NO_REF,
)

# Survey columns whose answers are converted to numbers further down.
LIKERT_COLS = [
    "Frequency of use education",
    "Frequency of use everyday life",
    "Understanding AI",
    "Deal with AI",
    "Use AI school and freetime",
    "Help of AI",
    "Reliability AI",
]

# Read the workbook, strip whitespace from the header labels, then apply the
# project-wide column aliases.
DATA_FILE = os.path.join("..", "Data", "Fertige Tabelle.xlsx")
df = pd.read_excel(DATA_FILE)
df.columns = df.columns.astype(str).str.strip()
df = df.rename(columns=COLUMN_ALIASES)

# Each preprocessing step below only runs when its input columns are present.
for subject_col in ("Most used subjects", "Preferred Subjects", "Least preferred Subjects"):
    if subject_col not in df.columns:
        continue
    df = clean_up_subjects(df, subject_col)

# The true/false score is only computed when all six items exist.
true_false_cols = ["True/False_" + str(item_no) for item_no in range(1, 7)]
if set(true_false_cols).issubset(df.columns):
    df = calculate_true_false_score(df)

# A single "Internet terms_*" column is enough to trigger the score + grouping.
if any(name.startswith("Internet terms_") for name in df.columns):
    df = calculate_internet_terms_understanding_score(df)
    df = group_internet_understanding(df)

# Map Likert answers to numeric using per-column maps; answers without a
# mapping entry end up as NaN.
for likert_col in LIKERT_COLS:
    if likert_col not in df.columns or likert_col not in LIKERT_VALUE_MAPS:
        continue
    answers = df[likert_col].astype("string").str.strip()
    numeric_answers = answers.map(LIKERT_VALUE_MAPS[likert_col])
    df[likert_col] = pd.to_numeric(numeric_answers, errors="coerce")

# Apply the gender spec first, then the privacy spec (same order as before).
for dummy_spec in (GENDER_DUMMIES_FEMALE_REF, PRIVACY_DUMMIES_NO_REF):
    df = apply_dummy_spec(df, dummy_spec)

# Full predictor set: base variables followed by the gender and privacy variables.
REGRESSION_PREDICTORS = BASE_PREDICTORS + GENDER_VARS_FEMALE_REF + PRIVACY_VARS_NO_REF

print("Setup complete – DataFrame loaded and preprocessed")
print(f"Rows: {len(df)}, Columns: {len(df.columns)}")
print("Regression predictors:")
print(REGRESSION_PREDICTORS)


In [None]:
# ==========================================
# REGRESSION HELPER
# ==========================================

# Outcome column used by the regression cells in this notebook.
DEPENDENT_VAR = "Use AI school and freetime"

def run_regression(df_in: pd.DataFrame, dependent_var: str, predictors: list[str], model_name: str):
    """Fit an OLS model of ``dependent_var`` on ``predictors`` and print the summary.

    Only complete cases are used: rows with a missing value in any model
    column, or with a value that cannot be converted to a number, are dropped.
    ``df_in`` itself is not modified.

    Parameters
    ----------
    df_in : pd.DataFrame
        Frame containing the dependent variable and every predictor.
    dependent_var : str
        Name of the outcome column.
    predictors : list[str]
        Names of the regressor columns; an intercept is added automatically.
    model_name : str
        Label used in the printed header and in error messages.

    Returns
    -------
    The fitted statsmodels OLS results object.

    Raises
    ------
    ValueError
        If a requested column is absent from ``df_in``, if a column is listed
        more than once, or if no complete numeric observation remains.
    """
    predictor_cols = list(predictors)
    model_cols = [dependent_var] + predictor_cols

    missing = [c for c in model_cols if c not in df_in.columns]
    if missing:
        raise ValueError(f"Missing columns for {model_name}: {missing}")

    # A repeated label would make ``data[c]`` a DataFrame and break the
    # numeric coercion below with an unrelated error, so reject it up front.
    duplicates = [c for c in dict.fromkeys(model_cols) if model_cols.count(c) > 1]
    if duplicates:
        raise ValueError(f"Duplicate columns for {model_name}: {duplicates}")

    data = df_in[model_cols].dropna().copy()

    # numeric coercion: unparseable values become NaN and are dropped with the row
    for c in model_cols:
        data[c] = pd.to_numeric(data[c], errors="coerce")
    data = data.dropna()

    # Fail with a clear message instead of an obscure error from the fit.
    if len(data) == 0:
        raise ValueError(
            f"No complete numeric observations for {model_name} "
            "after dropping rows with missing values"
        )

    # Cast dummy columns to int. Boolean columns are included even without the
    # gender_/privacy_ prefix, so the design matrix never mixes bool with float.
    dummy_cols = [
        c for c in data.columns
        if c.startswith(("gender_", "privacy_")) or pd.api.types.is_bool_dtype(data[c])
    ]
    if dummy_cols:
        data[dummy_cols] = data[dummy_cols].astype(int)

    # has_constant="add" appends an intercept column even if a constant
    # column is already among the predictors.
    X = sm.add_constant(data[predictor_cols], has_constant="add")
    y = data[dependent_var]

    model = sm.OLS(y, X).fit()

    print(f"\n=== {model_name} ===")
    print(f"N = {len(data)}")
    print(model.summary())

    return model



In [7]:
# Regressors for the combined model. The list order is the column order of the
# design matrix handed to run_regression.
MODEL_PREDICTORS = [
    "Age",
    "CRT_points",
    "Internet_Understanding_Score",
    "Help of AI",
    "Deal with AI",
    "Understanding AI",
    "True_False_Score",
    "Reliability AI",
    # gender_* / privacy_* columns: presumably the dummies created by
    # apply_dummy_spec in the setup cell - confirm against regression_config.
    "gender_Männlich",
    "gender_Keine Angabe",
    "privacy_Ja",
    "privacy_Ich habe darüber noch nie nachgedacht.",
]

# Fit the model and print its summary; the fitted result is kept in `model`.
model = run_regression(
    df_in=df,
    dependent_var=DEPENDENT_VAR,
    predictors=MODEL_PREDICTORS,
    model_name="Model: Demographics + Skills + Attitudes",
)



=== Model: Demographics + Skills + Attitudes ===
N = 181
                                OLS Regression Results                                
Dep. Variable:     Use AI school and freetime   R-squared:                       0.477
Model:                                    OLS   Adj. R-squared:                  0.440
Method:                         Least Squares   F-statistic:                     12.78
Date:                        Wed, 11 Feb 2026   Prob (F-statistic):           2.38e-18
Time:                                09:22:56   Log-Likelihood:                -210.40
No. Observations:                         181   AIC:                             446.8
Df Residuals:                             168   BIC:                             488.4
Df Model:                                  12                                         
Covariance Type:                    nonrobust                                         
                                                     coef    std err    