In [None]:
# Setup
import os
import sys
import pandas as pd

# Imports 
sys.path.append(os.path.abspath(".."))
from answer_categories import question_orders
from lists import single_choice_questions, demographic_columns, multiple_choice_questions, true_false_solutions
from Helper_funtions import test_significance_single_choice, test_significance_multiple_choice, calculate_true_false_score

# Load data
file_path = os.path.join("..", "Data", "Fertige Tabelle.xlsx")
df = pd.read_excel(file_path)
# Strip leading/trailing whitespace from the header names, exactly as the
# other cells of this notebook do right after loading, so that looking up
# columns by name is not broken by stray spaces in the spreadsheet header.
df.columns = df.columns.str.strip()

# Calculate the True/False score; the helper's result replaces `df`
df = calculate_true_false_score(df)


# Multiple-choice questions: for every question/demographic pair, print the
# row-normalised distribution of the selected options (in percent) followed
# by the per-option results of the significance helper.
for question in multiple_choice_questions:
    for demo in demographic_columns:
        try:
            print(f"\n Distribution of multiple answers for: {question} grouped by {demo}")

            # Keep only rows where both columns are filled in. Blank answers
            # are removed as well; otherwise they would be tabulated (and
            # tested) as an empty-string option.
            answered = df[[demo, question]].dropna()
            answered = answered[answered[question].str.strip() != ""]

            # Answers are comma-separated: split them and give every
            # selected option its own row.
            exploded = answered.assign(
                **{question: answered[question].str.split(",")}
            ).explode(question)
            exploded[question] = exploded[question].str.strip()

            # Discard empty tokens left behind by stray commas ("A,,B", "A,").
            exploded = exploded[exploded[question] != ""]

            # Share of each option within every demographic group, in percent
            cross = pd.crosstab(exploded[demo], exploded[question], normalize='index') * 100

            # Sort the option columns alphabetically
            cross = cross[sorted(cross.columns)]
            print(cross.round(1).to_string())

            # Significance test for every option
            results = test_significance_multiple_choice(exploded, question, demo)
            for res in results:
                if "Error" in res:
                    print(f"   ❌ Option: {res['Option']} → ERROR: {res['Error']}")
                else:
                    print(f"   → Option: {res['Option']} → p-value: {res['p_value']:.4f} → {res['Significance']}")

        except Exception as e:
            # Best-effort report: one failing pair must not stop the others.
            print(f"❌ Failed for {question} x {demo}: {e}")

# Single-choice questions: for every question/demographic pair, print the
# row-normalised answer distribution (in percent) and the p-value returned
# by the significance helper.
for question in single_choice_questions:
    print(f"\n Distribution of responses for: {question}")

    for demo in demographic_columns:
        print(f"– Grouped by: {demo}")

        try:
            # Drop missing values BEFORE converting the answers to text.
            # Converting first turns NaN into the literal string "nan",
            # which dropna() no longer removes and which then shows up as
            # an answer category. The conversion is done on a local copy so
            # the shared `df` is not modified from inside the loop.
            data = df[[demo, question]].dropna()
            data = data.assign(**{question: data[question].astype(str)})
            data = data[data[question].str.strip() != ""]

            # Table in percent (each demographic group sums to 100)
            cross = pd.crosstab(data[demo], data[question], normalize='index') * 100

            # Use the predefined answer order; answers nobody chose are
            # added as all-zero columns.
            if question in question_orders:
                cross = cross.reindex(columns=question_orders[question], fill_value=0)

            # Print table
            print(cross.round(1).to_string())

            # Significance test
            p = test_significance_single_choice(df, question, demo)
            significance = "✅ significant" if p < 0.05 else "❌ not significant"
            print(f"   → p-value: {p:.4f} → {significance}")

        except Exception as e:
            # Best-effort report: one failing pair must not stop the others.
            print(f"❌ Test failed for {question} x {demo}: {e}")



In [None]:
# Setup
import os
import sys
import pandas as pd

# Imports 
sys.path.append(os.path.abspath(".."))
from answer_categories import question_orders
from lists import single_choice_questions, demographic_columns, multiple_choice_questions
from Helper_funtions import (
    test_significance_single_choice, 
    test_significance_multiple_choice, 
    calculate_true_false_score,
    clean_up_subjects
)

# Read the survey workbook and normalise its header names
file_path = os.path.join("..", "Data", "Fertige Tabelle.xlsx")
df = pd.read_excel(file_path)
df.columns = df.columns.str.strip()

# Run the preprocessing helpers in sequence; each one receives the frame
# returned by the previous step:
#   1. clean up the "Most used subjects" column
#   2. calculate the True/False score and add it to the DataFrame
df = (
    df.pipe(clean_up_subjects, "Most used subjects")
    .pipe(calculate_true_false_score)
)

# Analyze multiple choice questions
def _explode_choices(frame, column):
    """Return ``frame`` with one row per comma-separated entry of ``column``.

    Entries are whitespace-stripped and empty entries (for example from a
    trailing comma) are dropped. Row index labels are kept, so rows that
    originate from the same input row share an index label.
    """
    out = frame.assign(**{column: frame[column].str.split(",")}).explode(column)
    out[column] = out[column].str.strip()
    return out[out[column] != ""]


for question in multiple_choice_questions:
    for demo in demographic_columns:
        # All preprocessing lives inside the try block so that a problem
        # with one pair is reported instead of aborting the whole run.
        try:
            print(f"\n Distribution of multiple answers for: {question} grouped by {demo}")

            # Drop rows with a missing value or a blank answer
            valid_data = df[[demo, question]].dropna()
            valid_data = valid_data[valid_data[question].str.strip() != ""]

            # These demographics contain comma-separated lists; split them
            # so every listed subject is tabulated as its own group.
            # (Previously the split frame was built but never used.)
            if demo in ("Preferred Subjects", "Least preferred Subjects"):
                valid_data = _explode_choices(valid_data, demo)

            # Split and explode the selected answer options
            exploded = _explode_choices(valid_data, question)

            # Crosstab normalized by row (percentages)
            cross = pd.crosstab(exploded[demo], exploded[question], normalize='index') * 100
            cross = cross[sorted(cross.columns)]  # alphabetical option order

            print(cross.round(1).to_string())

            # Run significance test for each response option
            # NOTE(review): exploded rows repeat an input row once per listed
            # entry, so they are not independent observations — confirm the
            # helper's test is appropriate for that.
            results = test_significance_multiple_choice(exploded, question, demo)
            for res in results:
                if "Error" in res:
                    print(f"   ❌ Option: {res['Option']} → ERROR: {res['Error']}")
                else:
                    print(f"   → Option: {res['Option']} → p-value: {res['p_value']:.4f} → {res['Significance']}")

        except Exception as e:
            print(f"❌ Failed for {question} x {demo}: {e}")


# Analyze single choice questions
for question in single_choice_questions:
    print(f"\n Distribution of responses for: {question}")

    # NOTE: there used to be a second `for demo in demographic_columns` loop
    # nested in here. It rebound `demo`, so every pass analysed only the LAST
    # demographic column while the header printed a different one.
    for demo in demographic_columns:
        print(f"– Grouped by: {demo}")

        try:
            # Drop missing or empty responses
            data = df[[demo, question]].dropna()
            data = data[data[question].astype(str).str.strip() != ""]

            # These demographics contain comma-separated lists; split them
            # so every listed subject is tabulated as its own group.
            multi_value_demo = demo in ("Preferred Subjects", "Least preferred Subjects")
            if multi_value_demo:
                data = data.assign(**{demo: data[demo].str.split(",")}).explode(demo)
                data[demo] = data[demo].str.strip()
                data = data[data[demo] != ""]

            # Crosstab normalized by row (percentages)
            cross = pd.crosstab(data[demo], data[question], normalize='index') * 100

            # Use predefined order if available; answers nobody chose are
            # added as all-zero columns.
            if question in question_orders:
                cross = cross.reindex(columns=question_orders[question], fill_value=0)

            # Optional: sort numeric demographic column
            if pd.api.types.is_numeric_dtype(df[demo]):
                cross = cross.sort_index()

            print(cross.round(1).to_string())

            # Significance test. For the split demographics the test runs on
            # the same exploded rows as the table above; every other
            # demographic keeps using the full DataFrame as before.
            # NOTE(review): exploded rows repeat an input row once per listed
            # subject, so they are not independent — interpret with care.
            test_frame = data if multi_value_demo else df
            p = test_significance_single_choice(test_frame, question, demo)
            significance = "✅ significant" if p < 0.05 else "❌ not significant"
            print(f"   → p-value: {p:.4f} → {significance}")

        except Exception as e:
            # Best-effort report: one failing pair must not stop the others.
            print(f"❌ Test failed for {question} x {demo}: {e}")


In [None]:
# Setup
import os
import sys
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# imports
sys.path.append(os.path.abspath(".."))
from answer_categories import question_orders
from lists import comparison_pairs_by_demo, single_choice_questions, multiple_choice_questions
from Helper_funtions import (
    clean_up_subjects,
    calculate_true_false_score,
    calculate_Internet_terms_understanding_score,
    group_internet_understanding
)

# Read the survey workbook and normalise its header names
file_path = os.path.join("..", "Data", "Fertige Tabelle.xlsx")
df = pd.read_excel(file_path)
df.columns = df.columns.str.strip()

# Run the preprocessing helpers in sequence; each one receives the frame
# returned by the previous step:
#   1. reduce the number of subjects in "Most used subjects" (max. 3)
#   2. calculate the True/False score and add it to the DataFrame
#   3. calculate the internet-terms understanding score
#   4. group the internet understanding
df = (
    df.pipe(clean_up_subjects, "Most used subjects")
    .pipe(calculate_true_false_score)
    .pipe(calculate_Internet_terms_understanding_score)
    .pipe(group_internet_understanding)
)

# For every configured demographic/question pair: print the row-normalised
# answer distribution (in percent) and draw it as a grouped bar chart.
for demo, question_list in comparison_pairs_by_demo.items():
    for question in question_list:

        try:
            if question in multiple_choice_questions:
                # Drop missing and blank answers, mirroring the single-choice
                # branch below, then give every selected option its own row.
                answered = df[[demo, question]].dropna()
                answered = answered[answered[question].str.strip() != ""]
                data = answered.assign(
                    **{question: answered[question].str.split(",")}
                ).explode(question)
                data[question] = data[question].str.strip()
                # Discard empty tokens left behind by stray commas.
                data = data[data[question] != ""]
            elif question in single_choice_questions:
                data = df[[demo, question]].dropna()
                data = data[data[question].astype(str).str.strip() != ""]
            else:
                continue  # neither single nor multiple choice

            # Crosstab normalized by row (percentages)
            cross = pd.crosstab(data[demo], data[question], normalize='index') * 100

            # Use predefined order if available; answers nobody chose are
            # added as all-zero columns.
            if question in question_orders:
                cross = cross.reindex(columns=question_orders[question], fill_value=0)

            # Optional: sort numeric demographic column
            if pd.api.types.is_numeric_dtype(df[demo]):
                cross = cross.sort_index()

            print(f"\n📊 {question} grouped by {demo}")
            print(cross.round(1).to_string())

            # Plot as graph: long format with one row per (group, answer)
            plot_df = cross.reset_index().melt(id_vars=demo, var_name="Answer", value_name="Percentage")

            fig = plt.figure(figsize=(10, 6))
            try:
                sns.barplot(data=plot_df, x="Answer", y="Percentage", hue=demo)

                plt.title(f"{question} grouped by {demo}")
                plt.xticks(rotation=45, ha="right")
                plt.tight_layout()
                plt.show()
            finally:
                # Always release the figure, also when plotting fails, so
                # half-drawn figures do not pile up across iterations.
                plt.close(fig)
        except Exception as e:
            # Best-effort report: one failing pair must not stop the others.
            print(f"❌ Test failed for {question} x {demo}: {e}")
