# 04 - Subgroups

This notebook explores subgroup differences in daily‑living activities based on gender and education level. To assess these effects, an ANOVA‑based analysis is applied to compare activity scores across the selected demographic groups.


In [1]:
import sys
from pathlib import Path

# Resolve the repository root: two levels above this file when run as a script,
# otherwise the parent of the current working directory (the notebook case,
# where __file__ is not defined).
project_root = Path(__file__).parent.parent if "__file__" in globals() else Path("..").resolve()

# Make the local `src` package importable. Guarded so that re-running this cell
# does not keep appending duplicate entries to sys.path.
if str(project_root) not in sys.path:
    sys.path.append(str(project_root))

import pandas as pd
import matplotlib.pyplot as plt
import os
from src.config import activity_cols, covariates
from src.analysis import anova


In [2]:
# Output locations for the tables and figures produced by this notebook.
# The root is two levels above this file when __file__ exists, otherwise the
# parent of the current working directory.
if "__file__" in globals():
    project_root = Path(__file__).parent.parent
else:
    project_root = Path("..").resolve()

reports_dir = project_root / "reports"
tables_dir = reports_dir / "tables"
figures_dir = reports_dir / "figures"


In [3]:
# Load the processed dataset.
# The path is anchored at project_root (set in the setup cells) rather than at
# the kernel's working directory, so the input is resolved the same way as
# tables_dir and figures_dir.
DATA_PATH = project_root / "data" / "processed" / "nacc_alzheimers_dataset_processed.csv"
df = pd.read_csv(DATA_PATH)

print("Processed dataset loaded. Shape:", df.shape)


Processed dataset loaded. Shape: (2700, 57)


In [4]:
# ANOVA of every activity score against the covariates imported from src.config
anova_subgroup = anova(df=df, dependent_columns=activity_cols, covariates=covariates)

# One row per activity, smallest p-value first.
# sort_values returns a new frame here (no inplace=True), so the cell builds
# its result in a single expression and never mutates an existing object.
anova_results = pd.DataFrame(
    [
        {"measure": dep, "F_statistic": res["F_statistic"], "p_value": res["p_value"]}
        for dep, res in anova_subgroup.items()
    ]
).sort_values("p_value")
anova_results

Unnamed: 0,measure,F_statistic,p_value
5,mealprep,6.488206,1.317368e-37
0,bills,5.685519,3.005297e-31
8,remdates,4.098522,5.047903e-19
1,taxes,3.415987,4.857547e-14
9,travel,3.39285,7.09134e-14
2,shopping,3.218973,1.185175e-12
6,events,2.484823,8.814611e-08
3,games,2.441471,1.636629e-07
7,payattn,2.007414,5.519321e-05
4,stove,1.742644,0.00125874


In [5]:
# Save the combined ANOVA table.
# Create the output folder first so the write does not fail when
# reports/tables does not exist yet (e.g. on a fresh checkout).
os.makedirs(tables_dir, exist_ok=True)
file_path = os.path.join(tables_dir, "anova_activities_gender_education.csv")
anova_results.to_csv(file_path, index=False)


In [6]:
# ANOVA of every activity score with "female" as the only covariate
anova_gender = anova(df, activity_cols, ["female"])

# Flatten the per-activity results into records, then show them ordered by p-value
gender_rows = []
for measure, stats in anova_gender.items():
    row = {"measure": measure}
    row.update(stats)
    gender_rows.append(row)

pd.DataFrame(gender_rows).sort_values(by="p_value")


Unnamed: 0,measure,F_statistic,p_value
5,mealprep,269.018106,1.051208e-57
0,bills,133.355166,3.818393e-30
8,remdates,90.350484,4.2501389999999996e-21
2,shopping,85.172635,5.363802e-20
1,taxes,31.880331,1.810765e-08
4,stove,29.169446,7.209789e-08
7,payattn,18.794201,1.509444e-05
3,games,11.208551,0.0008253858
9,travel,10.586897,0.001152975
6,events,5.324303,0.02110536


In [7]:
# Build the gender ANOVA table (rows in the order returned by anova_gender.items())
anova_gender_df = pd.DataFrame([
    {"measure": dep, **res}
    for dep, res in anova_gender.items()
])

# Write the table exactly once.
anova_gender_df.to_csv(
    os.path.join(tables_dir, "anova_gender.csv"),
    index=False
)

# Kept as the last expression so the notebook actually renders the saved table.
anova_gender_df


In [8]:
# ANOVA of every activity score with "educ" as the only covariate
anova_educ = anova(df, activity_cols, ["educ"])

# Flatten the per-activity results into records, then show them ordered by p-value
educ_rows = []
for measure, stats in anova_educ.items():
    row = {"measure": measure}
    row.update(stats)
    educ_rows.append(row)

pd.DataFrame(educ_rows).sort_values(by="p_value")


Unnamed: 0,measure,F_statistic,p_value
9,travel,4.540887,6.379582e-13
1,taxes,3.413742,2.321408e-08
0,bills,3.320611,5.322662e-08
6,events,3.214112,1.362219e-07
8,remdates,3.067479,4.882751e-07
3,games,2.484396,6.144961e-05
2,shopping,1.936548,0.003542415
7,payattn,1.907341,0.0043184
4,stove,1.034955,0.4150966
5,mealprep,0.999787,0.4631915


In [9]:
# Build the education ANOVA table (rows in the order returned by anova_educ.items())
anova_educ_df = pd.DataFrame([
    {"measure": dep, **res}
    for dep, res in anova_educ.items()
])

# Write the table exactly once.
anova_educ_df.to_csv(
    os.path.join(tables_dir, "anova_education.csv"),
    index=False
)

# Kept as the last expression so the notebook actually renders the saved table.
anova_educ_df

## Conclusion

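- In the combined gender-and-education ANOVA, the activities with the strongest subgroup effects are meal preparation, paying bills, remembering dates, taxes, and travel. Stove use, attention, games, and social-event attendance show the weakest effects, although all remain statistically significant (p < 0.01).
- Gender effects are largest for meal preparation, bills, remembering dates, and shopping. Education effects are largest for travel, taxes, and bills, and are not significant for stove use or meal preparation. The F-tests do not indicate direction, so whether these effects are stronger in women should be confirmed from the group means.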
