In [40]:
import pandas as pd

# Adjust pandas options to see all columns and to read subgroup descriptions fully
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', 200)
pd.set_option('display.max_rows', 200)

In [51]:
df_s = pd.read_csv("results/subgroup_model_results.csv")
#df_s.columns

In [52]:
# Assign rank: one per row (since you now have only subgroup_model rows)
df_s['subgroup_rank'] = (df_s.index + 1).astype("object")
df_s.loc[df_s['model_type'] == 'global', 'subgroup_rank'] = "N/A"

# Round columns for better readability
for col, ndigits in [('cookD', 2), ('r2', 3), ('mae', 3), ('mse', 3), ('mean_residual', 3),
                     ('global_baseline_r2', 3), ('global_baseline_mae', 3), ('global_baseline_mse', 3), ('global_baseline_mean_residual', 3),
                     ('mean_baseline_r2', 3), ('mean_baseline_mae', 3), ('mean_baseline_mse', 3), ('mean_baseline_mean_residual', 3)]:
    if col in df_s.columns:
        df_s[col] = df_s[col].round(ndigits)

# Compare r2 between subgroup_model and global baseline for each row
df_s['subgroup_r2_better'] = (df_s['r2'] > df_s['global_baseline_r2']).astype("object")
df_s.loc[df_s['model_type'] == 'global', 'subgroup_r2_better'] = None
df_s['subgroup_better_global'] = (df_s['ttest_p'] < 0.1)
df_s.loc[df_s['model_type'] == 'global', 'subgroup_better_global'] = None
df_s['subgroup_better_mean'] = (df_s['ttest_p_mean'] < 0.1)
df_s.loc[df_s['model_type'] == 'global', 'subgroup_better_mean'] = None
df_s['global_better_mean'] = (df_s['ttest_p_mean_global'] < 0.1)
df_s.loc[df_s['model_type'] == 'global', 'global_better_mean'] = None

# Add decision column based on the rules
def determine_decision(row):
    sg = row['subgroup_better_global']
    sm = row['subgroup_better_mean']
    gm = row['global_better_mean']
    
    if sg and sm:
        return "True discovery"
    elif sg and not sm:
        return "True discovery, but model not appropriate"
    elif not sg and sm:
        return "Weak discovery"
    elif not sg and not sm and gm:
        return "False discovery"
    else:
        return "Model not appropriate"

df_s['decision'] = df_s.apply(determine_decision, axis=1)

# Specify desired column order
main_cols = [
    'subgroup_rank', 'model_type', 'description', 'decision', 'subgroup_r2_better', 'cookD', 'n_train', 'n_test',
    'r2', 'global_baseline_r2', 'mean_baseline_r2', 
    'mae', 'global_baseline_mae', 'mean_baseline_mae',
    'mse', 'global_baseline_mse', 'mean_baseline_mse',
    'mean_residual', 'global_baseline_mean_residual', 'mean_baseline_mean_residual'
]
rest_cols = [c for c in df_s.columns if c not in main_cols]
new_order = main_cols + rest_cols

# Reorder columns
df_s = df_s[new_order]

  df_s.loc[df_s['model_type'] == 'global', 'subgroup_better_global'] = None
  df_s.loc[df_s['model_type'] == 'global', 'subgroup_better_mean'] = None
  df_s.loc[df_s['model_type'] == 'global', 'global_better_mean'] = None


In [53]:
# Check global model
df_s.tail(1)

Unnamed: 0,subgroup_rank,model_type,description,decision,subgroup_r2_better,cookD,n_train,n_test,r2,global_baseline_r2,mean_baseline_r2,mae,global_baseline_mae,mean_baseline_mae,mse,global_baseline_mse,mean_baseline_mse,mean_residual,global_baseline_mean_residual,mean_baseline_mean_residual,intercept,coef__nr_distinct_files_viewed,pval__nr_distinct_files_viewed,coef__total_course_activities,pval__total_course_activities,coef__nightly_activities,pval__nightly_activities,coef__distinct_days,pval__distinct_days,coef__logged_in_weekly,pval__logged_in_weekly,coef__nr_files_viewed,pval__nr_files_viewed,coef__nr_slides_viewed,pval__nr_slides_viewed,ttest_p,ttest_stat,wilcoxon_p,wilcoxon_stat,ttest_p_mean,ttest_stat_mean,wilcoxon_p_mean,wilcoxon_stat_mean,ttest_p_mean_global,ttest_stat_mean_global,wilcoxon_p_mean_global,wilcoxon_stat_mean_global,subgroup_better_global,subgroup_better_mean,global_better_mean
100,,global,,True discovery,,,2947,1263,0.084,,-0.002,1.75,,1.81,4.296,,4.701,0.083,,0.105,4.313976,0.037802,1.5556219999999999e-19,-0.001444,0.028773,-0.00049,0.539272,0.027918,4e-06,-0.683618,1.728714e-10,0.01181,0.000363,-0.021485,1.03209e-15,,,,,,,,,,,,,,,


In [54]:
# Check subgroup models
df_s[['description', 'decision',   
      'r2', 'global_baseline_r2', 'mean_baseline_r2',
      'mse', 'global_baseline_mse', 'mean_baseline_mse',
      'ttest_p', 'n_test', 'subgroup_better_global', 'subgroup_better_mean', 'global_better_mean']].head(100)

Unnamed: 0,description,decision,r2,global_baseline_r2,mean_baseline_r2,mse,global_baseline_mse,mean_baseline_mse,ttest_p,n_test,subgroup_better_global,subgroup_better_mean,global_better_mean
0,croho=='B Psychology & Technology' ∧ type_vooropleiding=='BUITENL_SL',"True discovery, but model not appropriate",-0.014,-0.429,-0.054,3.677,5.183,3.823,0.088133,19,True,False,False
1,croho=='B Psychology & Technology' ∧ double_major==False ∧ type_vooropleiding=='BUITENL_SL',"True discovery, but model not appropriate",-0.014,-0.429,-0.054,3.677,5.183,3.823,0.088133,19,True,False,False
2,croho=='B Electrical Engineering' ∧ origin=='ORIGIN_E' ∧ type_vooropleiding=='BUITENL_SL',"True discovery, but model not appropriate",0.09,-0.054,-0.004,5.059,5.858,5.586,0.037529,91,True,False,False
3,croho=='B Computer Science & Engineering' ∧ origin=='ORIGIN_R' ∧ sex=='SEX_V',"True discovery, but model not appropriate",-0.087,-1.484,-0.007,2.654,6.066,2.46,0.020506,19,True,False,False
4,course_repeater==np.True_ ∧ croho=='B Innovation Sciences' ∧ sex=='SEX_M',Model not appropriate,-0.132,-0.405,-0.057,5.617,6.974,5.25,0.313242,9,False,False,False
5,course_repeater==np.True_ ∧ croho=='B Psychology & Technology' ∧ origin=='ORIGIN_N',Model not appropriate,-0.646,-0.138,-0.158,3.284,2.27,2.31,0.874223,25,False,False,False
6,course_repeater==np.True_ ∧ croho=='B Innovation Sciences',Model not appropriate,-0.14,-0.537,-0.06,4.026,5.429,3.744,0.240725,14,False,False,False
7,course_repeater==np.True_ ∧ croho=='B Innovation Sciences' ∧ double_major==False,Model not appropriate,-0.14,-0.537,-0.06,4.026,5.429,3.744,0.240725,14,False,False,False
8,course_repeater==np.True_ ∧ croho=='B Data Science (joint degree)' ∧ type_vooropleiding=='VWO',Weak discovery,-0.08,0.2,-0.709,3.009,2.227,4.762,0.761025,17,False,True,True
9,course_repeater==np.True_ ∧ croho=='B Psychology & Technology' ∧ type_vooropleiding=='VWO',Model not appropriate,-0.56,-0.188,-0.09,2.964,2.258,2.071,0.802308,23,False,False,False


In [13]:
# Compare individual subgroup against global model

subgroup_rank = 1

df_individual = df_s[(df_s['model_type'] == 'global') | (df_s['subgroup_rank'] == subgroup_rank)]

cols = ['subgroup_rank', 'model_type', 'description', 'cookD', 'n_train', 'n_test'] + rest_cols

df_individual[cols]

Unnamed: 0,subgroup_rank,model_type,description,cookD,n_train,n_test,intercept,coef__total_attended_labsessions,pval__total_attended_labsessions,coef__active_minutes,pval__active_minutes,coef__nr_distinct_files_viewed,pval__nr_distinct_files_viewed,coef__total_course_activities,pval__total_course_activities,coef__distinct_days,pval__distinct_days,coef__nr_files_viewed,pval__nr_files_viewed,coef__nr_practice_exams_viewed,pval__nr_practice_exams_viewed,ttest_p,ttest_stat,ttest_p_mean,ttest_stat_mean,ttest_p_mean_global,ttest_stat_mean_global,wilcoxon_p,wilcoxon_stat,subgroup_better_global,subgroup_better_mean,global_better_mean
0,1.0,subgroup,ECTS=='15' ∧ course_repeater==np.False_ ∧ sex=='SEX_M',19.51,161,122,4.779713,0.059493,0.011402,2.1e-05,0.37408,0.074019,0.002725588,0.006153,0.1884854,-0.00205,0.900882,-0.036407,0.06297257,0.056328,0.008924,0.00069,-3.274801,0.646858,0.377742,0.009122,-2.393091,0.000634,2490.0,True,False,False
15,,global,,,559,374,2.774317,0.064525,7.6e-05,4e-05,0.040861,0.096937,4.528641e-12,0.016668,6.406497e-10,-0.02897,0.007907,-0.079991,4.967337e-12,0.072741,2e-06,,,,,,,,,,,


In [5]:
subgroup_linear_models = pd.read_csv("results/subgroup_linear_models1.csv")

In [7]:
subgroup_linear_models.head(20)

Unnamed: 0,subgroup,n,cookD,term,coef_group,se_group,t_group,p_group,sig_group,coef_global,se_global,t_global,p_global,sig_global
0,ECTS=='15' ∧ course_repeater==np.False_ ∧ sex=...,161,19.51085,Intercept,4.779713,0.643772,7.424547,7.365128e-12,***,2.774317,0.350289,7.92007,1.317561e-14,***
1,ECTS=='15' ∧ course_repeater==np.False_ ∧ sex=...,161,19.51085,total_attended_labsessions,0.059493,0.023229,2.561096,0.01140157,*,0.064525,0.016182,3.987409,7.580384e-05,***
2,ECTS=='15' ∧ course_repeater==np.False_ ∧ sex=...,161,19.51085,active_minutes,2.1e-05,2.4e-05,0.891465,0.3740796,,4e-05,1.9e-05,2.049756,0.04086108,*
3,ECTS=='15' ∧ course_repeater==np.False_ ∧ sex=...,161,19.51085,nr_distinct_files_viewed,0.074019,0.024295,3.046732,0.002725588,**,0.096937,0.013699,7.076028,4.528641e-12,***
4,ECTS=='15' ∧ course_repeater==np.False_ ∧ sex=...,161,19.51085,total_course_activities,0.006153,0.004658,1.320965,0.1884854,,0.016668,0.002649,6.291686,6.406497e-10,***
5,ECTS=='15' ∧ course_repeater==np.False_ ∧ sex=...,161,19.51085,distinct_days,-0.00205,0.016432,-0.124754,0.9008819,,-0.02897,0.010868,-2.665771,0.007906636,**
6,ECTS=='15' ∧ course_repeater==np.False_ ∧ sex=...,161,19.51085,nr_files_viewed,-0.036407,0.019437,-1.873026,0.06297257,.,-0.079991,0.011327,-7.062031,4.967337e-12,***
7,ECTS=='15' ∧ course_repeater==np.False_ ∧ sex=...,161,19.51085,nr_practice_exams_viewed,0.056328,0.021265,2.648857,0.008923789,**,0.072741,0.01523,4.776176,2.293433e-06,***
8,ECTS=='15' ∧ course_repeater==np.False_,222,19.391462,Intercept,4.952012,0.566309,8.744366,6.667954e-16,***,2.774317,0.350289,7.92007,1.317561e-14,***
9,ECTS=='15' ∧ course_repeater==np.False_,222,19.391462,total_attended_labsessions,0.055126,0.019189,2.872732,0.004479099,**,0.064525,0.016182,3.987409,7.580384e-05,***
