In [23]:
import pandas as pd

# Display settings: never truncate the column list, and allow up to 200
# characters per cell so subgroup descriptions can be read in full
pd.options.display.max_columns = None
pd.options.display.max_colwidth = 200

In [55]:
df_s = pd.read_csv("results/subgroup_model_results.csv")

# Rows with model_type == 'global' are treated as the reference model, not as a subgroup
is_global = df_s['model_type'] == 'global'

# Rank rows by their position in the file (1-based). Object dtype lets the
# global row carry the "N/A" placeholder instead of a number.
df_s['subgroup_rank'] = (df_s.index + 1).astype("object")
df_s.loc[is_global, 'subgroup_rank'] = "N/A"

# Flag rows whose r2 beats the baseline r2. This must happen BEFORE rounding:
# comparing values already rounded to 3 decimals turns small but real
# differences into ties, which would then be reported as "not better".
df_s['subgroup_r2_better'] = (df_s['r2'] > df_s['baseline_r2']).astype("object")
df_s.loc[is_global, 'subgroup_r2_better'] = None

# Round metric columns for readability (display only; absent columns are skipped)
round_digits = {
    'cookD': 2,
    'r2': 3, 'mae': 3, 'mse': 3, 'mean_residual': 3,
    'baseline_r2': 3, 'baseline_mae': 3, 'baseline_mse': 3, 'baseline_mean_residual': 3,
}
df_s = df_s.round({name: digits for name, digits in round_digits.items() if name in df_s.columns})

# Specify desired column order: key metrics first, everything else
# (intercept, coefficients, p-values, ...) afterwards in file order
main_cols = [
    'subgroup_rank', 'model_type', 'description', 'subgroup_r2_better', 'cookD', 'n_train', 'n_test',
    'r2', 'baseline_r2', 'mae', 'baseline_mae', 'mse', 'baseline_mse', 'mean_residual',
    'baseline_mean_residual'
]
rest_cols = [c for c in df_s.columns if c not in main_cols]
new_order = main_cols + rest_cols

# Reorder columns
df_s = df_s[new_order]

In [42]:
# Check global model: select it by model_type rather than by position, so the
# right row is shown even if the global model is not the last row of the file
df_s[df_s['model_type'] == 'global']

Unnamed: 0,subgroup_rank,model_type,description,subgroup_r2_better,cookD,n_train,n_test,r2,baseline_r2,mae,baseline_mae,mse,baseline_mse,mean_residual,baseline_mean_residual,intercept,coef__total_attended_labsessions,pval__total_attended_labsessions,coef__active_minutes,pval__active_minutes,coef__nr_distinct_files_viewed,pval__nr_distinct_files_viewed,coef__total_course_activities,pval__total_course_activities,coef__distinct_days,pval__distinct_days,coef__nr_files_viewed,pval__nr_files_viewed,coef__nr_practice_exams_viewed,pval__nr_practice_exams_viewed
15,,global,,,,559,374,0.226,,0.93,,1.359,,0.063,,2.774317,0.064525,7.6e-05,4e-05,0.040861,0.096937,4.528641e-12,0.016668,6.406497e-10,-0.02897,0.007907,-0.079991,4.967337e-12,0.072741,2e-06


In [56]:
# Inspect the subgroup models: show at most the first 40 rows
df_s.iloc[:40]

Unnamed: 0,subgroup_rank,model_type,description,subgroup_r2_better,cookD,n_train,n_test,r2,baseline_r2,mae,baseline_mae,mse,baseline_mse,mean_residual,baseline_mean_residual,intercept,coef__total_attended_labsessions,pval__total_attended_labsessions,coef__active_minutes,pval__active_minutes,coef__nr_distinct_files_viewed,pval__nr_distinct_files_viewed,coef__total_course_activities,pval__total_course_activities,coef__distinct_days,pval__distinct_days,coef__nr_files_viewed,pval__nr_files_viewed,coef__nr_practice_exams_viewed,pval__nr_practice_exams_viewed,wilcoxon_p,wilcoxon_stat
0,1.0,subgroup,ECTS=='15' ∧ course_repeater==np.False_ ∧ sex=='SEX_M',True,19.51,161,122,-0.005,-0.353,0.846,0.988,1.023,1.377,-0.046,0.432,4.779713,0.059493,0.011402,2.123436e-05,0.37408,0.074019,0.002725588,0.006153,0.1884854,-0.00205,0.900882,-0.036407,0.06297257,0.056328,0.008924,0.000634,2490.0
1,2.0,subgroup,ECTS=='15' ∧ course_repeater==np.False_,True,19.39,222,157,-0.006,-0.346,0.808,0.936,0.963,1.288,-0.028,0.436,4.952012,0.055126,0.004479,2.180098e-05,0.291611,0.075963,0.0002015234,0.006769,0.07577193,-0.013839,0.323329,-0.038075,0.01825012,0.060359,0.000925,0.00036,4272.0
2,3.0,subgroup,ECTS=='15',True,17.82,224,158,-0.012,-0.352,0.81,0.938,0.966,1.292,-0.026,0.425,4.820753,0.053338,0.00638,2.329117e-05,0.264509,0.080467,8.242501e-05,0.00634,0.09896983,-0.010717,0.445345,-0.036967,0.02311647,0.059345,0.00123,0.000222,4257.0
3,4.0,subgroup,ECTS=='15' ∧ sex=='SEX_M',True,17.76,163,123,-0.01,-0.36,0.849,0.991,1.025,1.381,-0.043,0.417,4.644857,0.057681,0.015048,2.330406e-05,0.334634,0.080338,0.001099184,0.005375,0.2540198,0.001517,0.926772,-0.034106,0.0842112,0.055407,0.010796,0.000508,2511.0
4,5.0,subgroup,course_repeater==np.False_ ∧ croho=='B Computer Science & Engineering' ∧ sex=='SEX_M',True,12.47,167,106,-0.026,-0.264,0.951,1.062,1.315,1.619,0.068,0.436,3.428655,0.09656,0.001031,-5.186074e-06,0.838207,0.100617,3.511573e-05,0.009385,0.03917193,-0.002748,0.872128,-0.05322,0.005692475,0.095627,3.4e-05,0.005816,2035.0
5,6.0,subgroup,croho=='B Computer Science & Engineering' ∧ sex=='SEX_M',True,12.12,173,112,0.147,-0.006,0.957,1.052,1.376,1.623,0.009,0.372,3.499475,0.091877,0.001866,-3.992256e-06,0.876025,0.099222,3.689359e-05,0.008647,0.0559164,-3.9e-05,0.998143,-0.050062,0.008876456,0.093403,4e-05,0.012984,2397.0
6,7.0,subgroup,course_repeater==np.False_ ∧ croho=='B Computer Science & Engineering',True,10.16,210,136,-0.024,-0.315,0.913,1.038,1.228,1.577,0.115,0.468,3.597339,0.070225,0.004363,-4.74399e-07,0.9827,0.09785,5.656173e-06,0.00896,0.02222905,-0.002723,0.858205,-0.050426,0.002640554,0.087239,2.1e-05,0.000374,3106.0
7,8.0,subgroup,croho=='B Computer Science & Engineering',True,10.08,216,142,0.129,-0.075,0.92,1.031,1.282,1.582,0.067,0.416,3.648746,0.067353,0.006439,3.423507e-07,0.987589,0.096819,5.640508e-06,0.00839,0.03134482,-0.000609,0.96765,-0.047999,0.004002949,0.085637,2.2e-05,0.000889,3542.0
8,9.0,subgroup,origin=='ORIGIN_N' ∧ sex=='SEX_M',False,3.13,154,101,0.337,0.4,1.028,0.992,1.607,1.455,-0.068,-0.02,2.942578,0.07567,0.021248,3.785857e-05,0.369795,0.075553,0.007152453,0.024477,2.91997e-05,-0.052444,0.019903,-0.112408,1.599562e-05,0.073827,0.020308,0.885108,2930.0
9,10.0,subgroup,origin=='ORIGIN_E' ∧ sex=='SEX_M',False,2.81,156,108,-0.004,0.046,0.997,0.959,1.629,1.549,0.212,0.187,1.806042,0.064434,0.088407,3.292618e-05,0.323566,0.09243,0.002714872,0.017733,0.001913318,-0.002806,0.896906,-0.080922,0.0007991598,0.053655,0.098485,0.967049,3543.0


In [47]:
# Put one subgroup model next to the global model to compare their coefficients

subgroup_rank = 1  # rank of the subgroup to inspect

# Keep the global row plus the row whose rank matches the chosen subgroup
wanted_rows = df_s['model_type'].eq('global') | df_s['subgroup_rank'].eq(subgroup_rank)
df_individual = df_s.loc[wanted_rows]

# Identification columns first, then every non-metric column (rest_cols)
cols = ['subgroup_rank', 'model_type', 'description', 'cookD', 'n_train', 'n_test', *rest_cols]

df_individual.loc[:, cols]

Unnamed: 0,subgroup_rank,model_type,description,cookD,n_train,n_test,intercept,coef__total_attended_labsessions,pval__total_attended_labsessions,coef__active_minutes,pval__active_minutes,coef__nr_distinct_files_viewed,pval__nr_distinct_files_viewed,coef__total_course_activities,pval__total_course_activities,coef__distinct_days,pval__distinct_days,coef__nr_files_viewed,pval__nr_files_viewed,coef__nr_practice_exams_viewed,pval__nr_practice_exams_viewed
0,1.0,subgroup_model,ECTS=='15' ∧ course_repeater==np.False_ ∧ sex=='SEX_M',19.51,161,122,4.779713,0.059493,0.011402,2.1e-05,0.37408,0.074019,0.002725588,0.006153,0.1884854,-0.00205,0.900882,-0.036407,0.06297257,0.056328,0.008924
15,,global,,,559,374,2.774317,0.064525,7.6e-05,4e-05,0.040861,0.096937,4.528641e-12,0.016668,6.406497e-10,-0.02897,0.007907,-0.079991,4.967337e-12,0.072741,2e-06


In [5]:
# Load the per-term linear model table (one row per subgroup and model term)
linear_models_path = "results/subgroup_linear_models1.csv"
subgroup_linear_models = pd.read_csv(linear_models_path)

In [7]:
# Preview the first 20 term rows of the linear model table
subgroup_linear_models.iloc[:20]

Unnamed: 0,subgroup,n,cookD,term,coef_group,se_group,t_group,p_group,sig_group,coef_global,se_global,t_global,p_global,sig_global
0,ECTS=='15' ∧ course_repeater==np.False_ ∧ sex=...,161,19.51085,Intercept,4.779713,0.643772,7.424547,7.365128e-12,***,2.774317,0.350289,7.92007,1.317561e-14,***
1,ECTS=='15' ∧ course_repeater==np.False_ ∧ sex=...,161,19.51085,total_attended_labsessions,0.059493,0.023229,2.561096,0.01140157,*,0.064525,0.016182,3.987409,7.580384e-05,***
2,ECTS=='15' ∧ course_repeater==np.False_ ∧ sex=...,161,19.51085,active_minutes,2.1e-05,2.4e-05,0.891465,0.3740796,,4e-05,1.9e-05,2.049756,0.04086108,*
3,ECTS=='15' ∧ course_repeater==np.False_ ∧ sex=...,161,19.51085,nr_distinct_files_viewed,0.074019,0.024295,3.046732,0.002725588,**,0.096937,0.013699,7.076028,4.528641e-12,***
4,ECTS=='15' ∧ course_repeater==np.False_ ∧ sex=...,161,19.51085,total_course_activities,0.006153,0.004658,1.320965,0.1884854,,0.016668,0.002649,6.291686,6.406497e-10,***
5,ECTS=='15' ∧ course_repeater==np.False_ ∧ sex=...,161,19.51085,distinct_days,-0.00205,0.016432,-0.124754,0.9008819,,-0.02897,0.010868,-2.665771,0.007906636,**
6,ECTS=='15' ∧ course_repeater==np.False_ ∧ sex=...,161,19.51085,nr_files_viewed,-0.036407,0.019437,-1.873026,0.06297257,.,-0.079991,0.011327,-7.062031,4.967337e-12,***
7,ECTS=='15' ∧ course_repeater==np.False_ ∧ sex=...,161,19.51085,nr_practice_exams_viewed,0.056328,0.021265,2.648857,0.008923789,**,0.072741,0.01523,4.776176,2.293433e-06,***
8,ECTS=='15' ∧ course_repeater==np.False_,222,19.391462,Intercept,4.952012,0.566309,8.744366,6.667954e-16,***,2.774317,0.350289,7.92007,1.317561e-14,***
9,ECTS=='15' ∧ course_repeater==np.False_,222,19.391462,total_attended_labsessions,0.055126,0.019189,2.872732,0.004479099,**,0.064525,0.016182,3.987409,7.580384e-05,***
