In [1]:
import pandas as pd

# Adjust pandas options to see all columns and to read subgroup descriptions fully
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', 200)
pd.set_option('display.max_rows', 200)

In [2]:
df_s = pd.read_csv("results/subgroup_model_results.csv")
#df_s.columns

In [None]:
# Move global model row(s) to the bottom
df_s = pd.concat([
    df_s[df_s['model_type'] != 'global'],
    df_s[df_s['model_type'] == 'global']
], ignore_index=True)

# Assign rank: one per row (since you now have only subgroup_model rows)
df_s['subgroup_rank'] = (df_s.index + 1).astype("object")
df_s.loc[df_s['model_type'] == 'global', 'subgroup_rank'] = "N/A"

# Round columns for better readability
for col, ndigits in [('cookD', 2), ('r2', 3), ('mae', 3), ('mse', 3), ('mean_residual', 3),
                     ('global_baseline_r2', 3), ('global_baseline_mae', 3), ('global_baseline_mse', 3), ('global_baseline_mean_residual', 3),
                     ('mean_baseline_r2', 3), ('mean_baseline_mae', 3), ('mean_baseline_mse', 3), ('mean_baseline_mean_residual', 3)]:
    if col in df_s.columns:
        df_s[col] = df_s[col].round(ndigits)

# Compare r2 between subgroup_model and global baseline for each row
df_s['subgroup_r2_better'] = (df_s['r2'] > df_s['global_baseline_r2']).astype("object")
df_s.loc[df_s['model_type'] == 'global', 'subgroup_r2_better'] = None
df_s['subgroup_better_global'] = (df_s['ttest_p'] < 0.1)
df_s.loc[df_s['model_type'] == 'global', 'subgroup_better_global'] = None
df_s['subgroup_better_mean'] = (df_s['ttest_p_mean'] < 0.1)
df_s.loc[df_s['model_type'] == 'global', 'subgroup_better_mean'] = None
df_s['global_better_mean'] = (df_s['ttest_p_mean_global'] < 0.1)
df_s.loc[df_s['model_type'] == 'global', 'global_better_mean'] = None

# Add decision column based on the rules
def determine_decision(row):
    sg = row['subgroup_better_global']
    sm = row['subgroup_better_mean']
    gm = row['global_better_mean']
    
    if sg and sm:
        return "True discovery"
    elif sg and not sm:
        return "True discovery, but model not appropriate"
    elif not sg and sm:
        return "Weak discovery"
    elif not sg and not sm and gm:
        return "False discovery"
    else:
        return "Model not appropriate"

df_s['decision'] = df_s.apply(determine_decision, axis=1)

# Specify desired column order
main_cols = [
    'subgroup_rank', 'model_type', 'description', 'decision', 'subgroup_r2_better', 'cookD', 'n_train', 'n_test',
    'r2', 'global_baseline_r2', 'mean_baseline_r2', 
    'mae', 'global_baseline_mae', 'mean_baseline_mae',
    'mse', 'global_baseline_mse', 'mean_baseline_mse',
    'mean_residual', 'global_baseline_mean_residual', 'mean_baseline_mean_residual'
 ]
rest_cols = [c for c in df_s.columns if c not in main_cols]
new_order = main_cols + rest_cols

# Reorder columns
df_s = df_s[new_order]

  df_s.loc[df_s['model_type'] == 'global', 'subgroup_better_global'] = None
  df_s.loc[df_s['model_type'] == 'global', 'subgroup_better_mean'] = None
  df_s.loc[df_s['model_type'] == 'global', 'global_better_mean'] = None


TypeError: can only concatenate tuple (not "list") to tuple

In [4]:
df_s.head(1)

Unnamed: 0,subgroup_rank,model_type,description,decision,subgroup_r2_better,cookD,n_train,n_test,r2,global_baseline_r2,mean_baseline_r2,mae,global_baseline_mae,mean_baseline_mae,mse,global_baseline_mse,mean_baseline_mse,mean_residual,global_baseline_mean_residual,mean_baseline_mean_residual,intercept,coef__total_attended_labsessions,pval__total_attended_labsessions,coef__active_minutes,pval__active_minutes,coef__nr_distinct_files_viewed,pval__nr_distinct_files_viewed,coef__total_course_activities,pval__total_course_activities,coef__distinct_days,pval__distinct_days,coef__nr_files_viewed,pval__nr_files_viewed,coef__nr_practice_exams_viewed,pval__nr_practice_exams_viewed,ttest_p,ttest_stat,wilcoxon_p,wilcoxon_stat,ttest_p_mean,ttest_stat_mean,wilcoxon_p_mean,wilcoxon_stat_mean,ttest_p_mean_global,ttest_stat_mean_global,wilcoxon_p_mean_global,wilcoxon_stat_mean_global,subgroup_better_global,subgroup_better_mean,global_better_mean
0,1,subgroup,"ECTS=='15' ∧ GPA in (7.0, 8.9] ∧ croho=='B Computer Science & Engineering'","True discovery, but model not appropriate",True,104.42,91,34,-0.102,-1.323,-0.103,0.651,0.962,0.649,0.603,1.271,0.603,-0.133,0.845,-0.237,6.071988,0.044559,0.082377,3e-06,0.904598,0.046316,0.139547,5.8e-05,0.989982,0.048305,0.003561,-0.014246,0.455406,0.004711,0.845443,0.010143,-2.43873,0.007655,157.0,0.49876,-0.003133,0.526904,301.0,0.986973,2.33026,0.988673,429.0,True,False,False


In [5]:
# Check subgroup models
df_s[['description', 'decision',   
      'r2', 'global_baseline_r2', 'mean_baseline_r2',
      'mse', 'global_baseline_mse', 'mean_baseline_mse',
      'ttest_p', 'n_test', 'subgroup_better_global', 'subgroup_better_mean', 'global_better_mean']].head(100)

Unnamed: 0,description,decision,r2,global_baseline_r2,mean_baseline_r2,mse,global_baseline_mse,mean_baseline_mse,ttest_p,n_test,subgroup_better_global,subgroup_better_mean,global_better_mean
0,"ECTS=='15' ∧ GPA in (7.0, 8.9] ∧ croho=='B Computer Science & Engineering'","True discovery, but model not appropriate",-0.102,-1.323,-0.103,0.603,1.271,0.603,0.010143,34,True,False,False
1,"GPA in (7.0, 8.9] ∧ croho=='B Computer Science & Engineering'","True discovery, but model not appropriate",-0.078,-1.344,-0.104,0.566,1.231,0.58,0.006799,36,True,False,False
2,"GPA in (7.0, 8.9] ∧ course_repeater==np.False_ ∧ croho=='B Computer Science & Engineering'","True discovery, but model not appropriate",-0.078,-1.344,-0.104,0.566,1.231,0.58,0.006799,36,True,False,False
3,"ECTS=='15' ∧ GPA in (7.0, 8.9]",True discovery,0.068,-0.865,-0.085,0.569,1.14,0.663,0.004023,60,True,True,False
4,"ECTS=='15' ∧ GPA in (7.0, 8.9] ∧ course_repeater==np.False_",True discovery,0.068,-0.865,-0.085,0.569,1.14,0.663,0.004023,60,True,True,False
5,"GPA in (7.0, 8.9]",True discovery,0.009,-0.78,-0.132,0.588,1.056,0.672,0.007135,72,True,True,False
6,"GPA in (7.0, 8.9] ∧ course_repeater==np.False_",True discovery,0.009,-0.78,-0.132,0.588,1.056,0.672,0.007135,72,True,True,False
7,"ECTS=='15' ∧ GPA in (7.0, 8.9] ∧ sex=='SEX_M'",True discovery,0.125,-1.461,-0.057,0.482,1.356,0.582,0.001743,38,True,True,False
8,"GPA in (7.0, 8.9] ∧ origin=='ORIGIN_E'",True discovery,0.174,-1.186,-0.008,0.425,1.123,0.518,0.003199,37,True,True,False
9,"GPA in (7.0, 8.9] ∧ course_repeater==np.False_ ∧ origin=='ORIGIN_E'",True discovery,0.174,-1.186,-0.008,0.425,1.123,0.518,0.003199,37,True,True,False


In [5]:
# Compare individual subgroup against global model

subgroup_rank = 1

df_individual = df_s[(df_s['model_type'] == 'global') | (df_s['subgroup_rank'] == subgroup_rank)]

cols = ['subgroup_rank', 'model_type', 'description', 'cookD', 'n_train', 'n_test'] + rest_cols

df_individual[cols]

Unnamed: 0,subgroup_rank,model_type,description,cookD,n_train,n_test,intercept,coef__G1,pval__G1,coef__G2,pval__G2,coef__absences,pval__absences,coef__activities,pval__activities,coef__paid,pval__paid,coef__famsup,pval__famsup,coef__schoolsup,pval__schoolsup,coef__studytime,pval__studytime,indices,ttest_p,ttest_stat,wilcoxon_p,wilcoxon_stat,ttest_p_mean,ttest_stat_mean,wilcoxon_p_mean,wilcoxon_stat_mean,ttest_p_mean_global,ttest_stat_mean_global,wilcoxon_p_mean_global,wilcoxon_stat_mean_global,subgroup_better_global,subgroup_better_mean,global_better_mean
0,1.0,subgroup,"Walc in (2.0, 3.0] ∧ nursery=='yes' ∧ school=='MS'",474.33,29,5,-2.790428,0.795457,0.000344,0.393798,0.03085668,-0.049514,0.69335,0.714396,0.292423,-5.530959,0.002976,1.96735,0.03099,-9.398559,5.9e-05,-0.003401,0.994783,"[4, 8, 18, 36, 46, 53, 58, 75, 83, 95, 97, 118, 155, 176, 215, 221, 227, 235, 249, 290, 313, 363, 365, 440, 447, 457, 461, 486, 504]",0.865752,1.283915,0.90625,12.0,0.106733,-1.478048,0.21875,4.0,0.089036,-1.631734,0.0625,1.0,False,False,True
50,,global,,,519,130,-0.330637,0.158945,7.7e-05,0.887581,8.869635000000001e-84,0.017393,0.153938,-0.079296,0.471657,-0.343987,0.124951,0.122758,0.278713,-0.114223,0.539993,0.04124,0.564622,,,,,,,,,,,,,,,,


In [6]:
subgroup_linear_models = pd.read_csv("results/subgroup_portuguese_linear.csv")

In [7]:
subgroup_linear_models[['subgroup', 'n', 'cookD']].drop_duplicates().iloc[[14, 18, 25, 28, 29, 38, 44, 45, 46]]#[8,9, 27, 28, 41, 47]]

Unnamed: 0,subgroup,n,cookD
126,"Mjob=='other' ∧ Pstatus=='T' ∧ Walc in (3.0, 5.0]",25,240.53405
162,"Walc in (2.0, 3.0] ∧ sex=='M' ∧ traveltime in (0.999, 2.0]",25,234.751851
225,"Dalc in (2.0, 5.0] ∧ famrel in (0.999, 4.0] ∧ school=='GP'",24,218.148582
252,"goout in (4.0, 5.0] ∧ higher=='yes' ∧ reason=='course'",29,215.42872
261,"Walc in (3.0, 5.0] ∧ guardian=='father'",23,211.476044
342,"Mjob=='other' ∧ Walc in (3.0, 5.0] ∧ higher=='yes'",23,195.506601
396,"Pstatus=='T' ∧ freetime in (4.0, 5.0] ∧ reason=='course'",20,190.188238
405,"Walc in (2.0, 3.0] ∧ higher=='yes' ∧ romantic=='no'",40,189.731408
414,"Medu in (3.0, 4.0] ∧ age in (17.0, 18.0] ∧ nursery=='yes'",23,189.182909


In [8]:
subgroup_linear_models

Unnamed: 0,subgroup,n,cookD,indices,term,coef_group,se_group,t_group,p_group,sig_group,coef_global,se_global,t_global,p_global,sig_global
0,"Dalc in (2.0, 5.0] ∧ reason=='course'",20,709.810622,"[25, 31, 38, 55, 74, 75, 123, 137, 154, 170, 182, 231, 291, 302, 342, 348, 366, 419, 429, 444]",Intercept,-3.448716,3.816816,-0.903558,0.385590,,0.027177,0.268016,0.101401,9.192777e-01,
1,"Dalc in (2.0, 5.0] ∧ reason=='course'",20,709.810622,"[25, 31, 38, 55, 74, 75, 123, 137, 154, 170, 182, 231, 291, 302, 342, 348, 366, 419, 429, 444]",G1,1.491312,0.892413,1.671100,0.122876,,0.132953,0.039030,3.406398,7.180220e-04,***
2,"Dalc in (2.0, 5.0] ∧ reason=='course'",20,709.810622,"[25, 31, 38, 55, 74, 75, 123, 137, 154, 170, 182, 231, 291, 302, 342, 348, 366, 419, 429, 444]",G2,0.002628,0.766467,0.003429,0.997325,,0.890912,0.036540,24.381771,5.217999e-84,***
3,"Dalc in (2.0, 5.0] ∧ reason=='course'",20,709.810622,"[25, 31, 38, 55, 74, 75, 123, 137, 154, 170, 182, 231, 291, 302, 342, 348, 366, 419, 429, 444]",absences,0.419381,0.174835,2.398725,0.035312,*,0.010979,0.012097,0.907587,3.645878e-01,
4,"Dalc in (2.0, 5.0] ∧ reason=='course'",20,709.810622,"[25, 31, 38, 55, 74, 75, 123, 137, 154, 170, 182, 231, 291, 302, 342, 348, 366, 419, 429, 444]",activities,-0.794305,1.403415,-0.565980,0.582769,,-0.051632,0.109035,-0.473532,6.360657e-01,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
445,"Pstatus=='T' ∧ Walc in (3.0, 5.0] ∧ nursery=='no'",23,187.598493,"[1, 20, 33, 35, 66, 68, 74, 83, 147, 154, 216, 321, 331, 334, 342, 348, 383, 399, 402, 419, 427, 429, 431]",activities,0.200127,0.570490,0.350798,0.730962,,-0.051632,0.109035,-0.473532,6.360657e-01,
446,"Pstatus=='T' ∧ Walc in (3.0, 5.0] ∧ nursery=='no'",23,187.598493,"[1, 20, 33, 35, 66, 68, 74, 83, 147, 154, 216, 321, 331, 334, 342, 348, 383, 399, 402, 419, 427, 429, 431]",paid,0.326356,1.092965,0.298597,0.769634,,-0.449179,0.221763,-2.025495,4.341291e-02,*
447,"Pstatus=='T' ∧ Walc in (3.0, 5.0] ∧ nursery=='no'",23,187.598493,"[1, 20, 33, 35, 66, 68, 74, 83, 147, 154, 216, 321, 331, 334, 342, 348, 383, 399, 402, 419, 427, 429, 431]",famsup,1.108130,0.617034,1.795898,0.094117,.,0.098840,0.113121,0.873762,3.827193e-01,
448,"Pstatus=='T' ∧ Walc in (3.0, 5.0] ∧ nursery=='no'",23,187.598493,"[1, 20, 33, 35, 66, 68, 74, 83, 147, 154, 216, 321, 331, 334, 342, 348, 383, 399, 402, 419, 427, 429, 431]",schoolsup,5.083626,1.357058,3.746063,0.002170,**,0.027472,0.184330,0.149034,8.815941e-01,
