In [1]:
import pandas as pd

# Display settings: never elide columns, and allow up to 200 characters per
# cell so that long subgroup descriptions can be read in full.
pd.options.display.max_columns = None
pd.options.display.max_colwidth = 200

In [50]:
# Load the subgroup evaluation results.
df_s = pd.read_csv("results/subgroup_ss_portuguese_results8020.csv")

# Rank is the 1-based row position in the file. Rows whose model_type is
# 'global' get the label "N/A" instead of a number; the column is kept as
# object dtype so it can hold both integers and that label.
is_global_row = df_s['model_type'].eq('global')
row_position = pd.Series((df_s.index + 1).astype("object"), index=df_s.index)
df_s['subgroup_rank'] = row_position.mask(is_global_row, "N/A")

# Round the metric columns for readability: cookD keeps 2 decimals, every
# other listed metric (and its baseline_ counterpart) keeps 3.
three_decimal_cols = [
    'r2', 'mae', 'mse', 'mean_residual',
    'baseline_r2', 'baseline_mae', 'baseline_mse', 'baseline_mean_residual',
]
decimals_by_column = {'cookD': 2, **dict.fromkeys(three_decimal_cols, 3)}

for col, ndigits in decimals_by_column.items():
    # Columns that are absent from the loaded file are simply skipped.
    if col not in df_s.columns:
        continue
    df_s[col] = df_s[col].round(ndigits)

# Threshold applied to every p-value column below.
ALPHA = 0.1

# Every flag is blanked (None) on rows whose model_type is 'global'.
is_global = df_s['model_type'] == 'global'

# True where the row's r2 exceeds its baseline_r2.
# NOTE(review): r2 and baseline_r2 were rounded to 3 decimals earlier in this
# cell, so values that differ only beyond the third decimal compare as
# "not better" -- confirm that is intended.
df_s['subgroup_r2_better'] = (df_s['r2'] > df_s['baseline_r2']).astype("object")
df_s.loc[is_global, 'subgroup_r2_better'] = None

# One flag per p-value column: True when the p-value is below ALPHA
# (a missing p-value compares as False).
# The comparison result is cast to object *before* None is written. Writing
# None into a bool column relies on silent upcasting, which pandas has
# deprecated (it emits a FutureWarning and is slated to become an error).
for flag_col, pval_col in [
    ('subgroup_better_global', 'ttest_p'),
    ('subgroup_better_mean', 'ttest_p_mean'),
    ('global_better_mean', 'ttest_p_mean_global'),
]:
    df_s[flag_col] = (df_s[pval_col] < ALPHA).astype("object")
    df_s.loc[is_global, flag_col] = None

# Column layout: identifying fields first, then each metric next to its
# baseline counterpart; every remaining column follows in its existing order.
main_cols = [
    'subgroup_rank', 'model_type', 'description', 'subgroup_r2_better',
    'cookD', 'n_train', 'n_test',
    'r2', 'baseline_r2',
    'mae', 'baseline_mae',
    'mse', 'baseline_mse',
    'mean_residual', 'baseline_mean_residual',
]
rest_cols = list(filter(lambda name: name not in main_cols, df_s.columns))
new_order = [*main_cols, *rest_cols]

# Apply the new layout.
df_s = df_s.loc[:, new_order]

  df_s.loc[df_s['model_type'] == 'global', 'subgroup_better_global'] = None
  df_s.loc[df_s['model_type'] == 'global', 'subgroup_better_mean'] = None
  df_s.loc[df_s['model_type'] == 'global', 'global_better_mean'] = None


In [51]:
# Check global model: select it by model_type rather than assuming it is the
# last row of the file, so the right row is shown even if the CSV ordering
# changes.
df_s[df_s['model_type'] == 'global']

Unnamed: 0,subgroup_rank,model_type,description,subgroup_r2_better,cookD,n_train,n_test,r2,baseline_r2,mae,baseline_mae,mse,baseline_mse,mean_residual,baseline_mean_residual,intercept,coef__G1,pval__G1,coef__G2,pval__G2,coef__absences,pval__absences,coef__activities,pval__activities,coef__paid,pval__paid,coef__famsup,pval__famsup,coef__schoolsup,pval__schoolsup,coef__studytime,pval__studytime,indices,ttest_p,ttest_stat,wilcoxon_p,wilcoxon_stat,ttest_p_mean,ttest_stat_mean,wilcoxon_p_mean,wilcoxon_stat_mean,ttest_p_mean_global,ttest_stat_mean_global,wilcoxon_p_mean_global,wilcoxon_stat_mean_global,subgroup_better_global,subgroup_better_mean,global_better_mean
50,,global,,,,519,130,0.851,,0.84,,1.746,,0.04,,-0.330637,0.158945,7.7e-05,0.887581,8.869635000000001e-84,0.017393,0.153938,-0.079296,0.471657,-0.343987,0.124951,0.122758,0.278713,-0.114223,0.539993,0.04124,0.564622,,,,,,,,,,,,,,,,


In [52]:
# Peek at the first row of the results table.
df_s.iloc[:1]

Unnamed: 0,subgroup_rank,model_type,description,subgroup_r2_better,cookD,n_train,n_test,r2,baseline_r2,mae,baseline_mae,mse,baseline_mse,mean_residual,baseline_mean_residual,intercept,coef__G1,pval__G1,coef__G2,pval__G2,coef__absences,pval__absences,coef__activities,pval__activities,coef__paid,pval__paid,coef__famsup,pval__famsup,coef__schoolsup,pval__schoolsup,coef__studytime,pval__studytime,indices,ttest_p,ttest_stat,wilcoxon_p,wilcoxon_stat,ttest_p_mean,ttest_stat_mean,wilcoxon_p_mean,wilcoxon_stat_mean,ttest_p_mean_global,ttest_stat_mean_global,wilcoxon_p_mean_global,wilcoxon_stat_mean_global,subgroup_better_global,subgroup_better_mean,global_better_mean
0,1,subgroup,"Walc in (2.0, 3.0] ∧ nursery=='yes' ∧ school=='MS'",False,474.33,29,5,0.792,0.931,1.796,1.156,5.205,1.723,0.305,-0.11,-2.790428,0.795457,0.000344,0.393798,0.030857,-0.049514,0.69335,0.714396,0.292423,-5.530959,0.002976,1.96735,0.03099,-9.398559,5.9e-05,-0.003401,0.994783,"[4, 8, 18, 36, 46, 53, 58, 75, 83, 95, 97, 118, 155, 176, 215, 221, 227, 235, 249, 290, 313, 363, 365, 440, 447, 457, 461, 486, 504]",0.865752,1.283915,0.90625,12.0,0.106733,-1.478048,0.21875,4.0,0.089036,-1.631734,0.0625,1.0,False,False,True


In [58]:
# Check subgroup models: keep the rows flagged True in BOTH
# subgroup_better_global and subgroup_better_mean.
# (.eq(True) is used because the flag columns can also hold None.)
passes_vs_global = df_s['subgroup_better_global'].eq(True)
passes_vs_mean = df_s['subgroup_better_mean'].eq(True)
df_interest = df_s.loc[passes_vs_global & passes_vs_mean]

# Index lists noted in the original cell:
# mat = [14, 18, 25, 28, 29, 38, 44, 45, 46]
# por = [15, 20, 31, 34, 39, 40, 44]
interest_cols = [
    'description', 'n_test', 'n_train', 'mae', 'r2',
    'subgroup_better_global', 'subgroup_better_mean', 'global_better_mean',
    'cookD',
]

# Number of matching subgroups, followed by their summary columns.
print(len(df_interest))
df_interest.loc[:, interest_cols]

1


Unnamed: 0,description,n_test,n_train,mae,r2,subgroup_better_global,subgroup_better_mean,global_better_mean,cookD
30,"Fjob=='services' ∧ age in (16.0, 17.0] ∧ internet=='yes'",8,25,0.86,0.873,True,True,True,263.78


In [60]:

# Rows flagged True in AT LEAST ONE of subgroup_better_global /
# subgroup_better_mean; at most the first 20 of them are displayed.
flagged_vs_global = df_s['subgroup_better_global'].eq(True)
flagged_vs_mean = df_s['subgroup_better_mean'].eq(True)
df_top5 = df_s.loc[flagged_vs_global | flagged_vs_mean]

report_cols = [
    'description', 'n_test', 'n_train', 'mae', 'r2',
    'subgroup_better_global', 'subgroup_better_mean', 'global_better_mean',
    'cookD',
]
df_top5.loc[:, report_cols].head(20)

Unnamed: 0,description,n_test,n_train,mae,r2,subgroup_better_global,subgroup_better_mean,global_better_mean,cookD
7,"Dalc in (0.999, 2.0] ∧ Walc in (2.0, 3.0] ∧ school=='MS'",6,30,2.519,0.661,False,True,True,332.14
11,"Medu in (2.0, 4.0] ∧ famsize=='GT3' ∧ school=='MS'",10,43,1.998,0.548,False,True,True,318.78
13,"Medu in (2.0, 4.0] ∧ school=='MS' ∧ sex=='F'",13,32,1.828,0.6,False,True,True,301.66
14,"Medu in (2.0, 4.0] ∧ school=='MS' ∧ traveltime in (0.999, 2.0]",15,55,1.326,0.706,False,True,True,297.68
15,"Medu in (2.0, 4.0] ∧ nursery=='yes' ∧ school=='MS'",11,49,0.938,0.917,False,True,True,296.21
17,"Medu in (2.0, 4.0] ∧ address=='U' ∧ school=='MS'",9,34,0.564,0.955,False,True,True,288.33
18,"Medu in (2.0, 4.0] ∧ higher=='yes' ∧ school=='MS'",14,57,1.61,0.717,False,True,True,286.38
24,"Fjob=='services' ∧ age in (16.0, 17.0] ∧ famsize=='GT3'",6,25,0.849,0.881,False,True,True,276.59
25,"Medu in (2.0, 4.0] ∧ school=='MS'",16,61,1.547,0.728,False,True,True,276.28
26,"Medu in (2.0, 4.0] ∧ failures in (-0.001, 3.0] ∧ school=='MS'",16,61,1.547,0.728,False,True,True,276.28


In [5]:
# Compare one subgroup against the global model: show the row with the chosen
# rank together with the row(s) whose model_type is 'global'.
subgroup_rank = 1  # rank of the subgroup to inspect

is_global_model = df_s['model_type'].eq('global')
has_chosen_rank = df_s['subgroup_rank'].eq(subgroup_rank)
df_individual = df_s.loc[is_global_model | has_chosen_rank]

# Identifying columns first, then every column collected in rest_cols.
id_cols = ['subgroup_rank', 'model_type', 'description', 'cookD', 'n_train', 'n_test']
cols = [*id_cols, *rest_cols]

df_individual[cols]

Unnamed: 0,subgroup_rank,model_type,description,cookD,n_train,n_test,intercept,coef__G1,pval__G1,coef__G2,pval__G2,coef__absences,pval__absences,coef__activities,pval__activities,coef__paid,pval__paid,coef__famsup,pval__famsup,coef__schoolsup,pval__schoolsup,coef__studytime,pval__studytime,indices,ttest_p,ttest_stat,wilcoxon_p,wilcoxon_stat,ttest_p_mean,ttest_stat_mean,wilcoxon_p_mean,wilcoxon_stat_mean,ttest_p_mean_global,ttest_stat_mean_global,wilcoxon_p_mean_global,wilcoxon_stat_mean_global,subgroup_better_global,subgroup_better_mean,global_better_mean
0,1.0,subgroup,"Walc in (2.0, 3.0] ∧ nursery=='yes' ∧ school=='MS'",474.33,29,5,-2.790428,0.795457,0.000344,0.393798,0.03085668,-0.049514,0.69335,0.714396,0.292423,-5.530959,0.002976,1.96735,0.03099,-9.398559,5.9e-05,-0.003401,0.994783,"[4, 8, 18, 36, 46, 53, 58, 75, 83, 95, 97, 118, 155, 176, 215, 221, 227, 235, 249, 290, 313, 363, 365, 440, 447, 457, 461, 486, 504]",0.865752,1.283915,0.90625,12.0,0.106733,-1.478048,0.21875,4.0,0.089036,-1.631734,0.0625,1.0,False,False,True
50,,global,,,519,130,-0.330637,0.158945,7.7e-05,0.887581,8.869635000000001e-84,0.017393,0.153938,-0.079296,0.471657,-0.343987,0.124951,0.122758,0.278713,-0.114223,0.539993,0.04124,0.564622,,,,,,,,,,,,,,,,


In [6]:
# Load the linear-model table for the Portuguese results.
linear_results_path = "results/subgroup_portuguese_linear.csv"
subgroup_linear_models = pd.read_csv(linear_results_path)

In [7]:
# Reduce the table to one row per distinct (subgroup, n, cookD) combination,
# then display a hand-picked set of row positions from that reduced table.
# NOTE(review): these positions equal the list labelled "mat" in an earlier
# commented note, while this table is loaded from the Portuguese file --
# confirm this is the intended list.
picked_positions = [14, 18, 25, 28, 29, 38, 44, 45, 46]
subgroup_overview = subgroup_linear_models.loc[:, ['subgroup', 'n', 'cookD']].drop_duplicates()
subgroup_overview.iloc[picked_positions]

Unnamed: 0,subgroup,n,cookD
126,"Mjob=='other' ∧ Pstatus=='T' ∧ Walc in (3.0, 5.0]",25,240.53405
162,"Walc in (2.0, 3.0] ∧ sex=='M' ∧ traveltime in (0.999, 2.0]",25,234.751851
225,"Dalc in (2.0, 5.0] ∧ famrel in (0.999, 4.0] ∧ school=='GP'",24,218.148582
252,"goout in (4.0, 5.0] ∧ higher=='yes' ∧ reason=='course'",29,215.42872
261,"Walc in (3.0, 5.0] ∧ guardian=='father'",23,211.476044
342,"Mjob=='other' ∧ Walc in (3.0, 5.0] ∧ higher=='yes'",23,195.506601
396,"Pstatus=='T' ∧ freetime in (4.0, 5.0] ∧ reason=='course'",20,190.188238
405,"Walc in (2.0, 3.0] ∧ higher=='yes' ∧ romantic=='no'",40,189.731408
414,"Medu in (3.0, 4.0] ∧ age in (17.0, 18.0] ∧ nursery=='yes'",23,189.182909


In [8]:
# Display the linear-model table; for a long frame pandas shows only the
# first and last rows and elides the middle.
subgroup_linear_models

Unnamed: 0,subgroup,n,cookD,indices,term,coef_group,se_group,t_group,p_group,sig_group,coef_global,se_global,t_global,p_global,sig_global
0,"Dalc in (2.0, 5.0] ∧ reason=='course'",20,709.810622,"[25, 31, 38, 55, 74, 75, 123, 137, 154, 170, 182, 231, 291, 302, 342, 348, 366, 419, 429, 444]",Intercept,-3.448716,3.816816,-0.903558,0.385590,,0.027177,0.268016,0.101401,9.192777e-01,
1,"Dalc in (2.0, 5.0] ∧ reason=='course'",20,709.810622,"[25, 31, 38, 55, 74, 75, 123, 137, 154, 170, 182, 231, 291, 302, 342, 348, 366, 419, 429, 444]",G1,1.491312,0.892413,1.671100,0.122876,,0.132953,0.039030,3.406398,7.180220e-04,***
2,"Dalc in (2.0, 5.0] ∧ reason=='course'",20,709.810622,"[25, 31, 38, 55, 74, 75, 123, 137, 154, 170, 182, 231, 291, 302, 342, 348, 366, 419, 429, 444]",G2,0.002628,0.766467,0.003429,0.997325,,0.890912,0.036540,24.381771,5.217999e-84,***
3,"Dalc in (2.0, 5.0] ∧ reason=='course'",20,709.810622,"[25, 31, 38, 55, 74, 75, 123, 137, 154, 170, 182, 231, 291, 302, 342, 348, 366, 419, 429, 444]",absences,0.419381,0.174835,2.398725,0.035312,*,0.010979,0.012097,0.907587,3.645878e-01,
4,"Dalc in (2.0, 5.0] ∧ reason=='course'",20,709.810622,"[25, 31, 38, 55, 74, 75, 123, 137, 154, 170, 182, 231, 291, 302, 342, 348, 366, 419, 429, 444]",activities,-0.794305,1.403415,-0.565980,0.582769,,-0.051632,0.109035,-0.473532,6.360657e-01,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
445,"Pstatus=='T' ∧ Walc in (3.0, 5.0] ∧ nursery=='no'",23,187.598493,"[1, 20, 33, 35, 66, 68, 74, 83, 147, 154, 216, 321, 331, 334, 342, 348, 383, 399, 402, 419, 427, 429, 431]",activities,0.200127,0.570490,0.350798,0.730962,,-0.051632,0.109035,-0.473532,6.360657e-01,
446,"Pstatus=='T' ∧ Walc in (3.0, 5.0] ∧ nursery=='no'",23,187.598493,"[1, 20, 33, 35, 66, 68, 74, 83, 147, 154, 216, 321, 331, 334, 342, 348, 383, 399, 402, 419, 427, 429, 431]",paid,0.326356,1.092965,0.298597,0.769634,,-0.449179,0.221763,-2.025495,4.341291e-02,*
447,"Pstatus=='T' ∧ Walc in (3.0, 5.0] ∧ nursery=='no'",23,187.598493,"[1, 20, 33, 35, 66, 68, 74, 83, 147, 154, 216, 321, 331, 334, 342, 348, 383, 399, 402, 419, 427, 429, 431]",famsup,1.108130,0.617034,1.795898,0.094117,.,0.098840,0.113121,0.873762,3.827193e-01,
448,"Pstatus=='T' ∧ Walc in (3.0, 5.0] ∧ nursery=='no'",23,187.598493,"[1, 20, 33, 35, 66, 68, 74, 83, 147, 154, 216, 321, 331, 334, 342, 348, 383, 399, 402, 419, 427, 429, 431]",schoolsup,5.083626,1.357058,3.746063,0.002170,**,0.027472,0.184330,0.149034,8.815941e-01,
