In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
import xgboost as xgb
from ML_xgboost import train_xgboost_model
from ML_lightgbm import train_lightgbm_model
from ML_catboost import train_catboost_model
from ML_randomforest import train_randomforest_model 

# --- Run configuration -------------------------------------------------------
# Seed and split fractions; passed unchanged to every train_*_model call.
RANDOM_STATE = 1234
TEST_SIZE = 0.30
VAL_SIZE = 0.15

# Outcome column of the modelling table.
y_col = "insomnia_y2"

# Predictor columns, in the order they are handed to the trainers.
selected_vars = [
    "somatic_y1",
    "BMI_T1_cat",
    "sleep_dura_T1_cat",
    "sleep_quali_T1",
    "insomnia_y1",
    "life_satis_y1",
    "ms_ses_y1",
    "per_stress_y1",
    "ms_stress_y1",
    "depression_y1",
    "anxiety_y1",
    "ace",
    "loneliness_y1",
    "support_y1",
    "gender_T1",
    "age_T1",
    "residence",
    "income",
    "pocket_mon_T1",
    "income_ineqCity_y1",
    "sss_now",
    "marrige_par_bin",
    "edu_pa",
    "eat_unctl_y1",
    "eat_emot_y1",
    "food_sweetdrink_T1",
    "food_takeout_T1",
    "IPAQ_T1_1_bin",
    "IPAQ_T1_3_bin",
    "IPAQ_T1_5_bin",
    "screenT_weekday_T1",
    "screenT_weekend_T1",
    "psmu_y1",
    "media_BadMood_T1",
    "media_GoodMood_T1",
    "edu_self",
]

# --- Load data ---------------------------------------------------------------
# NOTE(review): absolute, machine-specific path; the notebook only runs where
# this exact location exists.
df_model = pd.read_csv(
    "D:/Research-SJTUH/UniversityData/UniData/INS_prediction/Input/df_model_ins.csv"
)

# Feature matrix: an independent copy so later changes cannot touch df_model.
X = df_model.loc[:, selected_vars].copy()

# Target as numeric; values that cannot be parsed become NaN instead of raising.
# NOTE(review): rows with a NaN target are not removed here - confirm that the
# train_*_model helpers deal with them.
y = pd.to_numeric(df_model.loc[:, y_col], errors="coerce")

def _train_and_report(trainer, banner, tag):
    """Announce a training run, execute it, print its test R2 and return the result.

    Parameters
    ----------
    trainer : callable
        Invoked as ``trainer(X, y, selected_vars, RANDOM_STATE, TEST_SIZE, VAL_SIZE)``.
    banner : str
        Line printed before training starts.
    tag : str
        Short model label that prefixes the ``Test R2`` summary line.

    Returns
    -------
    Whatever ``trainer`` returned; it must support ``['test_r2']`` lookup.
    """
    print(banner)
    outcome = trainer(X, y, selected_vars, RANDOM_STATE, TEST_SIZE, VAL_SIZE)
    print(f"{tag} Test R2: {outcome['test_r2']:.4f}")
    return outcome


# The four learners are trained one after another on identical inputs and settings.
res_xgb = _train_and_report(train_xgboost_model, "=== 1. Training XGBoost ===", "XGB")
res_lgbm = _train_and_report(train_lightgbm_model, "\n=== 2. Training LightGBM ===", "LGBM")
res_cat = _train_and_report(train_catboost_model, "\n=== 3. Training CatBoost ===", "CatBoost")
res_rf = _train_and_report(train_randomforest_model, "\n=== 4. Training RandomForest ===", "RF")


=== 1. Training XGBoost ===
Start hyperparameter search...
Fitting 5 folds for each of 100 candidates, totalling 500 fits

=== RandomizedSearchCV ===
Best CV Score: -4.409187337264696
Best Params: {'xgb__subsample': 0.7, 'xgb__reg_lambda': 1, 'xgb__reg_alpha': 2, 'xgb__n_estimators': 600, 'xgb__min_child_weight': 15, 'xgb__max_depth': 2, 'xgb__learning_rate': 0.01, 'xgb__gamma': 0.1, 'xgb__colsample_bytree': 0.4, 'xgb__colsample_bylevel': 0.8}
[0]	train-rmse:5.22480	eval-rmse:5.50248
[50]	train-rmse:4.86647	eval-rmse:5.01051
[100]	train-rmse:4.63576	eval-rmse:4.69617
[150]	train-rmse:4.48373	eval-rmse:4.49937
[200]	train-rmse:4.37943	eval-rmse:4.38591
[250]	train-rmse:4.30588	eval-rmse:4.32055
[300]	train-rmse:4.24386	eval-rmse:4.27301
[350]	train-rmse:4.19632	eval-rmse:4.24728
[400]	train-rmse:4.15280	eval-rmse:4.22988
[450]	train-rmse:4.11562	eval-rmse:4.22128
[500]	train-rmse:4.08017	eval-rmse:4.20816
[550]	train-rmse:4.04910	eval-rmse:4.20182
[559]	train-rmse:4.04394	eval-rmse:4.19

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

# Display name paired with that model's result object; the list order is the
# left-to-right bar order in the comparison figure.
_test_set_results = [
    ('RandomForest', res_rf),
    ('XGBoost', res_xgb),
    ('LightGBM', res_lgbm),
    ('CatBoost', res_cat),
]

models = [label for label, _res in _test_set_results]

# R2 on the test split (higher is better).
r2_scores = [_res['test_r2'] for _label, _res in _test_set_results]

# RMSE on the test split (lower is better).
rmse_scores = [_res['test_rmse'] for _label, _res in _test_set_results]

# One row per model with its two test-set metrics.
df_perf = pd.DataFrame({'Model': models, 'Test R2': r2_scores, 'Test RMSE': rmse_scores})

# Plotting settings: make sure both result folders exist before anything is written.
for folder in ("output", "plots"):
    os.makedirs(folder, exist_ok=True)

# Global matplotlib defaults (these stay active for the rest of the kernel session).
plt.rcParams.update({
    'font.family': 'Times New Roman',
    'font.size': 12,
    'axes.titlesize': 16,
    'axes.labelsize': 14,
    'xtick.labelsize': 14,
    'ytick.labelsize': 14,
    'legend.fontsize': 14,
})

fig, axes = plt.subplots(1, 2, figsize=(16, 8), dpi=400, tight_layout=True)

# One entry per panel: (metric column, panel title, y-axis label, seaborn palette).
panel_specs = [
    ('Test R2', 'Test Set R² Comparison', 'R² Score', "viridis"),
    ('Test RMSE', 'Test Set RMSE Comparison', 'RMSE', "magma"),
]

for ax, (metric, title, ylabel, palette) in zip(axes, panel_specs):
    sns.barplot(x='Model', y=metric, data=df_perf, ax=ax, palette=palette)
    ax.set_title(title)
    ax.set_ylabel(ylabel)
    ax.tick_params(axis='x')

    # Write each bar's value at its outer edge, then widen the y-margins by 10 %.
    for bar_group in ax.containers:
        ax.bar_label(bar_group, fmt='%.4f', label_type='edge', padding=3)
    ax.margins(y=0.1)

fig.tight_layout(rect=[0, 0.03, 1, 0.95])

# NOTE(review): the file name says "5Models" although four models are plotted;
# kept unchanged so existing references to the file keep working.
save_path = "plots/Performance_Comparison_5Models.png"
fig.savefig(save_path)
plt.close(fig)

print(f"\n The comparison of models had been saved in: {save_path}")


 The comparison of models had been saved in: plots/Performance_Comparison_5Models.png


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

# Result object of every trained model, keyed by its display name.
results = {
    'RandomForest': res_rf,
    'XGBoost': res_xgb,
    'LightGBM': res_lgbm,
    'CatBoost': res_cat,
}

# (label written to the table, key prefix used inside each result object)
_split_prefixes = (
    ('Train', 'train'),
    ('Validation', 'val'),
    ('Test', 'test'),
)

# Long-format records: one row per (model, data split) with R2 / RMSE / MAE.
data_list = []
for model_name, res in results.items():
    for _split_label, _prefix in _split_prefixes:
        data_list.append({
            'Model': model_name,
            'Dataset': _split_label,
            'R2': res[_prefix + '_r2'],
            'RMSE': res[_prefix + '_rmse'],
            'MAE': res[_prefix + '_mae'],
        })

os.makedirs("output", exist_ok=True)
excel_save_path = "output/Single_Model_Full_Performance.xlsx"

# Rows are ordered alphabetically by model name and then by split label.
df_final_perf = pd.DataFrame(data_list).sort_values(by=['Model', 'Dataset'])
df_final_perf.to_excel(excel_save_path, index=False)

_rule = "=" * 50
print(
    "\n" + _rule,
    f"All model performances had been saved in: {excel_save_path}",
    _rule,
    "\n=== The comparison of model performance ===",
    df_final_perf,
    sep="\n",
)

# === Plot: R² and RMSE per model, one bar per data split ===
# Global matplotlib defaults (these stay active for the rest of the kernel session).
plt.rcParams.update({
    'font.family': 'Times New Roman',
    'axes.titlesize': 16,
    'axes.labelsize': 14,
    'xtick.labelsize': 14,
    'ytick.labelsize': 14,
    'legend.fontsize': 14,
    'font.size': 12
})

# The figure is written to plots/, so create the folder here instead of relying
# on another cell having done it earlier in the session.
os.makedirs("plots", exist_ok=True)

# Show the splits in their logical order rather than in the order in which they
# happen to appear in the (alphabetically sorted) table.
dataset_order = ['Train', 'Validation', 'Test']

fig, axes = plt.subplots(1, 2, figsize=(18, 7), dpi=400, tight_layout=True)

# One entry per panel: (metric column, panel title, y-axis label, seaborn palette).
panel_specs = [
    ('R2', 'R² Performance Comparison', 'R² Score', "viridis"),    # higher is better
    ('RMSE', 'RMSE Performance Comparison', 'RMSE', "magma"),      # lower is better
]

for ax, (metric, title, ylabel, palette) in zip(axes, panel_specs):
    sns.barplot(
        x='Model',
        y=metric,
        hue='Dataset',
        hue_order=dataset_order,
        data=df_final_perf,
        ax=ax,
        palette=palette
    )
    ax.set_title(title)
    ax.set_ylabel(ylabel)
    ax.tick_params(axis='x')
    ax.legend(title='Dataset')

# Keep the 0-5 RMSE axis used so far, but let it grow when a model's RMSE exceeds
# 5 so that no bar is silently cut off at the top of the panel.
rmse_axis_top = max(5, float(df_final_perf['RMSE'].max()) * 1.05)
axes[1].set_ylim(0, rmse_axis_top)

fig.tight_layout(rect=[0, 0.03, 1, 0.95])

# NOTE(review): the file name says "5Models" although four models are plotted;
# kept unchanged so existing references to the file keep working.
png_save_path = "plots/Performance_Comparison_5Models_Full_Comparison.png"
fig.savefig(png_save_path)
plt.close(fig)

print(f"\n All performance comparison had been save in: {png_save_path}")


All model performances had been saved in: output/Single_Model_Full_Performance.xlsx

=== The comparison of model performance ===
           Model     Dataset        R2      RMSE       MAE
11      CatBoost        Test  0.276412  4.274304  3.318711
9       CatBoost       Train  0.524324  3.607470  2.875133
10      CatBoost  Validation  0.451936  4.075419  3.264157
8       LightGBM        Test  0.260387  4.321375  3.383927
6       LightGBM       Train  0.480425  3.770259  2.979080
7       LightGBM  Validation  0.408795  4.232782  3.297552
2   RandomForest        Test  0.267618  4.300198  3.350538
0   RandomForest       Train  0.560353  3.468158  2.681885
1   RandomForest  Validation  0.410938  4.225104  3.275081
5        XGBoost        Test  0.267758  4.299785  3.373761
3        XGBoost       Train  0.396606  4.063004  3.222202
4        XGBoost  Validation  0.418531  4.197783  3.267958

 All performance comparison had been save in: plots/Performance_Comparison_5Models_Full_Comparison.png

In [None]:
import os
import pandas as pd
import shap
import matplotlib.pyplot as plt
from ML_shap import run_full_shap_analysis

# Display labels for the model's feature columns, written as consecutive groups.
# The groups are concatenated in order, so VARIABLE_MAPPING keeps the same
# key order as the individual entries below.
_LABEL_GROUPS = (
    # Somatic symptoms, BMI and sleep
    {
        "somatic_y1": "Somatic Symptoms",
        "BMI_T1_cat": "BMI Category",
        "sleep_dura_T1_cat": "Sleep Duration",
        "sleep_quali_T1": "Sleep Quality",
        "insomnia_y1": "Baseline Insomnia",
    },
    # Life satisfaction, stress and mood
    {
        "life_satis_y1": "Life Satisfaction",
        "ms_ses_y1": "Subjective SES",
        "per_stress_y1": "Perceived Stress",
        "ms_stress_y1": "Stress Mindset",
        "depression_y1": "Depressive Symptoms",
        "anxiety_y1": "Anxiety Symptoms",
    },
    # Childhood adversity
    {
        "ace": "Adverse Childhood Experiences",
    },
    # Loneliness and social support
    {
        "loneliness_y1": "Loneliness",
        "support_y1": "Social Support",
    },
    # Demographic and socio-economic background
    {
        "gender_T1": "Gender",
        "age_T1": "Age",
        "residence": "Residence",
        "income": "Household Income",
        "pocket_mon_T1": "Pocket Money",
        "income_ineqCity_y1": "City Income Inequality",
        "sss_now": "Subjective Social Status",
        "marrige_par_bin": "Parental Marital Status",
        "edu_pa": "Parental Education",
    },
    # Eating behaviour and diet
    {
        "eat_unctl_y1": "Uncontrolled Eating",
        "eat_emot_y1": "Emotional Eating",
        "food_sweetdrink_T1": "Sweet Drink Consumption",
        "food_takeout_T1": "Takeout Frequency",
    },
    # Physical activity
    {
        "IPAQ_T1_1_bin": "Vigorous Physical Activity",
        "IPAQ_T1_3_bin": "Moderate Physical Activity",
        "IPAQ_T1_5_bin": "Walking Activity",
    },
    # Screen time
    {
        "screenT_weekday_T1": "Weekday Screen Time",
        "screenT_weekend_T1": "Weekend Screen Time",
    },
    # Media use
    {
        "psmu_y1": "Problematic Social Media Use",
        "media_BadMood_T1": "Media Use When Bad Mood",
        "media_GoodMood_T1": "Media Use When Good Mood",
    },
    # Own educational expectation
    {
        "edu_self": "Self Educational Expectation",
    },
)

VARIABLE_MAPPING = {
    column: label
    for group in _LABEL_GROUPS
    for column, label in group.items()
}


# Result object of every trained model, keyed by the name passed on as `model_name`.
all_results = {
    'RandomForest': res_rf,
    'XGBoost': res_xgb,
    'LightGBM': res_lgbm,
    'CatBoost': res_cat
}

# Options forwarded unchanged to run_full_shap_analysis for every model.
# NOTE(review): their exact effect is defined in ML_shap, which is not visible here.
FLAG_SHOW = False   # forwarded as flag_show
FLAG_TITLE = False  # forwarded as flag_title
IS_LOG = False      # forwarded as is_log_transformed
TOPN = 15           # forwarded as top_n

# Run the SHAP analysis once per model. The loop variables use dedicated names
# so the notebook-level `results` dict built in the previous cell is not rebound
# to a single model's result object.
for shap_model_name, shap_model_results in all_results.items():
    run_full_shap_analysis(
        model_name=shap_model_name,
        results=shap_model_results,
        df_model=df_model,
        selected_vars=selected_vars,
        variable_mapping=VARIABLE_MAPPING,
        is_log_transformed=IS_LOG,
        top_n=TOPN,
        flag_show=FLAG_SHOW,
        flag_title=FLAG_TITLE
    )

Starting Full SHAP Analysis for RandomForest




-> SHAP values calculated successfully.
Plot saved to: plots/shap_summary_simple_randomforest.png
-> 1/6 Plotted Simple Summary.
Plot saved to: plots/feature_importance_bar_randomforest.png
-> 2/6 Plotted Feature Importance Bar.
Plot saved to: plots/shap_summary_with_bars_randomforest.png
-> 3/6 Plotted Standard SHAP Summary.
Plot saved to: plots/shap_dependence_plots_randomforest.png
-> 4/6 Plotted Dependence Plots for 4 features.
-> 5/6 Generated Log-Transformed Interpretation.
Results saved to: output/SHAP_Analysis_randomforest_20251127_175119.xlsx
-> 6/6 Saved results to Excel: output/SHAP_Analysis_randomforest_20251127_175119.xlsx
Full SHAP Analysis for RandomForest COMPLETED.
Starting Full SHAP Analysis for XGBoost




-> SHAP values calculated successfully.
Plot saved to: plots/shap_summary_simple_xgboost.png
-> 1/6 Plotted Simple Summary.
Plot saved to: plots/feature_importance_bar_xgboost.png
-> 2/6 Plotted Feature Importance Bar.
Plot saved to: plots/shap_summary_with_bars_xgboost.png
-> 3/6 Plotted Standard SHAP Summary.
Plot saved to: plots/shap_dependence_plots_xgboost.png
-> 4/6 Plotted Dependence Plots for 4 features.
-> 5/6 Generated Log-Transformed Interpretation.
Results saved to: output/SHAP_Analysis_xgboost_20251127_175140.xlsx
-> 6/6 Saved results to Excel: output/SHAP_Analysis_xgboost_20251127_175140.xlsx
Full SHAP Analysis for XGBoost COMPLETED.
Starting Full SHAP Analysis for LightGBM




-> SHAP values calculated successfully.
Plot saved to: plots/shap_summary_simple_lightgbm.png
-> 1/6 Plotted Simple Summary.
Plot saved to: plots/feature_importance_bar_lightgbm.png
-> 2/6 Plotted Feature Importance Bar.
Plot saved to: plots/shap_summary_with_bars_lightgbm.png
-> 3/6 Plotted Standard SHAP Summary.
Plot saved to: plots/shap_dependence_plots_lightgbm.png
-> 4/6 Plotted Dependence Plots for 4 features.
-> 5/6 Generated Log-Transformed Interpretation.
Results saved to: output/SHAP_Analysis_lightgbm_20251127_175159.xlsx
-> 6/6 Saved results to Excel: output/SHAP_Analysis_lightgbm_20251127_175159.xlsx
Full SHAP Analysis for LightGBM COMPLETED.
Starting Full SHAP Analysis for CatBoost




-> SHAP values calculated successfully.
Plot saved to: plots/shap_summary_simple_catboost.png
-> 1/6 Plotted Simple Summary.
Plot saved to: plots/feature_importance_bar_catboost.png
-> 2/6 Plotted Feature Importance Bar.
Plot saved to: plots/shap_summary_with_bars_catboost.png
-> 3/6 Plotted Standard SHAP Summary.
Plot saved to: plots/shap_dependence_plots_catboost.png
-> 4/6 Plotted Dependence Plots for 4 features.
-> 5/6 Generated Log-Transformed Interpretation.
Results saved to: output/SHAP_Analysis_catboost_20251127_175224.xlsx
-> 6/6 Saved results to Excel: output/SHAP_Analysis_catboost_20251127_175224.xlsx
Full SHAP Analysis for CatBoost COMPLETED.
