In [1]:
# %%
# =============================================================================
# CELL 1: CONFIGURATION & IMPORTS
# =============================================================================

import numpy as np
import pandas as pd
import optuna
from sklearn.preprocessing import PolynomialFeatures

# Configuration - À MODIFIER ICI
N_FOLDS = 5
N_TRIALS = 30
RANDOM_STATE = 42

# Modèles à utiliser
CLF_MODELS = ["XGB", "LGBM", "CatBoost", "RF", "LR", "LDA", "QDA", "LinearSVC", "ElasticNetLR"]
REG_MODELS = ["XGB", "LGBM", "CatBoost", "Ridge", "ElasticNet", "PLS", "KernelRidge"]
IPCW_MODELS = ["XGB", "LGBM", "CatBoost", "RF", "Ridge", "ElasticNet", "PLS", "KernelRidge"]

optuna.logging.set_verbosity(optuna.logging.WARNING)

# Imports du module training
from ens_data_challenge.training import (
    transform_y,
    scale_01,
    FoldMetrics,
    CVResults,
    compute_ipcw_cindex,
    get_classifier_factory,
    get_regressor_factory,
    train_classifier_cv,
    train_regressor_cv,
    train_ensemble,
    get_ipcw_weights,
)
from ens_data_challenge.preprocess import Preprocessor

print("Configuration loaded!")
print(f"  N_FOLDS={N_FOLDS}, N_TRIALS={N_TRIALS}")



  from .autonotebook import tqdm as notebook_tqdm


Configuration loaded!
  N_FOLDS=5, N_TRIALS=30


In [None]:


# %%
# =============================================================================
# CELL 2: DATA LOADING & PREPROCESSING
# =============================================================================

# Charger les données via globals
from ens_data_challenge.globals import (
    TRAIN_CLINICAL_DATA_PATH,
    TRAIN_MOLECULAR_DATA_PATH,
    TRAIN_TARGET_PATH,
    TEST_CLINICAL_DATA_PATH,
    TEST_MOLECULAR_DATA_PATH,
)
from ens_data_challenge.feature_engineering import FeatureEngineerHelper

clinical_train = pd.read_csv(TRAIN_CLINICAL_DATA_PATH)
clinical_test = pd.read_csv(TEST_CLINICAL_DATA_PATH)
molecular_train = pd.read_csv(TRAIN_MOLECULAR_DATA_PATH)
molecular_test = pd.read_csv(TEST_MOLECULAR_DATA_PATH)
targets_train = pd.read_csv(TRAIN_TARGET_PATH)

# Preprocessing
preprocessor = Preprocessor()

# Get cyto features
clinical_train, cyto_struct_train = preprocessor.get_cyto_features_and_df(
    clinical_train
)
clinical_test, cyto_struct_test = preprocessor.get_cyto_features_and_df(clinical_test)

# Fit transform
(
    clinical_preprocess_train,
    clinical_preprocess_test,
    molecular_preprocess_train,
    molecular_preprocess_test,
    cyto_struct_train,
    cyto_struct_test,
    targets_preprocess,
) = preprocessor.fit_transform(
    clinical_train,
    clinical_test,
    molecular_train,
    molecular_test,
    cyto_struct_train,
    cyto_struct_test,
    targets_train,
)

# =============================================================================
# FEATURE ENGINEERING avec FeatureEngineerHelper
# =============================================================================
feat_helper = FeatureEngineerHelper()

# 1. Ajouter Nmut (nombre de mutations par patient)
clinical_preprocess_train = feat_helper.Nmut(
    molecular_preprocess_train, clinical_preprocess_train
)
clinical_preprocess_test = feat_helper.Nmut(
    molecular_preprocess_test, clinical_preprocess_test
)

# 2. Ajouter ratios et interactions (WBC/ANC, PLT/HB, blast_cyto_complexity, tumor_burden_composite)
clinical_preprocess_train = feat_helper.ratios_and_interactions(
    clinical_preprocess_train
)
clinical_preprocess_test = feat_helper.ratios_and_interactions(clinical_preprocess_test)

# 3. Ajouter severity scores (cytopenias_count)
clinical_preprocess_train = feat_helper.severity_scores(clinical_preprocess_train)
clinical_preprocess_test = feat_helper.severity_scores(clinical_preprocess_test)

# 4. Ajouter encodage moléculaire par PATHWAY (confidence_weighted avec effect weighting)
clinical_preprocess_train = feat_helper.fit_transform_mol_encoding(
    clinical_data=clinical_preprocess_train,
    molecular_data=molecular_preprocess_train,
    col="PATHWAY",
    method="confidence_weighted",
    apply_effect_weighting=True,
)
clinical_preprocess_test = feat_helper.transform_mol_encoding(
    clinical_data=clinical_preprocess_test,
    molecular_data=molecular_preprocess_test,
    col="PATHWAY",
    method="confidence_weighted",
    apply_effect_weighting=True,
)

# 5. Ajouter encodage moléculaire par GENE (constant pour baseline)
clinical_preprocess_train = feat_helper.fit_transform_mol_encoding(
    clinical_data=clinical_preprocess_train,
    molecular_data=molecular_preprocess_train,
    col="GENE",
    method="constant",
    apply_effect_weighting=False,
    n_pca_components=20,
)
clinical_preprocess_test = feat_helper.transform_mol_encoding(
    clinical_data=clinical_preprocess_test,
    molecular_data=molecular_preprocess_test,
    col="GENE",
    method="constant",
    apply_effect_weighting=False,
)

# Prepare data
drop_columns = [ "ID", "CENTER", ]
y_times = targets_preprocess["OS_YEARS"].values
events = targets_preprocess["OS_STATUS"].values

X_clinical = (
    clinical_preprocess_train.drop(columns=drop_columns, errors="ignore")
    .copy()
    .fillna(0)
    .replace(
        [np.inf, -np.inf], 0
    )  # Remplacer les inf créés par les divisions (ex: WBC/ANC quand ANC=0)
)
X_cyto = cyto_struct_train.drop(columns=["ID"], errors="ignore").fillna("UNKNOWN")

# Supprimer les colonnes catégorielles (garder uniquement les colonnes numériques)
categorical_cols = X_clinical.select_dtypes(
    include=["object", "category"]
).columns.tolist()
if categorical_cols:
    print(f"Dropping categorical columns: {categorical_cols}")
    X_clinical = X_clinical.drop(columns=categorical_cols)

print(f"Data loaded!")
print(f"  X_clinical: {X_clinical.shape}")
print(f"  X_cyto: {X_cyto.shape}")
print(f"  Event rate: {events.mean():.3f}")
print(f"\nFeature Engineering ajouté:")
print(f"  - Nmut (nombre de mutations)")
print(
    f"  - Ratios: wbc_anc_ratio, plt_hb_ratio, blast_cyto_complexity, tumor_burden_composite"
)
print(f"  - Severity: cytopenias_count")
print(
    f"  - Molecular encoding: PATHWAY (confidence_weighted + effect), GENE (constant)"
)



PCA fitted for PATHWAY with 8 components.
PCA fitted for GENE with 20 components.
Data loaded!
  X_clinical: (3173, 56)
  X_cyto: (2758, 12)
  Event rate: 0.504

Feature Engineering ajouté:
  - Nmut (nombre de mutations)
  - Ratios: wbc_anc_ratio, plt_hb_ratio, blast_cyto_complexity, tumor_burden_composite
  - Severity: cytopenias_count
  - Molecular encoding: PATHWAY (confidence_weighted + effect), GENE (constant)


In [7]:
X_clinical

Unnamed: 0,BM_BLAST,WBC,ANC,HB,PLT,is_normal,ploidy,has_tp53_deletion,has_complex_chr3,n_abnormalities,...,GENE_PCA_10,GENE_PCA_11,GENE_PCA_12,GENE_PCA_13,GENE_PCA_14,GENE_PCA_15,GENE_PCA_16,GENE_PCA_17,GENE_PCA_18,GENE_PCA_19
0,14.0,2.80,0.20,7.6,119.0,False,46,False,False,1,...,-0.362127,-0.098401,-0.334633,-0.422827,0.406305,0.153874,-0.391967,0.182757,0.236527,0.202842
1,1.0,7.40,2.40,11.6,42.0,True,46,False,False,0,...,-0.080871,-0.177629,-0.204931,-0.232519,0.062835,0.146025,0.089539,-0.028232,-0.337980,0.019597
2,15.0,3.70,2.10,14.2,81.0,False,46,False,True,1,...,0.355818,-0.136760,0.207136,0.501284,-0.258472,-0.428289,-0.346950,-0.213499,0.101130,0.047094
3,1.0,3.90,1.90,8.9,77.0,False,46,False,False,1,...,1.298820,-0.491564,0.371567,1.334935,-1.161551,-0.877461,-0.589696,0.042594,0.547192,-0.025505
4,6.0,128.00,9.70,11.1,195.0,False,46,False,False,1,...,-0.097110,-0.185439,-0.043895,-0.159814,-0.011856,0.045029,0.104480,-0.078675,-0.295712,0.004032
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3168,1.0,2.50,1.02,10.2,78.0,True,46,False,False,0,...,0.019895,-0.122375,-0.222443,-0.034158,0.101435,0.191833,-0.082066,-0.023210,-0.074848,-0.465893
3169,1.5,8.10,2.66,11.3,40.0,False,44,False,False,4,...,0.005912,-0.023434,-0.058662,-0.036904,-0.024066,-0.001572,0.051518,-0.037426,-0.080861,0.006692
3170,0.0,1.80,0.55,9.4,86.0,False,45,False,False,1,...,-0.374401,0.517249,-0.545983,0.003789,0.026998,0.030135,-0.207101,-0.200459,0.427083,0.150896
3171,5.0,1.37,0.37,11.4,102.0,False,46,False,False,1,...,0.613928,0.029560,0.106959,-0.327796,0.161075,0.119511,0.009930,-0.086212,-0.093364,0.063722


In [None]:

# %%
# =============================================================================
# CELL 3: FEATURE IMPORTANCE PRETRAINING
# =============================================================================
# Analyse feature importance avec random feature baseline
# Sélectionne uniquement les features significatives
# =============================================================================

from matplotlib import pyplot as plt

from ens_data_challenge.training import (
    analyze_feature_importance,
    plot_feature_importance,
    select_features,
)

print("=" * 60)
print("FEATURE IMPORTANCE PRETRAINING")
print("=" * 60)

# Classification task - P(event=1)
print("\n--- Classification Feature Importance ---")
fi_clf = analyze_feature_importance(
    X_clinical,
    events,
    is_classifier=True,
    threshold_method="random",
    random_state=RANDOM_STATE,
    verbose=True,
)

y_transformed = transform_y(y_times)

print("\n--- Regression Feature Importance ---")
fi_reg = analyze_feature_importance(
    X_clinical,
    y_transformed,
    is_classifier=False,
    threshold_method="random",
    random_state=RANDOM_STATE,
    verbose=True,
)

# Keep features significant in BOTH classification AND regression
selected_clf = set(fi_clf.selected_features)
selected_reg = set(fi_reg.selected_features)
final_selected = list(
    selected_clf | selected_reg
)  # Union pour garder les features importants

# Log dropped features
dropped_features = [f for f in X_clinical.columns if f not in final_selected]
print(f"\n{'=' * 60}")
print(f"FINAL FEATURE SELECTION")
print(f"{'=' * 60}")
print(f"Original features: {len(X_clinical.columns)}")
print(f"Selected features: {len(final_selected)}")
print(f"Dropped features: {len(dropped_features)}")
print(
    f"\nDropped: {dropped_features[:20]}..."
    if len(dropped_features) > 20
    else f"\nDropped: {dropped_features}"
)

# Plot
fig_clf = plot_feature_importance(fi_clf, top_n=25)
fig_clf.suptitle("Classification Feature Importance", fontsize=14)
plt.show()

fig_reg = plot_feature_importance(fi_reg, top_n=25)
fig_reg.suptitle("Regression Feature Importance", fontsize=14)
plt.show()

# Apply selection
X_clinical_selected = select_features(X_clinical, final_selected)
print(f"\nX_clinical after selection: {X_clinical_selected.shape}")



FEATURE IMPORTANCE PRETRAINING

--- Classification Feature Importance ---
Computing tree-based feature importance...
  Optimizing RF...
  Optimizing XGB...


In [5]:

# %%
# =============================================================================
# CELL 3: CLASSIFICATION - P(event=1)
# =============================================================================

print("=" * 60)
print("CLASSIFICATION: P(event=1)")
print("=" * 60)

clf_results = {}

for model_name in CLF_MODELS:
    print(f"\n--- {model_name} ---")
    oof, models, cv_results, params = train_classifier_cv(
        X_clinical_selected,
        events,
        y_times,
        events,
        model_name,
        n_folds=N_FOLDS,
        n_trials=N_TRIALS,
        random_state=RANDOM_STATE,
    )
    clf_results[model_name] = {
        "oof": oof,
        "models": models,
        "cv_results": cv_results,
        "params": params,
    }

# Ensemble
oof_proba = np.mean([r["oof"] for r in clf_results.values()], axis=0)

print(f"\n{'=' * 60}")
print(f"ENSEMBLE P(event=1)")
final_auc = compute_ipcw_cindex(y_times, events, y_times, events, oof_proba)
print(f"  Ensemble IPCW C-index: {final_auc:.4f}")



CLASSIFICATION: P(event=1)

--- XGB ---


Best trial: 2. Best value: -0.678517: 100%|██████████| 30/30 [01:07<00:00,  2.24s/it]


  Fold 1: AUC=0.7159, IPCW_C=0.7049
  Fold 2: AUC=0.7099, IPCW_C=0.6429
  Fold 3: AUC=0.7075, IPCW_C=0.6679
  Fold 4: AUC=0.7029, IPCW_C=0.6875
  Fold 5: AUC=0.6711, IPCW_C=0.6475
CV RESULTS SUMMARY
Fold 0: train_loss=-0.7305, val_loss=-0.7159, ipcw_c=0.7049
Fold 1: train_loss=-0.7310, val_loss=-0.7099, ipcw_c=0.6429
Fold 2: train_loss=-0.7314, val_loss=-0.7075, ipcw_c=0.6679
Fold 3: train_loss=-0.7312, val_loss=-0.7029, ipcw_c=0.6875
Fold 4: train_loss=-0.7307, val_loss=-0.6711, ipcw_c=0.6475
------------------------------------------------------------
Mean Train Loss: -0.7309
Mean Val Loss:   -0.7015
Mean IPCW C-idx: 0.6701 ± 0.0235
Mean AUC:        0.7015

--- LGBM ---


Best trial: 23. Best value: -0.642907: 100%|██████████| 30/30 [01:09<00:00,  2.33s/it]


  Fold 1: AUC=0.6755, IPCW_C=0.6779
  Fold 2: AUC=0.6936, IPCW_C=0.6024
  Fold 3: AUC=0.6651, IPCW_C=0.6410
  Fold 4: AUC=0.6895, IPCW_C=0.6598
  Fold 5: AUC=0.6810, IPCW_C=0.6548
CV RESULTS SUMMARY
Fold 0: train_loss=-0.7169, val_loss=-0.6755, ipcw_c=0.6779
Fold 1: train_loss=-0.7151, val_loss=-0.6936, ipcw_c=0.6024
Fold 2: train_loss=-0.7290, val_loss=-0.6651, ipcw_c=0.6410
Fold 3: train_loss=-0.7191, val_loss=-0.6895, ipcw_c=0.6598
Fold 4: train_loss=-0.7251, val_loss=-0.6810, ipcw_c=0.6548
------------------------------------------------------------
Mean Train Loss: -0.7210
Mean Val Loss:   -0.6809
Mean IPCW C-idx: 0.6472 ± 0.0253
Mean AUC:        0.6809

--- CatBoost ---


Best trial: 28. Best value: -0.660697: 100%|██████████| 30/30 [02:27<00:00,  4.92s/it]


  Fold 1: AUC=0.7318, IPCW_C=0.7209
  Fold 2: AUC=0.7083, IPCW_C=0.6619
  Fold 3: AUC=0.7015, IPCW_C=0.6626
  Fold 4: AUC=0.7194, IPCW_C=0.6938
  Fold 5: AUC=0.7001, IPCW_C=0.6743
CV RESULTS SUMMARY
Fold 0: train_loss=-0.7227, val_loss=-0.7318, ipcw_c=0.7209
Fold 1: train_loss=-0.7259, val_loss=-0.7083, ipcw_c=0.6619
Fold 2: train_loss=-0.7258, val_loss=-0.7015, ipcw_c=0.6626
Fold 3: train_loss=-0.7291, val_loss=-0.7194, ipcw_c=0.6938
Fold 4: train_loss=-0.7335, val_loss=-0.7001, ipcw_c=0.6743
------------------------------------------------------------
Mean Train Loss: -0.7274
Mean Val Loss:   -0.7122
Mean IPCW C-idx: 0.6827 ± 0.0223
Mean AUC:        0.7122

--- RF ---


Best trial: 8. Best value: -0.491762: 100%|██████████| 30/30 [04:23<00:00,  8.77s/it]


  Fold 1: AUC=0.6871, IPCW_C=0.6720
  Fold 2: AUC=0.6777, IPCW_C=0.6026
  Fold 3: AUC=0.6884, IPCW_C=0.6546
  Fold 4: AUC=0.7141, IPCW_C=0.6905
  Fold 5: AUC=0.6987, IPCW_C=0.6686
CV RESULTS SUMMARY
Fold 0: train_loss=-0.6891, val_loss=-0.6871, ipcw_c=0.6720
Fold 1: train_loss=-0.7073, val_loss=-0.6777, ipcw_c=0.6026
Fold 2: train_loss=-0.7079, val_loss=-0.6884, ipcw_c=0.6546
Fold 3: train_loss=-0.7142, val_loss=-0.7141, ipcw_c=0.6905
Fold 4: train_loss=-0.7191, val_loss=-0.6987, ipcw_c=0.6686
------------------------------------------------------------
Mean Train Loss: -0.7075
Mean Val Loss:   -0.6932
Mean IPCW C-idx: 0.6577 ± 0.0298
Mean AUC:        0.6932

--- LR ---


Best trial: 16. Best value: -0.724264: 100%|██████████| 30/30 [00:26<00:00,  1.13it/s]


  Fold 1: AUC=0.7141, IPCW_C=0.7034
  Fold 2: AUC=0.7328, IPCW_C=0.6528
  Fold 3: AUC=0.7332, IPCW_C=0.6865
  Fold 4: AUC=0.7168, IPCW_C=0.7027
  Fold 5: AUC=0.7325, IPCW_C=0.7063
CV RESULTS SUMMARY
Fold 0: train_loss=-0.7447, val_loss=-0.7141, ipcw_c=0.7034
Fold 1: train_loss=-0.7385, val_loss=-0.7328, ipcw_c=0.6528
Fold 2: train_loss=-0.7406, val_loss=-0.7332, ipcw_c=0.6865
Fold 3: train_loss=-0.7454, val_loss=-0.7168, ipcw_c=0.7027
Fold 4: train_loss=-0.7400, val_loss=-0.7325, ipcw_c=0.7063
------------------------------------------------------------
Mean Train Loss: -0.7418
Mean Val Loss:   -0.7259
Mean IPCW C-idx: 0.6904 ± 0.0200
Mean AUC:        0.7259

--- LDA ---


Best trial: 23. Best value: -0.714426: 100%|██████████| 30/30 [00:22<00:00,  1.33it/s]


  Fold 1: AUC=0.7146, IPCW_C=0.7038
  Fold 2: AUC=0.7326, IPCW_C=0.6527
  Fold 3: AUC=0.7326, IPCW_C=0.6858
  Fold 4: AUC=0.7179, IPCW_C=0.7037
  Fold 5: AUC=0.7320, IPCW_C=0.7055
CV RESULTS SUMMARY
Fold 0: train_loss=-0.7438, val_loss=-0.7146, ipcw_c=0.7038
Fold 1: train_loss=-0.7375, val_loss=-0.7326, ipcw_c=0.6527
Fold 2: train_loss=-0.7396, val_loss=-0.7326, ipcw_c=0.6858
Fold 3: train_loss=-0.7442, val_loss=-0.7179, ipcw_c=0.7037
Fold 4: train_loss=-0.7393, val_loss=-0.7320, ipcw_c=0.7055
------------------------------------------------------------
Mean Train Loss: -0.7409
Mean Val Loss:   -0.7259
Mean IPCW C-idx: 0.6903 ± 0.0201
Mean AUC:        0.7259

--- QDA ---


Best trial: 7. Best value: -0.703105: 100%|██████████| 30/30 [00:55<00:00,  1.85s/it]


  Fold 1: AUC=0.6928, IPCW_C=0.6884
  Fold 2: AUC=0.7192, IPCW_C=0.6285
  Fold 3: AUC=0.7194, IPCW_C=0.6742
  Fold 4: AUC=0.7180, IPCW_C=0.7010
  Fold 5: AUC=0.7239, IPCW_C=0.6978
CV RESULTS SUMMARY
Fold 0: train_loss=-0.7340, val_loss=-0.6928, ipcw_c=0.6884
Fold 1: train_loss=-0.7266, val_loss=-0.7192, ipcw_c=0.6285
Fold 2: train_loss=-0.7287, val_loss=-0.7194, ipcw_c=0.6742
Fold 3: train_loss=-0.7325, val_loss=-0.7180, ipcw_c=0.7010
Fold 4: train_loss=-0.7296, val_loss=-0.7239, ipcw_c=0.6978
------------------------------------------------------------
Mean Train Loss: -0.7303
Mean Val Loss:   -0.7147
Mean IPCW C-idx: 0.6780 ± 0.0264
Mean AUC:        0.7147

--- LinearSVC ---


Best trial: 25. Best value: -0.714128: 100%|██████████| 30/30 [01:15<00:00,  2.50s/it]


  Fold 1: AUC=0.7142, IPCW_C=0.7031
  Fold 2: AUC=0.7328, IPCW_C=0.6529
  Fold 3: AUC=0.7325, IPCW_C=0.6859
  Fold 4: AUC=0.7171, IPCW_C=0.7031
  Fold 5: AUC=0.7320, IPCW_C=0.7058
CV RESULTS SUMMARY
Fold 0: train_loss=-0.7445, val_loss=-0.7142, ipcw_c=0.7031
Fold 1: train_loss=-0.7383, val_loss=-0.7328, ipcw_c=0.6529
Fold 2: train_loss=-0.7403, val_loss=-0.7325, ipcw_c=0.6859
Fold 3: train_loss=-0.7451, val_loss=-0.7171, ipcw_c=0.7031
Fold 4: train_loss=-0.7400, val_loss=-0.7320, ipcw_c=0.7058
------------------------------------------------------------
Mean Train Loss: -0.7416
Mean Val Loss:   -0.7257
Mean IPCW C-idx: 0.6902 ± 0.0199
Mean AUC:        0.7257

--- ElasticNetLR ---


Best trial: 13. Best value: -0.719176: 100%|██████████| 30/30 [04:21<00:00,  8.73s/it]


  Fold 1: AUC=0.7081, IPCW_C=0.6962
  Fold 2: AUC=0.7325, IPCW_C=0.6560
  Fold 3: AUC=0.7328, IPCW_C=0.6887
  Fold 4: AUC=0.7088, IPCW_C=0.6979
  Fold 5: AUC=0.7273, IPCW_C=0.7024
CV RESULTS SUMMARY
Fold 0: train_loss=-0.7485, val_loss=-0.7081, ipcw_c=0.6962
Fold 1: train_loss=-0.7419, val_loss=-0.7325, ipcw_c=0.6560
Fold 2: train_loss=-0.7442, val_loss=-0.7328, ipcw_c=0.6887
Fold 3: train_loss=-0.7495, val_loss=-0.7088, ipcw_c=0.6979
Fold 4: train_loss=-0.7427, val_loss=-0.7273, ipcw_c=0.7024
------------------------------------------------------------
Mean Train Loss: -0.7454
Mean Val Loss:   -0.7219
Mean IPCW C-idx: 0.6882 ± 0.0167
Mean AUC:        0.7219

ENSEMBLE P(event=1)
  Ensemble IPCW C-index: 0.6953


In [7]:

# %%
# =============================================================================
# CELL 4: REGRESSION - E[rank|event=1]
# =============================================================================

print("\n" + "=" * 60)
print("REGRESSION: E[rank|event=1] (trained on events only)")
print("=" * 60)

e1_mask = events == 1
reg_e1_results = {}

for model_name in REG_MODELS:
    print(f"\n--- {model_name} ---")
    oof, models, cv_results, params = train_regressor_cv(
        X_clinical_selected,
        y_times,
        events,
        model_name,
        subset_mask=e1_mask,
        n_folds=N_FOLDS,
        n_trials=N_TRIALS,
        random_state=RANDOM_STATE,
    )
    reg_e1_results[model_name] = {
        "oof": oof,
        "models": models,
        "cv_results": cv_results,
        "params": params,
    }

oof_rank_e1 = np.mean([r["oof"] for r in reg_e1_results.values()], axis=0)




REGRESSION: E[rank|event=1] (trained on events only)

--- XGB ---


Best trial: 21. Best value: 0.0899913: 100%|██████████| 30/30 [00:14<00:00,  2.05it/s]


  Fold 1: MSE=0.0840, IPCW_C=0.5967
  Fold 2: MSE=0.0842, IPCW_C=0.5786
  Fold 3: MSE=0.0840, IPCW_C=0.6048
  Fold 4: MSE=0.0841, IPCW_C=0.5843
  Fold 5: MSE=0.0841, IPCW_C=0.5846
CV RESULTS SUMMARY
Fold 0: train_loss=0.0842, val_loss=0.0840, ipcw_c=0.5967
Fold 1: train_loss=0.0843, val_loss=0.0842, ipcw_c=0.5786
Fold 2: train_loss=0.0842, val_loss=0.0840, ipcw_c=0.6048
Fold 3: train_loss=0.0843, val_loss=0.0841, ipcw_c=0.5843
Fold 4: train_loss=0.0842, val_loss=0.0841, ipcw_c=0.5846
------------------------------------------------------------
Mean Train Loss: 0.0842
Mean Val Loss:   0.0841
Mean IPCW C-idx: 0.5898 ± 0.0096

--- LGBM ---


Best trial: 11. Best value: 0.114795: 100%|██████████| 30/30 [00:30<00:00,  1.02s/it]


  Fold 1: MSE=0.0838, IPCW_C=0.5795
  Fold 2: MSE=0.0840, IPCW_C=0.5497
  Fold 3: MSE=0.0837, IPCW_C=0.5796
  Fold 4: MSE=0.0839, IPCW_C=0.5673
  Fold 5: MSE=0.0841, IPCW_C=0.5637
CV RESULTS SUMMARY
Fold 0: train_loss=0.0839, val_loss=0.0838, ipcw_c=0.5795
Fold 1: train_loss=0.0840, val_loss=0.0840, ipcw_c=0.5497
Fold 2: train_loss=0.0840, val_loss=0.0837, ipcw_c=0.5796
Fold 3: train_loss=0.0840, val_loss=0.0839, ipcw_c=0.5673
Fold 4: train_loss=0.0839, val_loss=0.0841, ipcw_c=0.5637
------------------------------------------------------------
Mean Train Loss: 0.0840
Mean Val Loss:   0.0839
Mean IPCW C-idx: 0.5680 ± 0.0112

--- CatBoost ---


Best trial: 8. Best value: 0.089471: 100%|██████████| 30/30 [01:44<00:00,  3.47s/it]


  Fold 1: MSE=0.0836, IPCW_C=0.5972
  Fold 2: MSE=0.0836, IPCW_C=0.5814
  Fold 3: MSE=0.0833, IPCW_C=0.6100
  Fold 4: MSE=0.0837, IPCW_C=0.5834
  Fold 5: MSE=0.0836, IPCW_C=0.6076
CV RESULTS SUMMARY
Fold 0: train_loss=0.0837, val_loss=0.0836, ipcw_c=0.5972
Fold 1: train_loss=0.0836, val_loss=0.0836, ipcw_c=0.5814
Fold 2: train_loss=0.0837, val_loss=0.0833, ipcw_c=0.6100
Fold 3: train_loss=0.0837, val_loss=0.0837, ipcw_c=0.5834
Fold 4: train_loss=0.0837, val_loss=0.0836, ipcw_c=0.6076
------------------------------------------------------------
Mean Train Loss: 0.0837
Mean Val Loss:   0.0836
Mean IPCW C-idx: 0.5959 ± 0.0119

--- Ridge ---


Best trial: 11. Best value: 0.0769765: 100%|██████████| 30/30 [00:12<00:00,  2.33it/s]


  Fold 1: MSE=0.0750, IPCW_C=0.6849
  Fold 2: MSE=0.0762, IPCW_C=0.6683
  Fold 3: MSE=0.0710, IPCW_C=0.6985
  Fold 4: MSE=0.0708, IPCW_C=0.6859
  Fold 5: MSE=0.0777, IPCW_C=0.6860
CV RESULTS SUMMARY
Fold 0: train_loss=0.0721, val_loss=0.0750, ipcw_c=0.6849
Fold 1: train_loss=0.0719, val_loss=0.0762, ipcw_c=0.6683
Fold 2: train_loss=0.0726, val_loss=0.0710, ipcw_c=0.6985
Fold 3: train_loss=0.0736, val_loss=0.0708, ipcw_c=0.6859
Fold 4: train_loss=0.0756, val_loss=0.0777, ipcw_c=0.6860
------------------------------------------------------------
Mean Train Loss: 0.0732
Mean Val Loss:   0.0741
Mean IPCW C-idx: 0.6847 ± 0.0096

--- ElasticNet ---


Best trial: 15. Best value: 0.0872937: 100%|██████████| 30/30 [00:12<00:00,  2.45it/s]


  Fold 1: MSE=0.0844, IPCW_C=0.5000
  Fold 2: MSE=0.0845, IPCW_C=0.5000
  Fold 3: MSE=0.0844, IPCW_C=0.5000
  Fold 4: MSE=0.0845, IPCW_C=0.5000
  Fold 5: MSE=0.0844, IPCW_C=0.5000
CV RESULTS SUMMARY
Fold 0: train_loss=0.0846, val_loss=0.0844, ipcw_c=0.5000
Fold 1: train_loss=0.0847, val_loss=0.0845, ipcw_c=0.5000
Fold 2: train_loss=0.0846, val_loss=0.0844, ipcw_c=0.5000
Fold 3: train_loss=0.0847, val_loss=0.0845, ipcw_c=0.5000
Fold 4: train_loss=0.0846, val_loss=0.0844, ipcw_c=0.5000
------------------------------------------------------------
Mean Train Loss: 0.0846
Mean Val Loss:   0.0844
Mean IPCW C-idx: 0.5000 ± 0.0000

--- PLS ---


Best trial: 9. Best value: 0.0874168: 100%|██████████| 30/30 [00:12<00:00,  2.47it/s]


  Fold 1: MSE=0.0759, IPCW_C=0.6646
  Fold 2: MSE=0.0759, IPCW_C=0.6463
  Fold 3: MSE=0.0729, IPCW_C=0.6770
  Fold 4: MSE=0.0737, IPCW_C=0.6677
  Fold 5: MSE=0.0775, IPCW_C=0.6616
CV RESULTS SUMMARY
Fold 0: train_loss=0.0746, val_loss=0.0759, ipcw_c=0.6646
Fold 1: train_loss=0.0745, val_loss=0.0759, ipcw_c=0.6463
Fold 2: train_loss=0.0751, val_loss=0.0729, ipcw_c=0.6770
Fold 3: train_loss=0.0751, val_loss=0.0737, ipcw_c=0.6677
Fold 4: train_loss=0.0757, val_loss=0.0775, ipcw_c=0.6616
------------------------------------------------------------
Mean Train Loss: 0.0750
Mean Val Loss:   0.0752
Mean IPCW C-idx: 0.6634 ± 0.0100

--- KernelRidge ---


Best trial: 23. Best value: 0.0798147: 100%|██████████| 30/30 [02:28<00:00,  4.95s/it]


  Fold 1: MSE=0.0726, IPCW_C=0.6809
  Fold 2: MSE=0.0750, IPCW_C=0.6649
  Fold 3: MSE=0.0734, IPCW_C=0.6945
  Fold 4: MSE=0.0723, IPCW_C=0.6864
  Fold 5: MSE=0.0759, IPCW_C=0.6850
CV RESULTS SUMMARY
Fold 0: train_loss=0.0739, val_loss=0.0726, ipcw_c=0.6809
Fold 1: train_loss=0.0731, val_loss=0.0750, ipcw_c=0.6649
Fold 2: train_loss=0.0734, val_loss=0.0734, ipcw_c=0.6945
Fold 3: train_loss=0.0735, val_loss=0.0723, ipcw_c=0.6864
Fold 4: train_loss=0.0725, val_loss=0.0759, ipcw_c=0.6850
------------------------------------------------------------
Mean Train Loss: 0.0733
Mean Val Loss:   0.0739
Mean IPCW C-idx: 0.6824 ± 0.0098


pour ca on ft un optimal ensemble avec optuna plutot

In [None]:

# %%
# =============================================================================
# CELL 5: REGRESSION - E[rank|event=0]
# =============================================================================

print("\n" + "=" * 60)
print("REGRESSION: E[rank|event=0] (trained on censored only)")
print("=" * 60)

e0_mask = events == 0
reg_e0_results = {}

for model_name in REG_MODELS:
    print(f"\n--- {model_name} ---")
    oof, models, cv_results, params = train_regressor_cv(
        X_clinical_selected,
        y_times,
        events,
        model_name,
        subset_mask=e0_mask,
        n_folds=N_FOLDS,
        n_trials=N_TRIALS,
        random_state=RANDOM_STATE,
    )
    reg_e0_results[model_name] = {
        "oof": oof,
        "models": models,
        "cv_results": cv_results,
        "params": params,
    }

oof_rank_e0 = np.mean([r["oof"] for r in reg_e0_results.values()], axis=0)



REGRESSION: E[rank|event=0] (trained on censored only)

--- XGB ---


Best trial: 8. Best value: 0.0875578: 100%|██████████| 30/30 [00:11<00:00,  2.70it/s]


  Fold 1: MSE=0.0844, IPCW_C=0.5167
  Fold 2: MSE=0.0839, IPCW_C=0.5384
  Fold 3: MSE=0.0840, IPCW_C=0.5467
  Fold 4: MSE=0.0842, IPCW_C=0.5477
  Fold 5: MSE=0.0843, IPCW_C=0.5585
CV RESULTS SUMMARY
Fold 0: train_loss=0.0841, val_loss=0.0844, ipcw_c=0.5167
Fold 1: train_loss=0.0838, val_loss=0.0839, ipcw_c=0.5384
Fold 2: train_loss=0.0842, val_loss=0.0840, ipcw_c=0.5467
Fold 3: train_loss=0.0844, val_loss=0.0842, ipcw_c=0.5477
Fold 4: train_loss=0.0842, val_loss=0.0843, ipcw_c=0.5585
------------------------------------------------------------
Mean Train Loss: 0.0841
Mean Val Loss:   0.0842
Mean IPCW C-idx: 0.5416 ± 0.0140

--- LGBM ---


Best trial: 12. Best value: 0.10569: 100%|██████████| 30/30 [00:07<00:00,  4.08it/s] 


  Fold 1: MSE=0.0842, IPCW_C=0.6102
  Fold 2: MSE=0.0843, IPCW_C=0.5891
  Fold 3: MSE=0.0842, IPCW_C=0.6300
  Fold 4: MSE=0.0844, IPCW_C=0.6310
  Fold 5: MSE=0.0844, IPCW_C=0.6084
CV RESULTS SUMMARY
Fold 0: train_loss=0.0844, val_loss=0.0842, ipcw_c=0.6102
Fold 1: train_loss=0.0844, val_loss=0.0843, ipcw_c=0.5891
Fold 2: train_loss=0.0844, val_loss=0.0842, ipcw_c=0.6300
Fold 3: train_loss=0.0846, val_loss=0.0844, ipcw_c=0.6310
Fold 4: train_loss=0.0845, val_loss=0.0844, ipcw_c=0.6084
------------------------------------------------------------
Mean Train Loss: 0.0845
Mean Val Loss:   0.0843
Mean IPCW C-idx: 0.6137 ± 0.0156

--- CatBoost ---


Best trial: 12. Best value: 0.0877715: 100%|██████████| 30/30 [00:21<00:00,  1.39it/s]


  Fold 1: MSE=0.0843, IPCW_C=0.5511
  Fold 2: MSE=0.0843, IPCW_C=0.5443
  Fold 3: MSE=0.0843, IPCW_C=0.5474
  Fold 4: MSE=0.0845, IPCW_C=0.5489
  Fold 5: MSE=0.0845, IPCW_C=0.5585
CV RESULTS SUMMARY
Fold 0: train_loss=0.0845, val_loss=0.0843, ipcw_c=0.5511
Fold 1: train_loss=0.0845, val_loss=0.0843, ipcw_c=0.5443
Fold 2: train_loss=0.0845, val_loss=0.0843, ipcw_c=0.5474
Fold 3: train_loss=0.0846, val_loss=0.0845, ipcw_c=0.5489
Fold 4: train_loss=0.0846, val_loss=0.0845, ipcw_c=0.5585
------------------------------------------------------------
Mean Train Loss: 0.0845
Mean Val Loss:   0.0844
Mean IPCW C-idx: 0.5500 ± 0.0048

--- Ridge ---


Best trial: 19. Best value: 0.0794815: 100%|██████████| 30/30 [00:02<00:00, 13.43it/s]


  Fold 1: MSE=0.0732, IPCW_C=0.6690
  Fold 2: MSE=0.0787, IPCW_C=0.6371
  Fold 3: MSE=0.0897, IPCW_C=0.6907
  Fold 4: MSE=0.0744, IPCW_C=0.6783
  Fold 5: MSE=0.0758, IPCW_C=0.7108
CV RESULTS SUMMARY
Fold 0: train_loss=0.0730, val_loss=0.0732, ipcw_c=0.6690
Fold 1: train_loss=0.0729, val_loss=0.0787, ipcw_c=0.6371
Fold 2: train_loss=0.0738, val_loss=0.0897, ipcw_c=0.6907
Fold 3: train_loss=0.0730, val_loss=0.0744, ipcw_c=0.6783
Fold 4: train_loss=0.0712, val_loss=0.0758, ipcw_c=0.7108
------------------------------------------------------------
Mean Train Loss: 0.0728
Mean Val Loss:   0.0784
Mean IPCW C-idx: 0.6772 ± 0.0244

--- ElasticNet ---


Best trial: 22. Best value: 0.0849799: 100%|██████████| 30/30 [00:03<00:00,  8.41it/s]


  Fold 1: MSE=0.0815, IPCW_C=0.6131
  Fold 2: MSE=0.0812, IPCW_C=0.5945
  Fold 3: MSE=0.0800, IPCW_C=0.6288
  Fold 4: MSE=0.0808, IPCW_C=0.6204
  Fold 5: MSE=0.0833, IPCW_C=0.5958
CV RESULTS SUMMARY
Fold 0: train_loss=0.0819, val_loss=0.0815, ipcw_c=0.6131
Fold 1: train_loss=0.0797, val_loss=0.0812, ipcw_c=0.5945
Fold 2: train_loss=0.0810, val_loss=0.0800, ipcw_c=0.6288
Fold 3: train_loss=0.0812, val_loss=0.0808, ipcw_c=0.6204
Fold 4: train_loss=0.0819, val_loss=0.0833, ipcw_c=0.5958
------------------------------------------------------------
Mean Train Loss: 0.0811
Mean Val Loss:   0.0814
Mean IPCW C-idx: 0.6105 ± 0.0135


In [None]:

# %%
# =============================================================================
# CELL 6: REGRESSION - E[rank|IPCW]
# =============================================================================

print("\n" + "=" * 60)
print("REGRESSION: E[rank|IPCW] (trained with KM weights)")
print("=" * 60)

reg_ipcw_results = {}

for model_name in IPCW_MODELS:
    print(f"\n--- {model_name} ---")
    oof, models, cv_results, params = train_regressor_cv(
        X_clinical_selected,
        y_times,
        events,
        model_name,
        use_ipcw=True,
        n_folds=N_FOLDS,
        n_trials=N_TRIALS,
        random_state=RANDOM_STATE,
    )
    reg_ipcw_results[model_name] = {
        "oof": oof,
        "models": models,
        "cv_results": cv_results,
        "params": params,
    }

oof_rank_ipcw = np.mean([r["oof"] for r in reg_ipcw_results.values()], axis=0)



REGRESSION: E[rank|IPCW] (trained with KM weights)

--- XGB ---


Best trial: 25. Best value: 0.130347: 100%|██████████| 30/30 [01:23<00:00,  2.78s/it]


  Fold 1: MSE=0.0909, IPCW_C=0.6409
  Fold 2: MSE=0.0917, IPCW_C=0.6026
  Fold 3: MSE=0.0864, IPCW_C=0.6519
  Fold 4: MSE=0.0915, IPCW_C=0.6470
  Fold 5: MSE=0.0909, IPCW_C=0.6402
CV RESULTS SUMMARY
Fold 0: train_loss=0.0915, val_loss=0.0909, ipcw_c=0.6409
Fold 1: train_loss=0.0894, val_loss=0.0917, ipcw_c=0.6026
Fold 2: train_loss=0.0887, val_loss=0.0864, ipcw_c=0.6519
Fold 3: train_loss=0.0926, val_loss=0.0915, ipcw_c=0.6470
Fold 4: train_loss=0.0871, val_loss=0.0909, ipcw_c=0.6402
------------------------------------------------------------
Mean Train Loss: 0.0899
Mean Val Loss:   0.0903
Mean IPCW C-idx: 0.6365 ± 0.0175

--- LGBM ---


Best trial: 21. Best value: 0.124406: 100%|██████████| 30/30 [01:44<00:00,  3.49s/it]


  Fold 1: MSE=0.0936, IPCW_C=0.6369
  Fold 2: MSE=0.0927, IPCW_C=0.6324
  Fold 3: MSE=0.0894, IPCW_C=0.6518
  Fold 4: MSE=0.0976, IPCW_C=0.6398
  Fold 5: MSE=0.0933, IPCW_C=0.6417
CV RESULTS SUMMARY
Fold 0: train_loss=0.0937, val_loss=0.0936, ipcw_c=0.6369
Fold 1: train_loss=0.0921, val_loss=0.0927, ipcw_c=0.6324
Fold 2: train_loss=0.0896, val_loss=0.0894, ipcw_c=0.6518
Fold 3: train_loss=0.0964, val_loss=0.0976, ipcw_c=0.6398
Fold 4: train_loss=0.0906, val_loss=0.0933, ipcw_c=0.6417
------------------------------------------------------------
Mean Train Loss: 0.0925
Mean Val Loss:   0.0933
Mean IPCW C-idx: 0.6405 ± 0.0065

--- CatBoost ---


Best trial: 21. Best value: 0.102382: 100%|██████████| 30/30 [01:07<00:00,  2.24s/it]


  Fold 1: MSE=0.1011, IPCW_C=0.5298
  Fold 2: MSE=0.1001, IPCW_C=0.5769
  Fold 3: MSE=0.0962, IPCW_C=0.5879
  Fold 4: MSE=0.1057, IPCW_C=0.5912
  Fold 5: MSE=0.0987, IPCW_C=0.5881
CV RESULTS SUMMARY
Fold 0: train_loss=0.1013, val_loss=0.1011, ipcw_c=0.5298
Fold 1: train_loss=0.1002, val_loss=0.1001, ipcw_c=0.5769
Fold 2: train_loss=0.0964, val_loss=0.0962, ipcw_c=0.5879
Fold 3: train_loss=0.1060, val_loss=0.1057, ipcw_c=0.5912
Fold 4: train_loss=0.0988, val_loss=0.0987, ipcw_c=0.5881
------------------------------------------------------------
Mean Train Loss: 0.1006
Mean Val Loss:   0.1004
Mean IPCW C-idx: 0.5748 ± 0.0230


In [None]:

# %%
# =============================================================================
# CELL 7: CYTO STRUCT CLASSIFICATION (CatBoost avec cat_features)
# =============================================================================

from catboost import CatBoostClassifier, Pool
from sklearn.model_selection import StratifiedKFold

print("\n" + "=" * 60)
print("CYTO STRUCT: CatBoost Classification")
print("=" * 60)

cat_features = list(X_cyto.columns)
skf = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=RANDOM_STATE)

oof_proba_cyto = np.zeros(len(events))

for fold, (train_idx, val_idx) in enumerate(skf.split(X_cyto, events)):
    train_pool = Pool(
        X_cyto.iloc[train_idx], events[train_idx], cat_features=cat_features
    )
    val_pool = Pool(X_cyto.iloc[val_idx], events[val_idx], cat_features=cat_features)

    model = CatBoostClassifier(
        iterations=200, depth=6, learning_rate=0.1, random_state=RANDOM_STATE, verbose=0
    )
    model.fit(train_pool, eval_set=val_pool, use_best_model=True)
    oof_proba_cyto[val_idx] = model.predict_proba(X_cyto.iloc[val_idx])[:, 1]

    print(
        f"  Fold {fold + 1}: AUC={compute_ipcw_cindex(y_times[train_idx], events[train_idx], y_times[val_idx], events[val_idx], oof_proba_cyto[val_idx]):.4f}"
    )




CYTO STRUCT: CatBoost Classification


ValueError: Found input variables with inconsistent numbers of samples: [2758, 3173]

In [None]:

# %%
# =============================================================================
# CELL 8: META-FEATURES DATAFRAME
# =============================================================================

print("\n" + "=" * 60)
print("META-FEATURES")
print("=" * 60)

# Combine features
X_meta = X_clinical_selected.copy()

# Predictions
X_meta["prob_event1"] = oof_proba
X_meta["prob_event0"] = 1 - oof_proba
# X_meta["prob_cyto"] = oof_proba_cyto
X_meta["rank_e1"] = scale_01(oof_rank_e1)
X_meta["rank_e0"] = scale_01(oof_rank_e0)
X_meta["rank_ipcw"] = scale_01(oof_rank_ipcw)

# Polynomial features
pred_cols = ["prob_event1", "rank_e1", "rank_e0", "rank_ipcw"]
poly = PolynomialFeatures(degree=2, include_bias=False)
poly_feats = poly.fit_transform(X_meta[pred_cols].values)
for i in range(poly_feats.shape[1]):
    X_meta[f"poly_{i}"] = poly_feats[:, i]

print(f"Meta-features: {X_meta.shape}")



META-FEATURES
Meta-features: (3173, 93)


In [None]:

# %%
# =============================================================================
# CELL 9: FORMULA OPTIMIZATION
# =============================================================================

from sklearn.model_selection import KFold

print("\n" + "=" * 60)
print("FORMULA OPTIMIZATION")
print("=" * 60)
print(
    "Rang = P(e=0)*w0*E[r|e=0] + P(e=1)*(w1_base + w1_rank*E[r|e=1]) + w_ipcw*E[IPCW]"
)

kf = KFold(n_splits=N_FOLDS, shuffle=True, random_state=RANDOM_STATE)

# Prepare scaled values
proba = oof_proba
rank_e0 = scale_01(oof_rank_e0)
rank_e1 = scale_01(oof_rank_e1)
rank_ipcw = scale_01(oof_rank_ipcw)


def formula_objective(trial):
    w0 = trial.suggest_float("w0", 0.0, 1.0)
    w1_base = trial.suggest_float("w1_base", 0.0, 1.0)
    w1_rank = trial.suggest_float("w1_rank", 0.0, 1.0)
    w_ipcw = trial.suggest_float("w_ipcw", 0.0, 1.0)

    c_indices = []
    for train_idx, val_idx in kf.split(proba):
        prob_e0 = 1 - proba[val_idx]
        prob_e1 = proba[val_idx]

        risk = (
            prob_e0 * w0 * rank_e0[val_idx]
            + prob_e1 * (w1_base + w1_rank * rank_e1[val_idx])
        )
        risk_scores = pd.Series(risk).rank().values / len(risk)

        c_idx = compute_ipcw_cindex(
            y_times[train_idx],
            events[train_idx],
            y_times[val_idx],
            events[val_idx],
            risk_scores,
        )
        c_indices.append(c_idx)

    return -np.mean(c_indices)


study = optuna.create_study(direction="minimize")
study.optimize(formula_objective, n_trials=100, show_progress_bar=True)

formula_params = study.best_params
formula_score = -study.best_value

print(f"\nBest params: {formula_params}")
print(f"Formula IPCW C-index: {formula_score:.4f}")

# Apply formula
prob_e0 = 1 - proba
prob_e1 = proba
formula_risk = (
    prob_e0 * formula_params["w0"] 
    + prob_e1 * (formula_params["w1_base"] + formula_params["w1_rank"] )
    
)
formula_risk = pd.Series(formula_risk).rank().values / len(formula_risk)




FORMULA OPTIMIZATION
Rang = P(e=0)*w0*E[r|e=0] + P(e=1)*(w1_base + w1_rank*E[r|e=1]) + w_ipcw*E[IPCW]


Best trial: 92. Best value: -0.704236: 100%|██████████| 100/100 [00:09<00:00, 10.02it/s]


Best params: {'w0': 0.9983081172559186, 'w1_base': 0.7989920462780471, 'w1_rank': 0.37038613369805823, 'w_ipcw': 0.948958288136958}
Formula IPCW C-index: 0.7042





In [None]:

# %%
# =============================================================================
# CELL 10: META-MODELS
# =============================================================================

print("\n" + "=" * 60)
print("META-MODELS")
print("=" * 60)

meta_model_names = ["XGB", "LGBM", "Ridge"]
meta_results = {}

for model_name in meta_model_names:
    print(f"\n--- {model_name} ---")
    oof, models, cv_results, params = train_regressor_cv(
        X_meta,
        y_times,
        events,
        model_name,
        use_ipcw=True,
        n_folds=N_FOLDS,
        n_trials=N_TRIALS,
        random_state=RANDOM_STATE,
    )
    meta_results[model_name] = {
        "oof": oof,
        "models": models,
        "cv_results": cv_results,
        "params": params,
    }

oof_meta = np.mean([r["oof"] for r in meta_results.values()], axis=0)
meta_risk = oof_meta



META-MODELS

--- XGB ---


Best trial: 1. Best value: 0.181687:   7%|▋         | 2/30 [00:10<02:26,  5.23s/it]


[33m[W 2026-01-26 03:42:13,672][0m Trial 2 failed with parameters: {'n_estimators': 11, 'max_depth': 7, 'learning_rate': 0.1091896572547298, 'subsample': 0.9328944524288025, 'colsample_bytree': 0.7765903263154738, 'reg_lambda': 0.2604125420478061, 'reg_alpha': 5.918205359029644} because of the following error: KeyboardInterrupt().[0m
Traceback (most recent call last):
  File [35m"c:\Users\enzo.cAo\Documents\Projects\competitions\ens_data_challenge\.venv\Lib\site-packages\optuna\study\_optimize.py"[0m, line [35m206[0m, in [35m_run_trial[0m
    value_or_values = func(trial)
  File [35m"C:\Users\enzo.cAo\Documents\Projects\competitions\ens_data_challenge\src\ens_data_challenge\training\trainers.py"[0m, line [35m270[0m, in [35mobjective[0m
    [31mmodel.fit[0m[1;31m(X_tr, y_tr, sample_weight=weights)[0m
    [31m~~~~~~~~~[0m[1;31m^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^[0m
  File [35m"c:\Users\enzo.cAo\Documents\Projects\competitions\ens_data_challenge\.venv\Lib\site-pack

KeyboardInterrupt: 

In [None]:

# %%
# =============================================================================
# CELL 11: FINAL ENSEMBLE
# =============================================================================

print("\n" + "=" * 60)
print("FINAL ENSEMBLE: Formula + Meta-Models")
print("=" * 60)


def ensemble_objective(trial):
    w_formula = trial.suggest_float("w_formula", 0.0, 1.0)
    w_meta = trial.suggest_float("w_meta", 0.0, 1.0)

    w_sum = w_formula + w_meta + 1e-10
    combined = (w_formula / w_sum) * formula_risk + (w_meta / w_sum) * meta_risk
    combined_ranked = pd.Series(combined).rank().values / len(combined)

    c_indices = []
    for train_idx, val_idx in kf.split(combined):
        c_idx = compute_ipcw_cindex(
            y_times[train_idx],
            events[train_idx],
            y_times[val_idx],
            events[val_idx],
            combined_ranked[val_idx],
        )
        c_indices.append(c_idx)

    return -np.mean(c_indices)


study = optuna.create_study(direction="minimize")
study.optimize(ensemble_objective, n_trials=50, show_progress_bar=True)

ensemble_params = study.best_params
ensemble_score = -study.best_value

print(f"\nEnsemble params: {ensemble_params}")
print(f"Final IPCW C-index: {ensemble_score:.4f}")

# Final predictions
w_sum = ensemble_params["w_formula"] + ensemble_params["w_meta"]
final_risk = (
    ensemble_params["w_formula"] / w_sum * formula_risk
    + ensemble_params["w_meta"] / w_sum * meta_risk
)
final_risk = pd.Series(final_risk).rank().values / len(final_risk)

print(f"\n" + "=" * 60)
print("PIPELINE COMPLETE!")
print(f"Final risk shape: {final_risk.shape}")
print("=" * 60)




FINAL ENSEMBLE: Formula + Meta-Models


  0%|          | 0/50 [00:00<?, ?it/s]

[33m[W 2026-01-26 03:35:10,910][0m Trial 0 failed with parameters: {'w_formula': 0.5748521520846346, 'w_meta': 0.7530763656043956} because of the following error: NameError("name 'meta_risk' is not defined").[0m
Traceback (most recent call last):
  File [35m"c:\Users\enzo.cAo\Documents\Projects\competitions\ens_data_challenge\.venv\Lib\site-packages\optuna\study\_optimize.py"[0m, line [35m206[0m, in [35m_run_trial[0m
    value_or_values = func(trial)
  File [35m"C:\Users\enzo.cAo\AppData\Local\Temp\ipykernel_26828\690425579.py"[0m, line [35m16[0m, in [35mensemble_objective[0m
    combined = (w_formula / w_sum) * formula_risk + (w_meta / w_sum) * [1;31mmeta_risk[0m
                                                                       [1;31m^^^^^^^^^[0m
[1;35mNameError[0m: [35mname 'meta_risk' is not defined[0m
[33m[W 2026-01-26 03:35:10,918][0m Trial 0 failed with value None.[0m





NameError: name 'meta_risk' is not defined

In [None]:

# %%
# =============================================================================
# CELL 10: META-MODELS
# =============================================================================

print("\n" + "=" * 60)
print("META-MODELS")
print("=" * 60)

meta_model_names = ["XGB", "LGBM", "Ridge"]
meta_results = {}

for model_name in meta_model_names:
    print(f"\n--- {model_name} ---")
    oof, models, cv_results, params = train_regressor_cv(
        X_meta,
        y_times,
        events,
        model_name,
        use_ipcw=True,
        n_folds=N_FOLDS,
        n_trials=N_TRIALS,
        random_state=RANDOM_STATE,
    )
    meta_results[model_name] = {
        "oof": oof,
        "models": models,
        "cv_results": cv_results,
        "params": params,
    }

oof_meta = np.mean([r["oof"] for r in meta_results.values()], axis=0)
meta_risk = oof_meta



META-MODELS

--- XGB ---


Best trial: 0. Best value: 0.124848:  10%|█         | 3/30 [00:07<01:08,  2.55s/it]


[33m[W 2026-01-26 03:35:26,084][0m Trial 3 failed with parameters: {'n_estimators': 1, 'max_depth': 9, 'learning_rate': 0.0193683028176939, 'subsample': 0.8268851348803736, 'colsample_bytree': 0.8145173390470355, 'reg_lambda': 0.01873431932145186, 'reg_alpha': 0.47025005163813194} because of the following error: KeyboardInterrupt().[0m
Traceback (most recent call last):
  File [35m"c:\Users\enzo.cAo\Documents\Projects\competitions\ens_data_challenge\.venv\Lib\site-packages\optuna\study\_optimize.py"[0m, line [35m206[0m, in [35m_run_trial[0m
    value_or_values = func(trial)
  File [35m"C:\Users\enzo.cAo\Documents\Projects\competitions\ens_data_challenge\src\ens_data_challenge\training\trainers.py"[0m, line [35m282[0m, in [35mobjective[0m
    pred = model.predict(X_val)
  File [35m"c:\Users\enzo.cAo\Documents\Projects\competitions\ens_data_challenge\.venv\Lib\site-packages\xgboost\core.py"[0m, line [35m774[0m, in [35minner_f[0m
    return func(**kwargs)
  File [35m

KeyboardInterrupt: 

In [None]:

# %%
# =============================================================================
# CELL 11: FINAL ENSEMBLE
# =============================================================================

print("\n" + "=" * 60)
print("FINAL ENSEMBLE: Formula + Meta-Models")
print("=" * 60)


def ensemble_objective(trial):
    w_formula = trial.suggest_float("w_formula", 0.0, 1.0)
    w_meta = trial.suggest_float("w_meta", 0.0, 1.0)

    w_sum = w_formula + w_meta + 1e-10
    combined = (w_formula / w_sum) * formula_risk + (w_meta / w_sum) * meta_risk
    combined_ranked = pd.Series(combined).rank().values / len(combined)

    c_indices = []
    for train_idx, val_idx in kf.split(combined):
        c_idx = compute_ipcw_cindex(
            y_times[train_idx],
            events[train_idx],
            y_times[val_idx],
            events[val_idx],
            combined_ranked[val_idx],
        )
        c_indices.append(c_idx)

    return -np.mean(c_indices)


study = optuna.create_study(direction="minimize")
study.optimize(ensemble_objective, n_trials=50, show_progress_bar=True)

ensemble_params = study.best_params
ensemble_score = -study.best_value

print(f"\nEnsemble params: {ensemble_params}")
print(f"Final IPCW C-index: {ensemble_score:.4f}")

# Final predictions
w_sum = ensemble_params["w_formula"] + ensemble_params["w_meta"]
final_risk = (
    ensemble_params["w_formula"] / w_sum * formula_risk
    + ensemble_params["w_meta"] / w_sum * meta_risk
)
final_risk = pd.Series(final_risk).rank().values / len(final_risk)

print(f"\n" + "=" * 60)
print("PIPELINE COMPLETE!")
print(f"Final risk shape: {final_risk.shape}")
print("=" * 60)


In [None]:

# %%
# =============================================================================
# CELL 12: RESULTS SUMMARY
# =============================================================================

print("\n" + "=" * 60)
print("RESULTS SUMMARY")
print("=" * 60)

# Create summary DataFrame
summary_data = []

# Classification
for name, r in clf_results.items():
    summary_data.append(
        {
            "Type": "Classification",
            "Model": name,
            "IPCW_C": r["cv_results"].mean_ipcw_c_index,
            "Std": r["cv_results"].std_ipcw_c_index,
        }
    )

# Regression E1
for name, r in reg_e1_results.items():
    summary_data.append(
        {
            "Type": "Reg E[r|e=1]",
            "Model": name,
            "IPCW_C": r["cv_results"].mean_ipcw_c_index,
            "Std": r["cv_results"].std_ipcw_c_index,
        }
    )

# Regression E0
for name, r in reg_e0_results.items():
    summary_data.append(
        {
            "Type": "Reg E[r|e=0]",
            "Model": name,
            "IPCW_C": r["cv_results"].mean_ipcw_c_index,
            "Std": r["cv_results"].std_ipcw_c_index,
        }
    )

# IPCW
for name, r in reg_ipcw_results.items():
    summary_data.append(
        {
            "Type": "Reg IPCW",
            "Model": name,
            "IPCW_C": r["cv_results"].mean_ipcw_c_index,
            "Std": r["cv_results"].std_ipcw_c_index,
        }
    )

# Meta-models
for name, r in meta_results.items():
    summary_data.append(
        {
            "Type": "Meta-Model",
            "Model": name,
            "IPCW_C": r["cv_results"].mean_ipcw_c_index,
            "Std": r["cv_results"].std_ipcw_c_index,
        }
    )

# Final
summary_data.append(
    {"Type": "Formula", "Model": "Optimized", "IPCW_C": formula_score, "Std": 0.0}
)
summary_data.append(
    {"Type": "FINAL", "Model": "Ensemble", "IPCW_C": ensemble_score, "Std": 0.0}
)

summary_df = pd.DataFrame(summary_data)
print(summary_df.to_string(index=False))
