# Entrainement sur les données moléculaires

Dans ce notebook on va tenter d'entrainer des modeles eseulement avec les données moléculaires pour créer des features additionnels pour un meta model ensuite, on va voir si on peut predire la proba d'une mort = 1 avec ces donnés et voir aussi la tache de regression sur le temps de survie.

In [1]:
# Import necessary libraries
import pandas as pd

In [2]:
from ens_data_challenge.globals import TRAIN_CLINICAL_DATA_PATH, TRAIN_MOLECULAR_DATA_PATH, TRAIN_TARGET_PATH, TEST_CLINICAL_DATA_PATH, TEST_MOLECULAR_DATA_PATH
clinical_data_train = pd.read_csv(TRAIN_CLINICAL_DATA_PATH)
clinical_data_eval = pd.read_csv(TEST_CLINICAL_DATA_PATH)

# Molecular Data
molecular_data_train = pd.read_csv(TRAIN_MOLECULAR_DATA_PATH)
molecular_data_eval = pd.read_csv(TEST_MOLECULAR_DATA_PATH)

target_df = pd.read_csv(TRAIN_TARGET_PATH)

# Preview the data
clinical_data_train.head()

Unnamed: 0,ID,CENTER,BM_BLAST,WBC,ANC,MONOCYTES,HB,PLT,CYTOGENETICS
0,P132697,MSK,14.0,2.8,0.2,0.7,7.6,119.0,"46,xy,del(20)(q12)[2]/46,xy[18]"
1,P132698,MSK,1.0,7.4,2.4,0.1,11.6,42.0,"46,xx"
2,P116889,MSK,15.0,3.7,2.1,0.1,14.2,81.0,"46,xy,t(3;3)(q25;q27)[8]/46,xy[12]"
3,P132699,MSK,1.0,3.9,1.9,0.1,8.9,77.0,"46,xy,del(3)(q26q27)[15]/46,xy[5]"
4,P132700,MSK,6.0,128.0,9.7,0.9,11.1,195.0,"46,xx,t(3;9)(p13;q22)[10]/46,xx[10]"


In [3]:
from ens_data_challenge.preprocess.preprocessor import Preprocessor

preprocessor = Preprocessor()

In [4]:
clinical_data_train, cyto_df_train = preprocessor.get_cyto_features_and_df(clinical_data_train)
clinical_data_eval, cyto_df_eval = preprocessor.get_cyto_features_and_df(clinical_data_eval)

In [5]:
(
    clinical_preprocess_train,
    clinical_preprocess_eval, 
    molecular_preprocess_train, 
    molecular_preprocess_eval, 
    cyto_struct_preprocess_train, 
    cyto_struct_preprocess_eval,
    targets_preprocess
  ) = preprocessor.fit_transform(
    clinical_data_train=clinical_data_train,
    molecular_data_train=molecular_data_train,
    clinical_data_test=clinical_data_eval,
    molecular_data_test=molecular_data_eval,
    cyto_struct_train=cyto_df_train,
    cyto_struct_test=cyto_df_eval,
    targets=target_df
)

In [6]:
molecular_preprocess_train

Unnamed: 0,ID,CHR,START,END,REF,ALT,GENE,PROTEIN_CHANGE,EFFECT,VAF,DEPTH
0,P100000,11,119149248.0,119149248.0,G,A,CBL,p.C419Y,non_synonymous_codon,0.08300,1308.0
1,P100000,5,131822301.0,131822301.0,G,T,IRF1,p.Y164*,stop_gained,0.02200,532.0
2,P100000,3,77694060.0,77694060.0,G,C,OTHER,p.?,OTHER,0.41000,876.0
3,P100000,4,106164917.0,106164917.0,G,T,TET2,p.R1262L,non_synonymous_codon,0.43000,826.0
4,P100000,2,25468147.0,25468163.0,OTHER,A,DNMT3A,p.E505fs*141,frameshift_variant,0.08980,942.0
...,...,...,...,...,...,...,...,...,...,...,...
10540,P131472,OTHER,0.0,0.0,OTHER,OTHER,MLL,MLL_PTD,PTD,0.32125,975.0
10541,P131505,OTHER,0.0,0.0,OTHER,OTHER,MLL,MLL_PTD,PTD,0.32125,975.0
10542,P131816,OTHER,0.0,0.0,OTHER,OTHER,MLL,MLL_PTD,PTD,0.32125,975.0
10543,P132717,OTHER,0.0,0.0,OTHER,OTHER,MLL,MLL_PTD,PTD,0.32125,975.0


In [7]:
molecular_preprocess_eval

Unnamed: 0,ID,CHR,START,END,REF,ALT,GENE,PROTEIN_CHANGE,EFFECT,VAF,DEPTH
0,KYW961,1,1747229.0,1747229.0,T,C,GNB1,p.K57E,non_synonymous_codon,0.2620,485.0
1,KYW142,1,1747229.0,1747229.0,T,C,GNB1,p.K57E,non_synonymous_codon,0.0280,527.0
2,KYW453,1,1747229.0,1747229.0,T,C,GNB1,p.K57E,non_synonymous_codon,0.2920,277.0
3,KYW982,1,1747229.0,1747229.0,T,C,GNB1,p.K57E,non_synonymous_codon,0.0970,821.0
4,KYW845,1,36932209.0,36932209.0,G,A,CSF3R,p.Q754X,stop_gained,0.4300,358.0
...,...,...,...,...,...,...,...,...,...,...,...
3084,KYW1077,OTHER,0.0,0.0,OTHER,OTHER,MLL,MLL_PTD,PTD,0.4231,975.0
3085,KYW1084,OTHER,0.0,0.0,OTHER,OTHER,MLL,MLL_PTD,PTD,0.0176,975.0
3086,KYW1082,OTHER,0.0,0.0,OTHER,OTHER,MLL,MLL_PTD,PTD,0.2273,975.0
3087,KYW1085,OTHER,0.0,0.0,OTHER,OTHER,MLL,MLL_PTD,PTD,0.2941,975.0


In [8]:
merged_df = pd.merge(molecular_preprocess_train, targets_preprocess, on='ID')
merged_df

Unnamed: 0,ID,CHR,START,END,REF,ALT,GENE,PROTEIN_CHANGE,EFFECT,VAF,DEPTH,OS_YEARS,OS_STATUS
0,P100000,11,119149248.0,119149248.0,G,A,CBL,p.C419Y,non_synonymous_codon,0.08300,1308.0,5.819178,0.0
1,P100000,5,131822301.0,131822301.0,G,T,IRF1,p.Y164*,stop_gained,0.02200,532.0,5.819178,0.0
2,P100000,3,77694060.0,77694060.0,G,C,OTHER,p.?,OTHER,0.41000,876.0,5.819178,0.0
3,P100000,4,106164917.0,106164917.0,G,T,TET2,p.R1262L,non_synonymous_codon,0.43000,826.0,5.819178,0.0
4,P100000,2,25468147.0,25468163.0,OTHER,A,DNMT3A,p.E505fs*141,frameshift_variant,0.08980,942.0,5.819178,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
10540,P131472,OTHER,0.0,0.0,OTHER,OTHER,MLL,MLL_PTD,PTD,0.32125,975.0,0.671233,0.0
10541,P131505,OTHER,0.0,0.0,OTHER,OTHER,MLL,MLL_PTD,PTD,0.32125,975.0,3.695890,0.0
10542,P131816,OTHER,0.0,0.0,OTHER,OTHER,MLL,MLL_PTD,PTD,0.32125,975.0,0.468493,1.0
10543,P132717,OTHER,0.0,0.0,OTHER,OTHER,MLL,MLL_PTD,PTD,0.32125,975.0,3.589041,1.0


In [9]:
# =============================================================================
# CatBoost Training Loop avec StratifiedKFold + Cat Features
# =============================================================================

from catboost import CatBoostClassifier, CatBoostRegressor, Pool
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
import numpy as np
import pandas as pd

# --- Configuration ---
N_FOLDS = 5
RANDOM_STATE = 42

# --- Params CatBoost ---
catboost_params = {
    'iterations': 500,
    'depth': 6,
    'learning_rate': 0.05,
    'loss_function': 'Logloss',
    'eval_metric': 'AUC',
    'random_seed': RANDOM_STATE,
    'verbose': 100,
    'early_stopping_rounds': 50,
}


def train_catboost_cv(X, y, params=catboost_params, n_folds=N_FOLDS, cat_features=None):
    """
    Entraîne CatBoost avec StratifiedKFold et gestion des features catégorielles.
    
    Args:
        X: DataFrame des features
        y: Array/Series des labels (binaire)
        params: Dict des hyperparamètres CatBoost
        n_folds: Nombre de folds
        cat_features: Liste des noms/indices des features catégorielles
                      Si None et toutes les colonnes sont catégorielles, 
                      utilise toutes les colonnes
    
    Returns:
        oof_preds, models, scores
    """
    skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=RANDOM_STATE)
    
    # Si cat_features non spécifié, prendre toutes les colonnes
    if cat_features is None:
        cat_features = list(X.columns)  # Toutes les colonnes sont catégorielles
    
    oof_preds = np.zeros(len(y))
    models = []
    scores = []
    
    for fold, (train_idx, val_idx) in enumerate(skf.split(X, y)):
        print(f"\n{'='*50}")
        print(f"Fold {fold + 1}/{n_folds}")
        print(f"{'='*50}")
        
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y[train_idx], y[val_idx]
        
        # Créer les Pools avec cat_features
        train_pool = Pool(X_train, y_train, cat_features=cat_features)
        val_pool = Pool(X_val, y_val, cat_features=cat_features)
        
        model = CatBoostClassifier(**params)
        
        model.fit(
            train_pool,
            eval_set=val_pool,
            use_best_model=True
        )
        
        # Prédictions
        val_preds = model.predict_proba(X_val)[:, 1]
        oof_preds[val_idx] = val_preds
        
        # Score
        fold_score = roc_auc_score(y_val, val_preds)
        scores.append(fold_score)
        print(f"Fold {fold + 1} AUC: {fold_score:.4f}")
        
        models.append(model)
    
    # Score global OOF
    oof_score = roc_auc_score(y, oof_preds)
    
    print(f"\n{'='*50}")
    print(f"CV Results:")
    print(f"  Mean AUC: {np.mean(scores):.4f} ± {np.std(scores):.4f}")
    print(f"  OOF AUC:  {oof_score:.4f}")
    print(f"{'='*50}")
    
    return oof_preds, models, scores


# --- Usage ---
# Toutes les colonnes sont catégorielles:
# oof_preds, models, scores = train_catboost_cv(X, y)
#
# Ou spécifier explicitement:
# cat_cols = ['col1', 'col2', 'col3']
# oof_preds, models, scores = train_catboost_cv(X, y, cat_features=cat_cols)
#
# Prédictions sur test:
# test_preds = np.mean([m.predict_proba(X_test)[:, 1] for m in models], axis=0)

In [10]:
train_catboost_cv(merged_df.drop(columns=['OS_YEARS', 'OS_STATUS', 'START', 'END', 'PROTEIN_CHANGE', 'VAF', 'DEPTH', 'ID']), merged_df['OS_STATUS'])


Fold 1/5
0:	test: 0.5827472	best: 0.5827472 (0)	total: 215ms	remaining: 1m 47s
100:	test: 0.6262203	best: 0.6276618 (65)	total: 5.25s	remaining: 20.7s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 0.6276617835
bestIteration = 65

Shrink model to first 66 iterations.
Fold 1 AUC: 0.6277

Fold 2/5
0:	test: 0.5618331	best: 0.5618331 (0)	total: 48.1ms	remaining: 24s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 0.601495693
bestIteration = 36

Shrink model to first 37 iterations.
Fold 2 AUC: 0.6015

Fold 3/5
0:	test: 0.5788822	best: 0.5788822 (0)	total: 104ms	remaining: 51.9s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 0.6037774853
bestIteration = 25

Shrink model to first 26 iterations.
Fold 3 AUC: 0.6038

Fold 4/5
0:	test: 0.5478240	best: 0.5478240 (0)	total: 186ms	remaining: 1m 32s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 0.5775495627
bestIteration = 24

Shrink model to first 25 iterations.
Fold 4 AUC:

(array([0.58801928, 0.60755619, 0.55985999, ..., 0.63862774, 0.6403419 ,
        0.63862774], shape=(10545,)),
 [<catboost.core.CatBoostClassifier at 0x1f4d136e510>,
  <catboost.core.CatBoostClassifier at 0x1f4d2600f50>,
  <catboost.core.CatBoostClassifier at 0x1f4d26011d0>,
  <catboost.core.CatBoostClassifier at 0x1f4d13536f0>,
  <catboost.core.CatBoostClassifier at 0x1f4d13535c0>],
 [0.6276617834618274,
  0.6014956929727255,
  0.6037774853310052,
  0.5775495626526194,
  0.6046194749086075])

In [11]:
# =============================================================================
# CatBoost Regression avec KFold + Poids KM + IPCW C-index
# =============================================================================

from catboost import CatBoostRegressor, Pool
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from scipy.stats import norm
from sksurv.metrics import concordance_index_ipcw
import numpy as np
import pandas as pd

# --- Configuration ---
N_FOLDS = 5
RANDOM_STATE = 42

# --- Params CatBoost Regression ---
catboost_params = {
    'iterations': 500,
    'depth': 6,
    'learning_rate': 0.05,
    'loss_function': 'RMSE',
    'eval_metric': 'RMSE',
    'random_seed': RANDOM_STATE,
    'verbose': 100,
    'early_stopping_rounds': 50,
}


def transform_y(y):
    """Transforme y via rank -> uniform -> inverse CDF normale."""
    n = len(y)
    ranks = (-y).argsort().argsort() + 1
    uniform_scores = ranks / (n + 1)
    return norm.ppf(uniform_scores)


def inverse_transform_pred(pred):
    """CDF normale pour obtenir des scores de risque."""
    return norm.cdf(pred)


def make_survival_array(times, events):
    """Crée un structured array pour sksurv."""
    return np.array(
        [(bool(e), t) for e, t in zip(events, times)],
        dtype=[('event', bool), ('time', float)]
    )


def compute_km_weights(times, events):
    """Calcule les poids IPCW Kaplan-Meier."""
    df = pd.DataFrame({'time': times, 'event': events})
    sorted_df = df.sort_values('time').reset_index(drop=True)
    sorted_df['_censoring'] = 1 - sorted_df['event']
    
    time_stats = sorted_df.groupby('time').agg(
        n_censored=('_censoring', 'sum'),
        n_total=('_censoring', 'count')
    ).reset_index()
    
    n = len(sorted_df)
    time_stats['at_risk'] = n - time_stats['n_total'].cumsum().shift(1, fill_value=0)
    time_stats['hazard'] = time_stats['n_censored'] / time_stats['at_risk'].clip(lower=1)
    time_stats['G_t'] = (1 - time_stats['hazard']).cumprod()
    time_stats['G_t_minus'] = time_stats['G_t'].shift(1, fill_value=1.0)
    
    km_lookup = time_stats[['time', 'G_t_minus']].copy()
    result = df.merge(km_lookup, on='time', how='left')
    
    weights = np.zeros(len(df))
    event_mask = result['event'] == 1
    weights[event_mask] = 1.0 / result.loc[event_mask, 'G_t_minus'].clip(lower=1e-10)
    
    return weights


def safe_ipcw_cindex(times_train, events_train, times_test, events_test, risk_scores, tau=None):
    """Calcule le C-index IPCW avec gestion d'erreurs."""
    try:
        surv_train = make_survival_array(times_train, events_train)
        surv_test = make_survival_array(times_test, events_test)
        
        if tau is None:
            tau = times_test.max()
        
        c_idx, _, _, _, _ = concordance_index_ipcw(surv_train, surv_test, risk_scores, tau=tau)
        return c_idx
    except Exception as e:
        print(f"IPCW error: {e}")
        return 0.5


def train_catboost_regression_cv(X, y_times, events, params=catboost_params, 
                                  n_folds=N_FOLDS, cat_features=None, use_ipcw=True):
    """
    Entraîne CatBoost Regressor avec:
    - Transformation de y pour l'entraînement (RMSE)
    - Poids Kaplan-Meier IPCW
    - Retransformation + C-index IPCW pour l'évaluation
    
    Args:
        X: DataFrame des features
        y_times: Array des temps de survie originaux
        events: Array des événements (1=événement, 0=censuré)
        params: Dict des hyperparamètres CatBoost
        n_folds: Nombre de folds
        cat_features: Liste des features catégorielles (None = toutes)
        use_ipcw: Si True, utilise les poids Kaplan-Meier
    
    Returns:
        oof_preds, models, c_indices
    """
    kf = KFold(n_splits=n_folds, shuffle=True, random_state=RANDOM_STATE)
    
    if cat_features is None:
        cat_features = list(X.columns)
    
    oof_preds = np.zeros(len(y_times))
    models = []
    c_indices = []
    rmses = []
    
    for fold, (train_idx, val_idx) in enumerate(kf.split(X)):
        print(f"\n{'='*50}")
        print(f"Fold {fold + 1}/{n_folds}")
        print(f"{'='*50}")
        
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        times_train, times_val = y_times[train_idx], y_times[val_idx]
        events_train, events_val = events[train_idx], events[val_idx]
        
        # Transformer y pour l'entraînement
        y_train_transformed = transform_y(times_train)
        y_val_transformed = transform_y(times_val)
        
        # Calculer les poids IPCW sur le train
        if use_ipcw:
            train_weights = compute_km_weights(times_train, events_train)
        else:
            train_weights = None
        
        # Créer les Pools
        train_pool = Pool(X_train, y_train_transformed, cat_features=cat_features, weight=train_weights)
        val_pool = Pool(X_val, y_val_transformed, cat_features=cat_features)
        
        model = CatBoostRegressor(**params)
        
        model.fit(
            train_pool,
            eval_set=val_pool,
            use_best_model=True
        )
        
        # Prédictions sur val
        val_preds_transformed = model.predict(X_val)
        oof_preds[val_idx] = val_preds_transformed
        
        # RMSE sur y transformé
        rmse = np.sqrt(mean_squared_error(y_val_transformed, val_preds_transformed))
        rmses.append(rmse)
        
        # Retransformer -> scores de risque pour C-index
        # Plus la prédiction est élevée, plus le score normal est élevé
        # On veut: risque élevé = survie courte, donc on prend CDF directement
        risk_scores = inverse_transform_pred(val_preds_transformed)
        
        # C-index IPCW (risque élevé = survie courte)
        c_idx = safe_ipcw_cindex(
            times_train, events_train,
            times_val, events_val,
            risk_scores,
            tau=None
        )
        c_indices.append(c_idx)
        
        print(f"Fold {fold + 1} RMSE: {rmse:.4f} | IPCW C-index: {c_idx:.4f}")
        
        models.append(model)
    
    # Résultats globaux
    print(f"\n{'='*50}")
    print(f"CV Results:")
    print(f"  Mean RMSE:        {np.mean(rmses):.4f} ± {np.std(rmses):.4f}")
    print(f"  Mean IPCW C-index: {np.mean(c_indices):.4f} ± {np.std(c_indices):.4f}")
    print(f"{'='*50}")
    
    return oof_preds, models, c_indices


# --- Usage ---
# oof_preds, models, c_indices = train_catboost_regression_cv(
#     X=X, 
#     y_times=times,    # Temps de survie originaux
#     events=events,    # Indicateurs d'événement
#     use_ipcw=True
# )
#
# Prédictions sur test:
# test_preds = np.mean([m.predict(X_test) for m in models], axis=0)
# test_risk = inverse_transform_pred(test_preds)

In [12]:
oof_preds, models, scores = train_catboost_regression_cv(
    X=merged_df.drop(columns=['OS_YEARS', 'OS_STATUS', 'START', 'END', 'PROTEIN_CHANGE', 'VAF', 'DEPTH', 'ID']), 
    y_times=targets_preprocess["OS_YEARS"].values, 
    events=targets_preprocess["OS_STATUS"].values,
    use_ipcw=True
)


Fold 1/5


IndexError: index 3173 is out of bounds for axis 0 with size 3173