# Entrainement sur les données cyto

Dans ce notebook on va tenter d'entrainer des modeles eseulement avec les données cytogenetiques extraites pour créer des features additionnels pour un meta model ensuite, on va voir si on peut predire la proba d'une mort = 1 avec ces donnés et voir aussi la tache de regression sur le temps de survie.

In [1]:
# Import necessary libraries
import pandas as pd

In [2]:
from ens_data_challenge.globals import TRAIN_CLINICAL_DATA_PATH, TRAIN_MOLECULAR_DATA_PATH, TRAIN_TARGET_PATH, TEST_CLINICAL_DATA_PATH, TEST_MOLECULAR_DATA_PATH
clinical_data_train = pd.read_csv(TRAIN_CLINICAL_DATA_PATH)
clinical_data_eval = pd.read_csv(TEST_CLINICAL_DATA_PATH)

# Molecular Data
molecular_data_train = pd.read_csv(TRAIN_MOLECULAR_DATA_PATH)
molecular_data_eval = pd.read_csv(TEST_MOLECULAR_DATA_PATH)

target_df = pd.read_csv(TRAIN_TARGET_PATH)

# Preview the data
clinical_data_train.head()

Unnamed: 0,ID,CENTER,BM_BLAST,WBC,ANC,MONOCYTES,HB,PLT,CYTOGENETICS
0,P132697,MSK,14.0,2.8,0.2,0.7,7.6,119.0,"46,xy,del(20)(q12)[2]/46,xy[18]"
1,P132698,MSK,1.0,7.4,2.4,0.1,11.6,42.0,"46,xx"
2,P116889,MSK,15.0,3.7,2.1,0.1,14.2,81.0,"46,xy,t(3;3)(q25;q27)[8]/46,xy[12]"
3,P132699,MSK,1.0,3.9,1.9,0.1,8.9,77.0,"46,xy,del(3)(q26q27)[15]/46,xy[5]"
4,P132700,MSK,6.0,128.0,9.7,0.9,11.1,195.0,"46,xx,t(3;9)(p13;q22)[10]/46,xx[10]"


In [3]:
from ens_data_challenge.preprocess.preprocessor import Preprocessor

preprocessor = Preprocessor()

In [4]:
clinical_data_train, cyto_df_train = preprocessor.get_cyto_features_and_df(clinical_data_train)
clinical_data_eval, cyto_df_eval = preprocessor.get_cyto_features_and_df(clinical_data_eval)

In [5]:
(
    clinical_preprocess_train,
    clinical_preprocess_eval, 
    molecular_preprocess_train, 
    molecular_preprocess_eval, 
    cyto_struct_preprocess_train, 
    cyto_struct_preprocess_eval,
    targets_preprocess
  ) = preprocessor.fit_transform(
    clinical_data_train=clinical_data_train,
    molecular_data_train=molecular_data_train,
    clinical_data_test=clinical_data_eval,
    molecular_data_test=molecular_data_eval,
    cyto_struct_train=cyto_df_train,
    cyto_struct_test=cyto_df_eval,
    targets=target_df
)

In [6]:
cyto_struct_preprocess_train

Unnamed: 0,ID,ploidy,sex_chromosomes,clone_index,clone_cell_count,mutation_type,chromosome,arm,start,end,start_arm,end_arm,raw
0,P132697,46,XY,0,2,deletion,20,q,12,0,q,UNKNOWN,"{'type': 'deletion', 'chromosome': '20', 'arm'..."
1,P116889,46,XY,0,8,translocation,33,UNKNOWN,0,0,q,q,"{'type': 'translocation', 'chromosomes': ('3',..."
2,P132699,46,XY,0,15,deletion,3,q,26,27,q,q,"{'type': 'deletion', 'chromosome': '3', 'arm':..."
3,P132700,46,XX,0,10,translocation,39,UNKNOWN,0,0,p,q,"{'type': 'translocation', 'chromosomes': ('3',..."
4,P132704,45,XX,0,2,deletion,5,q,13,33,q,q,"{'type': 'deletion', 'chromosome': '5', 'arm':..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2753,P121830,46,XY,0,4,deletion,20,q,11.2,13.1,q,q,"{'type': 'deletion', 'chromosome': '20', 'arm'..."
2754,P121830,46,XY,1,16,monosomy,7,UNKNOWN,0,0,UNKNOWN,UNKNOWN,7
2755,P121853,46,XX,0,5,deletion,1,p,34,0,p,UNKNOWN,"{'type': 'deletion', 'chromosome': '1', 'arm':..."
2756,P121853,46,XX,1,12,monosomy,18,UNKNOWN,0,0,UNKNOWN,UNKNOWN,18


In [7]:
cyto_struct_preprocess_eval

Unnamed: 0,ID,ploidy,sex_chromosomes,clone_index,clone_cell_count,mutation_type,chromosome,arm,start,end,start_arm,end_arm,raw
0,KYW1,47,XY,0,15,deletion,9,UNKNOWN,0,0,UNKNOWN,UNKNOWN,"{'type': 'deletion', 'chromosome': '9'}"
1,KYW2,46,XY,0,0,deletion,11,UNKNOWN,0,0,UNKNOWN,UNKNOWN,"{'type': 'deletion', 'chromosome': '11'}"
2,KYW2,46,XY,0,0,addition,4,p,15,0,UNKNOWN,UNKNOWN,"{'type': 'addition', 'chromosome': '4', 'arm':..."
3,KYW2,46,XY,0,0,derivative,3,UNKNOWN,0,0,UNKNOWN,UNKNOWN,"{'type': 'derivative', 'chromosome': '3'}"
4,KYW2,46,XY,0,0,derivative,11,UNKNOWN,0,0,UNKNOWN,UNKNOWN,"{'type': 'derivative', 'chromosome': '11'}"
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2352,KYW1176,45,XY,1,4,monosomy,14,UNKNOWN,0,0,UNKNOWN,UNKNOWN,14
2353,KYW1176,45,XY,1,4,monosomy,15,UNKNOWN,0,0,UNKNOWN,UNKNOWN,15
2354,KYW1176,45,XY,1,4,monosomy,20,UNKNOWN,0,0,UNKNOWN,UNKNOWN,20
2355,KYW1176,45,XY,1,4,monosomy,22,UNKNOWN,0,0,UNKNOWN,UNKNOWN,22


In [8]:
targets_preprocess

Unnamed: 0,ID,OS_YEARS,OS_STATUS
0,P132697,1.115068,1.0
1,P132698,4.928767,0.0
2,P116889,2.043836,0.0
3,P132699,2.476712,1.0
4,P132700,3.145205,0.0
...,...,...,...
3168,P121826,0.547945,0.0
3169,P121827,2.339726,0.0
3170,P121830,1.997260,0.0
3171,P121853,0.095890,1.0


In [9]:
merged_df = pd.merge(cyto_struct_preprocess_train, targets_preprocess, on='ID')

In [10]:
merged_df

Unnamed: 0,ID,ploidy,sex_chromosomes,clone_index,clone_cell_count,mutation_type,chromosome,arm,start,end,start_arm,end_arm,raw,OS_YEARS,OS_STATUS
0,P132697,46,XY,0,2,deletion,20,q,12,0,q,UNKNOWN,"{'type': 'deletion', 'chromosome': '20', 'arm'...",1.115068,1.0
1,P116889,46,XY,0,8,translocation,33,UNKNOWN,0,0,q,q,"{'type': 'translocation', 'chromosomes': ('3',...",2.043836,0.0
2,P132699,46,XY,0,15,deletion,3,q,26,27,q,q,"{'type': 'deletion', 'chromosome': '3', 'arm':...",2.476712,1.0
3,P132700,46,XX,0,10,translocation,39,UNKNOWN,0,0,p,q,"{'type': 'translocation', 'chromosomes': ('3',...",3.145205,0.0
4,P132704,45,XX,0,2,deletion,5,q,13,33,q,q,"{'type': 'deletion', 'chromosome': '5', 'arm':...",1.005479,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2753,P121830,46,XY,0,4,deletion,20,q,11.2,13.1,q,q,"{'type': 'deletion', 'chromosome': '20', 'arm'...",1.997260,0.0
2754,P121830,46,XY,1,16,monosomy,7,UNKNOWN,0,0,UNKNOWN,UNKNOWN,7,1.997260,0.0
2755,P121853,46,XX,0,5,deletion,1,p,34,0,p,UNKNOWN,"{'type': 'deletion', 'chromosome': '1', 'arm':...",0.095890,1.0
2756,P121853,46,XX,1,12,monosomy,18,UNKNOWN,0,0,UNKNOWN,UNKNOWN,18,0.095890,1.0


In [11]:
merged_df[merged_df['ploidy'] == "OTHER"] = 1

In [12]:
# =============================================================================
# CatBoost Training Loop avec StratifiedKFold + Cat Features
# =============================================================================

from catboost import CatBoostClassifier, CatBoostRegressor, Pool
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
import numpy as np
import pandas as pd

# --- Configuration ---
N_FOLDS = 5
RANDOM_STATE = 42

# --- Params CatBoost ---
catboost_params = {
    'iterations': 500,
    'depth': 6,
    'learning_rate': 0.05,
    'loss_function': 'Logloss',
    'eval_metric': 'AUC',
    'random_seed': RANDOM_STATE,
    'verbose': 100,
    'early_stopping_rounds': 50,
}


def train_catboost_cv(X, y, params=catboost_params, n_folds=N_FOLDS, cat_features=None):
    """
    Entraîne CatBoost avec StratifiedKFold et gestion des features catégorielles.
    
    Args:
        X: DataFrame des features
        y: Array/Series des labels (binaire)
        params: Dict des hyperparamètres CatBoost
        n_folds: Nombre de folds
        cat_features: Liste des noms/indices des features catégorielles
                      Si None et toutes les colonnes sont catégorielles, 
                      utilise toutes les colonnes
    
    Returns:
        oof_preds, models, scores
    """
    skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=RANDOM_STATE)
    
    # Si cat_features non spécifié, prendre toutes les colonnes
    if cat_features is None:
        cat_features = list(X.columns)  # Toutes les colonnes sont catégorielles
    
    oof_preds = np.zeros(len(y))
    models = []
    scores = []
    
    for fold, (train_idx, val_idx) in enumerate(skf.split(X, y)):
        print(f"\n{'='*50}")
        print(f"Fold {fold + 1}/{n_folds}")
        print(f"{'='*50}")
        
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y[train_idx], y[val_idx]
        
        # Créer les Pools avec cat_features
        train_pool = Pool(X_train, y_train, cat_features=cat_features)
        val_pool = Pool(X_val, y_val, cat_features=cat_features)
        
        model = CatBoostClassifier(**params)
        
        model.fit(
            train_pool,
            eval_set=val_pool,
            use_best_model=True
        )
        
        # Prédictions
        val_preds = model.predict_proba(X_val)[:, 1]
        oof_preds[val_idx] = val_preds
        
        # Score
        fold_score = roc_auc_score(y_val, val_preds)
        scores.append(fold_score)
        print(f"Fold {fold + 1} AUC: {fold_score:.4f}")
        
        models.append(model)
    
    # Score global OOF
    oof_score = roc_auc_score(y, oof_preds)
    
    print(f"\n{'='*50}")
    print(f"CV Results:")
    print(f"  Mean AUC: {np.mean(scores):.4f} ± {np.std(scores):.4f}")
    print(f"  OOF AUC:  {oof_score:.4f}")
    print(f"{'='*50}")
    
    return oof_preds, models, scores


# --- Usage ---
# Toutes les colonnes sont catégorielles:
# oof_preds, models, scores = train_catboost_cv(X, y)
#
# Ou spécifier explicitement:
# cat_cols = ['col1', 'col2', 'col3']
# oof_preds, models, scores = train_catboost_cv(X, y, cat_features=cat_cols)
#
# Prédictions sur test:
# test_preds = np.mean([m.predict_proba(X_test)[:, 1] for m in models], axis=0)

In [13]:
train_catboost_cv(merged_df.drop(columns=['OS_YEARS', 'OS_STATUS', 'raw', 'ID']), merged_df['OS_STATUS'])


Fold 1/5
0:	test: 0.5110863	best: 0.5110863 (0)	total: 161ms	remaining: 1m 20s
100:	test: 0.7599442	best: 0.7599442 (100)	total: 4.79s	remaining: 18.9s
200:	test: 0.7693255	best: 0.7694724 (198)	total: 9.89s	remaining: 14.7s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 0.7712996802
bestIteration = 219

Shrink model to first 220 iterations.
Fold 1 AUC: 0.7713

Fold 2/5
0:	test: 0.5562607	best: 0.5562607 (0)	total: 104ms	remaining: 52.1s
100:	test: 0.7967579	best: 0.7976030 (63)	total: 11.9s	remaining: 47s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 0.7997481108
bestIteration = 106

Shrink model to first 107 iterations.
Fold 2 AUC: 0.7997

Fold 3/5
0:	test: 0.5942634	best: 0.5942634 (0)	total: 108ms	remaining: 54s
100:	test: 0.8044446	best: 0.8048753 (80)	total: 22.7s	remaining: 1m 29s
200:	test: 0.8094418	best: 0.8094418 (200)	total: 41.4s	remaining: 1m 1s
300:	test: 0.8153246	best: 0.8163159 (288)	total: 2m 1s	remaining: 1m 19s
Stopped by ove

(array([0.65989632, 0.67968197, 0.78140095, ..., 0.76862524, 0.77306842,
        0.51437789], shape=(2758,)),
 [<catboost.core.CatBoostClassifier at 0x20287d6e510>,
  <catboost.core.CatBoostClassifier at 0x20289034f50>,
  <catboost.core.CatBoostClassifier at 0x20289035090>,
  <catboost.core.CatBoostClassifier at 0x202c6d2e2c0>,
  <catboost.core.CatBoostClassifier at 0x20287d436f0>],
 [0.7712996802192782,
  0.7997481108312342,
  0.8163159177703746,
  0.8047695377670189,
  0.8215103536262227])

In [14]:
# =============================================================================
# CatBoost Regression avec KFold + Poids KM + IPCW C-index
# =============================================================================

from catboost import CatBoostRegressor, Pool
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from scipy.stats import norm
from sksurv.metrics import concordance_index_ipcw
import numpy as np
import pandas as pd

# --- Configuration ---
N_FOLDS = 5
RANDOM_STATE = 42

# --- Params CatBoost Regression ---
catboost_params = {
    'iterations': 500,
    'depth': 6,
    'learning_rate': 0.05,
    'loss_function': 'RMSE',
    'eval_metric': 'RMSE',
    'random_seed': RANDOM_STATE,
    'verbose': 100,
    'early_stopping_rounds': 50,
}


def transform_y(y):
    """Transforme y via rank -> uniform -> inverse CDF normale."""
    n = len(y)
    ranks = (-y).argsort().argsort() + 1
    uniform_scores = ranks / (n + 1)
    return norm.ppf(uniform_scores)


def inverse_transform_pred(pred):
    """CDF normale pour obtenir des scores de risque."""
    return norm.cdf(pred)


def make_survival_array(times, events):
    """Crée un structured array pour sksurv."""
    return np.array(
        [(bool(e), t) for e, t in zip(events, times)],
        dtype=[('event', bool), ('time', float)]
    )


def compute_km_weights(times, events):
    """Calcule les poids IPCW Kaplan-Meier."""
    df = pd.DataFrame({'time': times, 'event': events})
    sorted_df = df.sort_values('time').reset_index(drop=True)
    sorted_df['_censoring'] = 1 - sorted_df['event']
    
    time_stats = sorted_df.groupby('time').agg(
        n_censored=('_censoring', 'sum'),
        n_total=('_censoring', 'count')
    ).reset_index()
    
    n = len(sorted_df)
    time_stats['at_risk'] = n - time_stats['n_total'].cumsum().shift(1, fill_value=0)
    time_stats['hazard'] = time_stats['n_censored'] / time_stats['at_risk'].clip(lower=1)
    time_stats['G_t'] = (1 - time_stats['hazard']).cumprod()
    time_stats['G_t_minus'] = time_stats['G_t'].shift(1, fill_value=1.0)
    
    km_lookup = time_stats[['time', 'G_t_minus']].copy()
    result = df.merge(km_lookup, on='time', how='left')
    
    weights = np.zeros(len(df))
    event_mask = result['event'] == 1
    weights[event_mask] = 1.0 / result.loc[event_mask, 'G_t_minus'].clip(lower=1e-10)
    
    return weights


def safe_ipcw_cindex(times_train, events_train, times_test, events_test, risk_scores, tau=None):
    """Calcule le C-index IPCW avec gestion d'erreurs."""
    try:
        surv_train = make_survival_array(times_train, events_train)
        surv_test = make_survival_array(times_test, events_test)
        
        if tau is None:
            tau = times_test.max()
        
        c_idx, _, _, _, _ = concordance_index_ipcw(surv_train, surv_test, risk_scores, tau=tau)
        return c_idx
    except Exception as e:
        print(f"IPCW error: {e}")
        return 0.5


def train_catboost_regression_cv(X, y_times, events, params=catboost_params, 
                                  n_folds=N_FOLDS, cat_features=None, use_ipcw=True):
    """
    Entraîne CatBoost Regressor avec:
    - Transformation de y pour l'entraînement (RMSE)
    - Poids Kaplan-Meier IPCW
    - Retransformation + C-index IPCW pour l'évaluation
    
    Args:
        X: DataFrame des features
        y_times: Array des temps de survie originaux
        events: Array des événements (1=événement, 0=censuré)
        params: Dict des hyperparamètres CatBoost
        n_folds: Nombre de folds
        cat_features: Liste des features catégorielles (None = toutes)
        use_ipcw: Si True, utilise les poids Kaplan-Meier
    
    Returns:
        oof_preds, models, c_indices
    """
    kf = KFold(n_splits=n_folds, shuffle=True, random_state=RANDOM_STATE)
    
    if cat_features is None:
        cat_features = list(X.columns)
    
    oof_preds = np.zeros(len(y_times))
    models = []
    c_indices = []
    rmses = []
    
    for fold, (train_idx, val_idx) in enumerate(kf.split(X)):
        print(f"\n{'='*50}")
        print(f"Fold {fold + 1}/{n_folds}")
        print(f"{'='*50}")
        
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        times_train, times_val = y_times[train_idx], y_times[val_idx]
        events_train, events_val = events[train_idx], events[val_idx]
        
        # Transformer y pour l'entraînement
        y_train_transformed = transform_y(times_train)
        y_val_transformed = transform_y(times_val)
        
        # Calculer les poids IPCW sur le train
        if use_ipcw:
            train_weights = compute_km_weights(times_train, events_train)
        else:
            train_weights = None
        
        # Créer les Pools
        train_pool = Pool(X_train, y_train_transformed, cat_features=cat_features, weight=train_weights)
        val_pool = Pool(X_val, y_val_transformed, cat_features=cat_features)
        
        model = CatBoostRegressor(**params)
        
        model.fit(
            train_pool,
            eval_set=val_pool,
            use_best_model=True
        )
        
        # Prédictions sur val
        val_preds_transformed = model.predict(X_val)
        oof_preds[val_idx] = val_preds_transformed
        
        # RMSE sur y transformé
        rmse = np.sqrt(mean_squared_error(y_val_transformed, val_preds_transformed))
        rmses.append(rmse)
        
        # Retransformer -> scores de risque pour C-index
        # Plus la prédiction est élevée, plus le score normal est élevé
        # On veut: risque élevé = survie courte, donc on prend CDF directement
        risk_scores = inverse_transform_pred(val_preds_transformed)
        
        # C-index IPCW (risque élevé = survie courte)
        c_idx = safe_ipcw_cindex(
            times_train, events_train,
            times_val, events_val,
            risk_scores,
            tau=None
        )
        c_indices.append(c_idx)
        
        print(f"Fold {fold + 1} RMSE: {rmse:.4f} | IPCW C-index: {c_idx:.4f}")
        
        models.append(model)
    
    # Résultats globaux
    print(f"\n{'='*50}")
    print(f"CV Results:")
    print(f"  Mean RMSE:        {np.mean(rmses):.4f} ± {np.std(rmses):.4f}")
    print(f"  Mean IPCW C-index: {np.mean(c_indices):.4f} ± {np.std(c_indices):.4f}")
    print(f"{'='*50}")
    
    return oof_preds, models, c_indices


# --- Usage ---
# oof_preds, models, c_indices = train_catboost_regression_cv(
#     X=X, 
#     y_times=times,    # Temps de survie originaux
#     events=events,    # Indicateurs d'événement
#     use_ipcw=True
# )
#
# Prédictions sur test:
# test_preds = np.mean([m.predict(X_test) for m in models], axis=0)
# test_risk = inverse_transform_pred(test_preds)

pas significatif ca

In [15]:
oof_preds, models, scores = train_catboost_regression_cv(
    X=merged_df.drop(columns=['OS_YEARS', 'OS_STATUS', 'raw', 'ID']), 
    y_times=targets_preprocess["OS_YEARS"].values, 
    events=targets_preprocess["OS_STATUS"].values,
    use_ipcw=True
)


Fold 1/5


0:	learn: 1.2418401	test: 1.1495782	best: 1.1495782 (0)	total: 180ms	remaining: 1m 29s
100:	learn: 0.8585676	test: 1.0608057	best: 1.0603030 (86)	total: 13.8s	remaining: 54.5s


KeyboardInterrupt: 