# Modèle de survie avancé pour prédiction du risque de décès

Ce notebook améliore l'approche précédente en combinant plusieurs modèles de survie, une ingénierie de features avancée et une optimisation des hyperparamètres pour maximiser le score IPCW-C-index.

## 1. Chargement des données

Chargement des données cliniques, moléculaires et cibles.

In [8]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer
from sklearn.model_selection import KFold, GridSearchCV
from sksurv.ensemble import RandomSurvivalForest
from sksurv.linear_model import CoxPHSurvivalAnalysis
from sksurv.metrics import concordance_index_ipcw
import warnings
warnings.filterwarnings('ignore')

clin = pd.read_csv('X_train/clinical_train.csv')
mol = pd.read_csv('X_train/molecular_train.csv')
target = pd.read_csv('target_train.csv')

clin_test = pd.read_csv('X_test/clinical_test.csv')
mol_test = pd.read_csv('X_test/molecular_test.csv')

## 2. Ingénierie de features cliniques et moléculaires

Imputation, transformation, extraction de variables binaires et agrégation moléculaire.

In [9]:
# Imputation et transformation des variables continues
num_cols = ['BM_BLAST','WBC','ANC','MONOCYTES','HB','PLT']
for df in [clin, clin_test]:
    for c in num_cols:
        df[c] = pd.to_numeric(df[c], errors='coerce')
    df[num_cols] = SimpleImputer(strategy='median').fit_transform(df[num_cols])
    df['WBC'] = np.log1p(df['WBC'])
    df['ANC'] = np.log1p(df['ANC'])
# Extraction de features cytogénétiques
def is_normal_karyotype(s):
    if pd.isna(s): return 0
    return int(str(s).startswith('46,XX') or str(s).startswith('46,XY'))
for df in [clin, clin_test]:
    df['cyto_normal'] = df['CYTOGENETICS'].apply(is_normal_karyotype)
    df['cyto_mono7'] = df['CYTOGENETICS'].fillna('').str.contains('-7').astype(int)
    df['cyto_gain8'] = df['CYTOGENETICS'].fillna('').str.contains('\+8').astype(int)

In [10]:
# Encodage one-hot du centre
ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
ctr_train = ohe.fit_transform(clin[['CENTER']])
ctr_test = ohe.transform(clin_test[['CENTER']])
ctr_train_df = pd.DataFrame(ctr_train, columns=ohe.get_feature_names_out(['CENTER']))
ctr_test_df = pd.DataFrame(ctr_test, columns=ohe.get_feature_names_out(['CENTER']))
clin = pd.concat([clin.reset_index(drop=True), ctr_train_df], axis=1)
clin_test = pd.concat([clin_test.reset_index(drop=True), ctr_test_df], axis=1)
clin.drop(['CENTER','CYTOGENETICS'], axis=1, inplace=True)
clin_test.drop(['CENTER','CYTOGENETICS'], axis=1, inplace=True)

In [11]:
# Agrégation moléculaire
def aggregate_mol(mol_df):
    mol_df['VAF'] = pd.to_numeric(mol_df['VAF'], errors='coerce')
    strong_impacts = ['stop_gained','frameshift_variant','splice_site_variant']
    mol_df['strong'] = mol_df['EFFECT'].isin(strong_impacts).astype(int)
    top_genes = mol_df['GENE'].value_counts().index[:10].tolist()
    for gene in top_genes:
        mol_df[gene] = (mol_df['GENE'] == gene).astype(int)
    agg = mol_df.groupby('ID').agg({
        'VAF': ['mean','max'],
        'strong': 'sum',
        'GENE': 'count',
        **{gene: ('max') for gene in top_genes}
    })
    agg.columns = ['VAF_mean','VAF_max','Nstrong','Nmut'] + [f'has_{g}' for g in top_genes]
    return agg.fillna(0)
mol_feat = aggregate_mol(mol)
mol_feat_test = aggregate_mol(mol_test)

In [12]:
# Fusion des données
X_train = clin.set_index('ID').join(mol_feat, how='left').fillna(0)
X_test = clin_test.set_index('ID').join(mol_feat_test, how='left').fillna(0)
# Harmonisation des colonnes
missing_cols = set(X_train.columns) - set(X_test.columns)
for col in missing_cols:
    X_test[col] = 0
extra_cols = set(X_test.columns) - set(X_train.columns)
X_test = X_test.drop(columns=list(extra_cols))
X_test = X_test[X_train.columns]

## 3. Sélection de variables et réduction de dimension

Standardisation et PCA pour réduire la dimensionnalité et améliorer la robustesse.

In [13]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
pca = PCA(n_components=0.95, svd_solver='full')
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)

## 4. Préparation de la cible et gestion de la censure

In [14]:
target = target.dropna(subset=['OS_YEARS','OS_STATUS'])
y_struct = np.array([(bool(s), t) for s, t in zip(target['OS_STATUS'], target['OS_YEARS'])], dtype=[('event', bool), ('time', float)])
ids = target['ID'].values
X_train_pca = X_train_pca[np.isin(X_train.index, ids)]

## 5. Entraînement des modèles de survie avancés et stacking

In [15]:
# CoxPH
cox = CoxPHSurvivalAnalysis()
cox.fit(X_train_pca, y_struct)
cox_pred = cox.predict(X_test_pca)
# Random Survival Forest
rsf = RandomSurvivalForest(n_estimators=200, min_samples_split=10, min_samples_leaf=5, max_features='sqrt', random_state=42)
rsf.fit(X_train_pca, y_struct)
rsf_pred = rsf.predict(X_test_pca)
# Stacking (moyenne pondérée, peut être optimisée)
risk_score = 0.6 * rsf_pred + 0.4 * cox_pred

## 6. Génération du fichier de soumission

In [None]:
submission = pd.DataFrame({'ID': X_test.index, 'risk_score': risk_score})
submission.to_csv('y_test.csv', index=False)
print(submission.head())

# SCORE : 0.7364

     ID  risk_score
0  KYW1  769.418935
1  KYW2  549.230884
2  KYW3  342.961620
3  KYW4  703.309901
4  KYW5  726.695382
