# Data

In [124]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns# Figure size
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

# Regressors from scikit-learn
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, HistGradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor

In [125]:
X_train= pd.read_csv('X_train_6ZIKlTY.csv', index_col=0)
Y_train= pd.read_csv('Y_train.csv',index_col=0)
X_test= pd.read_csv('X_test_oiZ2ukx.csv',index_col=0)
X_test = X_test.drop(columns=['time_since_diagnosis'])

X_test.index = X_test.index + X_train.shape[0]  # Décale les index de X_test
data = pd.concat([X_train, X_test], axis=0)

print(len(data), len(X_train), len(Y_train), len(X_test))

assert (data.loc[X_train.index].index == Y_train.index).all(), "⚠️ Mauvais alignement entre X_train_processed et Y_train !"

data.head(3)

79275 55603 55603 23672


Unnamed: 0_level_0,patient_id,cohort,sexM,gene,age_at_diagnosis,age,ledd,time_since_intake_on,time_since_intake_off,on,off
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,IPLP5212,A,0,LRRK2+,48.5,52.1,607.0,1.9,,7.0,
1,IPLP5212,A,0,LRRK2+,48.5,53.0,666.0,1.9,17.6,12.0,44.0
2,IPLP5212,A,0,LRRK2+,48.5,53.9,717.0,1.2,,6.0,


Données démographiques du patient : Âge, sexe et âge au moment du diagnostic (symptômes moteurs).

Cohore: cohorte désigne un groupe de patients atteints de la maladie de Parkinson partageant des caractéristiques ou conditions communes et observés sur une période définie afin d’étudier la progression de la maladie et les résultats des traitements.

Informations génétiques : Marqueurs génétiques pertinents.

ledd = Informations sur les médicaments : Posologie (en dose quotidienne équivalente en lévodopa).

Scores Moteur MDS-UPDRS : Scores ON et OFF, avec le délai indiqué depuis la dernière prise de médicament.

Dans le test, time_since_diagnosis est aussi dispo

## Statistiques

In [126]:
print('|FULL SET MISSING VALUES (%):|')
print(data.isna().sum()*100/len(data))
print("\n|FULL SET UNIQUE VALUES-")
print(data.nunique())
#print("\n|FULL SET STATS|")
#print(data.describe())

|FULL SET MISSING VALUES (%):|
patient_id                0.000000
cohort                    0.000000
sexM                      0.000000
gene                     32.445285
age_at_diagnosis          3.569852
age                       0.000000
ledd                     37.159256
time_since_intake_on     46.844529
time_since_intake_off    78.800378
on                       30.177231
off                      41.850520
dtype: float64

|FULL SET UNIQUE VALUES-
patient_id               9959
cohort                      2
sexM                        2
gene                        4
age_at_diagnosis          594
age                      1194
ledd                     1414
time_since_intake_on       65
time_since_intake_off     181
on                         83
off                       102
dtype: int64


In [127]:
gene_counts = data['gene'].value_counts()
print(gene_counts)

gene
No Mutation    25293
LRRK2+         13011
GBA+           11795
OTHER+          3455
Name: count, dtype: int64


## Catégorisation en entiers

In [128]:
categorical_columns = ['patient_id', 'cohort', 'gene']

# Encodage des colonnes catégorielles
for col in categorical_columns:
    data[col] = pd.Categorical(data[col]).codes

data['gene'] = data['gene'].replace(-1, pd.NA)

for col in ['patient_id', 'cohort', 'sexM', 'gene']:
    data[col] = data[col].astype('Int16')
for col in ['age_at_diagnosis', 'age', 'ledd', 'time_since_intake_on', 'time_since_intake_off', 'on', 'off']:
    data[col] = data[col].astype('Float32')

assert (data.loc[X_train.index].index == Y_train.index).all(), "⚠️ Mauvais alignement entre X_train_processed et Y_train !"
print(data.isna().sum())
data.head(3)

patient_id                   0
cohort                       0
sexM                         0
gene                     25721
age_at_diagnosis          2830
age                          0
ledd                     29458
time_since_intake_on     37136
time_since_intake_off    62469
on                       23923
off                      33177
dtype: int64


Unnamed: 0_level_0,patient_id,cohort,sexM,gene,age_at_diagnosis,age,ledd,time_since_intake_on,time_since_intake_off,on,off
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,3332,0,0,1,48.5,52.099998,607.0,1.9,,7.0,
1,3332,0,0,1,48.5,53.0,666.0,1.9,17.6,12.0,44.0
2,3332,0,0,1,48.5,53.900002,717.0,1.2,,6.0,


## Remplissage des données manquantes

Fonctions utilitaires : 
- ridge_over_datas, permet une ride par patient
- globbal filling permet un modèle global

Ces fonctions ne modifient pas les index

In [None]:
import numpy as np
import pandas as pd
from sklearn.linear_model import Ridge

def ridge_over_datas(data, group_key='patient_id', y='ledd', x='age', clip_min=1e-6):
    ''' 
    Effectue une régression Ridge sur data pour prédire y si min 2 données par patient.
    Les prédictions négatives sont remplacées par clip_min.
    (Cette fonction est destinée aux tâches de régression.)
    '''
    def impute_ridge(group):
        if group[y].notna().sum() >= 3:
            valid = group[group[y].notna()]
            X_train = valid[x].values.reshape(-1, 1)
            y_train = valid[y].values

            ridge = Ridge(alpha=1.0)
            ridge.fit(X_train, y_train)

            missing_mask = group[y].isna()
            if missing_mask.any():
                X_missing = group.loc[missing_mask, x].values.reshape(-1, 1)
                preds = ridge.predict(X_missing)
                preds = np.clip(preds, clip_min, None)
                group.loc[missing_mask, y] = preds
        return group

    data = data.groupby(group_key, group_keys=False).apply(impute_ridge)
    return data

from sklearn.ensemble import HistGradientBoostingClassifier, HistGradientBoostingRegressor
from xgboost import XGBRegressor

def global_filling(data, target='ledd', features=None, model='classifier', clip_min=1e-6):
    ''' 
    Remplit les valeurs manquantes de target dans data.
    Pour les modèles de régression (model != 'classifier'), 
    les prédictions négatives sont remplacées par clip_min.
    Pour les classificateurs, aucune modification des prédictions n'est faite.
    '''
    if features is None:
        features = []

    df_train = data[data[target].notna()].copy()
    df_missing = data[data[target].isna()].copy()

    X_train_model = df_train[features]
    y_train = df_train[target]
    X_missing = df_missing[features]

    if model == 'classifier':
        model_instance = HistGradientBoostingClassifier(random_state=42)
    elif model == 'regressor':
        model_instance = HistGradientBoostingRegressor(random_state=42)
    elif model == 'ridge':
        model_instance = Ridge(alpha=1.0)
    elif model == 'xgboost':
        model_instance = XGBRegressor(random_state=42, use_label_encoder=False, eval_metric='rmse')
    else:
        raise ValueError("Modèle non reconnu: utilisez 'classifier', 'regressor', 'ridge' ou 'xgboost'.")

    model_instance.fit(X_train_model, y_train)
    preds = model_instance.predict(X_missing)

    # Appliquer le clipping seulement si le modèle est de régression (c'est-à-dire, pas un classificateur)
    if model != 'classifier':
        preds = np.clip(preds, clip_min, None)

    data.loc[data[target].isna(), target] = preds
    return data

### Remplissage de 'gene'

In [130]:
data = global_filling(data, target='gene', features=['patient_id', 'cohort', 'sexM', 'age_at_diagnosis', 'age'], model='classifier')
print(data['gene'].isna().sum())

0


### Remplissage de 'ledd'
Pour chaque patient, regression ridge quand min 2 valeurs

In [131]:
# Appliquer la régression Ridge pour les patients avec au moins 2 valeurs valides
data = ridge_over_datas(data, group_key='patient_id', y='ledd', x='age')

# Appliquer le modèle global pour les valeurs manquantes restantes
data = global_filling(data, target='ledd', features=['cohort', 'sexM', 'gene', 'age_at_diagnosis', 'age', 'on', 'off'], model='regressor')

print(data['ledd'].isna().sum())

  data = data.groupby(group_key, group_keys=False).apply(impute_ridge)


0


### Remplissage de 'off'

In [132]:
# Appliquer la régression Ridge pour les patients avec au moins 2 valeurs valides
data = ridge_over_datas(data, group_key='patient_id', y='off', x='age')

# Appliquer le modèle global pour les valeurs manquantes restantes
features = ['cohort', 'sexM', 'gene', 'age_at_diagnosis', 'age', 'ledd', 'time_since_intake_on', 'time_since_intake_off', 'on']
data = global_filling(data, target='off', features=features, model='regressor')

print(data['off'].isna().sum())

  data = data.groupby(group_key, group_keys=False).apply(impute_ridge)


0


### Remplissage de 'on'

In [133]:
target='on'
features = ['cohort', 'sexM', 'gene', 'age_at_diagnosis', 'age', 'ledd', 'time_since_intake_on', 'time_since_intake_off', 'off']

# Appliquer la régression Ridge pour les patients avec au moins 2 valeurs valides
data = ridge_over_datas(data, group_key='patient_id', y=target, x='age')

# Appliquer le modèle global pour les valeurs manquantes restantes
data = global_filling(data, target=target, features=features, model='regressor')

print(data[target].isna().sum())

  data = data.groupby(group_key, group_keys=False).apply(impute_ridge)


0


### Remplissage de 'intake on' and 'intake off' (super importaant)

In [134]:
data.isna().sum()

patient_id                   0
cohort                       0
sexM                         0
gene                         0
age_at_diagnosis          2830
age                          0
ledd                         0
time_since_intake_on     37136
time_since_intake_off    62469
on                           0
off                          0
dtype: int64

In [135]:
features = ['cohort', 'sexM', 'gene', 'age', 'ledd', 'on', 'off'] # j'ai réduit le nombre de features pour ne prendre que celles sans nan donc j'ai notamment enlevé time_since_intake_on
target = 'time_since_intake_off'

# Appliquer la régression Ridge pour les patients avec au moins 2 valeurs valides
data = global_filling(data, target=target, features=features, model='ridge')

data[target].isna().sum()

0

In [136]:
features = ['cohort', 'sexM', 'gene', 'age', 'ledd', 'on', 'off', 'time_since_intake_off'] 
target = 'time_since_intake_on'

# Appliquer la régression Ridge pour les patients avec au moins 2 valeurs valides
data = global_filling(data, target=target, features=features, model='ridge')

data[target].isna().sum()

0

### Remplisage 'age at diagnosis'

In [137]:
features = ['cohort', 'gene', 'sexM', 'age', 'ledd', 'on', 'off',
            'time_since_intake_on', 'time_since_intake_off']
target = 'age_at_diagnosis'


data = global_filling(data, target=target, features=features, model='xgboost')

data[target].isna().sum()

Parameters: { "use_label_encoder" } are not used.



0

In [138]:
print('FULL SET MISSING VALUES (%):')
print(data.isna().sum()*100/len(data))
print('')

FULL SET MISSING VALUES (%):
patient_id               0.0
cohort                   0.0
sexM                     0.0
gene                     0.0
age_at_diagnosis         0.0
age                      0.0
ledd                     0.0
time_since_intake_on     0.0
time_since_intake_off    0.0
on                       0.0
off                      0.0
dtype: float64



## Test rapide

In [139]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

assert (data.loc[X_train.index].index == Y_train.index).all(), "⚠️ Mauvais alignement entre X_train_processed et Y_train !"

X_train_processed = data.loc[X_train.index]
Y_train = Y_train

X_train_train, X_valid, y_train_train, y_valid = train_test_split(X_train_processed, Y_train, test_size=0.2, random_state=42)

model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train_train, y_train_train)


y_pred = model.predict(X_valid)
mse = mean_squared_error(y_valid, y_pred)
print(f"Mean Squared Error sur le set de validation : {mse}")

  return fit_method(estimator, *args, **kwargs)


Mean Squared Error sur le set de validation : 45.250500816833025


## Sauvegarde du full set

In [140]:
X_test_processed = data.loc[X_test.index]
X_test_processed.reset_index()
X_train_processed = data.loc[X_train.index]
data.to_csv('full_set_complete.csv')
X_train_processed.to_csv('X_train_fill_v0.csv')
X_test_processed.to_csv('X_test_fill_v0.csv')


In [141]:
data.head()

Unnamed: 0_level_0,patient_id,cohort,sexM,gene,age_at_diagnosis,age,ledd,time_since_intake_on,time_since_intake_off,on,off
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,3332,0,0,1,48.5,52.099998,607.0,1.9,14.779591,7.0,38.620296
1,3332,0,0,1,48.5,53.0,666.0,1.9,17.6,12.0,44.0
2,3332,0,0,1,48.5,53.900002,717.0,1.2,14.87831,6.0,39.662327
3,3332,0,0,1,48.5,54.799999,770.0,1.5,14.727922,11.0,40.183342
4,3332,0,0,1,48.5,56.900002,885.0,0.3,14.326217,24.0,41.399048
