
## EXERCICE 8 : Utilisation de Pandas et sklearn pour l'analyse de données réelles



In [7]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import scale


### Question 1 : 
- utiliser la fonction pandas.read_csv avec les bonnes options pour charger le fichier RealMedicalData.csv. On remarquera que le separateur dans le fichier csv est ';' et que les decimales sont representees par des virgules et non des points. Il est important de remarquer qu'un dataframe et non un numpy array sera retourne. Cette classe contient l'equivalent d'un numpy.array mais aussi le nom des lignes et colonnes, tout comme dans les dataframes de R
- Extraire deux numpy.arrays X et y des donnes pour faire par la suite de la regression. Nous allons essayer d'expliquer  'Disease progression' avec les autres variables. Ce sera alors la colonne qui contient y. Les autres colonnes contiendront les donnees de X.
- Une fois X et y extrait, centrer et reduire X. La fonction sklearn.preprocessing.scale pourra etre utilisee


In [33]:
df = pd.read_csv('RealMedicalData.csv', delimiter=';', decimal=',')

In [47]:
from sklearn.model_selection import train_test_split

y = df["Disease progression"]          # target
X = df.drop(columns=["Disease progression"])  # all other columns as features


# Split into train and test set. Training will be done only with the train set and test only for evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

X_train = X_train.reset_index(drop=True)
X_test  = X_test.reset_index(drop=True)
y_train = y_train.reset_index(drop=True)
y_test  = y_test.reset_index(drop=True)

# Standardise X_train and X_test separately to prevent data leakage.
X_train_scaled = scale(X_train)
X_test_scaled = scale(X_test)


### Question 2 : 
- Mettre en lien X et y a l'aide de la regression lineaire multiple avec une penalisation Lasso
- A la vue des resultats, quelle variable vous semble liee a 'Disease progression'?
- Verifier cette relation en representant des nuages de points mettant en lien les observations de chaque variable avec les observations de 'Disease progression'



In [50]:
from sklearn.linear_model import Lasso
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error

NBfolds = 10

# estimation ordre de grandeur de alpha

for alpha in [0.001, 0.01, 0.1, 1, 10]:

    sum_mse_score = 0

    kf = KFold(n_splits=NBfolds)

    for train_index, val_index in kf.split(X_train_scaled):


        X_tr=X_train_scaled[train_index]
        y_tr=y_train[train_index]
        X_val=X_train_scaled[val_index]
        y_val=y_train[val_index]
  
        lasso_regressor = Lasso(alpha=alpha)
        lasso_regressor.fit(X_tr, y_tr)

        y_pred_lasso = lasso_regressor.predict(X_val)
        mse_score_lasso = mean_squared_error(y_val, y_pred_lasso)
        sum_mse_score += mse_score_lasso

    print(f'Alpha = {alpha}: Mean MSE = {sum_mse_score / NBfolds}')


Alpha = 0.001: Mean MSE = 17.57248353161927
Alpha = 0.01: Mean MSE = 17.160063461305462
Alpha = 0.1: Mean MSE = 14.960288424145308
Alpha = 1: Mean MSE = 13.027653796165618
Alpha = 10: Mean MSE = 15.834097388440426


In [None]:
for alpha in [0.3, 0.5, 0.7, 0.9]:

    sum_mse_score = 0

    kf = KFold(n_splits=NBfolds)

    for train_index, val_index in kf.split(X_train_scaled):


        X_tr=X_train_scaled[train_index]
        y_tr=y_train[train_index]
        X_val=X_train_scaled[val_index]
        y_val=y_train[val_index]
  
        lasso_regressor = Lasso(alpha=alpha)
        lasso_regressor.fit(X_tr, y_tr)

        y_pred_lasso = lasso_regressor.predict(X_val)
        mse_score_lasso = mean_squared_error(y_val, y_pred_lasso)
        sum_mse_score += mse_score_lasso

    print(f'Alpha = {alpha}: Mean MSE = {sum_mse_score / NBfolds}')

# On garde alpha = 0.5

Alpha = 0.3: Mean MSE = 13.114775618458996
Alpha = 0.5: Mean MSE = 12.574471634457003
Alpha = 0.7: Mean MSE = 12.85555275714537
Alpha = 0.9: Mean MSE = 12.843017053156299


In [55]:
lasso_regressor_final = Lasso(alpha=0.5)
lasso_regressor_final.fit(X_train, y_train)

y_pred_train = lasso_regressor_final.predict(X_train_scaled)
y_pred_test = lasso_regressor_final.predict(X_test_scaled)

print(f'MSE in train: {mean_squared_error(y_train, y_pred_train)}')
print(f'MSE in test: {mean_squared_error(y_test, y_pred_test)}')

MSE in train: 55.59866807515566
MSE in test: 41.40169054660383




In [61]:
lasso_regressor_final.coef_

array([ 0.4320866 ,  0.        ,  0.        ,  0.        ,  0.        ,
       -0.21106819,  0.21608433, -0.10079943, -0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.00403143,  0.        ,
       -0.        , -0.42459898,  0.04561237])