## Phase 3 : Modélisation, entraînement et optimisation des modèles de régression

### Configuration

In [11]:
import seaborn as sns 
import matplotlib as plt
import pandas as pd
import numpy as np

DATA_PATH = "../data/"
RAW_FILE = DATA_PATH + "raw/insurance.csv"
CLEAN_FILE = DATA_PATH + "process/insurance_clean.csv"
OUTPUT_PATH = "../output/"
ENCODAGE_FILE = OUTPUT_PATH + "pickle/encoders.pkl"
TRAINING_FILE = OUTPUT_PATH + "pickle/model.pkl"
COLUMN_FILE = OUTPUT_PATH + "pickle/columns.pkl"

In [12]:
df = pd.read_csv(CLEAN_FILE, sep=",", header=0)

### Division des données

In [15]:
from sklearn.model_selection import train_test_split

features = df.drop(["expenses", "log_expenses"], axis = 1) # Variable explicatif
target = df["log_expenses"] # Variable cible transformée

features_train, features_test, target_train, target_test = train_test_split(features, target, test_size=0.2, random_state=42)

### Selection de modèle de régression

In [None]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.model_selection import cross_val_score

# Initialiser les modèles
models = {
    "Linear Regression": LinearRegression(),
    "Ridge Regression": Ridge(alpha=1.0),
    "Lasso Regression": Lasso(alpha=0.1),
    "Random Forest": RandomForestRegressor(n_estimators=100, random_state=42),
    "Support Vector Regressor": SVR(kernel='rbf', C=1.0, epsilon=0.1),
    "XGBoost": XGBRegressor(n_estimators=100, random_state=42),
    "LightGBM": LGBMRegressor(n_estimators=100, random_state=42)
}

# Stocker les performances des modèles
metrics = {
    "Model": [],
    "Mean Absolute Error (MAE)": [],
    "Mean Squared Error (MSE)": [],
    "R2 Score": []
}
# Validation croisée
cv = 5  # Nombre de plis de la validation croisée

# Évaluer chaque modèle avec la validation croisée
for name, model in models.items():
    print(f"Évaluation du modèle : {name}")
    
    # Calculer les scores pour chaque métrique avec cross_val_score
    mse_scores = -cross_val_score(model, features, target, scoring="neg_mean_squared_error", cv=cv)
    mae_scores = -cross_val_score(model, features, target, scoring="neg_mean_absolute_error", cv=cv)
    r2_scores = cross_val_score(model, features, target, scoring="r2", cv=cv)
    
    # Calculer les moyennes des métriques
    metrics["Model"].append(name)
    metrics["Mean Absolute Error (MAE)"].append(np.mean(mae_scores))
    metrics["Mean Squared Error (MSE)"].append(np.mean(mse_scores))
    metrics["R2 Score"].append(np.mean(r2_scores))

# Convertir les métriques en DataFrame pour un affichage clair
metrics_df = pd.DataFrame(metrics)
metrics_df

Évaluation du modèle : Linear Regression
Évaluation du modèle : Ridge Regression
Évaluation du modèle : Lasso Regression
Évaluation du modèle : Random Forest
Évaluation du modèle : Support Vector Regressor
Évaluation du modèle : XGBoost
Évaluation du modèle : LightGBM
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000169 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 295
[LightGBM] [Info] Number of data points in the train set: 1063, number of used features: 9
[LightGBM] [Info] Start training from score 9.097237
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000111 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 297
[LightGBM] [Info] Number of data points in the train set: 1063, number of used features: 9
[LightGBM] [Info] Start training from score 9.095902
[LightGBM] [Info] Auto-choosing col-wise multi-thread

Unnamed: 0,Model,Mean Absolute Error (MAE),Mean Squared Error (MSE),R2 Score
0,Linear Regression,-0.280584,-0.198758,0.762937
1,Ridge Regression,-0.280899,-0.198772,0.762919
2,Lasso Regression,-0.368981,-0.272033,0.675173
3,Random Forest,-0.202803,-0.160813,0.808205
4,Support Vector Regressor,-0.455811,-0.618015,0.262888
5,XGBoost,-0.243842,-0.197779,0.76506
6,LightGBM,-0.215887,-0.156385,0.813457


### Validation croisée

Par défaut, on a utilisé la validation croisée `K-fold`. On va tester les modèles sur d'autres validations croisées.

#### a. LOOCV

In [33]:
from sklearn.model_selection import LeaveOneOut

# LOOCV
loo = LeaveOneOut()

# Initialiser les modèles
models = {
    "Linear Regression": LinearRegression(),
    "Ridge Regression": Ridge(alpha=1.0),
    "Lasso Regression": Lasso(alpha=0.1),
    "Random Forest": RandomForestRegressor(n_estimators=100, random_state=42),
    "Support Vector Regressor": SVR(kernel='rbf', C=1.0, epsilon=0.1),
    "XGBoost": XGBRegressor(n_estimators=100, random_state=42),
    "LightGBM": LGBMRegressor(n_estimators=100, random_state=42)
}

# Stocker les performances des modèles
metrics_loocv = {
    "Model": [],
    "Mean Absolute Error (MAE)": [],
    "Mean Squared Error (MSE)": [],
    "R2 Score": []
}

# Évaluer chaque modèle avec la validation croisée
for name, model in models.items():
    print(f"Évaluation du modèle : {name}")
    
    # Calculer les scores pour chaque métrique avec cross_val_score
    mse_scores = cross_val_score(model, features, target, scoring="neg_mean_squared_error", cv=loo)
    mae_scores = cross_val_score(model, features, target, scoring="neg_mean_absolute_error", cv=loo)
    r2_scores = cross_val_score(model, features, target, scoring="r2", cv=loo)
    
    # Calculer les moyennes des métriques
    metrics_loocv["Model"].append(name)
    metrics_loocv["Mean Absolute Error (MAE)"].append(np.mean(mae_scores))
    metrics_loocv["Mean Squared Error (MSE)"].append(np.mean(mse_scores))
    metrics_loocv["R2 Score"].append(np.mean(r2_scores))

# Convertir les métriques en DataFrame pour un affichage clair
metrics_df_loocv = pd.DataFrame(metrics_loocv)
metrics_df_loocv

Évaluation du modèle : Linear Regression




Évaluation du modèle : Ridge Regression




Évaluation du modèle : Lasso Regression




Évaluation du modèle : Random Forest


KeyboardInterrupt: 

### Optimisation des hyperparamètres

In [26]:
from sklearn.model_selection import RandomizedSearchCV

# Initialiser le modèle LightGBM
lgb_model = LGBMRegressor(random_state=42)

# Définir l'espace de recherche des hyperparamètres
param_distributions = {
    'num_leaves': [20, 31, 40, 50],  # Nombre de feuilles
    'max_depth': [-1, 10, 20, 30],  # Profondeur maximale
    'learning_rate': [0.01, 0.05, 0.1, 0.2],  # Taux d'apprentissage
    'n_estimators': [100, 200, 500, 1000],  # Nombre d'estimateurs
    'min_child_samples': [5, 10, 20, 50],  # Minimum d'échantillons dans un nœud
    'subsample': [0.6, 0.8, 1.0],  # Sous-échantillonnage
    'colsample_bytree': [0.6, 0.8, 1.0],  # Proportion de colonnes utilisées
}

# Initialiser RandomizedSearchCV
random_search = RandomizedSearchCV(
    estimator=lgb_model,
    param_distributions=param_distributions,
    n_iter=50,  # Nombre d'itérations (combinations testées)
    scoring='neg_mean_squared_error',  # Métrique à optimiser
    cv=3,  # Validation croisée
    verbose=1,
    random_state=42,
    n_jobs=-1  # Utiliser tous les cœurs disponibles
)

# Exécuter l'optimisation
random_search.fit(features_train, target_train)

# Résultats
print("Meilleurs hyperparamètres :", random_search.best_params_)
print("Meilleure performance (RMSE) :", np.sqrt(-random_search.best_score_))

# Évaluation sur le jeu de test
best_model = random_search.best_estimator_

Fitting 3 folds for each of 50 candidates, totalling 150 fits
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000275 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 296
[LightGBM] [Info] Number of data points in the train set: 1063, number of used features: 9
[LightGBM] [Info] Start training from score 9.091986
Meilleurs hyperparamètres : {'subsample': 0.8, 'num_leaves': 50, 'n_estimators': 200, 'min_child_samples': 50, 'max_depth': 10, 'learning_rate': 0.05, 'colsample_bytree': 1.0}
Meilleure performance (RMSE) : 0.37221444093832545


In [27]:
y_pred = best_model.predict(features_test)

# Calculer les métriques
mse = mean_squared_error(target_test, y_pred)
mae = mean_absolute_error(target_test, y_pred)
r2 = r2_score(target_test, y_pred)

mse, mae, r2

(np.float64(0.1693856649997125),
 np.float64(0.19839610610065816),
 0.7849182078731934)