# 1. Chargement des librairies et données

## 1.1. Import des librairies

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import joblib
import mlflow
import json
from datetime import datetime

import sklearn
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler, FunctionTransformer
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import cross_validate

import warnings

# Silence every warning for the rest of the session to keep cell output compact.
# NOTE(review): this also hides solver/convergence warnings — confirm that is intended.
warnings.filterwarnings("ignore")

# Display settings: no cap on the number of columns, at most 100 rows.
for option_name, option_value in (
    ("display.max_columns", None),
    ("display.max_rows", 100),
):
    pd.set_option(option_name, option_value)

# Select the MLflow experiment under which later runs are recorded.
mlflow.set_experiment("Insurance_Model_Comparison")

## 1.2. Import des données

In [None]:
# Read the raw insurance records from the project's dataset folder.
dataset = pd.read_csv(filepath_or_buffer="./dataset/insurance.csv")
# Working DataFrame used by every later step (a new object built from `dataset`).
df = pd.DataFrame(data=dataset)

# Preview the first rows of the table.
display(df.head())

## 1.3. Vérification des données

In [None]:
# Count fully duplicated rows before removing them.
duplicate_count = int(df.duplicated().sum())
print(" Nombre de lignes dupliquées : ", duplicate_count, "\n")

# keep=False flags every occurrence, so each duplicate group is shown in full.
display(df[df.duplicated(keep=False)])

df = df.drop_duplicates()
# Report what actually happened rather than a hard-coded conclusion.
if duplicate_count == 0:
    print("\n→ Aucun doublon à supprimer")
elif duplicate_count == 1:
    print("\n→ Doublon supprimé")
else:
    print(f"\n→ {duplicate_count} doublons supprimés")

# Missing values per column, followed by a summary derived from the counts.
print("\nDonnées manquantes :")
missing_per_column = df.isna().sum()
display(missing_per_column)
total_missing = int(missing_per_column.sum())
if total_missing == 0:
    print("\n→ Pas de données manquantes")
else:
    print(f"\n→ {total_missing} valeurs manquantes à traiter")

# 2. Modélisation automatisée

## 2.1. Split test-train

In [None]:
# Hold out 20 % of the rows for testing; the remaining 80 % are used for training.
# The fixed random_state makes the split reproducible.

y = df["charges"]
X = df.drop(columns="charges")

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## 2.2. Création de la pipeline

In [None]:
def add_features(X):
    """Return a copy of ``X`` with the engineered features used by the model.

    The input frame is left untouched. On the copy:

    * ``smoker`` is recoded from "yes"/"no" to 1/0 (any other value maps to NaN);
    * ``smoker_obese`` is added: 1 when the person smokes and has ``bmi >= 30``;
    * ``age`` is replaced by its square.

    Requires the columns ``smoker``, ``bmi`` and ``age``.
    """
    enriched = X.copy()

    smoker_flag = enriched["smoker"].map({"yes": 1, "no": 0})
    obese_flag = (enriched["bmi"] >= 30).astype(int)

    enriched["smoker"] = smoker_flag
    # Interaction term: smoker AND obese.
    enriched["smoker_obese"] = smoker_flag * obese_flag
    enriched["age"] = enriched["age"] ** 2
    return enriched

# Columns routed to each branch of the preprocessing step.
# "smoker_obese" is not a raw column: it is produced by add_features().
numerical_features = ["age", "children", "smoker", "smoker_obese", "bmi"]
categorical_features = ["region", "sex"]

# One-hot encode the categorical columns (dropping the first level of each)
# and standardise the numerical ones.
preprocessor_withFeatures = ColumnTransformer(
    transformers=[
        ("categorical", OneHotEncoder(drop="first"), categorical_features),
        ("numerical", StandardScaler(), numerical_features),
    ]
)

# Full pipeline: feature engineering -> preprocessing -> regression.
pipeline = Pipeline(
    steps=[
        ("feature_engineering", FunctionTransformer(func=add_features)),
        ("preprocessor", preprocessor_withFeatures),
        ("regressor", LinearRegression()),
    ]
)


## 2.3. Cross validation

In [None]:
# Cross-validated evaluation of the pipeline: mean absolute error (MAE).
cv_scores_mae = cross_val_score(
    estimator=pipeline,
    X=X_train,
    y=y_train,
    cv=5,
    scoring="neg_mean_absolute_error",
)

# The scorer returns the *negated* MAE, hence the abs() calls for display.
# The per-fold strings are built outside the f-string: reusing the same quote
# character inside an f-string is a SyntaxError before Python 3.12.
formatted_mae_scores = [f"{abs(score):.2f}" for score in cv_scores_mae]

print("MAE avec validation croisée (5-fold) :")
print(f"  Scores : {formatted_mae_scores}")
print(f"  Moyenne : {abs(cv_scores_mae.mean()):.2f} (+/- {cv_scores_mae.std():.2f})")

In [None]:
# Cross-validated evaluation of the pipeline: coefficient of determination (R²).
cv_scores_r2 = cross_val_score(
    estimator=pipeline,
    X=X_train,
    y=y_train,
    cv=5,
    scoring="r2",
)

# The per-fold strings are built outside the f-string: reusing the same quote
# character inside an f-string is a SyntaxError before Python 3.12.
formatted_r2_scores = [f"{score:.2%}" for score in cv_scores_r2]

print("R-Squared avec validation croisée (5-fold) :")
print(f"  Scores : {formatted_r2_scores}")
print(f"  Moyenne : {cv_scores_r2.mean():.2%} (+/- {cv_scores_r2.std():.2%})")

## 2.4. Comparaison de modèles avec GridSearchCV

In [None]:
# Candidate regressors, listed in the same order as the grids in `param_grids`.
models = [
    ("Linear Regression", LinearRegression()),
    ("Lasso", Lasso(max_iter=10000)),
    ("Ridge", Ridge()),
    ("ElasticNet", ElasticNet(max_iter=10000)),
]

# One hyper-parameter grid per entry of `models`, paired by position.
# The "regressor__" prefix addresses the pipeline step named "regressor".
# (A stale three-entry grid kept in a string literal was removed: it no longer
# matched the four models above.)
param_grids = [
    {"regressor__fit_intercept": [True, False]},
    {"regressor__alpha": [0.001, 0.01, 0.1, 0.5, 1, 5, 10, 50, 100]},
    {"regressor__alpha": [0.001, 0.01, 0.1, 0.5, 1, 5, 10, 50, 100, 1000]},
    {"regressor__alpha": [0.001, 0.01, 0.1, 1, 10]},
]

# One summary row per candidate model.
results = []

for (name, model), param_grid in zip(models, param_grids):

    print(f"Entraînement de {name}...")
    # Swap the candidate estimator into the final step of the shared pipeline.
    pipeline.set_params(regressor=model)

    grid_search = GridSearchCV(
        pipeline,
        param_grid,
        cv=5,
        scoring="r2",
        n_jobs=-1,
        verbose=1,
    )

    grid_search.fit(X_train, y_train)

    # Predictions on the TRAINING set: the metrics below are training metrics,
    # as their "Train ..." labels state. The test set is not used in this loop.
    y_train_pred = grid_search.predict(X_train)

    # Training metrics
    train_r2 = r2_score(y_train, y_train_pred)
    train_mae = np.mean(np.abs(y_train - y_train_pred))
    train_rmse = mean_squared_error(y_train, y_train_pred) ** 0.5

    results.append({
        "Model": name,
        "Best Parameters": str(grid_search.best_params_),
        "Best CV R²": round(grid_search.best_score_, 4),
        "Train R²": round(train_r2, 4),
        "Train MAE": round(train_mae, 2),
        "Train RMSE": round(train_rmse, 2),
    })


results_df = pd.DataFrame(results)

# Console report
print("\n" + "="*100)
print("Résultats de la comparaison des modèles")
print("="*100)
print(results_df.to_string(index=False))
print("="*100)

In [None]:
# Bar chart of the training MAE obtained by each model.
mae_by_model = results_df["Train MAE"]

plt.figure(figsize=(8, 6))
bars = plt.bar(
    results_df["Model"],
    mae_by_model,
    color=["steelblue", "seagreen", "coral", "tomato"],
    edgecolor="black",
    alpha=0.8,
    linewidth=2,
)

# Write each bar's value just above its top edge, centred horizontally.
for bar in bars:
    bar_height = bar.get_height()
    plt.text(
        bar.get_x() + bar.get_width() / 2.0,
        bar_height,
        f"{bar_height:.2f} $",
        ha="center",
        va="bottom",
        fontsize=12,
        fontweight="bold",
    )

plt.ylabel("Train MAE Score", fontsize=13, fontweight="bold")
plt.title("Comparaison des modèles de régression linéaire", fontsize=15, fontweight="bold", pad=15)
# Zoom the y-axis around the observed values so small differences stay visible.
plt.ylim(mae_by_model.min() - 10, mae_by_model.max() + 10)
plt.xticks(rotation=45, ha="right")
plt.grid(axis="y", alpha=0.3, linestyle="--")
plt.tight_layout()
plt.show()

## 2.5. Comparaison de modèles avec MLflow

In [None]:
# The four models to compare, each with a fixed regularisation strength.
models = {
    "LinearRegression": LinearRegression(),
    "Ridge": Ridge(alpha=10),
    "Lasso": Lasso(alpha=10),
    "ElasticNet": ElasticNet(alpha=0.01),
}

for name, mlflow_model in models.items():
    with mlflow.start_run(run_name=name):

        # Plug the candidate estimator into the shared pipeline.
        pipeline.set_params(regressor=mlflow_model)

        # 5-fold cross-validation scored on both MAE and R².
        cv_results = cross_validate(
            pipeline,
            X_train,
            y_train,
            cv=5,
            scoring=["neg_mean_absolute_error", "r2"],
        )

        # The MAE scorer is negated by scikit-learn, hence abs().
        mae = abs(cv_results["test_neg_mean_absolute_error"].mean())
        r2 = cv_results["test_r2"].mean()

        mlflow.log_param("model_name", name)
        mlflow.log_metric("MAE_mean", mae)
        mlflow.log_metric("r2_mean", r2)

        # cross_validate() only fits clones, so `pipeline` itself has not been
        # trained with this regressor. Fit it on the whole training set so the
        # logged artifact is a fitted, usable model.
        pipeline.fit(X_train, y_train)
        mlflow.sklearn.log_model(pipeline, name="pipeline_artifact")

        print(f"{name} terminé. MAE : {mae:.2f} | R2 : {r2:.2f}")

## 2.6. Prédictions sur les données de test

In [None]:
# Final model: the shared pipeline with a plain linear regression as last step.
# set_params() returns the pipeline itself, so `final_model` and `pipeline`
# are the same object.
final_model = pipeline.set_params(regressor=LinearRegression())

final_model.fit(X_train, y_train)

# Predictions on the held-out test set
y_pred = final_model.predict(X_test)

# Test-set scores: R², MAE and RMSE.
# The third value is the square root of the MSE, so it is labelled "RMSE".
titles = ["R2", "MAE", "RMSE"]

scores = [
    r2_score(y_test, y_pred),
    np.mean(abs(y_test - y_pred)),
    mean_squared_error(y_test, y_pred) ** 0.5,
]

df_scores = pd.DataFrame([scores], columns=titles)

print("\nMesures de performance (test) :")

display(df_scores.round(4))


In [None]:
# Predicted vs. actual charges on the test set.
actual_range = [y_test.min(), y_test.max()]
mae_on_test = np.mean(abs(y_test - y_pred))

plt.figure(figsize=(10, 6))

plt.scatter(
    y_test,
    y_pred,
    alpha=0.6,
    edgecolors="k",
    linewidth=0.5,
    label="Prédictions",
)

# Identity line: points lying on it are predicted exactly.
plt.plot(actual_range, actual_range, "r-", lw=2, label="Régression parfaite")

plt.xlabel("Valeurs réelles", fontsize=12)
plt.ylabel("Valeurs prédites", fontsize=12)
plt.title(f"Prédiction des charges (MAE : {mae_on_test:.2f} $)", fontsize=12, fontweight="bold")
plt.legend()
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()


# 3. Sauvegarde et chargement du modèle

## 3.1. Sauvegarde du modèle final

In [None]:
import os

# Persist the fitted pipeline with joblib.
model_path = "./models/insurance_model.pkl"
# Create the target folder first: dumping into a missing directory fails.
os.makedirs(os.path.dirname(model_path), exist_ok=True)
# NOTE(review): the pipeline wraps add_features() in a FunctionTransformer;
# plain functions are pickled by reference, so add_features must be defined
# or importable wherever this file is loaded — confirm for deployment.
joblib.dump(final_model, model_path)

print(f"Modèle sauvegardé dans : {model_path}")

# File size check (in KB)
file_size = os.path.getsize(model_path) / 1024
print(f"Taille du fichier : {file_size:.2f} KB")

## 3.2. Sauvegarde des métadonnées du modèle

In [None]:
import os

# Descriptive metadata stored next to the serialized model.
metadata = {
    "model_name": "Prédicteur de charges d’assurance",
    # Read from the pipeline so the label always matches the saved estimator.
    "model_type": type(final_model.named_steps["regressor"]).__name__,
    "training_date": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
    "sklearn_version": sklearn.__version__,
    # Raw input columns expected by the pipeline (before feature engineering).
    "features": X.columns.tolist(),
    "preprocessing": [
        # add_features() also squares the age column, so it is listed here.
        "Feature engineering (smoker, smoker_obese, age²)",
        "OneHotEncoder (categorical features)",
        "StandardScaler (numerical features)"
    ],
    "description": "Pipeline complet avec feature engineering et régression linéaire"
}

# Write the metadata as UTF-8 JSON, creating the target folder if needed.
metadata_path = "./models/regression_model_metadata.json"
os.makedirs(os.path.dirname(metadata_path), exist_ok=True)
with open(metadata_path, "w", encoding="utf-8") as f:
    json.dump(metadata, f, indent=4, ensure_ascii=False)

print(f"Métadonnées sauvegardées : {metadata_path}")

# Echo the metadata in the notebook output
print("\n Métadonnées du modèle :")
print(json.dumps(metadata, indent=2, ensure_ascii=False))


## 3.3. Chargement du modèle

In [None]:
# Reload the serialized pipeline from disk.
# NOTE(review): joblib files are pickle-based — only load files from a trusted source.
loaded_model = joblib.load(filename=model_path)
print("✓ Modèle chargé depuis le fichier")

## 3.4. Prédiction pour un nouveau client

In [None]:
# Sanity check: score one hypothetical customer described with the raw input columns.
new_customer = {
    "age": 18,
    "children": 0,
    "smoker": "no",
    "bmi": 20,
    "sex": "female",
    "region": "southeast",
}
test_data = pd.DataFrame([new_customer])

# predict() returns one value per row; take the single prediction.
predicted_charges = loaded_model.predict(test_data)[0]

print(f"Prédiction des charges : {predicted_charges:.2f} $")