In [19]:
import pandas as pd

# Importation des données nettoyées
donnees = pd.read_csv("data_nettoyer.csv")

# Aperçu des données
print(donnees.head())
print(donnees.info())



   Âge    Sexe     IMC  Enfants Fumeur     Région        Frais
0   19  female  27.900        0    yes  southwest  16884.92400
1   18    male  33.770        1     no  southeast   1725.55230
2   28    male  33.000        3     no  southeast   4449.46200
3   33    male  22.705        0     no  northwest  21984.47061
4   32    male  28.880        0     no  northwest   3866.85520
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1328 entries, 0 to 1327
Data columns (total 7 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   Âge      1328 non-null   int64  
 1   Sexe     1328 non-null   object 
 2   IMC      1328 non-null   float64
 3   Enfants  1328 non-null   int64  
 4   Fumeur   1328 non-null   object 
 5   Région   1328 non-null   object 
 6   Frais    1328 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 72.8+ KB
None


In [20]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.metrics import root_mean_squared_error, r2_score

X = donnees.drop(columns=['Frais'])
y = donnees['Frais']


In [21]:

X = pd.get_dummies(X, drop_first=True)

In [22]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, shuffle=True, train_size=0.85, random_state=42, stratify=X['Fumeur_yes']
)

In [23]:
modeles = {
    "Régression Linéaire": Pipeline([
        ("scaler", StandardScaler()),
        ("modèle", LinearRegression())
    ]),
    "Lasso": Pipeline([
        ("scaler", StandardScaler()),
        ("modèle", Lasso(random_state=42))
    ]),
    "Ridge": Pipeline([
        ("scaler", StandardScaler()),
        ("modèle", Ridge(random_state=42))
    ]),
    "ElasticNet": Pipeline([
        ("scaler", StandardScaler()),
        ("modèle", ElasticNet(random_state=42))
    ])
}


In [24]:

resultats = {}

for nom, pipeline in modeles.items():
    # Entraînement
    pipeline.fit(X_train, y_train)

    # Prédiction
    y_pred = pipeline.predict(X_test)

    # Évaluation
    r2 = r2_score(y_test, y_pred)
    rmse = root_mean_squared_error(y_test, y_pred)

    # Stock des résultats
    resultats[nom] = {'R²': r2, 'RMSE': rmse}

    print(f"{nom} - R² : {r2:.4f}, RMSE : {rmse:.4f}")


resultats_df = pd.DataFrame(resultats).T
print("\nperformances :")
print(resultats_df)

Régression Linéaire - R² : 0.6816, RMSE : 6417.8223
Lasso - R² : 0.6816, RMSE : 6417.7826
Ridge - R² : 0.6816, RMSE : 6417.1837
ElasticNet - R² : 0.6219, RMSE : 6993.6784

performances :
                           R²         RMSE
Régression Linéaire  0.681567  6417.822302
Lasso                0.681571  6417.782583
Ridge                0.681630  6417.183652
ElasticNet           0.621858  6993.678408


In [25]:

parametres = {
    "modele__alpha": [0.1, 1.0, 10.0]
}


grid = GridSearchCV(modeles["ElasticNet"], parametres, cv=5, scoring="neg_root_mean_squared_error")
grid.fit(X_train, y_train)

print("\nMeilleur ElasticNet :", grid.best_params_)
print("Meilleur RMSE :", -grid.best_score_)


ValueError: Invalid parameter 'modele' for estimator Pipeline(steps=[('scaler', StandardScaler()),
                ('modèle', ElasticNet(random_state=42))]). Valid parameters are: ['memory', 'steps', 'transform_input', 'verbose'].