### Import des modules 

In [6]:
# Selection
from sklearn.model_selection import (
    train_test_split,
    GridSearchCV, 
    cross_validate,
)
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error 
from sklearn.inspection import permutation_importance
from sklearn.pipeline import Pipeline

# Preprocess
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler

# Modèles
from sklearn.dummy import DummyRegressor
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor

import sys
import importlib

# Make the parent directory importable (presumably where config.py lives —
# confirm). The membership guard keeps the cell idempotent: re-running it no
# longer appends a duplicate ".." entry to sys.path each time.
if ".." not in sys.path:
    sys.path.append("..")
import config

# Reload so edits made to config since the kernel started are picked up.
importlib.reload(config)  # Ensure we get the latest TARGET value
from config import TARGET

### Optimisation et interprétation du modèle

À réaliser :
* Reprenez le meilleur algorithme que vous avez sécurisé via l'étape précédente, et réalisez une GridSearch de petite taille sur au moins 3 hyperparamètres.
* Si le meilleur modèle fait partie de la famille des modèles à arbres (RandomForest, GradientBoosting), alors utilisez la fonctionnalité feature importance pour identifier les features les plus impactantes sur la performance du modèle. Sinon, utilisez la méthode Permutation Importance de sklearn.

In [7]:
import pandas as pd

# Read the cleaned dataset from the assets folder and preview its first rows.
# NOTE(review): the saved preview shows a leading "Unnamed: 0" header. If that
# is a real column (an index written out by to_csv) rather than a display
# artifact, it ends up among the model features downstream — confirm, and load
# with index_col=0 if so.
building_consumption = pd.read_csv(
    filepath_or_buffer="../assets/building_consumption_cleaned.csv"
)
building_consumption.head(n=5)

Unnamed: 0,CouncilDistrictCode,YearBuilt,NumberofBuildings,PropertyGFATotal,PropertyGFAParking,PropertyGFABuilding(s),ENERGYSTARScore,TotalGHGEmissions,Office,Parking,...,NaturalGas,Electricity,PrimaryPropertyType_0,PrimaryPropertyType_1,PrimaryPropertyType_2,PrimaryPropertyType_3,PrimaryPropertyType_4,BuildingType_Nonresidential COS,BuildingType_Nonresidential WA,BuildingType_SPS-District K-12
0,7,1927,1.0,88434,0,88434,60.0,249.98,0,0,...,1,1,0,0,0,0,1,0.0,0.0,0.0
1,7,1996,1.0,103566,15064,88502,61.0,295.86,0,1,...,1,1,0,0,0,0,1,0.0,0.0,0.0
2,7,1969,1.0,956110,196718,759392,43.0,2089.28,0,0,...,1,1,0,0,0,0,1,0.0,0.0,0.0
3,7,1926,1.0,61320,0,61320,56.0,286.43,0,0,...,1,1,0,0,0,0,1,0.0,0.0,0.0
4,7,1980,1.0,175580,62000,113580,75.0,505.01,0,1,...,1,1,0,0,0,0,1,0.0,0.0,0.0


In [8]:
# Separate the feature matrix from the target column named by TARGET.
y = building_consumption[TARGET]
X = building_consumption.drop(columns=[TARGET])

# Preprocessing: per the original author, every feature is already numeric and
# encoded, so standard scaling is the only transformation applied.
preprocessor = StandardScaler()

# RandomForest pipeline: scaling step followed by a seeded forest.
pipeline_rf = Pipeline(
    steps=[
        ("scaler", preprocessor),
        ("model", RandomForestRegressor(random_state=42)),
    ]
)

# Hyperparameters to explore; the "model__" prefix routes each one to the
# pipeline step named "model". 3 x 3 x 3 = 27 candidate combinations.
param_grid = dict(
    model__n_estimators=[100, 300, 500],  # number of trees
    model__max_depth=[None, 10, 20],  # maximum tree depth
    model__min_samples_split=[2, 5, 10],  # min samples required to split a node
)

# Exhaustive search scored by R² with 5-fold cross-validation, on all cores.
grid_search = GridSearchCV(
    estimator=pipeline_rf,
    param_grid=param_grid,
    scoring="r2",
    cv=5,
    n_jobs=-1,
    verbose=1,
)

# Run the search on the full dataset.
grid_search.fit(X, y)

# Report the winning combination and its mean cross-validated R².
print("Meilleurs hyperparamètres :", grid_search.best_params_)
print("Meilleur R² :", grid_search.best_score_)

# Feature importance: the selected model is a tree ensemble, so read the
# importances straight from the RandomForest step of the best pipeline found
# by the grid search.
best_model = grid_search.best_estimator_.named_steps["model"]
importances = best_model.feature_importances_
# The scaler keeps one output per input column, so X's column order lines up
# with the importances array.
feature_names = X.columns

# One row per feature, most important first.
feat_imp_df = pd.DataFrame(
    {"feature": feature_names, "importance": importances}
).sort_values(by="importance", ascending=False)

feat_imp_df.head(10)  # top 10 features

Fitting 5 folds for each of 27 candidates, totalling 135 fits
Meilleurs hyperparamètres : {'model__max_depth': None, 'model__min_samples_split': 2, 'model__n_estimators': 500}
Meilleur R² : 0.44720489103941696
