# Modélisation 

### Import des modules 

In [17]:
import numpy as np

# Selection
from sklearn.model_selection import (
    train_test_split,
    GridSearchCV, 
    cross_validate,
)
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error 
from sklearn.inspection import permutation_importance

# Preprocess
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline

# Modèles
from sklearn.dummy import DummyRegressor
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor

import sys
import importlib

sys.path.append("..")
import config
importlib.reload(config)  # Ensure we get the latest TARGET value
from config import TARGET

In [18]:
import pandas as pd
building_consumption = pd.read_csv("../assets/building_consumption_cleaned.csv")
building_consumption.head()

Unnamed: 0,CouncilDistrictCode,YearBuilt,NumberofBuildings,PropertyGFATotal,PropertyGFAParking,PropertyGFABuilding(s),ENERGYSTARScore,TotalGHGEmissions,Office,Parking,...,NaturalGas,Electricity,PrimaryPropertyType_0,PrimaryPropertyType_1,PrimaryPropertyType_2,PrimaryPropertyType_3,PrimaryPropertyType_4,BuildingType_Nonresidential COS,BuildingType_Nonresidential WA,BuildingType_SPS-District K-12
0,7,1927,1.0,88434,0,88434,60.0,249.98,0,0,...,1,1,0,0,0,0,1,0.0,0.0,0.0
1,7,1996,1.0,103566,15064,88502,61.0,295.86,0,1,...,1,1,0,0,0,0,1,0.0,0.0,0.0
2,7,1969,1.0,956110,196718,759392,43.0,2089.28,0,0,...,1,1,0,0,0,0,1,0.0,0.0,0.0
3,7,1926,1.0,61320,0,61320,56.0,286.43,0,0,...,1,1,0,0,0,0,1,0.0,0.0,0.0
4,7,1980,1.0,175580,62000,113580,75.0,505.01,0,1,...,1,1,0,0,0,0,1,0.0,0.0,0.0


### Comparaison de différents modèles supervisés

A réaliser :
* Pour chaque algorithme que vous allez tester, vous devez :
    * Réaliser au préalable une séparation en jeu d'apprentissage et jeu de test via une validation croisée.
    * Si les features quantitatives que vous souhaitez utiliser ont des ordres de grandeur très différents les uns des autres, et que vous utilisez un algorithme de regression qui est sensible à cette différence, alors il faut réaliser un scaling (normalisation) de la donnée au préalable.
    * Entrainer le modèle sur le jeu de Train
    * Prédire la cible sur la donnée de test (nous appelons cette étape, l'inférence).
    * Calculer les métriques de performance R2, MAE et RMSE sur le jeu de train et de test.
    * Interpréter les résultats pour juger de la fiabilité de l'algorithme.
* Vous pouvez choisir par exemple de tester un modèle linéaire, un modèle à base d'arbres et un modèle de type SVM
* Déterminer le modèle le plus performant parmi ceux testés.

### Modèle linéaire - LinearRegression

In [19]:
# Select features (excluding the target column)
features_df = building_consumption.drop(columns=[TARGET])

numerical_features = features_df.select_dtypes(include="number").columns.tolist()
categorical_features = features_df.select_dtypes(exclude="number").columns.tolist()

print("Numerical features:", len(numerical_features))
print("Categorical features:", len(categorical_features))
print("Total features:", len(numerical_features) + len(categorical_features))
print("Total columns (excluding target):", len(features_df.columns))
print("Match:", len(features_df.columns) == len(numerical_features) + len(categorical_features))

print("\nNumerical features:", numerical_features)
print("\nCategorical features:", categorical_features)

Numerical features: 29
Categorical features: 0
Total features: 29
Total columns (excluding target): 29
Match: True

Numerical features: ['CouncilDistrictCode', 'YearBuilt', 'NumberofBuildings', 'PropertyGFATotal', 'PropertyGFAParking', 'PropertyGFABuilding(s)', 'ENERGYSTARScore', 'Office', 'Parking', 'Non-Refrigerated Warehouse', 'Hotel', 'K-12 School', 'Retail Store', 'Other', 'Hospital (General Medical & Surgical)', 'Medical Office', 'Other - Entertainment/Public Assembly', 'NumberofFloors_quintile', 'SteamUse', 'NaturalGas', 'Electricity', 'PrimaryPropertyType_0', 'PrimaryPropertyType_1', 'PrimaryPropertyType_2', 'PrimaryPropertyType_3', 'PrimaryPropertyType_4', 'BuildingType_Nonresidential COS', 'BuildingType_Nonresidential WA', 'BuildingType_SPS-District K-12']

Categorical features: []


**Note importante:** Pour une comparaison équitable, tous les modèles utilisent la même stratégie de validation croisée avec `KFold(n_splits=5, shuffle=True, random_state=42)`. Cela garantit que chaque modèle est évalué sur exactement les mêmes divisions train/test.

### Configuration commune pour l'évaluation des modèles

In [20]:
# Configuration commune pour tous les modèles
from sklearn.model_selection import KFold

# Stratégie de validation croisée commune pour tous les modèles
cv_strategy = KFold(n_splits=5, shuffle=True, random_state=42)

# Métriques d'évaluation communes
scoring = ["r2", "neg_mean_squared_error", "neg_mean_absolute_error"]

# Données communes
X = building_consumption.drop(columns=[TARGET])
y = building_consumption[TARGET]

def evaluate_model(pipeline, model_name):
    """
    Évalue un modèle avec la stratégie de validation croisée commune
    
    Args:
        pipeline: Le pipeline sklearn à évaluer
        model_name: Nom du modèle pour l'affichage
        
    Returns:
        dict: Résultats de la validation croisée
    """
    cv_results = cross_validate(
        pipeline,
        X,
        y,
        cv=cv_strategy,
        scoring=scoring,
        return_train_score=True,
    )
    
    print(f"=== {model_name} Results ===")
    print(f"{'Metric':<12} {'Train Mean':<12} {'Train Std':<12} {'Test Mean':<12} {'Test Std':<12}")
    print("-" * 65)
    
    # Affichage des résultats moyens avec format tabulaire
    for metric in scoring:
        train_scores = cv_results[f"train_{metric}"]
        test_scores = cv_results[f"test_{metric}"]
        # Inverser les scores négatifs pour MAE et RMSE
        if "neg" in metric:
            train_scores = -train_scores
            test_scores = -test_scores
            metric_name = metric.replace("neg_", "").replace("_", " ").upper()
        else:
            metric_name = metric.upper()
        
        print(f"{metric_name:<12} {train_scores.mean():<12.3f} {train_scores.std():<12.3f} {test_scores.mean():<12.3f} {test_scores.std():<12.3f}")
    
    # Calculate RMSE from MSE
    rmse_train = np.sqrt(-cv_results["train_neg_mean_squared_error"])
    rmse_test = np.sqrt(-cv_results["test_neg_mean_squared_error"])
    print(f"{'RMSE':<12} {rmse_train.mean():<12.3f} {rmse_train.std():<12.3f} {rmse_test.mean():<12.3f} {rmse_test.std():<12.3f}")
    print()  # Ligne vide pour séparation
    
    return cv_results

In [21]:
# Since all features are numerical (categorical ones were already encoded) 
# and missing values have been cleaned in the data preparation notebook,
# we only need to scale the numerical features
if len(categorical_features) > 0:
    preprocessor = ColumnTransformer(
        transformers=[
            ("num", StandardScaler(), numerical_features),
            ("cat", "passthrough", categorical_features),
        ]
    )
else:
    # All features are numerical and clean, just apply scaling
    preprocessor = StandardScaler()

pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', LinearRegression())
])

# Évaluation avec la fonction commune
cv_results = evaluate_model(pipeline, "LinearRegression")

=== LinearRegression Results ===
Metric       Train Mean   Train Std    Test Mean    Test Std    
-----------------------------------------------------------------
R2           0.669        0.023        0.462        0.239       
MEAN SQUARED ERROR 101593.731   11978.043    119313.590   54409.330   
MEAN ABSOLUTE ERROR 131.797      5.604        136.572      13.312      
RMSE         318.188      18.708       334.459      86.317      



### Modèle SVR (Support Vector Regressor)

In [22]:
# SVR is sensitive to feature scaling, so we need StandardScaler
pipeline_svr = Pipeline(steps=[
    ('preprocessor', StandardScaler()),
    ('model', SVR(kernel='rbf', C=1.0, gamma='scale'))
])

# Évaluation avec la fonction commune
cv_results_svr = evaluate_model(pipeline_svr, "SVR")

=== SVR Results ===
Metric       Train Mean   Train Std    Test Mean    Test Std    
-----------------------------------------------------------------
R2           -0.021       0.001        -0.027       0.007       
MEAN SQUARED ERROR 316189.596   53986.922    316423.122   216203.202  
MEAN ABSOLUTE ERROR 129.813      5.567        130.429      22.476      
RMSE         560.191      48.741       519.809      214.993     



### Modèle RandomForestRegressor

In [None]:
# Random Forest doesn't require feature scaling, but let's use it for consistency
pipeline_rf = Pipeline(steps=[
    ('preprocessor', StandardScaler()),
    ('model', RandomForestRegressor(n_estimators=100, random_state=42))
])

# Évaluation avec la fonction commune
cv_results_rf = evaluate_model(pipeline_rf, "Random Forest")

=== Random Forest Results ===
Metric       Train Mean   Train Std    Test Mean    Test Std    
-----------------------------------------------------------------
R2           0.946        0.017        0.570        0.132       
MEAN SQUARED ERROR 16393.010    5083.791     110320.963   66287.494   
MEAN ABSOLUTE ERROR 37.204       1.490        101.621      16.495      
RMSE         126.627      18.934       314.320      107.351     



### Comparaison des trois modèles

In [24]:
# Create comparison table
import pandas as pd

# Extract results from cross-validation
models_comparison = pd.DataFrame({
    'Model': ['LinearRegression', 'SVR', 'RandomForest'],
    'R2_Train': [
        cv_results["train_r2"].mean(),
        cv_results_svr["train_r2"].mean(),
        cv_results_rf["train_r2"].mean()
    ],
    'R2_Test': [
        cv_results["test_r2"].mean(),
        cv_results_svr["test_r2"].mean(),
        cv_results_rf["test_r2"].mean()
    ],
    'MAE_Train': [
        (-cv_results["train_neg_mean_absolute_error"]).mean(),
        (-cv_results_svr["train_neg_mean_absolute_error"]).mean(),
        (-cv_results_rf["train_neg_mean_absolute_error"]).mean()
    ],
    'MAE_Test': [
        (-cv_results["test_neg_mean_absolute_error"]).mean(),
        (-cv_results_svr["test_neg_mean_absolute_error"]).mean(),
        (-cv_results_rf["test_neg_mean_absolute_error"]).mean()
    ],
    'RMSE_Train': [
        np.sqrt(-cv_results["train_neg_mean_squared_error"]).mean(),
        np.sqrt(-cv_results_svr["train_neg_mean_squared_error"]).mean(),
        np.sqrt(-cv_results_rf["train_neg_mean_squared_error"]).mean()
    ],
    'RMSE_Test': [
        np.sqrt(-cv_results["test_neg_mean_squared_error"]).mean(),
        np.sqrt(-cv_results_svr["test_neg_mean_squared_error"]).mean(),
        np.sqrt(-cv_results_rf["test_neg_mean_squared_error"]).mean()
    ]
})

# Format the table for better readability
models_comparison = models_comparison.round(3)
print("=== COMPARAISON DES MODÈLES ===")
print(models_comparison.to_string(index=False))

# Find best model for each metric
print("\n=== MEILLEUR MODÈLE PAR MÉTRIQUE ===")
print(f"Meilleur R² Test: {models_comparison.loc[models_comparison['R2_Test'].idxmax(), 'Model']} (R² = {models_comparison['R2_Test'].max():.3f})")
print(f"Meilleur MAE Test: {models_comparison.loc[models_comparison['MAE_Test'].idxmin(), 'Model']} (MAE = {models_comparison['MAE_Test'].min():.0f})")
print(f"Meilleur RMSE Test: {models_comparison.loc[models_comparison['RMSE_Test'].idxmin(), 'Model']} (RMSE = {models_comparison['RMSE_Test'].min():.0f})")

# Calculate overfitting indicator (difference between train and test R²)
models_comparison['Overfitting'] = models_comparison['R2_Train'] - models_comparison['R2_Test']
print(f"\n=== ANALYSE DU SURAPPRENTISSAGE (Train R² - Test R²) ===")
for i, row in models_comparison.iterrows():
    print(f"{row['Model']}: {row['Overfitting']:.3f}")
    
print(f"\nModèle le moins sujet au surapprentissage: {models_comparison.loc[models_comparison['Overfitting'].idxmin(), 'Model']}")

=== COMPARAISON DES MODÈLES ===
           Model  R2_Train  R2_Test  MAE_Train  MAE_Test  RMSE_Train  RMSE_Test
LinearRegression     0.669    0.462    131.797   136.572     318.188    334.459
             SVR    -0.021   -0.027    129.813   130.429     560.191    519.809
    RandomForest     0.946    0.570     37.204   101.621     126.627    314.320

=== MEILLEUR MODÈLE PAR MÉTRIQUE ===
Meilleur R² Test: RandomForest (R² = 0.570)
Meilleur MAE Test: RandomForest (MAE = 102)
Meilleur RMSE Test: RandomForest (RMSE = 314)

=== ANALYSE DU SURAPPRENTISSAGE (Train R² - Test R²) ===
LinearRegression: 0.207
SVR: 0.006
RandomForest: 0.376

Modèle le moins sujet au surapprentissage: SVR
