# Analyse Exploratoire

### Import des modules

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns 
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.model_selection import train_test_split, GridSearchCV, cross_validate
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.preprocessing import StandardScaler
import category_encoders as ce

from sklearn.inspection import permutation_importance

#Preprocess
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler

#Modèles
from sklearn.dummy import DummyRegressor
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor

In [2]:
# Let pandas render up to 100 columns and 100 rows before truncating a display.
pd.options.display.max_columns = 100
pd.options.display.max_rows = 100

In [3]:
# Render floats with a thousands separator and three decimals (e.g. 1,234.500).
pd.set_option('display.float_format', '{:,.3f}'.format)

## Observation des données

In [5]:
# Load the raw benchmarking export. The path is relative, so it is resolved
# against the kernel's current working directory.
# NOTE(review): the saved run of this cell failed with FileNotFoundError, so
# the CSV was not present in the working directory at that time.
building_consumption = pd.read_csv("2016_Building_Energy_Benchmarking.csv")

FileNotFoundError: [Errno 2] No such file or directory: '2016_Building_Energy_Benchmarking.csv'

In [None]:
# Restrict the study to rows whose BuildingType is exactly "NonResidential".
is_non_residential = building_consumption["BuildingType"].eq("NonResidential")
building_consumption = building_consumption.loc[is_non_residential]

In [None]:
# Display the filtered frame; pandas truncates the rendering according to the
# display.max_rows / display.max_columns options set above.
building_consumption

In [None]:
# Print the per-column dtype and non-null count summary.
building_consumption.info()

In [None]:
# (number of rows, number of columns) after the NonResidential filter.
building_consumption.shape

## Gestion des données manquantes

In [None]:
# Fraction (0-1) of missing values in each column.
building_consumption.isna().mean()

In [None]:
# Percentage of missing values per column (two decimals), most incomplete first.
missing_pct = building_consumption.isnull().mean().round(4).mul(100)
buildings_na = missing_pct.sort_values(ascending=False).to_frame(name='Percentage Missing')
buildings_na

In [None]:
# Drop two columns that are not used anywhere else in this notebook
# (presumably because they are mostly empty - confirm against the
# missing-value table above).
# The name is re-bound to a new frame instead of using `inplace=True`, so the
# frame object that an earlier cell displayed is not silently mutated.
building_consumption = building_consumption.drop(
    columns=['Comments', 'YearsENERGYSTARCertified']
)

## Statistiques descriptives

In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) of the numeric columns.
building_consumption.describe()

In [None]:
# Summary statistics of the column later used as the prediction target.
building_consumption['SiteEnergyUse(kBtu)'].describe()

## Gestion des OUTLIERS

In [None]:
# Keep only rows with a strictly positive site energy use; rows where the
# value is missing fail the comparison and are removed as well.
has_positive_energy = building_consumption['SiteEnergyUse(kBtu)'].gt(0)
building_consumption = building_consumption.loc[has_positive_energy]

In [None]:
# Remove a single isolated outlier, identified by its building ID.
is_isolated_outlier = building_consumption['OSEBuildingID'].eq(25772)
building_consumption = building_consumption.loc[~is_isolated_outlier]


In [None]:
# Energy use intensity: site energy use divided by the total floor area.
site_energy = building_consumption['SiteEnergyUse(kBtu)']
total_floor_area = building_consumption['PropertyGFATotal']
building_consumption['EUI'] = site_energy.div(total_floor_area)

In [None]:
# Despite the names, q1 and q99 are the 1st and 99th percentiles of EUI
# (not quartiles).
eui_series = building_consumption['EUI']
q1, q99 = (eui_series.quantile(level) for level in (0.01, 0.99))

In [None]:
# Label every row by its EUI: 'bas' below the 1st percentile, 'haut' above the
# 99th percentile, 'normal' otherwise (including rows with a missing EUI).
eui_values = building_consumption['EUI']
eui_labels = pd.Series('normal', index=building_consumption.index)
eui_labels = eui_labels.mask(eui_values < q1, 'bas').mask(eui_values > q99, 'haut')
building_consumption['EUI_outlier'] = eui_labels

In [None]:
# Split out the two tails for inspection.
outlier_label = building_consumption['EUI_outlier']

# Rows labelled as high-EUI outliers.
outliers_haut = building_consumption.loc[outlier_label.eq('haut')]

# Rows labelled as low-EUI outliers.
outliers_bas = building_consumption.loc[outlier_label.eq('bas')]


In [None]:
# Upper EUI cut-off: keep rows with EUI <= 500. The threshold is a hard-coded
# literal, not the q99 value computed earlier.
eui_within_upper_bound = building_consumption['EUI'].le(500)
building_consumption = building_consumption.loc[eui_within_upper_bound]

In [None]:
# Lower EUI cut-off: drop rows with EUI strictly below 3.70 (a row at exactly
# 3.70 is kept). The threshold is a hard-coded literal, not the q1 value
# computed earlier.
eui_within_lower_bound = building_consumption['EUI'].ge(3.70)
building_consumption = building_consumption.loc[eui_within_lower_bound]

In [None]:
# Rows still present after the cleaning above whose `Outlier` column holds a
# string, i.e. rows already flagged as outliers in the source data.
has_outlier_flag = building_consumption["Outlier"].map(lambda value: isinstance(value, str))
df_outlier_str = building_consumption[has_outlier_flag]

In [None]:
# Remove every building whose ID appears among the rows flagged in the
# `Outlier` column.
flagged_ids = df_outlier_str['OSEBuildingID']
is_flagged_building = building_consumption['OSEBuildingID'].isin(flagged_ids)
building_consumption = building_consumption.loc[~is_flagged_building]
# Author's note: 1426 rows remained at this point in the original run.


In [None]:
# The flag column has served its purpose once the flagged rows are removed.
building_consumption = building_consumption.drop(columns='Outlier')

In [None]:
# Drop rows reporting zero buildings or zero floors.
# Only exact zeros are removed: a missing value passes the "not equal to 0"
# test, so rows with a missing count are kept.
counts_are_nonzero = (
    building_consumption[['NumberofBuildings', 'NumberofFloors']].ne(0).all(axis=1)
)
building_consumption = building_consumption.loc[counts_are_nonzero]

In [None]:
# Drop rows whose compliance status marks them as default data
# (per the author: values filled in automatically by the city).
is_default_data = building_consumption['ComplianceStatus'].eq('Error - Correct Default Data')
building_consumption = building_consumption.loc[~is_default_data]

## Visualisation de la relation entre les variables 

In [None]:
# Single-use vs multi-use comparison.

def _usage_label(use_types):
    """Return "Multi-usage" for a string containing a comma, else "Mono-usage".

    Non-string values (e.g. missing entries) are labelled "Mono-usage".
    """
    if isinstance(use_types, str) and "," in use_types:
        return "Multi-usage"
    return "Mono-usage"


building_consumption["UsageType"] = building_consumption["ListOfAllPropertyUseTypes"].apply(_usage_label)

building_consumption["UsageType"].value_counts()

In [None]:
# Histogram of the target variable (50 bins).
hist_ax = sns.histplot(data=building_consumption, x='SiteEnergyUse(kBtu)', bins=50)
hist_ax.set_title("Distribution de la consommation d'énergie")
plt.show()

In [None]:
# Total floor area against site energy use.
building_consumption.plot(kind='scatter', x='PropertyGFATotal', y='SiteEnergyUse(kBtu)')

In [None]:
# Property size (total floor area) for single-use vs multi-use buildings.
sns.boxplot(data=building_consumption, x="UsageType", y="PropertyGFATotal")

In [None]:
# Site energy use for single-use vs multi-use buildings.
usage_ax = sns.boxplot(data=building_consumption, x="UsageType", y="SiteEnergyUse(kBtu)")
usage_ax.set_title("Consommation d'énergie par type d’usage")
usage_ax.set_xlabel("Type d’usage")
usage_ax.set_ylabel("Consommation d’énergie (kBtu)")
plt.show()

In [None]:
# Site energy use per primary property type.
type_ax = sns.boxplot(data=building_consumption, x="PrimaryPropertyType", y="SiteEnergyUse(kBtu)")
# Category names are long, so the tick labels are drawn vertically.
plt.setp(type_ax.get_xticklabels(), rotation=90)
plt.show()

## Suppression des variables constantes et peu utiles

In [None]:
# List the columns holding at most one distinct non-missing value.
distinct_counts = building_consumption.nunique()
distinct_counts[distinct_counts <= 1].index.tolist()


In [None]:
# Remove the columns judged constant or of little use for the analysis.
building_no_out = building_consumption.drop(
    columns=[
        'DataYear',
        'BuildingType',
        'City',
        'State',
        'DefaultData',
        'ComplianceStatus',
    ]
)

In [None]:
# Display the cleaned frame; the rendering is truncated according to the
# pandas display options set at the top of the notebook.
building_no_out

# Préparation à la Modélisation 

### Feature Engineering

In [None]:
# CODE FEATURE ENGINEERING

In [None]:
# Binary flag: 1 when the building lists several use types (a string that
# contains a comma), 0 otherwise (single use type or missing value).
# It is stored as an integer rather than the strings "1"/"0" so the column is
# a genuine numeric feature and the model does not depend on an implicit
# string-to-float conversion.
building_no_out["UsageCount"] = (
    building_no_out["ListOfAllPropertyUseTypes"]
    .apply(lambda x: isinstance(x, str) and "," in x)
    .astype(int)
)


In [None]:
# Group the rare primary property types: every type with fewer than 50 rows
# is relabelled "Autre".
type_counts = building_no_out["PrimaryPropertyType"].value_counts()
rare_types = type_counts.loc[type_counts.lt(50)].index

building_no_out["PropertyTypeGrouped"] = building_no_out["PrimaryPropertyType"].replace(rare_types, "Autre")


In [None]:
# Boolean flags telling whether each energy source shows a strictly positive
# reported consumption (a missing consumption yields False).
energy_flag_sources = {
    "HasElectricity": "Electricity(kWh)",
    "HasGas": "NaturalGas(kBtu)",
    "HasSteam": "SteamUse(kBtu)",
}
for flag_col, source_col in energy_flag_sources.items():
    building_no_out[flag_col] = building_no_out[source_col].gt(0)

In [None]:
# True when the property reports a strictly positive parking floor area.
building_no_out["HasParking"] = building_no_out["PropertyGFAParking"].gt(0)

In [None]:
# True when the total floor area exceeds 400,000 (hard-coded threshold).
building_no_out['IsLarge'] = building_no_out['PropertyGFATotal'].gt(400000)

In [None]:
# True for buildings constructed in 2010 or later (hard-coded threshold).
building_no_out["IsRecent"] = building_no_out["YearBuilt"].ge(2010)


In [None]:
# Building age in years at the time of the report.
# The data come from the 2016 benchmarking export (see the CSV file name), so
# 2016 is used as the reference year; the previous literal 2015 presumably
# made every age one year too small. The Age column is standardised before
# modelling, so this constant shift does not change the fitted model.
DATA_YEAR = 2016
building_no_out["Age"] = DATA_YEAR - building_no_out["YearBuilt"]

### Suppression des variables peu pertinentes

In [None]:
# Columns removed before modelling: identifiers, location fields, detailed
# use-type breakdowns and raw columns already summarised by the engineered
# flags. Each label appears exactly once (the earlier list repeated
# "LargestPropertyUseType").
cols_to_drop = [
    "OSEBuildingID",
    "ZipCode",
    "CouncilDistrictCode",
    "Latitude",
    "Longitude",
    "PropertyGFAParking",
    "PropertyGFABuilding(s)",
    "LargestPropertyUseType",
    "SecondLargestPropertyUseType",
    "SecondLargestPropertyUseTypeGFA",
    "ThirdLargestPropertyUseType",
    "ThirdLargestPropertyUseTypeGFA",
    "ENERGYSTARScore",
    "NaturalGas(therms)",
    "GHGEmissionsIntensity",
    "Electricity(kWh)"
]

building_no_out2 = building_no_out.drop(columns=cols_to_drop)


## Matrice de corrélation

In [None]:
# Pairwise Pearson correlations between the numeric columns only.
building_corr = building_no_out2.corr(
    method='pearson',
    min_periods=1,
    numeric_only=True,
)
building_corr

In [None]:
# Annotated heatmap of the correlation matrix.
corr_fig, corr_ax = plt.subplots(figsize=(12, 10))
sns.heatmap(
    building_corr,
    annot=True,
    fmt=".2f",
    cmap="coolwarm",
    annot_kws={"size": 8},
    ax=corr_ax,
)
# Slant the column labels so they stay readable; keep row labels horizontal.
plt.setp(corr_ax.get_xticklabels(), rotation=45, ha='right')
plt.setp(corr_ax.get_yticklabels(), rotation=0)
corr_fig.tight_layout()
plt.show()


## Suppression des features redondantes

In [None]:
# Second round of column removal, done after inspecting the correlation matrix.
# The list mixes energy/emission measurements, free-text identifiers and
# helper columns created earlier in this notebook (UsageType, EUI_outlier,
# EUI); EUI in particular is computed directly from the target.
cols_to_drop3 = [
    "SiteEUI(kBtu/sf)",
    "SiteEUIWN(kBtu/sf)",
    "SourceEUI(kBtu/sf)",
    "LargestPropertyUseTypeGFA",
    "Electricity(kBtu)",
    "SiteEnergyUseWN(kBtu)",
    "TotalGHGEmissions",
    "PropertyName",
    "Address",
    "TaxParcelIdentificationNumber",
    "UsageType",
    "ListOfAllPropertyUseTypes",
    "EUI_outlier",
    "SourceEUIWN(kBtu/sf)",
    "SteamUse(kBtu)",
    "NaturalGas(kBtu)",
    "EUI",
]

building_no_out3 = building_no_out2.drop(cols_to_drop3, axis=1)


# Test du modèle

## Séparation du Dataframe en dfX(features) + seriesY(target)

In [None]:
# Target: the site energy use column.
buildY = building_no_out3.loc[:, 'SiteEnergyUse(kBtu)'].squeeze()

In [None]:
# Features: every remaining column except the target.
buildX = building_no_out3.drop(columns=['SiteEnergyUse(kBtu)'])

## Entrainement et Test du modèle Random Forest

In [None]:
# Hold out 20 % of the rows for the final evaluation (fixed seed).
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    buildX,
    buildY,
    test_size=0.2,
    random_state=42,
)

# === Categorical encoding ===
# One binary encoder per categorical column, applied one after the other.
# Each encoder is fitted on the training split only, then reused on the test split.
encoder1, encoder2, encoder3 = (
    ce.BinaryEncoder(cols=[categorical_col])
    for categorical_col in ("PrimaryPropertyType", "Neighborhood", "PropertyTypeGrouped")
)
X_train_enc, X_test_enc = X_train_raw, X_test_raw
for _encoder in (encoder1, encoder2, encoder3):
    X_train_enc = _encoder.fit_transform(X_train_enc)
    X_test_enc = _encoder.transform(X_test_enc)

# === Standardisation of the numeric columns ===
cols_to_scale = ['NumberofBuildings', 'NumberofFloors', 'PropertyGFATotal', 'Age']
cols_passthrough = [
    col for col in X_train_enc.columns
    if col not in cols_to_scale
]

# The scaler statistics come from the training split only.
scaler = StandardScaler()
X_train_scaled_num = pd.DataFrame(
    scaler.fit_transform(X_train_enc[cols_to_scale]),
    columns=cols_to_scale,
    index=X_train_enc.index,
)
X_test_scaled_num = pd.DataFrame(
    scaler.transform(X_test_enc[cols_to_scale]),
    columns=cols_to_scale,
    index=X_test_enc.index,
)

# Scaled numeric columns first, then every other column unchanged.
X_train_final = pd.concat([X_train_scaled_num, X_train_enc[cols_passthrough]], axis=1)
X_test_final = pd.concat([X_test_scaled_num, X_test_enc[cols_passthrough]], axis=1)

### 4. RANDOM FOREST REGRESSOR ###
# Exhaustive search over a small grid, scored by R² with 5-fold cross-validation.
param_grid_rf = {
    'n_estimators': [100, 200],
    'max_depth': [10, 20],
    'min_samples_split': [2, 5],
}
grid_rf = GridSearchCV(
    estimator=RandomForestRegressor(random_state=42),
    param_grid=param_grid_rf,
    cv=5,
    scoring='r2',
)
grid_rf.fit(X_train_final, y_train)
best_rf = grid_rf.best_estimator_
y_pred_rf_train = best_rf.predict(X_train_final)
y_pred_rf_test = best_rf.predict(X_test_final)

# Report the same three metrics on the test split, then on the training split.
print("\n=== RANDOM FOREST REGRESSOR ===")
print("Best params:", grid_rf.best_params_)
for _split_label, _y_true, _y_hat in (
    ("TEST", y_test, y_pred_rf_test),
    ("TRAIN", y_train, y_pred_rf_train),
):
    print(f"{_split_label} R²:", r2_score(_y_true, _y_hat))
    print(f"{_split_label} MAE:", mean_absolute_error(_y_true, _y_hat))
    print(f"{_split_label} RMSE:", np.sqrt(mean_squared_error(_y_true, _y_hat)))


## Interprétation des résultats

In [None]:
# === Feature importances of the fitted Random Forest ===
importances = best_rf.feature_importances_
# Must be the columns of the exact matrix the model was trained on.
feature_names = X_train_final.columns

# One row per feature, most important first.
feature_imp_df = (
    pd.DataFrame({'Feature': feature_names, 'Importance': importances})
    .sort_values(by='Importance', ascending=False)
)

# Show the table.
print(feature_imp_df)


Les trois variables les plus importantes pour le modèle Random Forest sont le nombre de bâtiments, la surface totale de la propriété et le type de propriété principal. À elles seules, elles représentent environ 83 % de l’importance totale des features, ce qui indique qu’elles jouent un rôle majeur dans la prédiction de la consommation énergétique.