In [1]:
# Import the libraries needed for the exploration.
import warnings
# Silence every warning for the rest of the session; note that this also hides
# deprecation and convergence warnings raised by the libraries used below.
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

In [2]:
# Ask scikit-learn to render estimators as diagrams when they are displayed in a cell.
sklearn.set_config(display="diagram")

In [3]:
def regression_metrics(y_test, y_pred):
    """Print the usual regression metrics for a set of predictions.

    Parameters
    ----------
    y_test : array-like
        True target values.
    y_pred : array-like
        Predicted target values, compatible in shape with ``y_test``.

    Returns
    -------
    None
        The MAE, MSE, RMSE and R² score are printed, each rounded to
        3 decimals; nothing is returned.
    """
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    rmse = mse ** 0.5
    r_score = r2_score(y_test, y_pred)
    # Convert to float and use the builtin round() instead of the NumPy-only
    # ``.round()`` method: the metric functions are not guaranteed to return
    # NumPy scalars, and a plain Python float has no ``.round`` attribute.
    print("MAE :", round(float(mae), 3))
    print("MSE :", round(float(mse), 3))
    print("RMSE :", round(float(rmse), 3))
    print("R² :", round(float(r_score), 3))

In [4]:
# Load the benchmark dataset; the path is relative to the notebook's working directory.
# NOTE(review): the preview of this frame shows an "Unnamed: 0" column, presumably an
# index that was written to the CSV — confirm whether it should be read as the index.
data = pd.read_csv("datasets/benchmark_total.csv")

In [5]:
# Preview the first three rows to check the available columns and sample values.
data.head(3)

Unnamed: 0,PrimaryPropertyType,Neighborhood,YearBuilt,NumberofBuildings,NumberofFloors,PropertyGFATotal,ENERGYSTARScore,SiteEnergyUse(kBtu),TotalGHGEmissions
0,Hotel,Downtown,1927,1.0,12.0,88434,65.0,6981428.0,249.43
1,Hotel,Downtown,1996,1.0,11.0,103566,51.0,8354235.0,263.51
2,Hotel,Downtown,1969,1.0,41.0,961990,18.0,73130656.0,2061.48


In [6]:
# Feature table: every column except the two regression targets.
X = data.drop(columns=["SiteEnergyUse(kBtu)", "TotalGHGEmissions"])
# Targets as NumPy arrays: site energy use and total greenhouse-gas emissions.
y_energy = data["SiteEnergyUse(kBtu)"].to_numpy()
y_ghg = data["TotalGHGEmissions"].to_numpy()

In [7]:
# Log-transform both targets with log1p, i.e. log(1 + y), which is defined at y = 0.
# Each transform reads the raw column from `data` rather than overwriting its own
# input, so re-running this cell cannot apply the logarithm twice.
y_energy = np.log1p(data["SiteEnergyUse(kBtu)"].values)
y_ghg = np.log1p(data["TotalGHGEmissions"].values)

In [8]:
# Hold out 20% of the rows as a test set; the fixed seed keeps the split reproducible.
# Only the (log-transformed) energy target is split here.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_energy,
    test_size=0.2,
    random_state=42,
)

In [9]:
# Turn both target vectors into single-column 2-D arrays of shape (n_samples, 1).
y_train, y_test = (np.reshape(target, (-1, 1)) for target in (y_train, y_test))

# Création d'un modèle "baseline"

## Préparation des données

In [10]:
# Column groups used to route features through the preprocessing steps.
# Columns that appear in neither list (e.g. "ENERGYSTARScore") are not used below.

# Categorical features.
cat_var = [
    "PrimaryPropertyType",
    "Neighborhood",
]

# Numerical features.
num_var = [
    "YearBuilt",
    "NumberofBuildings",
    "NumberofFloors",
    "PropertyGFATotal",
]

In [11]:
from sklearn.impute import SimpleImputer


# Imputer for the categorical columns: missing entries are replaced by the most
# frequent value of each column, learned from the training split only.
imputer_cat = SimpleImputer(strategy="most_frequent")
imputer_cat.fit(X_train.loc[:, cat_var])

In [12]:
# Imputer for the numerical columns: missing entries are replaced by the median
# of each column, learned from the (unscaled) training split only.
imputer_num = SimpleImputer(strategy="median")
imputer_num.fit(X_train.loc[:, num_var])

In [13]:
from sklearn.preprocessing import StandardScaler

# Standardizer for the numerical columns, fitted on the training split only so
# that no statistics from the test split leak into preprocessing.
scaler = StandardScaler()
scaler.fit(X_train.loc[:, num_var])

In [14]:
# Apply the fitted preprocessing to the training split.
# Impute BEFORE scaling: imputer_num learned its medians on the unscaled columns,
# so its fill values are on the raw scale and must be inserted before the data is
# standardized; otherwise raw-scale medians end up inside standardized columns.
X_train[num_var] = imputer_num.transform(X_train[num_var])
X_train[num_var] = scaler.transform(X_train[num_var])
# Fill missing categories with the most frequent value seen in training.
X_train[cat_var] = imputer_cat.transform(X_train[cat_var])

### Encodage des variables catégorielles

In [15]:
from sklearn.preprocessing import OneHotEncoder

# One-hot encode the categorical columns. handle_unknown="ignore" makes a category
# that is absent from the training split encode as all zeros at transform time
# instead of raising an error (the default behaviour).
enc = OneHotEncoder(handle_unknown="ignore")
enc.fit(X_train[cat_var])
# NOTE(review): X_train is replaced by the encoded categorical columns only, so the
# imputed and scaled num_var columns are not part of the model input — confirm
# that this is intended for the baseline.
X_train = enc.transform(X_train[cat_var])

## Création du modèle

In [16]:
from sklearn.linear_model import LinearRegression

# Baseline model: linear regression with default settings, fitted on the
# encoded training features and the single-column training target.
lin_reg = LinearRegression()
lin_reg.fit(X_train, y_train)

## Évaluation du modèle avec le jeu de test

In [17]:
# Apply the preprocessing fitted on the training split to the test split (no refit).
# Numerical columns: impute with the training medians, then standardize with the
# scaler fitted on the training split so train and test share the same scale.
X_test[num_var] = imputer_num.transform(X_test[num_var])
X_test[num_var] = scaler.transform(X_test[num_var])
# Categorical columns: fill missing values with the training most-frequent value.
X_test[cat_var] = imputer_cat.transform(X_test[cat_var])

In [18]:
# Encode the test categories with the encoder fitted on the training split.
# X_test is overwritten by the encoded matrix, so the original test frame is no
# longer available afterwards; presumably this cell fails if run a second time
# without re-creating X_test — verify before re-running.
X_test = enc.transform(X_test[cat_var])

In [19]:
# Predict the (log1p-transformed) energy target for the encoded test features.
y_pred = lin_reg.predict(X_test)

In [20]:
# Report the baseline scores. The target was log1p-transformed before the split,
# so these errors are expressed in log space, not in kBtu.
regression_metrics(y_test, y_pred)

MAE : 0.882
MSE : 1.535
RMSE : 1.239
R² : 0.217


## Test de plusieurs modèles

In [None]:
from sklearn.linear_model import Ridge, Lasso, SGDRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.kernel_ridge import KernelRidge
from sklearn.ensemble import AdaBoostRegressor, BaggingRegressor, GradientBoostingRegressor, RandomForestRegressor
import xgboost
