# First prediction notebook

Prediction of the Energy consumption.

* [Imports](#imports)
* [Data loading](#data-loading)
* [Feature selection](#feature-selection)
    * [Selection](#selection-pipeline)
* [Baseline](#baseline)
    * [Dummy regressor](#dummy-regressor)
    * [Small linear regression](#small-linear-regression)
* [Prediction](#prediction)
    * [Preprocessing](#preprocessing)
    * [Linear prediction](#linear-prediction)
    * [SVR prediction](#svr-prediction)
    * [Ensemble prediction](#ensemble-prediction)
    * [XGBoost prediction](#xgboost-prediction)
* [Evaluation](#evaluation)
    * [Evaluation](#evaluation)
    * [Residual plot](#residual-plot)
* [Energy score](#energy-score)
    * [First linear regressions](#first-linear-regressions)
    * [Quantitative results](#quantitative-results)
    * [Feature importance](#feature-importance)

<a name="imports"></a>
## Imports

In [None]:
!pip install shap

In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from time import time
from copy import deepcopy

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer, TransformedTargetRegressor
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, KFold

from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import SelectPercentile, chi2
from sklearn.preprocessing import StandardScaler, OneHotEncoder, RobustScaler, MinMaxScaler
from sklearn.impute import KNNImputer

from sklearn.dummy import DummyRegressor

from sklearn.decomposition import PCA
from sklearn.linear_model import Ridge, ElasticNet, LinearRegression
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor

from sklearn.metrics import r2_score, d2_absolute_error_score
from sklearn.metrics import explained_variance_score
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.inspection import permutation_importance

import shap
# print the JS visualization code to the notebook
shap.initjs()

<a name="data-loading"></a>
## Data loading

In [None]:
# Local filename of the v1 cleaned dataset; it is downloaded only when the
# file is not already present in the working directory.
cleaned_dataset_v1_path = 'data_cleaned_v1.csv'
if not os.path.exists(cleaned_dataset_v1_path):
    !wget "https://drive.google.com/uc?export=download&id=1EYQuRaqc4yo0QYl35DRoIC6-IDwaX7Wq" -q --show-progress -O "$cleaned_dataset_v1_path"
# Print the first two lines of the file as a quick sanity check.
!head -2 $cleaned_dataset_v1_path

In [None]:
# Local filename of the v2 cleaned dataset; it is downloaded only when the
# file is not already present in the working directory.
cleaned_dataset_v2_path = 'data_cleaned_v2.csv'
if not os.path.exists(cleaned_dataset_v2_path):
    !wget "https://drive.google.com/uc?export=download&id=14_YIQ_v0D8PGwSkzb8KtbQIcAvlUeCLo" -q --show-progress -O "$cleaned_dataset_v2_path"
# Print the first two lines of the file as a quick sanity check.
!head -2 $cleaned_dataset_v2_path

In [None]:
# Local filename of the v3 cleaned dataset (the one loaded into `df_v3` below);
# it is downloaded only when the file is not already present.
cleaned_dataset_v3_path = 'data_cleaned_v3.csv'
if not os.path.exists(cleaned_dataset_v3_path):
    !wget "https://drive.google.com/uc?export=download&id=1IUo4LplvDDytlWM58RgXZveljzNVkhzb" -q --show-progress -O "$cleaned_dataset_v3_path"
# Print the first two lines of the file as a quick sanity check.
!head -2 $cleaned_dataset_v3_path

In [None]:
# Load the v3 cleaned dataset and print its column summary
# (dtypes and non-null counts).
df_v3 = pd.read_csv(cleaned_dataset_v3_path)
df_v3.info()

In [None]:
# Number of missing values in each column.
df_v3.isna().sum()

<a name="feature-selection"></a>
## Feature selection

<a name="selection-pipeline"></a>
### Selection

In [None]:
# Columns are grouped by the preprocessing branch they go through in the
# ColumnTransformer defined further down.

# Categorical columns (one-hot encoded in the preprocessing pipeline).
cat_cols = [
    "BuildingType",
    "PrimaryPropertyType",
    "is_ENERGYSTARScore",
    "is_PropertyGFAParking",
    ]
# Numerical columns (KNN-imputed and scaled in the preprocessing pipeline).
num_cols = [
    "Latitude",
    "Longitude",
    "NumberofBuildings",
    "NumberofFloors",
    "YearBuilt",
    "PropertyGFABuilding(s)",
    "LargestPropertyUseTypeGFA",
    "ENERGYSTARScore",
    "ratio_SteamUse",
    "ratio_NaturalGas",
    ]
# Thirty columns named "PCA - 1" ... "PCA - 30".
# NOTE(review): presumably PCA components of an embedding computed upstream — confirm.
emb_cols = [f"PCA - {i}" for i in range(1,31)]
# Prediction targets; they are kept as the LAST two entries of `cols`,
# which the next cell relies on (cols[-2], cols[-1], cols[:-2]).
pred_cols = [
    "SiteEnergyUseWN(kBtu)",
    "mean_SiteEnergyUseWN(kBtu)"
    ]
cols = cat_cols + num_cols + emb_cols + pred_cols
df_pred_v3 = df_v3[cols]

In [None]:
# `cols` ends with the two target columns; everything before them is a feature.
*feature_cols, energy_col, mean_energy_col = cols
Y_E = df_pred_v3[energy_col]
Y_E_mean = df_pred_v3[mean_energy_col]
# Keep only the rows whose energy target is strictly below 3e8.
X = df_pred_v3.loc[Y_E < 3e8, feature_cols]

In [None]:
# Display the target values strictly above the 3e8 cutoff
# (X above keeps only rows strictly below 3e8).
Y_E[Y_E > 3e8]

In [None]:
# Apply the same < 3e8 cutoff to both targets. The mask is computed from the
# unfiltered Y_E before either series is overwritten.
below_cutoff = Y_E < 3e8
Y_E_mean = Y_E_mean[below_cutoff]
Y_E = Y_E[below_cutoff]

In [None]:
# 70/30 split of the features and both targets, with a fixed seed so the
# partition is the same on every run.
(
    X_train, X_test,
    Y_E_train, Y_E_test,
    Y_E_mean_train, Y_E_mean_test,
) = train_test_split(X, Y_E, Y_E_mean, test_size=0.3, random_state=18)
for _name, _frame in (("X_test", X_test), ("X_train", X_train)):
    print(f"[INFO] {_name} shape: {_frame.shape}")

In [None]:
# Numerical columns: fill missing values with KNN imputation, then standardise.
num_tr = Pipeline([
    ("imputer", KNNImputer()),
    ("scaler", StandardScaler())
    ])
# Categorical columns: one-hot encode (categories unseen during fit are ignored
# at transform time), reduce with PCA, then standardise.
cat_tr = Pipeline([
    ("encoder", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
    ('pca', PCA()),
    ("scaler", StandardScaler())
    ])
# "PCA - i" columns: variance-threshold feature selection, then standardise.
emb_tr = Pipeline([
    ("selec", VarianceThreshold()),
    ("scaler", StandardScaler()),
    ])
# Route each column group through its own sub-pipeline. The branch names
# ("num"/"cat"/"emb") and the inner step names are the keys used by the
# `prep__<branch>__<step>` entries of the grid-search parameter grids below.
prep_v3 = ColumnTransformer([
    ("num", num_tr, num_cols),
    ("cat", cat_tr, cat_cols),
    ("emb", emb_tr, emb_cols),
    ])

In [None]:
# Summary statistics of the training target.
Y_E_train.describe()

In [None]:
# Summary statistics of the test target, for comparison with the training split.
Y_E_test.describe()

<a name="baseline"></a>
## Baseline

<a name="dummy-regressor"></a>
### Dummy regressor

In [None]:
# Baseline 1: always predict the mean of the training target.
dummy_E = DummyRegressor(strategy="mean").fit(X_train, Y_E_train)
_train_score = dummy_E.score(X_train, Y_E_train)
_test_score = dummy_E.score(X_test, Y_E_test)
print(f"Train score:{_train_score}")
print(f"Test score:{_test_score}")

In [None]:
# Baseline 2: constant-quantile predictor; the quantile (0.0, 0.1, ..., 1.0)
# is chosen by cross-validated grid search.
dummy_E = DummyRegressor(strategy="quantile")
param_grid = {"quantile": np.linspace(0, 1, 11)}
s_dummy_E = GridSearchCV(
    dummy_E,
    param_grid,
    n_jobs=-1,
    return_train_score=True,
)
s_dummy_E.fit(X_train, Y_E_train)
print(f"Best parameter (CV score={s_dummy_E.best_score_:0.3f}):")
print(s_dummy_E.best_params_)
res = pd.DataFrame(s_dummy_E.cv_results_)
print(res[["mean_train_score", "std_train_score"]])
# Score the refitted best estimator on the full train split and on the hold-out split.
print(f"Train score:{s_dummy_E.best_estimator_.score(X_train, Y_E_train)}")
print(f"Test score:{s_dummy_E.best_estimator_.score(X_test, Y_E_test)}")

<a name="small-linear-regression"></a>
### Small linear regression

In [None]:
# Baseline: ordinary least squares on `PropertyGFABuilding(s)` alone.
# (The former leading `X_train.columns` expression was a no-op: a notebook
# only displays the LAST expression of a cell.)
small_cols = ['PropertyGFABuilding(s)']
small_lr_E = LinearRegression()
small_lr_E.fit(X_train[small_cols], Y_E_train)
print(f"Train score:{small_lr_E.score(X_train[small_cols], Y_E_train)}")
print(f"Test score:{small_lr_E.score(X_test[small_cols], Y_E_test)}")

In [None]:
# Baseline: ordinary least squares on `YearBuilt` and `PropertyGFABuilding(s)`.
# (The former leading `X_train.columns` expression was a no-op: a notebook
# only displays the LAST expression of a cell.)
small_cols = [
    'YearBuilt',
    'PropertyGFABuilding(s)',
]
small_lr_E = LinearRegression()
small_lr_E.fit(X_train[small_cols], Y_E_train)
print(f"Train score:{small_lr_E.score(X_train[small_cols], Y_E_train)}")
print(f"Test score:{small_lr_E.score(X_test[small_cols], Y_E_test)}")

In [None]:
# Baseline: ordinary least squares on four structural features.
# 'YearBuilt' used to be listed twice, which fed the regression two identical
# (perfectly collinear) columns and made the individual coefficients
# meaningless; each feature now appears exactly once.
# (The former leading `X_train.columns` expression was a no-op and is removed.)
small_cols = [
    'YearBuilt',
    'PropertyGFABuilding(s)',
    "NumberofBuildings",
    "NumberofFloors",
]
small_lr_E = LinearRegression()
small_lr_E.fit(X_train[small_cols], Y_E_train)
print(f"Train score:{small_lr_E.score(X_train[small_cols], Y_E_train)}")
print(f"Test score:{small_lr_E.score(X_test[small_cols], Y_E_test)}")

In [None]:
# Baseline: ordinary least squares on `PropertyGFABuilding(s)` plus the
# "PCA - i" columns.
# (The former leading `X_train.columns` expression was a no-op and is removed.)
small_cols = ['PropertyGFABuilding(s)'] + emb_cols
small_lr_E = LinearRegression()
small_lr_E.fit(X_train[small_cols], Y_E_train)
print(f"Train score:{small_lr_E.score(X_train[small_cols], Y_E_train)}")
print(f"Test score:{small_lr_E.score(X_test[small_cols], Y_E_test)}")

In [None]:
# Baseline: ordinary least squares on the "PCA - i" columns only.
# (The former leading `X_train.columns` expression was a no-op and is removed.)
small_cols = emb_cols
small_lr_E = LinearRegression()
small_lr_E.fit(X_train[small_cols], Y_E_train)
print(f"Train score:{small_lr_E.score(X_train[small_cols], Y_E_train)}")
print(f"Test score:{small_lr_E.score(X_test[small_cols], Y_E_test)}")

In [None]:
# Same features as the `PropertyGFABuilding(s)` + "PCA - i" baseline, but
# fitted and scored against the `mean_SiteEnergyUseWN(kBtu)` target.
# (The former leading `X_train.columns` expression was a no-op and is removed.)
small_cols = ['PropertyGFABuilding(s)'] + emb_cols
small_lr_E = LinearRegression()
small_lr_E.fit(X_train[small_cols], Y_E_mean_train)
print(f"Train score:{small_lr_E.score(X_train[small_cols], Y_E_mean_train)}")
print(f"Test score:{small_lr_E.score(X_test[small_cols], Y_E_mean_test)}")

<a name="prediction"></a>
## Prediction

<a name="preprocessing"></a>
### Preprocessing

In [None]:
# Full linear pipeline: column-wise preprocessing, a global standardisation of
# the assembled feature matrix, then an ElasticNet regressor. The step names
# ('prep', 'lr_en') are the prefixes used in the parameter grids below.
lin_pipe_v3 = Pipeline([
    ('prep', prep_v3),
    ('scaler', StandardScaler()),
    ('lr_en', ElasticNet()),
    ])
# Display the pipeline diagram.
lin_pipe_v3

In [None]:
# Preprocessing search 1: which scaler ("passthrough" = none) to use in each
# branch, crossed with a small ElasticNet regularisation grid.
_scaler_options = ["passthrough", MinMaxScaler(), RobustScaler()]
param_grid = {
    "prep__emb__selec__threshold": [0.01],
    "prep__cat__pca__n_components": [9],
    "prep__num__scaler": _scaler_options,
    "prep__emb__scaler": _scaler_options,
    "prep__cat__scaler": _scaler_options,
    "lr_en__alpha": np.logspace(-1, 1, 3),
    "lr_en__l1_ratio": np.logspace(-2, -1, 2),
}
s_lin_E = GridSearchCV(
    lin_pipe_v3,
    param_grid,
    n_jobs=-1,
    return_train_score=True,
)
s_lin_E.fit(X_train, Y_E_train)
print(f"Best parameter (CV score={s_lin_E.best_score_:0.3f}):")
print(s_lin_E.best_params_)
res = pd.DataFrame(s_lin_E.cv_results_)
print(res[["mean_train_score", "std_train_score", "mean_test_score", "std_test_score"]])
print(f"Train score:{s_lin_E.best_estimator_.score(X_train, Y_E_train)}")
print(f"Test score:{s_lin_E.best_estimator_.score(X_test, Y_E_test)}")

In [None]:
# Preprocessing search 2: only the variance threshold of the "emb" branch
# varies; every other setting is pinned to a single value.
param_grid = {
    "prep__emb__selec__threshold": [0, 1e-3, 1e-2, 0.05, 0.1],
    "prep__cat__pca__n_components": [9],
    "prep__num__scaler": [RobustScaler()],
    "prep__emb__scaler": [RobustScaler()],
    "prep__cat__scaler": ["passthrough"],
    "lr_en__alpha": [1.0],
    "lr_en__l1_ratio": [0.1],
}
s_lin_E = GridSearchCV(
    lin_pipe_v3,
    param_grid,
    n_jobs=-1,
    return_train_score=True,
)
s_lin_E.fit(X_train, Y_E_train)
print(f"Best parameter (CV score={s_lin_E.best_score_:0.3f}):")
print(s_lin_E.best_params_)
res = pd.DataFrame(s_lin_E.cv_results_)
print(res[["mean_train_score", "std_train_score", "mean_test_score", "std_test_score"]])
print(f"Train score:{s_lin_E.best_estimator_.score(X_train, Y_E_train)}")
print(f"Test score:{s_lin_E.best_estimator_.score(X_test, Y_E_test)}")

In [None]:
# Preprocessing search 3: only the number of PCA components of the "cat"
# branch varies (1 to 18); every other setting is pinned to a single value.
param_grid = {
    "prep__emb__selec__threshold": [0.],
    "prep__cat__pca__n_components": list(range(1, 19)),
    "prep__num__scaler": [RobustScaler()],
    "prep__emb__scaler": [RobustScaler()],
    "prep__cat__scaler": ["passthrough"],
    "lr_en__alpha": [1.0],
    "lr_en__l1_ratio": [0.1],
}
s_lin_E = GridSearchCV(
    lin_pipe_v3,
    param_grid,
    n_jobs=-1,
    return_train_score=True,
)
s_lin_E.fit(X_train, Y_E_train)
print(f"Best parameter (CV score={s_lin_E.best_score_:0.3f}):")
print(s_lin_E.best_params_)
res = pd.DataFrame(s_lin_E.cv_results_)
print(res[["mean_train_score", "std_train_score", "mean_test_score", "std_test_score"]])
print(f"Train score:{s_lin_E.best_estimator_.score(X_train, Y_E_train)}")
print(f"Test score:{s_lin_E.best_estimator_.score(X_test, Y_E_test)}")

In [None]:
# Left panel: explained-variance ratio of a standalone one-hot + PCA fitted on
# the categorical columns. Right panel: cross-validated score of the linear
# pipeline as a function of the number of PCA components.
# NOTE: `s_lin_E` must hold the search over `prep__cat__pca__n_components`
# (the cell just above) when this cell is run.
encoder = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
pca = PCA(n_components=18)
pca.fit(encoder.fit_transform(X_train[cat_cols]))

fig, (ax0, ax1) = plt.subplots(ncols=2, sharex=True, figsize=(10, 4))
ax0.plot(
    np.arange(1, pca.n_components_ + 1), pca.explained_variance_ratio_, "+", linewidth=2
)
ax0.set_ylabel("PCA explained variance ratio")

# Vertical marker at the retained number of components.
ax0.axvline(
    1,
    linestyle=":",
    label="n_components chosen",
)
ax0.legend(prop=dict(size=12))

# For each number of components, keep the best cross-validated regressor
results = pd.DataFrame(s_lin_E.cv_results_)
components_col = "param_prep__cat__pca__n_components"
best_clfs = results.groupby(components_col).apply(
    lambda g: g.nlargest(1, "mean_test_score")
)

best_clfs.plot(
    x=components_col, y="mean_test_score", yerr="std_test_score", legend=False, ax=ax1
)
# The search uses the estimator's default scorer, which for a regressor is R2
# (this axis was previously mislabelled "Accuracy").
ax1.set_ylabel("R2 score (val)")
ax1.set_xlabel("n_components")

plt.xlim(-1, 20)
plt.ylim(-0.01, 0.7)

plt.tight_layout()
plt.show()

In [None]:
# Preprocessing rebuilt with fixed choices: RobustScaler for the numerical and
# "emb" branches, a single PCA component and no scaling for the categorical one.
num_tr = Pipeline([
    ("imputer", KNNImputer()),
    ("scaler", RobustScaler())
    ])
cat_tr = Pipeline([
    ("encoder", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
    ('pca', PCA(n_components=1)),
    ("scaler", "passthrough")
    ])
emb_tr = Pipeline([
    ("selec", VarianceThreshold()),
    ("scaler", RobustScaler())
    ])
# This rebinds `prep_v3`: every pipeline built from here on uses this version.
prep_v3 = ColumnTransformer([
    ("num", num_tr, num_cols),
    ("cat", cat_tr, cat_cols),
    ("emb", emb_tr, emb_cols),
    ])
# Display the transformer diagram.
prep_v3

<a name="linear-prediction"></a>
### Linear prediction

In [None]:
# Rebuild the ElasticNet pipeline on top of the current `prep_v3`.
lin_pipe_v3 = Pipeline([
    ('prep', prep_v3),
    ('scaler', StandardScaler()),
    ('lr_en', ElasticNet()),
    ])
# Display the pipeline diagram.
lin_pipe_v3

In [None]:
# ElasticNet search over the regularisation strength (alpha) and the L1/L2 mix
# (l1_ratio), fitted on the raw energy target.
param_grid = {
    "lr_en__alpha": np.logspace(-1, 1, 5),
    "lr_en__l1_ratio": list(np.logspace(-3, 0, 4)),
}
s_lin_E = GridSearchCV(
    lin_pipe_v3,
    param_grid,
    n_jobs=-1,
    return_train_score=True,
)
s_lin_E.fit(X_train, Y_E_train)
print(f"Best parameter (CV score={s_lin_E.best_score_:0.3f}):")
print(s_lin_E.best_params_)
res = pd.DataFrame(s_lin_E.cv_results_)
print(res[["mean_train_score", "std_train_score", "mean_test_score", "std_test_score"]])
print(f"Train score:{s_lin_E.best_estimator_.score(X_train, Y_E_train)}")
print(f"Test score:{s_lin_E.best_estimator_.score(X_test, Y_E_test)}")

In [None]:
# Fresh, unfitted copy of the ElasticNet pipeline (same definition as the
# previous `lin_pipe_v3`), used below for the mean-energy target.
lin_pipe_v3 = Pipeline([
    ('prep', prep_v3),
    ('scaler', StandardScaler()),
    ('lr_en', ElasticNet()),
    ])
# Display the pipeline diagram.
lin_pipe_v3

In [None]:
# Same ElasticNet grid as before, but fitted and scored against the
# mean-energy target (Y_E_mean_*). This rebinds `s_lin_E`.
param_grid = {
    "lr_en__alpha": np.logspace(-1, 1, 5),
    "lr_en__l1_ratio": list(np.logspace(-3, 0, 4)),
}
s_lin_E = GridSearchCV(
    lin_pipe_v3,
    param_grid,
    n_jobs=-1,
    return_train_score=True,
)
s_lin_E.fit(X_train, Y_E_mean_train)
print(f"Best parameter (CV score={s_lin_E.best_score_:0.3f}):")
print(s_lin_E.best_params_)
res = pd.DataFrame(s_lin_E.cv_results_)
print(res[["mean_train_score", "std_train_score", "mean_test_score", "std_test_score"]])
print(f"Train score:{s_lin_E.best_estimator_.score(X_train, Y_E_mean_train)}")
print(f"Test score:{s_lin_E.best_estimator_.score(X_test, Y_E_mean_test)}")

In [None]:
# Ridge variant of the linear pipeline. The final step keeps the name 'lr_en'
# even though it holds a Ridge model; the parameter grids below address it
# through that name.
lin_ri_pipe_v3 = Pipeline([
    ('prep', prep_v3),
    ('scaler', StandardScaler()),
    ('lr_en', Ridge()),
    ])
# Display the pipeline diagram.
lin_ri_pipe_v3

In [None]:
# Ridge search over alpha (nine log-spaced values from 1e-3 to 1e3), fitted
# and scored against the mean-energy target.
param_grid = {"lr_en__alpha": np.logspace(-3, 3, 9)}
s_lin_ri_E = GridSearchCV(
    lin_ri_pipe_v3,
    param_grid,
    n_jobs=-1,
    return_train_score=True,
)
s_lin_ri_E.fit(X_train, Y_E_mean_train)
print(f"Best parameter (CV score={s_lin_ri_E.best_score_:0.3f}):")
print(s_lin_ri_E.best_params_)
res = pd.DataFrame(s_lin_ri_E.cv_results_)
print(res[["mean_train_score", "std_train_score", "mean_test_score", "std_test_score"]])
print(f"Train score:{s_lin_ri_E.best_estimator_.score(X_train, Y_E_mean_train)}")
print(f"Test score:{s_lin_ri_E.best_estimator_.score(X_test, Y_E_mean_test)}")

In [None]:
# Ridge search over alpha (nine log-spaced values from 1e-2 to 1e2) with
# 5-fold CV, fitted on the raw energy target. This rebinds `s_lin_ri_E`.
param_grid = {"lr_en__alpha": np.logspace(-2, 2, 9)}
s_lin_ri_E = GridSearchCV(
    lin_ri_pipe_v3,
    param_grid,
    n_jobs=-1,
    return_train_score=True,
    cv=5,
)
s_lin_ri_E.fit(X_train, Y_E_train)
print(f"Best parameter (CV score={s_lin_ri_E.best_score_:0.3f}):")
print(s_lin_ri_E.best_params_)
res = pd.DataFrame(s_lin_ri_E.cv_results_)
print(res[["mean_train_score", "std_train_score", "mean_test_score", "std_test_score"]])
print(f"Train score:{s_lin_ri_E.best_estimator_.score(X_train, Y_E_train)}")
print(f"Test score:{s_lin_ri_E.best_estimator_.score(X_test, Y_E_test)}")

In [None]:
# ElasticNet trained on log(y); predictions are mapped back with exp.
# NOTE(review): np.log needs strictly positive targets — confirm Y_E has no zeros.
lr_tr = TransformedTargetRegressor(regressor=ElasticNet(),
                                func=np.log, 
                                inverse_func=np.exp)

# Same preprocessing as the plain linear pipeline. The wrapped regressor's
# hyper-parameters are addressed as `lr_en__regressor__<param>`.
lin_tr_pipe_v3 = Pipeline([
    ('prep', prep_v3),
    ('scaler', StandardScaler()),
    ('lr_en', lr_tr),
    ])
# Display the pipeline diagram.
lin_tr_pipe_v3

In [None]:
# Grid search over the wrapped ElasticNet's alpha and l1_ratio for the
# target-transformed linear pipeline `lin_tr_pipe_v3`.
param_grid = {
    "lr_en__regressor__alpha": np.logspace(-1, 1, 5),
    "lr_en__regressor__l1_ratio": list(np.logspace(-2, 0, 6)),
}
s_lin_tr_E = GridSearchCV(
    lin_tr_pipe_v3,
    param_grid,
    n_jobs=-1,
    return_train_score=True,
)
s_lin_tr_E.fit(X_train, Y_E_train)
print(f"Best parameter (CV score={s_lin_tr_E.best_score_:0.3f}):")
print(s_lin_tr_E.best_params_)
res = pd.DataFrame(s_lin_tr_E.cv_results_)
print(res[["mean_train_score", "std_train_score", "mean_test_score", "std_test_score"]])
print(f"Test score:{s_lin_tr_E.best_estimator_.score(X_test, Y_E_test)}")

In [None]:
# ElasticNet trained on sqrt(y); predictions are mapped back by squaring.
lr = TransformedTargetRegressor(regressor=ElasticNet(),
                                func=np.sqrt,
                                inverse_func=np.square)

# Bug fix: the pipeline used to reference `lr_tr` (the log-transformed model
# from the previous cell), so the sqrt regressor defined just above was never
# used. It now uses `lr`.
lin_tr_pipe_v3 = Pipeline([
    ('prep', prep_v3),
    ('scaler', StandardScaler()),
    ('lr_en', lr),
    ])
# Display the pipeline diagram.
lin_tr_pipe_v3

In [None]:
# Grid search over the wrapped regressor's alpha and l1_ratio (l1_ratio now
# starts at 1e-4) for the target-transformed linear pipeline `lin_tr_pipe_v3`.
param_grid = {
    "lr_en__regressor__alpha": np.logspace(-1, 1, 5),
    "lr_en__regressor__l1_ratio": list(np.logspace(-4, 0, 6)),
}
s_lin_tr_E = GridSearchCV(
    lin_tr_pipe_v3,
    param_grid,
    n_jobs=-1,
    return_train_score=True,
)
s_lin_tr_E.fit(X_train, Y_E_train)
print(f"Best parameter (CV score={s_lin_tr_E.best_score_:0.3f}):")
print(s_lin_tr_E.best_params_)
res = pd.DataFrame(s_lin_tr_E.cv_results_)
print(res[["mean_train_score", "std_train_score", "mean_test_score", "std_test_score"]])
print(f"Test score:{s_lin_tr_E.best_estimator_.score(X_test, Y_E_test)}")

In [None]:
# Ridge trained on sqrt(y); predictions are mapped back by squaring.
lr = TransformedTargetRegressor(regressor=Ridge(),
                                func=np.sqrt,
                                inverse_func=np.square)

# Bug fix: the pipeline used to reference `lr_tr` (the log-transformed
# ElasticNet defined in an earlier cell), so the Ridge/sqrt regressor defined
# just above was never used. It now uses `lr`.
lin_tr_pipe_v3 = Pipeline([
    ('prep', prep_v3),
    ('scaler', StandardScaler()),
    ('lr_en', lr),
    ])
# Display the pipeline diagram.
lin_tr_pipe_v3

In [None]:
# Grid search over the wrapped regressor's alpha only (nine log-spaced values
# from 1e-3 to 1e3) for the target-transformed linear pipeline.
param_grid = {"lr_en__regressor__alpha": np.logspace(-3, 3, 9)}
s_lin_tr_E = GridSearchCV(
    lin_tr_pipe_v3,
    param_grid,
    n_jobs=-1,
    return_train_score=True,
)
s_lin_tr_E.fit(X_train, Y_E_train)
print(f"Best parameter (CV score={s_lin_tr_E.best_score_:0.3f}):")
print(s_lin_tr_E.best_params_)
res = pd.DataFrame(s_lin_tr_E.cv_results_)
print(res[["mean_train_score", "std_train_score", "mean_test_score", "std_test_score"]])
print(f"Test score:{s_lin_tr_E.best_estimator_.score(X_test, Y_E_test)}")

In [None]:
# Horizontal bar chart of the coefficients of the best pipeline held in
# `s_lin_E`, labelled with the feature names produced by its 'prep' step.
_best_lin = s_lin_E.best_estimator_
feature_names = _best_lin['prep'].get_feature_names_out()
coefs = pd.DataFrame(
    {"Coefficients": _best_lin['lr_en'].coef_},
    index=feature_names,
)

coefs.plot.barh(figsize=(9, 15))
plt.title("Elastic net best parameters")
# Grey vertical line at zero to separate positive and negative coefficients.
plt.axvline(x=0, color=".5")
plt.xlabel("Raw coefficient values")
# Leave room on the left for the long feature names.
plt.subplots_adjust(left=0.3)

<a name="svr-prediction"></a>
### SVR prediction

In [None]:
# Plain SVR on the raw target.
# Fix: this used to read `svr = regressor=SVR()`, a chained assignment that
# also created a stray global named `regressor`.
svr = SVR()

# Same column preprocessing as the linear models, but the assembled feature
# matrix is min-max scaled instead of standardised.
svr_pipe_v3 = Pipeline([
    ('prep', prep_v3),
    ('scaler', MinMaxScaler()),
    ('svr', svr),
    ])
# Display the pipeline diagram.
svr_pipe_v3

In [None]:
# SVR search over C and epsilon, each on five log-spaced values from 1e-4 to 1e4.
_log_grid = np.logspace(-4, 4, 5)
param_grid = {
    "svr__C": _log_grid,
    "svr__epsilon": _log_grid,
}
s_svr_E = GridSearchCV(
    svr_pipe_v3,
    param_grid,
    n_jobs=-1,
    return_train_score=True,
)
s_svr_E.fit(X_train, Y_E_train)
print(f"Best parameter (CV score={s_svr_E.best_score_:0.3f}):")
print(s_svr_E.best_params_)
res = pd.DataFrame(s_svr_E.cv_results_)
print(res[["mean_train_score", "std_train_score", "mean_test_score", "std_test_score"]])
print(f"Test score:{s_svr_E.best_estimator_.score(X_test, Y_E_test)}")

In [None]:
# SVR trained on sqrt(y); predictions are mapped back by squaring.
svr_tr = TransformedTargetRegressor(regressor=SVR(),
                                func=np.sqrt, 
                                inverse_func=np.square)

# The wrapped SVR's hyper-parameters are addressed as `svr__regressor__<param>`.
svr_tr_pipe_v3 = Pipeline([
    ('prep', prep_v3),
    ('scaler', MinMaxScaler()),
    ('svr', svr_tr),
    ])
# Display the pipeline diagram.
svr_tr_pipe_v3

In [None]:
# Search over the wrapped SVR's C and epsilon (five log-spaced values from
# 1e-4 to 1e4 each) for the target-transformed pipeline `svr_tr_pipe_v3`.
_log_grid = np.logspace(-4, 4, 5)
param_grid = {
    "svr__regressor__C": _log_grid,
    "svr__regressor__epsilon": _log_grid,
}
s_svr_tr_E = GridSearchCV(
    svr_tr_pipe_v3,
    param_grid,
    n_jobs=-1,
    return_train_score=True,
)
s_svr_tr_E.fit(X_train, Y_E_train)
print(f"Best parameter (CV score={s_svr_tr_E.best_score_:0.3f}):")
print(s_svr_tr_E.best_params_)
res = pd.DataFrame(s_svr_tr_E.cv_results_)
print(res[["mean_train_score", "std_train_score", "mean_test_score", "std_test_score"]])
print(f"Train score:{s_svr_tr_E.best_estimator_.score(X_train, Y_E_train)}")
print(f"Test score:{s_svr_tr_E.best_estimator_.score(X_test, Y_E_test)}")

In [None]:
# SVR trained on log(y); predictions are mapped back with exp.
# This rebinds `svr_tr` and `svr_tr_pipe_v3` (previously the sqrt variant).
# NOTE(review): np.log needs strictly positive targets — confirm Y_E has no zeros.
svr_tr = TransformedTargetRegressor(regressor=SVR(),
                                func=np.log, 
                                inverse_func=np.exp)

svr_tr_pipe_v3 = Pipeline([
    ('prep', prep_v3),
    ('scaler', MinMaxScaler()),
    ('svr', svr_tr),
    ])
# Display the pipeline diagram.
svr_tr_pipe_v3

In [None]:
# Same C / epsilon grid as the previous SVR search, run on the current
# `svr_tr_pipe_v3`. This rebinds `s_svr_tr_E`.
_log_grid = np.logspace(-4, 4, 5)
param_grid = {
    "svr__regressor__C": _log_grid,
    "svr__regressor__epsilon": _log_grid,
}
s_svr_tr_E = GridSearchCV(
    svr_tr_pipe_v3,
    param_grid,
    n_jobs=-1,
    return_train_score=True,
)
s_svr_tr_E.fit(X_train, Y_E_train)
print(f"Best parameter (CV score={s_svr_tr_E.best_score_:0.3f}):")
print(s_svr_tr_E.best_params_)
res = pd.DataFrame(s_svr_tr_E.cv_results_)
print(res[["mean_train_score", "std_train_score", "mean_test_score", "std_test_score"]])
print(f"Train score:{s_svr_tr_E.best_estimator_.score(X_train, Y_E_train)}")
print(f"Test score:{s_svr_tr_E.best_estimator_.score(X_test, Y_E_test)}")

<a name="ensemble-prediction"></a>
### Ensemble prediction

In [None]:
# Random-forest pipeline on the raw target: column preprocessing, global
# standardisation, then a RandomForestRegressor addressed as `rf__<param>`.
rf_pipe_v3 = Pipeline([
    ('prep', prep_v3),
    ('scaler', StandardScaler()),
    ('rf', RandomForestRegressor()),
    ])
# Display the pipeline diagram.
rf_pipe_v3

In [None]:
# Random-forest search over the number of trees, tree depth and minimum leaf
# size, with 5-fold CV.
param_grid = {
    "rf__n_estimators": [200, 250, 300],
    "rf__max_depth": [10, 15, 25, 50],
    "rf__min_samples_leaf": [3, 5, 7, 15],
}
s_rf_E = GridSearchCV(
    rf_pipe_v3,
    param_grid,
    n_jobs=-1,
    cv=5,
    return_train_score=True,
)
s_rf_E.fit(X_train, Y_E_train)
print(f"Best parameter (CV score={s_rf_E.best_score_:0.3f}):")
print(s_rf_E.best_params_)
res = pd.DataFrame(s_rf_E.cv_results_)
print(res[["mean_train_score", "std_train_score", "mean_test_score", "std_test_score"]])
print(f"Train score:{s_rf_E.best_estimator_.score(X_train, Y_E_train)}")
print(f"Test score:{s_rf_E.best_estimator_.score(X_test, Y_E_test)}")

In [None]:
# Random forest trained on sqrt(y); predictions are mapped back by squaring.
rf_tr = TransformedTargetRegressor(regressor=RandomForestRegressor(),
                                func=np.sqrt, 
                                inverse_func=np.square)

# The wrapped forest's hyper-parameters are addressed as `rf__regressor__<param>`.
rf_tr_pipe_v3 = Pipeline([
    ('prep', prep_v3),
    ('scaler', StandardScaler()),
    ('rf', rf_tr),
    ])
# Display the pipeline diagram.
rf_tr_pipe_v3

In [None]:
# Search over the wrapped random forest's number of trees, depth and minimum
# leaf size for the sqrt-target pipeline.
param_grid = {
    "rf__regressor__n_estimators": [150, 200, 250],
    "rf__regressor__max_depth": [10, 15, 25, 50],
    "rf__regressor__min_samples_leaf": [3, 5, 7],
}
s_rf_tr_E = GridSearchCV(
    rf_tr_pipe_v3,
    param_grid,
    n_jobs=-1,
    return_train_score=True,
)
s_rf_tr_E.fit(X_train, Y_E_train)
print(f"Best parameter (CV score={s_rf_tr_E.best_score_:0.3f}):")
print(s_rf_tr_E.best_params_)
res = pd.DataFrame(s_rf_tr_E.cv_results_)
print(res[["mean_train_score", "std_train_score", "mean_test_score", "std_test_score"]])
print(f"Train score:{s_rf_tr_E.best_estimator_.score(X_train, Y_E_train)}")
print(f"Test score:{s_rf_tr_E.best_estimator_.score(X_test, Y_E_test)}")

<a name="xgboost-prediction"></a>
### XGBoost prediction

In [None]:
# XGBoost regressor on the raw target, with default hyper-parameters
# (tuned by the grid search below).
xgb = XGBRegressor()

# Column preprocessing, global standardisation, then the regressor, addressed
# as `reg__<param>` in the parameter grid.
xgb_pipe_v3 = Pipeline([
    ('prep', prep_v3),
    ('scaler', StandardScaler()),
    ('reg', xgb),
    ])
# Display the pipeline diagram.
xgb_pipe_v3

In [None]:
# XGBoost search: 3 values for each of 5 hyper-parameters (243 candidates),
# evaluated with 5-fold CV.
param_grid = {
    "reg__n_estimators": [150, 200, 300],
    "reg__max_depth": [5, 15, 25],
    "reg__min_child_weight": [3, 5, 7],
    "reg__gamma": np.logspace(-1, 1, 3),
    "reg__subsample": [0.3, 0.5, 0.7],
}
s_xgb_E = GridSearchCV(
    xgb_pipe_v3,
    param_grid,
    n_jobs=-1,
    cv=5,
    return_train_score=True,
)
s_xgb_E.fit(X_train, Y_E_train)
print(f"Best parameter (CV score={s_xgb_E.best_score_:0.3f}):")
print(s_xgb_E.best_params_)
res = pd.DataFrame(s_xgb_E.cv_results_)
print(res[["mean_train_score", "std_train_score", "mean_test_score", "std_test_score"]])
print(f"Train score:{s_xgb_E.best_estimator_.score(X_train, Y_E_train)}")
print(f"Test score:{s_xgb_E.best_estimator_.score(X_test, Y_E_test)}")

In [None]:
# XGBoost trained on sqrt(y); predictions are mapped back by squaring.
xgb_tr = TransformedTargetRegressor(regressor=XGBRegressor(),
                                func=np.sqrt, 
                                inverse_func=np.square)

# The wrapped regressor's hyper-parameters are addressed as `reg__regressor__<param>`.
xgb_tr_pipe_v3 = Pipeline([
    ('prep', prep_v3),
    ('scaler', StandardScaler()),
    ('reg', xgb_tr),
    ])
# Display the pipeline diagram.
xgb_tr_pipe_v3

In [None]:
# Search over the wrapped XGBoost regressor's hyper-parameters for the
# sqrt-target pipeline (243 candidates, 5-fold CV).
param_grid = {
    "reg__regressor__n_estimators": [150, 200, 300],
    "reg__regressor__max_depth": [5, 15, 25],
    "reg__regressor__min_child_weight": [3, 5, 7],
    "reg__regressor__gamma": np.logspace(-2, 1, 3),
    "reg__regressor__subsample": [0.3, 0.5, 0.7],
}
s_xgb_tr_E = GridSearchCV(
    xgb_tr_pipe_v3,
    param_grid,
    n_jobs=-1,
    cv=5,
    return_train_score=True,
)
s_xgb_tr_E.fit(X_train, Y_E_train)
print(f"Best parameter (CV score={s_xgb_tr_E.best_score_:0.3f}):")
print(s_xgb_tr_E.best_params_)
res = pd.DataFrame(s_xgb_tr_E.cv_results_)
print(res[["mean_train_score", "std_train_score", "mean_test_score", "std_test_score"]])
print(f"Train score:{s_xgb_tr_E.best_estimator_.score(X_train, Y_E_train)}")
print(f"Test score:{s_xgb_tr_E.best_estimator_.score(X_test, Y_E_test)}")

<a name="evaluation"></a>
## Evaluation

<a name="evaluation"></a>
### Evaluation

In [None]:
# Regression metrics computed on the hold-out split, keyed by the column name
# used in the results table.
metrics = {
    "r2": r2_score,
    "d2": d2_absolute_error_score,
    "exp_va": explained_variance_score,
    "mse": mean_squared_error,
    "rmse": lambda *arg,**kwarg: mean_squared_error(*arg,**kwarg)**0.5,
    "mae": mean_absolute_error
    }
models = ["lin", "lin_tr", "svr", "svr_tr", "rf", "rf_tr", "xgb", "xgb_tr"]

# Explicit name -> fitted grid search mapping. This replaces the former
# eval(f"... s_{model}_E ...") lookup, which built code from strings.
# NOTE(review): if the cells ran top to bottom, `s_lin_E` was last refitted on
# the mean-energy target (Y_E_mean_train); confirm that scoring it against
# Y_E_test is intended.
_searches = {
    "lin": s_lin_E,
    "lin_tr": s_lin_tr_E,
    "svr": s_svr_E,
    "svr_tr": s_svr_tr_E,
    "rf": s_rf_E,
    "rf_tr": s_rf_tr_E,
    "xgb": s_xgb_E,
    "xgb_tr": s_xgb_tr_E,
}

# One row per model, one value per metric (same order as `metrics`).
E_res = []
for model in models:
    # Predict once per model rather than once per metric.
    _y_pred = _searches[model].best_estimator_.predict(X_test)
    E_res.append([m(Y_E_test, _y_pred) for m in metrics.values()])

In [None]:
# Results table: one row per model, one column per metric.
pd.DataFrame(E_res, columns=metrics.keys(), index=models)

In [None]:
def make_best_model():
    """Return a fresh, unfitted SVR pipeline with fixed hyper-parameters.

    The pipeline preprocesses the column groups named by the module-level
    ``num_cols``, ``cat_cols`` and ``emb_cols`` (read when this function is
    called), min-max scales the assembled features, and fits an
    ``SVR(C=10000, epsilon=100)`` on the square root of the target;
    predictions are squared back to the original scale.
    """
    numeric_branch = Pipeline([
        ("imputer", KNNImputer()),
        ("scaler", RobustScaler()),
    ])
    categorical_branch = Pipeline([
        ("encoder", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
        ("pca", PCA(n_components=1)),
        ("scaler", "passthrough"),
    ])
    embedding_branch = Pipeline([
        ("selec", VarianceThreshold()),
        ("scaler", RobustScaler()),
    ])
    column_prep = ColumnTransformer([
        ("num", numeric_branch, num_cols),
        ("cat", categorical_branch, cat_cols),
        ("emb", embedding_branch, emb_cols),
    ])

    sqrt_target_svr = TransformedTargetRegressor(
        regressor=SVR(C=10000, epsilon=100),
        func=np.sqrt,
        inverse_func=np.square,
    )

    return Pipeline([
        ("prep", column_prep),
        ("scaler", MinMaxScaler()),
        ("svr", sqrt_target_svr),
    ])

def make_best_model_2():
    """Return a fresh, unfitted random-forest pipeline with fixed hyper-parameters.

    The pipeline preprocesses the column groups named by the module-level
    ``num_cols``, ``cat_cols`` and ``emb_cols`` (read when this function is
    called), standardises the assembled features, and fits a
    ``RandomForestRegressor`` on the square root of the target; predictions
    are squared back to the original scale.
    """
    num_tr = Pipeline([
        ("imputer", KNNImputer()),
        ("scaler", RobustScaler())
        ])
    cat_tr = Pipeline([
        ("encoder", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
        ('pca', PCA(n_components=1)),
        ("scaler", "passthrough")
        ])
    emb_tr = Pipeline([
        ("selec", VarianceThreshold()),
        ("scaler", RobustScaler())
        ])
    prep_v3 = ColumnTransformer([
        ("num", num_tr, num_cols),
        ("cat", cat_tr, cat_cols),
        ("emb", emb_tr, emb_cols),
        ])
    best_param = {'max_depth': 15, 'min_samples_leaf': 5, 'n_estimators': 150}
    # Bug fix: `best_param` used to be defined but never passed on, so the
    # forest silently ran with scikit-learn's default hyper-parameters.
    rf_tr = TransformedTargetRegressor(regressor=RandomForestRegressor(**best_param),
                                    func=np.sqrt,
                                    inverse_func=np.square)

    rf_tr_pipe_v3 = Pipeline([
        ('prep', prep_v3),
        ('scaler', StandardScaler()),
        ('rf', rf_tr),
        ])

    return rf_tr_pipe_v3

In [None]:
# 5-fold cross-validation of `make_best_model()` on the whole (filtered)
# dataset, collecting the score and MAE on both sides of each fold plus the
# fit time.
train_scores = []
test_scores = []
fit_times = []
train_mae = []
test_mae = []
# Seeded shuffle (same seed as the train/test split) so the reported fold
# statistics are reproducible from run to run.
kf = KFold(n_splits=5, shuffle=True, random_state=18)
X = X.reset_index(drop=True)
Y_E = Y_E.reset_index(drop=True)
for train, test in kf.split(Y_E):
    # KFold yields POSITIONAL indices, hence .iloc.
    X_fold_train, Y_fold_train = X.iloc[train], Y_E.iloc[train]
    X_fold_test, Y_fold_test = X.iloc[test], Y_E.iloc[test]

    model = make_best_model()
    t0 = time()
    model.fit(X_fold_train, Y_fold_train)
    fit_times.append(time()-t0)

    train_scores.append(model.score(X_fold_train, Y_fold_train))
    test_scores.append(model.score(X_fold_test, Y_fold_test))
    # scikit-learn metric convention: (y_true, y_pred).
    train_mae.append(mean_absolute_error(Y_fold_train, model.predict(X_fold_train)))
    test_mae.append(mean_absolute_error(Y_fold_test, model.predict(X_fold_test)))


In [None]:
# Mean +- standard deviation across folds for each collected quantity.
_fold_stats = (
    ("Train score", train_scores),
    ("Test score", test_scores),
    ("Train mae", train_mae),
    ("Test mae", test_mae),
    ("Fit time", fit_times),
)
for _label, _values in _fold_stats:
    print(f"[INFO] {_label}: {np.mean(_values):.3f} +- {np.std(_values):.3f}")

In [None]:
# 5-fold cross-validation of `make_best_model_2()` on the whole (filtered)
# dataset, collecting the score and MAE on both sides of each fold plus the
# fit time.
train_scores = []
test_scores = []
fit_times = []
train_mae = []
test_mae = []
# Seeded shuffle (same seed as the train/test split) so the folds are the same
# from run to run.
kf = KFold(n_splits=5, shuffle=True, random_state=18)
X = X.reset_index(drop=True)
Y_E = Y_E.reset_index(drop=True)
for train, test in kf.split(Y_E):
    # KFold yields POSITIONAL indices, hence .iloc.
    X_fold_train, Y_fold_train = X.iloc[train], Y_E.iloc[train]
    X_fold_test, Y_fold_test = X.iloc[test], Y_E.iloc[test]

    model = make_best_model_2()
    t0 = time()
    model.fit(X_fold_train, Y_fold_train)
    fit_times.append(time()-t0)

    train_scores.append(model.score(X_fold_train, Y_fold_train))
    test_scores.append(model.score(X_fold_test, Y_fold_test))
    # scikit-learn metric convention: (y_true, y_pred).
    train_mae.append(mean_absolute_error(Y_fold_train, model.predict(X_fold_train)))
    test_mae.append(mean_absolute_error(Y_fold_test, model.predict(X_fold_test)))

In [None]:
# Mean +- standard deviation across folds for each collected quantity.
_fold_stats = (
    ("Train score", train_scores),
    ("Test score", test_scores),
    ("Train mae", train_mae),
    ("Test mae", test_mae),
    ("Fit time", fit_times),
)
for _label, _values in _fold_stats:
    print(f"[INFO] {_label}: {np.mean(_values):.3f} +- {np.std(_values):.3f}")

<a name="residual-plot"></a>
### Residual plot

In [None]:
# Redo the 70/30 split on the re-indexed X / Y_E.
# Bug fix: the training target used to be bound to `Y_E_n`, so the next cell
# fitted on a stale `Y_E_train` left over from the first split, whose index no
# longer matched the re-indexed X_train.
X_train, X_test, Y_E_train, Y_E_test = train_test_split(X, Y_E, test_size=0.3, random_state=18)
print(f"[INFO] X_test shape: {X_test.shape}")
print(f"[INFO] X_train shape: {X_train.shape}")

In [None]:
# Fit the fixed SVR pipeline on the training split and compute the residuals
# (observed minus predicted) on both splits.
model = make_best_model()
model.fit(X_train, Y_E_train)

pred_train = model.predict(X_train)
res_train = Y_E_train - pred_train

pred_test = model.predict(X_test)
res_test = Y_E_test - pred_test

In [None]:
# Residuals against predicted values for both splits, with a logarithmic
# x-axis for the predictions.
plt.scatter(pred_train, res_train, label="Train residuals")
plt.scatter(pred_test, res_test, label="Test residuals")
plt.legend()
plt.xlabel("Predicted")
plt.xscale("log")
plt.ylabel("Residuals")
plt.title("Residuals - Energy (SVR)")
plt.show()

<a name="energy-score"></a>
## Energy score

<a name="first-linear-regressions"></a>
### First linear regressions

In [None]:
# Feature set 1 for the ENERGY STAR study: neither the "is_ENERGYSTARScore"
# flag nor the "ENERGYSTARScore" value is included.
cat_cols = [
    "BuildingType",
    "PrimaryPropertyType",
    "is_PropertyGFAParking",
    ]
num_cols = [
    "Latitude",
    "Longitude",
    "NumberofBuildings",
    "NumberofFloors",
    "YearBuilt",
    "PropertyGFABuilding(s)",
    "LargestPropertyUseTypeGFA",
    "ratio_SteamUse",
    "ratio_NaturalGas",
    ]
emb_cols = [f"PCA - {i}" for i in range(1,31)]
pred_cols = [
    "SiteEnergyUseWN(kBtu)",
    "mean_SiteEnergyUseWN(kBtu)"
    ]
# Unlike the first definition of `cols`, the target columns are NOT appended
# here: `cols` holds features only.
cols = cat_cols + num_cols + emb_cols

In [None]:
# Rebuild the preprocessing so that it picks up the column lists defined in
# the previous cell.
num_tr = Pipeline([
    ("imputer", KNNImputer()),
    ("scaler", RobustScaler())
    ])
cat_tr = Pipeline([
    ("encoder", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
    ('pca', PCA(n_components=1)),
    ("scaler", "passthrough")
    ])
emb_tr = Pipeline([
    ("selec", VarianceThreshold()),
    ("scaler", RobustScaler())
    ])
prep_v3 = ColumnTransformer([
    ("num", num_tr, num_cols),
    ("cat", cat_tr, cat_cols),
    ("emb", emb_tr, emb_cols),
    ])

In [None]:
# Plain linear regression on the current feature set (`cols`).
lr_pipe = Pipeline([
    ('prep', prep_v3),
    ('scaler', StandardScaler()),
    ('lr', LinearRegression()),
])
lr_pipe.fit(X_train[cols], Y_E_train)
_train_score = lr_pipe.score(X_train[cols], Y_E_train)
_test_score = lr_pipe.score(X_test[cols], Y_E_test)
print(f"Train score:{_train_score}")
print(f"Test score:{_test_score}")

In [None]:
# Feature set 2 for the ENERGY STAR study: the "is_ENERGYSTARScore" flag is
# included, but the "ENERGYSTARScore" value itself is not.
cat_cols = [
    "BuildingType",
    "PrimaryPropertyType",
    "is_ENERGYSTARScore",
    "is_PropertyGFAParking",
    ]
num_cols = [
    "Latitude",
    "Longitude",
    "NumberofBuildings",
    "NumberofFloors",
    "YearBuilt",
    "PropertyGFABuilding(s)",
    "LargestPropertyUseTypeGFA",
    "ratio_SteamUse",
    "ratio_NaturalGas",
    ]
emb_cols = [f"PCA - {i}" for i in range(1,31)]
pred_cols = [
    "SiteEnergyUseWN(kBtu)",
    "mean_SiteEnergyUseWN(kBtu)"
    ]
# `cols` holds features only (the target columns are not appended).
cols = cat_cols + num_cols + emb_cols

In [None]:
# Rebuild the preprocessing so that it picks up the column lists defined in
# the previous cell.
num_tr = Pipeline([
    ("imputer", KNNImputer()),
    ("scaler", RobustScaler())
    ])
cat_tr = Pipeline([
    ("encoder", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
    ('pca', PCA(n_components=1)),
    ("scaler", "passthrough")
    ])
emb_tr = Pipeline([
    ("selec", VarianceThreshold()),
    ("scaler", RobustScaler())
    ])
prep_v3 = ColumnTransformer([
    ("num", num_tr, num_cols),
    ("cat", cat_tr, cat_cols),
    ("emb", emb_tr, emb_cols),
    ])

In [None]:
# Plain linear regression on the current feature set (`cols`).
lr_pipe = Pipeline([
    ('prep', prep_v3),
    ('scaler', StandardScaler()),
    ('lr', LinearRegression()),
])
lr_pipe.fit(X_train[cols], Y_E_train)
_train_score = lr_pipe.score(X_train[cols], Y_E_train)
_test_score = lr_pipe.score(X_test[cols], Y_E_test)
print(f"Train score:{_train_score}")
print(f"Test score:{_test_score}")

In [None]:
# Feature groups for this experiment: same as before, but with the raw
# ENERGYSTARScore added to the numeric features.
cat_cols = [
    "BuildingType",
    "PrimaryPropertyType",
    "is_ENERGYSTARScore",
    "is_PropertyGFAParking",
]
num_cols = [
    "Latitude",
    "Longitude",
    "NumberofBuildings",
    "NumberofFloors",
    "YearBuilt",
    "PropertyGFABuilding(s)",
    "LargestPropertyUseTypeGFA",
    "ENERGYSTARScore",
    "ratio_SteamUse",
    "ratio_NaturalGas",
]
# Embedding columns are named "PCA - 1" ... "PCA - 30".
emb_cols = ["PCA - " + str(idx) for idx in range(1, 31)]
pred_cols = ["SiteEnergyUseWN(kBtu)", "mean_SiteEnergyUseWN(kBtu)"]
# Final feature order: categorical, numeric, then embeddings.
cols = [*cat_cols, *num_cols, *emb_cols]

In [None]:
# Numeric branch: KNN imputation of missing values followed by robust scaling.
num_tr = Pipeline(steps=[
    ("imputer", KNNImputer()),
    ("scaler", RobustScaler()),
])
# Categorical branch: dense one-hot encoding (unknown categories ignored),
# projected onto a single principal component. The trailing "scaler" step is
# a "passthrough" placeholder and does nothing.
cat_tr = Pipeline(steps=[
    ("encoder", OneHotEncoder(sparse_output=False, handle_unknown="ignore")),
    ("pca", PCA(n_components=1)),
    ("scaler", "passthrough"),
])
# Embedding branch: VarianceThreshold with its default threshold, then robust scaling.
emb_tr = Pipeline(steps=[
    ("selec", VarianceThreshold()),
    ("scaler", RobustScaler()),
])
# Send each column group through its own branch; the numeric branch stays first.
prep_v3 = ColumnTransformer(transformers=[
    ("num", num_tr, num_cols),
    ("cat", cat_tr, cat_cols),
    ("emb", emb_tr, emb_cols),
])

In [None]:
# Linear regression on the preprocessed features, standardised once more
# after the column-wise preprocessing.
lr_pipe = Pipeline(steps=[
    ("prep", prep_v3),
    ("scaler", StandardScaler()),
    ("lr", LinearRegression()),
])
lr_pipe.fit(X_train[cols], Y_E_train)
# Pipeline.score delegates to the final estimator's score method.
print(f"Train score:{lr_pipe.score(X_train[cols], Y_E_train)}")
print(f"Test score:{lr_pipe.score(X_test[cols], Y_E_test)}")

<a name="quantitative-results"></a>
### Quantitative results

In [None]:
# Fitted KNN imputer of the numeric branch, fetched by step/transformer name
# ("prep" -> "num" -> "imputer") rather than by position, so the lookup keeps
# working if the ColumnTransformer branches are ever reordered.
imputer = lr_pipe["prep"].named_transformers_["num"]["imputer"]

In [None]:
# Apply the already-fitted imputer to the numeric columns of the full dataset.
X_new = imputer.transform(X.loc[:, num_cols])

In [None]:
# Imputed ENERGYSTARScore values. The column position is resolved from
# num_cols (the column list X_new was computed from) instead of a hard-coded
# index, so a change to num_cols cannot silently select another feature.
# NOTE(review): assumes the imputer kept every num_cols column (KNNImputer can
# drop columns that were entirely missing at fit time) - confirm.
ESS = X_new[:, num_cols.index("ENERGYSTARScore")]

In [None]:
# Histogram of the imputed ENERGYSTARScore, coloured by the is_ENERGYSTARScore flag.
sns.histplot(
    x=ESS,
    hue=X["is_ENERGYSTARScore"],
)

In [None]:
# Density of the imputed ENERGYSTARScore per is_ENERGYSTARScore group, with a
# dotted vertical line at the mean of the rows where the flag is set.
sns.kdeplot(x=ESS, hue=X["is_ENERGYSTARScore"])
# Read the y-limits before drawing the line so it spans the current axis height.
ymin, ymax = plt.ylim()
plt.vlines(
    ESS[X["is_ENERGYSTARScore"]].mean(),
    ymin=ymin,
    ymax=ymax,
    color="orange",
    linestyle=":",
)
plt.xlabel("ENERGYSTARScore (KNN)")
plt.legend(
    title="is_ENERGYSTARScore",
    labels=["True", "False", "Mean (True)"],
)

In [None]:
# Independent deep copy of X; reset_index() moves the old index into a column
# and leaves a fresh 0..n-1 index.
X_2 = X.copy(deep=True).reset_index()

In [None]:
# Overwrite (or create) the ENERGYSTARScore column with the KNN-imputed values;
# ESS is assigned positionally, so it must have one value per row of X_2.
X_2["ENERGYSTARScore"] = ESS

In [None]:
# Re-split using the frame that carries the imputed score: 30% held out,
# fixed seed for a reproducible partition.
X_train, X_test, Y_E_train, Y_E_test = train_test_split(
    X_2,
    Y_E,
    test_size=0.3,
    random_state=18,
)
print(f"[INFO] X_test shape: {X_test.shape}")
print(f"[INFO] X_train shape: {X_train.shape}")

In [None]:
# Baseline: ordinary least squares on the building floor area only.
# (The stray `X_train.columns` expression was removed: it was not the last
# expression of the cell, so it displayed nothing.)
small_cols = [
    "PropertyGFABuilding(s)",
]
small_lr_E = LinearRegression()
small_lr_E.fit(X_train[small_cols], Y_E_train)
# LinearRegression.score returns the R^2 coefficient of determination.
print(f"Train score:{small_lr_E.score(X_train[small_cols], Y_E_train)}")
print(f"Test score:{small_lr_E.score(X_test[small_cols], Y_E_test)}")

In [None]:
# Ordinary least squares on the building floor area plus the
# is_ENERGYSTARScore flag.
# (The stray `X_train.columns` expression was removed: it was not the last
# expression of the cell, so it displayed nothing.)
small_cols = [
    "PropertyGFABuilding(s)",
    "is_ENERGYSTARScore",
]
small_lr_E = LinearRegression()
small_lr_E.fit(X_train[small_cols], Y_E_train)
# LinearRegression.score returns the R^2 coefficient of determination.
print(f"Train score:{small_lr_E.score(X_train[small_cols], Y_E_train)}")
print(f"Test score:{small_lr_E.score(X_test[small_cols], Y_E_test)}")

In [None]:
# Ordinary least squares on the building floor area, the (imputed)
# ENERGYSTARScore and the is_ENERGYSTARScore flag.
# (The stray `X_train.columns` expression was removed: it was not the last
# expression of the cell, so it displayed nothing.)
small_cols = [
    "PropertyGFABuilding(s)",
    "ENERGYSTARScore",
    "is_ENERGYSTARScore",
]
small_lr_E = LinearRegression()
small_lr_E.fit(X_train[small_cols], Y_E_train)
# LinearRegression.score returns the R^2 coefficient of determination.
print(f"Train score:{small_lr_E.score(X_train[small_cols], Y_E_train)}")
print(f"Test score:{small_lr_E.score(X_test[small_cols], Y_E_test)}")

In [None]:
# One-column table of the fitted linear-regression coefficients, one row per feature.
feature_names = small_cols
coefs = pd.Series(
    small_lr_E.coef_,
    index=feature_names,
    name="Coefficients",
).to_frame()

coefs

In [None]:
# Horizontal bar chart of the raw coefficients of the small linear regression.
coefs.plot.barh(figsize=(9, 5))
# The coefficients come from a LinearRegression fit, not an elastic net, so
# the title names the model that is actually plotted.
plt.title("Linear regression coefficients")
# Vertical reference line at zero to separate positive and negative effects.
plt.axvline(x=0, color=".5")
plt.xlabel("Raw coefficient values")
# Extra left margin so the feature names fit.
plt.subplots_adjust(left=0.3)

In [None]:
# Pseudo importance: coefficient times the mean training value of its feature,
# divided by the mean training target.
feature_names = small_cols
means = X_train[small_cols].mean(axis=0)
# coefs.T is a single row with one column per feature; the Series of means is
# aligned on those column labels.
imps = (coefs.T * means) / Y_E_train.mean()

In [None]:
# Horizontal bar chart of the normalised pseudo importances.
imps.plot.barh(figsize=(9, 5))
plt.axvline(x=0, color=".5")  # zero reference line
plt.title("LR pseudo importance")
plt.ylabel("")
plt.xlabel("Normalised values")
plt.subplots_adjust(left=0.3)  # extra left margin for the tick labels

In [None]:
# 5-fold cross-validation of a linear regression on the floor area only.
# Collects R^2, mean absolute error and fit time for every fold.
train_scores = []
test_scores = []
fit_times = []
train_mae = []
test_mae = []
# The feature subset is the same for every fold, so it is defined once.
small_cols = [
    "PropertyGFABuilding(s)",
]
# Seeded shuffle (18 is the seed of the train/test split in this notebook) so
# the folds, and therefore the reported scores, are reproducible.
kf = KFold(n_splits=5, shuffle=True, random_state=18)
# kf.split yields positional indices: both frames get a fresh 0..n-1 index and
# are sliced with .iloc so positions and rows always agree.
X = X_2.reset_index(drop=True)
Y_E = Y_E.reset_index(drop=True)
for train, test in kf.split(Y_E):
    model = LinearRegression()
    X_fold_train, Y_fold_train = X.iloc[train][small_cols], Y_E.iloc[train]
    X_fold_test, Y_fold_test = X.iloc[test][small_cols], Y_E.iloc[test]
    t0 = time()
    model.fit(X_fold_train, Y_fold_train)
    fit_times.append(time() - t0)
    train_scores.append(model.score(X_fold_train, Y_fold_train))
    test_scores.append(model.score(X_fold_test, Y_fold_test))
    # scikit-learn metrics expect (y_true, y_pred).
    train_mae.append(mean_absolute_error(Y_fold_train, model.predict(X_fold_train)))
    test_mae.append(mean_absolute_error(Y_fold_test, model.predict(X_fold_test)))

In [None]:
# Cross-validation summary: mean +- standard deviation over the folds,
# emitted with a single print call.
print("\n".join([
    f"[INFO] Train score: {np.mean(train_scores):.3f} +- {np.std(train_scores):.3f}",
    f"[INFO] Test score: {np.mean(test_scores):.3f} +- {np.std(test_scores):.3f}",
    f"[INFO] Train mae: {np.mean(train_mae):.3f} +- {np.std(train_mae):.3f}",
    f"[INFO] Test mae: {np.mean(test_mae):.3f} +- {np.std(test_mae):.3f}",
    f"[INFO] Fit time: {np.mean(fit_times):.3f} +- {np.std(fit_times):.3f}",
]))

In [None]:
# 5-fold cross-validation of a linear regression on the floor area plus the
# is_ENERGYSTARScore flag. Collects R^2, mean absolute error and fit time for
# every fold.
train_scores = []
test_scores = []
fit_times = []
train_mae = []
test_mae = []
# The feature subset is the same for every fold, so it is defined once.
small_cols = [
    "PropertyGFABuilding(s)",
    "is_ENERGYSTARScore",
]
# Seeded shuffle (18 is the seed of the train/test split in this notebook) so
# the folds, and therefore the reported scores, are reproducible.
kf = KFold(n_splits=5, shuffle=True, random_state=18)
# kf.split yields positional indices: both frames get a fresh 0..n-1 index and
# are sliced with .iloc so positions and rows always agree.
X = X_2.reset_index(drop=True)
Y_E = Y_E.reset_index(drop=True)
for train, test in kf.split(Y_E):
    model = LinearRegression()
    X_fold_train, Y_fold_train = X.iloc[train][small_cols], Y_E.iloc[train]
    X_fold_test, Y_fold_test = X.iloc[test][small_cols], Y_E.iloc[test]
    t0 = time()
    model.fit(X_fold_train, Y_fold_train)
    fit_times.append(time() - t0)
    train_scores.append(model.score(X_fold_train, Y_fold_train))
    test_scores.append(model.score(X_fold_test, Y_fold_test))
    # scikit-learn metrics expect (y_true, y_pred).
    train_mae.append(mean_absolute_error(Y_fold_train, model.predict(X_fold_train)))
    test_mae.append(mean_absolute_error(Y_fold_test, model.predict(X_fold_test)))

In [None]:
# Cross-validation summary: mean +- standard deviation over the folds,
# emitted with a single print call.
print("\n".join([
    f"[INFO] Train score: {np.mean(train_scores):.3f} +- {np.std(train_scores):.3f}",
    f"[INFO] Test score: {np.mean(test_scores):.3f} +- {np.std(test_scores):.3f}",
    f"[INFO] Train mae: {np.mean(train_mae):.3f} +- {np.std(train_mae):.3f}",
    f"[INFO] Test mae: {np.mean(test_mae):.3f} +- {np.std(test_mae):.3f}",
    f"[INFO] Fit time: {np.mean(fit_times):.3f} +- {np.std(fit_times):.3f}",
]))

In [None]:
# 5-fold cross-validation of a linear regression on the floor area plus the
# (imputed) ENERGYSTARScore. Collects R^2, mean absolute error and fit time
# for every fold.
train_scores = []
test_scores = []
fit_times = []
train_mae = []
test_mae = []
# The feature subset is the same for every fold, so it is defined once.
small_cols = [
    "PropertyGFABuilding(s)",
    "ENERGYSTARScore",
]
# Seeded shuffle (18 is the seed of the train/test split in this notebook) so
# the folds, and therefore the reported scores, are reproducible.
kf = KFold(n_splits=5, shuffle=True, random_state=18)
# kf.split yields positional indices: both frames get a fresh 0..n-1 index and
# are sliced with .iloc so positions and rows always agree.
X = X_2.reset_index(drop=True)
Y_E = Y_E.reset_index(drop=True)
for train, test in kf.split(Y_E):
    model = LinearRegression()
    X_fold_train, Y_fold_train = X.iloc[train][small_cols], Y_E.iloc[train]
    X_fold_test, Y_fold_test = X.iloc[test][small_cols], Y_E.iloc[test]
    t0 = time()
    model.fit(X_fold_train, Y_fold_train)
    fit_times.append(time() - t0)
    train_scores.append(model.score(X_fold_train, Y_fold_train))
    test_scores.append(model.score(X_fold_test, Y_fold_test))
    # scikit-learn metrics expect (y_true, y_pred).
    train_mae.append(mean_absolute_error(Y_fold_train, model.predict(X_fold_train)))
    test_mae.append(mean_absolute_error(Y_fold_test, model.predict(X_fold_test)))

In [None]:
# Cross-validation summary: mean +- standard deviation over the folds,
# emitted with a single print call.
print("\n".join([
    f"[INFO] Train score: {np.mean(train_scores):.3f} +- {np.std(train_scores):.3f}",
    f"[INFO] Test score: {np.mean(test_scores):.3f} +- {np.std(test_scores):.3f}",
    f"[INFO] Train mae: {np.mean(train_mae):.3f} +- {np.std(train_mae):.3f}",
    f"[INFO] Test mae: {np.mean(test_mae):.3f} +- {np.std(test_mae):.3f}",
    f"[INFO] Fit time: {np.mean(fit_times):.3f} +- {np.std(fit_times):.3f}",
]))

In [None]:
# 5-fold cross-validation of a linear regression on the floor area, the
# (imputed) ENERGYSTARScore and the is_ENERGYSTARScore flag. Collects R^2,
# mean absolute error and fit time for every fold.
train_scores = []
test_scores = []
fit_times = []
train_mae = []
test_mae = []
# The feature subset is the same for every fold, so it is defined once.
small_cols = [
    "PropertyGFABuilding(s)",
    "ENERGYSTARScore",
    "is_ENERGYSTARScore",
]
# Seeded shuffle (18 is the seed of the train/test split in this notebook) so
# the folds, and therefore the reported scores, are reproducible.
kf = KFold(n_splits=5, shuffle=True, random_state=18)
# kf.split yields positional indices: both frames get a fresh 0..n-1 index and
# are sliced with .iloc so positions and rows always agree.
X = X_2.reset_index(drop=True)
Y_E = Y_E.reset_index(drop=True)
for train, test in kf.split(Y_E):
    model = LinearRegression()
    X_fold_train, Y_fold_train = X.iloc[train][small_cols], Y_E.iloc[train]
    X_fold_test, Y_fold_test = X.iloc[test][small_cols], Y_E.iloc[test]
    t0 = time()
    model.fit(X_fold_train, Y_fold_train)
    fit_times.append(time() - t0)
    train_scores.append(model.score(X_fold_train, Y_fold_train))
    test_scores.append(model.score(X_fold_test, Y_fold_test))
    # scikit-learn metrics expect (y_true, y_pred).
    train_mae.append(mean_absolute_error(Y_fold_train, model.predict(X_fold_train)))
    test_mae.append(mean_absolute_error(Y_fold_test, model.predict(X_fold_test)))

In [None]:
# Cross-validation summary: mean +- standard deviation over the folds,
# emitted with a single print call.
print("\n".join([
    f"[INFO] Train score: {np.mean(train_scores):.3f} +- {np.std(train_scores):.3f}",
    f"[INFO] Test score: {np.mean(test_scores):.3f} +- {np.std(test_scores):.3f}",
    f"[INFO] Train mae: {np.mean(train_mae):.3f} +- {np.std(train_mae):.3f}",
    f"[INFO] Test mae: {np.mean(test_mae):.3f} +- {np.std(test_mae):.3f}",
    f"[INFO] Fit time: {np.mean(fit_times):.3f} +- {np.std(fit_times):.3f}",
]))

<a name="feature-importance"></a>
### Feature importance

In [None]:
# Fit the first "best" pipeline and explain its inner SVR regressor with SHAP.
model_pipe = make_best_model()
model_pipe.fit(X_train, Y_E_train)
# `regressor_` is the fitted estimator held by the "svr" step.
model = model_pipe["svr"].regressor_
feature_names = model_pipe["prep"].get_feature_names_out()
# Rebuild the regressor's input by hand: "prep" output passed through "scaler",
# labelled with the feature names reported by "prep".
X_test_prep = pd.DataFrame(
    model_pipe["scaler"].transform(model_pipe["prep"].transform(X_test)),
    columns=feature_names,
)

# The first 50 prepared test rows serve both as the background sample and as
# the rows being explained.
explainer = shap.KernelExplainer(
    model=model.predict,
    data=X_test_prep.iloc[:50],
    link="identity",
)
shap_values = explainer.shap_values(X_test_prep.iloc[:50])

In [None]:
# SHAP summary plot for the same 50 rows the values were computed on.
shap.summary_plot(shap_values, features=X_test_prep.iloc[:50])

In [None]:
# Permutation importance of the second "best" pipeline on the prepared test set.
model_pipe = make_best_model_2()
model_pipe.fit(X_train, Y_E_train)
model = model_pipe["rf"].regressor_
feature_names = model_pipe['prep'].get_feature_names_out()
X_test_prep = model_pipe["scaler"].transform(model_pipe["prep"].transform(X_test))

# Score the "rf" step itself rather than its inner `regressor_`: the
# `regressor_` attribute indicates a target-transforming wrapper (presumably
# TransformedTargetRegressor - confirm in make_best_model_2), whose inner
# estimator predicts on the transformed target scale, while Y_E_test is on the
# original scale. The wrapper maps predictions back before scoring.
r = permutation_importance(model_pipe["rf"], X_test_prep, Y_E_test,
                           n_repeats=30,
                           random_state=0)
# Pad names to the longest feature name so names and values stay separated.
name_width = max(map(len, feature_names), default=8) + 2
# List features from most to least important, keeping only those whose mean
# importance is more than two standard deviations above zero.
for i in r.importances_mean.argsort()[::-1]:
    if r.importances_mean[i] - 2 * r.importances_std[i] > 0:
        print(f"{feature_names[i]:<{name_width}}"
              f"{r.importances_mean[i]:.3f}"
              f" +/- {r.importances_std[i]:.3f}")

In [None]:
# Refit the second "best" pipeline and compute SHAP values for its inner
# tree-based regressor on the whole prepared test set.
model_pipe = make_best_model_2()
model_pipe.fit(X_train, Y_E_train)
# `regressor_` is the fitted estimator held by the "rf" step.
model = model_pipe["rf"].regressor_
feature_names = model_pipe["prep"].get_feature_names_out()
# Rebuild the regressor's input by hand: "prep" output passed through "scaler",
# labelled with the feature names reported by "prep".
X_test_prep = pd.DataFrame(
    model_pipe["scaler"].transform(model_pipe["prep"].transform(X_test)),
    columns=feature_names,
)

explainer_rf = shap.TreeExplainer(model)
shap_values = explainer_rf.shap_values(X_test_prep)

In [None]:
# Bar-style SHAP summary plot for the prepared test set.
shap.summary_plot(
    shap_values,
    features=X_test_prep,
    plot_type="bar",
)

In [None]:
# Default SHAP summary plot for the prepared test set.
shap.summary_plot(shap_values, features=X_test_prep)