# Second prediction notebook

Prediction of the CO2 emissions.

* [Imports](#imports)
* [Data loading](#data-loading)
* [Feature selection](#feature-selection)
    * [Selection](#selection-pipeline)
* [Prediction](#prediction)
    * [Linear prediction](#linear-prediction)
    * [SVR prediction](#svr-prediction)
    * [Ensemble prediction](#ensemble-prediction)
* [Evaluation](#evaluation)
    * [Evaluation](#evaluation)
    * [Feature importance](#feature-importance)

<a name="imports"></a>
## Imports

In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer, TransformedTargetRegressor
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import SelectPercentile, chi2
from sklearn.preprocessing import StandardScaler, OneHotEncoder, RobustScaler, MinMaxScaler
from sklearn.impute import KNNImputer

from sklearn.dummy import DummyRegressor

from sklearn.decomposition import PCA
from sklearn.linear_model import Ridge, ElasticNet, LinearRegression
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor

from sklearn.metrics import r2_score, d2_absolute_error_score
from sklearn.metrics import explained_variance_score
from sklearn.metrics import mean_squared_error, mean_absolute_error

<a name="data-loading"></a>
## Data loading

In [None]:
# Fetch the cleaned dataset (v1) from Google Drive unless a local copy already exists.
cleaned_dataset_v1_path = 'data_cleaned_v1.csv'
if not os.path.exists(cleaned_dataset_v1_path):
    # IPython shell escape: quiet download with a progress bar, written to the path above.
    !wget "https://drive.google.com/uc?export=download&id=1EYQuRaqc4yo0QYl35DRoIC6-IDwaX7Wq" -q --show-progress -O "$cleaned_dataset_v1_path"
# Print the first two lines of the file as a quick sanity check.
!head -2 $cleaned_dataset_v1_path

In [None]:
# Fetch the cleaned dataset (v2) from Google Drive unless a local copy already exists.
cleaned_dataset_v2_path = 'data_cleaned_v2.csv'
if not os.path.exists(cleaned_dataset_v2_path):
    # IPython shell escape: quiet download with a progress bar, written to the path above.
    !wget "https://drive.google.com/uc?export=download&id=14_YIQ_v0D8PGwSkzb8KtbQIcAvlUeCLo" -q --show-progress -O "$cleaned_dataset_v2_path"
# Print the first two lines of the file as a quick sanity check.
!head -2 $cleaned_dataset_v2_path

In [None]:
# Load the v2 cleaned dataset and print its column/dtype/non-null summary.
df_v2 = pd.read_csv(filepath_or_buffer=cleaned_dataset_v2_path)
df_v2.info()

In [None]:
# Number of missing values per column.
df_v2.isna().sum()

<a name="feature-selection"></a>
## Feature selection

<a name="selection-pipeline"></a>
### Selection

In [None]:
# Columns routed to the categorical branch of the preprocessing
# (one-hot encoded, then reduced with PCA).
cat_cols = [
    "BuildingType", "PrimaryPropertyType",
    "NumberofBuildings", "NumberofFloors",
    "is_ENERGYSTARScore", "is_SteamUse",
    "is_NaturalGas", "is_PropertyGFAParking",
]
# Columns routed to the numerical branch (imputed, then scaled).
num_cols = [
    "Latitude", "Longitude", "YearBuilt",
    "PropertyGFABuilding(s)", "LargestPropertyUseTypeGFA",
    "ENERGYSTARScore", "ratio_SteamUse", "ratio_NaturalGas",
]
# Prediction targets: energy use first, CO2 emissions last.
pred_cols = ["SiteEnergyUseWN(kBtu)", "TotalGHGEmissions"]

# Features first, the two targets at the end; later cells rely on this order.
cols = [*cat_cols, *num_cols, *pred_cols]
df_pred_v2 = df_v2[cols]

In [None]:
# The last two entries of `cols` are the targets; everything before is a feature.
*feature_cols, energy_col, co2_col = cols
X = df_pred_v2[feature_cols]
Y_E = df_pred_v2[energy_col]
Y_CO2 = df_pred_v2[co2_col]

In [None]:
# Single 70/30 split shared by the features and both targets, so rows stay aligned.
split = train_test_split(X, Y_E, Y_CO2, test_size=0.3, random_state=6)
X_train, X_test, Y_E_train, Y_E_test, Y_CO2_train, Y_CO2_test = split

for label, frame in (("X_test", X_test), ("X_train", X_train)):
    print(f"[INFO] {label} shape: {frame.shape}")

In [None]:
# Numerical branch: fill missing values with KNN imputation, then standardise.
num_tr = Pipeline(steps=[
    ("imputer", KNNImputer()),
    ("scaler", StandardScaler()),
])

# Categorical branch: dense one-hot encoding (unknown categories are ignored
# at transform time), followed by a PCA whose size is tuned later.
cat_tr = Pipeline(steps=[
    ("encoder", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
    ("pca", PCA()),
])

# Full preprocessing: numerical outputs first, then the categorical ones.
prep_v2 = ColumnTransformer(transformers=[
    ("num", num_tr, num_cols),
    ("cat", cat_tr, cat_cols),
])

In [None]:
# Summary statistics of the CO2 target on the training split.
Y_CO2_train.describe()

In [None]:
# Summary statistics of the CO2 target on the test split (compare with the training split).
Y_CO2_test.describe()

## Baseline

### Dummy regressor

In [None]:
# Feature-agnostic baseline fitted on the CO2 target.
# NOTE(review): `quantile` is only used by strategy="quantile"; with
# strategy="mean" it has no effect on the predictions.
dummy_E = DummyRegressor(strategy="mean", quantile=0.7).fit(X_train, Y_CO2_train)

train_score = dummy_E.score(X_train, Y_CO2_train)
print(f"Train score:{train_score}")
test_score = dummy_E.score(X_test, Y_CO2_test)
print(f"Test score:{test_score}")

In [None]:
# Grid search over the quantile predicted by the dummy baseline.
# The strategy is set to "quantile" in the grid: `dummy_E` is built with
# strategy="mean", under which `quantile` is ignored and every candidate
# would produce exactly the same score.
param_grid = {
    "strategy": ["quantile"],
    "quantile": np.linspace(0, 1, 10),
}
s_dummy_E = GridSearchCV(dummy_E, param_grid, n_jobs=-1, return_train_score=True)
s_dummy_E.fit(X_train, Y_CO2_train)
print("Best parameter (CV score=%0.3f):" % s_dummy_E.best_score_)
print(s_dummy_E.best_params_)
res = pd.DataFrame(s_dummy_E.cv_results_)
# Report train and validation scores side by side, like the other searches.
print(res[["mean_train_score", "std_train_score", "mean_test_score", "std_test_score"]])
print(f"Test score:{s_dummy_E.best_estimator_.score(X_test, Y_CO2_test)}")

### Small linear regression

In [None]:
# One-feature baseline: ordinary least squares on "PropertyGFABuilding(s)" only.
# (The stray `X_train.columns` expression was removed: it was not the last
# statement of the cell, so it was never displayed and had no effect.)
small_cols = ["PropertyGFABuilding(s)"]
small_lr_E = LinearRegression()
small_lr_E.fit(X_train[small_cols], Y_CO2_train)
print(f"Train score:{small_lr_E.score(X_train[small_cols], Y_CO2_train)}")
print(f"Test score:{small_lr_E.score(X_test[small_cols], Y_CO2_test)}")

In [None]:
# Two-feature baseline: ordinary least squares on "YearBuilt" and
# "PropertyGFABuilding(s)".
# (The stray `X_train.columns` expression was removed: it was not the last
# statement of the cell, so it was never displayed and had no effect.)
small_cols = ["YearBuilt", "PropertyGFABuilding(s)"]
small_lr_E = LinearRegression()
small_lr_E.fit(X_train[small_cols], Y_CO2_train)
print(f"Train score:{small_lr_E.score(X_train[small_cols], Y_CO2_train)}")
print(f"Test score:{small_lr_E.score(X_test[small_cols], Y_CO2_test)}")

<a name="prediction"></a>
## Prediction

<a name="linear-prediction"></a>
### Linear prediction

In [None]:
# Linear model: shared preprocessing, a second scaling over all preprocessed
# columns, then an ElasticNet regressor.
lin_steps = [
    ("prep", prep_v2),
    ("scaler", StandardScaler()),
    ("lr_en", ElasticNet()),
]
lin_pipe_v2 = Pipeline(steps=lin_steps)
lin_pipe_v2

In [None]:
# Joint search over the PCA size of the categorical branch, the scaler of the
# numerical branch and the ElasticNet regularisation.
param_grid = dict(
    prep__cat__pca__n_components=[5, 17, 35, 50, 60, None],
    prep__num__scaler=[StandardScaler(), RobustScaler(), MinMaxScaler()],
    lr_en__alpha=np.logspace(-4, 4, 5),
    lr_en__l1_ratio=np.logspace(-4, 0, 5),
)
s_lin_E = GridSearchCV(
    estimator=lin_pipe_v2,
    param_grid=param_grid,
    n_jobs=-1,
    return_train_score=True,
).fit(X_train, Y_CO2_train)

print("Best parameter (CV score=%0.3f):" % s_lin_E.best_score_)
print(s_lin_E.best_params_)

# Train vs. validation scores for every candidate.
res = pd.DataFrame(s_lin_E.cv_results_)
score_cols = ["mean_train_score", "std_train_score", "mean_test_score", "std_test_score"]
print(res[score_cols])

# Score of the refitted best pipeline on the held-out test split.
test_score = s_lin_E.best_estimator_.score(X_test, Y_CO2_test)
print(f"Test score:{test_score}")

In [None]:
# Number of PCA components retained for the categorical branch; used for the
# PCA fitted here and for the marker drawn on the variance plot.
n_components_chosen = 50

# Fit the one-hot encoder + PCA on the categorical training columns only,
# to look at the explained-variance profile.
encoder = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
pca = PCA(n_components=n_components_chosen)
pca.fit(encoder.fit_transform(X_train[cat_cols]))

fig, (ax0, ax1) = plt.subplots(nrows=2, sharex=True, figsize=(6, 6))
ax0.plot(
    np.arange(1, pca.n_components_ + 1), pca.explained_variance_ratio_, "+", linewidth=2
)
ax0.set_ylabel("PCA explained variance ratio")

ax0.axvline(
    n_components_chosen,
    linestyle=":",
    label="n_components chosen",
)
ax0.legend(prop=dict(size=12))

# For each number of components, keep the grid-search candidate with the best
# mean validation score.
results = pd.DataFrame(s_lin_E.cv_results_)
components_col = "param_prep__cat__pca__n_components"
# Rows with n_components=None (keep everything) have no position on a numeric
# axis, and failed fits have a NaN score: drop both explicitly.
scored = results.dropna(subset=[components_col, "mean_test_score"])
# idxmax replaces groupby(...).apply(nlargest), which operates on the grouping
# column and is deprecated in recent pandas versions.
best_idx = scored.groupby(components_col)["mean_test_score"].idxmax()
best_clfs = scored.loc[best_idx].copy()
# The parameter column can be object-typed; cast it so the x axis is numeric
# and lines up with the shared axis of the variance plot.
best_clfs[components_col] = best_clfs[components_col].astype(int)

best_clfs.plot(
    x=components_col, y="mean_test_score", yerr="std_test_score", legend=False, ax=ax1
)
# The search uses the pipeline's default regression score (R^2), not an accuracy.
ax1.set_ylabel("$R^2$ (val)")
ax1.set_xlabel("n_components")

plt.xlim(-1, 70)

plt.tight_layout()
plt.show()

In [None]:
# Rebuild the preprocessing with the PCA size fixed to 50 components.

# Numerical branch: KNN imputation followed by standardisation.
num_tr = Pipeline(steps=[
    ("imputer", KNNImputer()),
    ("scaler", StandardScaler()),
])

# Categorical branch: dense one-hot encoding, then a 50-component PCA.
cat_tr = Pipeline(steps=[
    ("encoder", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
    ("pca", PCA(n_components=50)),
])

# Numerical outputs come first, categorical (PCA) outputs second.
prep_v2 = ColumnTransformer(transformers=[
    ("num", num_tr, num_cols),
    ("cat", cat_tr, cat_cols),
])

In [None]:
# ElasticNet trained on log(y); predictions are mapped back with exp.
# NOTE(review): np.log needs strictly positive targets — confirm that
# TotalGHGEmissions contains no zero or negative values.
lr_tr = TransformedTargetRegressor(
    regressor=ElasticNet(),
    func=np.log,
    inverse_func=np.exp,
)

lin_tr_steps = [
    ("prep", prep_v2),
    ("scaler", StandardScaler()),
    ("lr_en", lr_tr),
]
lin_tr_pipe_v2 = Pipeline(steps=lin_tr_steps)
lin_tr_pipe_v2

In [None]:
# Search the ElasticNet regularisation of the target-transformed linear pipeline.
param_grid = dict(
    lr_en__regressor__alpha=np.logspace(-6, 6, 6),
    lr_en__regressor__l1_ratio=np.logspace(-6, 0, 6),
)
s_lin_tr_E = GridSearchCV(
    estimator=lin_tr_pipe_v2,
    param_grid=param_grid,
    n_jobs=-1,
    return_train_score=True,
).fit(X_train, Y_CO2_train)

print("Best parameter (CV score=%0.3f):" % s_lin_tr_E.best_score_)
print(s_lin_tr_E.best_params_)

# Train vs. validation scores for every candidate.
res = pd.DataFrame(s_lin_tr_E.cv_results_)
score_cols = ["mean_train_score", "std_train_score", "mean_test_score", "std_test_score"]
print(res[score_cols])

# Score of the refitted best pipeline on the held-out test split.
test_score = s_lin_tr_E.best_estimator_.score(X_test, Y_CO2_test)
print(f"Test score:{test_score}")

In [None]:
# ElasticNet trained on sqrt(y); predictions are mapped back by squaring.
lr = TransformedTargetRegressor(regressor=ElasticNet(),
                                func=np.sqrt,
                                inverse_func=np.square)

# Use the sqrt-transformed regressor defined just above. The pipeline
# previously referenced `lr_tr` (the log-transformed regressor from the
# earlier cell), so the sqrt variant was never actually evaluated.
lin_tr_pipe_v2 = Pipeline([
    ('prep', prep_v2),
    ('scaler', StandardScaler()),
    ('lr_en', lr),
    ])
lin_tr_pipe_v2

In [None]:
# Same ElasticNet search as before, run on the current `lin_tr_pipe_v2`.
# This rebinds `s_lin_tr_E`, so the earlier search result is replaced.
param_grid = dict(
    lr_en__regressor__alpha=np.logspace(-6, 6, 6),
    lr_en__regressor__l1_ratio=np.logspace(-6, 0, 6),
)
s_lin_tr_E = GridSearchCV(
    estimator=lin_tr_pipe_v2,
    param_grid=param_grid,
    n_jobs=-1,
    return_train_score=True,
).fit(X_train, Y_CO2_train)

print("Best parameter (CV score=%0.3f):" % s_lin_tr_E.best_score_)
print(s_lin_tr_E.best_params_)

# Train vs. validation scores for every candidate.
res = pd.DataFrame(s_lin_tr_E.cv_results_)
score_cols = ["mean_train_score", "std_train_score", "mean_test_score", "std_test_score"]
print(res[score_cols])

# Score of the refitted best pipeline on the held-out test split.
test_score = s_lin_tr_E.best_estimator_.score(X_test, Y_CO2_test)
print(f"Test score:{test_score}")

<a name="svr-prediction"></a>
### SVR prediction

In [None]:
# Support-vector regressor with default settings; C and epsilon are tuned
# by the grid search. (The original `svr = regressor=SVR()` was a chained
# assignment that also created an unused global named `regressor`.)
svr = SVR()

# SVR pipeline: shared preprocessing, min-max scaling of all preprocessed
# columns, then the regressor.
svr_pipe_v2 = Pipeline([
    ('prep', prep_v2),
    ('scaler', MinMaxScaler()),
    ('svr', svr),
    ])
svr_pipe_v2

In [None]:
# Search the SVR penalty C and the epsilon-tube width on log-spaced grids.
param_grid = dict(
    svr__C=np.logspace(-4, 4, 5),
    svr__epsilon=np.logspace(-4, 4, 5),
)
s_svr_E = GridSearchCV(
    estimator=svr_pipe_v2,
    param_grid=param_grid,
    n_jobs=-1,
    return_train_score=True,
).fit(X_train, Y_CO2_train)

print("Best parameter (CV score=%0.3f):" % s_svr_E.best_score_)
print(s_svr_E.best_params_)

# Train vs. validation scores for every candidate.
res = pd.DataFrame(s_svr_E.cv_results_)
score_cols = ["mean_train_score", "std_train_score", "mean_test_score", "std_test_score"]
print(res[score_cols])

# Score of the refitted best pipeline on the held-out test split.
test_score = s_svr_E.best_estimator_.score(X_test, Y_CO2_test)
print(f"Test score:{test_score}")

In [None]:
# SVR trained on sqrt(y); predictions are mapped back by squaring.
svr_tr = TransformedTargetRegressor(
    regressor=SVR(),
    func=np.sqrt,
    inverse_func=np.square,
)

svr_tr_steps = [
    ("prep", prep_v2),
    ("scaler", MinMaxScaler()),
    ("svr", svr_tr),
]
svr_tr_pipe_v2 = Pipeline(steps=svr_tr_steps)
svr_tr_pipe_v2

In [None]:
# Search C and epsilon of the SVR wrapped in the target transformer.
param_grid = dict(
    svr__regressor__C=np.logspace(-4, 4, 5),
    svr__regressor__epsilon=np.logspace(-4, 4, 5),
)
s_svr_tr_E = GridSearchCV(
    estimator=svr_tr_pipe_v2,
    param_grid=param_grid,
    n_jobs=-1,
    return_train_score=True,
).fit(X_train, Y_CO2_train)

print("Best parameter (CV score=%0.3f):" % s_svr_tr_E.best_score_)
print(s_svr_tr_E.best_params_)

# Train vs. validation scores for every candidate.
res = pd.DataFrame(s_svr_tr_E.cv_results_)
score_cols = ["mean_train_score", "std_train_score", "mean_test_score", "std_test_score"]
print(res[score_cols])

# Score of the refitted best pipeline on the held-out test split.
test_score = s_svr_tr_E.best_estimator_.score(X_test, Y_CO2_test)
print(f"Test score:{test_score}")

<a name="ensemble-prediction"></a>
### Ensemble prediction

In [None]:
# Random-forest pipeline: shared preprocessing, standardisation, then the forest.
rf_steps = [
    ("prep", prep_v2),
    ("scaler", StandardScaler()),
    ("rf", RandomForestRegressor()),
]
rf_pipe_v2 = Pipeline(steps=rf_steps)
rf_pipe_v2

In [None]:
# Search forest size, tree depth and minimum leaf size with 5-fold CV.
param_grid = dict(
    rf__n_estimators=[70, 100],
    rf__max_depth=[3, 5, 7],
    rf__min_samples_leaf=[3, 7],
)
s_rf_E = GridSearchCV(
    estimator=rf_pipe_v2,
    param_grid=param_grid,
    n_jobs=-1,
    cv=5,
    return_train_score=True,
).fit(X_train, Y_CO2_train)

print("Best parameter (CV score=%0.3f):" % s_rf_E.best_score_)
print(s_rf_E.best_params_)

# Train vs. validation scores for every candidate.
res = pd.DataFrame(s_rf_E.cv_results_)
score_cols = ["mean_train_score", "std_train_score", "mean_test_score", "std_test_score"]
print(res[score_cols])

# Score of the refitted best pipeline on the held-out test split.
test_score = s_rf_E.best_estimator_.score(X_test, Y_CO2_test)
print(f"Test score:{test_score}")

In [None]:
# Random forest trained on sqrt(y); predictions are mapped back by squaring.
rf_tr = TransformedTargetRegressor(
    regressor=RandomForestRegressor(),
    func=np.sqrt,
    inverse_func=np.square,
)

rf_tr_steps = [
    ("prep", prep_v2),
    ("scaler", StandardScaler()),
    ("rf", rf_tr),
]
rf_tr_pipe_v2 = Pipeline(steps=rf_tr_steps)
rf_tr_pipe_v2

In [None]:
# Search forest size, tree depth and minimum leaf size for the
# target-transformed random forest.
param_grid = dict(
    rf__regressor__n_estimators=[70, 100, 150],
    rf__regressor__max_depth=[3, 5, 7],
    rf__regressor__min_samples_leaf=[2, 3],
)
s_rf_tr_E = GridSearchCV(
    estimator=rf_tr_pipe_v2,
    param_grid=param_grid,
    n_jobs=-1,
    return_train_score=True,
).fit(X_train, Y_CO2_train)

print("Best parameter (CV score=%0.3f):" % s_rf_tr_E.best_score_)
print(s_rf_tr_E.best_params_)

# Train vs. validation scores for every candidate.
res = pd.DataFrame(s_rf_tr_E.cv_results_)
score_cols = ["mean_train_score", "std_train_score", "mean_test_score", "std_test_score"]
print(res[score_cols])

# Score of the refitted best pipeline on the held-out test split.
test_score = s_rf_tr_E.best_estimator_.score(X_test, Y_CO2_test)
print(f"Test score:{test_score}")

### XGBoost

In [None]:
# Gradient-boosted trees with default settings; tuned by the grid search.
xgb = XGBRegressor()

# XGBoost pipeline: shared preprocessing, standardisation, then the booster.
xgb_steps = [
    ("prep", prep_v2),
    ("scaler", StandardScaler()),
    ("reg", xgb),
]
xgb_pipe_v2 = Pipeline(steps=xgb_steps)
xgb_pipe_v2

In [None]:
# Search the number of boosting rounds, tree depth and gamma with 5-fold CV.
param_grid = dict(
    reg__n_estimators=[150, 200],
    reg__max_depth=[3, 5],
    reg__gamma=np.logspace(-4, -1, 4),
)
s_xgb_E = GridSearchCV(
    estimator=xgb_pipe_v2,
    param_grid=param_grid,
    n_jobs=-1,
    cv=5,
    return_train_score=True,
).fit(X_train, Y_CO2_train)

print("Best parameter (CV score=%0.3f):" % s_xgb_E.best_score_)
print(s_xgb_E.best_params_)

# Train vs. validation scores for every candidate.
res = pd.DataFrame(s_xgb_E.cv_results_)
score_cols = ["mean_train_score", "std_train_score", "mean_test_score", "std_test_score"]
print(res[score_cols])

# Score of the refitted best pipeline on the held-out test split.
test_score = s_xgb_E.best_estimator_.score(X_test, Y_CO2_test)
print(f"Test score:{test_score}")

In [None]:
# XGBoost trained on sqrt(y); predictions are mapped back by squaring.
xgb_tr = TransformedTargetRegressor(regressor=XGBRegressor(),
                                func=np.sqrt,
                                inverse_func=np.square)

xgb_tr_pipe_v2 = Pipeline([
    ('prep', prep_v2),
    ('scaler', StandardScaler()),
    ('reg', xgb_tr),
    ])
# Display the pipeline built in this cell (the cell previously displayed the
# untransformed `xgb_pipe_v2` by mistake).
xgb_tr_pipe_v2

In [None]:
# Search the number of boosting rounds, tree depth and gamma for the
# target-transformed XGBoost pipeline, with 5-fold CV.
param_grid = dict(
    reg__regressor__n_estimators=[150, 200],
    reg__regressor__max_depth=[3, 5],
    reg__regressor__gamma=np.logspace(-4, -1, 4),
)
s_xgb_tr_E = GridSearchCV(
    estimator=xgb_tr_pipe_v2,
    param_grid=param_grid,
    n_jobs=-1,
    cv=5,
    return_train_score=True,
).fit(X_train, Y_CO2_train)

print("Best parameter (CV score=%0.3f):" % s_xgb_tr_E.best_score_)
print(s_xgb_tr_E.best_params_)

# Train vs. validation scores for every candidate.
res = pd.DataFrame(s_xgb_tr_E.cv_results_)
score_cols = ["mean_train_score", "std_train_score", "mean_test_score", "std_test_score"]
print(res[score_cols])

# Score of the refitted best pipeline on the held-out test split.
test_score = s_xgb_tr_E.best_estimator_.score(X_test, Y_CO2_test)
print(f"Test score:{test_score}")

<a name="evaluation"></a>
## Evaluation

<a name="evaluation"></a>
### Evaluation

In [None]:
# Regression metrics computed on the held-out test split; every callable is
# invoked as metric(y_true, y_pred).
metrics = {
    "r2": r2_score,
    "d2": d2_absolute_error_score,
    "exp_va": explained_variance_score,
    "mse": mean_squared_error,
    "rmse": lambda *arg, **kwarg: mean_squared_error(*arg, **kwarg) ** 0.5,
    "mae": mean_absolute_error,
    }

# Explicit label -> fitted grid search mapping. This replaces building the
# variable name as a string and passing it to eval().
# `s_lin_tr_E` is whichever linear target-transformed search was run last.
searches = {
    "lin": s_lin_E,
    "lin_tr": s_lin_tr_E,
    "svr": s_svr_E,
    "svr_tr": s_svr_tr_E,
    "rf": s_rf_E,
    "rf_tr": s_rf_tr_E,
    "xgb": s_xgb_E,
    "xgb_tr": s_xgb_tr_E,
    }
models = list(searches)

# One row per model, one column per metric (same order as `metrics`).
E_res = []
for model in models:
    # Predict once per model and reuse the predictions for every metric.
    y_pred = searches[model].best_estimator_.predict(X_test)
    E_res.append([metric(Y_CO2_test, y_pred) for metric in metrics.values()])

In [None]:
# Metrics table: one row per model, one column per metric.
pd.DataFrame(data=E_res, index=models, columns=list(metrics))

<a name="feature-importance"></a>
### Feature Importance

In [None]:
# Fitted random forest taken from the best pipeline of the RF grid search.
forest = s_rf_E.best_estimator_.named_steps["rf"]
importances = forest.feature_importances_

# Spread of the importances across the individual trees (used as error bars).
per_tree_importances = np.array(
    [tree.feature_importances_ for tree in forest.estimators_]
)
std = per_tree_importances.std(axis=0)

In [None]:
# The preprocessing emits the numerical columns first, followed by the PCA
# components of the categorical branch. Number the PCA components from 0
# (the labels previously started at len(num_cols), e.g. "pca 8" for the
# first component).
n_pca_components = len(importances) - len(num_cols)
feature_names = num_cols + [f"pca {i}" for i in range(n_pca_components)]
forest_importances = pd.Series(importances, index=feature_names)

# Bar chart of the impurity-based importances with the per-tree standard
# deviation as error bars.
fig, ax = plt.subplots()
forest_importances.plot.bar(yerr=std, ax=ax)
ax.set_title("Feature importances using MDI")
ax.set_ylabel("Mean decrease in impurity")
fig.tight_layout()