# First prediction notebook

Prediction of the site energy use (`SiteEnergyUseWN(kBtu)`) and of the CO2 emissions (`TotalGHGEmissions`).

* [Imports](#imports)
* [Data loading](#data-loading)
* [Feature selection](#feature-selection)
    * [Selection v1](#selection-pipeline-v1)
* [Prediction](#prediction)
    * [Linear prediction v1](#linear-prediction-v1)
    * [SVR prediction v1](#svr-prediction-v1)
    * [Ensemble prediction v1](#ensemble-prediction-v1)
* [Evaluation](#evaluation)
    * [Evaluation v1](#evaluation-v1)


<a name="imports"></a>
## Imports

In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import SelectPercentile, chi2
from sklearn.preprocessing import StandardScaler, OneHotEncoder

from sklearn.decomposition import PCA
from sklearn.linear_model import Ridge
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor

from sklearn.metrics import r2_score, d2_absolute_error_score
from sklearn.metrics import explained_variance_score
from sklearn.metrics import mean_squared_error, mean_absolute_error

<a name="data-loading"></a>
## Data loading

In [None]:
# Local file name of the cleaned dataset (v1) CSV.
cleaned_dataset_v1_path = 'data_cleaned_v1.csv'
# Download the file only when it is missing, so re-running the cell does not fetch it again.
if not os.path.exists(cleaned_dataset_v1_path):
    !wget "https://drive.google.com/uc?export=download&id=1EYQuRaqc4yo0QYl35DRoIC6-IDwaX7Wq" -q --show-progress -O "$cleaned_dataset_v1_path"
# Show the first two lines of the file (presumably the header and one record) as a sanity check.
!head -2 $cleaned_dataset_v1_path

In [None]:
# Read the cleaned CSV into a DataFrame and print its column / dtype / non-null summary.
df_v1 = pd.read_csv(filepath_or_buffer=cleaned_dataset_v1_path)
df_v1.info()

<a name="feature-selection"></a>
## Feature selection

<a name="selection-pipeline-v1"></a>
### Selection v1

In [None]:
# Features handled as categorical by the preprocessing step.
cat_cols = [
    "PrimaryPropertyType", "YearBuilt", "NumberofBuildings",
    "NumberofFloors", "ENERGYSTARScore_isna",
]
# Features handled as numeric by the preprocessing step.
num_cols = [
    "Latitude", "Longitude",
    "PropertyGFAParking", "PropertyGFABuilding(s)",
]
# Regression targets, in the order (energy use, emissions).
pred_cols = ["SiteEnergyUseWN(kBtu)", "TotalGHGEmissions"]

# Working frame restricted to the selected features followed by the targets.
cols = [*cat_cols, *num_cols, *pred_cols]
df_pred_v1 = df_v1[cols]

In [None]:
# Split the working frame into the feature matrix and the two regression targets.
# Columns are selected by name rather than by their position in `cols`, so that
# adding or reordering targets cannot silently leak a target into the features.
X = df_pred_v1[cat_cols + num_cols]
Y_E = df_pred_v1["SiteEnergyUseWN(kBtu)"]
Y_CO2 = df_pred_v1["TotalGHGEmissions"]

In [None]:
# Hold out 10% of the rows as a test set; the fixed seed keeps the split reproducible.
(
    X_train, X_test,
    Y_E_train, Y_E_test,
    Y_CO2_train, Y_CO2_test,
) = train_test_split(X, Y_E, Y_CO2, random_state=6, test_size=0.1)
print(f"[INFO] X_test shape: {X_test.shape}")
print(f"[INFO] X_train shape: {X_train.shape}")

In [None]:
# Numeric features: standardise each column.
num_tr = Pipeline(steps=[("scaler", StandardScaler())])

# Categorical features: dense one-hot encoding; categories not seen during
# fit are ignored instead of raising an error at transform time.
cat_tr = Pipeline(steps=[
    ("encoder", OneHotEncoder(sparse_output=False, handle_unknown="ignore")),
])

# Route each column group to its own transformer.
prep_v1 = ColumnTransformer(transformers=[
    ("num", num_tr, num_cols),
    ("cat", cat_tr, cat_cols),
])

<a name="prediction"></a>
## Prediction

<a name="linear-prediction-v1"></a>
### Linear prediction v1

In [None]:
# Linear model: shared preprocessing followed by a Ridge regressor.
lin_pipe_v1 = Pipeline(steps=[("prep", prep_v1), ("lr_ri", Ridge())])
lin_pipe_v1

In [None]:
# Ridge regularisation strengths to try: 10 log-spaced values from 1e-4 to 1e4.
param_grid = dict(lr_ri__alpha=np.logspace(-4, 4, 10))

# Cross-validated search of the linear pipeline against the Y_E target.
s_lin_E = GridSearchCV(estimator=lin_pipe_v1, param_grid=param_grid).fit(X_train, Y_E_train)

print("Best parameter (CV score=%0.3f):" % s_lin_E.best_score_)
print(s_lin_E.best_params_)

In [None]:
# Same search as for the energy target, this time fitted against the Y_CO2 target.
s_lin_CO2 = GridSearchCV(estimator=lin_pipe_v1, param_grid=param_grid).fit(X_train, Y_CO2_train)

print("Best parameter (CV score=%0.3f):" % s_lin_CO2.best_score_)
print(s_lin_CO2.best_params_)

<a name="svr-prediction-v1"></a>
### SVR prediction v1

In [None]:
# SVR model: shared preprocessing, PCA projection, then a support vector regressor.
svr_pipe_v1 = Pipeline(steps=[("prep", prep_v1), ("pca", PCA()), ("svr", SVR())])
svr_pipe_v1

In [None]:
# Search space: number of PCA components kept, plus SVR's C and epsilon
# (three log-spaced values each).
param_grid = dict(
    pca__n_components=[3, 5, 7, 11, 17, 33, 50],
    svr__C=np.logspace(-3, 3, 3),
    svr__epsilon=np.logspace(-4, 1, 3),
)

# Cross-validated search of the SVR pipeline against the Y_E target.
s_svr_E = GridSearchCV(estimator=svr_pipe_v1, param_grid=param_grid).fit(X_train, Y_E_train)

print("Best parameter (CV score=%0.3f):" % s_svr_E.best_score_)
print(s_svr_E.best_params_)

In [None]:
# Same SVR search, this time fitted against the Y_CO2 target.
s_svr_CO2 = GridSearchCV(estimator=svr_pipe_v1, param_grid=param_grid).fit(X_train, Y_CO2_train)

print("Best parameter (CV score=%0.3f):" % s_svr_CO2.best_score_)
print(s_svr_CO2.best_params_)

In [None]:
# Plot the PCA spectrum (top) and the best cross-validated score per
# n_components for the CO2 SVR search (bottom).

# Re-use the preprocessor that GridSearchCV already refitted on X_train and fit
# a fresh full-rank PCA on its output. This avoids fitting (mutating) the shared
# `prep_v1` / `svr_pipe_v1` template objects that the other pipelines build on.
best_svr_CO2 = s_svr_CO2.best_estimator_
pca = PCA()
pca.fit(best_svr_CO2.named_steps["prep"].transform(X_train))

fig, (ax0, ax1) = plt.subplots(nrows=2, sharex=True, figsize=(6, 6))
ax0.plot(
    np.arange(1, pca.n_components_ + 1), pca.explained_variance_ratio_, "+", linewidth=2
)
ax0.set_ylabel("PCA explained variance ratio")

ax0.axvline(
    best_svr_CO2.named_steps["pca"].n_components,
    linestyle=":",
    label="n_components chosen",
)
ax0.legend(prop=dict(size=12))

# For each number of components, keep the best-scoring parameter combination.
# Sorting + drop_duplicates keeps `components_col` as a regular column on every
# pandas version (groupby.apply on the grouping column is deprecated), and the
# stable sort keeps the first candidate on ties.
results = pd.DataFrame(s_svr_CO2.cv_results_)
components_col = "param_pca__n_components"
best_clfs = (
    results.dropna(subset=["mean_test_score"])  # skip candidates whose fit failed
    .sort_values("mean_test_score", ascending=False, kind="stable")
    .drop_duplicates(subset=components_col)
    # cv_results_ may store parameters as objects; a numeric column is needed
    # so the points line up with the shared numeric x-axis of the top panel.
    .astype({components_col: int})
    .sort_values(components_col)
)

best_clfs.plot(
    x=components_col, y="mean_test_score", yerr="std_test_score", legend=False, ax=ax1
)
# No `scoring` was passed to GridSearchCV, so the score is the regressor's
# default one (R2), not an accuracy.
ax1.set_ylabel("R2 score (val)")
ax1.set_xlabel("n_components")

plt.xlim(-1, 70)

plt.tight_layout()
plt.show()

<a name="ensemble-prediction-v1"></a>
### Ensemble prediction v1

In [None]:
# Random-forest model: shared preprocessing followed by a RandomForestRegressor.
# The fixed random_state makes the forest, and therefore the grid-search scores
# and the final metrics, reproducible across kernel restarts (same seed as the
# train/test split).
rf_pipe_v1 = Pipeline([
    ('prep', prep_v1),
    ('rf', RandomForestRegressor(random_state=6)),
    ])
rf_pipe_v1

In [None]:
# Search space: number of trees and maximum tree depth of the forest.
param_grid = dict(
    rf__n_estimators=[10, 20, 40, 70],
    rf__max_depth=[10, 20, 40, 70],
)

# Cross-validated search of the random-forest pipeline against the Y_E target.
s_rf_E = GridSearchCV(estimator=rf_pipe_v1, param_grid=param_grid).fit(X_train, Y_E_train)

print("Best parameter (CV score=%0.3f):" % s_rf_E.best_score_)
print(s_rf_E.best_params_)

In [None]:
# Same random-forest search, this time fitted against the Y_CO2 target.
s_rf_CO2 = GridSearchCV(estimator=rf_pipe_v1, param_grid=param_grid).fit(X_train, Y_CO2_train)

print("Best parameter (CV score=%0.3f):" % s_rf_CO2.best_score_)
print(s_rf_CO2.best_params_)

<a name="evaluation"></a>
## Evaluation

<a name="evaluation-v1"></a>
### Evaluation v1

In [None]:
def _rmse(*args, **kwargs):
    """Root mean squared error: the square root of `mean_squared_error`."""
    return mean_squared_error(*args, **kwargs) ** 0.5


# Metric name -> callable(y_true, y_pred); the order defines the result columns.
metrics = {
    "r2": r2_score,
    "d2": d2_absolute_error_score,
    "exp_va": explained_variance_score,
    "mse": mean_squared_error,
    "rmse": _rmse,
    "mae": mean_absolute_error
    }
models = ["lin", "svr", "rf"]

# Fitted searches per model name, as (Y_E search, Y_CO2 search).
# An explicit mapping is used instead of building variable names for eval():
# a typo fails immediately and the dependencies of this cell are visible.
searches = {
    "lin": (s_lin_E, s_lin_CO2),
    "svr": (s_svr_E, s_svr_CO2),
    "rf": (s_rf_E, s_rf_CO2),
}


def _score_on_test(search, y_true):
    """Return one value per entry of `metrics` for `search`'s best estimator on X_test."""
    # Predict once and reuse the result for every metric.
    y_pred = search.best_estimator_.predict(X_test)
    return [metric(y_true, y_pred) for metric in metrics.values()]


# One row per model (in `models` order), one column per metric.
E_res = [_score_on_test(searches[model][0], Y_E_test) for model in models]
CO2_res = [_score_on_test(searches[model][1], Y_CO2_test) for model in models]

In [None]:
# Test-set metrics for the Y_E target: one row per model, one column per metric.
pd.DataFrame(data=E_res, index=models, columns=list(metrics))

In [None]:
# Test-set metrics for the Y_CO2 target: one row per model, one column per metric.
pd.DataFrame(data=CO2_res, index=models, columns=list(metrics))