In [None]:
import pandas as pd
import seaborn as sns
import numpy as np
import os

from matplotlib import pyplot as plt
from dotenv import load_dotenv
from sklearn import preprocessing, model_selection, linear_model
from sklearn import metrics, dummy

from scripts.models.regressions import Regressions

# Load variables from a local .env file (if one exists) into the environment.
load_dotenv()

# Apply the matplotlib style sheet first, then the seaborn palette.
# `sns.color_palette(...)` only *returns* a palette, so calling it on its own
# changes nothing; `sns.set_palette(...)` installs it as the default colour
# cycle. It must come after `plt.style.use`, which resets the colour cycle.
plt.style.use('Solarize_Light2')
sns.set_palette('colorblind')

# Machine-specific settings read from the environment (filled in by dotenv
# when a .env file exists): CORES -> pc_cores, DPI -> pc_dpi.

MAX_DPI = 155


def read_int_env(name, default):
    """Return the environment variable `name` parsed as an int, or `default`.

    `default` is returned when the variable is unset (`os.getenv` gives None,
    so `int` raises TypeError) or when its value is not a valid integer, such
    as an empty string (`int` raises ValueError).
    """
    try:
        return int(os.getenv(name))
    except (TypeError, ValueError):
        return default


# Falls back to 4 when CORES is missing or malformed.
pc_cores = read_int_env('CORES', 4)

# Falls back to 100 when DPI is missing or malformed; capped at MAX_DPI.
pc_dpi = min(read_int_env('DPI', 100), MAX_DPI)

## NOTES : cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)


In [None]:
# Relative path of the CSV dataset this notebook reads into `df_model`.
file_ghg_eui = "./data/seattle_predict_ghg_eui.csv"


In [None]:
# Load the prepared dataset and cast every column to float.
df_model = (
    pd.read_csv(file_ghg_eui)
    .astype(float)
)

# Show the column labels of the loaded frame.
df_model.columns


In [None]:
# Preview the first five rows.
df_model.head(5)

In [None]:
# Use the "OSEBuildingID" column as the row index.
# Guarded and rebinding (rather than in place) so that re-running this cell
# does not raise a KeyError once the column has already become the index.
if "OSEBuildingID" in df_model.columns:
    df_model = df_model.set_index("OSEBuildingID")


In [None]:
# Keep only the rows that contain no missing value.
df_model = df_model.dropna(axis=0, how="any")


In [None]:
# Labels of every column whose name starts with "ohe".
ohe_cols = list(filter(lambda name: name.startswith("ohe"), df_model.columns))

# Copy of the modelling frame without those columns.
df_plot_features = df_model.drop(ohe_cols, axis=1)


In [None]:
# Target 1: target_GHGEmissionsIntensity(kgCO2e/ft2).
# Two columns are removed from the frame used to model it: the scaled version
# of this target, and the unscaled SourceEUI target.
droplist_1 = [
    "scaled_GHGEmissionsIntensity(kgCO2e/ft2)",
    "target_SourceEUI(kWh/m2)",
]

df_model_ghg = df_model.drop(droplist_1, axis="columns")


In [None]:
# Build the project regression helper for the GHG emissions-intensity target.
ghg_target = "target_GHGEmissionsIntensity(kgCO2e/ft2)"
ghg_regression = Regressions(
    target_col=ghg_target,
    dataframe=df_model_ghg,
)


In [None]:
# Candidate regularisation strengths: from 0.01 up to (excluding) 7, step 0.05.
alpha_range = np.arange(0.01, 7, 0.05)

# Cross-validated Lasso over that grid, fitted without an intercept term;
# n_jobs=-1 requests all available processors.
lasso_cv = linear_model.LassoCV(alphas=alpha_range, fit_intercept=False, n_jobs=-1)


In [None]:
# Fit the cross-validated Lasso on the training split held by the helper.
lasso_cv.fit(ghg_regression.X_train, ghg_regression.y_train)

# Cross-validation error path recorded during the fit.
mses = lasso_cv.mse_path_
print(mses.shape)

# Collapse each row of the path to its mean (rows are expected to correspond
# to alphas and columns to CV folds).
mse_avg = [np.mean(fold_errors) for fold_errors in mses]


In [None]:
# Plot the mean cross-validated MSE of the Lasso fit against alpha.
fig, ax1 = plt.subplots(
    ncols=1,
    nrows=1,
    figsize=(4, 4),
    dpi=pc_dpi,
)

ax1.plot(lasso_cv.alphas_, mse_avg)

###
# Titles/Labels
ax1.set_title("LassoCV: mean MSE per alpha")
ax1.set_xlabel("alpha")
ax1.set_ylabel("Mean MSE across CV folds")
#
###

plt.tight_layout()
plt.show()


In [None]:
# Ridge regression with built-in cross-validation over a dense alpha grid.

# NOTE(review): n_alphas, alphas_ridge and folds are not referenced by any
# other visible cell; confirm whether they are still needed.
n_alphas = 200
alphas_ridge = np.logspace(start=-5, stop=5, num=n_alphas)
folds = 20

# Grid actually passed to RidgeCV: from 0.01 up to (excluding) 50, step 0.05.
alpha_range = np.arange(0.01, 50, 0.05)

# No intercept term; per-alpha cross-validation values are stored so they can
# be inspected after the fit. `fit` returns the estimator itself.
ridge_cv = linear_model.RidgeCV(
    alphas=alpha_range,
    fit_intercept=False,
    store_cv_values=True,
).fit(X=ghg_regression.X_train, y=ghg_regression.y_train)

# NOTE(review): looks like a leftover progress/debug marker.
print(0)

In [None]:
# Average the stored cross-validation values over samples (axis 0) to get the
# mean squared error for each candidate alpha.
mses_ridge = np.mean(ridge_cv.cv_values_, axis=0)

# With a 2-D target, `cv_values_` has an extra target axis, so the mean is
# (n_targets, n_alphas): keep the first target's row. With a 1-D target the
# mean is already (n_alphas,), and an unconditional [0] would keep only the
# first alpha's score and break the alpha-vs-error plot.
if mses_ridge.ndim > 1:
    mses_ridge = mses_ridge[0]

rmses_ridge = np.sqrt(mses_ridge)

# Predictions of the fitted ridge model on the training features.
predict_train = ridge_cv.predict(X=ghg_regression.X_train)

In [None]:
# Training-set RMSE of the fitted ridge model (square root of the MSE).
np.sqrt(
    metrics.mean_squared_error(
        y_pred=predict_train,
        y_true=ghg_regression.y_train,
    )
)

In [None]:
# Plot the cross-validated RMSE of the ridge fit against alpha.
fig, ax1 = plt.subplots(
    ncols=1,
    nrows=1,
    figsize=(4, 4),
    dpi=pc_dpi,
)

ax1.plot(alpha_range, np.sqrt(mses_ridge))

###
# Titles/Labels
ax1.set_title("RidgeCV: RMSE per alpha")
ax1.set_xlabel("alpha")
ax1.set_ylabel("Cross-validated RMSE")
#
###
plt.tight_layout()
plt.show()


In [None]:
# Display the fitted RidgeCV estimator.
ridge_cv

In [None]:
# ghg_regression.standard_regression()
# sorted(metrics.SCORERS.keys())

In [None]:
# Square the "rsme" entry of the helper's standard-regression metrics.
# NOTE(review): presumably this turns an RMSE back into an MSE, and
# `std_reg_metrics` is presumably filled by `standard_regression()`, which is
# commented out in the previous cell; confirm this works on a fresh kernel.
ghg_regression.std_reg_metrics["rsme"] ** 2

In [None]:
# Run the helper's elastic-net regression for the GHG target.
# NOTE(review): the next cell reads `ghg_regression.clf_enet`, presumably set
# by this call; verify against the `Regressions` class.
ghg_regression.elastic_net_reg()

In [None]:
# Cross-validation error path recorded by the helper's elastic-net estimator.
mses_enet = ghg_regression.clf_enet.mse_path_
print(mses_enet.shape)

# Average over axis 1 twice, leaving one mean MSE per entry of the first axis.
# This was previously computed as a bare expression in the middle of the cell,
# so it was neither stored nor displayed; it is now kept and shown.
mse_enet_avg = np.mean(np.mean(mses_enet, axis=1), axis=1)

mse_enet_avg

In [None]:
# Target 2: target_SourceEUI(kWh/m2).
# Two columns are removed from the frame used to model it: the scaled version
# of this target, and the unscaled GHG emissions-intensity target.
droplist_2 = [
    "scaled_SourceEUI(kWh/m2)",
    "target_GHGEmissionsIntensity(kgCO2e/ft2)",
]
df_model_eui = df_model.drop(droplist_2, axis="columns")

# Build the project regression helper for the SourceEUI target.
eui_target = "target_SourceEUI(kWh/m2)"
eui_regression = Regressions(
    target_col=eui_target,
    dataframe=df_model_eui,
)


In [None]:
# Ask the SourceEUI regression helper to display all of its metrics.
# NOTE(review): which models this fits or reports is defined in `Regressions`
# and is not visible here; confirm.
eui_regression.display_all_metrics()


In [None]:
# Display the helper's `df_predictions` attribute (contents are defined by
# the `Regressions` class; presumably populated by the previous cell).
eui_regression.df_predictions