In [None]:
import pandas as pd
import seaborn as sns
import numpy as np
import os

from matplotlib import pyplot as plt
from dotenv import load_dotenv
from sklearn import preprocessing, model_selection, linear_model, metrics

# Load variables from a local .env file (if present) into the environment.
load_dotenv()

# Apply the matplotlib style sheet first, then install the colour-blind-safe
# palette as the default colour cycle. sns.color_palette() only *returns* a
# palette (its result was previously discarded), whereas sns.set_palette()
# actually sets the cycle; doing it after the style sheet keeps the style from
# resetting it.
plt.style.use('Solarize_Light2')
sns.set_palette('colorblind')

# Figure DPI: read from the environment (populated by dotenv), falling back to
# a default when the variable is missing or unusable, and capped at a maximum.
# NOTE(review): the variable read is 'DI'; if the .env file defines 'DPI'
# instead, this always falls back to the default -- confirm the intended name.


def resolve_dpi(raw_value, default=100, maximum=155):
    """Turn a raw environment value into a usable figure DPI.

    Parameters
    ----------
    raw_value : str or None
        Value returned by ``os.getenv``; ``None`` when the variable is unset.
    default : int
        DPI used when ``raw_value`` is missing, not an integer, or not positive.
    maximum : int
        Upper bound applied to the parsed DPI.

    Returns
    -------
    int
        The parsed DPI clamped to ``maximum``, or ``default``.
    """
    try:
        dpi = int(raw_value)
    except (TypeError, ValueError):
        # TypeError: variable unset (None); ValueError: non-numeric text.
        return default

    if dpi <= 0:
        # A non-positive DPI cannot be used to build a figure.
        return default

    return min(dpi, maximum)


pc_dpi = resolve_dpi(os.getenv('DI'))


In [None]:
# Path (relative to the notebook's working directory) of the CSV read into df_model.
file_ghg_eui = "./data/seattle_predict_ghg_eui.csv"


In [None]:
# Read the modelling dataset and cast every column to float
# (the cast raises if any column holds non-numeric values).
df_model = pd.read_csv(filepath_or_buffer=file_ghg_eui).astype(dtype=float)

# Display the column labels as the cell output.
df_model.columns


In [None]:
# Preview the first five rows (head() defaults to 5).
df_model.head()

In [None]:
# Use the building identifier as the row index so it is not part of the
# column set used for modelling. The guard plus rebinding (instead of
# inplace=True) makes the cell safe to re-run: once the column has been moved
# into the index, a second set_index call would raise KeyError.
if df_model.index.name != "OSEBuildingID":
    df_model = df_model.set_index("OSEBuildingID")


In [None]:
# Discard every row that contains at least one missing value.
df_model = df_model.dropna()


In [None]:
# Fixed seed so the 70/30 split -- and every metric computed from it -- is
# reproducible across kernel restarts instead of changing on each run.
SPLIT_SEED = 42

df_train, df_test = model_selection.train_test_split(
    df_model,
    test_size=0.30,
    random_state=SPLIT_SEED,
)


In [None]:
# Target 1: GHG emissions intensity (kgCO2e/ft2), taken from its "scaled_" column.
# NOTE(review): every other column is used as a feature; if the file also
# contains a second target column, it ends up in X -- confirm this is intended.
target_col = "scaled_GHGEmissionsIntensity(kgCO2e/ft2)"

# Targets: selected with a list of columns, so the arrays are 2-D (n_samples, 1).
y_train = df_train[[target_col]].to_numpy()
y_test = df_test[[target_col]].to_numpy()

# Features: all remaining columns, as NumPy arrays.
X_train = df_train.drop(columns=target_col).to_numpy()
X_test = df_test.drop(columns=target_col).to_numpy()


In [None]:
# Linear-regression baseline: fit on the training split (fit() returns the
# estimator itself), then predict the held-out test split.
lin_reg = linear_model.LinearRegression().fit(X_train, y_train)

y_pred = lin_reg.predict(X_test)


In [None]:
# Scatter of predicted vs. true target values on the test split.
fig, ax1 = plt.subplots(figsize=(8, 6), dpi=pc_dpi)

ax1.scatter(x=y_test, y=y_pred)

# Axis labels
ax1.set(xlabel="True", ylabel="Prediction")

plt.show()


In [None]:
# Score the predictions on the test split.
# Note: `rss` holds the mean squared error returned by scikit-learn (the
# existing name is kept); its square root is the RMSE.
rss = metrics.mean_squared_error(y_test, y_pred)
rmse = np.sqrt(rss)
r_two = metrics.r2_score(y_test, y_pred)

print(f"RMSE = {rmse}")
print(f"R2 Score = {r_two}")

# Straight-line regression is inconclusive --> try polynomial regression


In [None]:
# Preview the first five rows of the training split.
df_train.head(n=5)