In [None]:
import pandas as pd
import seaborn as sns
import numpy as np
import os

import plotly.express as px

from matplotlib import pyplot as plt
from dotenv import load_dotenv
from sklearn import preprocessing, model_selection
from sklearn import linear_model


load_dotenv()
# NOTE(review): color_palette() only returns a palette; it does not make it the
# active one (sns.set_palette would) — confirm what was intended here.
sns.color_palette('colorblind')
plt.style.use('Solarize_Light2')

# Default figure DPI: read from the DPI environment variable (populated from
# .env by load_dotenv above) when it is set, otherwise fall back to 100.
# The variable is looked up BEFORE converting: int(None) raises TypeError, which
# previously made the "is None" fallback unreachable.
_dpi_env = os.getenv('DPI')
pc_dpi = int(_dpi_env) if _dpi_env else 100


In [None]:
# Path (relative to the notebook's working directory) of the CSV to model on.
ready_data = "data/seattle_scaled_clean_data.csv"

# Load the whole file into a DataFrame; the index is set in a later cell.
df = pd.read_csv(ready_data)

In [None]:
# Use the building identifier as the row index.
# Guarded and non-in-place so re-running this cell is harmless: once the column
# has become the index, a second set_index call would raise KeyError.
if "OSEBuildingID" in df.columns:
    df = df.set_index("OSEBuildingID")


In [None]:
# Preview the first rows of df to check the load.
df.head()

In [None]:
# Scatter of every building on the two principal-component columns, coloured by
# its ENERGYSTARScore.
fig, ax1 = plt.subplots(
    ncols=1,
    nrows=1,
    figsize=(6, 2),
    dpi=pc_dpi,  # notebook-wide DPI setting instead of a hard-coded 255
)

# Draw on the axes created above explicitly rather than relying on
# matplotlib's implicit "current axes" state.
sns.scatterplot(
    x="Energy_consumption_(PC1)",
    y="Pollution_generated_(PC2)",
    data=df,
    hue="ENERGYSTARScore",
    s=5,
    ax=ax1,
)

###
# Titles/Labels
ax1.set_title("ENERGYSTARScore across the two principal components")
ax1.set_xlabel("Energy consumption (PC1)")
ax1.set_ylabel("Pollution generated (PC2)")
###
fig.tight_layout()
plt.show()


In [None]:
# Hold out 20% of the rows for testing. The fixed random_state makes the split
# identical on every run, so every downstream number and plot is reproducible
# after a kernel restart.
df_train, df_test = model_selection.train_test_split(
    df, test_size=0.20, random_state=42
)


In [None]:
# Summary statistics of ENERGYSTARScore in the training split (compare with the
# same summary for the test split to check the two are similarly distributed).
df_train["ENERGYSTARScore"].describe()


In [None]:
# Summary statistics of ENERGYSTARScore in the test split (compare with the
# same summary for the training split).
df_test["ENERGYSTARScore"].describe()


In [None]:
# 8-fold splitter applied to the full frame; for every fold, report how many
# row positions land in the train part and how many in the test part.
kf = model_selection.KFold(n_splits=8)

for train, test in kf.split(df):
    print(f"{len(train)} {len(test)}")


In [None]:
df_train

# Modelling plan: X = the two principal-component columns, y = ENERGYSTARScore.
# NOTE(review): the original note read "X = >> PC1 -> GHG_E, Y = E*" — confirm
# the intended feature set.

In [None]:
# Feature matrix: the two principal-component columns of the training split.
feature_columns = ["Energy_consumption_(PC1)", "Pollution_generated_(PC2)"]
X_train = df_train.loc[:, feature_columns].to_numpy()

# Target kept as a one-column 2-D array (list selector, not a bare column name).
target_columns = ["ENERGYSTARScore"]
y_train = df_train.loc[:, target_columns].to_numpy()


In [None]:
# Feature matrix: the two principal-component columns of the test split.
feature_columns = ["Energy_consumption_(PC1)", "Pollution_generated_(PC2)"]
X_test = df_test.loc[:, feature_columns].to_numpy()

# Target kept as a one-column 2-D array (list selector, not a bare column name).
target_columns = ["ENERGYSTARScore"]
y_test = df_test.loc[:, target_columns].to_numpy()


In [None]:
# Plain linear regression baseline: fit on the training arrays, then predict
# the held-out test features.
linear_reg = linear_model.LinearRegression()
linear_reg.fit(X_train, y_train)
predictions = linear_reg.predict(X_test)

# Both arrays are indexed row-by-row with [0] downstream, i.e. they are
# column-shaped; take the first column of each as a flat list.
actual = list(y_test[:, 0])
one_d_prediction = list(predictions[:, 0])

# Side-by-side table of predicted vs. actual values.
dict_res = dict(predictions=one_d_prediction, actual=actual)
df_res_linreg = pd.DataFrame(dict_res)

df_res_linreg

In [None]:
# Predicted vs. actual values on the test split for the linear regression.
fig, ax1 = plt.subplots(
    ncols=1,
    nrows=1,
    figsize=(6, 4),
    dpi=pc_dpi,
)

# x = model output, y = ground truth from the test split.
ax1.scatter(predictions, y_test)

###
# Titles/Labels
ax1.set_title("Linear regression: predicted vs. actual ENERGYSTARScore")
ax1.set_xlabel("Predicted ENERGYSTARScore")
ax1.set_ylabel("Actual ENERGYSTARScore")
###

plt.show()


In [None]:
# Fitted coefficients of the linear model, one per feature column
# (PC1 then PC2, the order used to build X_train).
linear_reg.coef_