In [35]:
import pandas as pd
import seaborn as sns
import numpy as np
import os
import json

from matplotlib import pyplot as plt
from dotenv import load_dotenv
from sklearn import preprocessing, model_selection, neighbors
from sklearn import metrics, dummy, svm

from models.linear_regressions import Linear_reg

# Load variables from a local .env file into the process environment so the
# os.getenv() lookups further down can see them.
load_dotenv()

# Apply the matplotlib style sheet first: it sets its own colour cycle, so a
# palette installed before it would be overwritten.
plt.style.use('Solarize_Light2')

# sns.color_palette() only *returns* a palette and has no effect when its
# result is discarded; sns.set_palette() is what installs the palette as the
# default colour cycle for the plots that follow.
sns.set_palette('colorblind')

# Machine-specific settings read from the environment (populated by dotenv):
# CORES falls back to 4 and DPI falls back to 100 when missing or invalid.


def read_int_env(name, default):
    """Return the environment variable ``name`` parsed as an int.

    Args:
        name: Name of the environment variable to read.
        default: Value returned when the variable cannot be used.

    Returns:
        The parsed integer, or ``default`` when the variable is unset
        (``int(None)`` raises ``TypeError``) or does not hold a valid integer
        literal, e.g. an empty string (``int("")`` raises ``ValueError``).
    """
    try:
        return int(os.getenv(name))
    except (TypeError, ValueError):
        return default


MAX_DPI = 155  # upper bound applied to the DPI setting

pc_cores = read_int_env('CORES', default=4)

# Clamp the DPI so an oversized value in the environment is capped at MAX_DPI.
pc_dpi = min(read_int_env('DPI', default=100), MAX_DPI)

## NOTES : cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)


In [4]:
# Relative path of the CSV that is read into `df_model` by the following cell.
# NOTE(review): the file name suggests standard-scaled Seattle data - confirm.
general_file = "./data/seattle_std_scaled.csv"  # Used as backup


In [5]:
# Read the CSV and index the rows by building id in a single chained
# expression (no in-place mutation of the freshly loaded frame).
df_model = pd.read_csv(general_file).set_index("OSEBuildingID")


In [6]:
# Discard every row that contains at least one missing value.
df_model = df_model.dropna()


In [7]:
# Display the column labels of the loaded frame (cell output only, no state change).
df_model.columns


Index(['scaled_Electricity(kWh)', 'scaled_GHGEmissionsIntensity(kgCO2e/ft2)',
       'scaled_PropertyArea(SquareMetre)Total', 'scaled_SourceEUI(kWh/m2)',
       'scaled_NaturalGas(kWh)', 'BuildingAge', 'NumberofBuildings',
       'NumberofFloors', 'ENERGYSTARScore', 'Building_proportion',
       'Parking_proportion', 'EnergyStarCert', 'target_SourceEUI(kWh/m2)',
       'target_GHGEmissionsIntensity(kgCO2e/ft2)', 'ohe_Ptype_Hospital',
       'ohe_Ptype_Hotel', 'ohe_Ptype_K-12 School', 'ohe_Ptype_Large Office',
       'ohe_Ptype_Mixed Use Property', 'ohe_Ptype_Office', 'ohe_Ptype_Other',
       'ohe_Ptype_Refrigerated Warehouse', 'ohe_Ptype_Residence Hall',
       'ohe_Ptype_Small & Medium medical facility',
       'ohe_Ptype_Small & Mid-Sized Office',
       'ohe_Ptype_Supermarket & Grocery store', 'ohe_Ptype_Warehouse',
       'ohe_Ptype_Worship Facility', 'ohe_Nbhood_BALLARD',
       'ohe_Nbhood_CENTRAL', 'ohe_Nbhood_DELRIDGE', 'ohe_Nbhood_DOWNTOWN',
       'ohe_Nbhood_EAST', 'ohe_Nbh

In [8]:
# Preview the first rows of the frame (cell output only, no state change).
df_model.head()

Unnamed: 0_level_0,scaled_Electricity(kWh),scaled_GHGEmissionsIntensity(kgCO2e/ft2),scaled_PropertyArea(SquareMetre)Total,scaled_SourceEUI(kWh/m2),scaled_NaturalGas(kWh),BuildingAge,NumberofBuildings,NumberofFloors,ENERGYSTARScore,Building_proportion,...,ohe_Nbhood_DOWNTOWN,ohe_Nbhood_EAST,ohe_Nbhood_GREATER DUWAMISH,ohe_Nbhood_LAKE UNION,ohe_Nbhood_MAGNOLIA / QUEEN ANNE,ohe_Nbhood_NORTH,ohe_Nbhood_NORTHEAST,ohe_Nbhood_NORTHWEST,ohe_Nbhood_SOUTHEAST,ohe_Nbhood_SOUTHWEST
OSEBuildingID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.963847,1.770173,0.389139,1.012887,0.500705,95.0,1,12,60.0,1.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.615322,1.799564,0.659821,0.913481,4.178122,26.0,1,11,61.0,0.855,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.380423,3.57283,-0.095878,1.53632,1.009034,96.0,1,10,56.0,1.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10,0.401384,1.074583,0.292078,0.455283,2.089235,96.0,1,11,27.0,1.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
17,-0.326328,3.945117,-0.029889,0.564008,2.706276,94.0,1,9,48.0,1.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [11]:
# Name of the column handed to Linear_reg as the prediction target.
ghg_target = "target_GHGEmissionsIntensity(kgCO2e/ft2)"

# Columns removed from the frame before it is handed to the model helper.
droplist = [
    "scaled_GHGEmissionsIntensity(kgCO2e/ft2)",  # scaled version of the target
    "target_SourceEUI(kWh/m2)",  # not to scale
    "EnergyStarCert",
]

# NOTE: this cell rebinds `df_model`, so it is not re-runnable as is: a second
# run on the already-trimmed frame raises KeyError for the missing columns.
df_model = df_model.drop(labels=droplist, axis="columns")


In [13]:
# Instantiate the project-local Linear_reg helper with the trimmed frame and
# the GHG intensity target column. Later cells override its split and read
# X_train / X_test / y_train / y_test from it.
# NOTE(review): what the constructor does internally is not visible here.
ghg_linear = Linear_reg(dataframe=df_model, target=ghg_target)


In [15]:
# Read the stored train/test split; each entry is a unique OSE building id.

with open("./data/splits_ghg.json", "r") as json_file:
    splits = json.loads(json_file.read())

ids_train, ids_test = splits["train"], splits["test"]


In [16]:
# Replace the helper's split with the stored one: rows are selected by
# membership of their index (building id) in each id list.

train_mask = df_model.index.isin(ids_train)
test_mask = df_model.index.isin(ids_test)

df_train_override = df_model.loc[train_mask]
df_test_override = df_model.loc[test_mask]

ghg_linear.force_split(df_train_ovr=df_train_override, df_test_ovr=df_test_override)


In [71]:
# Feature matrices and target vectors taken from the helper after the forced split.
train_matrix = ghg_linear.X_train
test_matrix = ghg_linear.X_test
y_train = ghg_linear.y_train
y_test = ghg_linear.y_test

# Candidate neighbour counts. `param_grid` is the grid the search below uses;
# `param_grid_knnr` is defined here but not passed to it.
param_grid = {'n_neighbors': list(range(5, 16, 2))}        # 5, 7, ..., 15
param_grid_knnr = {'n_neighbors': list(range(13, 20, 2))}  # 13, 15, 17, 19

# Cross-validation splitters. Only leave-one-out is used by the search below.
l1out = model_selection.LeaveOneOut()
rkf = model_selection.RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)

# Scorer name: negated MSE, so that "greater is better" holds for the search.
score = "neg_mean_squared_error"

# KNN regressor wrapped in an exhaustive grid search.
knnr = neighbors.KNeighborsRegressor()
knn_reg = model_selection.GridSearchCV(knnr, param_grid, scoring=score, cv=l1out)


In [72]:
# Run the grid search on the training data; the fitted search object is the
# cell's displayed value.
knn_reg.fit(train_matrix, y_train)


GridSearchCV(cv=LeaveOneOut(), estimator=KNeighborsRegressor(),
             param_grid={'n_neighbors': [5, 7, 9, 11, 13, 15]},
             scoring='neg_mean_squared_error')

In [73]:
# Display the estimator selected by the grid search (cell output only).
knn_reg.best_estimator_


KNeighborsRegressor(n_neighbors=15)

In [74]:
# Cross-validated RMSE: best_score_ is the mean negated MSE of the selected
# candidate, so its magnitude is taken before the square root. Despite its
# name, this is a cross-validation score, not an in-sample one.
cv_mse = abs(knn_reg.best_score_)
rmse_train = np.sqrt(cv_mse)

# R2 of the selected model on the very rows it was fitted on (in-sample).
predict_train = knn_reg.predict(train_matrix)
r2_train = metrics.r2_score(y_train, predict_train)


In [75]:
# Display the R2 computed from predictions on the training matrix.
r2_train

0.2444019611499113

In [76]:
# Display the RMSE derived from the grid search's best cross-validation score.
rmse_train


0.9170048623795171