In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [2]:
def regression_metrics(y_pred, y_test):
    """Function which contains differents metrics about regression
    Input: prediction, test/target
    
    Output: MAE, MSE, RMSE & R² score  
    """
    mae=mean_absolute_error(y_pred, y_test)
    mse=mean_squared_error(y_pred, y_test)
    rmse=mse ** (1/2)
    r_score = r2_score(y_pred, y_test)
    print("MAE :",mae.round(3))
    print("MSE :", mse.round(3))
    print("RMSE :", rmse.round(3))
    print("R² :", r_score.round(3))

In [3]:
test_set =  pd.read_csv("datasets/test_set.csv")
train_set = pd.read_csv("datasets/train_set.csv")

In [4]:
train_set.head(3)

Unnamed: 0,BuildingType,PrimaryPropertyType,Neighborhood,YearBuilt,NumberofBuildings,NumberofFloors,PropertyGFATotal,PropertyGFABuilding(s),ENERGYSTARScore,SourceEUIWN(kBtu/sf),...,NoBuildings,NoFloors,SteamUse(kBtu/sf),Electricity(kBtu/sf),NaturalGas(kBtu/sf),OtherFuelUse(kBtu/sf),%SteamUse,%Electricity,%NaturalGas,%OtherFuelUse
0,NonResidential,Other,Downtown,1930,1.0,2.0,67224,67224,,64.9,...,,2-5,0.0,20.66,0.0,0.0,0.0,100.0,0.0,0.0
1,NonResidential,Mixed Use Property,Northwest,1996,1.0,3.0,55912,33062,,232.699997,...,,2-5,0.0,74.12,0.0,,0.0,100.0,0.0,
2,SPS-District K-12,Educational Building,Magnolia / queen anne,1989,1.0,2.0,51582,51582,94.0,95.8,...,,2-5,0.0,28.55,0.0,0.0,0.0,100.0,0.0,0.0


In [5]:
train_set.isnull().mean() * 100

BuildingType               0.000000
PrimaryPropertyType        0.000000
Neighborhood               0.000000
YearBuilt                  0.000000
NumberofBuildings          0.000000
NumberofFloors             0.311526
PropertyGFATotal           0.000000
PropertyGFABuilding(s)     0.000000
ENERGYSTARScore           32.398754
SourceEUIWN(kBtu/sf)       0.000000
SiteEnergyUse(kBtu)        0.000000
SteamUse(kBtu)             0.000000
Electricity(kBtu)          0.000000
NaturalGas(kBtu)           0.000000
OtherFuelUse(kBtu)        49.532710
TotalGHGEmissions          0.000000
GHGEmissionsIntensity      0.000000
DecadeBuilt                0.000000
NoBuildings               97.196262
NoFloors                   0.934579
SteamUse(kBtu/sf)          0.000000
Electricity(kBtu/sf)       0.000000
NaturalGas(kBtu/sf)        0.000000
OtherFuelUse(kBtu/sf)     49.532710
%SteamUse                  0.000000
%Electricity               0.000000
%NaturalGas                0.000000
%OtherFuelUse             49

In [6]:
cat_var = ["PrimaryPropertyType","BuildingType", "Neighborhood", "DecadeBuilt"]

num_var = ["YearBuilt", "PropertyGFATotal", "NumberofBuildings", "NumberofFloors",
           "ENERGYSTARScore", "SourceEUIWN(kBtu/sf)", "SteamUse(kBtu)",
           "Electricity(kBtu)", "NaturalGas(kBtu)", 
           "OtherFuelUse(kBtu)","TotalGHGEmissions", "GHGEmissionsIntensity"]

In [7]:
from sklearn.impute import SimpleImputer

imputer_num = SimpleImputer(strategy="median")
imputer_cat = SimpleImputer(strategy="most_frequent")
imputer_num.fit(train_set[num_var])
imputer_cat.fit(train_set[cat_var])

SimpleImputer(strategy='most_frequent')

In [8]:
train_set[num_var] = imputer_num.transform(train_set[num_var])
train_set[cat_var] = imputer_cat.transform(train_set[cat_var])

In [9]:
train_set.isnull().mean() * 100

BuildingType               0.000000
PrimaryPropertyType        0.000000
Neighborhood               0.000000
YearBuilt                  0.000000
NumberofBuildings          0.000000
NumberofFloors             0.000000
PropertyGFATotal           0.000000
PropertyGFABuilding(s)     0.000000
ENERGYSTARScore            0.000000
SourceEUIWN(kBtu/sf)       0.000000
SiteEnergyUse(kBtu)        0.000000
SteamUse(kBtu)             0.000000
Electricity(kBtu)          0.000000
NaturalGas(kBtu)           0.000000
OtherFuelUse(kBtu)         0.000000
TotalGHGEmissions          0.000000
GHGEmissionsIntensity      0.000000
DecadeBuilt                0.000000
NoBuildings               97.196262
NoFloors                   0.934579
SteamUse(kBtu/sf)          0.000000
Electricity(kBtu/sf)       0.000000
NaturalGas(kBtu/sf)        0.000000
OtherFuelUse(kBtu/sf)     49.532710
%SteamUse                  0.000000
%Electricity               0.000000
%NaturalGas                0.000000
%OtherFuelUse             49

In [10]:
y_train = train_set[["ENERGYSTARScore"]].values
X_train = train_set.drop("ENERGYSTARScore", axis=1)

y_train = y_train.reshape(-1, 1)

In [11]:
from sklearn.preprocessing import OneHotEncoder

enc = OneHotEncoder()
enc.fit(X_train[cat_var])
X_train = enc.transform(X_train[cat_var])

In [12]:
from sklearn.linear_model import LinearRegression

linreg = LinearRegression()
linreg.fit(X_train, y_train)

LinearRegression()

In [13]:
test_set[num_var] = imputer_num.transform(test_set[num_var])
test_set[cat_var] = imputer_cat.transform(test_set[cat_var])

In [14]:
y_test = test_set[["ENERGYSTARScore"]].values
X_test = test_set.drop("ENERGYSTARScore", axis=1)

y_test = y_test.reshape(-1, 1)

In [15]:
X_test = enc.transform(X_test[cat_var])

In [16]:
y_pred = linreg.predict(X_test)

In [17]:
regression_metrics(y_pred, y_test)

MAE : 17.008
MSE : 513.933
RMSE : 22.67
R² : -3.035
