In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [2]:
def regression_metrics(y_pred, y_test):
    """Function which contains differents metrics about regression
    Input: prediction, test/target
    
    Output: MAE, MSE, RMSE & R² score  
    """
    mae=mean_absolute_error(y_pred, y_test)
    mse=mean_squared_error(y_pred, y_test)
    rmse=mse ** (1/2)
    r_score = r2_score(y_pred, y_test)
    print("MAE :",mae.round(3))
    print("MSE :", mse.round(3))
    print("RMSE :", rmse.round(3))
    print("R² :", r_score.round(3))

In [3]:
test_set =  pd.read_csv("datasets/test_set.csv")
train_set = pd.read_csv("datasets/train_set.csv")

In [4]:
train_set.head(3)

Unnamed: 0,PrimaryPropertyType,Neighborhood,YearBuilt,NumberofBuildings,NumberofFloors,PropertyGFABuilding(s),ENERGYSTARScore,TotalGHGEmissions,%SteamUse,%Electricity,%NaturalGas,%OtherFuelUse
0,Office,Lake union,1986,1.0,4.0,35526,,17.57,0.0,100.0,0.0,
1,Office,Delridge,1988,1.0,2.0,30659,9.0,76.48,0.0,68.81,31.19,0.0
2,Other,Downtown,1991,1.0,2.0,26579,,298.91,0.0,35.75,64.25,


In [5]:
train_set.isnull().mean() * 100

PrimaryPropertyType        0.000000
Neighborhood               0.000000
YearBuilt                  0.000000
NumberofBuildings          0.000000
NumberofFloors             0.155280
PropertyGFABuilding(s)     0.000000
ENERGYSTARScore           31.987578
TotalGHGEmissions          0.000000
%SteamUse                  0.000000
%Electricity               0.000000
%NaturalGas                0.000000
%OtherFuelUse             52.639752
dtype: float64

In [6]:
cat_var = ["PrimaryPropertyType", "Neighborhood"]

num_var = ["YearBuilt", "NumberofBuildings", "NumberofFloors", "PropertyGFABuilding(s)",
           "ENERGYSTARScore", "%SteamUse", "%Electricity", "%NaturalGas",
           "%OtherFuelUse", "TotalGHGEmissions"]

In [7]:
from sklearn.impute import SimpleImputer

imputer_num = SimpleImputer(strategy="median")
imputer_cat = SimpleImputer(strategy="most_frequent")
imputer_num.fit(train_set[num_var])
imputer_cat.fit(train_set[cat_var])

SimpleImputer(strategy='most_frequent')

In [8]:
train_set[num_var] = imputer_num.transform(train_set[num_var])
train_set[cat_var] = imputer_cat.transform(train_set[cat_var])

In [9]:
train_set.isnull().mean() * 100

PrimaryPropertyType       0.0
Neighborhood              0.0
YearBuilt                 0.0
NumberofBuildings         0.0
NumberofFloors            0.0
PropertyGFABuilding(s)    0.0
ENERGYSTARScore           0.0
TotalGHGEmissions         0.0
%SteamUse                 0.0
%Electricity              0.0
%NaturalGas               0.0
%OtherFuelUse             0.0
dtype: float64

In [10]:
y_train = train_set[["ENERGYSTARScore"]].values
X_train = train_set.drop("ENERGYSTARScore", axis=1)

y_train = y_train.reshape(-1, 1)

In [11]:
from sklearn.preprocessing import OneHotEncoder

enc = OneHotEncoder()
enc.fit(X_train[cat_var])
X_train = enc.transform(X_train[cat_var])

In [12]:
from sklearn.linear_model import LinearRegression

linreg = LinearRegression()
linreg.fit(X_train, y_train)

LinearRegression()

In [13]:
test_set[num_var] = imputer_num.transform(test_set[num_var])
test_set[cat_var] = imputer_cat.transform(test_set[cat_var])

In [14]:
y_test = test_set[["ENERGYSTARScore"]].values
X_test = test_set.drop("ENERGYSTARScore", axis=1)

y_test = y_test.reshape(-1, 1)

In [15]:
X_test = enc.transform(X_test[cat_var])

In [16]:
y_pred = linreg.predict(X_test)

In [17]:
regression_metrics(y_pred, y_test)

MAE : 16.667
MSE : 537.516
RMSE : 23.184
R² : -5.867
