# Machine Learning Regression: Predicting Energy Efficiency Of Buildings
## For Hamoye Internship Stage B
## By Tito Akinyemi

## Loading the dataset

In [66]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt


In [67]:
# loading the dataset
df= pd.read_csv('energydata_complete.csv')
df.head()

Unnamed: 0,date,Appliances,lights,T1,RH_1,T2,RH_2,T3,RH_3,T4,...,T9,RH_9,T_out,Press_mm_hg,RH_out,Windspeed,Visibility,Tdewpoint,rv1,rv2
0,2016-01-11 17:00:00,60,30,19.89,47.596667,19.2,44.79,19.79,44.73,19.0,...,17.033333,45.53,6.6,733.5,92.0,7.0,63.0,5.3,13.275433,13.275433
1,2016-01-11 17:10:00,60,30,19.89,46.693333,19.2,44.7225,19.79,44.79,19.0,...,17.066667,45.56,6.483333,733.6,92.0,6.666667,59.166667,5.2,18.606195,18.606195
2,2016-01-11 17:20:00,50,30,19.89,46.3,19.2,44.626667,19.79,44.933333,18.926667,...,17.0,45.5,6.366667,733.7,92.0,6.333333,55.333333,5.1,28.642668,28.642668
3,2016-01-11 17:30:00,50,40,19.89,46.066667,19.2,44.59,19.79,45.0,18.89,...,17.0,45.4,6.25,733.8,92.0,6.0,51.5,5.0,45.410389,45.410389
4,2016-01-11 17:40:00,60,40,19.89,46.333333,19.2,44.53,19.79,45.0,18.89,...,17.0,45.4,6.133333,733.9,92.0,5.666667,47.666667,4.9,10.084097,10.084097


Date, time year-month-day hour:minute:second

Appliances, energy use in Wh 

lights, energy use of light fixtures in the house in Wh

T1, Temperature in kitchen area, in Celsius

RH_1, Humidity in kitchen area, in %

T2, Temperature in living room area, in Celsius

RH_2, Humidity in living room area, in %

T3, Temperature in laundry room area

RH_3, Humidity in laundry room area, in %

T4, Temperature in office room, in Celsius

RH_4, Humidity in office room, in %

T5, Temperature in bathroom, in Celsius

RH_5, Humidity in bathroom, in %

T6, Temperature outside the building (north side), in Celsius

RH_6, Humidity outside the building (north side), in %

T7, Temperature in ironing room , in Celsius

RH_7, Humidity in ironing room, in %

T8, Temperature in teenager room 2, in Celsius

RH_8, Humidity in teenager room 2, in %

T9, Temperature in parents room, in Celsius

RH_9, Humidity in parents room, in %

To, Temperature outside (from Chievres weather station), in Celsius

Pressure (from Chievres weather station), in mm Hg

RH_out, Humidity outside (from Chievres weather station), in %

Wind speed (from Chievres weather station), in m/s

Visibility (from Chievres weather station), in km

Tdewpoint (from Chievres weather station), Â°C

rv1, Random variable 1, nondimensional

rv2, Random variable 2, nondimensional

## Setting the scenes and Exploring the dataset

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19735 entries, 0 to 19734
Data columns (total 29 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   date         19735 non-null  object 
 1   Appliances   19735 non-null  int64  
 2   lights       19735 non-null  int64  
 3   T1           19735 non-null  float64
 4   RH_1         19735 non-null  float64
 5   T2           19735 non-null  float64
 6   RH_2         19735 non-null  float64
 7   T3           19735 non-null  float64
 8   RH_3         19735 non-null  float64
 9   T4           19735 non-null  float64
 10  RH_4         19735 non-null  float64
 11  T5           19735 non-null  float64
 12  RH_5         19735 non-null  float64
 13  T6           19735 non-null  float64
 14  RH_6         19735 non-null  float64
 15  T7           19735 non-null  float64
 16  RH_7         19735 non-null  float64
 17  T8           19735 non-null  float64
 18  RH_8         19735 non-null  float64
 19  T9  

In [10]:
# renaming the columns for easy identification
columns_names= {
   'date': 'date',

"Appliances": "appliances",

'lights': "lights",

'T1': "kitchen_temp",

'RH_1': "kitchen_humidty",

'T2': "LivingRoom_temp",

'RH_2': "livingRoom_humidity",

"T3" :"laundry_temp",

"RH_3": "laundry_humidity",
"T4" : "office_temp", 

'RH_4': "office_humidity",

'T5' : "bathroom_temp",

'RH_5': "bathroom_humidity",

'T6': "outsideNS_temp",

'RH_6': "outsideNS_humidity",

'T7': "ironingRoom_temp",

'RH_7': "ironingRoom_humidity",

'T8' : "teenRoom_temp",

'RH_8': "teenRoom_humidity",

'T9': "parentRoom_temp",

'RH_9': "parentRoom_humidty",

'T_out': "outsideCWS_temp",

'Press_mm_hg': "outsideCWS_pressure",

'RH_out': "outsideCWS_humidity",

'Windspeed' : "windSpeed_CWS",

'Visibility': "visibility_CWS",

'Tdewpoint' :"Tdewpoint_CWS",

'rv1': "random_variable1",

'rv2': "random_variable2"
    
}

In [11]:
# assigning the new names to columns
df.rename(columns=columns_names, inplace=True)
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19735 entries, 0 to 19734
Data columns (total 29 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   date                  19735 non-null  object 
 1   appliances            19735 non-null  int64  
 2   lights                19735 non-null  int64  
 3   kitchen_temp          19735 non-null  float64
 4   kitchen_humidty       19735 non-null  float64
 5   LivingRoom_temp       19735 non-null  float64
 6   livingRoom_humidity   19735 non-null  float64
 7   laundry_temp          19735 non-null  float64
 8   laundry_humidity      19735 non-null  float64
 9   office_temp           19735 non-null  float64
 10  office_humidity       19735 non-null  float64
 11  bathroom_temp         19735 non-null  float64
 12  bathroom_humidity     19735 non-null  float64
 13  outsideNS_temp        19735 non-null  float64
 14  outsideNS_humidity    19735 non-null  float64
 15  ironingRoom_temp   

In [13]:
df.isna().sum()

date                    0
appliances              0
lights                  0
kitchen_temp            0
kitchen_humidty         0
LivingRoom_temp         0
livingRoom_humidity     0
laundry_temp            0
laundry_humidity        0
office_temp             0
office_humidity         0
bathroom_temp           0
bathroom_humidity       0
outsideNS_temp          0
outsideNS_humidity      0
ironingRoom_temp        0
ironingRoom_humidity    0
teenRoom_temp           0
teenRoom_humidity       0
parentRoom_temp         0
parentRoom_humidty      0
outsideCWS_temp         0
outsideCWS_pressure     0
outsideCWS_humidity     0
windSpeed_CWS           0
visibility_CWS          0
Tdewpoint_CWS           0
random_variable1        0
random_variable2        0
dtype: int64

In [14]:
df.duplicated().sum()

0

## Tag along code to Quiz

### Question 17
#### From the dataset, fit a linear model on the relationship between the temperature in the living room in Celsius (x = T2) and the temperature outside the building (y = T6). What is the Root Mean Squared error in three D.P?


In [10]:
# separating the dataset to features(dependent variables) and target(response variable)
x= df[['T2']]
y= df[['T6']]

#Now splitting the training and testing dataset.
# Recall that we had earlier segmented the features and target variables.
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(x, y, random_state= 3)

# importing the model
from sklearn.linear_model import LinearRegression

# creating the model 
linear_model= LinearRegression()

#fit the model 
linear_model.fit(x_train, y_train)

LinearRegression()

In [11]:
model_prediction= linear_model.predict(y_test)

from sklearn.metrics import mean_squared_error

mse= mean_squared_error(y_test, model_prediction)
rmse= np.sqrt(mse)
round(rmse, 3)

Feature names unseen at fit time:
- T6
Feature names seen at fit time, yet now missing:
- T2



28.611

### Question 18
#### 


In [68]:
df= df.drop(columns= ['date', 'lights'])

In [None]:
y= df['Appliances']

In [70]:

#Now splitting the training and testing dataset.
# Recall that we had earlier segmented the features and target variables.
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size= 0.3, random_state= 42)



In [69]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()

norm_df= pd.DataFrame(scaler.fit_transform(df) , columns= df.columns)
norm_df.head()


Unnamed: 0,Appliances,T1,RH_1,T2,RH_2,T3,RH_3,T4,RH_4,T5,...,T9,RH_9,T_out,Press_mm_hg,RH_out,Windspeed,Visibility,Tdewpoint,rv1,rv2
0,0.046729,0.32735,0.566187,0.225345,0.684038,0.215188,0.746066,0.351351,0.764262,0.175506,...,0.223032,0.67729,0.37299,0.097674,0.894737,0.5,0.953846,0.538462,0.265449,0.265449
1,0.046729,0.32735,0.541326,0.225345,0.68214,0.215188,0.748871,0.351351,0.782437,0.175506,...,0.2265,0.678532,0.369239,0.1,0.894737,0.47619,0.894872,0.533937,0.372083,0.372083
2,0.037383,0.32735,0.530502,0.225345,0.679445,0.215188,0.755569,0.344745,0.778062,0.175506,...,0.219563,0.676049,0.365488,0.102326,0.894737,0.452381,0.835897,0.529412,0.572848,0.572848
3,0.037383,0.32735,0.52408,0.225345,0.678414,0.215188,0.758685,0.341441,0.770949,0.175506,...,0.219563,0.671909,0.361736,0.104651,0.894737,0.428571,0.776923,0.524887,0.908261,0.908261
4,0.046729,0.32735,0.531419,0.225345,0.676727,0.215188,0.758685,0.341441,0.762697,0.178691,...,0.219563,0.671909,0.357985,0.106977,0.894737,0.404762,0.717949,0.520362,0.201611,0.201611


In [39]:
df= df.drop(columns= ['date', 'lights'])

from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()

norm_df= pd.DataFrame(scaler.fit_transform(df) , columns= df.columns)
norm_df.head()



Unnamed: 0,Appliances,T1,RH_1,T2,RH_2,T3,RH_3,T4,RH_4,T5,...,T9,RH_9,T_out,Press_mm_hg,RH_out,Windspeed,Visibility,Tdewpoint,rv1,rv2
0,0.046729,0.32735,0.566187,0.225345,0.684038,0.215188,0.746066,0.351351,0.764262,0.175506,...,0.223032,0.67729,0.37299,0.097674,0.894737,0.5,0.953846,0.538462,0.265449,0.265449
1,0.046729,0.32735,0.541326,0.225345,0.68214,0.215188,0.748871,0.351351,0.782437,0.175506,...,0.2265,0.678532,0.369239,0.1,0.894737,0.47619,0.894872,0.533937,0.372083,0.372083
2,0.037383,0.32735,0.530502,0.225345,0.679445,0.215188,0.755569,0.344745,0.778062,0.175506,...,0.219563,0.676049,0.365488,0.102326,0.894737,0.452381,0.835897,0.529412,0.572848,0.572848
3,0.037383,0.32735,0.52408,0.225345,0.678414,0.215188,0.758685,0.341441,0.770949,0.175506,...,0.219563,0.671909,0.361736,0.104651,0.894737,0.428571,0.776923,0.524887,0.908261,0.908261
4,0.046729,0.32735,0.531419,0.225345,0.676727,0.215188,0.758685,0.341441,0.762697,0.178691,...,0.219563,0.671909,0.357985,0.106977,0.894737,0.404762,0.717949,0.520362,0.201611,0.201611


In [61]:

# separating the dataset to features(dependent variables) and target(response variable)
x= norm_df.drop(columns= ['Appliances'])
y= norm_df['Appliances']


#Now splitting the training and testing dataset.
# Recall that we had earlier segmented the features and target variables.
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size= 0.3, random_state= 42)


# importing the model
from sklearn.linear_model import LinearRegression

# creating the model 
linear_model= LinearRegression()

#fit the model 
linear_model.fit(x_train, y_train)


LinearRegression()

In [71]:
# importing the model
from sklearn.linear_model import LinearRegression

# creating the model 
linear_model= LinearRegression()

#fit the model 
linear_model.fit(x_train, y_train)

mod= linear_model.predict(x_train)

In [72]:
from sklearn.metrics import mean_absolute_error
mae= mean_absolute_error(y_train, mod)
mae

0.0502264309111538

### Question 19
#### 


In [73]:
from sklearn.metrics import mean_squared_error

mse= mean_squared_error(y_train, mod)
rmse= np.sqrt(mse)
rmse

0.08898659799265987

In [None]:
mae= 

### Question 20
#### 


In [74]:
prediction= linear_model.predict(x_test)

In [75]:
from sklearn.metrics import mean_absolute_error
mae= mean_absolute_error(y_test, prediction)
mae

0.05013362397742955

### Question 21
#### 


In [76]:
from sklearn.metrics import mean_squared_error

mse= mean_squared_error(y_test, prediction)
rmse= np.sqrt(mse)
rmse

0.0875144494766171

### Question 22
#### 


In [77]:
from sklearn.linear_model import Ridge
ridge_reg= Ridge(alpha=0.5)
ridge= ridge_reg.fit(x_train, y_train)

In [79]:
ridge_prediction= ridge_reg.predict(x_test)
# testing the change
ridge_mae= mean_absolute_error(y_test, ridge_prediction)
mae

0.05013362397742955

### Question 23
#### 


In [80]:
from sklearn.linear_model import Lasso
lasso_reg= Lasso(alpha=0.001)
lasso_reg.fit(x_train, y_train)

Lasso(alpha=0.001)

In [81]:
#comparing the effects of regularisation
def get_weights_df(model, feat, col_name):
    #this function returns the weight of every feature
    weights = pd.Series(model.coef_, feat.columns).sort_values()
    weights_df = pd.DataFrame(weights).reset_index()
    weights_df.columns = ['Features', col_name]
    weights_df[col_name].round(3)
    return weights_df

linear_model_weights = get_weights_df(linear_model, x_train, 'Linear_Model_Weight')
ridge_weights_df = get_weights_df(ridge_reg, x_train, 'Ridge_Weight')
lasso_weights_df = get_weights_df(lasso_reg, x_train, 'Lasso_weight')
final_weights = pd.merge(linear_model_weights, ridge_weights_df, on='Features')
final_weights = pd.merge(final_weights, lasso_weights_df, on='Features')

### Question 24
#### 


In [82]:
final_weights = pd.merge(linear_model_weights, ridge_weights_df, on='Features')
final_weights = pd.merge(final_weights, lasso_weights_df, on='Features')

In [86]:
final_weights

Unnamed: 0,Features,Linear_Model_Weight,Ridge_Weight,Lasso_weight
0,RH_2,-0.456698,-0.401134,-0.0
1,T_out,-0.32186,-0.250765,0.0
2,T2,-0.236178,-0.19388,0.0
3,T9,-0.189941,-0.188584,-0.0
4,RH_8,-0.157595,-0.156596,-0.00011
5,RH_out,-0.077671,-0.050541,-0.049557
6,RH_7,-0.044614,-0.046291,-0.0
7,RH_9,-0.0398,-0.041701,-0.0
8,T5,-0.015657,-0.020727,-0.0
9,T1,-0.003281,-0.021549,0.0


### Question 25
#### 


In [None]:
from sklearn.metrics import mean_squared_error

mse= mean_squared_error(y_test, prediction)
rmse= np.sqrt(mse)
rmse

In [87]:
lasso_prediction= lasso_reg.predict(x_test)
from sklearn.metrics import mean_squared_error

mse= mean_squared_error(y_test, lasso_prediction)
rmse= np.sqrt(mse)
rmse

0.09358170467245137