# Predicting Energy Efficiency of Buildings

The data set is recorded at 10-minute intervals over about 4.5 months. The house temperature and humidity conditions were monitored with a ZigBee wireless sensor network. Each wireless node transmitted the temperature and humidity conditions roughly every 3.3 minutes. The wireless data was then averaged over 10-minute periods. The energy data was logged every 10 minutes with m-bus energy meters. Weather data from the nearest airport weather station (Chievres Airport, Belgium) was downloaded from a public data set from Reliable Prognosis (rp5.ru) and merged with the experimental data sets using the date and time column. Two random variables have been included in the data set for testing the regression models and for filtering out non-predictive attributes (parameters).

In [1]:
#Importing libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Read the raw CSV (expected in the notebook's working directory) into a DataFrame
energy = pd.read_csv(filepath_or_buffer='energydata_complete.csv')

In [3]:
# Preview the first 10 rows of the raw data
energy.head(n=10)

Unnamed: 0,date,Appliances,lights,T1,RH_1,T2,RH_2,T3,RH_3,T4,...,T9,RH_9,T_out,Press_mm_hg,RH_out,Windspeed,Visibility,Tdewpoint,rv1,rv2
0,2016-01-11 17:00:00,60,30,19.89,47.596667,19.2,44.79,19.79,44.73,19.0,...,17.033333,45.53,6.6,733.5,92.0,7.0,63.0,5.3,13.275433,13.275433
1,2016-01-11 17:10:00,60,30,19.89,46.693333,19.2,44.7225,19.79,44.79,19.0,...,17.066667,45.56,6.483333,733.6,92.0,6.666667,59.166667,5.2,18.606195,18.606195
2,2016-01-11 17:20:00,50,30,19.89,46.3,19.2,44.626667,19.79,44.933333,18.926667,...,17.0,45.5,6.366667,733.7,92.0,6.333333,55.333333,5.1,28.642668,28.642668
3,2016-01-11 17:30:00,50,40,19.89,46.066667,19.2,44.59,19.79,45.0,18.89,...,17.0,45.4,6.25,733.8,92.0,6.0,51.5,5.0,45.410389,45.410389
4,2016-01-11 17:40:00,60,40,19.89,46.333333,19.2,44.53,19.79,45.0,18.89,...,17.0,45.4,6.133333,733.9,92.0,5.666667,47.666667,4.9,10.084097,10.084097
5,2016-01-11 17:50:00,50,40,19.89,46.026667,19.2,44.5,19.79,44.933333,18.89,...,17.0,45.29,6.016667,734.0,92.0,5.333333,43.833333,4.8,44.919484,44.919484
6,2016-01-11 18:00:00,60,50,19.89,45.766667,19.2,44.5,19.79,44.9,18.89,...,17.0,45.29,5.9,734.1,92.0,5.0,40.0,4.7,47.233763,47.233763
7,2016-01-11 18:10:00,60,50,19.856667,45.56,19.2,44.5,19.73,44.9,18.89,...,17.0,45.29,5.916667,734.166667,91.833333,5.166667,40.0,4.683333,33.03989,33.03989
8,2016-01-11 18:20:00,60,40,19.79,45.5975,19.2,44.433333,19.73,44.79,18.89,...,17.0,45.29,5.933333,734.233333,91.666667,5.333333,40.0,4.666667,31.455702,31.455702
9,2016-01-11 18:30:00,70,40,19.856667,46.09,19.23,44.4,19.79,44.863333,18.89,...,17.0,45.29,5.95,734.3,91.5,5.5,40.0,4.65,3.089314,3.089314


In [4]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns
energy.describe()

Unnamed: 0,Appliances,lights,T1,RH_1,T2,RH_2,T3,RH_3,T4,RH_4,...,T9,RH_9,T_out,Press_mm_hg,RH_out,Windspeed,Visibility,Tdewpoint,rv1,rv2
count,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,...,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0
mean,97.694958,3.801875,21.686571,40.259739,20.341219,40.42042,22.267611,39.2425,20.855335,39.026904,...,19.485828,41.552401,7.411665,755.522602,79.750418,4.039752,38.330834,3.760707,24.988033,24.988033
std,102.524891,7.935988,1.606066,3.979299,2.192974,4.069813,2.006111,3.254576,2.042884,4.341321,...,2.014712,4.151497,5.317409,7.399441,14.901088,2.451221,11.794719,4.194648,14.496634,14.496634
min,10.0,0.0,16.79,27.023333,16.1,20.463333,17.2,28.766667,15.1,27.66,...,14.89,29.166667,-5.0,729.3,24.0,0.0,1.0,-6.6,0.005322,0.005322
25%,50.0,0.0,20.76,37.333333,18.79,37.9,20.79,36.9,19.53,35.53,...,18.0,38.5,3.666667,750.933333,70.333333,2.0,29.0,0.9,12.497889,12.497889
50%,60.0,0.0,21.6,39.656667,20.0,40.5,22.1,38.53,20.666667,38.4,...,19.39,40.9,6.916667,756.1,83.666667,3.666667,40.0,3.433333,24.897653,24.897653
75%,100.0,0.0,22.6,43.066667,21.5,43.26,23.29,41.76,22.1,42.156667,...,20.6,44.338095,10.408333,760.933333,91.666667,5.5,40.0,6.566667,37.583769,37.583769
max,1080.0,70.0,26.26,63.36,29.856667,56.026667,29.236,50.163333,26.2,51.09,...,24.5,53.326667,26.1,772.3,100.0,14.0,66.0,15.5,49.99653,49.99653


In [5]:
# Column names, non-null counts and dtypes of the loaded frame
energy.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19735 entries, 0 to 19734
Data columns (total 29 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   date         19735 non-null  object 
 1   Appliances   19735 non-null  int64  
 2   lights       19735 non-null  int64  
 3   T1           19735 non-null  float64
 4   RH_1         19735 non-null  float64
 5   T2           19735 non-null  float64
 6   RH_2         19735 non-null  float64
 7   T3           19735 non-null  float64
 8   RH_3         19735 non-null  float64
 9   T4           19735 non-null  float64
 10  RH_4         19735 non-null  float64
 11  T5           19735 non-null  float64
 12  RH_5         19735 non-null  float64
 13  T6           19735 non-null  float64
 14  RH_6         19735 non-null  float64
 15  T7           19735 non-null  float64
 16  RH_7         19735 non-null  float64
 17  T8           19735 non-null  float64
 18  RH_8         19735 non-null  float64
 19  T9  

In [6]:
# Count missing values per column (isna is the canonical name of isnull)
energy.isna().sum()

date           0
Appliances     0
lights         0
T1             0
RH_1           0
T2             0
RH_2           0
T3             0
RH_3           0
T4             0
RH_4           0
T5             0
RH_5           0
T6             0
RH_6           0
T7             0
RH_7           0
T8             0
RH_8           0
T9             0
RH_9           0
T_out          0
Press_mm_hg    0
RH_out         0
Windspeed      0
Visibility     0
Tdewpoint      0
rv1            0
rv2            0
dtype: int64

In [7]:
# Drop the 'lights' and 'date' columns, which are not used as model inputs.
# A single call with errors='ignore' keeps this cell safe to re-run: the
# original pair of in-place drops raised KeyError on a second execution
# because the columns were already gone.
# NOTE(review): errors='ignore' also hides a column that is missing from the
# CSV itself; the earlier energy.info() output is the place to confirm the
# schema.
energy.drop(columns=['lights', 'date'], errors='ignore', inplace=True)

In [8]:
#Import scaler and split libraries
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

In [10]:
# Learn the per-column min/max of every remaining column (target included)
# NOTE(review): the scaler is fit on the full data set before the train/test
# split below, so test-set ranges leak into the scaling - consider fitting on
# the training rows only.
ss = MinMaxScaler().fit(energy)

# Apply the min-max scaling and restore the original column labels
norm_data = pd.DataFrame(ss.transform(energy), columns=energy.columns)

In [11]:
# Target: the normalized appliance energy use
var_tar = norm_data['Appliances']

# Features: every other normalized column
var_feat = norm_data.drop(columns=['Appliances'])

In [12]:
# Hold out 30% of the rows for testing; the fixed seed makes the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    var_feat,
    var_tar,
    test_size=0.3,
    random_state=42,
)

In [13]:
#Import model algorithms
from sklearn.linear_model import Ridge, Lasso, LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [22]:
# Single-feature regression on the normalized data: predictor T2, response T6.
# Both columns are features of the main model, so they are sliced from the
# feature frames (X_train / X_test) rather than from y_train / y_test.
energy_train_x, energy_train_y = X_train[['T2']], X_train[['T6']]
energy_test_x, energy_test_y = X_test[['T2']], X_test[['T6']]

# Fit on the training rows, then predict T6 for the held-out rows
energy_lin = LinearRegression().fit(energy_train_x, energy_train_y)
pred1 = energy_lin.predict(energy_test_x)

# R^2 on the held-out rows, rounded to 2 decimals (shown as the cell output)
energy_r2 = round(r2_score(energy_test_y, pred1), 2)
energy_r2

0.64

In [16]:
# Ordinary least squares on all features; fit() returns the fitted estimator
energy_reg = LinearRegression().fit(X_train, y_train)

# Predicted (normalized) appliance energy for the held-out rows
y_pred = energy_reg.predict(X_test)

In [18]:
# Mean absolute error of the linear model on the test set, to 2 decimals
energy_mae = round(mean_absolute_error(y_true=y_test, y_pred=y_pred), 2)
energy_mae

0.05

In [19]:
# Residual sum of squares: sum of squared (actual - predicted) over the test rows
energy_rss = round(np.square(y_test - y_pred).sum(), 2)
energy_rss

45.35

In [20]:
# Root mean squared error = sqrt(MSE) on the test set, to 3 decimals
energy_rmse = round(
    np.sqrt(mean_squared_error(y_true=y_test, y_pred=y_pred)),
    3,
)
energy_rmse

0.088

In [21]:
# Coefficient of determination (R^2) of the linear model on the test set;
# the unrounded value is kept in energy_coef, the display is rounded to 2 decimals
energy_coef = r2_score(y_true=y_test, y_pred=y_pred)
round(energy_coef, 2)

0.15

In [25]:
#Feature weight
def weights(energy_lin, energy_feat, energy_col, decimals=None):
    """Return a fitted linear model's coefficients as a table sorted by weight.

    Parameters
    ----------
    energy_lin : fitted estimator
        Any object exposing a ``coef_`` array (e.g. LinearRegression, Ridge,
        Lasso).
    energy_feat : pandas.DataFrame
        Frame whose columns name the features, in the same order as ``coef_``.
    energy_col : str
        Header to use for the weight column of the result.
    decimals : int, optional
        If given, round the weights to this many decimal places. The default
        (None) returns the raw coefficients, so very small non-zero weights
        are not turned into 0 before a caller tests ``!= 0``.

    Returns
    -------
    pandas.DataFrame
        Two columns, ``'Features'`` and ``energy_col``, sorted ascending by
        weight with a fresh 0..n-1 index.
    """
    # coef_ has shape (1, n_features) when the model was fit on a 2-D target;
    # flatten so it lines up with the feature names either way.
    coefs = np.ravel(energy_lin.coef_)
    energy_w = pd.Series(coefs, energy_feat.columns).sort_values()
    energy_w1 = energy_w.reset_index()
    energy_w1.columns = ['Features', energy_col]
    if decimals is not None:
        # Series.round returns a new Series, so the result must be assigned back
        energy_w1[energy_col] = energy_w1[energy_col].round(decimals)
    return energy_w1

# Coefficient table of the all-features linear regression, most negative first
model_weight = weights(
    energy_lin=energy_reg,
    energy_feat=X_train,
    energy_col='Feature Weights',
)
model_weight

Unnamed: 0,Features,Feature Weights
0,RH_2,-0.456698
1,T_out,-0.32186
2,T2,-0.236178
3,T9,-0.189941
4,RH_8,-0.157595
5,RH_out,-0.077671
6,RH_7,-0.044614
7,RH_9,-0.0398
8,T5,-0.015657
9,T1,-0.003281


In [27]:
# Ridge (L2-penalized) regression with alpha=0.4, fitted on the training rows
energy_rid = Ridge(alpha=0.4).fit(X_train, y_train)

# Predict the held-out rows and report the test RMSE to 3 decimals
pred2 = energy_rid.predict(X_test)
energy_rid_rmse = round(np.sqrt(mean_squared_error(y_true=y_test, y_pred=pred2)), 3)
print(f'RMSE score: {energy_rid_rmse}')

RMSE score: 0.088


In [29]:
# Lasso (L1-penalized) regression with alpha=0.001, fitted on the training rows
energy_lasso = Lasso(alpha=0.001).fit(X_train, y_train)

# Predictions for the held-out rows
pred3 = energy_lasso.predict(X_test)

In [35]:
# Feature weight
def weights(energy_lin, energy_feat, energy_col, decimals=None):
    """Return a fitted linear model's coefficients as a table sorted by weight.

    Parameters
    ----------
    energy_lin : fitted estimator
        Any object exposing a ``coef_`` array (e.g. LinearRegression, Ridge,
        Lasso).
    energy_feat : pandas.DataFrame
        Frame whose columns name the features, in the same order as ``coef_``.
    energy_col : str
        Header to use for the weight column of the result.
    decimals : int, optional
        If given, round the weights to this many decimal places. The default
        (None) returns the raw coefficients, so very small non-zero weights
        are not turned into 0 before a caller tests ``!= 0``.

    Returns
    -------
    pandas.DataFrame
        Two columns, ``'Features'`` and ``energy_col``, sorted ascending by
        weight with a fresh 0..n-1 index.
    """
    # coef_ has shape (1, n_features) when the model was fit on a 2-D target;
    # flatten so it lines up with the feature names either way.
    coefs = np.ravel(energy_lin.coef_)
    energy_w = pd.Series(coefs, energy_feat.columns).sort_values()
    energy_w1 = energy_w.reset_index()
    energy_w1.columns = ['Features', energy_col]
    if decimals is not None:
        # Series.round returns a new Series, so the result must be assigned back
        energy_w1[energy_col] = energy_w1[energy_col].round(decimals)
    return energy_w1

# Coefficient table of the Lasso model
model_weight2 = weights(energy_lasso, X_train, 'Feature Weight')

# Keep only the features whose Lasso coefficient is not exactly zero
has_weight = model_weight2['Feature Weight'].ne(0)
non_zero = model_weight2.loc[has_weight]

# Report how many features the L1 penalty left in the model
print(f'Non_zero: {len(non_zero)}')

Non_zero: 4


In [36]:
# Test-set RMSE of the Lasso model, to 3 decimals
lasso_rmse = round(
    np.sqrt(mean_squared_error(y_true=y_test, y_pred=pred3)),
    3,
)
lasso_rmse

0.094