# Energy and Buildings

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from boruta import BorutaPy
from sklearn.feature_selection import RFE
from sklearn.svm import SVR
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import make_pipeline

In [4]:
%matplotlib inline

In [23]:
energydata_c = pd.read_csv("energydata_complete.csv",parse_dates=["date"])

In [6]:
print(energydata_c.shape)
energydata_c.head()

(19735, 29)


Unnamed: 0,date,Appliances,lights,T1,RH_1,T2,RH_2,T3,RH_3,T4,...,T9,RH_9,T_out,Press_mm_hg,RH_out,Windspeed,Visibility,Tdewpoint,rv1,rv2
0,2016-01-11 17:00:00,60,30,19.89,47.596667,19.2,44.79,19.79,44.73,19.0,...,17.033333,45.53,6.6,733.5,92.0,7.0,63.0,5.3,13.275433,13.275433
1,2016-01-11 17:10:00,60,30,19.89,46.693333,19.2,44.7225,19.79,44.79,19.0,...,17.066667,45.56,6.483333,733.6,92.0,6.666667,59.166667,5.2,18.606195,18.606195
2,2016-01-11 17:20:00,50,30,19.89,46.3,19.2,44.626667,19.79,44.933333,18.926667,...,17.0,45.5,6.366667,733.7,92.0,6.333333,55.333333,5.1,28.642668,28.642668
3,2016-01-11 17:30:00,50,40,19.89,46.066667,19.2,44.59,19.79,45.0,18.89,...,17.0,45.4,6.25,733.8,92.0,6.0,51.5,5.0,45.410389,45.410389
4,2016-01-11 17:40:00,60,40,19.89,46.333333,19.2,44.53,19.79,45.0,18.89,...,17.0,45.4,6.133333,733.9,92.0,5.666667,47.666667,4.9,10.084097,10.084097


In [7]:
energydata_c.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19735 entries, 0 to 19734
Data columns (total 29 columns):
date           19735 non-null datetime64[ns]
Appliances     19735 non-null int64
lights         19735 non-null int64
T1             19735 non-null float64
RH_1           19735 non-null float64
T2             19735 non-null float64
RH_2           19735 non-null float64
T3             19735 non-null float64
RH_3           19735 non-null float64
T4             19735 non-null float64
RH_4           19735 non-null float64
T5             19735 non-null float64
RH_5           19735 non-null float64
T6             19735 non-null float64
RH_6           19735 non-null float64
T7             19735 non-null float64
RH_7           19735 non-null float64
T8             19735 non-null float64
RH_8           19735 non-null float64
T9             19735 non-null float64
RH_9           19735 non-null float64
T_out          19735 non-null float64
Press_mm_hg    19735 non-null float64
RH_out        

In [24]:
energydata_c['month'] = energydata_c['date'].dt.month
energydata_c['day-of-year'] = energydata_c['date'].dt.dayofyear
energydata_c['day'] = energydata_c['date'].dt.day
energydata_c['hour'] = energydata_c['date'].dt.hour

In [25]:
energydata_c.drop(['date'], 1,inplace=True)


In [26]:
energydata = energydata_c
energydata.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19735 entries, 0 to 19734
Data columns (total 32 columns):
Appliances     19735 non-null int64
lights         19735 non-null int64
T1             19735 non-null float64
RH_1           19735 non-null float64
T2             19735 non-null float64
RH_2           19735 non-null float64
T3             19735 non-null float64
RH_3           19735 non-null float64
T4             19735 non-null float64
RH_4           19735 non-null float64
T5             19735 non-null float64
RH_5           19735 non-null float64
T6             19735 non-null float64
RH_6           19735 non-null float64
T7             19735 non-null float64
RH_7           19735 non-null float64
T8             19735 non-null float64
RH_8           19735 non-null float64
T9             19735 non-null float64
RH_9           19735 non-null float64
T_out          19735 non-null float64
Press_mm_hg    19735 non-null float64
RH_out         19735 non-null float64
Windspeed      19735 

In [9]:
X = energydata.iloc[:, 1:32].values
y = energydata.iloc[:, 0].values

In [11]:
energydata.iloc[:, 1:32].columns

Index(['lights', 'T1', 'RH_1', 'T2', 'RH_2', 'T3', 'RH_3', 'T4', 'RH_4', 'T5',
       'RH_5', 'T6', 'RH_6', 'T7', 'RH_7', 'T8', 'RH_8', 'T9', 'RH_9', 'T_out',
       'Press_mm_hg', 'RH_out', 'Windspeed', 'Visibility', 'Tdewpoint', 'rv1',
       'rv2', 'month', 'day-of-year', 'day', 'hour'],
      dtype='object')

In [12]:
rf = RandomForestClassifier(n_jobs=-1, class_weight='auto', max_depth=7)
# define Boruta feature selection method
feat_selector = BorutaPy(rf, n_estimators='auto', verbose=2)

In [13]:
print(X)

[[ 30.          19.89        47.59666667 ...  11.          11.
   17.        ]
 [ 30.          19.89        46.69333333 ...  11.          11.
   17.        ]
 [ 30.          19.89        46.3        ...  11.          11.
   17.        ]
 ...
 [ 10.          25.5         46.59666667 ... 148.          27.
   17.        ]
 [ 10.          25.5         46.99       ... 148.          27.
   17.        ]
 [ 10.          25.5         46.6        ... 148.          27.
   18.        ]]


In [14]:
print(y)

[ 60  60  50 ... 270 420 430]


# Boruta selection

In [15]:
 # find all relevant features
#feat_selector.fit(X, y)
 
# check selected features
#feat_selector.support_
 
# check ranking of features
#feat_selector.ranking_
 
# call transform() on X to filter it down to selected features
#X_filtered = feat_selector.transform(X)
estimator = SVR(kernel="linear")

In [66]:
selector = RFE(estimator, 5, step=1)

In [67]:
selector = selector.fit(X, y)

In [69]:
selector.support_ 

array([False, False,  True, False,  True,  True, False, False, False,
       False, False, False, False, False, False,  True, False,  True,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False])

In [68]:
#energydata.iloc[0].columns
energydata.iloc[:, 1:32].columns[selector.support_]

Index(['RH_1', 'RH_2', 'T3', 'T8', 'T9'], dtype='object')

In [53]:
energydata.iloc[:, 0].head

<bound method NDFrame.head of 0         60
1         60
2         50
3         50
4         60
5         50
6         60
7         60
8         60
9         70
10       230
11       580
12       430
13       250
14       100
15       100
16        90
17        70
18        80
19       140
20       120
21       190
22       110
23       110
24       110
25       110
26       100
27       100
28       100
29       100
        ... 
19705    280
19706    240
19707    250
19708    220
19709    230
19710    190
19711    160
19712     80
19713     60
19714     60
19715     60
19716     60
19717     70
19718     80
19719     80
19720     70
19721    100
19722    100
19723     90
19724    100
19725    220
19726    180
19727    120
19728    110
19729     90
19730    100
19731     90
19732    270
19733    420
19734    430
Name: Appliances, Length: 19735, dtype: int64>

In [56]:
energydata.iloc[:, 1:29].columns

Index(['lights', 'T1', 'RH_1', 'T2', 'RH_2', 'T3', 'RH_3', 'T4', 'RH_4', 'T5',
       'RH_5', 'T6', 'RH_6', 'T7', 'RH_7', 'T8', 'RH_8', 'T9', 'RH_9', 'T_out',
       'Press_mm_hg', 'RH_out', 'Windspeed', 'Visibility', 'Tdewpoint', 'rv1',
       'rv2', 'month'],
      dtype='object')

In [57]:
selector.ranking_

array([11,  3,  1,  2,  1,  1, 19, 10, 13,  9, 16,  8, 20,  4, 18,  1,  5,
        1,  7, 17, 21, 15, 12, 23, 14, 24, 22,  6])

In [16]:
estimator = SVR(kernel="linear")
selector = RFE(estimator, 10, step=1)

In [17]:
selector = selector.fit(X, y)

In [18]:
energydata.iloc[:, 1:32].columns[selector.support_]

Index(['T1', 'RH_1', 'T2', 'RH_2', 'T3', 'T7', 'T8', 'RH_8', 'T9', 'month'], dtype='object')

In [21]:
# call transform() on X to filter it down to selected features
X_filtered = selector.transform(X)

In [25]:
selector.ranking_

array([ 5,  1,  1,  1,  1,  1, 18,  4, 11, 17, 15,  3, 16,  1, 14,  1,  1,
        1,  2,  6, 19, 13,  7, 22, 12, 20, 21,  1,  9, 10,  8])

In [29]:
X = energydata.loc[1:32, ['T1', 'RH_1', 'T2', 'RH_2', 'T3', 'T7', 'T8', 'RH_8', 'T9', 'month']]
y = energydata.iloc[:, 0].values

In [32]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32 entries, 1 to 32
Data columns (total 10 columns):
T1       32 non-null float64
RH_1     32 non-null float64
T2       32 non-null float64
RH_2     32 non-null float64
T3       32 non-null float64
T7       32 non-null float64
T8       32 non-null float64
RH_8     32 non-null float64
T9       32 non-null float64
month    32 non-null int64
dtypes: float64(9), int64(1)
memory usage: 2.6 KB


In [28]:
energydata.drop(columns=['lights', 'RH_3', 'T4', 'RH_4', 'T5', 'RH_5', 'T6', 'RH_6', 'RH_7','RH_9', 'T_out', 'Press_mm_hg', 'RH_out', 'Windspeed', 'Visibility', 'Tdewpoint', 'rv1', 'rv2', 'day-of-year', 'day', 'hour'], axis=1,inplace=True)

In [34]:
energydata.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19735 entries, 0 to 19734
Data columns (total 11 columns):
Appliances    19735 non-null int64
T1            19735 non-null float64
RH_1          19735 non-null float64
T2            19735 non-null float64
RH_2          19735 non-null float64
T3            19735 non-null float64
T7            19735 non-null float64
T8            19735 non-null float64
RH_8          19735 non-null float64
T9            19735 non-null float64
month         19735 non-null int64
dtypes: float64(9), int64(2)
memory usage: 1.7 MB
