In [3]:
# Importing the libraries
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Apply seaborn's default aesthetic settings to all subsequent matplotlib figures
sns.set()

# Render inline figures as SVG: vector output stays sharp and legible at any zoom
%config InlineBackend.figure_format = 'svg'

# Read Data

In [4]:
# Read consolidated data set.
# The file location can be overridden through the ENERGY_CSV environment
# variable so the notebook also runs on machines that do not have the original
# U: drive mapped. When the variable is unset, the original path is used as-is.
energy_csv_path = os.environ.get(
    'ENERGY_CSV',
    'U:\\projects\\data Science\\Energy Prediction Use Case\\data\\energy_transformed.csv',
)
energy = pd.read_csv(energy_csv_path)

In [5]:
# Sanity check: (number of rows, number of columns) of the loaded frame
energy.shape

(19735, 36)

In [6]:
# Preview the first five rows to eyeball column values and encodings
energy.head()

Unnamed: 0,Appliances,lights,T1,RH_1,T2,RH_2,T3,RH_3,T4,RH_4,...,rv1,rv2,NSM,WeekStatus,Monday,Saturday,Sunday,Thursday,Tuesday,Wednesday
0,50,0,21.566667,40.79,19.6,41.09,21.0,40.4,20.6,38.4,...,38.878697,38.878697,0,1,0,0,0,0,0,1
1,60,20,21.6,43.59,20.166667,44.2,21.426667,42.29,21.566667,43.56,...,29.036011,29.036011,0,1,0,0,0,0,1,0
2,50,10,21.79,39.9,20.1,40.79,21.39,40.59,19.2,41.0,...,48.320978,48.320978,0,0,0,0,1,0,0,0
3,60,0,22.29,35.53,19.2,38.5,23.463333,34.79,22.0,32.09,...,32.132327,32.132327,0,0,0,0,1,0,0,0
4,70,0,22.7,37.2,21.1,37.9,23.39,37.29,22.83,34.856,...,43.189054,43.189054,0,1,0,0,0,0,0,0


In [7]:
# List all column names; the feature subset for the model is picked from these
energy.columns

Index(['Appliances', 'lights', 'T1', 'RH_1', 'T2', 'RH_2', 'T3', 'RH_3', 'T4',
       'RH_4', 'T5', 'RH_5', 'T6', 'RH_6', 'T7', 'RH_7', 'T8', 'RH_8', 'T9',
       'RH_9', 'T_out', 'Press_mm_hg', 'RH_out', 'Windspeed', 'Visibility',
       'Tdewpoint', 'rv1', 'rv2', 'NSM', 'WeekStatus', 'Monday', 'Saturday',
       'Sunday', 'Thursday', 'Tuesday', 'Wednesday'],
      dtype='object')

## Build Models

In [9]:
# Enable printing multiple outputs (if any) for each cell
from IPython.core.interactiveshell import InteractiveShell
# "all" displays the value of every expression statement in a cell, not only the last one.
# This is set on the class, so it applies for the rest of the kernel session.
InteractiveShell.ast_node_interactivity = "all" # default is 'last_expr'

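Feature selection using backward elimination: build the model with ALL features, then remove one feature at a time, checking R2 / adjusted R2 and each individual feature's p-value after every fit. Drop features whose p-value is above the chosen significance threshold (0.005 as written here; 0.05 is the conventional choice), because a high p-value means the feature is not statistically significant.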

1st Iteration: Removed random variables rv1 & rv2

2nd Iteration: Removed `Press_mm_hg`, the pressure reading from the Chièvres weather station

3rd Iteration: Removed T4, RH_4 and T5, RH_5 (temperature and humidity in the office and the bathroom), because those rooms have no appliances

In [15]:
# Assign X & y

# Predictor columns that remain after the elimination rounds, listed in the
# order in which they enter the design matrix.
# NOTE(review): WeekStatus may be an exact linear combination of the intercept
# and the Saturday/Sunday dummies (the OLS summary reports Df Model 27 for these
# 28 predictors) - confirm, and consider dropping one of them.
feature_cols = [
    'lights',
    'T1', 'RH_1', 'T2', 'RH_2', 'T3', 'RH_3',
    'T6', 'RH_6', 'T7', 'RH_7', 'T8', 'RH_8', 'T9', 'RH_9',
    'T_out', 'RH_out', 'Windspeed', 'Visibility', 'Tdewpoint',
    'NSM', 'WeekStatus',
    'Monday', 'Saturday', 'Sunday', 'Thursday', 'Tuesday', 'Wednesday',
]
target_col = 'Appliances'

# X stays a DataFrame (column names are kept for the regression report);
# y is extracted as a plain NumPy array.
X = energy.loc[:, feature_cols]
y = energy[target_col].values

### Prediction with Statistical Modeling (Stats Models)

In [16]:
import statsmodels.api as sm

# Append the intercept column as the LAST column of the design matrix
# (prepend=False), so "const" is listed after all the features.
# NOTE(review): despite its name, X_train holds every row of X - no
# train/test split is visible up to this point.
X_train = sm.add_constant(X, prepend=False)

# Set up ordinary least squares with y as the response and the augmented
# matrix as the regressors
lin_mod = sm.OLS(endog=y, exog=X_train)

# Estimate the beta coefficients
SMMLRegression = lin_mod.fit()

# Print the full regression report (R2, adjusted R2, coefficients, p-values)
regression_report = SMMLRegression.summary()
print(regression_report)

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.174
Model:                            OLS   Adj. R-squared:                  0.173
Method:                 Least Squares   F-statistic:                     153.6
Date:                Tue, 11 Feb 2020   Prob (F-statistic):               0.00
Time:                        09:03:33   Log-Likelihood:            -1.1749e+05
No. Observations:               19735   AIC:                         2.350e+05
Df Residuals:                   19707   BIC:                         2.353e+05
Df Model:                          27                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
lights         1.9203      0.092     20.956      0.0

In [17]:
# Per-coefficient p-values of the fitted model at full precision (the summary
# table rounds them); these drive the choice of the next feature to eliminate
SMMLRegression.pvalues

lights         1.875391e-96
T1             1.932861e-02
RH_1          1.539762e-101
T2             1.196552e-31
RH_2           6.540539e-76
T3            2.453287e-120
RH_3           1.833308e-17
T6             1.851895e-30
RH_6           7.738417e-04
T7             2.745825e-01
RH_7           3.710352e-04
T8             4.212405e-15
RH_8           1.149194e-29
T9             1.954762e-16
RH_9           6.943031e-01
T_out          1.100355e-10
RH_out         1.783055e-02
Windspeed      1.716718e-06
Visibility     3.987870e-04
Tdewpoint      2.375370e-02
NSM            7.000076e-14
WeekStatus     4.430572e-03
Monday         1.089109e-03
Saturday       1.572706e-03
Sunday         1.964698e-01
Thursday       9.265358e-15
Tuesday        3.502737e-17
Wednesday      5.618629e-12
const          1.390721e-02
dtype: float64

In [12]:
# Exploratory inspection: list every attribute and method of the fitted results object.
# NOTE(review): this cell's execution count (In [12]) is out of sequence with the
# neighbouring cells - re-run the notebook top to bottom before sharing.
dir(SMMLRegression)

['HC0_se',
 'HC1_se',
 'HC2_se',
 'HC3_se',
 '_HCCM',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_cache',
 '_data_attr',
 '_get_robustcov_results',
 '_is_nested',
 '_wexog_singular_values',
 'aic',
 'bic',
 'bse',
 'centered_tss',
 'compare_f_test',
 'compare_lm_test',
 'compare_lr_test',
 'condition_number',
 'conf_int',
 'conf_int_el',
 'cov_HC0',
 'cov_HC1',
 'cov_HC2',
 'cov_HC3',
 'cov_kwds',
 'cov_params',
 'cov_type',
 'df_model',
 'df_resid',
 'diagn',
 'eigenvals',
 'el_test',
 'ess',
 'f_pvalue',
 'f_test',
 'fittedvalues',
 'fvalue',
 'get_influence',
 'get_prediction',
 'get_robustcov_results',
 'initialize',
 'k_constant',
 'llf',
 'load',
 'model',
