In [1]:
#%load imports.py
import pandas as pd
import numpy as np
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns

Our dataset displays the energy consumption of appliances and lights for a family home. 
We want to predict appliance consumption based upon the temperature and humidity in a number of rooms in the house, as well as external climatic conditions.

In [44]:
energy=pd.read_csv('KAG_energydata_complete.csv')
energy.head()

Unnamed: 0,date,Appliances,lights,T1,RH_1,T2,RH_2,T3,RH_3,T4,...,T9,RH_9,T_out,Press_mm_hg,RH_out,Windspeed,Visibility,Tdewpoint,rv1,rv2
0,2016-01-11 17:00:00,60,30,19.89,47.596667,19.2,44.79,19.79,44.73,19.0,...,17.033333,45.53,6.6,733.5,92.0,7.0,63.0,5.3,13.275433,13.275433
1,2016-01-11 17:10:00,60,30,19.89,46.693333,19.2,44.7225,19.79,44.79,19.0,...,17.066667,45.56,6.483333,733.6,92.0,6.666667,59.166667,5.2,18.606195,18.606195
2,2016-01-11 17:20:00,50,30,19.89,46.3,19.2,44.626667,19.79,44.933333,18.926667,...,17.0,45.5,6.366667,733.7,92.0,6.333333,55.333333,5.1,28.642668,28.642668
3,2016-01-11 17:30:00,50,40,19.89,46.066667,19.2,44.59,19.79,45.0,18.89,...,17.0,45.4,6.25,733.8,92.0,6.0,51.5,5.0,45.410389,45.410389
4,2016-01-11 17:40:00,60,40,19.89,46.333333,19.2,44.53,19.79,45.0,18.89,...,17.0,45.4,6.133333,733.9,92.0,5.666667,47.666667,4.9,10.084097,10.084097


In [45]:
energy.isnull().sum()

date           0
Appliances     0
lights         0
T1             0
RH_1           0
T2             0
RH_2           0
T3             0
RH_3           0
T4             0
RH_4           0
T5             0
RH_5           0
T6             0
RH_6           0
T7             0
RH_7           0
T8             0
RH_8           0
T9             0
RH_9           0
T_out          0
Press_mm_hg    0
RH_out         0
Windspeed      0
Visibility     0
Tdewpoint      0
rv1            0
rv2            0
dtype: int64

In [46]:
energy.shape

(19735, 29)

In order to make this data easier to understand, we can change the column names to something a little easier to read. 

In this dataset, 'Appliances' and 'Lights' is the energy consumption of appliances and lights within the home in Wh.

rv1 and rv2 are random variables.

In [47]:
energy=energy.rename(columns={'T1':'Temp_kitchen', 'RH_1':'RHumid_kitchen',
                             'T2': 'Temp_living', 'RH_2': 'RHumid_living',
                             'T3': 'Temp_laundry', 'RH_3': 'RHumid_laundry',
                             'T4':'Temp_office', 'RH_4': 'RHumid_office',
                             'T5': 'Temp_bath', 'RH_5': 'RHumid_bath',
                              'T6':'Temp_outN', 'RH_6': 'RHumid_outN',
                              'T7':'Temp_iron', 'RH_7': 'RHumid_iron',
                              'T8': 'Temp_teen', 'RH_8': 'RHumid_teen',
                              'T9': 'Temp_parent', 'RH_9': 'RHumid_parent', 
                             })
energy.head()

Unnamed: 0,date,Appliances,lights,Temp_kitchen,RHumid_kitchen,Temp_living,RHumid_living,Temp_laundry,RHumid_laundry,Temp_office,...,Temp_parent,RHumid_parent,T_out,Press_mm_hg,RH_out,Windspeed,Visibility,Tdewpoint,rv1,rv2
0,2016-01-11 17:00:00,60,30,19.89,47.596667,19.2,44.79,19.79,44.73,19.0,...,17.033333,45.53,6.6,733.5,92.0,7.0,63.0,5.3,13.275433,13.275433
1,2016-01-11 17:10:00,60,30,19.89,46.693333,19.2,44.7225,19.79,44.79,19.0,...,17.066667,45.56,6.483333,733.6,92.0,6.666667,59.166667,5.2,18.606195,18.606195
2,2016-01-11 17:20:00,50,30,19.89,46.3,19.2,44.626667,19.79,44.933333,18.926667,...,17.0,45.5,6.366667,733.7,92.0,6.333333,55.333333,5.1,28.642668,28.642668
3,2016-01-11 17:30:00,50,40,19.89,46.066667,19.2,44.59,19.79,45.0,18.89,...,17.0,45.4,6.25,733.8,92.0,6.0,51.5,5.0,45.410389,45.410389
4,2016-01-11 17:40:00,60,40,19.89,46.333333,19.2,44.53,19.79,45.0,18.89,...,17.0,45.4,6.133333,733.9,92.0,5.666667,47.666667,4.9,10.084097,10.084097


In [48]:
energy.dtypes

date               object
Appliances          int64
lights              int64
Temp_kitchen      float64
RHumid_kitchen    float64
Temp_living       float64
RHumid_living     float64
Temp_laundry      float64
RHumid_laundry    float64
Temp_office       float64
RHumid_office     float64
Temp_bath         float64
RHumid_bath       float64
Temp_outN         float64
RHumid_outN       float64
Temp_iron         float64
RHumid_iron       float64
Temp_teen         float64
RHumid_teen       float64
Temp_parent       float64
RHumid_parent     float64
T_out             float64
Press_mm_hg       float64
RH_out            float64
Windspeed         float64
Visibility        float64
Tdewpoint         float64
rv1               float64
rv2               float64
dtype: object

In [49]:
energy['date']=pd.to_datetime(energy['date'])

We have a lot of features in our dataset. In order to determine which features have the biggest impact on predictions, we can use permutation importance. 

This works by shuffing a single feature in the data, and seeing how this impacts the final prediction performance. 

The higher the error obtained after this shuffle, the higher the feature importance. 

But first, we need to build the model.

## Building the model 

In [50]:
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.inspection import permutation_importance

ModuleNotFoundError: No module named 'sklearn.inspection'

splitting our data into x and y, where y is the target variables (energy consumption of appliances and lights) and x are in the input parameters.

In [52]:
energy=energy.drop(['rv1', 'rv2', 'date'],axis=1)
x=energy.drop(['Appliances','lights'], axis=1)
y=energy[['Appliances','lights']]

We need to determine how to scale our data. If the data distribution is small (eg STD near 1) we might not need data scaling. If in doubt, normalise input data. 

Scaling data can improve model training speed and avoid the model getting stuck during training.

The scale of the output variable should match the scale of the activation function. 

## Data normalisation vs standardisation--

- Normalisation- rescaling of the data from the original range so that all values are within the range of 0 and 1.
    eg with MinMaxScaler()
    
    
- Standardisation- rescaling the distribution of values so that the mean of observed values is 0 and the standard deviation is 1. Requires data to fit a Gaussian distribution.
    eg with StandardScaler()

In [53]:
from scipy import stats
stat, p = stats.shapiro((pd.concat([x,y], axis=1)))
print('p=%.3f' %p)
alpha = 0.05
if p > alpha:
	print('Sample looks Gaussian (fail to reject H0)')
else:
	print('Sample does not look Gaussian (reject H0)')

p=0.000
Sample does not look Gaussian (reject H0)




The sample is not gaussian, therefore we should not perform any standardisation techniques.

## splitting data into test and training datasets

In [54]:
i=15
np.random.seed(i)

x_train, x_test, y_train, y_test=train_test_split(x,y,test_size=0.2, random_state=i)
scaler=MinMaxScaler()

x_train_scaled=scaler.fit_transform(x_train)
x_test_scaled=scaler.transform(x_test)

In [55]:
mlp=MLPRegressor()

mlp.fit(x_train_scaled,y_train)
predictions=mlp.predict(x_test_scaled)

In [56]:
RTWO2=sklearn.metrics.r2_score(y_test,predictions)
print('R2',RTWO2)
print('MAE',sklearn.metrics.mean_absolute_error(y_test,predictions))

R2 0.12162620762564064
MAE 29.657474038536332


## Permutation Importance with Sklearn

In [57]:
results = permutation_importance(mlp, x, y, scoring='neg_mean_squared_error')

NameError: name 'permutation_importance' is not defined

In [58]:
importance = results.importances_mean

for i,v in enumerate(importance):
	print('Feature: %0d, Score: %.5f' % (i,v))

pyplot.bar([x for x in range(len(importance))], importance)
pyplot.show()

NameError: name 'results' is not defined

we can also look at feature effect on predictions, which means that we hold all values constant and only vary the feature we want o study and understand.

For example if we want to look at the effect of outdoor temperature (T_out), first we remove the target variable appliances and lights.

we can then find the median of all the other variables. 



In [63]:
energy_features=energy.drop(['Appliances','lights'], axis=1).median()
print(energy_features)

Temp_kitchen       21.600000
RHumid_kitchen     39.656667
Temp_living        20.000000
RHumid_living      40.500000
Temp_laundry       22.100000
RHumid_laundry     38.530000
Temp_office        20.666667
RHumid_office      38.400000
Temp_bath          19.390000
RHumid_bath        49.090000
Temp_outN           7.300000
RHumid_outN        55.290000
Temp_iron          20.033333
RHumid_iron        34.863333
Temp_teen          22.100000
RHumid_teen        42.375000
Temp_parent        19.390000
RHumid_parent      40.900000
T_out               6.916667
Press_mm_hg       756.100000
RH_out             83.666667
Windspeed           3.666667
Visibility         40.000000
Tdewpoint           3.433333
dtype: float64


we then determine the variation in T_out.

In [60]:
minimumT, maximumT = energy['T_out'].min(), energy['T_out'].max()
print(minimumT, maximumT)

-5.0 26.1


In [61]:
# create array of 100 values between min and max
arr = np.arange(minimumT, maximumT, (maximumT - minimumT)/100)

The input to the model will now be each variation plus median values for the other features held constant.

In [64]:
input_list=[]
for variation in arr:
    all_vals=list(energy.drop(['Appliances','lights'], axis=1).median().values)
    all_vals[18]=variation
    input_list.append(all_vals)
input_list=np.array(input_list)

This input list of every variation of T out along with the constant of all other variables can be used to train the model.

In [65]:
new_model=scaler.transform(input_list)

In [66]:
predictions_new_model=mlp.predict(new_model)

In [68]:
plt.figure(figsize=(12, 6))
sns.lineplot(y_test, predictions.flatten(), color='r')
plt.xlabel("T out")
plt.ylabel("Target")

plt.twinx()
sns.distplot(df['T out'], kde=False, bins=10)
plt.ylabel("Distribution of energy", labelpad=7)

AttributeError: module 'seaborn' has no attribute 'lineplot'

<matplotlib.figure.Figure at 0x1f40bc83390>