In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
data = pd.read_csv('energydata_complete.csv')

## Description

The dataset for the remainder of this quiz is the Appliances Energy Prediction data. The data set is at 10 min for about 4.5 months. The house temperature and humidity conditions were monitored with a ZigBee wireless sensor network. Each wireless node transmitted the temperature and humidity conditions around 3.3 min. Then, the wireless data was averaged for 10 minutes periods. The energy data was logged every 10 minutes with m-bus energy meters. Weather from the nearest airport weather station (Chievres Airport, Belgium) was downloaded from a public data set from Reliable Prognosis (rp5.ru), and merged together with the experimental data sets using the date and time column. Two random variables have been included in the data set for testing the regression models and to filter out non predictive attributes (parameters). The attribute information can be seen below.

### Attribute Information:

#### Date, time year-month-day hour:minute:second

#### Appliances, energy use in Wh

#### lights, energy use of light fixtures in the house in Wh

- T1, Temperature in kitchen area, in Celsius

- RH_1, Humidity in kitchen area, in %

- T2, Temperature in living room area, in Celsius

- RH_2, Humidity in living room area, in %

- T3, Temperature in laundry room area

- RH_3, Humidity in laundry room area, in %

- T4, Temperature in office room, in Celsius

- RH_4, Humidity in office room, in %

- T5, Temperature in bathroom, in Celsius

- RH_5, Humidity in bathroom, in %

- T6, Temperature outside the building (north side), in Celsius

- RH_6, Humidity outside the building (north side), in %

- T7, Temperature in ironing room , in Celsius

- RH_7, Humidity in ironing room, in %

- T8, Temperature in teenager room 2, in Celsius

- RH_8, Humidity in teenager room 2, in %

- T9, Temperature in parents room, in Celsius

- RH_9, Humidity in parents room, in %

- To, Temperature outside (from Chievres weather station), in Celsius

- Pressure (from Chievres weather station), in mm Hg

- RH_out, Humidity outside (from Chievres weather station), in %

- Wind speed (from Chievres weather station), in m/s

- Visibility (from Chievres weather station), in km

- Tdewpoint (from Chievres weather station), Â°C

- rv1, Random variable 1, nondimensional

- rv2, Random variable 2, nondimensional

In [3]:
data.head()

Unnamed: 0,date,Appliances,lights,T1,RH_1,T2,RH_2,T3,RH_3,T4,...,T9,RH_9,T_out,Press_mm_hg,RH_out,Windspeed,Visibility,Tdewpoint,rv1,rv2
0,2016-01-11 17:00:00,60,30,19.89,47.596667,19.2,44.79,19.79,44.73,19.0,...,17.033333,45.53,6.6,733.5,92.0,7.0,63.0,5.3,13.275433,13.275433
1,2016-01-11 17:10:00,60,30,19.89,46.693333,19.2,44.7225,19.79,44.79,19.0,...,17.066667,45.56,6.483333,733.6,92.0,6.666667,59.166667,5.2,18.606195,18.606195
2,2016-01-11 17:20:00,50,30,19.89,46.3,19.2,44.626667,19.79,44.933333,18.926667,...,17.0,45.5,6.366667,733.7,92.0,6.333333,55.333333,5.1,28.642668,28.642668
3,2016-01-11 17:30:00,50,40,19.89,46.066667,19.2,44.59,19.79,45.0,18.89,...,17.0,45.4,6.25,733.8,92.0,6.0,51.5,5.0,45.410389,45.410389
4,2016-01-11 17:40:00,60,40,19.89,46.333333,19.2,44.53,19.79,45.0,18.89,...,17.0,45.4,6.133333,733.9,92.0,5.666667,47.666667,4.9,10.084097,10.084097


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19735 entries, 0 to 19734
Data columns (total 29 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   date         19735 non-null  object 
 1   Appliances   19735 non-null  int64  
 2   lights       19735 non-null  int64  
 3   T1           19735 non-null  float64
 4   RH_1         19735 non-null  float64
 5   T2           19735 non-null  float64
 6   RH_2         19735 non-null  float64
 7   T3           19735 non-null  float64
 8   RH_3         19735 non-null  float64
 9   T4           19735 non-null  float64
 10  RH_4         19735 non-null  float64
 11  T5           19735 non-null  float64
 12  RH_5         19735 non-null  float64
 13  T6           19735 non-null  float64
 14  RH_6         19735 non-null  float64
 15  T7           19735 non-null  float64
 16  RH_7         19735 non-null  float64
 17  T8           19735 non-null  float64
 18  RH_8         19735 non-null  float64
 19  T9  

In [5]:
data.describe()

Unnamed: 0,Appliances,lights,T1,RH_1,T2,RH_2,T3,RH_3,T4,RH_4,...,T9,RH_9,T_out,Press_mm_hg,RH_out,Windspeed,Visibility,Tdewpoint,rv1,rv2
count,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,...,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0
mean,97.694958,3.801875,21.686571,40.259739,20.341219,40.42042,22.267611,39.2425,20.855335,39.026904,...,19.485828,41.552401,7.411665,755.522602,79.750418,4.039752,38.330834,3.760707,24.988033,24.988033
std,102.524891,7.935988,1.606066,3.979299,2.192974,4.069813,2.006111,3.254576,2.042884,4.341321,...,2.014712,4.151497,5.317409,7.399441,14.901088,2.451221,11.794719,4.194648,14.496634,14.496634
min,10.0,0.0,16.79,27.023333,16.1,20.463333,17.2,28.766667,15.1,27.66,...,14.89,29.166667,-5.0,729.3,24.0,0.0,1.0,-6.6,0.005322,0.005322
25%,50.0,0.0,20.76,37.333333,18.79,37.9,20.79,36.9,19.53,35.53,...,18.0,38.5,3.666667,750.933333,70.333333,2.0,29.0,0.9,12.497889,12.497889
50%,60.0,0.0,21.6,39.656667,20.0,40.5,22.1,38.53,20.666667,38.4,...,19.39,40.9,6.916667,756.1,83.666667,3.666667,40.0,3.433333,24.897653,24.897653
75%,100.0,0.0,22.6,43.066667,21.5,43.26,23.29,41.76,22.1,42.156667,...,20.6,44.338095,10.408333,760.933333,91.666667,5.5,40.0,6.566667,37.583769,37.583769
max,1080.0,70.0,26.26,63.36,29.856667,56.026667,29.236,50.163333,26.2,51.09,...,24.5,53.326667,26.1,772.3,100.0,14.0,66.0,15.5,49.99653,49.99653


In [6]:
# Model fitting and Prediction

In [7]:
from sklearn.model_selection import train_test_split

In [8]:
X1 = np.array(data['T2']).reshape(-1,1)

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X1, data['T6'], test_size=0.3, random_state=42)

In [10]:
from sklearn.linear_model import LinearRegression

In [11]:
model = LinearRegression()

In [12]:
model.fit(X_train, y_train)

LinearRegression()

In [13]:
predictions = model.predict(X_test)

In [82]:
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

# Q.12 

In [141]:
r2_s = r2_score(y_test, predictions)

In [119]:
round(r2_s,2)

0.64

In [17]:
from sklearn.preprocessing import MinMaxScaler

In [18]:
scaler = MinMaxScaler()

In [19]:
new_data = data.drop(columns=['date','lights'])

In [20]:
scaler.fit(new_data)

MinMaxScaler()

In [21]:
normalized_data = pd.DataFrame(scaler.transform(new_data), columns=new_data.columns)

In [22]:
normalized_data.head()

Unnamed: 0,Appliances,T1,RH_1,T2,RH_2,T3,RH_3,T4,RH_4,T5,...,T9,RH_9,T_out,Press_mm_hg,RH_out,Windspeed,Visibility,Tdewpoint,rv1,rv2
0,0.046729,0.32735,0.566187,0.225345,0.684038,0.215188,0.746066,0.351351,0.764262,0.175506,...,0.223032,0.67729,0.37299,0.097674,0.894737,0.5,0.953846,0.538462,0.265449,0.265449
1,0.046729,0.32735,0.541326,0.225345,0.68214,0.215188,0.748871,0.351351,0.782437,0.175506,...,0.2265,0.678532,0.369239,0.1,0.894737,0.47619,0.894872,0.533937,0.372083,0.372083
2,0.037383,0.32735,0.530502,0.225345,0.679445,0.215188,0.755569,0.344745,0.778062,0.175506,...,0.219563,0.676049,0.365488,0.102326,0.894737,0.452381,0.835897,0.529412,0.572848,0.572848
3,0.037383,0.32735,0.52408,0.225345,0.678414,0.215188,0.758685,0.341441,0.770949,0.175506,...,0.219563,0.671909,0.361736,0.104651,0.894737,0.428571,0.776923,0.524887,0.908261,0.908261
4,0.046729,0.32735,0.531419,0.225345,0.676727,0.215188,0.758685,0.341441,0.762697,0.178691,...,0.219563,0.671909,0.357985,0.106977,0.894737,0.404762,0.717949,0.520362,0.201611,0.201611


In [23]:
X2 = normalized_data.drop('Appliances', axis=1)

In [24]:
X2_train, X2_test, y2_train, y2_test = train_test_split(X2, normalized_data['Appliances'], test_size=0.3, random_state=42)

In [92]:
appliances_model = LinearRegression()

In [93]:
appliances_model.fit(X2_train, y2_train)

LinearRegression()

In [94]:
predictions2 = model.predict(X2_test)

# Q.13

In [142]:
mae = mean_absolute_error(y2_test, predictions2)

In [121]:
round(mae,2)

0.05

# Q.14

In [145]:
def rss(y_true, y_pred):
    return np.sum(np.square(y_true - y_pred))

In [146]:
print(round(rss(y2_test, predictions2),2))

45.35


# Q.15

In [144]:
appliances_rmse = np.sqrt(mean_squared_error(y2_test, predictions2))

In [132]:
round(appliances_rmse,2)

0.09

# Q.16

In [147]:
r2_s2 = r2_score(y2_test, predictions2)

In [35]:
round(r2_s2,2)

0.15

In [87]:
def get_weights_df(model, feat, col_name):
    weights = pd.Series(model.coef_, feat.columns).sort_values()
    weights_df = pd.DataFrame(weights).reset_index()
    weights_df.columns = ['Features', col_name]
    weights_df[col_name].round(3)
    return weights_df

# Q.17

In [148]:
appliances_model_weights = get_weights_df(appliances_model, X2, 'appliances_model_weight')

In [103]:
appliances_model_weights

Unnamed: 0,Features,appliances_model_weight
0,RH_2,-0.456698
1,T_out,-0.32186
2,T2,-0.236178
3,T9,-0.189941
4,RH_8,-0.157595
5,RH_out,-0.077671
6,RH_7,-0.044614
7,RH_9,-0.0398
8,T5,-0.015657
9,T1,-0.003281


## RH_2 and RH_1 have the lowest and highest weight respectively

In [61]:
from sklearn.linear_model import Ridge

# Q.18

In [149]:
ridge_model = Ridge(alpha=0.4)

In [71]:
ridge_model.fit(X2_train, y2_train)

Ridge(alpha=0.4)

In [73]:
ridge_pred = ridge_model.predict(X2_test)

In [74]:
ridge_rmse = np.sqrt(mean_squared_error(y2_test, ridge_pred))

In [75]:
round(ridge_rmse,2)

0.09

## ridge_rmse and appliances_rmse are the same

In [105]:
from sklearn.linear_model import Lasso

# Q.19

In [153]:
lasso_model = Lasso(0.001)

In [154]:
lasso_model.fit(X2_train, y2_train)

Lasso(alpha=0.001)

In [155]:
lasso_model_weights = get_weights_df(lasso_model, X2, 'lasso_model_weight')
lasso_model_weights

Unnamed: 0,Features,lasso_model_weight
0,RH_out,-0.049557
1,RH_8,-0.00011
2,T1,0.0
3,Tdewpoint,0.0
4,Visibility,0.0
5,Press_mm_hg,-0.0
6,T_out,0.0
7,RH_9,-0.0
8,T9,-0.0
9,T8,0.0


## There are 7 non-zero feature weights in the dataframe

# Q.20

In [156]:
lasso_pred = lasso_model.predict(X2_test)

In [140]:
lasso_rmse = np.sqrt(mean_squared_error(y2_test, lasso_pred))
round(lasso_rmse,3)

0.094