In [1]:
#import modules
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
# Raise pandas' display limits so long/wide frames are not truncated in output.
pd.options.display.max_rows = 500
pd.options.display.max_columns = 500
pd.options.display.width = 1000

In [2]:
# Load the energy data CSV from the local downloads folder into the raw frame.
df = pd.read_csv(filepath_or_buffer="./downloads/energydata_complete.csv")

In [3]:
#preview data: show the first five rows of the raw frame to check how the columns parsed
df.head()

Unnamed: 0,date,Appliances,lights,T1,RH_1,T2,RH_2,T3,RH_3,T4,RH_4,T5,RH_5,T6,RH_6,T7,RH_7,T8,RH_8,T9,RH_9,T_out,Press_mm_hg,RH_out,Windspeed,Visibility,Tdewpoint,rv1,rv2
0,2016-01-11 17:00:00,60,30,19.89,47.596667,19.2,44.79,19.79,44.73,19.0,45.566667,17.166667,55.2,7.026667,84.256667,17.2,41.626667,18.2,48.9,17.033333,45.53,6.6,733.5,92.0,7.0,63.0,5.3,13.275433,13.275433
1,2016-01-11 17:10:00,60,30,19.89,46.693333,19.2,44.7225,19.79,44.79,19.0,45.9925,17.166667,55.2,6.833333,84.063333,17.2,41.56,18.2,48.863333,17.066667,45.56,6.483333,733.6,92.0,6.666667,59.166667,5.2,18.606195,18.606195
2,2016-01-11 17:20:00,50,30,19.89,46.3,19.2,44.626667,19.79,44.933333,18.926667,45.89,17.166667,55.09,6.56,83.156667,17.2,41.433333,18.2,48.73,17.0,45.5,6.366667,733.7,92.0,6.333333,55.333333,5.1,28.642668,28.642668
3,2016-01-11 17:30:00,50,40,19.89,46.066667,19.2,44.59,19.79,45.0,18.89,45.723333,17.166667,55.09,6.433333,83.423333,17.133333,41.29,18.1,48.59,17.0,45.4,6.25,733.8,92.0,6.0,51.5,5.0,45.410389,45.410389
4,2016-01-11 17:40:00,60,40,19.89,46.333333,19.2,44.53,19.79,45.0,18.89,45.53,17.2,55.09,6.366667,84.893333,17.2,41.23,18.1,48.59,17.0,45.4,6.133333,733.9,92.0,5.666667,47.666667,4.9,10.084097,10.084097



Attribute Information:

Date, time year-month-day hour:minute:second

Appliances, energy use in Wh

lights, energy use of light fixtures in the house in Wh

T1, Temperature in kitchen area, in Celsius

RH_1, Humidity in kitchen area, in %

T2, Temperature in living room area, in Celsius

RH_2, Humidity in living room area, in %

T3, Temperature in laundry room area, in Celsius

RH_3, Humidity in laundry room area, in %

T4, Temperature in office room, in Celsius

RH_4, Humidity in office room, in %

T5, Temperature in bathroom, in Celsius

RH_5, Humidity in bathroom, in %

T6, Temperature outside the building (north side), in Celsius

RH_6, Humidity outside the building (north side), in %

T7, Temperature in ironing room , in Celsius

RH_7, Humidity in ironing room, in %

T8, Temperature in teenager room 2, in Celsius

RH_8, Humidity in teenager room 2, in %

T9, Temperature in parents room, in Celsius

RH_9, Humidity in parents room, in %

T_out, Temperature outside (from Chievres weather station), in Celsius

Pressure (from Chievres weather station), in mm Hg

RH_out, Humidity outside (from Chievres weather station), in %

Wind speed (from Chievres weather station), in m/s

Visibility (from Chievres weather station), in km

Tdewpoint (from Chievres weather station), °C

rv1, Random variable 1, nondimensional

rv2, Random variable 2, nondimensional

In [4]:
# Work on an independent (deep) copy so the raw frame `df` stays untouched.
df_model = df.copy(deep=True)

In [5]:
# (rows, columns) of the working copy before any columns are dropped
df_model.shape

(19735, 29)

In [6]:
#drop the lights and date columns
# Derive the result from the raw frame `df` instead of re-assigning
# `df_model` from itself: the self-referential form raises KeyError when the
# cell is run a second time because the columns are already gone.
df_model = df.drop(columns=['date', 'lights'])

In [7]:
# (rows, columns) after dropping 'date' and 'lights' — two columns fewer than before
df_model.shape

(19735, 27)

In [8]:
# Preview the modelling frame; 'Appliances' (the target) is now the first column
df_model.head()

Unnamed: 0,Appliances,T1,RH_1,T2,RH_2,T3,RH_3,T4,RH_4,T5,RH_5,T6,RH_6,T7,RH_7,T8,RH_8,T9,RH_9,T_out,Press_mm_hg,RH_out,Windspeed,Visibility,Tdewpoint,rv1,rv2
0,60,19.89,47.596667,19.2,44.79,19.79,44.73,19.0,45.566667,17.166667,55.2,7.026667,84.256667,17.2,41.626667,18.2,48.9,17.033333,45.53,6.6,733.5,92.0,7.0,63.0,5.3,13.275433,13.275433
1,60,19.89,46.693333,19.2,44.7225,19.79,44.79,19.0,45.9925,17.166667,55.2,6.833333,84.063333,17.2,41.56,18.2,48.863333,17.066667,45.56,6.483333,733.6,92.0,6.666667,59.166667,5.2,18.606195,18.606195
2,50,19.89,46.3,19.2,44.626667,19.79,44.933333,18.926667,45.89,17.166667,55.09,6.56,83.156667,17.2,41.433333,18.2,48.73,17.0,45.5,6.366667,733.7,92.0,6.333333,55.333333,5.1,28.642668,28.642668
3,50,19.89,46.066667,19.2,44.59,19.79,45.0,18.89,45.723333,17.166667,55.09,6.433333,83.423333,17.133333,41.29,18.1,48.59,17.0,45.4,6.25,733.8,92.0,6.0,51.5,5.0,45.410389,45.410389
4,60,19.89,46.333333,19.2,44.53,19.79,45.0,18.89,45.53,17.2,55.09,6.366667,84.893333,17.2,41.23,18.1,48.59,17.0,45.4,6.133333,733.9,92.0,5.666667,47.666667,4.9,10.084097,10.084097


In [9]:
# Every column except the target is a predictor. Drop the target by name
# rather than slicing by position, so a change in column order cannot
# silently leak 'Appliances' into the features (or drop a real predictor).
features = df_model.drop(columns=['Appliances'])

In [10]:
# List the predictor column names to confirm the target is not among them
features.columns

Index(['T1', 'RH_1', 'T2', 'RH_2', 'T3', 'RH_3', 'T4', 'RH_4', 'T5', 'RH_5', 'T6', 'RH_6', 'T7', 'RH_7', 'T8', 'RH_8', 'T9', 'RH_9', 'T_out', 'Press_mm_hg', 'RH_out', 'Windspeed', 'Visibility', 'Tdewpoint', 'rv1', 'rv2'], dtype='object')

In [15]:
# Target series: appliance energy use, taken from the un-normalized frame.
target = df_model.loc[:, 'Appliances']

In [18]:
# One target value per row of the modelling frame
target.shape

(19735,)

In [96]:
#Linear model on the relationship between the temperature in the living room in Celsius (x = T2) and the temperature outside the building (y = T6)

In [98]:
#Q12
# Simple regression of outside temperature (y = T6) on living-room
# temperature (x = T2). The previous version passed 'T2' as both predictor
# and response, and read from `features_df`, which is only created further
# down the notebook. `df_model` is already defined at this point.
# X is selected with a list so it stays 2-D, as scikit-learn estimators expect.
X_train, X_test, Y_train, Y_test = train_test_split(
    df_model[['T2']],
    df_model['T6'],
    test_size=0.3,
    random_state=42,
)

In [19]:

#normalize the dataset
# MinMaxScaler is already imported in the first cell, so it is not
# re-imported here (all imports live in one place at the top).
scalar = MinMaxScaler()

In [20]:
# Min-max scale every column (target included) and rebuild a labelled frame.
# NOTE(review): the scaler is fitted on all rows before the train/test split,
# so test rows influence the scaling applied to the training data — confirm
# this is acceptable for the exercise.
scaled_values = scalar.fit_transform(df_model)
df_normal = pd.DataFrame(scaled_values, columns=df_model.columns)

In [21]:
# Preview the scaled frame produced by the MinMaxScaler
df_normal.head()

Unnamed: 0,Appliances,T1,RH_1,T2,RH_2,T3,RH_3,T4,RH_4,T5,RH_5,T6,RH_6,T7,RH_7,T8,RH_8,T9,RH_9,T_out,Press_mm_hg,RH_out,Windspeed,Visibility,Tdewpoint,rv1,rv2
0,0.046729,0.32735,0.566187,0.225345,0.684038,0.215188,0.746066,0.351351,0.764262,0.175506,0.381691,0.38107,0.841827,0.170594,0.653428,0.173329,0.661412,0.223032,0.67729,0.37299,0.097674,0.894737,0.5,0.953846,0.538462,0.265449,0.265449
1,0.046729,0.32735,0.541326,0.225345,0.68214,0.215188,0.748871,0.351351,0.782437,0.175506,0.381691,0.375443,0.839872,0.170594,0.651064,0.173329,0.660155,0.2265,0.678532,0.369239,0.1,0.894737,0.47619,0.894872,0.533937,0.372083,0.372083
2,0.037383,0.32735,0.530502,0.225345,0.679445,0.215188,0.755569,0.344745,0.778062,0.175506,0.380037,0.367487,0.830704,0.170594,0.646572,0.173329,0.655586,0.219563,0.676049,0.365488,0.102326,0.894737,0.452381,0.835897,0.529412,0.572848,0.572848
3,0.037383,0.32735,0.52408,0.225345,0.678414,0.215188,0.758685,0.341441,0.770949,0.175506,0.380037,0.3638,0.833401,0.16431,0.641489,0.164175,0.650788,0.219563,0.671909,0.361736,0.104651,0.894737,0.428571,0.776923,0.524887,0.908261,0.908261
4,0.046729,0.32735,0.531419,0.225345,0.676727,0.215188,0.758685,0.341441,0.762697,0.178691,0.380037,0.361859,0.848264,0.170594,0.639362,0.164175,0.650788,0.219563,0.671909,0.357985,0.106977,0.894737,0.404762,0.717949,0.520362,0.201611,0.201611


In [29]:
# Normalized predictors: everything except the target. Drop the target by
# name rather than by position so the selection does not depend on
# 'Appliances' happening to be the first column.
features_df = df_normal.drop(columns=['Appliances'])

In [54]:
# (rows, predictor columns) of the normalized feature matrix
features_df.shape

(19735, 26)

In [30]:
# Rebind `target` to the normalized 'Appliances' column; from here on the
# model is trained and scored on the scaled target, not the raw values.
target = df_normal.loc[:, 'Appliances']

In [31]:
from sklearn.model_selection import train_test_split

In [32]:
# Hold out 30% of the rows for evaluation; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    features_df,
    target,
    test_size=0.3,
    random_state=42,
)

In [33]:
# Create the estimator here: no other cell in the notebook instantiates
# `liner_regression`, so without this line a fresh kernel raises NameError.
liner_regression = LinearRegression()
# Fit the linear model on the training split.
liner_regression.fit(x_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [34]:
# Predictions of the fitted linear model for the held-out rows.
predicted = liner_regression.predict(X=x_test)

In [36]:
#mean absolute error
from sklearn.metrics import mean_absolute_error

In [80]:
# Mean absolute error on the test split, rounded to 2 decimal places for reporting.
mae = round(mean_absolute_error(y_test, predicted), 2)
print("mean absolute error of linear model: ", mae)


mean absolute error of linear model:  0.05


In [40]:
#RMSE (root mean squared error)
from sklearn.metrics import mean_squared_error

In [42]:
# RMSE = square root of the mean squared error on the test split.
mse = mean_squared_error(y_test, predicted)
rmse = np.sqrt(mse)

In [81]:
# Round to 3 decimal places for reporting, then print.
rmse = round(rmse, ndigits=3)
print("Root Mean Squared Error of linear model: ", rmse)

Root Mean Squared Error of linear model:  0.088


In [44]:
#residual sum of squares: sum of squared (actual - predicted) over the test rows
residuals = y_test - predicted
rss = np.sum(np.square(residuals))

In [82]:
# Round to 2 decimal places for reporting, then print.
rss = round(rss, ndigits=2)
print("Residual Sum of Squares of linear model: ", rss)

Residual Sum of Squares of linear model:  45.35


In [92]:
#Coefficient of Determination 
from sklearn.metrics import r2_score

In [93]:
r2_score = r2_score(y_test,predicted)


In [95]:
round(r2_score,2)

0.15

In [51]:
#highest weights: linear-model coefficients labelled by feature, sorted ascending
weight = pd.Series(data=liner_regression.coef_, index=features_df.columns).sort_values()

In [58]:
# Display the sorted coefficients (most negative first, most positive last)
weight

RH_2          -0.456698
T_out         -0.321860
T2            -0.236178
T9            -0.189941
RH_8          -0.157595
RH_out        -0.077671
RH_7          -0.044614
RH_9          -0.039800
T5            -0.015657
T1            -0.003281
rv1            0.000770
rv2            0.000770
Press_mm_hg    0.006839
T7             0.010319
Visibility     0.012307
RH_5           0.016006
RH_4           0.026386
T4             0.028981
Windspeed      0.029183
RH_6           0.038049
RH_3           0.096048
T8             0.101995
Tdewpoint      0.117758
T6             0.236425
T3             0.290627
RH_1           0.553547
dtype: float64

In [60]:
#ridge regression model
from sklearn.linear_model import Ridge

# Regularization strength used for the ridge model.
RIDGE_ALPHA = 0.4
ridge_reg = Ridge(alpha=RIDGE_ALPHA)

In [61]:
#fit the ridge model on the training split
ridge_reg.fit(X=x_train, y=y_train)

Ridge(alpha=0.4, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001)

In [62]:
#predict the held-out rows with the ridge model
pred_ridge = ridge_reg.predict(X=x_test)

In [64]:
#evaluate ridge model: RMSE on the test split, shown to 3 decimal places
mse_ridge = mean_squared_error(y_test, pred_ridge)
rmse_ridge = np.sqrt(mse_ridge)
round(rmse_ridge, 3)

0.088

In [65]:
#train lasso model
from sklearn.linear_model import Lasso

# Regularization strength used for the lasso model.
LASSO_ALPHA = 0.001
lasso_reg = Lasso(alpha=LASSO_ALPHA)

In [66]:
#fit the lasso model on the training split
lasso_reg.fit(X=x_train, y=y_train)

Lasso(alpha=0.001, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False)

In [67]:
#evaluate lasso model: RMSE on the test split, shown to 3 decimal places
pred_lasso = lasso_reg.predict(X=x_test)
mse_lasso = mean_squared_error(y_test, pred_lasso)
rmse_lasso = np.sqrt(mse_lasso)
round(rmse_lasso, 3)

0.094

In [68]:
#check feature weights: lasso coefficients labelled by feature, sorted ascending
lasso_weight = pd.Series(data=lasso_reg.coef_, index=features_df.columns).sort_values()

In [73]:
# Display the sorted lasso coefficients; most are exactly zero in the output below
lasso_weight 

RH_out        -0.049557
RH_8          -0.000110
T1             0.000000
Tdewpoint      0.000000
Visibility     0.000000
Press_mm_hg   -0.000000
T_out          0.000000
RH_9          -0.000000
T9            -0.000000
T8             0.000000
RH_7          -0.000000
rv1           -0.000000
T7            -0.000000
T6             0.000000
RH_5           0.000000
T5            -0.000000
RH_4           0.000000
T4            -0.000000
RH_3           0.000000
T3             0.000000
RH_2          -0.000000
T2             0.000000
RH_6          -0.000000
rv2           -0.000000
Windspeed      0.002912
RH_1           0.017880
dtype: float64