# Appliances Energy Prediction data

#### Import packages

In [42]:
# Data manipulations
import pandas as pd
# Scientific computations
import numpy as np
# Regression models
from sklearn.linear_model import LinearRegression, Lasso, Ridge
# For splitting datasets into training and test sets
from sklearn.model_selection import train_test_split 
# Model evaluation
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error 
# Scaling
from sklearn.preprocessing import MinMaxScaler 

### Load dataset

In [43]:
# Path of the CSV file, resolved relative to the notebook's working directory
DATA_PATH = "energydata_complete.csv"
# Read the raw data into a DataFrame and show its (rows, columns) shape
AEP = pd.read_csv(DATA_PATH)
AEP.shape

(19735, 29)

In [44]:
# Preview the first five rows of the raw data
AEP.head(n=5)

Unnamed: 0,date,Appliances,lights,T1,RH_1,T2,RH_2,T3,RH_3,T4,...,T9,RH_9,T_out,Press_mm_hg,RH_out,Windspeed,Visibility,Tdewpoint,rv1,rv2
0,2016-01-11 17:00:00,60,30,19.89,47.596667,19.2,44.79,19.79,44.73,19.0,...,17.033333,45.53,6.6,733.5,92.0,7.0,63.0,5.3,13.275433,13.275433
1,2016-01-11 17:10:00,60,30,19.89,46.693333,19.2,44.7225,19.79,44.79,19.0,...,17.066667,45.56,6.483333,733.6,92.0,6.666667,59.166667,5.2,18.606195,18.606195
2,2016-01-11 17:20:00,50,30,19.89,46.3,19.2,44.626667,19.79,44.933333,18.926667,...,17.0,45.5,6.366667,733.7,92.0,6.333333,55.333333,5.1,28.642668,28.642668
3,2016-01-11 17:30:00,50,40,19.89,46.066667,19.2,44.59,19.79,45.0,18.89,...,17.0,45.4,6.25,733.8,92.0,6.0,51.5,5.0,45.410389,45.410389
4,2016-01-11 17:40:00,60,40,19.89,46.333333,19.2,44.53,19.79,45.0,18.89,...,17.0,45.4,6.133333,733.9,92.0,5.666667,47.666667,4.9,10.084097,10.084097


In [45]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns
AEP.describe()

Unnamed: 0,Appliances,lights,T1,RH_1,T2,RH_2,T3,RH_3,T4,RH_4,...,T9,RH_9,T_out,Press_mm_hg,RH_out,Windspeed,Visibility,Tdewpoint,rv1,rv2
count,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,...,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0
mean,97.694958,3.801875,21.686571,40.259739,20.341219,40.42042,22.267611,39.2425,20.855335,39.026904,...,19.485828,41.552401,7.411665,755.522602,79.750418,4.039752,38.330834,3.760707,24.988033,24.988033
std,102.524891,7.935988,1.606066,3.979299,2.192974,4.069813,2.006111,3.254576,2.042884,4.341321,...,2.014712,4.151497,5.317409,7.399441,14.901088,2.451221,11.794719,4.194648,14.496634,14.496634
min,10.0,0.0,16.79,27.023333,16.1,20.463333,17.2,28.766667,15.1,27.66,...,14.89,29.166667,-5.0,729.3,24.0,0.0,1.0,-6.6,0.005322,0.005322
25%,50.0,0.0,20.76,37.333333,18.79,37.9,20.79,36.9,19.53,35.53,...,18.0,38.5,3.666667,750.933333,70.333333,2.0,29.0,0.9,12.497889,12.497889
50%,60.0,0.0,21.6,39.656667,20.0,40.5,22.1,38.53,20.666667,38.4,...,19.39,40.9,6.916667,756.1,83.666667,3.666667,40.0,3.433333,24.897653,24.897653
75%,100.0,0.0,22.6,43.066667,21.5,43.26,23.29,41.76,22.1,42.156667,...,20.6,44.338095,10.408333,760.933333,91.666667,5.5,40.0,6.566667,37.583769,37.583769
max,1080.0,70.0,26.26,63.36,29.856667,56.026667,29.236,50.163333,26.2,51.09,...,24.5,53.326667,26.1,772.3,100.0,14.0,66.0,15.5,49.99653,49.99653


In [46]:
# Print each column's name, non-null count and dtype
AEP.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19735 entries, 0 to 19734
Data columns (total 29 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   date         19735 non-null  object 
 1   Appliances   19735 non-null  int64  
 2   lights       19735 non-null  int64  
 3   T1           19735 non-null  float64
 4   RH_1         19735 non-null  float64
 5   T2           19735 non-null  float64
 6   RH_2         19735 non-null  float64
 7   T3           19735 non-null  float64
 8   RH_3         19735 non-null  float64
 9   T4           19735 non-null  float64
 10  RH_4         19735 non-null  float64
 11  T5           19735 non-null  float64
 12  RH_5         19735 non-null  float64
 13  T6           19735 non-null  float64
 14  RH_6         19735 non-null  float64
 15  T7           19735 non-null  float64
 16  RH_7         19735 non-null  float64
 17  T8           19735 non-null  float64
 18  RH_8         19735 non-null  float64
 19  T9  

### Data Wrangling

#### Checking for missing values

In [47]:
# Count the missing entries in each column
AEP.isna().sum()

date           0
Appliances     0
lights         0
T1             0
RH_1           0
T2             0
RH_2           0
T3             0
RH_3           0
T4             0
RH_4           0
T5             0
RH_5           0
T6             0
RH_6           0
T7             0
RH_7           0
T8             0
RH_8           0
T9             0
RH_9           0
T_out          0
Press_mm_hg    0
RH_out         0
Windspeed      0
Visibility     0
Tdewpoint      0
rv1            0
rv2            0
dtype: int64

### Drop the `date` and `lights` features from the dataset

In [48]:
# Remove the "date" and "lights" columns.
# Rebinding the name (instead of inplace=True) together with errors="ignore"
# keeps this cell re-runnable: a second execution is a no-op rather than a
# KeyError for columns that have already been removed.
AEP = AEP.drop(columns=["date", "lights"], errors="ignore")
AEP.head()

Unnamed: 0,Appliances,T1,RH_1,T2,RH_2,T3,RH_3,T4,RH_4,T5,...,T9,RH_9,T_out,Press_mm_hg,RH_out,Windspeed,Visibility,Tdewpoint,rv1,rv2
0,60,19.89,47.596667,19.2,44.79,19.79,44.73,19.0,45.566667,17.166667,...,17.033333,45.53,6.6,733.5,92.0,7.0,63.0,5.3,13.275433,13.275433
1,60,19.89,46.693333,19.2,44.7225,19.79,44.79,19.0,45.9925,17.166667,...,17.066667,45.56,6.483333,733.6,92.0,6.666667,59.166667,5.2,18.606195,18.606195
2,50,19.89,46.3,19.2,44.626667,19.79,44.933333,18.926667,45.89,17.166667,...,17.0,45.5,6.366667,733.7,92.0,6.333333,55.333333,5.1,28.642668,28.642668
3,50,19.89,46.066667,19.2,44.59,19.79,45.0,18.89,45.723333,17.166667,...,17.0,45.4,6.25,733.8,92.0,6.0,51.5,5.0,45.410389,45.410389
4,60,19.89,46.333333,19.2,44.53,19.79,45.0,18.89,45.53,17.2,...,17.0,45.4,6.133333,733.9,92.0,5.666667,47.666667,4.9,10.084097,10.084097


### Normalize the dataset using MinMaxScaler

#### Create a MinMaxScaler object

In [49]:
# Rescale every column with min-max scaling.
# NOTE(review): the scaler is fitted on the full dataset before any train/test
# split, and the "Appliances" target is scaled along with the features, so the
# error metrics reported below are in normalized units.
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(AEP)
# fit_transform returns a bare array; wrap it to restore the column names
norm_AEP = pd.DataFrame(scaled_values, columns=AEP.columns)
norm_AEP.head()

Unnamed: 0,Appliances,T1,RH_1,T2,RH_2,T3,RH_3,T4,RH_4,T5,...,T9,RH_9,T_out,Press_mm_hg,RH_out,Windspeed,Visibility,Tdewpoint,rv1,rv2
0,0.046729,0.32735,0.566187,0.225345,0.684038,0.215188,0.746066,0.351351,0.764262,0.175506,...,0.223032,0.67729,0.37299,0.097674,0.894737,0.5,0.953846,0.538462,0.265449,0.265449
1,0.046729,0.32735,0.541326,0.225345,0.68214,0.215188,0.748871,0.351351,0.782437,0.175506,...,0.2265,0.678532,0.369239,0.1,0.894737,0.47619,0.894872,0.533937,0.372083,0.372083
2,0.037383,0.32735,0.530502,0.225345,0.679445,0.215188,0.755569,0.344745,0.778062,0.175506,...,0.219563,0.676049,0.365488,0.102326,0.894737,0.452381,0.835897,0.529412,0.572848,0.572848
3,0.037383,0.32735,0.52408,0.225345,0.678414,0.215188,0.758685,0.341441,0.770949,0.175506,...,0.219563,0.671909,0.361736,0.104651,0.894737,0.428571,0.776923,0.524887,0.908261,0.908261
4,0.046729,0.32735,0.531419,0.225345,0.676727,0.215188,0.758685,0.341441,0.762697,0.178691,...,0.219563,0.671909,0.357985,0.106977,0.894737,0.404762,0.717949,0.520362,0.201611,0.201611


### Function to evaluate the performance of a regression model

In [50]:
def performance(y_test, ypred):
    """Compute common regression metrics for a set of predictions.

    Parameters
    ----------
    y_test : array-like
        True target values.
    ypred : array-like
        Predicted target values, in the same order as ``y_test``.

    Returns
    -------
    dict
        Maps "R2-Score", "MAE", "RSS" and "RMSE" to the corresponding metric.
    """
    # Work on plain arrays so the residuals are paired by position: subtracting
    # two pandas objects aligns them on index labels instead, and plain Python
    # lists do not support "-" at all.
    y_true = np.asarray(y_test, dtype=float)
    y_hat = np.asarray(ypred, dtype=float)
    residuals = y_true - y_hat

    metrics = {}
    # Coefficient of determination
    metrics["R2-Score"] = r2_score(y_true, y_hat)
    # Mean absolute error
    metrics["MAE"] = mean_absolute_error(y_true, y_hat)
    # Residual sum of squares
    metrics["RSS"] = np.sum(np.square(residuals))
    # Root mean squared error
    metrics["RMSE"] = np.sqrt(mean_squared_error(y_true, y_hat))
    return metrics

#### Question 12

From the dataset, fit a linear model on the relationship between the temperature in the living room in Celsius (x = T2) and the temperature outside the building (y = T6). What is the $R^2$ value to two decimal places?

In [51]:
# Predictor: T2 kept as a one-column DataFrame; target: T6 as a Series
X = norm_AEP[["T2"]]
y = norm_AEP["T6"]
# Hold out 30% of the rows as the test set (fixed seed for reproducibility)
train_X, test_X, train_y, test_y = train_test_split(
    X, y, test_size=0.3, random_state=42
)
# fit() returns the estimator, so creation and training are chained
simple_linear_model = LinearRegression().fit(train_X, train_y)
# Predict on the held-out rows
ypred = simple_linear_model.predict(test_X)
# R^2 on the test set, rounded to 2 d.p.
round(r2_score(test_y, ypred), 2)

0.64

### Create a Linear Regression model and fit it to the training data

In [52]:
# Target is "Appliances"; every remaining column is used as a predictor.
# NOTE(review): rv1 and rv2 show identical values in the previews and summary
# statistics above, which likely explains the huge opposite-signed weights the
# fitted model assigns to them further down.
y = norm_AEP["Appliances"]
X = norm_AEP.drop(columns="Appliances")
# Hold out 30% of the rows as the test set (fixed seed for reproducibility)
train_X, test_X, train_y, test_y = train_test_split(
    X, y, test_size=0.3, random_state=42
)
# fit() returns the estimator, so creation and training are chained
linear_model = LinearRegression().fit(train_X, train_y)
# Predict on the held-out rows
ypred = linear_model.predict(test_X)

#### Question 13

What is the Mean Absolute Error (in two decimal places)?

In [53]:
# Test-set MAE of the linear model, rounded to 2 d.p.
linear_mae = performance(test_y, ypred)["MAE"]
round(linear_mae, 2)

0.05

### Question 14

What is the Residual Sum of Squares (in two decimal places)?

In [54]:
# Test-set RSS of the linear model, rounded to 2 d.p.
linear_rss = performance(test_y, ypred)["RSS"]
round(linear_rss, 2)

45.35

### Question 15

What is the Root Mean Squared Error (in three decimal places)?

In [55]:
# Test-set RMSE of the linear model, rounded to 3 d.p.
linear_rmse = performance(test_y, ypred)["RMSE"]
round(linear_rmse, 3)

0.088

### Question 16

What is the Coefficient of Determination (in two decimal places)?

In [56]:
# Test-set R^2 of the linear model, rounded to 2 d.p.
linear_r2 = performance(test_y, ypred)["R2-Score"]
round(linear_r2, 2)

0.15

### Question 17

Obtain the feature weights from your linear model above. Which features have the lowest and highest weights respectively?

In [57]:
# Pair each fitted coefficient with the name of its feature column
weights = pd.Series(data=linear_model.coef_, index=X.columns)
weights

T1            -3.281051e-03
RH_1           5.535574e-01
T2            -2.361946e-01
RH_2          -4.567150e-01
T3             2.906326e-01
RH_3           9.603389e-02
T4             2.899615e-02
RH_4           2.638924e-02
T5            -1.566300e-02
RH_5           1.600706e-02
T6             2.364314e-01
RH_6           3.805687e-02
T7             1.031356e-02
RH_7          -4.458875e-02
T8             1.019936e-01
RH_8          -1.576019e-01
T9            -1.899264e-01
RH_9          -3.981379e-02
T_out         -3.218479e-01
Press_mm_hg    6.845979e-03
RH_out        -7.765200e-02
Windspeed      2.919410e-02
Visibility     1.230399e-02
Tdewpoint      1.177425e-01
rv1            2.276510e+10
rv2           -2.276510e+10
dtype: float64

In [58]:
# Sort the weights in ascending order and take the first (lowest) and last
# (highest) entries. The Series is indexed by feature name, so .iloc is used to
# make the lookup explicitly positional; plain [] with integer keys on a
# non-integer index relies on a deprecated pandas fallback.
weights.sort_values().iloc[[0, -1]]

rv2   -2.276510e+10
rv1    2.276510e+10
dtype: float64

#### Question 18

Train a ridge regression model with an alpha value of 0.4. Is there any change to the root mean squared error (RMSE) when evaluated on the test set?

In [59]:
# Build a ridge model (alpha=0.4) and train it on the training split
ridge = Ridge(alpha=0.4).fit(train_X, train_y)
# Predict on the held-out rows
yhat = ridge.predict(test_X)
# Test-set RMSE from the performance helper, rounded to 3 d.p.
ridge_rmse = performance(test_y, yhat)["RMSE"]
round(ridge_rmse, 3)

0.088

#### Question 19

Train a lasso regression model with an alpha value of 0.001 and obtain the new feature weights with it. How many of the features have non-zero feature weights?

In [60]:
# Build a lasso model (alpha=0.001) and train it on the training split
lasso = Lasso(alpha=0.001).fit(train_X, train_y)
# Pair each lasso coefficient with its feature name
# (this rebinds `weights`, replacing the linear-model weights from above)
weights = pd.Series(data=lasso.coef_, index=X.columns)

In [61]:
# Count the features whose weight is not exactly zero
int((weights != 0).sum())

4

### Question 20

What is the RMSE of the lasso regression model on the test set (in three decimal places)?

In [62]:
# Predict on the held-out rows with the lasso model
ypred = lasso.predict(test_X)
# Test-set RMSE from the performance helper, rounded to 3 d.p.
lasso_rmse = performance(test_y, ypred)["RMSE"]
round(lasso_rmse, 3)

0.094