In [82]:
# import the libraries
import pandas as pd
import numpy as np

In [83]:
# Read the energy dataset CSV hosted on the UCI archive into a DataFrame
DATA_URL = "https://archive.ics.uci.edu/ml/machine-learning-databases/00374/energydata_complete.csv"
df = pd.read_csv(DATA_URL)

In [84]:
# Drop the date column, which this analysis treats as irrelevant.
# Rebinding `df` (instead of inplace=True) together with errors="ignore"
# keeps this cell idempotent: re-running it after "date" is already gone
# no longer raises KeyError.
df = df.drop(columns=["date"], errors="ignore")

In [85]:
# Preview the first five rows of the frame
df.head(n=5)

Unnamed: 0,Appliances,lights,T1,RH_1,T2,RH_2,T3,RH_3,T4,RH_4,...,T9,RH_9,T_out,Press_mm_hg,RH_out,Windspeed,Visibility,Tdewpoint,rv1,rv2
0,60,30,19.89,47.596667,19.2,44.79,19.79,44.73,19.0,45.566667,...,17.033333,45.53,6.6,733.5,92.0,7.0,63.0,5.3,13.275433,13.275433
1,60,30,19.89,46.693333,19.2,44.7225,19.79,44.79,19.0,45.9925,...,17.066667,45.56,6.483333,733.6,92.0,6.666667,59.166667,5.2,18.606195,18.606195
2,50,30,19.89,46.3,19.2,44.626667,19.79,44.933333,18.926667,45.89,...,17.0,45.5,6.366667,733.7,92.0,6.333333,55.333333,5.1,28.642668,28.642668
3,50,40,19.89,46.066667,19.2,44.59,19.79,45.0,18.89,45.723333,...,17.0,45.4,6.25,733.8,92.0,6.0,51.5,5.0,45.410389,45.410389
4,60,40,19.89,46.333333,19.2,44.53,19.79,45.0,18.89,45.53,...,17.0,45.4,6.133333,733.9,92.0,5.666667,47.666667,4.9,10.084097,10.084097


# Simple Linear Regression Model

In [86]:
# define the linear model class

from sklearn.linear_model import LinearRegression

In [87]:
# First, import the min-max scaler used to normalize the dataset to a common scale

from sklearn.preprocessing import MinMaxScaler

In [88]:
# Build the min-max scaler; feature_range=(0, 1) is the library default, spelled out here

scaler = MinMaxScaler(feature_range=(0, 1))

In [89]:
# Scale every column with the min-max scaler, then rebuild a DataFrame
# that carries the original column names.
# NOTE(review): the scaler is fitted on all rows before any train/test split
# below, and the target columns are scaled too, so later error metrics are
# expressed in scaled units.

scaled_values = scaler.fit_transform(df)
normalized_df = pd.DataFrame(scaled_values, columns=df.columns)

In [90]:
# Define X and y for the simple regression:
# X is a one-column DataFrame holding T2, y is the T6 Series.

feature_df = normalized_df.loc[:, ["T2"]]

target = normalized_df.loc[:, "T6"]

In [91]:
# Split the single-feature dataset into training (70%) and test (30%) sets.
# train_test_split is imported here because no earlier cell brings it into
# scope; without this import the cell raises NameError on a fresh kernel.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(feature_df, target, test_size = 0.3, random_state = 1)

In [92]:
# Report the shapes of the train/test parts: features first, then targets
for train_part, test_part in ((X_train, X_test), (y_train, y_test)):
    print(train_part.shape, test_part.shape)

(13814, 1) (5921, 1)
(13814,) (5921,)


In [93]:
# Create the simple linear regression estimator (fit_intercept=True is the default)
reg = LinearRegression(fit_intercept=True)

In [94]:
# Fit the estimator on the training split
reg.fit(X=X_train, y=y_train)

LinearRegression()

In [96]:
# Display the fitted coefficient(s) of the regression model
# (a single value here, since the model was trained on one feature, T2)
reg.coef_

array([0.89003912])

In [97]:
# Predict the target for the held-out test features
reg_pred = reg.predict(X=X_test)

In [100]:
# Score the simple model on the test set with the coefficient of determination (R^2)
from sklearn.metrics import r2_score
R = r2_score(y_true=y_test, y_pred=reg_pred)
print(round(R, 4))

0.6459


# Multiple Linear Regression Model

In [101]:
# List the columns currently in the normalized frame
normalized_df.columns

Index(['Appliances', 'lights', 'T1', 'RH_1', 'T2', 'RH_2', 'T3', 'RH_3', 'T4',
       'RH_4', 'T5', 'RH_5', 'T6', 'RH_6', 'T7', 'RH_7', 'T8', 'RH_8', 'T9',
       'RH_9', 'T_out', 'Press_mm_hg', 'RH_out', 'Windspeed', 'Visibility',
       'Tdewpoint', 'rv1', 'rv2'],
      dtype='object')

In [102]:
# Drop the lights column.
# Rebinding the name (instead of inplace=True) together with errors="ignore"
# keeps this cell idempotent: re-running it after "lights" is already gone
# no longer raises KeyError.
normalized_df = normalized_df.drop(columns=["lights"], errors="ignore")

In [103]:
# X is every remaining column except Appliances; y is the Appliances column
target = normalized_df.loc[:, "Appliances"]
feature = normalized_df.drop(columns="Appliances")

In [104]:
# Display the column names of the normalized frame (after "lights" was dropped)
normalized_df.columns

Index(['Appliances', 'T1', 'RH_1', 'T2', 'RH_2', 'T3', 'RH_3', 'T4', 'RH_4',
       'T5', 'RH_5', 'T6', 'RH_6', 'T7', 'RH_7', 'T8', 'RH_8', 'T9', 'RH_9',
       'T_out', 'Press_mm_hg', 'RH_out', 'Windspeed', 'Visibility',
       'Tdewpoint', 'rv1', 'rv2'],
      dtype='object')

In [105]:
# Hold out 30% of the rows as the test set for the multiple regression
split_parts = train_test_split(feature, target, test_size=0.3, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [106]:
# Create the multiple linear regression estimator (fit_intercept=True is the default)
ml = LinearRegression(fit_intercept=True)

In [107]:
# Fit the estimator on the training split
ml.fit(X=X_train, y=y_train)

LinearRegression()

In [108]:
# Predict the target for the held-out test features
ml_pred = ml.predict(X=X_test)

In [109]:
# Mean Absolute Error (MAE) of the multiple regression on the test set
from sklearn.metrics import mean_absolute_error
MAE = mean_absolute_error(y_true=y_test, y_pred=ml_pred)
print(round(MAE, 2))

0.05


In [110]:
# Residual Sum of Squares: sum of squared differences between actual and predicted values
residuals = y_test - ml_pred
RSS = np.square(residuals).sum()
print(round(RSS, 2))

45.35


In [111]:
# Root Mean Squared Error: square root of the test-set mean squared error
from sklearn.metrics import mean_squared_error

MSE = mean_squared_error(y_test, ml_pred)
RMSE = np.sqrt(MSE)

print(round(RMSE, 3))

0.088


In [112]:
# Calculate the R_squared of the multiple regression model on the test set
from sklearn.metrics import r2_score
R_squared = r2_score(y_true=y_test, y_pred=ml_pred)
print(round(R_squared, 2))

0.15


In [113]:
# Tabulate the fitted weight of each feature next to its column name
model_coefficient = pd.DataFrame(
    {
        "columns": X_train.columns,
        "Coefficient Estimate": ml.coef_,
    }
)
model_coefficient

Unnamed: 0,columns,Coefficient Estimate
0,T1,-0.003281
1,RH_1,0.553547
2,T2,-0.236178
3,RH_2,-0.456698
4,T3,0.290627
5,RH_3,0.096048
6,T4,0.028981
7,RH_4,0.026386
8,T5,-0.015657
9,RH_5,0.016006


# Ridge Regression Model (L2 Regularization)

In [119]:
# Ridge regression (L2 penalty) on the same train/test split
from sklearn.linear_model import Ridge

# Fit on the training split and predict the held-out rows in one chain
ridge_model = Ridge(alpha=0.4)
ridge_pred = ridge_model.fit(X_train, y_train).predict(X_test)

# Root Mean Squared Error on the test set
ridge_mse = mean_squared_error(y_test, ridge_pred)
RMSE1 = np.sqrt(ridge_mse)
print("The RMSE is :", round(RMSE1, 3))

# Pair each feature name with its fitted ridge coefficient
ridge_coefficient = pd.DataFrame(
    {
        "columns": X_train.columns,
        "Coefficient Estimate": ridge_model.coef_,
    }
)
print(ridge_coefficient)

The RMSE is : 0.088
        columns  Coefficient Estimate
0            T1             -0.018406
1          RH_1              0.519525
2            T2             -0.201397
3          RH_2             -0.411071
4            T3              0.288087
5          RH_3              0.095135
6            T4              0.027384
7          RH_4              0.024579
8            T5             -0.019853
9          RH_5              0.016152
10           T6              0.217292
11         RH_6              0.035519
12           T7              0.010098
13         RH_7             -0.045977
14           T8              0.101028
15         RH_8             -0.156830
16           T9             -0.188916
17         RH_9             -0.041367
18        T_out             -0.262172
19  Press_mm_hg              0.006584
20       RH_out             -0.054724
21    Windspeed              0.030268
22   Visibility              0.012076
23    Tdewpoint              0.083128
24          rv1              0

# Lasso Regression Model (L1 Regularization)

In [121]:
# Lasso regression (L1 penalty) on the same train/test split
from sklearn.linear_model import Lasso

# Fit on the training split and predict the held-out rows in one chain
lasso_model = Lasso(alpha=0.001)
lasso_pred = lasso_model.fit(X_train, y_train).predict(X_test)

# Calculate the Root Mean Squared Error on the test set
lasso_mse = mean_squared_error(y_test, lasso_pred)
RMSE2 = np.sqrt(lasso_mse)
print("The RMSE is: ", round(RMSE2, 3))

# Pair each feature name with its fitted lasso coefficient
lasso_coef = pd.DataFrame(
    {
        "columns": X_train.columns,
        "Coefficient Estimate": lasso_model.coef_,
    }
)

print(lasso_coef)

The RMSE is:  0.094
        columns  Coefficient Estimate
0            T1              0.000000
1          RH_1              0.017880
2            T2              0.000000
3          RH_2             -0.000000
4            T3              0.000000
5          RH_3              0.000000
6            T4             -0.000000
7          RH_4              0.000000
8            T5             -0.000000
9          RH_5              0.000000
10           T6              0.000000
11         RH_6             -0.000000
12           T7             -0.000000
13         RH_7             -0.000000
14           T8              0.000000
15         RH_8             -0.000110
16           T9             -0.000000
17         RH_9             -0.000000
18        T_out              0.000000
19  Press_mm_hg             -0.000000
20       RH_out             -0.049557
21    Windspeed              0.002912
22   Visibility              0.000000
23    Tdewpoint              0.000000
24          rv1             -0