In [1]:
### Import libraries

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import warnings

warnings.filterwarnings('ignore')

In [2]:
### Load dataset

df = pd.read_csv('energydata_complete.csv')

In [3]:
df['date'] = pd.to_datetime(df['date'])     ### Ensure datetime is of correct datatype

In [4]:
""" Fit a linear model on the relationship between the temperature in the living room in Celsius (x = T2) 
and the temperature outside the building (y = T6).
What is the Root Mean Squared error in three D.P? """


from sklearn.model_selection import train_test_split

from sklearn.linear_model import LinearRegression

x = df[['T2']]
y = df['T6']

# Split the data into training and testing sets
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42)

# Create and train the Linear Regression model
model = LinearRegression()
model.fit(x_train, y_train)

# Make predictions on the test set
y_pred = model.predict(x_test)

# Calculate the Root Mean Squared Error (RMSE)
from sklearn.metrics import mean_squared_error
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

# Print RMSE rounded to 3 decimal places
print("Root Mean Squared Error (RMSE): {:.3f}".format(rmse))

Root Mean Squared Error (RMSE): 3.630


In [5]:
df.dtypes

date           datetime64[ns]
Appliances              int64
lights                  int64
T1                    float64
RH_1                  float64
T2                    float64
RH_2                  float64
T3                    float64
RH_3                  float64
T4                    float64
RH_4                  float64
T5                    float64
RH_5                  float64
T6                    float64
RH_6                  float64
T7                    float64
RH_7                  float64
T8                    float64
RH_8                  float64
T9                    float64
RH_9                  float64
T_out                 float64
Press_mm_hg           float64
RH_out                float64
Windspeed             float64
Visibility            float64
Tdewpoint             float64
rv1                   float64
rv2                   float64
dtype: object

In [6]:
""""
Remove the following columns: [“date”, “lights”]. The target variable is “Appliances”. 
Use a 70-30 train-test set split with a  random state of 42 (for reproducibility). 
Normalize the dataset using the MinMaxScaler 
(Hint: Use the MinMaxScaler fit_transform and transform methods on the train and test set respectively). 
Run a multiple linear regression using the training set. Answer the following questions:

What is the Mean Absolute Error (in three decimal places) for the  training set?
"""
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()

x = scaler.fit_transform(df[['T1', 'RH_1', 'T2', 'RH_2', 'T3',
       'RH_3', 'T4', 'RH_4', 'T5', 'RH_5', 'T6', 'RH_6', 'T7', 'RH_7', 'T8',
       'RH_8', 'T9', 'RH_9', 'T_out', 'Press_mm_hg', 'RH_out', 'Windspeed',
       'Visibility', 'Tdewpoint', 'rv1', 'rv2']])

y = df['Appliances']
y = y.values.reshape(-1, 1)
y = scaler.fit_transform(y)

In [7]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error

# Split the data into training and testing sets
x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=0.7, random_state=42)

# Create and train the Linear Regression model
model = LinearRegression()
model.fit(x_train, y_train)

# Make predictions for the test and training sets
y_test_predict = model.predict(x_test)
y_train_predict = model.predict(x_train)

### Mean Absolute Error (MAE)

In [8]:
mae_train = mean_absolute_error(y_train,y_train_predict)*1000
mae_test = mean_absolute_error(y_test,y_test_predict)*1000


# Print the RMSE for both training and test sets
print(f"Mean Squared Error (MAE) for Training Set: {mae_train:.3f}")
print(f"Mean Squared Error (MAE) for Test Set: {mae_test:.3f}")


Mean Squared Error (MAE) for Training Set: 50.226
Mean Squared Error (MAE) for Test Set: 50.134


### Root Mean Squared Error (RMSE)

In [9]:
# Calculate the Root Mean Squared Error (RMSE) for the training set
rmse_train = np.sqrt(mean_squared_error(y_train, y_train_predict))*1000

# Calculate the Root Mean Squared Error (RMSE) for the test set
rmse_test = np.sqrt(mean_squared_error(y_test, y_test_predict))*1000

# Print the RMSE for both training and test sets
print(f"Root Mean Squared Error (RMSE) for Training Set: {rmse_train:.3f}")
print(f"Root Mean Squared Error (RMSE) for Test Set: {rmse_test:.3f}")

Root Mean Squared Error (RMSE) for Training Set: 88.987
Root Mean Squared Error (RMSE) for Test Set: 87.514


### Ridge penalization method

In [10]:
""""
Train a ridge regression model with default parameters. 
Is there any change to the root mean squared error (RMSE) when evaluated on the test set?
"""

x = df[['T1', 'RH_1', 'T2', 'RH_2', 'T3',
       'RH_3', 'T4', 'RH_4', 'T5', 'RH_5', 'T6', 'RH_6', 'T7', 'RH_7', 'T8',
       'RH_8', 'T9', 'RH_9', 'T_out', 'Press_mm_hg', 'RH_out', 'Windspeed',
       'Visibility', 'Tdewpoint', 'rv1', 'rv2']]

y = df['Appliances']

x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=0.7, random_state=42)
from sklearn.linear_model import Ridge
ridge_reg = Ridge(alpha=0.5)
ridge_reg.fit(x_train, y_train)

y_tested = ridge_reg.predict(x_test)
y_trained = ridge_reg.predict(x_train)

rmse_train = np.sqrt(mean_squared_error(y_train, y_trained))

# Calculate the Root Mean Squared Error (RMSE) for the test set
rmse_test = np.sqrt(mean_squared_error(y_test, y_tested))

# Print the RMSE for both training and test sets
print(f"Root Mean Squared Error (RMSE) for Training Set: {rmse_train:.3f}")
print(f"Root Mean Squared Error (RMSE) for Test Set: {rmse_test:.3f}")                               

Root Mean Squared Error (RMSE) for Training Set: 95.216
Root Mean Squared Error (RMSE) for Test Set: 93.641


### Lasso penalization method

In [11]:
"""
Train a lasso regression model with default value and obtain the new feature weights with it. 
How many of the features have non-zero feature weights?
"""
x = df[['T1', 'RH_1', 'T2', 'RH_2', 'T3',
       'RH_3', 'T4', 'RH_4', 'T5', 'RH_5', 'T6', 'RH_6', 'T7', 'RH_7', 'T8',
       'RH_8', 'T9', 'RH_9', 'T_out', 'Press_mm_hg', 'RH_out', 'Windspeed',
       'Visibility', 'Tdewpoint', 'rv1', 'rv2']]

y = df['Appliances']

x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=0.7, random_state=42)

from sklearn.linear_model import Lasso
lasso_reg = Lasso(alpha=0.001)
lasso_reg.fit(x_train, y_train)

y_tested = lasso_reg.predict(x_test)
y_trained = lasso_reg.predict(x_train)

rmse_train = np.sqrt(mean_squared_error(y_train, y_trained))

# Calculate the Root Mean Squared Error (RMSE) for the test set
rmse_test = np.sqrt(mean_squared_error(y_test, y_tested))

# Print the RMSE for both training and test sets
print(f"Root Mean Squared Error (RMSE) for Training Set: {rmse_train:.3f}")
print(f"Root Mean Squared Error (RMSE) for Test Set: {rmse_test:.3f}") 

Root Mean Squared Error (RMSE) for Training Set: 95.216
Root Mean Squared Error (RMSE) for Test Set: 93.640


### Number of features with non-zero weights using Lasso penalization

In [16]:
# Get the feature weights (coefficients) from the trained Lasso model
features = x.columns
weights = lasso_reg.coef_

# Count the number of features with non-zero weights
non_zero_feature_count = sum(weights != 0)

print("Number of features with non-zero feature weights:", non_zero_feature_count)
print(" ")
print("See below features and their weights:")

for feature, weight in zip(features,weights):
    print(f"{feature}: {weight:.4f}")

Number of features with non-zero feature weights: 26
 
See below features and their weights
T1: -0.3665
RH_1: 16.2997
T2: -18.3632
RH_2: -13.7372
T3: 25.8311
RH_3: 4.8020
T4: 2.7887
RH_4: 1.2029
T5: -1.6016
RH_5: 0.2575
T6: 7.3603
RH_6: 0.4115
T7: 1.0344
RH_7: -1.6919
T8: 9.9883
RH_8: -5.7790
T9: -21.1384
RH_9: -1.7623
T_out: -11.0596
Press_mm_hg: 0.1702
RH_out: -1.0912
Windspeed: 2.2306
Visibility: 0.2025
Tdewpoint: 5.6890
rv1: 0.0330
rv2: 0.0000
