# HDSC Stage B Tag Along

### Importing necessary libraries & dataset, renaming columns, checking description and information of my dataset

In [128]:
# importing libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso

In [129]:
# read the energy CSV into a DataFrame and preview its first five rows
energy_data = pd.read_csv(filepath_or_buffer='energydata_completed.csv')
energy_data.head(n=5)

Unnamed: 0,date,Appliances,lights,T1,RH_1,T2,RH_2,T3,RH_3,T4,...,T9,RH_9,T_out,Press_mm_hg,RH_out,Windspeed,Visibility,Tdewpoint,rv1,rv2
0,2016-01-11 17:00:00,60,30,19.89,47.596667,19.2,44.79,19.79,44.73,19.0,...,17.033333,45.53,6.6,733.5,92.0,7.0,63.0,5.3,13.275433,13.275433
1,2016-01-11 17:10:00,60,30,19.89,46.693333,19.2,44.7225,19.79,44.79,19.0,...,17.066667,45.56,6.483333,733.6,92.0,6.666667,59.166667,5.2,18.606195,18.606195
2,2016-01-11 17:20:00,50,30,19.89,46.3,19.2,44.626667,19.79,44.933333,18.926667,...,17.0,45.5,6.366667,733.7,92.0,6.333333,55.333333,5.1,28.642668,28.642668
3,2016-01-11 17:30:00,50,40,19.89,46.066667,19.2,44.59,19.79,45.0,18.89,...,17.0,45.4,6.25,733.8,92.0,6.0,51.5,5.0,45.410389,45.410389
4,2016-01-11 17:40:00,60,40,19.89,46.333333,19.2,44.53,19.79,45.0,18.89,...,17.0,45.4,6.133333,733.9,92.0,5.666667,47.666667,4.9,10.084097,10.084097


In [130]:
# report how many rows (observations) and columns the DataFrame holds
print (f"My dataset(energy_data) has {len(energy_data)} rows and {len(energy_data.columns)} columns.")

My dataset(energy_data) has 19735 rows and 29 columns.


In [131]:
# summary statistics (count, mean, std, min, quartiles, max) per numeric column,
# transposed so that every feature is shown as one row
energy_data.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Appliances,19735.0,97.694958,102.524891,10.0,50.0,60.0,100.0,1080.0
lights,19735.0,3.801875,7.935988,0.0,0.0,0.0,0.0,70.0
T1,19735.0,21.686571,1.606066,16.79,20.76,21.6,22.6,26.26
RH_1,19735.0,40.259739,3.979299,27.023333,37.333333,39.656667,43.066667,63.36
T2,19735.0,20.341219,2.192974,16.1,18.79,20.0,21.5,29.856667
RH_2,19735.0,40.42042,4.069813,20.463333,37.9,40.5,43.26,56.026667
T3,19735.0,22.267611,2.006111,17.2,20.79,22.1,23.29,29.236
RH_3,19735.0,39.2425,3.254576,28.766667,36.9,38.53,41.76,50.163333
T4,19735.0,20.855335,2.042884,15.1,19.53,20.666667,22.1,26.2
RH_4,19735.0,39.026904,4.341321,27.66,35.53,38.4,42.156667,51.09


In [132]:
# print a concise summary of the DataFrame: index range, column names,
# non-null counts and dtypes (used to check for missing values)
energy_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19735 entries, 0 to 19734
Data columns (total 29 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   date         19735 non-null  object 
 1   Appliances   19735 non-null  int64  
 2   lights       19735 non-null  int64  
 3   T1           19735 non-null  float64
 4   RH_1         19735 non-null  float64
 5   T2           19735 non-null  float64
 6   RH_2         19735 non-null  float64
 7   T3           19735 non-null  float64
 8   RH_3         19735 non-null  float64
 9   T4           19735 non-null  float64
 10  RH_4         19735 non-null  float64
 11  T5           19735 non-null  float64
 12  RH_5         19735 non-null  float64
 13  T6           19735 non-null  float64
 14  RH_6         19735 non-null  float64
 15  T7           19735 non-null  float64
 16  RH_7         19735 non-null  float64
 17  T8           19735 non-null  float64
 18  RH_8         19735 non-null  float64
 19  T9  

#### *Note: there are no missing values in our dataset.*

## Simple Linear Regression(SLR) model to predict the 'temperature outside the building' (y = T6) based on the 'temperature in the Living room' (x = T2)

### The equation for SLR is $y = mx + b$

where:

- $y$ – target variable,
- $x$ – feature,
- $m$ – slope (coefficient) of the line, representing how much $y$ changes for a unit change in $x$,
- $b$ – intercept, representing the value of $y$ when $x$ is 0.

In [133]:
# Feature: T2 (living-room temperature), reshaped to a single-column 2-D array;
# Target: T6 (temperature outside the building)
X = energy_data['T2'].to_numpy().reshape(-1, 1)
y = energy_data['T6'].to_numpy()

# build the linear regression estimator and fit it on the full dataset
# (fit() returns the fitted estimator itself)
model = LinearRegression().fit(X, y)

# in-sample predictions of T6(y) from T2(X)
y_pred = model.predict(X)

# Root Mean Squared Error = square root of the mean squared error
rmse_cal = np.sqrt(mean_squared_error(y_true=y, y_pred=y_pred))

# report the RMSE rounded to three decimal places
print(f"Root Mean Squared Error: {rmse_cal:.3f}")

Root Mean Squared Error: 3.644


## Multiple Linear Regression (MLR) model to predict 'Appliances' based on all the columns in our dataset (excluding 'date' and 'lights')

### The equation for MLR is $Y = \beta_0 + \beta_1 X_1 + \beta_2 X_2 + \dots + \beta_p X_p + \varepsilon$

where:

- $Y$ – dependent variable (the variable I want to predict),
- $\beta_0$ – intercept (the value of $Y$ when all predictor variables are 0),
- $\beta_1$ to $\beta_p$ – coefficients of the predictor variables,
- $X_1$ to $X_p$ – predictor variables (features),
- $\varepsilon$ – error term, which captures the unexplained variation in $Y$.

In [134]:
# removing the ["date", "lights"] columns, which are excluded from the model.
# errors="ignore" makes this cell idempotent: re-running it after the columns
# have already been dropped is a harmless no-op instead of a KeyError.
energy_data = energy_data.drop(columns=["date", "lights"], errors="ignore")

# defining the target variable (Appliances) and the features (everything else)
X = energy_data.drop(columns="Appliances")
y = energy_data["Appliances"]

# splitting the data into a training set (70%) and a test set (30%);
# random_state is fixed so the split is reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# normalizing the dataset using MinMaxScaler: the scaler is fitted on the
# training set only and then applied unchanged to the test set
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# creating and training the MLR model
mlr_model = LinearRegression()
mlr_model.fit(X_train, y_train)

# making predictions on the train and test sets with the model fitted above
# (the name `linear_model` is not defined anywhere in this notebook, so it
# would raise a NameError on a fresh kernel)
y_pred_t = mlr_model.predict(X_train)
y_pred_te = mlr_model.predict(X_test)

# calculating Mean Absolute Error(MAE) for train
mae1 = mean_absolute_error(y_train, y_pred_t)
print(f"The Mean Absolute Error for the train set is {mae1:.3f}")

# calculating Root Mean Squared Error(RMSE) for train
rmse1 = np.sqrt(mean_squared_error(y_train, y_pred_t))
print(f"The Root Mean Squared Error for the train set is {rmse1:.3f}")

# calculating Mean Absolute Error(MAE) for test
mae2 = mean_absolute_error(y_test, y_pred_te)
print(f"The Mean Absolute Error for the test set is {mae2:.3f}")

# calculating Root Mean Squared Error(RMSE) for test
rmse2 = np.sqrt(mean_squared_error(y_test, y_pred_te))
print(f"The Root Mean Squared Error for the test set is {rmse2:.3f}")

The Mean Absolute Error for the train set is 53.742
The Root Mean Squared Error for the train set is 95.216
The Mean Absolute Error for the test set is 53.643
The Root Mean Squared Error for the test set is 93.640


#### *Note: the MAE and RMSE values of the train and test sets are very close to each other, which suggests that our model is NOT overfitting the training data.*

## Regularization techniques

### Ridge regression

In [135]:
# build a Ridge regression model with default parameters and fit it on the
# training set (fit() returns the fitted estimator itself)
ridge_model = Ridge().fit(X_train, y_train)

# predict the target for the test set with the fitted ridge model
y_pred_ridge = ridge_model.predict(X_test)

# test-set RMSE for the Ridge model: square root of the mean squared error
rmse_ridge = np.sqrt(mean_squared_error(y_true=y_test, y_pred=y_pred_ridge))
print(f"The Root Mean Squared Error for the Ridge model on the test set is {rmse_ridge:.3f}")

The Root Mean Squared Error for the Ridge model on the test set is 93.709


### Lasso regression and Feature Selection

In [136]:
# build a Lasso regression model with default parameters and fit it on the
# training set (fit() returns the fitted estimator itself)
lasso_model = Lasso().fit(X_train, y_train)

# the learned weight of every feature
feature_weights = lasso_model.coef_

# how many features kept a non-zero weight
non_zero_feature_count = (feature_weights != 0).sum()

print(f"Number of features with non-zero weights: {non_zero_feature_count}")


# predict the target for the test set with the fitted lasso model
y_pred_lasso = lasso_model.predict(X_test)

# test-set RMSE for the Lasso model: square root of the mean squared error
rmse_lasso = np.sqrt(mean_squared_error(y_true=y_test, y_pred=y_pred_lasso))
print(f"The Root Mean Squared Error for the Lasso model on the test set is {rmse_lasso:.3f}")

Number of features with non-zero weights: 4
The Root Mean Squared Error for the Lasso model on the test set is 99.424
