In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [2]:
# Load the energy dataset from the working directory and preview the first rows.
df = pd.read_csv(filepath_or_buffer='energydata_complete.csv')
df.head(n=5)

Unnamed: 0,date,Appliances,lights,T1,RH_1,T2,RH_2,T3,RH_3,T4,...,T9,RH_9,T_out,Press_mm_hg,RH_out,Windspeed,Visibility,Tdewpoint,rv1,rv2
0,2016-01-11 17:00:00,60,30,19.89,47.596667,19.2,44.79,19.79,44.73,19.0,...,17.033333,45.53,6.6,733.5,92.0,7.0,63.0,5.3,13.275433,13.275433
1,2016-01-11 17:10:00,60,30,19.89,46.693333,19.2,44.7225,19.79,44.79,19.0,...,17.066667,45.56,6.483333,733.6,92.0,6.666667,59.166667,5.2,18.606195,18.606195
2,2016-01-11 17:20:00,50,30,19.89,46.3,19.2,44.626667,19.79,44.933333,18.926667,...,17.0,45.5,6.366667,733.7,92.0,6.333333,55.333333,5.1,28.642668,28.642668
3,2016-01-11 17:30:00,50,40,19.89,46.066667,19.2,44.59,19.79,45.0,18.89,...,17.0,45.4,6.25,733.8,92.0,6.0,51.5,5.0,45.410389,45.410389
4,2016-01-11 17:40:00,60,40,19.89,46.333333,19.2,44.53,19.79,45.0,18.89,...,17.0,45.4,6.133333,733.9,92.0,5.666667,47.666667,4.9,10.084097,10.084097


### 12. From the dataset, fit a linear model on the relationship between the temperature in the living room in Celsius (x = T2) and the temperature outside the building (y = T6). What is the R^2 value in two d.p.?

In [3]:
# Hold out 30% of the rows; T2 is the single predictor and T6 the response.
x_train_sub, x_test_sub, y_train_sub, y_test_sub = train_test_split(
    df['T2'], df['T6'], test_size=0.3, random_state=42
)

In [4]:
# The estimator needs a 2-D feature matrix, so the single predictor is
# reshaped into one column before fitting.
model = LinearRegression()
x_train_sub_2d = x_train_sub.to_numpy().reshape(-1, 1)
model.fit(x_train_sub_2d, y_train_sub)

LinearRegression()

In [5]:
# Predict on the held-out predictor, reshaped to a single-column matrix.
x_test_sub_2d = x_test_sub.to_numpy().reshape(-1, 1)
pred = model.predict(x_test_sub_2d)

In [6]:
# Coefficient of determination on the held-out split, shown to two decimal places.
r2_value = r2_score(y_true=y_test_sub, y_pred=pred)
round(r2_value, 2)

0.64

#### The R^2 value of the linear model is 0.64

### Normalize the dataset using the MinMaxScaler after removing the following columns: ["date", "lights"]. The target variable is "Appliances". Use a 70-30 train-test split with a random state of 42 (for reproducibility). Run a multiple linear regression using the training set and evaluate your model on the test set. Answer questions 13-17:


In [7]:
# Remove the 'date' and 'lights' columns without mutating `df`: an in-place
# drop makes this cell fail with a KeyError when it is run a second time and
# silently changes the frame that was displayed earlier.
features_df = df.drop(columns=['date', 'lights'])
# Normalise every remaining column (the target included) with MinMaxScaler.
# The scaler is fitted on the full dataset before splitting, as the exercise
# prescribes.
scaler = MinMaxScaler()
normalised_df = pd.DataFrame(
    scaler.fit_transform(features_df), columns=features_df.columns
)
# Independent variables: everything except the target
x = normalised_df.drop(columns=['Appliances'])
# Dependent variable: the (scaled) appliance energy use
y = normalised_df['Appliances']

In [8]:
# 70/30 train-test split with a fixed seed for reproducibility
x_train, x_test, y_train, y_test = train_test_split(
    x, y, random_state=42, test_size=0.3
)

In [9]:
# Fit ordinary least squares on the training split (fit returns the estimator)
linear_model = LinearRegression().fit(x_train, y_train)
# Predictions for the held-out rows
predicted_values = linear_model.predict(x_test)

### 13. What is the Mean Absolute Error (in two decimal places)?

In [10]:
# Mean absolute error on the test split, shown to two decimal places
mae = mean_absolute_error(y_true=y_test, y_pred=predicted_values)
round(mae, 2)

0.05

#### The mean absolute error for the linear regression model is 0.05

### 14. What is the Residual Sum of Squares (in two decimal places)?

In [11]:
# Residual sum of squares: squared test-set residuals, summed
residuals = y_test - predicted_values
rss = np.square(residuals).sum()
round(rss, 2)

45.35

#### The residual sum of squares for the linear regression model is 45.35

### 15. What is the Root Mean Squared Error (in three decimal places)?

In [12]:
# Root mean squared error = square root of the test-set MSE
mse = mean_squared_error(y_true=y_test, y_pred=predicted_values)
rmse = np.sqrt(mse)
round(rmse, 3)

0.088

#### The root mean squared error for the linear regression model is 0.088

### 16. What is the Coefficient of Determination (in two decimal places)?

In [13]:
# Coefficient of determination of the multiple regression on the test split
r2_value = r2_score(y_true=y_test, y_pred=predicted_values)
round(r2_value, 2)

0.15

#### The coefficient of determination for the linear regression model is 0.15

### 17. Obtain the feature weights from your linear model above. Which features have the lowest and highest weights respectively?

In [14]:
def get_weights_df(model, feat, col_name, decimals=None):
    """Return a model's coefficients as a two-column DataFrame sorted by weight.

    Parameters
    ----------
    model : object
        Fitted estimator exposing a one-dimensional ``coef_`` attribute.
    feat : pandas.DataFrame
        Frame whose columns label the coefficients; they are paired with
        ``coef_`` by position (assumed to be the frame the model was fitted on).
    col_name : str
        Header to use for the weights column.
    decimals : int, optional
        When given, the weights are rounded to this many decimal places.
        The default (``None``) leaves them unrounded. Rounding is opt-in
        because it can turn a very small non-zero coefficient into 0.0,
        which would change any count of non-zero weights.

    Returns
    -------
    pandas.DataFrame
        Columns ``['Features', col_name]``, sorted ascending by weight.
    """
    weights = pd.Series(model.coef_, feat.columns).sort_values()
    weights_df = pd.DataFrame(weights).reset_index()
    weights_df.columns = ['Features', col_name]
    # The previous version called .round(3) and discarded the result, so it
    # never rounded anything; the rounded values must be assigned back.
    if decimals is not None:
        weights_df[col_name] = weights_df[col_name].round(decimals)
    return weights_df

In [15]:
# Build the weights table for the linear model and display it in ascending order
linear_model_weights = get_weights_df(
    model=linear_model, feat=x_train, col_name='Linear_Model_Weight'
)
linear_model_weights.sort_values('Linear_Model_Weight')

Unnamed: 0,Features,Linear_Model_Weight
0,RH_2,-0.456698
1,T_out,-0.32186
2,T2,-0.236178
3,T9,-0.189941
4,RH_8,-0.157595
5,RH_out,-0.077671
6,RH_7,-0.044614
7,RH_9,-0.0398
8,T5,-0.015657
9,T1,-0.003281


#### The features with the lowest and highest weights in the linear regression model are RH_2 and RH_1 respectively

### 18. Train a ridge regression model with an alpha value of 0.4. Is there any change to the root mean squared error (RMSE) when evaluated on the test set?

In [16]:
# Ridge regression with alpha = 0.4, fitted on the training split
ridge_reg = Ridge(alpha=0.4)
ridge_reg.fit(X=x_train, y=y_train)

Ridge(alpha=0.4)

In [17]:
# Ridge predictions for the held-out rows
pred = ridge_reg.predict(X=x_test)

In [18]:
# Root mean squared error of the ridge predictions on the test split
mse = mean_squared_error(y_true=y_test, y_pred=pred)
rmse = np.sqrt(mse)
round(rmse, 3)

0.088

#### The ridge regression model has a Root Mean Squared Error of 0.088, which (to three decimal places) is the same as the Root Mean Squared Error of the linear regression model

### 19. Train a lasso regression model with an alpha value of 0.001 and obtain the new feature weights with it. How many of the features have non-zero feature weights?

In [19]:
# Lasso regression with alpha = 0.001, fitted on the training split
lasso_reg = Lasso(alpha=0.001)
lasso_reg.fit(X=x_train, y=y_train)


Lasso(alpha=0.001)

In [20]:
# Count the features whose lasso coefficient is not exactly zero
lasso_model_weights = get_weights_df(lasso_reg, x_train, col_name='Lasso_Model_Weight')
lasso_model_weights['Lasso_Model_Weight'].ne(0).sum()

4

#### The lasso regression model has 4 non-zero feature weights

### 20. What is the new RMSE with the lasso regression? (Answer should be in three (3) decimal places)

In [21]:
# Lasso predictions for the held-out rows and their root mean squared error
pred = lasso_reg.predict(X=x_test)
mse = mean_squared_error(y_true=y_test, y_pred=pred)
rmse = np.sqrt(mse)
round(rmse, 3)

0.094

#### The Root Mean Squared Error of the Lasso Regression Model is 0.094