In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('energydata_complete.csv')

In [3]:
df.head()

Unnamed: 0,date,Appliances,lights,T1,RH_1,T2,RH_2,T3,RH_3,T4,...,T9,RH_9,T_out,Press_mm_hg,RH_out,Windspeed,Visibility,Tdewpoint,rv1,rv2
0,2016-01-11 17:00:00,60,30,19.89,47.596667,19.2,44.79,19.79,44.73,19.0,...,17.033333,45.53,6.6,733.5,92.0,7.0,63.0,5.3,13.275433,13.275433
1,2016-01-11 17:10:00,60,30,19.89,46.693333,19.2,44.7225,19.79,44.79,19.0,...,17.066667,45.56,6.483333,733.6,92.0,6.666667,59.166667,5.2,18.606195,18.606195
2,2016-01-11 17:20:00,50,30,19.89,46.3,19.2,44.626667,19.79,44.933333,18.926667,...,17.0,45.5,6.366667,733.7,92.0,6.333333,55.333333,5.1,28.642668,28.642668
3,2016-01-11 17:30:00,50,40,19.89,46.066667,19.2,44.59,19.79,45.0,18.89,...,17.0,45.4,6.25,733.8,92.0,6.0,51.5,5.0,45.410389,45.410389
4,2016-01-11 17:40:00,60,40,19.89,46.333333,19.2,44.53,19.79,45.0,18.89,...,17.0,45.4,6.133333,733.9,92.0,5.666667,47.666667,4.9,10.084097,10.084097


In [4]:
X = df.T2
y = df.T6

#splitting df into training data and testing data
from sklearn.model_selection import train_test_split #imports train_test_split from model_selection's module of sklearn

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 2) #splits the predictors
# and target into train and test (70% and 30% respectively)

X_train = X_train.values.reshape(-1, 1)

from sklearn.linear_model import LinearRegression #imports Linear_Regression from sklearn
linear_model = LinearRegression() #creating an instance of Linear_Regression

#fitting the model to the taining set
linear_model.fit(X_train, y_train)

#predicitng with model
predicted_values = linear_model.predict(X_test.values.reshape(-1,1))


#calculating the coefficient of determonation value - R2 value
from sklearn.metrics import r2_score #imports r2 score from sklearn.metrics

R2 = r2_score(y_test, predicted_values) #returns r2 score
round(R2, 2) #rounds R2_score up to 3 decimal places

0.64

In [5]:
df.drop(columns = ['date', 'lights'], inplace = True)

In [6]:
#feature normalizing
from sklearn.preprocessing import MinMaxScaler #imports Minmaxscaler from sklearn's preprocessing module

#creating an instance of the scaler
scaler = MinMaxScaler()

#normalizing the dataframe
norm_df = pd.DataFrame(scaler.fit_transform(df), columns = df.columns)

In [7]:
#predictors and target variable
predictors = df.drop(columns = ['Appliances']) #drops heating_load and cooling_load columns
target = norm_df['Appliances'] # selects only heating_load column

In [8]:
#splitting df into training data and testing data
from sklearn.model_selection import train_test_split #imports train_test_split from model_selection's module of sklearn

X_train, X_test, y_train, y_test = train_test_split(predictors, target, test_size = 0.3, random_state = 42) #splits the predictors
# and target into train and test (70% and 30% respectively)

from sklearn.linear_model import LinearRegression #imports Linear_Regression from sklearn
l_model = LinearRegression() #creating an instance of Linear_Regression

#fitting the model to the taining set
l_model.fit(X_train, y_train)

#predicitng with model
predicted_y = l_model.predict(X_test)

from sklearn.metrics import mean_absolute_error #imports mean_absolute_error from sklearn.metrics

mae = mean_absolute_error(y_test, predicted_y) #returns MAE

round(mae, 2) #rounds MAE up to 2 decimal places

0.05

In [18]:
df2 = pd.DataFrame({'Actual': y_test, 'Predicted': predicted_y })
RSS = np.sum(np.square(df2['Predicted'] - df2['Actual']))
round(RSS, 2)

45.35

In [10]:
#calculating mean squared error
import numpy as np
from sklearn.metrics import mean_squared_error #imports mean_squared_error from sklearn.metrics

rmse = np.sqrt(mean_squared_error(y_test, predicted_y)) #np.sqrt finds the square root of mean squared error 
#(rmse is root mean squared error)

round(rmse, 3)#rounds rmse up to 3 decimal places

0.088

In [11]:
#calculating the coefficient of determonation value - R2 value
from sklearn.metrics import r2_score #imports r2 score from sklearn.metrics

R2 = r2_score(y_test, predicted_y) #returns r2 score

round(R2, 2) #rounds R2_score up to 2 decimal places

0.15

In [12]:
#getting weights of model
def get_weights_df(model, feat, col_name): #creates a function that takes model, features and a column name
    weights = pd.Series(model.coef_, feat.columns).sort_values()# creates and sorts a series of the coefficients of the features 
    weights_df = pd.DataFrame(weights).reset_index() #creates a dataframe of the weighs and resets the index
    weights_df.columns = ['Features', col_name]#renames the columns
    weights_df[col_name].round(3)
    return weights_df

In [15]:
linear_model_weights = get_weights_df(l_model, X_train, 'linear_model')

In [16]:
linear_model_weights
#.sort_values(by = ['linear_model'])

Unnamed: 0,Features,linear_model
0,T9,-0.019765
1,T2,-0.017168
2,RH_2,-0.012842
3,T_out,-0.010349
4,RH_8,-0.005401
5,RH_9,-0.001647
6,RH_7,-0.001582
7,T5,-0.001496
8,RH_out,-0.001022
9,T1,-0.000346


In [60]:
from sklearn.linear_model import Ridge #imports ridge regularization
ridge = Ridge(alpha = 0.4) #creates an instance of Ridge
ridge.fit(X_train, y_train) #fits the model to the training set

ridge_prediction = ridge.predict(X_test) #prediction using ridge regularization


ridge_rmse = np.sqrt(mean_squared_error(y_test, ridge_prediction)) #np.sqrt finds the square root of mean squared error 
#(rmse is root mean squared error)

print(round(ridge_rmse, 3))#rounds rmse up to 3 decimal places
print(round(ridge_rmse, 3) == round(rmse, 3))

0.088
True


In [79]:
from sklearn.linear_model import Lasso #imports ridge regularization
lasso = Lasso(alpha = 0.001) #creates an instance of Ridge
lasso.fit(X_train, y_train) #fits the model to the training set

lasso_weights = get_weights_df(lasso, X_train, 'lasso')
lasso_weights

Unnamed: 0,Features,lasso
0,T9,-0.01512603
1,T2,-0.009668148
2,RH_2,-0.009300608
3,RH_8,-0.005077541
4,T_out,-0.003325131
5,RH_7,-0.001644124
6,T5,-0.001476185
7,RH_9,-0.001431118
8,RH_out,-1.934555e-05
9,T1,-0.0


In [80]:
non_zero_weights = lasso_weights.lasso[lasso_weights.lasso != 0]


In [81]:
non_zero_weights

0    -1.512603e-02
1    -9.668148e-03
2    -9.300608e-03
3    -5.077541e-03
4    -3.325131e-03
5    -1.644124e-03
6    -1.476185e-03
7    -1.431118e-03
8    -1.934555e-05
14    3.175526e-20
15    2.132002e-05
16    6.963612e-05
17    1.652974e-04
18    2.326420e-04
19    2.800849e-04
20    2.028805e-03
21    3.336119e-03
22    4.352944e-03
23    5.924399e-03
24    1.375405e-02
25    1.953587e-02
Name: lasso, dtype: float64

In [90]:
#lasso_prediction = lasso.predict(X_test) #prediction using ridge regularization


lasso_rmse = np.sqrt(mean_squared_error(y_test, lasso.predict(X_test))) #np.sqrt finds the square root of mean squared error 
#(rmse is root mean squared error)

round(lasso_rmse, 3)#rounds rmse up to 3 decimal places

0.088