In [7]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [8]:
# Readable names for the coded spreadsheet columns (X1-X8 and Y1-Y2).
column_names = dict(
    X1="Relative_Compactness",
    X2="Surface_Area",
    X3="Wall_Area",
    X4="Roof_Area",
    X5="Overall_Height",
    X6="Orientation",
    X7="Glazing_Area",
    X8="Glazing_Area_Distribution",
    Y1="Heating_Load",
    Y2="Cooling_Load",
)

# Read the workbook and apply the readable column names in a single chain.
df = pd.read_excel("ENB2012_data.xlsx").rename(columns=column_names)

In [9]:
# Pairwise correlation between all columns, rounded for readability.
# The argument to .round() is the number of decimal places to keep.
df.corr().round(2)

Unnamed: 0,Relative_Compactness,Surface_Area,Wall_Area,Roof_Area,Overall_Height,Orientation,Glazing_Area,Glazing_Area_Distribution,Heating_Load,Cooling_Load
Relative_Compactness,1.0,-0.99,-0.2,-0.87,0.83,0.0,-0.0,-0.0,0.62,0.63
Surface_Area,-0.99,1.0,0.2,0.88,-0.86,-0.0,0.0,0.0,-0.66,-0.67
Wall_Area,-0.2,0.2,1.0,-0.29,0.28,-0.0,-0.0,0.0,0.46,0.43
Roof_Area,-0.87,0.88,-0.29,1.0,-0.97,-0.0,-0.0,-0.0,-0.86,-0.86
Overall_Height,0.83,-0.86,0.28,-0.97,1.0,0.0,0.0,-0.0,0.89,0.9
Orientation,0.0,-0.0,-0.0,-0.0,0.0,1.0,-0.0,-0.0,-0.0,0.01
Glazing_Area,-0.0,0.0,-0.0,-0.0,0.0,-0.0,1.0,0.21,0.27,0.21
Glazing_Area_Distribution,-0.0,0.0,0.0,-0.0,-0.0,-0.0,0.21,1.0,0.09,0.05
Heating_Load,0.62,-0.66,0.46,-0.86,0.89,-0.0,0.27,0.09,1.0,0.98
Cooling_Load,0.63,-0.67,0.43,-0.86,0.9,0.01,0.21,0.05,0.98,1.0


## Steps to Measure regression performance
1. Features Scaling/Scale the data
    Feature Scaling or Standardization: It is a step of Data Pre Processing which is applied to independent variables or features of data. 
    It basically helps to normalise the data within a particular range. Sometimes, it also helps in speeding up the calculations in an algorithm.
2. Split the dataset into independent and dependent columns
3. Split dataset into training and testing datasets
4. Import desired model and fit dataset into model
5. Measure performance of model

## Methods used in measuring performance of model
1. Mean Absolute error
2. Residual Sum of Squares
3. Root mean square error
4. R-squared statistic

In [15]:

# Feature scaling
# Scaling is a pre-processing step that brings every column onto a common
# range; it can also speed up the calculations of some algorithms.
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()

# NOTE: the scaler is fitted on the whole frame, so the two load columns are
# rescaled together with the features, and this happens before any
# train/test split.
normalised_df = pd.DataFrame(data=scaler.fit_transform(df), columns=df.columns)
print('normalised/scaled df', normalised_df, sep='\n')

# Splitting into independent and dependent columns

# Independent variables: every column except the two load columns.
features_df = normalised_df.drop(['Heating_Load', 'Cooling_Load'], axis=1)
print('independent/features df', features_df, sep='\n')

# Response variable: the (scaled) heating load.
heating_target = normalised_df.loc[:, 'Heating_Load']
print('response df which is = heating_target = Heating_Load', heating_target, sep='\n')

normalised/scaled df
     Relative_Compactness  Surface_Area  Wall_Area  Roof_Area  Overall_Height  \
0                1.000000      0.000000   0.285714   0.000000             1.0   
1                1.000000      0.000000   0.285714   0.000000             1.0   
2                1.000000      0.000000   0.285714   0.000000             1.0   
3                1.000000      0.000000   0.285714   0.000000             1.0   
4                0.777778      0.166667   0.428571   0.111111             1.0   
..                    ...           ...        ...        ...             ...   
763              0.055556      0.916667   0.571429   1.000000             0.0   
764              0.000000      1.000000   0.714286   1.000000             0.0   
765              0.000000      1.000000   0.714286   1.000000             0.0   
766              0.000000      1.000000   0.714286   1.000000             0.0   
767              0.000000      1.000000   0.714286   1.000000             0.0   

     O

In [16]:
# Hold out 30% of the rows as a test set; the fixed random_state makes the
# split reproducible across runs.
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(
    features_df, heating_target, test_size=0.3, random_state=1
)

In [24]:
# Fit a linear regression model on the training split
from sklearn.linear_model import LinearRegression

# fit() returns the fitted estimator, so construction and fitting can be chained.
linear_model = LinearRegression().fit(x_train, y_train)

# Predictions for the held-out rows (shown as the cell output)
predicted_values = linear_model.predict(x_test)
predicted_values

array([0.18432617, 0.58435059, 0.74633789, 0.71484375, 0.7644043 ,
       0.24377441, 0.16345215, 0.6126709 , 0.24499512, 0.54943848,
       0.31628418, 0.66540527, 0.69567871, 0.30065918, 0.1862793 ,
       0.71716309, 0.57067871, 0.31286621, 0.25476074, 0.74279785,
       0.70141602, 0.07055664, 0.61047363, 0.32128906, 0.21447754,
       0.75134277, 0.32836914, 0.17944336, 0.80151367, 0.26953125,
       0.68786621, 0.74060059, 0.13867188, 0.28283691, 0.56591797,
       0.8079834 , 0.33044434, 0.13208008, 0.72290039, 0.24658203,
       0.81970215, 0.1550293 , 0.10327148, 0.15930176, 0.11437988,
       0.14709473, 0.48742676, 0.69152832, 0.75939941, 0.72058105,
       0.09790039, 0.67114258, 0.09680176, 0.24328613, 0.76599121,
       0.77087402, 0.7421875 , 0.63049316, 0.15234375, 0.2956543 ,
       0.18518066, 0.59667969, 0.60473633, 0.76672363, 0.56835938,
       0.92285156, 0.6842041 , 0.3170166 , 0.67614746, 0.23535156,
       0.54711914, 0.24401855, 0.23327637, 0.72631836, 0.28369

In [37]:
# Mean absolute error (MAE): the average absolute difference between the
# true and the predicted values on the test split.
from sklearn.metrics import mean_absolute_error

mae = mean_absolute_error(y_test, predicted_values)

# Show the raw value, then the value rounded to 3 and to 4 decimal places.
# Built-in round() is used for both: `.round(decimals=...)` only exists on
# NumPy scalars and raises AttributeError when the metric returns a plain
# Python float.
print(mae)
print(round(mae, 3))
print(round(mae, 4))


0.06275563329660785
0.063
0.0628


## Residual Sum of Squares
- It measures the variation in the data that the model leaves unexplained, i.e. how well the model approximates the data
- The lower the RSS, the better the model approximates the data and vice versa

In [39]:
# Residual sum of squares (RSS)
# RSS = sum over the test rows of (actual value - predicted value) squared
rss = ((y_test - predicted_values) ** 2).sum()
np.round(rss, 3)

1.832

## Root Mean Square error (RMSE)
- It is the square root of the mean squared error (RSS divided by the number of samples), i.e. the standard deviation of the residuals
- It describes the spread of the residuals from the line of best fit and noise of the model.
- When RMSE is low, it means the error made by the model has a small deviation from the true values

In [45]:
# Root mean square error (RMSE) = square root of the mean squared error (MSE)
from sklearn.metrics import mean_squared_error

# Keep the MSE at full precision: taking the square root of an
# already-rounded MSE would compound the rounding error in the RMSE.
mse = mean_squared_error(y_test, predicted_values)
rmse = np.sqrt(mse)

# Round only for display. Built-in round() works whether the metric returns
# a NumPy scalar or a plain Python float.
print(f'mean squared error: {round(mse, 3)}')
print(f'root mean square error: {round(rmse, 3)}')

mean squared error: 0.008
root mean square error: 0.089


## R-squared
- It is also known as coefficient of determination
- It determines the goodness of fit of the model
- It usually lies between 0 and 1, but can be negative when the model fits worse than always predicting the mean
- Usually, the higher the R^2, the better the model and vice versa, but this is not always true

In [47]:
# R-squared (coefficient of determination) on the test split
from sklearn.metrics import r2_score

# Store the score under its own name: assigning it to `r2_score` would
# overwrite the imported function, so any later r2_score(...) call would
# fail with "TypeError: 'float' object is not callable".
r2 = r2_score(y_test, predicted_values)
round(r2, 3)

0.893