In [113]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

In [114]:
#read data from input csv file
bostonhouse_data = pd.read_csv("regressionDataSet/boston_housing.csv")
bostonhouse_data.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.9,5.33,36.2


In [115]:
#fetching all the columns and datatypes involved
bostonhouse_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506 entries, 0 to 505
Data columns (total 14 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   CRIM     506 non-null    float64
 1   ZN       506 non-null    float64
 2   INDUS    506 non-null    float64
 3   CHAS     506 non-null    int64  
 4   NOX      506 non-null    float64
 5   RM       506 non-null    float64
 6   AGE      506 non-null    float64
 7   DIS      506 non-null    float64
 8   RAD      506 non-null    int64  
 9   TAX      506 non-null    int64  
 10  PTRATIO  506 non-null    float64
 11  B        506 non-null    float64
 12  LSTAT    506 non-null    float64
 13  MEDV     506 non-null    float64
dtypes: float64(11), int64(3)
memory usage: 55.5 KB


### Selecting features to work on

In [116]:
bostonHouse_df=bostonhouse_data[['RM', 'DIS', 'TAX', 'INDUS','MEDV']]
bostonHouse_df.head()

Unnamed: 0,RM,DIS,TAX,INDUS,MEDV
0,6.575,4.09,296,2.31,24.0
1,6.421,4.9671,242,7.07,21.6
2,7.185,4.9671,242,7.07,34.7
3,6.998,6.0622,222,2.18,33.4
4,7.147,6.0622,222,2.18,36.2


In [117]:
X=bostonHouse_df[[ 'RM','DIS', 'TAX', 'INDUS']]

from statsmodels.stats.outliers_influence import variance_inflation_factor
#calculating the VIF for each attributes
vif = pd.Series([variance_inflation_factor(X.values,idx) 
           for idx in range(X.shape[1])],
          index=X.columns)
print(vif)


RM       16.747965
DIS       6.880607
TAX      13.861510
INDUS     9.598374
dtype: float64


### Normalizing- Encoding categorical data

In [118]:
bostonHouse_df.columns.drop(['RM','TAX'])

Index(['DIS', 'INDUS', 'MEDV'], dtype='object')

In [119]:
# encoding categorical values

cat_features = ["DIS","INDUS"]
bostonHouse_df = pd.get_dummies(bostonHouse_df,columns=cat_features)
print(bostonHouse_df.columns)

Index(['RM', 'TAX', 'MEDV', 'DIS_1.1296', 'DIS_1.137', 'DIS_1.1691',
       'DIS_1.1742', 'DIS_1.1781', 'DIS_1.2024', 'DIS_1.2852',
       ...
       'INDUS_11.93', 'INDUS_12.83', 'INDUS_13.89', 'INDUS_13.92',
       'INDUS_15.04', 'INDUS_18.1', 'INDUS_19.58', 'INDUS_21.89',
       'INDUS_25.65', 'INDUS_27.74'],
      dtype='object', length=491)


### Normalizing Continuous DataSet

In [120]:
from sklearn.preprocessing import StandardScaler

scaler=StandardScaler().fit(bostonHouse_df[['MEDV']])
bostonHouseData_Normalized=scaler.transform(bostonHouse_df[['MEDV']])
bostonHouseData_Normalized

array([[ 0.15968566],
       [-0.10152429],
       [ 1.32424667],
       [ 1.18275795],
       [ 1.48750288],
       [ 0.6712218 ],
       [ 0.03996443],
       [ 0.49708184],
       [-0.65659542],
       [-0.39538548],
       [-0.81985164],
       [-0.39538548],
       [-0.09064054],
       [-0.23212926],
       [-0.47157171],
       [-0.286548  ],
       [ 0.06173193],
       [-0.54775795],
       [-0.25389676],
       [-0.47157171],
       [-0.97222411],
       [-0.31919924],
       [-0.79808414],
       [-0.87427038],
       [-0.75454915],
       [-0.93957286],
       [-0.64571167],
       [-0.84161913],
       [-0.44980422],
       [-0.16682677],
       [-1.07017784],
       [-0.87427038],
       [-1.0157591 ],
       [-1.02664285],
       [-0.98310786],
       [-0.39538548],
       [-0.27566425],
       [-0.16682677],
       [ 0.23587189],
       [ 0.89978051],
       [ 1.34601416],
       [ 0.4426631 ],
       [ 0.30117438],
       [ 0.23587189],
       [-0.14505928],
       [-0

#### Replacing the numerical columns with normalized values

In [121]:
#updating the numerical(normalized) data into the dataframe

bostonHouseNormalized_df=pd.DataFrame(bostonHouseData_Normalized, columns=['MEDV'])
bostonHouseNormalized_df=bostonHouseNormalized_df.join(bostonHouse_df[bostonHouse_df.columns.drop(['MEDV'])])

In [122]:
bostonHouseNormalized_df.head()

Unnamed: 0,MEDV,RM,TAX,DIS_1.1296,DIS_1.137,DIS_1.1691,DIS_1.1742,DIS_1.1781,DIS_1.2024,DIS_1.2852,...,INDUS_11.93,INDUS_12.83,INDUS_13.89,INDUS_13.92,INDUS_15.04,INDUS_18.1,INDUS_19.58,INDUS_21.89,INDUS_25.65,INDUS_27.74
0,0.159686,6.575,296,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,-0.101524,6.421,242,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1.324247,7.185,242,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1.182758,6.998,222,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1.487503,7.147,222,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


#### Selecting features and target

In [123]:
#bostonHouseNormalized_df.columns.drop(['RM'])

In [124]:
Y=bostonHouseNormalized_df['MEDV']
X=bostonHouseNormalized_df[bostonHouseNormalized_df.columns.drop(['MEDV'])]
print(X.shape)

(506, 490)


### Spillting into train & test data

In [125]:
from sklearn.model_selection import train_test_split
X_train,X_test,Y_train,Y_test = train_test_split(X,Y,test_size=0.3, random_state=100)
X_train.shape , X_test.shape

((354, 490), (152, 490))

In [126]:
#importing required module to build the model
from sklearn.linear_model import LinearRegression

#building and training the model
model = LinearRegression()
model.fit(X_train,Y_train)

#Evaluating the model on the train and test data for R-Squared score 
train_score=model.score(X_train,Y_train)
test_score=model.score(X_test,Y_test)

print('Train Score (R-Squared): ',train_score)
print('Test Score (R-Squared)',test_score)

Train Score (R-Squared):  0.9870365752818168
Test Score (R-Squared) -3.0097955005301584e+16


In [127]:
from sklearn.metrics import mean_squared_error

#root mean square error (RMSE) calculation for train data
train_predictions = model.predict(X_train)
train_RMSE=mean_squared_error(Y_train,train_predictions)**0.5

#root mean square error (RMSE) calculation for test data
test_predictions = model.predict(X_test)
test_RMSE=mean_squared_error(Y_test,test_predictions)**0.5

print('Train RMSE : ',train_RMSE)
print('Test RMSE : ',test_RMSE)

Train RMSE :  0.10885282002626638
Test RMSE :  190028799.33888912
