In this project, we are going to create different Regression-based models for predicting the house prices in Boston based on various factors. We will create various models and select the model with the best accuracy at the end. Before that we will also import the dataset and apply some transformations on it to refine it for our Regression models.

# Import all the required libraries

In [42]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PowerTransformer
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import r2_score
import scipy.stats as stats


# Boston Housing Dataset

In [52]:
df = pd.read_csv('Datasets/Boston.csv')
df

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.0900,1,296.0,15.3,396.90,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242.0,17.8,396.90,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222.0,18.7,396.90,5.33,36.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
501,0.06263,0.0,11.93,0,0.573,6.593,69.1,2.4786,1,273.0,21.0,391.99,9.67,22.4
502,0.04527,0.0,11.93,0,0.573,6.120,76.7,2.2875,1,273.0,21.0,396.90,9.08,20.6
503,0.06076,0.0,11.93,0,0.573,6.976,91.0,2.1675,1,273.0,21.0,396.90,5.64,23.9
504,0.10959,0.0,11.93,0,0.573,6.794,89.3,2.3889,1,273.0,21.0,393.45,6.48,22.0


In [53]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506 entries, 0 to 505
Data columns (total 14 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   CRIM     506 non-null    float64
 1   ZN       506 non-null    float64
 2   INDUS    506 non-null    float64
 3   CHAS     506 non-null    int64  
 4   NX       506 non-null    float64
 5   RM       506 non-null    float64
 6   AGE      506 non-null    float64
 7   DIS      506 non-null    float64
 8   RAD      506 non-null    int64  
 9   TAX      506 non-null    float64
 10  PTRATIO  506 non-null    float64
 11  B        506 non-null    float64
 12  LSTAT    506 non-null    float64
 13  MEDV     506 non-null    float64
dtypes: float64(12), int64(2)
memory usage: 55.5 KB


In [54]:
df.describe()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
count,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0
mean,3.613524,11.363636,11.136779,0.06917,0.554695,6.284634,68.574901,3.795043,9.549407,408.237154,18.455534,356.674032,12.653063,22.532806
std,8.601545,23.322453,6.860353,0.253994,0.115878,0.702617,28.148861,2.10571,8.707259,168.537116,2.164946,91.294864,7.141062,9.197104
min,0.00632,0.0,0.46,0.0,0.385,3.561,2.9,1.1296,1.0,187.0,12.6,0.32,1.73,5.0
25%,0.082045,0.0,5.19,0.0,0.449,5.8855,45.025,2.100175,4.0,279.0,17.4,375.3775,6.95,17.025
50%,0.25651,0.0,9.69,0.0,0.538,6.2085,77.5,3.20745,5.0,330.0,19.05,391.44,11.36,21.2
75%,3.677083,12.5,18.1,0.0,0.624,6.6235,94.075,5.188425,24.0,666.0,20.2,396.225,16.955,25.0
max,88.9762,100.0,27.74,1.0,0.871,8.78,100.0,12.1265,24.0,711.0,22.0,396.9,37.97,50.0


In [55]:
df.isnull().sum()

CRIM       0
ZN         0
INDUS      0
CHAS       0
NX         0
RM         0
AGE        0
DIS        0
RAD        0
TAX        0
PTRATIO    0
B          0
LSTAT      0
MEDV       0
dtype: int64

In [56]:
target_corr = np.abs(df.corrwith(df['MEDV']))
print(target_corr.sort_values(ascending=False))


MEDV       1.000000
LSTAT      0.737663
RM         0.695360
PTRATIO    0.507787
INDUS      0.483725
TAX        0.468536
NX         0.427321
CRIM       0.388305
RAD        0.381626
AGE        0.376955
ZN         0.360445
B          0.333461
DIS        0.249929
CHAS       0.175260
dtype: float64


In [57]:
df = df.drop(['CHAS', 'DIS','B'], axis=1)
df

Unnamed: 0,CRIM,ZN,INDUS,NX,RM,AGE,RAD,TAX,PTRATIO,LSTAT,MEDV
0,0.00632,18.0,2.31,0.538,6.575,65.2,1,296.0,15.3,4.98,24.0
1,0.02731,0.0,7.07,0.469,6.421,78.9,2,242.0,17.8,9.14,21.6
2,0.02729,0.0,7.07,0.469,7.185,61.1,2,242.0,17.8,4.03,34.7
3,0.03237,0.0,2.18,0.458,6.998,45.8,3,222.0,18.7,2.94,33.4
4,0.06905,0.0,2.18,0.458,7.147,54.2,3,222.0,18.7,5.33,36.2
...,...,...,...,...,...,...,...,...,...,...,...
501,0.06263,0.0,11.93,0.573,6.593,69.1,1,273.0,21.0,9.67,22.4
502,0.04527,0.0,11.93,0.573,6.120,76.7,1,273.0,21.0,9.08,20.6
503,0.06076,0.0,11.93,0.573,6.976,91.0,1,273.0,21.0,5.64,23.9
504,0.10959,0.0,11.93,0.573,6.794,89.3,1,273.0,21.0,6.48,22.0


In [58]:
X = df.drop(['MEDV'], axis=1)
y = df['MEDV']
X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=2529,test_size=0.2)

print(X_train.shape)
print(X_test.shape)

(404, 10)
(102, 10)


# Linear Regression with only Standardization and no PowerTransformers

In [59]:
# Applying Regression without any transformation
lr = LinearRegression()
std = StandardScaler()
X_train_transformed = std.fit_transform(X_train)
X_test_transformed = std.transform(X_test)
lr.fit(X_train_transformed,y_train)

y_pred = lr.predict(X_test_transformed)


r2_score(y_test,y_pred)

0.6442041772047282

In [60]:
np.mean(cross_val_score(lr,X,y,scoring='r2', cv=10))

0.105806033547411

In [61]:
# for col in X_train.columns:
#     plt.figure(figsize=(14,4))
#     plt.subplot(121)
#     sns.kdeplot(X_train[col])
#     plt.title(col)

#     plt.subplot(122)
#     stats.probplot(X_train[col], dist="norm", plot=plt)
#     plt.title(col)

#     plt.show()

In [62]:
pt = PowerTransformer(method='box-cox')
pt2 = PowerTransformer()
X_train_transformed_box = pt.fit_transform(X_train+0.000001)
X_test_transformed_box = pt.transform(X_test+0.000001)
X_train_transformed_yeo = pt2.fit_transform(X_train)
X_test_transformed_yeo = pt2.transform(X_test)

In [63]:
# Before/After Transformation comparison using Box-Cox and Yeo-Johnson.

# X_train_transformed_box = pd.DataFrame(X_train_transformed_box,columns=X_train.columns)
# X_train_transformed_yeo = pd.DataFrame(X_train_transformed_yeo,columns=X_train.columns)
# for col in X_train_transformed_box.columns:
#     plt.figure(figsize=(14,4))
#     plt.subplot(131)
#     sns.kdeplot(X_train[col])
#     plt.title(col)

#     plt.subplot(132)
#     sns.kdeplot(X_train_transformed_box[col])
#     plt.title('Box-Cox')
    
#     plt.subplot(133)
#     sns.kdeplot(X_train_transformed_yeo[col])
#     plt.title('Yeo-Johnson')

#     plt.show()

In [64]:
# Fitting the model on data transformed using Box-Cox:-

lr_box = LinearRegression()
lr_box.fit(X_train_transformed_box,y_train)

y_pred = lr_box.predict(X_test_transformed_box)

lnr_r2 = r2_score(y_test,y_pred)

lnr_r2

0.6998280804799626

In [65]:
np.mean(cross_val_score(lr_box,X,y,scoring='r2', cv=10))

0.105806033547411

# Regression Trees

In [66]:
#  Training regression tree models with data(on which only standardization is applied)


reg_tree = DecisionTreeRegressor(random_state=2529)
parameters = {'criterion' : ['squared_error'], 'min_samples_split' : [i for i in range(10,60,5)], 'min_impurity_decrease' : [0.01, 0.05, 0.1, 0.2, 0.25, 0.5, 0.7], 'splitter' : ['best','random'], 'max_depth': [i for i in range(10, 40, 5)]}

tree_grid = GridSearchCV(reg_tree, param_grid = parameters, scoring = ['r2'], refit='r2', cv = 10, verbose=4).fit(X_train_transformed, y_train)

print(tree_grid.best_estimator_)

Fitting 10 folds for each of 840 candidates, totalling 8400 fits
[CV 1/10] END criterion=squared_error, max_depth=10, min_impurity_decrease=0.01, min_samples_split=10, random_state=2529, splitter=best; r2: (test=0.207) total time=   0.0s
[CV 2/10] END criterion=squared_error, max_depth=10, min_impurity_decrease=0.01, min_samples_split=10, random_state=2529, splitter=best; r2: (test=0.634) total time=   0.0s
[CV 3/10] END criterion=squared_error, max_depth=10, min_impurity_decrease=0.01, min_samples_split=10, random_state=2529, splitter=best; r2: (test=0.679) total time=   0.0s
[CV 4/10] END criterion=squared_error, max_depth=10, min_impurity_decrease=0.01, min_samples_split=10, random_state=2529, splitter=best; r2: (test=0.900) total time=   0.0s
[CV 5/10] END criterion=squared_error, max_depth=10, min_impurity_decrease=0.01, min_samples_split=10, random_state=2529, splitter=best; r2: (test=0.830) total time=   0.0s
[CV 6/10] END criterion=squared_error, max_depth=10, min_impurity_decr

In [67]:
print(tree_grid.best_score_)

0.6976376389342946


In [68]:
best_model = tree_grid.best_estimator_
y_pred = best_model.predict(X_test_transformed)
print(r2_score(y_test, y_pred))
reg_tree_r2 = r2_score(y_test, y_pred)

0.7633660471203758


In [69]:
np.mean(cross_val_score(best_model,X,y,scoring='r2',cv=10))

0.18835949419300566

# Regression Tree with Box-Cox transformed data

In [70]:
tree_transformed_grid = GridSearchCV(reg_tree, param_grid = parameters, scoring = ['r2'], refit='r2', cv=10, verbose=4).fit(X_train_transformed_box, y_train)


Fitting 10 folds for each of 840 candidates, totalling 8400 fits
[CV 1/10] END criterion=squared_error, max_depth=10, min_impurity_decrease=0.01, min_samples_split=10, random_state=2529, splitter=best; r2: (test=0.207) total time=   0.0s
[CV 2/10] END criterion=squared_error, max_depth=10, min_impurity_decrease=0.01, min_samples_split=10, random_state=2529, splitter=best; r2: (test=0.631) total time=   0.0s
[CV 3/10] END criterion=squared_error, max_depth=10, min_impurity_decrease=0.01, min_samples_split=10, random_state=2529, splitter=best; r2: (test=0.679) total time=   0.0s
[CV 4/10] END criterion=squared_error, max_depth=10, min_impurity_decrease=0.01, min_samples_split=10, random_state=2529, splitter=best; r2: (test=0.906) total time=   0.0s
[CV 5/10] END criterion=squared_error, max_depth=10, min_impurity_decrease=0.01, min_samples_split=10, random_state=2529, splitter=best; r2: (test=0.830) total time=   0.0s
[CV 6/10] END criterion=squared_error, max_depth=10, min_impurity_decr

In [74]:
print(tree_transformed_grid.best_estimator_)
print(tree_transformed_grid.best_score_)

DecisionTreeRegressor(max_depth=10, min_impurity_decrease=0.1,
                      min_samples_split=10, random_state=2529,
                      splitter='random')
0.7704613822180942


In [75]:
best_model1 = tree_transformed_grid.best_estimator_
y_pred = best_model.predict(X_test_transformed_box)
print(r2_score(y_test, y_pred))

0.6267970542725123


In [76]:
np.mean(cross_val_score(best_model,X,y,scoring='r2',cv=10))

0.18835949419300566

In [77]:
print(f"The best accuracy for Linear Regression model is {lnr_r2}")
print(f"The best accuracy achieved using Regression Tree model is {reg_tree_r2}")

The best accuracy for Linear Regression model is 0.6998280804799626
The best accuracy achieved using Regression Tree model is 0.7633660471203758


In [78]:
#After executing the code below, a file named 'tree.png' would be generated which contains the decision tree image
from sklearn.tree import export_graphviz
export_graphviz(best_model, out_file='tree.dot', filled=True, feature_names=list(X_train.columns))
!dot -Tpng tree.dot -o tree.png