# Boston price prediction

In [1]:
# Import the required libraries
import pandas as pd
import matplotlib.pyplot
import numpy as np
import sklearn.model_selection as model_selection
import sklearn.tree as tree


In [2]:
# Read the data
df = pd.read_csv("boston_prices.csv")

# Check for missing values
df.isnull().sum()

CRIM                             0
ZN                               0
INDUS                            0
Charles River dummy variable     0
nitric oxides concentration      0
#rooms/dwelling                  0
AGE                              0
DIS                              0
RAD                              0
TAX                              0
PTRATIO                          0
B                                0
LSTAT                            0
MEDV                            54
dtype: int64

In [3]:
df['MEDV'].describe()

count    452.000000
mean      23.750442
std        8.808602
min        6.300000
25%       18.500000
50%       21.950000
75%       26.600000
max       50.000000
Name: MEDV, dtype: float64

In [4]:
# Drop the missing value as risky to impute the missing value in Target variable
df = df.dropna()

In [5]:
df.isnull().sum()
df.shape

(452, 14)

In [6]:
# Create dataframe with Dependent and Independent variable separately
X = df.drop('MEDV',axis=1)
y = df['MEDV']

In [7]:
# Split the training and testing data with 80/20
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=200)

In [8]:
# Create a model and fit into training data
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
mod = GradientBoostingRegressor(n_estimators=80)
mod.fit(X_train,y_train)
mod.score(X_train,y_train)

0.9733193660347284

In [9]:
# Check the model accuracy in test data
mod.score(X_test,y_test)

0.8381397564807751

In [10]:
# Compute the best estimator using GridSearchCV
from sklearn.model_selection import GridSearchCV
mod=GridSearchCV(mod,param_grid={'n_estimators':[60,80,100,120,140,160]})
mod.fit(X_train,y_train)

GridSearchCV(estimator=GradientBoostingRegressor(n_estimators=80),
             param_grid={'n_estimators': [60, 80, 100, 120, 140, 160]})

In [11]:
mod.best_estimator_

GradientBoostingRegressor()

In [12]:
# Create a model with best estimator and fit into training data
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
mod = GradientBoostingRegressor(n_estimators=160)
mod.fit(X_train,y_train)
mod.score(X_train,y_train)

0.9878894239918318

In [13]:
# Check the model accuracy in test data
mod.score(X_test,y_test)

0.839371025528903

In [14]:
# Get the model predicted values
y_pred_test = mod.predict(X_test)
y_pred_train = mod.predict(X_train)

In [15]:
from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error,mean_absolute_percentage_error
r2_score = r2_score(y_test,y_pred_test)
mean_absolute_error = mean_absolute_error(y_test,y_pred_test)
mean_squared_error = mean_squared_error(y_test,y_pred_test)
mean_absolute_percentage_error = mean_absolute_percentage_error(y_test,y_pred_test)
RMSE = np.sqrt(mean_squared_error)
print("R squared score:",r2_score)
print("Mean Absolute Error:",mean_absolute_error)
print("Mean Squared Error:",mean_squared_error)
print("Mean Absolute Percentage Error :",mean_absolute_percentage_error)
print("RMSE :",RMSE)

R squared score: 0.839371025528903
Mean Absolute Error: 2.0446550162037007
Mean Squared Error: 12.464520175199056
Mean Absolute Percentage Error : 0.09179522838973256
RMSE : 3.5305127354534576


In [16]:
# Check for feature importances
mod.feature_importances_

array([1.94556571e-02, 4.37368010e-04, 3.34150370e-03, 2.07387889e-03,
       8.81889351e-03, 5.98528972e-01, 9.75704036e-03, 8.51750536e-02,
       3.25354957e-03, 1.62136048e-02, 2.66867685e-02, 6.99580950e-03,
       2.19261901e-01])

In [17]:
# Get the series of features which are important  
Feature_Importance = pd.Series(mod.feature_importances_,index=X.columns).sort_values(ascending=False).head(8)
Feature_Importance

#rooms/dwelling                0.598529
LSTAT                          0.219262
DIS                            0.085175
PTRATIO                        0.026687
CRIM                           0.019456
TAX                            0.016214
AGE                            0.009757
nitric oxides concentration    0.008819
dtype: float64

In [18]:
# Try to run the model with only those features and see how it performs
df_new = pd.concat([df['#rooms/dwelling'],df['LSTAT'],df['DIS'],df['CRIM'],df['MEDV']],axis=1,join='outer')

In [19]:
X = df_new.drop('MEDV',axis=1)
y = df_new['MEDV']

In [20]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=200)

In [21]:
from sklearn.ensemble import BaggingRegressor
from sklearn.tree import DecisionTreeRegressor
mod = GradientBoostingRegressor(n_estimators=160)
mod.fit(X_train,y_train)
mod.score(X_train,y_train)

0.9791674258287358

In [22]:
mod.score(X_test,y_test)

0.7673896337504081

In [23]:
y_pred_test = mod.predict(X_test)
y_pred_train = mod.predict(X_train)

In [24]:
y_pred_test = mod.predict(X_test)
y_pred_train = mod.predict(X_train)

In [25]:
from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error,mean_absolute_percentage_error
r2_score = r2_score(y_test,y_pred_test)
mean_absolute_error = mean_absolute_error(y_test,y_pred_test)
mean_squared_error = mean_squared_error(y_test,y_pred_test)
mean_absolute_percentage_error = mean_absolute_percentage_error(y_test,y_pred_test)
RMSE = np.sqrt(mean_squared_error)
print("R squared score:",r2_score)
print("Mean Absolute Error:",mean_absolute_error)
print("Mean Squared Error:",mean_squared_error)
print("Mean Absolute Percentage Error :",mean_absolute_percentage_error)
print("RMSE :",RMSE)

R squared score: 0.7673896337504081
Mean Absolute Error: 2.4697959708543347
Mean Squared Error: 18.05014700881491
Mean Absolute Percentage Error : 0.11049075655129899
RMSE : 4.2485464583566594
