# Boston housing price prediction

In [1]:
# Import the required libraries
import pandas as pd
import matplotlib.pyplot
import numpy as np
import sklearn.model_selection as model_selection
import sklearn.tree as tree


In [2]:
# Load the dataset from the CSV file
df = pd.read_csv("boston_prices.csv")

# Count the missing entries in every column
df.isna().sum()

CRIM                             0
ZN                               0
INDUS                            0
Charles River dummy variable     0
nitric oxides concentration      0
#rooms/dwelling                  0
AGE                              0
DIS                              0
RAD                              0
TAX                              0
PTRATIO                          0
B                                0
LSTAT                            0
MEDV                            54
dtype: int64

In [3]:
# Summary statistics of the target column
df.loc[:, 'MEDV'].describe()

count    452.000000
mean      23.750442
std        8.808602
min        6.300000
25%       18.500000
50%       21.950000
75%       26.600000
max       50.000000
Name: MEDV, dtype: float64

In [4]:
# Discard every row that contains a missing value: imputing the target
# variable would be risky, so incomplete rows are removed instead.
df = df.dropna(axis=0, how='any')

In [5]:
# Verify that no missing values are left, then show the resulting shape.
# (A bare expression that is not the last line of a cell is never displayed,
# so the check is written as an assertion instead of being silently dropped.)
assert not df.isnull().values.any(), "missing values remain in df"
df.shape

(452, 14)

In [6]:
# Separate the target column (y) from the predictor columns (X)
y = df['MEDV']
X = df.drop(columns=['MEDV'])

In [7]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=200,
)

In [8]:
# Build a bagged ensemble of decision trees, fit it on the training data and
# report its score on that same training data.
from sklearn.ensemble import BaggingRegressor
from sklearn.tree import DecisionTreeRegressor
# The base learner is passed positionally: its keyword was renamed from
# `base_estimator` to `estimator` in scikit-learn 1.2 (and the old name was
# removed in 1.4), whereas the first positional slot works on both.
# random_state fixes the bootstrap resampling so the scores are repeatable.
mod = BaggingRegressor(
    DecisionTreeRegressor(),
    n_estimators=80,
    n_jobs=-1,
    oob_score=True,
    random_state=200,
)
mod.fit(X_train,y_train)
mod.score(X_train,y_train)

0.977324374487053

In [9]:
# Score the fitted model (R^2) on the held-out test data
mod.score(X=X_test, y=y_test)

0.8070875128638447

In [10]:
# Predict the target for the training rows and for the test rows
y_pred_train = mod.predict(X_train)
y_pred_test = mod.predict(X_test)

In [11]:
# Evaluate the test-set predictions with several regression metrics
from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error,mean_absolute_percentage_error
# Each result gets its own name so that the imported metric functions are not
# overwritten by floats and remain callable in later cells.
r2 = r2_score(y_test,y_pred_test)
mae = mean_absolute_error(y_test,y_pred_test)
mse = mean_squared_error(y_test,y_pred_test)
mape = mean_absolute_percentage_error(y_test,y_pred_test)
# Root of the mean squared error
RMSE = np.sqrt(mse)
print("R squared score:",r2)
print("Mean Absolute Error:",mae)
print("Mean Squared Error:",mse)
print("Mean Absolute Percentage Error :",mape)
print("RMSE :",RMSE)

R squared score: 0.8070875128638447
Mean Absolute Error: 2.3329258241758244
Mean Squared Error: 14.969662826236265
Mean Absolute Percentage Error : 0.10187744347764167
RMSE : 3.8690648516451964


In [12]:
# Inspect the feature importances of the first tree in the ensemble
first_tree = mod.estimators_[0]
first_tree.feature_importances_

array([6.92186218e-03, 2.32784610e-04, 1.00059455e-02, 7.07259061e-04,
       1.31993955e-03, 2.28127878e-01, 5.87494195e-03, 1.39071479e-01,
       9.13619203e-03, 8.49164848e-03, 1.87873870e-02, 1.79732783e-02,
       5.53349405e-01])

In [13]:
# Collect the feature importances of every tree in the ensemble and average
# them feature by feature
imp = np.mean([est.feature_importances_ for est in mod.estimators_], axis=0)

In [14]:
# Label the averaged importances with the feature names, order them from
# largest to smallest and keep the first eight
Feature_Importance = (
    pd.Series(data=imp, index=X.columns)
    .sort_values(ascending=False)
    .iloc[:8]
)
Feature_Importance

#rooms/dwelling                0.565321
LSTAT                          0.268220
DIS                            0.058038
CRIM                           0.027429
PTRATIO                        0.015969
AGE                            0.015424
nitric oxides concentration    0.013050
B                              0.012635
dtype: float64

In [15]:
# Keep only the four highest-ranked features plus the target, to see how the
# model performs on this reduced set.
# Selecting the columns by name gives the same frame as concatenating the
# individual Series; .copy() keeps df_new independent of df.
df_new = df[['#rooms/dwelling','LSTAT','DIS','CRIM','MEDV']].copy()

In [16]:
# Rebuild the target (y) and the predictors (X) from the reduced frame
y = df_new['MEDV']
X = df_new.drop(columns=['MEDV'])

In [17]:
# Split the reduced data 80/20 with the same fixed seed as before
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=200,
)

In [18]:
# Fit a fresh bagged ensemble of decision trees on the reduced feature set and
# report its score on the training data.
from sklearn.ensemble import BaggingRegressor
from sklearn.tree import DecisionTreeRegressor
# The base learner is passed positionally: its keyword was renamed from
# `base_estimator` to `estimator` in scikit-learn 1.2 (and the old name was
# removed in 1.4), whereas the first positional slot works on both.
# random_state fixes the bootstrap resampling so the scores are repeatable.
mod = BaggingRegressor(
    DecisionTreeRegressor(),
    n_estimators=80,
    n_jobs=-1,
    oob_score=True,
    random_state=200,
)
mod.fit(X_train,y_train)
mod.score(X_train,y_train)

0.9792203499067382

In [19]:
# Score the reduced-feature model (R^2) on the held-out test data
mod.score(X=X_test, y=y_test)

0.7828145825609392

In [20]:
# Predict the target for the training rows and for the test rows
y_pred_train = mod.predict(X_train)
y_pred_test = mod.predict(X_test)

In [21]:
# The predictions were already computed in the previous cell; instead of
# recomputing them, confirm there is exactly one prediction per row.
assert len(y_pred_test) == len(y_test), "test predictions do not match y_test"
assert len(y_pred_train) == len(y_train), "train predictions do not match y_train"

In [22]:
# Evaluate the reduced-feature model's test-set predictions
# (the import is repeated here so the metric names are guaranteed to refer to
# the functions, whatever earlier cells assigned to them)
from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error,mean_absolute_percentage_error
# Each result gets its own name so that the imported metric functions are not
# overwritten by floats and remain callable if the cell is run again.
r2 = r2_score(y_test,y_pred_test)
mae = mean_absolute_error(y_test,y_pred_test)
mse = mean_squared_error(y_test,y_pred_test)
mape = mean_absolute_percentage_error(y_test,y_pred_test)
# Root of the mean squared error
RMSE = np.sqrt(mse)
print("R squared score:",r2)
print("Mean Absolute Error:",mae)
print("Mean Squared Error:",mse)
print("Mean Absolute Percentage Error :",mape)
print("RMSE :",RMSE)

R squared score: 0.7828145825609392
Mean Absolute Error: 2.3925000000000005
Mean Squared Error: 16.85319866071429
Mean Absolute Percentage Error : 0.1053950242125069
RMSE : 4.105264749162262
