# Boston price prediction

In [1]:
# Import the required libraries
import pandas as pd
import matplotlib.pyplot
import numpy as np
import sklearn.model_selection as model_selection
import sklearn.tree as tree


In [2]:
# Load the Boston housing data from a CSV file in the working directory
df = pd.read_csv(filepath_or_buffer="boston_prices.csv")

# Count the missing entries in every column
df.isna().sum()

CRIM                             0
ZN                               0
INDUS                            0
Charles River dummy variable     0
nitric oxides concentration      0
#rooms/dwelling                  0
AGE                              0
DIS                              0
RAD                              0
TAX                              0
PTRATIO                          0
B                                0
LSTAT                            0
MEDV                            54
dtype: int64

In [3]:
# Summary statistics of the target column (missing values are excluded from the count)
df.loc[:, 'MEDV'].describe()

count    452.000000
mean      23.750442
std        8.808602
min        6.300000
25%       18.500000
50%       21.950000
75%       26.600000
max       50.000000
Name: MEDV, dtype: float64

In [4]:
# Remove every row that contains a missing value. The null counts above show that
# only the target (MEDV) has gaps, and imputing a target variable is risky.
df = df.dropna(axis=0, how='any')

In [5]:
# Verify that no missing values remain, then display the resulting shape.
# Only the last expression of a cell is displayed, so a bare `df.isnull().sum()`
# placed before `df.shape` would be computed and silently thrown away; an
# assertion makes the check effective.
assert df.isnull().sum().sum() == 0, "missing values remain after dropna()"
df.shape

(452, 14)

In [6]:
# Separate the predictors (every column except MEDV) from the target (MEDV)
X = df.drop(columns=['MEDV'])
y = df.MEDV

In [7]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=200,
)

In [9]:
# Fit a random forest on the training data and display its R^2 on that same data
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
# random_state pins the bootstrap samples and per-split feature choices, so the
# scores and feature importances computed from this model are reproducible
# after a kernel restart (the train/test split is already seeded the same way).
mod = RandomForestRegressor(n_estimators=80,n_jobs=-1,oob_score=True,random_state=200)
mod.fit(X_train,y_train)
mod.score(X_train,y_train)

0.9769469713044384

In [10]:
# R^2 of the fitted model on the held-out test data
mod.score(X=X_test, y=y_test)

0.8100120403350546

In [11]:
# Predict the target for the test rows and for the training rows
y_pred_test, y_pred_train = (mod.predict(features) for features in (X_test, X_train))

In [12]:
# Evaluate the test-set predictions with several regression metrics
from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error,mean_absolute_percentage_error
# Each result gets its own name. Assigning a score back to the name of the
# metric function (e.g. `r2_score = r2_score(...)`) would replace the function
# with a float for the rest of the kernel session.
r2 = r2_score(y_test,y_pred_test)
mae = mean_absolute_error(y_test,y_pred_test)
mse = mean_squared_error(y_test,y_pred_test)
mape = mean_absolute_percentage_error(y_test,y_pred_test)
# RMSE is the square root of the mean squared error
RMSE = np.sqrt(mse)
print("R squared score:",r2)
print("Mean Absolute Error:",mae)
print("Mean Squared Error:",mse)
print("Mean Absolute Percentage Error :",mape)
print("RMSE :",RMSE)

R squared score: 0.8100120403350546
Mean Absolute Error: 2.272266483516483
Mean Squared Error: 14.742724742445052
Mean Absolute Percentage Error : 0.09934822714743918
RMSE : 3.8396255992537935


In [13]:
# Display the fitted model's feature importances: one value per column of the
# training data, in the same order as the columns
mod.feature_importances_

array([0.02910071, 0.00079489, 0.00886358, 0.0016624 , 0.00619454,
       0.62391107, 0.0169729 , 0.0574193 , 0.00492844, 0.01168236,
       0.01695741, 0.01062139, 0.210891  ])

In [14]:
# Label each importance with its column name, sort from most to least important,
# and keep the eight strongest features
Feature_Importance = (
    pd.Series(mod.feature_importances_, index=X.columns)
    .sort_values(ascending=False)
    .iloc[:8]
)
Feature_Importance

#rooms/dwelling    0.623911
LSTAT              0.210891
DIS                0.057419
CRIM               0.029101
AGE                0.016973
PTRATIO            0.016957
TAX                0.011682
B                  0.010621
dtype: float64

In [15]:
# Build a reduced frame holding four of the top-ranked features plus the target,
# to see how a model performs on those features alone
df_new = df[['#rooms/dwelling', 'LSTAT', 'DIS', 'CRIM', 'MEDV']].copy()

In [16]:
# Predictors and target taken from the reduced frame
X = df_new.drop(columns=['MEDV'])
y = df_new.loc[:, 'MEDV']

In [17]:
# Same 80/20 split and seed as used for the full feature set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2, random_state=200,
)

In [18]:
# Fit a bagged ensemble of decision trees on the reduced feature set and display
# its R^2 on the training data
from sklearn.ensemble import BaggingRegressor
from sklearn.tree import DecisionTreeRegressor
# The base learner is passed positionally: its keyword was renamed from
# `base_estimator` to `estimator` in scikit-learn 1.2 and the old name was later
# removed, whereas the first positional argument works in both old and new releases.
# random_state makes the bootstrap draws (and therefore the scores) repeatable.
mod = BaggingRegressor(DecisionTreeRegressor(),n_estimators=80,n_jobs=-1,oob_score=True,random_state=200)
mod.fit(X_train,y_train)
mod.score(X_train,y_train)

0.9778921053890022

In [19]:
# R^2 of the bagging model on the held-out test data
mod.score(X=X_test, y=y_test)

0.7704870037911883

In [20]:
# Predictions of the bagging model for the test rows and the training rows
y_pred_test, y_pred_train = map(mod.predict, (X_test, X_train))

In [21]:
# NOTE(review): this cell repeats the prediction step of the preceding cell
# (In [20]); it recomputes the same two variables and looks safe to delete.
y_pred_test = mod.predict(X_test)
y_pred_train = mod.predict(X_train)

In [22]:
# Evaluate the reduced-feature model's test-set predictions
from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error,mean_absolute_percentage_error
# Each result gets its own name. Assigning a score back to the name of the
# metric function (e.g. `r2_score = r2_score(...)`) would replace the function
# with a float for the rest of the kernel session.
r2 = r2_score(y_test,y_pred_test)
mae = mean_absolute_error(y_test,y_pred_test)
mse = mean_squared_error(y_test,y_pred_test)
mape = mean_absolute_percentage_error(y_test,y_pred_test)
# RMSE is the square root of the mean squared error
RMSE = np.sqrt(mse)
print("R squared score:",r2)
print("Mean Absolute Error:",mae)
print("Mean Squared Error:",mse)
print("Mean Absolute Percentage Error :",mape)
print("RMSE :",RMSE)

R squared score: 0.7704870037911883
Mean Absolute Error: 2.447431318681318
Mean Squared Error: 17.809796651785717
Mean Absolute Percentage Error : 0.10636976503602619
RMSE : 4.2201654768250165
