# Boston price prediction

In [1]:
# Import the required libraries
import pandas as pd
import matplotlib.pyplot
import numpy as np
import sklearn.model_selection as model_selection
import sklearn.tree as tree


In [2]:
# Load the price dataset from the CSV file into a DataFrame
df = pd.read_csv(filepath_or_buffer="boston_prices.csv")

# Count the missing entries in every column (displayed as the cell output)
df.isna().sum()

CRIM                             0
ZN                               0
INDUS                            0
Charles River dummy variable     0
nitric oxides concentration      0
#rooms/dwelling                  0
AGE                              0
DIS                              0
RAD                              0
TAX                              0
PTRATIO                          0
B                                0
LSTAT                            0
MEDV                            54
dtype: int64

In [3]:
# Summary statistics of the target column; `count` excludes the missing rows
df.loc[:, 'MEDV'].describe()

count    452.000000
mean      23.750442
std        8.808602
min        6.300000
25%       18.500000
50%       21.950000
75%       26.600000
max       50.000000
Name: MEDV, dtype: float64

In [4]:
# Remove every row that contains a missing value. Only the target column
# (MEDV) has missing entries, and imputing the target variable is risky,
# so those rows are dropped instead.
df = df.dropna(axis=0, how='any')

In [5]:
# Verify that no missing values remain after dropping rows. A bare
# `df.isnull().sum()` in the middle of a cell is neither displayed nor
# checked, so the invariant is asserted explicitly instead.
assert not df.isnull().to_numpy().any(), "missing values remain after dropna()"

# Remaining (rows, columns) -- displayed as the cell output
df.shape

(452, 14)

In [6]:
# Separate the independent variables (every column except MEDV)
# from the dependent variable (MEDV)
X = df.drop(columns=['MEDV'])
y = df.loc[:, 'MEDV']

In [7]:
# Hold out 20% of the rows for testing and train on the remaining 80%;
# the fixed random_state makes the split repeatable
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=200,
)

In [8]:
# Build a shallow decision-tree regressor and fit it on the training data
from sklearn.tree import DecisionTreeRegressor

# random_state is fixed so the fit is reproducible: scikit-learn permutes the
# features at every split and breaks ties between equally good splits at
# random, so an unseeded tree may differ from run to run.
mod = DecisionTreeRegressor(max_depth=3, random_state=200)
mod.fit(X_train,y_train)

# R^2 on the training data (displayed as the cell output)
mod.score(X_train,y_train)

0.843959464659443

In [9]:
# Score the fitted regressor on the held-out test data.
# For a regressor, `score` is the R^2 coefficient (not a classification
# accuracy); it matches the r2_score value printed further below.
mod.score(X=X_test, y=y_test)

0.7552566744590662

In [10]:
# Predict the target for the test split and then for the training split
y_pred_test, y_pred_train = (
    mod.predict(features) for features in (X_test, X_train)
)

In [11]:
# Evaluate the test-set predictions with several regression metrics
from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error,mean_absolute_percentage_error

# Keep each result under its own name. Assigning to `r2_score`,
# `mean_absolute_error`, ... would rebind the imported functions to floats,
# so any later call to them without a fresh import would fail.
r2 = r2_score(y_test, y_pred_test)
mae = mean_absolute_error(y_test, y_pred_test)
mse = mean_squared_error(y_test, y_pred_test)
mape = mean_absolute_percentage_error(y_test, y_pred_test)  # a fraction, not a percent
RMSE = np.sqrt(mse)  # root of the mean squared error

print("R squared score:",r2)
print("Mean Absolute Error:",mae)
print("Mean Squared Error:",mse)
print("Mean Absolute Percentage Error :",mape)
print("RMSE :",RMSE)

R squared score: 0.7552566744590662
Mean Absolute Error: 3.0851478968748327
Mean Squared Error: 18.99164287760048
Mean Absolute Percentage Error : 0.14041265180067483
RMSE : 4.35794021042057


In [12]:
# Inspect the fitted tree's feature importances: one value per feature
# column of X, in column order (the next cell labels them with X.columns).
# A value of 0 means the tree assigned no importance to that feature.
mod.feature_importances_

array([0.01429274, 0.        , 0.        , 0.        , 0.        ,
       0.74251388, 0.        , 0.10898921, 0.        , 0.        ,
       0.00592912, 0.        , 0.12827504])

In [13]:
# Label each importance with its column name, order from most to least
# important and keep the first 8 entries (this can include zero-importance
# features, as the output shows)
Feature_Importance = (
    pd.Series(data=mod.feature_importances_, index=X.columns)
    .sort_values(ascending=False)
    .iloc[:8]
)
Feature_Importance

#rooms/dwelling                 0.742514
LSTAT                           0.128275
DIS                             0.108989
CRIM                            0.014293
PTRATIO                         0.005929
ZN                              0.000000
INDUS                           0.000000
Charles River dummy variable    0.000000
dtype: float64

In [14]:
# Re-run the model on a reduced dataset: the four features with the highest
# importance plus the target. PTRATIO, the only other feature with a
# non-zero importance, is left out.
# Selecting with a column list replaces concatenating the individual Series;
# .copy() keeps df_new independent of df.
df_new = df[['#rooms/dwelling', 'LSTAT', 'DIS', 'CRIM', 'MEDV']].copy()

In [15]:
# Rebuild X and y from the reduced frame; this overwrites the X and y
# created from the full feature set earlier in the notebook
X, y = df_new.drop(columns='MEDV'), df_new.loc[:, 'MEDV']

In [16]:
# Split the reduced data 80/20, using the same test_size and random_state
# as the split of the full feature set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=200,
)

In [17]:
# Fit a fresh depth-3 decision-tree regressor on the reduced feature set
from sklearn.tree import DecisionTreeRegressor

# random_state is fixed so the fit is reproducible: scikit-learn permutes the
# features at every split and breaks ties between equally good splits at
# random, so an unseeded tree may differ from run to run.
mod = DecisionTreeRegressor(max_depth=3, random_state=200)
mod.fit(X_train,y_train)

# R^2 on the training data (displayed as the cell output)
mod.score(X_train,y_train)

0.843735513598761

In [18]:
# R^2 of the reduced-feature model on the held-out test data
mod.score(X=X_test, y=y_test)

0.7018296697846882

In [19]:
# Predictions of the reduced-feature model: test split first, then training split
y_pred_test, y_pred_train = map(mod.predict, (X_test, X_train))

In [20]:
# NOTE(review): this cell recomputes the same predictions as the cell
# directly before it; it looks redundant and could probably be removed.
y_pred_test = mod.predict(X=X_test)
y_pred_train = mod.predict(X=X_train)

In [21]:
# Evaluate the reduced-feature model's test-set predictions
from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error,mean_absolute_percentage_error

# Keep each result under its own name. Assigning to `r2_score`,
# `mean_absolute_error`, ... would rebind the imported functions to floats,
# so any later call to them without a fresh import would fail.
r2 = r2_score(y_test, y_pred_test)
mae = mean_absolute_error(y_test, y_pred_test)
mse = mean_squared_error(y_test, y_pred_test)
mape = mean_absolute_percentage_error(y_test, y_pred_test)  # a fraction, not a percent
RMSE = np.sqrt(mse)  # root of the mean squared error

print("R squared score:",r2)
print("Mean Absolute Error:",mae)
print("Mean Squared Error:",mse)
print("Mean Absolute Percentage Error :",mape)
print("RMSE :",RMSE)

R squared score: 0.7018296697846882
Mean Absolute Error: 3.2132970756312975
Mean Squared Error: 23.137482567214292
Mean Absolute Percentage Error : 0.14508536866968683
RMSE : 4.810143715858632
