In [52]:
# importing required libraries
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score  
from sklearn import datasets
%matplotlib inline

In [15]:
#loading the dataset
boston = datasets.load_boston()
print(boston.DESCR)

.. _boston_dataset:

Boston house prices dataset
---------------------------

**Data Set Characteristics:**  

    :Number of Instances: 506 

    :Number of Attributes: 13 numeric/categorical predictive. Median Value (attribute 14) is usually the target.

    :Attribute Information (in order):
        - CRIM     per capita crime rate by town
        - ZN       proportion of residential land zoned for lots over 25,000 sq.ft.
        - INDUS    proportion of non-retail business acres per town
        - CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
        - NOX      nitric oxides concentration (parts per 10 million)
        - RM       average number of rooms per dwelling
        - AGE      proportion of owner-occupied units built prior to 1940
        - DIS      weighted distances to five Boston employment centres
        - RAD      index of accessibility to radial highways
        - TAX      full-value property-tax rate per $10,000
        - PTRATIO  pu

In [8]:
#converting the data to dataframe format
data = pd.DataFrame(boston.data,columns=boston.feature_names)
data['PRICE'] = targets = boston.target
data.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,PRICE
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33,36.2


In [9]:
data.info() # describe the summary information of data

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506 entries, 0 to 505
Data columns (total 14 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   CRIM     506 non-null    float64
 1   ZN       506 non-null    float64
 2   INDUS    506 non-null    float64
 3   CHAS     506 non-null    float64
 4   NOX      506 non-null    float64
 5   RM       506 non-null    float64
 6   AGE      506 non-null    float64
 7   DIS      506 non-null    float64
 8   RAD      506 non-null    float64
 9   TAX      506 non-null    float64
 10  PTRATIO  506 non-null    float64
 11  B        506 non-null    float64
 12  LSTAT    506 non-null    float64
 13  PRICE    506 non-null    float64
dtypes: float64(14)
memory usage: 55.5 KB


In [10]:
data.isnull().sum() # check if there are anu null values in any of the features

CRIM       0
ZN         0
INDUS      0
CHAS       0
NOX        0
RM         0
AGE        0
DIS        0
RAD        0
TAX        0
PTRATIO    0
B          0
LSTAT      0
PRICE      0
dtype: int64

In [17]:
data['CHAS'].value_counts() #from values count we come to know that it is a categorical data

0.0    471
1.0     35
Name: CHAS, dtype: int64

In [18]:
data['RAD'].value_counts()

24.0    132
5.0     115
4.0     110
3.0      38
6.0      26
8.0      24
2.0      24
1.0      20
7.0      17
Name: RAD, dtype: int64

In [25]:
Chas = pd.get_dummies(data['CHAS'],drop_first=True) #converting th categorical data to one hot encoding

In [26]:
Chas

Unnamed: 0,1.0
0,0
1,0
2,0
3,0
4,0
...,...
501,0
502,0
503,0
504,0


In [27]:
Rad = pd.get_dummies(data['RAD'],drop_first=True)

In [28]:
Rad

Unnamed: 0,2.0,3.0,4.0,5.0,6.0,7.0,8.0,24.0
0,0,0,0,0,0,0,0,0
1,1,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0
3,0,1,0,0,0,0,0,0
4,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...
501,0,0,0,0,0,0,0,0
502,0,0,0,0,0,0,0,0
503,0,0,0,0,0,0,0,0
504,0,0,0,0,0,0,0,0


In [29]:
#preparning the final data for model traning
data = pd.concat([data,Chas,Rad],axis=1).drop(columns=['CHAS', 'RAD'])
data.head()

Unnamed: 0,CRIM,ZN,INDUS,NOX,RM,AGE,DIS,TAX,PTRATIO,B,...,PRICE,1.0,2.0,3.0,4.0,5.0,6.0,7.0,8.0,24.0
0,0.00632,18.0,2.31,0.538,6.575,65.2,4.09,296.0,15.3,396.9,...,24.0,0,0,0,0,0,0,0,0,0
1,0.02731,0.0,7.07,0.469,6.421,78.9,4.9671,242.0,17.8,396.9,...,21.6,0,1,0,0,0,0,0,0,0
2,0.02729,0.0,7.07,0.469,7.185,61.1,4.9671,242.0,17.8,392.83,...,34.7,0,1,0,0,0,0,0,0,0
3,0.03237,0.0,2.18,0.458,6.998,45.8,6.0622,222.0,18.7,394.63,...,33.4,0,0,1,0,0,0,0,0,0
4,0.06905,0.0,2.18,0.458,7.147,54.2,6.0622,222.0,18.7,396.9,...,36.2,0,0,1,0,0,0,0,0,0


In [66]:
X = data.drop(columns='PRICE')
Y = data['PRICE']

In [67]:
# scaling the data
scalar = StandardScaler()
X = scalar.fit_transform(X)

In [68]:
#splitiing the data into two parts training and testing
x_train,x_test,y_train,y_test = train_test_split(X,Y,test_size = 0.30, random_state= 355)

In [69]:
#initialising the random forest classifier
rand_clf = RandomForestRegressor(random_state=6)

In [74]:
# tuning hyperparameters, we are passing the different values for both parameters
grid_param = {
    "n_estimators" : range(1,30,5),
    'criterion': ['mse', 'mae'],
    'max_depth' : range(2,20,1),
    'min_samples_leaf' : range(1,10,1),
    'min_samples_split': range(2,10,1),
    'max_features' : ['auto','log2']
}

In [75]:
grid_search = GridSearchCV(estimator=rand_clf,param_grid=grid_param,cv=5,n_jobs =-1,verbose = 3)

In [76]:
grid_search.fit(x_train,y_train)

Fitting 5 folds for each of 31104 candidates, totalling 155520 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:    4.3s
[Parallel(n_jobs=-1)]: Done 344 tasks      | elapsed:   10.3s
[Parallel(n_jobs=-1)]: Done 984 tasks      | elapsed:   23.7s
[Parallel(n_jobs=-1)]: Done 1880 tasks      | elapsed:   41.9s
[Parallel(n_jobs=-1)]: Done 3032 tasks      | elapsed:  1.0min
[Parallel(n_jobs=-1)]: Done 4440 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 6104 tasks      | elapsed:  2.0min
[Parallel(n_jobs=-1)]: Done 8024 tasks      | elapsed:  2.6min
[Parallel(n_jobs=-1)]: Done 10200 tasks      | elapsed:  3.4min
[Parallel(n_jobs=-1)]: Done 12632 tasks      | elapsed:  4.2min
[Parallel(n_jobs=-1)]: Done 15320 tasks      | elapsed:  5.2min
[Parallel(n_jobs=-1)]: Done 18264 tasks      | elapsed:  6.2min
[Parallel(n_jobs=-1)]: Done 21464 tasks      | elapsed:  7.3min
[Parallel(n_jobs=-1)]: Done 24920 tasks      | elapsed:  8.6min
[Parallel(n_jobs=-1)]: Done 28632 tasks

GridSearchCV(cv=5, error_score=nan,
             estimator=RandomForestRegressor(bootstrap=True, ccp_alpha=0.0,
                                             criterion='mse', max_depth=None,
                                             max_features='auto',
                                             max_leaf_nodes=None,
                                             max_samples=None,
                                             min_impurity_decrease=0.0,
                                             min_impurity_split=None,
                                             min_samples_leaf=1,
                                             min_samples_split=2,
                                             min_weight_fraction_leaf=0.0,
                                             n_estimators=100, n_jobs=None,
                                             oob_score=False, random_state=6,
                                             verbose=0, warm_start=False),
             iid='deprecated', n_jobs=

In [77]:
#let's see the best parameters as per our grid search
grid_search.best_params_

{'criterion': 'mae',
 'max_depth': 19,
 'max_features': 'log2',
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'n_estimators': 21}

In [78]:
#again initialising the classifie but with the best parameters
rand_clf = RandomForestRegressor(criterion= 'mae',
 max_depth = 19,
 max_features = 'log2',
 min_samples_leaf = 1,
 min_samples_split= 2,
 n_estimators = 21,random_state=6)

In [79]:
rand_clf.fit(x_train,y_train)

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mae',
                      max_depth=19, max_features='log2', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=21, n_jobs=None, oob_score=False,
                      random_state=6, verbose=0, warm_start=False)

In [81]:
rand_clf.score(x_test,y_test) * 100

89.0843561476469