# 27-Nov-2019

## Random Forest

In [1]:
import pandas as pd
df = pd.read_csv('diabetes.csv')
print(df.shape)
df.head()

(768, 9)


Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [2]:
df.isnull().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

In [3]:
x = df.drop('Outcome',axis=1)
y = df['Outcome']

In [4]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.35,random_state=3)
x_train.shape,x_test.shape

((499, 8), (269, 8))

In [5]:
from sklearn.ensemble import RandomForestClassifier

In [6]:
rand_forest = RandomForestClassifier()
rand_forest.fit(x,y)



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [7]:
predict = rand_forest.predict(x_test)

In [19]:
rand_forest.predict([[6,148,72,35,0,33.6,0.627,50]])

array([1], dtype=int64)

In [25]:
rand_forest.predict([[1,85,66,29,0,26.6,0.351,31]])

array([0], dtype=int64)

In [20]:
rand_forest.predict([[6,48,72,35,0,33.6,0.627,50]])

array([0], dtype=int64)

In [10]:
from sklearn.metrics import roc_auc_score
score = roc_auc_score(y_test,predict)
score

0.978958143767061

In [11]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report,confusion_matrix

In [12]:
rfc_cv_score = cross_val_score(rand_forest,x,y,cv=10,scoring='roc_auc')

In [13]:
print("==== Confusion Matrix ====")
print(confusion_matrix(y_test,predict))
print('\n')
print("==== Classification Report ====")
print(classification_report(y_test,predict))
print('\n')
print("=== All AUC Scores ===")
print(rfc_cv_score)
print('\n')
print("=== Mean AUC Score ===")
print("Mean AUC Score - Random Forest: ",rfc_cv_score.mean())
# AUC ---> Area Under Curve

==== Confusion Matrix ====
[[156   1]
 [  4 108]]


==== Classification Report ====
              precision    recall  f1-score   support

           0       0.97      0.99      0.98       157
           1       0.99      0.96      0.98       112

    accuracy                           0.98       269
   macro avg       0.98      0.98      0.98       269
weighted avg       0.98      0.98      0.98       269



=== All AUC Scores ===
[0.76777778 0.8137037  0.81740741 0.71185185 0.74888889 0.81
 0.83555556 0.85481481 0.70192308 0.85384615]


=== Mean AUC Score ===
Mean AUC Score - Random Forest:  0.7915769230769231


In [14]:
# The next thing is we will tune our hyperparameters(parameter whose value is high )
# improve the performance of the model
import numpy as np
from sklearn.model_selection import RandomizedSearchCV
n_estimators = [int(x) for x in np.linspace(start=200,stop=2000,num=10)]
# number of features at every split
max_features = ['auto','sqrt']
max_depth = [int(x) for x in np.linspace(100,500,num=11)]
max_depth.append(None)
# create random grid
random_grid = {'n_estimators':n_estimators,'max_features':max_features,'max_depth':max_depth}
# random search of parameters
rfc_random = RandomizedSearchCV(estimator=rand_forest,param_distributions = random_grid,n_iter = 100, cv=3,
                               verbose=2, random_state=42,n_jobs=-1)
# fit the model
rfc_random.fit(x_train,y_train)
# print results
rfc_random.best_params_

Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   17.3s
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:  2.2min finished


{'n_estimators': 1000, 'max_features': 'auto', 'max_depth': 500}

In [16]:
rfc = RandomForestClassifier(n_estimators=1000,max_features='auto',max_depth=500)
rfc.fit(x_train,y_train)
predict = rfc.predict(x_test)
rfc_cv_score = cross_val_score(rand_forest,x,y,cv=10,scoring='roc_auc')
print("==== Confusion Matrix ====")
print(confusion_matrix(y_test,predict))
print('\n')
print("==== Classification Report ====")
print(classification_report(y_test,predict))
print('\n')
print("=== All AUC Scores ===")
print(rfc_cv_score)
print('\n')
print("=== Mean AUC Score ===")
print("Mean AUC Score - Random Forest: ",rfc_cv_score.mean())

==== Confusion Matrix ====
[[136  21]
 [ 46  66]]


==== Classification Report ====
              precision    recall  f1-score   support

           0       0.75      0.87      0.80       157
           1       0.76      0.59      0.66       112

    accuracy                           0.75       269
   macro avg       0.75      0.73      0.73       269
weighted avg       0.75      0.75      0.74       269



=== All AUC Scores ===
[0.75814815 0.81037037 0.82925926 0.74       0.79777778 0.82555556
 0.83       0.85666667 0.80076923 0.82692308]


=== Mean AUC Score ===
Mean AUC Score - Random Forest:  0.8075470085470086
