# Random Forest Classifier on Heart Disease Data
###### Data ubtained from `jbrownlee`'s GitHub Repository for Diabetes in Indias

In [49]:
import numpy as np
import pandas as pd

#list for column headers
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']

#open file with pd.read_csv
df = pd.read_csv("https://raw.githubusercontent.com/jbrownlee/Datasets/master/pima-indians-diabetes.data.csv", names=names)
print(df.shape)

#print head of data set
print(df.head())

(768, 9)
   preg  plas  pres  skin  test  mass   pedi  age  class
0     6   148    72    35     0  33.6  0.627   50      1
1     1    85    66    29     0  26.6  0.351   31      0
2     8   183    64     0     0  23.3  0.672   32      1
3     1    89    66    23    94  28.1  0.167   21      0
4     0   137    40    35   168  43.1  2.288   33      1


In [50]:
X = df.drop('class', axis=1)
Y = df['class']

In [51]:
from sklearn.model_selection import train_test_split

#implementing train-test-split
X_train, X_test, Y_train, Y_test, = train_test_split(X, Y, test_size=0.33, random_state=66)

In [52]:
from sklearn.ensemble import RandomForestClassifier

#random forest model creation
rfc = RandomForestClassifier()
rfc.fit(X_train, Y_train)

#predictions
rfc_predict = rfc.predict(X_test)



In [53]:

from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report, confusion_matrix
rfc_cv_score = cross_val_score(rfc, X, Y, cv =10, scoring='roc_auc')

In [54]:
print("=== Confusion Matrix ===")
print(confusion_matrix(Y_test, rfc_predict))
print('\n')
print("=== Classification Report ===")
print(classification_report(Y_test, rfc_predict))
print('\n')
print("=== All AUC Scores ===")
print(rfc_cv_score)
print('\n')
print("=== Mean AUC Score ===")
print("Mean AUC Score - Random Forest: ", rfc_cv_score.mean())

=== Confusion Matrix ===
[[153  23]
 [ 39  39]]


=== Classification Report ===
              precision    recall  f1-score   support

           0       0.80      0.87      0.83       176
           1       0.63      0.50      0.56        78

    accuracy                           0.76       254
   macro avg       0.71      0.68      0.69       254
weighted avg       0.75      0.76      0.75       254



=== All AUC Scores ===
[0.75185185 0.81851852 0.79740741 0.70259259 0.75666667 0.81888889
 0.79703704 0.86481481 0.73153846 0.81884615]


=== Mean AUC Score ===
Mean AUC Score - Random Forest:  0.7858162393162393


In [55]:
from sklearn.model_selection import RandomizedSearchCV

# number of trees in random forest
n_estimators = [int (x) for x in np.linspace(start=200, stop=2000, num=10)]

# number of features at every split
max_features = ['auto', 'sqrt']

# max depth
max_depth = [int (x) for x in np.linspace(100, 500, num=11)]
max_depth.append(None)

#create random grid
random_grid = {
    'n_estimators': n_estimators, 
    'max_features': max_features,
    'max_depth': max_depth
}

# random search of parameters
rfc_random = RandomizedSearchCV(estimator=rfc, param_distributions=random_grid, n_iter=100, cv=3, verbose=2, random_state=42, n_jobs=-1)

#fit the model 
rfc_random.fit(X_train, Y_train)

# print results
print(rfc_random.best_params_)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:    9.1s
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:   39.8s
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:  1.4min finished


{'n_estimators': 800, 'max_features': 'auto', 'max_depth': 380}


In [56]:
rfc = RandomForestClassifier(n_estimators=600, max_depth=300, max_features='sqrt')
rfc.fit(X_train,Y_train)
rfc_predict = rfc.predict(X_test)
rfc_cv_score = cross_val_score(rfc, X, Y, cv=10, scoring='roc_auc')
print("=== Confusion Matrix ===")
print(confusion_matrix(Y_test, rfc_predict))
print('\n')
print("=== Classification Report ===")
print(classification_report(Y_test, rfc_predict))
print('\n')
print("=== All AUC Scores ===")
print(rfc_cv_score)
print('\n')
print("=== Mean AUC Score ===")
print("Mean AUC Score - Random Forest: ", rfc_cv_score.mean())

=== Confusion Matrix ===
[[149  27]
 [ 33  45]]


=== Classification Report ===
              precision    recall  f1-score   support

           0       0.82      0.85      0.83       176
           1       0.62      0.58      0.60        78

    accuracy                           0.76       254
   macro avg       0.72      0.71      0.72       254
weighted avg       0.76      0.76      0.76       254



=== All AUC Scores ===
[0.77925926 0.83518519 0.8162963  0.72962963 0.81592593 0.85555556
 0.86481481 0.91555556 0.81076923 0.85884615]


=== Mean AUC Score ===
Mean AUC Score - Random Forest:  0.8281837606837608
