# Random Forest Classifier on Heart Disease Data
###### Data ubtained from UCI ML Repository for Heart Disease

In [50]:
import numpy as np
import pandas as pd

#list for column headers
names = ['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal', 'num']

#open file with pd.read_csv
df = pd.read_csv("C:\\Users\\Haris\\Documents\\Code\\AI\\RandomForestClassifier\\data\\processed.cleveland.data", names=names, encoding='ANSI')
print(df.shape)

#print head of data set
print(df.head())

(303, 14)
    age  sex   cp  trestbps   chol  fbs  restecg  thalach  exang  oldpeak  \
0  63.0  1.0  1.0     145.0  233.0  1.0      2.0    150.0    0.0      2.3   
1  67.0  1.0  4.0     160.0  286.0  0.0      2.0    108.0    1.0      1.5   
2  67.0  1.0  4.0     120.0  229.0  0.0      2.0    129.0    1.0      2.6   
3  37.0  1.0  3.0     130.0  250.0  0.0      0.0    187.0    0.0      3.5   
4  41.0  0.0  2.0     130.0  204.0  0.0      2.0    172.0    0.0      1.4   

   slope   ca  thal  num  
0    3.0  0.0   6.0    0  
1    2.0  3.0   3.0    2  
2    2.0  2.0   7.0    1  
3    3.0  0.0   3.0    0  
4    1.0  0.0   3.0    0  


In [51]:
X = df.drop('num', axis=1)
Y = df['num']

In [52]:
from sklearn.model_selection import train_test_split

#implementing train-test-split
X_train, X_test, Y_train, Y_test, = train_test_split(X, Y, test_size=0.33, random_state=66)

In [53]:
from sklearn.ensemble import RandomForestClassifier

#random forest model creation
rfc = RandomForestClassifier()
rfc.fit(X_train, Y_train)

#predictions
rfc_predict = rfc.predict(X_test)



In [54]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report, confusion_matrix
rfc_cv_score = cross_val_score(rfc, X, Y, cv =10)

In [55]:
print("=== Confusion Matrix ===")
print(confusion_matrix(Y_test, rfc_predict))
print('\n')
print("=== Classification Report ===")
print(classification_report(Y_test, rfc_predict))
print('\n')
print("=== All AUC Scores ===")
print(rfc_cv_score)
print('\n')
print("=== Mean AUC Score ===")
print("Mean AUC Score - Random Forest: ", rfc_cv_score.mean())

=== Confusion Matrix ===
[[46  2  3  0  0]
 [13  4  0  2  0]
 [ 3  2  4  2  0]
 [ 3  5  4  2  0]
 [ 1  1  2  1  0]]


=== Classification Report ===
              precision    recall  f1-score   support

           0       0.70      0.90      0.79        51
           1       0.29      0.21      0.24        19
           2       0.31      0.36      0.33        11
           3       0.29      0.14      0.19        14
           4       0.00      0.00      0.00         5

    accuracy                           0.56       100
   macro avg       0.32      0.32      0.31       100
weighted avg       0.48      0.56      0.51       100



=== All AUC Scores ===
[0.60606061 0.51515152 0.54545455 0.59375    0.5483871  0.5862069
 0.60714286 0.57142857 0.53571429 0.60714286]


=== Mean AUC Score ===
Mean AUC Score - Random Forest:  0.5716439231421155


  'precision', 'predicted', average, warn_for)


In [56]:
from sklearn.model_selection import RandomizedSearchCV

# number of trees in random forest
n_estimators = [int (x) for x in np.linspace(start=200, stop=2000, num=10)]

# number of features at every split
max_features = ['auto', 'sqrt']

# max depth
max_depth = [int (x) for x in np.linspace(100, 500, num=11)]
max_depth.append(None)

#create random grid
random_grid = {
    'n_estimators': n_estimators, 
    'max_features': max_features,
    'max_depth': max_depth
}

# random search of parameters
rfc_random = RandomizedSearchCV(estimator=rfc, param_distributions=random_grid, n_iter=100, cv=3, verbose=2, random_state=42, n_jobs=-1)

#fit the model 
rfc_random.fit(X_train, Y_train)

# print results
print(rfc_random.best_params_)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:   10.3s
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:   39.1s
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:  1.3min finished


{'n_estimators': 2000, 'max_features': 'sqrt', 'max_depth': 100}


In [58]:
rfc = RandomForestClassifier(n_estimators=600, max_depth=300, max_features='sqrt')
rfc.fit(X_train,Y_train)
rfc_predict = rfc.predict(X_test)
rfc_cv_score = cross_val_score(rfc, X, Y, cv=10)
print("=== Confusion Matrix ===")
print(confusion_matrix(Y_test, rfc_predict))
print('\n')
print("=== Classification Report ===")
print(classification_report(Y_test, rfc_predict))
print('\n')
print("=== All AUC Scores ===")
print(rfc_cv_score)
print('\n')
print("=== Mean AUC Score ===")
print("Mean AUC Score - Random Forest: ", rfc_cv_score.mean())

=== Confusion Matrix ===
[[46  4  1  0  0]
 [13  2  3  1  0]
 [ 4  2  2  3  0]
 [ 4  3  5  2  0]
 [ 2  0  2  1  0]]


=== Classification Report ===
              precision    recall  f1-score   support

           0       0.67      0.90      0.77        51
           1       0.18      0.11      0.13        19
           2       0.15      0.18      0.17        11
           3       0.29      0.14      0.19        14
           4       0.00      0.00      0.00         5

    accuracy                           0.52       100
   macro avg       0.26      0.27      0.25       100
weighted avg       0.43      0.52      0.46       100



=== All AUC Scores ===
[0.57575758 0.54545455 0.57575758 0.625      0.51612903 0.5862069
 0.57142857 0.53571429 0.60714286 0.57142857]


=== Mean AUC Score ===
Mean AUC Score - Random Forest:  0.5710019911493771


  'precision', 'predicted', average, warn_for)
