In [1]:
#importing in necessary libraries

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import cross_val_score

In [2]:
# Load the prepared training data from the Excel workbook.
# Note: the feature values have not been scaled at this point.

data_path = 'data/CP-Num_binned-3.xlsx'
df = pd.read_excel(data_path)
df.head(n=5)

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,thalrest,exang,num
0,63,1,1,145,233,1,60,0,0
1,67,1,0,160,286,0,64,1,1
2,67,1,0,120,229,0,78,1,1
3,37,1,1,130,250,0,84,0,0
4,41,0,1,130,204,0,71,0,0


In [3]:
# Separate the attributes from the label: 'num' (the CAD diagnosis) is the target,
# every remaining column of df is used as a feature.

X = df.drop(columns=['num'])
y = df['num']

In [4]:
# Hold out 20% of the rows for testing and train on the remaining 80%.
# random_state is fixed so the split is the same on every run.

split_parts = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=10,
)
X_train, X_test, y_train, y_test = split_parts

In [5]:
# Fit the baseline Random Forest (800 trees, no bootstrapping, fixed seed) on the
# training split, then predict labels for the held-out test split.

classifier = RandomForestClassifier(
    n_estimators=800,
    bootstrap=False,
    random_state=0,
).fit(X_train, y_train)  # fit() returns the fitted estimator itself
y_pred = classifier.predict(X_test)

In [6]:
# Score the baseline model's test-set predictions: confusion matrix, per-class
# classification report, and overall accuracy.

for _heading, _result in (
    ("=== Confusion Matrix ===", confusion_matrix(y_test, y_pred)),
    ("=== Classification Report ===", classification_report(y_test, y_pred)),
):
    print(_heading)
    print(_result)
    print('\n')  # blank separator lines between sections
print("=== Accuracy Score ===")
print(accuracy_score(y_test, y_pred))

=== Confusion Matrix ===
[[64 20]
 [17 79]]


=== Classification Report ===
              precision    recall  f1-score   support

           0       0.79      0.76      0.78        84
           1       0.80      0.82      0.81        96

   micro avg       0.79      0.79      0.79       180
   macro avg       0.79      0.79      0.79       180
weighted avg       0.79      0.79      0.79       180



=== Accuracy Score ===
0.7944444444444444


In [7]:
# 10-fold cross-validation of the baseline model with sklearn's cross_val_score,
# scored by ROC AUC. With an integer cv and a classifier, sklearn uses stratified folds.
# Note that the folds are drawn from the full X / y, not only from the training split.

classifier_cv_score = cross_val_score(
    estimator=classifier,
    X=X,
    y=y,
    scoring='roc_auc',
    cv=10,
)

# The trailing '\n' argument reproduces the blank separator lines between sections.
print("=== All AUC Scores ===", classifier_cv_score, '\n', sep='\n')
print("=== Mean AUC Score ===")
mean_auc = classifier_cv_score.mean()
print("Mean AUC Score - Random Forest: ", mean_auc)

=== All AUC Scores ===
[0.78439024 0.86560976 0.83414634 0.9315     0.88875    0.88061224
 0.84056122 0.88341837 0.88826531 0.95      ]


=== Mean AUC Score ===
Mean AUC Score - Random Forest:  0.8747253484320557


In [8]:
# Randomized hyper-parameter search for the RandomForestClassifier to decide which values to use.
# Tuned parameters: n_estimators, bootstrap, max_features, max_depth, min_samples_split
# and min_samples_leaf.

from sklearn.model_selection import RandomizedSearchCV

# Number of trees in random forest: 10 evenly spaced values from 200 to 2000
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]

# Number of features to consider at every split.
# 'auto' is intentionally not a candidate: for RandomForestClassifier it was an alias of
# 'sqrt' (so it added no distinct option to the search), it was deprecated in
# scikit-learn 1.1 and removed in 1.3, where passing it raises a parameter validation error.
# 'log2' is offered instead as a genuinely different alternative.
max_features = ['log2', 'sqrt']

# Maximum number of levels in tree: 10, 20, ..., 110, plus None (no depth limit)
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)

# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]

# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]

# Method of selecting samples for training each tree
bootstrap = [True, False]

# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

# Random search of parameters, using 3 fold cross validation,
# search across 100 different combinations, and use all available cores.
# random_state fixes which combinations are sampled so the search is repeatable.
rf_random = RandomizedSearchCV(estimator = classifier, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)

# Fit the random search model (training split only, so the test split stays unseen)
rf_random.fit(X_train, y_train)

# print results
print(rf_random.best_params_)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  37 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 158 tasks      | elapsed:  5.0min
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:  9.2min finished


{'n_estimators': 1000, 'min_samples_split': 5, 'min_samples_leaf': 2, 'max_features': 'sqrt', 'max_depth': 20, 'bootstrap': True}


In [9]:
# Re-train the Random Forest with the parameters reported by the randomized search:
#   {'n_estimators': 1000, 'min_samples_split': 5, 'min_samples_leaf': 2,
#    'max_features': 'sqrt', 'max_depth': 20, 'bootstrap': True}
# max_features is spelled 'sqrt' exactly as the search reported it. The former 'auto'
# was an alias of 'sqrt' for RandomForestClassifier, but it was deprecated in
# scikit-learn 1.1 and removed in 1.3, so 'sqrt' keeps the same model while staying
# runnable on current releases.

classifier2 = RandomForestClassifier(
    n_estimators=1000,
    bootstrap=True,
    random_state=0,  # fixed seed so the fitted forest is reproducible
    min_samples_split=5,
    min_samples_leaf=2,
    max_features='sqrt',
    max_depth=20,
)
classifier2.fit(X_train, y_train)

# Predictions of the tuned model on the held-out test split
y_pred2 = classifier2.predict(X_test)

In [11]:
# Score the tuned model's test-set predictions: confusion matrix, per-class
# classification report, and overall accuracy.

for _heading, _result in (
    ("=== Confusion Matrix ===", confusion_matrix(y_test, y_pred2)),
    ("=== Classification Report ===", classification_report(y_test, y_pred2)),
):
    print(_heading)
    print(_result)
    print('\n')  # blank separator lines between sections
print("=== Accuracy Score ===")
print(accuracy_score(y_test, y_pred2))

=== Confusion Matrix ===
[[63 21]
 [15 81]]


=== Classification Report ===
              precision    recall  f1-score   support

           0       0.81      0.75      0.78        84
           1       0.79      0.84      0.82        96

   micro avg       0.80      0.80      0.80       180
   macro avg       0.80      0.80      0.80       180
weighted avg       0.80      0.80      0.80       180



=== Accuracy Score ===
0.8


In [12]:
# 10-fold cross-validation of the tuned model with sklearn's cross_val_score,
# scored by ROC AUC. With an integer cv and a classifier, sklearn uses stratified folds.
# Note that the folds are drawn from the full X / y, not only from the training split.

classifier_cv_score2 = cross_val_score(
    estimator=classifier2,
    X=X,
    y=y,
    scoring='roc_auc',
    cv=10,
)

# The trailing '\n' argument reproduces the blank separator lines between sections.
print("=== All AUC Scores ===", classifier_cv_score2, '\n', sep='\n')
print("=== Mean AUC Score ===")
mean_auc2 = classifier_cv_score2.mean()
print("Mean AUC Score - Random Forest: ", mean_auc2)

=== All AUC Scores ===
[0.80390244 0.88390244 0.84634146 0.9445     0.8825     0.88265306
 0.86122449 0.91428571 0.91479592 0.94132653]


=== Mean AUC Score ===
Mean AUC Score - Random Forest:  0.887543205574913


In [27]:
# Collect the feature column names as a plain list; the tree visualization uses it
# to label split nodes.
feature_list = X.columns.tolist()
feature_list

['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'thalrest', 'exang']

In [29]:
# Render a single decision tree from the tuned forest to get a feel for how complex it is.

# Visualization dependencies
from sklearn.tree import export_graphviz
import pydot

# Intermediate dot file and final image, both written to the working directory
dot_path = 'tree500.dot'
png_path = 'tree500.png'

# Pick the estimator at index 500 of the fitted forest
tree = classifier2.estimators_[500]

# Dump that tree in Graphviz dot format, labelling the features by name
export_graphviz(
    tree,
    out_file=dot_path,
    feature_names=feature_list,
    rounded=True,
    filled=True,
    precision=1,
)

# Read the dot file back in; the unpacking requires exactly one graph in the result
[graph] = pydot.graph_from_dot_file(dot_path)

# Save the graph as a PNG image
graph.write_png(png_path)

In [31]:
# Pickle the tuned model (classifier2) so it can be reused later exactly as trained.

# save the model to local disk
import pickle
filename = 'CAD_model.sav'
# A context manager guarantees the file is flushed and closed even if dump() raises;
# the previous bare open() call left the handle open.
with open(filename, 'wb') as model_file:
    pickle.dump(classifier2, model_file)

# some time later...

# load the model from disk and run on new data
# NOTE: only unpickle files from a trusted source - pickle.load can execute arbitrary code.
#with open(filename, 'rb') as model_file:
#    loaded_model = pickle.load(model_file)
#result = loaded_model.score(X_test, y_test)
#print(result)