In [1]:
# importing the necessary libraries

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import cross_val_score

In [2]:
# Load the prepared training data from the Excel workbook and preview the
# first rows.  Per the original note, the data has not been scaled yet.
data_path = 'data/CP-Num_binned-3.xlsx'
df = pd.read_excel(data_path)
df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,thalrest,exang,num
0,63,1,1,145,233,1,60,0,0
1,67,1,0,160,286,0,64,1,1
2,67,1,0,120,229,0,78,1,1
3,37,1,1,130,250,0,84,0,0
4,41,0,1,130,204,0,71,0,0


In [3]:
# Separate the frame into the feature matrix (every column except 'num')
# and the label vector ('num', i.e. the CAD diagnosis).
X = df.drop(columns=['num'])
y = df['num']

In [4]:
# Hold out 20% of the rows for testing and keep the remaining 80% for training.
# random_state is fixed so the same split is produced on every run.
split_parts = train_test_split(X, y, test_size=0.20, random_state=10)
X_train, X_test, y_train, y_test = split_parts

In [None]:
# Fit the initial Random Forest model: 800 trees, bootstrap sampling
# switched off, and a fixed seed for repeatability.
baseline_params = {'n_estimators': 800, 'bootstrap': False, 'random_state': 0}
classifier = RandomForestClassifier(**baseline_params)
classifier.fit(X_train, y_train)

# Predict labels for the held-out test rows.
y_pred = classifier.predict(X_test)

In [None]:
# Evaluate the initial model on the test set: confusion matrix,
# per-class precision/recall/F1 report, and overall accuracy.
sections = [
    ("=== Confusion Matrix ===", confusion_matrix(y_test, y_pred)),
    ("=== Classification Report ===", classification_report(y_test, y_pred)),
    ("=== Accuracy Score ===", accuracy_score(y_test, y_pred)),
]

for position, (title, value) in enumerate(sections):
    # Separate consecutive sections with the same blank-line gap as print('\n').
    if position:
        print('\n')
    print(title)
    print(value)

In [None]:
# Further evaluate the initial model with 10-fold cross-validation over the
# full dataset, scored by ROC AUC.  With an integer cv and a classifier,
# sklearn's cross_val_score uses stratified folds.
classifier_cv_score = cross_val_score(
    classifier,
    X,
    y,
    scoring='roc_auc',
    cv=10,
)

print("=== All AUC Scores ===")
print(classifier_cv_score, end='\n\n\n')  # same spacing as a following print('\n')
print("=== Mean AUC Score ===")
print("Mean AUC Score - Random Forest: ", np.mean(classifier_cv_score))

In [None]:
# Randomized hyper-parameter search for the RandomForestClassifier to decide
# which values to use.  The search covers n_estimators, bootstrap,
# max_features, max_depth, min_samples_split and min_samples_leaf.

from sklearn.model_selection import RandomizedSearchCV

# Number of trees in the random forest: 10 evenly spaced values from 200 to 2000
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]

# Number of features to consider at every split.
# 'auto' is deliberately not listed: for a classifier it was an alias of
# 'sqrt', so listing both offered the same setting twice, and recent
# scikit-learn releases reject 'auto' altogether.  'log2' gives the search a
# genuinely different alternative.
max_features = ['sqrt', 'log2']

# Maximum number of levels in a tree: 10, 20, ..., 110, plus None (no limit)
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)

# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]

# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]

# Method of selecting samples for training each tree
bootstrap = [True, False]

# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

# Random search of parameters, using 3-fold cross-validation,
# searching across 100 different combinations, and using all available cores
rf_random = RandomizedSearchCV(
    estimator = classifier,
    param_distributions = random_grid,
    n_iter = 100,
    cv = 3,
    verbose = 2,
    random_state = 42,
    n_jobs = -1,
)

# Fit the random search model on the training split only
rf_random.fit(X_train, y_train)

# Print the best parameter combination found
print(rf_random.best_params_)

In [5]:
# Re-train the Random Forest model with the optimized parameters.
# max_features='sqrt' is the explicit spelling of what 'auto' meant for a
# classifier, so the fitted model is unchanged; unlike 'auto', it is still
# accepted by current scikit-learn releases.
classifier2 = RandomForestClassifier(
    n_estimators=1000,
    bootstrap=True,
    random_state=0,
    min_samples_split=5,
    min_samples_leaf=2,
    max_features='sqrt',
    max_depth=20,
)
classifier2.fit(X_train, y_train)

# Predict labels for the held-out test rows with the re-trained model.
y_pred2 = classifier2.predict(X_test)

In [6]:
# Evaluate the improved model on the test set.  Each metric is followed by the
# same blank-line gap that a separate print('\n') would produce.

print("=== Confusion Matrix ===")
print(confusion_matrix(y_test, y_pred2), end='\n\n\n')

print("=== Classification Report ===")
print(classification_report(y_test, y_pred2), end='\n\n\n')

print("=== Accuracy Score ===")
print(accuracy_score(y_test, y_pred2))

=== Confusion Matrix ===
[[63 21]
 [15 81]]


=== Classification Report ===
              precision    recall  f1-score   support

           0       0.81      0.75      0.78        84
           1       0.79      0.84      0.82        96

   micro avg       0.80      0.80      0.80       180
   macro avg       0.80      0.80      0.80       180
weighted avg       0.80      0.80      0.80       180



=== Accuracy Score ===
0.8


In [None]:
# Further evaluate the improved model with 10-fold cross-validation over the
# full dataset, scored by ROC AUC.  With an integer cv and a classifier,
# sklearn's cross_val_score uses stratified folds.
classifier_cv_score2 = cross_val_score(
    classifier2,
    X,
    y,
    scoring='roc_auc',
    cv=10,
)

print("=== All AUC Scores ===")
print(classifier_cv_score2, end='\n\n\n')  # same spacing as a following print('\n')
print("=== Mean AUC Score ===")
print("Mean AUC Score - Random Forest: ", np.mean(classifier_cv_score2))

In [None]:
# Collect the feature column names as a plain list so they can label the
# nodes in the tree visualization.
feature_list = X.columns.tolist()
feature_list

In [None]:
# Visualize a single decision tree from the forest to see how complex it is.

# Tools needed for the visualization
from sklearn.tree import export_graphviz
import pydot

# Take the tree stored at position 500 of the fitted forest
tree = classifier2.estimators_[500]

# Write that tree to a Graphviz .dot file, labelling splits with the feature names
export_graphviz(
    tree,
    out_file='tree500.dot',
    feature_names=feature_list,
    rounded=True,
    filled=True,
    precision=1,
)

# Read the .dot file back; exactly one graph is expected, hence the
# single-element unpacking
[graph] = pydot.graph_from_dot_file('tree500.dot')

# Render the graph to a PNG file
graph.write_png('tree500.png')

In [None]:
# Pickle the model to save it as is, trained on the data.

# save the model to local disk
import pickle
filename = 'CAD_model.sav'
# The context manager closes the file handle as soon as the dump finishes;
# a bare open(...) passed straight to pickle.dump is never explicitly closed.
with open(filename, 'wb') as model_file:
    pickle.dump(classifier2, model_file)

# some time later...

# load the model from disk and run on new data
# NOTE: only unpickle files you created yourself -- pickle.load can execute
# arbitrary code from an untrusted file.
#with open(filename, 'rb') as model_file:
#    loaded_model = pickle.load(model_file)
#result = loaded_model.score(X_test, y_test)
#print(result)

## Exporting the X/y test data to compare against the model predictions and help tally up the FNs/FPs

In [7]:
# display the test-set predictions produced by classifier2
y_pred2

array([1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1,
       0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1,
       0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0,
       1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0,
       1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1,
       0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1,
       0, 0, 0, 1], dtype=int64)

In [8]:
# Copy the numpy array of model predictions and wrap it in a pandas Series
# (the Series gets the default 0..n-1 integer index).
predicted_values = np.array(y_pred2)
prediction = pd.Series(predicted_values)
prediction

0      1
1      0
2      0
3      1
4      0
5      1
6      0
7      0
8      1
9      0
10     0
11     1
12     0
13     1
14     0
15     0
16     1
17     1
18     0
19     1
20     0
21     1
22     0
23     1
24     1
25     1
26     0
27     0
28     0
29     1
      ..
150    1
151    1
152    1
153    1
154    0
155    1
156    1
157    1
158    1
159    1
160    0
161    1
162    0
163    1
164    1
165    0
166    1
167    1
168    1
169    0
170    1
171    1
172    1
173    0
174    1
175    1
176    0
177    0
178    0
179    1
Length: 180, dtype: int64

In [11]:
# Promote the predictions Series to a one-column dataframe whose single
# column is named 'prediction'.
df2 = pd.DataFrame({'prediction': prediction})

In [12]:
# display the single-column predictions dataframe
df2

Unnamed: 0,prediction
0,1
1,0
2,0
3,1
4,0
5,1
6,0
7,0
8,1
9,0


In [13]:
# preview the first rows of the test features; the row labels are the original dataframe indices
X_test.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,thalrest,exang
437,69,1,0,134,264,1,72,1
131,51,1,1,94,227,0,83,1
633,48,1,1,100,241,0,57,0
195,67,1,0,100,299,0,75,1
230,52,0,1,136,196,0,91,0


In [14]:
# preview the first rows of the true test labels (the 'num' column)
y_test.head()

437    1
131    0
633    0
195    1
230    0
Name: num, dtype: int64

In [15]:
# Attach the true labels (y_test) to the test features (X_test), matching
# rows on their shared index.
df3 = X_test.merge(y_test, left_index=True, right_index=True)
df3

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,thalrest,exang,num
437,69,1,0,134,264,1,72,1,1
131,51,1,1,94,227,0,83,1,0
633,48,1,1,100,241,0,57,0,0
195,67,1,0,100,299,0,75,1,1
230,52,0,1,136,196,0,91,0,0
731,49,1,0,130,341,0,70,1,1
75,65,0,1,160,360,0,69,0,0
513,56,1,1,130,241,0,98,0,0
353,57,1,0,140,214,0,82,1,1
546,50,0,1,110,202,0,60,0,0


In [16]:
# Export the merged test dataframe (features + true labels) to an Excel workbook.
# Using ExcelWriter as a context manager saves and closes the workbook on exit,
# even if to_excel raises; it replaces an explicit writer.save() call, a method
# that was removed in pandas 2.0.
with pd.ExcelWriter('Clev-test_model_data.xlsx', engine='xlsxwriter') as writer:
    df3.to_excel(writer, sheet_name='testdata')

In [17]:
# Export the model predictions for the test dataset to an Excel workbook.
# Using ExcelWriter as a context manager saves and closes the workbook on exit,
# even if to_excel raises; it replaces an explicit writer.save() call, a method
# that was removed in pandas 2.0.
with pd.ExcelWriter('Clev-test_model_pred.xlsx', engine='xlsxwriter') as writer:
    df2.to_excel(writer, sheet_name='pred')