In [2]:
# Import the required libraries.

import pandas as pd
import numpy as np
import pickle
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import cross_val_score

In [3]:
# Load the prepared NHANES 2013-2014 test data from the Excel workbook
# and preview the first five rows as a sanity check.

df = pd.read_excel('data/NHanesTestData_final.xlsx')
df.head(n=5)

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,thalrest,exang,num
0,54,1,1,160,170,114,74,1,0
1,56,1,1,158,226,126,60,1,1
2,65,1,1,100,168,112,70,0,0
3,76,1,1,94,189,111,74,0,0
4,69,0,1,116,192,102,68,1,0


In [4]:
# Separate the frame into model inputs and the target:
#   X - every column except 'num' (the attributes)
#   y - the 'num' column (the label, i.e. the CAD diagnosis)

X = df.drop(columns=['num'])
y = df['num']

In [6]:
# Load the previously trained model from local disk and score it on the new data.
filename = 'CAD_model.sav'
# For reference, the model was saved earlier with:
#pickle.dump(classifier2, open(filename, 'wb'))

# SECURITY NOTE: unpickling can execute arbitrary code, so only load model
# files that come from a trusted source.
# The context manager guarantees the file handle is closed once the model has
# been deserialized (the bare open() call previously left it open).
with open(filename, 'rb') as model_file:
    loaded_model = pickle.load(model_file)

# the model.score function appears to give the accuracy score given a set of data with known labels
result = loaded_model.score(X, y)
print(result)

0.8018575851393189


In [12]:
# Run the loaded model on the attribute frame X to get one prediction per row;
# the result is kept in `test` for the evaluation and export cells.
test = loaded_model.predict(X) 

In [14]:
# Display the raw predictions (a numpy array, as the cell output shows).
test

array([1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [13]:
# Evaluate the model's predictions against the known patient CAD status.
# Each entry pairs a report heading with the metric function that produces it;
# every metric is called as metric(y_true, y_pred).

evaluation_sections = (
    ("=== Confusion Matrix ===", confusion_matrix),
    ("=== Classification Report ===", classification_report),
    ("=== Accuracy Score ===", accuracy_score),
)

for section_number, (heading, metric) in enumerate(evaluation_sections):
    if section_number > 0:
        # blank-line separator between consecutive sections
        print('\n')
    print(heading)
    print(metric(y, test))

=== Confusion Matrix ===
[[492  58]
 [ 70  26]]


=== Classification Report ===
              precision    recall  f1-score   support

           0       0.88      0.89      0.88       550
           1       0.31      0.27      0.29        96

   micro avg       0.80      0.80      0.80       646
   macro avg       0.59      0.58      0.59       646
weighted avg       0.79      0.80      0.80       646



=== Accuracy Score ===
0.8018575851393189


In [24]:
# Turn the numpy array of model predictions into a pandas Series.
# The Series is built on df's own index so that it lines up row-for-row with
# the dataframe when it is attached as a column (pandas aligns on index labels,
# so a mismatched index would silently produce NaN values).
prediction = pd.Series(test, index=df.index)

In [28]:
# Append the model predictions as a 'prediction' column.
# assign() returns a new dataframe, so df itself is left untouched; naming the
# column directly avoids a placeholder column and an in-place rename.

df2 = df.assign(prediction=prediction)
df2

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,thalrest,exang,num,prediction
0,54,1,1,160,170,114,74,1,0,1
1,56,1,1,158,226,126,60,1,1,1
2,65,1,1,100,168,112,70,0,0,0
3,76,1,1,94,189,111,74,0,0,0
4,69,0,1,116,192,102,68,1,0,0
5,80,1,1,124,172,100,88,0,0,0
6,80,1,1,160,173,110,84,0,1,0
7,63,0,1,126,271,114,72,1,1,1
8,67,0,1,90,168,114,84,0,0,0
9,59,1,1,136,188,99,88,0,0,0


In [29]:
# Export the dataframe (with predictions) to an Excel file.
# ExcelWriter is used as a context manager so the workbook is written and the
# file closed on exit, even if to_excel raises. The explicit writer.save()
# call was deprecated in pandas 1.5 and removed in pandas 2.0.

with pd.ExcelWriter('NHANES_model_preds.xlsx', engine='xlsxwriter') as writer:
    df2.to_excel(writer, sheet_name='predictions')