In [1]:
#Import packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import copy
import warnings
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.tree import export_graphviz
from sklearn.ensemble import RandomForestClassifier
from sklearn import tree
import pydotplus
import collections

plt.rc('font', size=12)
warnings.filterwarnings('ignore')

In [2]:
#Load original dataframes
hemoOrig = pd.read_csv("Preprocessed Data/Cluster_Hemo.csv", sep=",", index_col='DEIDNUM') #all feature dataset
real_scores = hemoOrig['Score']
hemoOrig

Unnamed: 0_level_0,RAP,PAS,PAD,PAMN,CWP,PCWPMod,PCWPA,PCWPMN,CO,CI,...,CPI,PP,PPP,PAPP,VR,RAT,PPRatio,Age,EjF,Score
DEIDNUM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
72,24.0,42.0,24.0,30.0,36.0,36.0,36.0,36.0,2.20,1.60,...,0.607834,28.0,0.245614,0.428571,5357.575758,0.666667,0.266667,88.0,25.0,5
72,18.0,42.0,24.0,30.0,24.0,24.0,24.0,28.0,3.10,2.00,...,0.682927,24.0,0.235294,0.428571,3509.677419,0.750000,0.272727,88.0,25.0,3
81,10.0,40.0,20.0,27.0,18.0,18.0,18.0,20.0,4.52,2.02,...,0.546430,17.0,0.212500,0.500000,1982.300885,0.555556,0.242857,69.0,20.0,1
81,12.0,35.0,15.0,25.0,17.0,17.0,17.0,14.0,4.65,2.08,...,0.530377,40.0,0.470588,0.571429,1772.043011,0.705882,0.571429,69.0,20.0,1
814,3.0,51.0,19.0,30.0,18.0,18.0,18.0,18.0,4.10,2.00,...,0.609017,34.0,0.354167,0.627451,2621.138211,0.166667,0.566667,58.0,,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
98508,,,,,,,,,,,...,,,,,,,,57.0,30.0,4
99302,19.0,87.0,37.0,54.0,44.0,44.0,44.0,,3.90,2.00,...,0.660754,54.0,0.486486,0.574713,2666.666667,0.431818,0.642857,41.0,15.0,5
99302,11.0,70.0,28.0,44.0,28.0,28.0,28.0,,5.30,2.70,...,1.033703,86.0,0.623188,0.600000,2440.251572,0.392857,1.088608,41.0,15.0,3
99935,8.0,49.0,29.0,31.0,15.0,15.0,15.0,26.0,4.00,2.40,...,0.899335,94.0,0.676259,0.408163,3220.000000,0.533333,1.253333,64.0,20.0,3


In [21]:
hemo.agg([min, max])

hemo.min()
dct = {}
for c in hemo.columns:
    dct[c] = [hemo[c].min(), hemo[c].max()]

len(dct)

27

In [6]:
hemo = hemoOrig.drop('Score',axis=1)
hemo = hemo.replace(np.inf, 0)
hemo = hemo.fillna(0)
xTrain, xTest, yTrain, yTest = train_test_split(hemo, real_scores, test_size=.2)
xTrain.shape, xTest.shape

((334, 27), (84, 27))

In [23]:
dt = DecisionTreeClassifier(criterion = "gini", random_state = 100,
                               max_depth=7, min_samples_leaf=5)
dt.fit(xTrain, yTrain)

DecisionTreeClassifier(max_depth=7, min_samples_leaf=5, random_state=100)

In [24]:
dot_data = tree.export_graphviz(dt,
                                feature_names=hemo.columns,
                                class_names=["1", "2", "3", "4", "5"],
                                out_file=None,
                                filled=True,
                                rounded=True)
graph = pydotplus.graph_from_dot_data(dot_data)

colors = ('palegreen','honeydew','lightyellow','mistyrose','lightcoral')
edges = collections.defaultdict(list)
nodes = graph.get_node_list()

for node in nodes:
    if node.get_name() not in ('node', 'edge'):
        vals = dt.tree_.value[int(node.get_name())][0]
        maxPos = np.argmax(vals)
        node.set_fillcolor(colors[maxPos])

graph.write_png('decision_tree.png')

True

In [25]:
# Get Feature importance
# feat_importance = clf_pruned.tree_.compute_feature_importances(normalize=False)
featureDict = dict(zip(hemo.columns, dt.feature_importances_))
featureImp = pd.DataFrame.from_dict(featureDict, orient='index')
featureImp.rename(columns = {0:'Feature Importance'}, inplace = True)
featureImp = featureImp.sort_values(by=['Feature Importance'], ascending=False)
featureImp.head(10)

Unnamed: 0,Feature Importance
CWP,0.227235
HRTRT,0.15367
PAD,0.144771
MPAP,0.134282
CPI,0.090993
PCWPMN,0.083641
PAS,0.04031
MAP,0.028797
BPDIAS,0.02304
PAMN,0.019011


In [26]:
#Get accuracy & confusion matrix
dtPreds = dt.predict(xTest) 
print(confusion_matrix(yTest, dtPreds))
print(classification_report(yTest, dtPreds))

[[ 3  1  0  0  0]
 [ 1 25  5  0  0]
 [ 1  2 20  0  2]
 [ 0  0  0  9  0]
 [ 0  0  3  0 12]]
              precision    recall  f1-score   support

           1       0.60      0.75      0.67         4
           2       0.89      0.81      0.85        31
           3       0.71      0.80      0.75        25
           4       1.00      1.00      1.00         9
           5       0.86      0.80      0.83        15

    accuracy                           0.82        84
   macro avg       0.81      0.83      0.82        84
weighted avg       0.83      0.82      0.82        84



In [27]:
rf = RandomForestClassifier(n_estimators = 100, criterion = 'entropy', random_state = 42)
rf.fit(xTrain, yTrain)
rfPreds = rf.predict(xTest)
print(confusion_matrix(yTest, rfPreds))
print(classification_report(yTest, rfPreds))

[[ 4  0  0  0  0]
 [ 0 30  1  0  0]
 [ 0  1 23  0  1]
 [ 0  0  0  9  0]
 [ 0  0  4  0 11]]
              precision    recall  f1-score   support

           1       1.00      1.00      1.00         4
           2       0.97      0.97      0.97        31
           3       0.82      0.92      0.87        25
           4       1.00      1.00      1.00         9
           5       0.92      0.73      0.81        15

    accuracy                           0.92        84
   macro avg       0.94      0.92      0.93        84
weighted avg       0.92      0.92      0.92        84



In [16]:
allDataOrig = pd.read_csv("Preprocessed Data/Cluster_AllData.csv", sep=",", index_col='DEIDNUM') #all feature dataset
print(allDataOrig.columns)

Index(['Age', 'Gender', 'Race', 'Wt', 'BMI', 'InitialHospDays',
       'TotalHospDays', 'NYHA', 'MLHFS', 'AF',
       ...
       'EjF', 'BPDIAS', 'BPSYS', 'HR', 'PV', 'MAP', 'PP', 'PPP', 'PPRatio',
       'Score'],
      dtype='object', length=119)


In [22]:
allData = allDataOrig
allData = allData.replace(np.inf, 0)
allData = allData.fillna(0)
dct = {}
for c in allData.columns:
    dct[c] = [allData[c].min(), allData[c].max()]

dct

{'Age': [20.0, 88.0],
 'Gender': [1.0, 2.0],
 'Race': [1.0, 98.0],
 'Wt': [0.0, 134.0],
 'BMI': [0.0, 57.06555671],
 'InitialHospDays': [0.0, 51.0],
 'TotalHospDays': [1.0, 154.0],
 'NYHA': [0.0, 4.0],
 'MLHFS': [0.0, 105.0],
 'AF': [0.0, 1.0],
 'AlchE': [0.0, 1.0],
 'ANGP': [0.0, 1.0],
 'AOREG': [0.0, 1.0],
 'AOST': [0.0, 1.0],
 'ARRH': [0.0, 1.0],
 'CABG': [0.0, 1.0],
 'CARREST': [0.0, 1.0],
 'COPD': [0.0, 1.0],
 'CVD': [0.0, 1.0],
 'CYTOE': [0.0, 1.0],
 'DEPR': [0.0, 1.0],
 'DIAB': [0.0, 1.0],
 'FAMILE': [0.0, 1.0],
 'GOUT': [0.0, 1.0],
 'HEPT': [0.0, 1.0],
 'HTN': [0.0, 1.0],
 'HYPERE': [0.0, 1.0],
 'HTRANS': [0.0, 3.0],
 'ICD': [0.0, 1.0],
 'IDIOPE': [0.0, 1.0],
 'ISCHD': [0.0, 1.0],
 'ISCHEME': [0.0, 1.0],
 'MALIG': [0.0, 1.0],
 'MI': [0.0, 1.0],
 'MTST': [0.0, 1.0],
 'OTHUNE': [0.0, 1.0],
 'PACE': [0.0, 1.0],
 'PERIPAE': [0.0, 1.0],
 'PMRG': [0.0, 1.0],
 'PTCI': [0.0, 1.0],
 'PTREG': [0.0, 1.0],
 'PVD': [0.0, 1.0],
 'RENALI': [0.0, 1.0],
 'SMOKING': [0.0, 3.0],
 'STERD': [0.0, 1