# Predicting Heart Disease using Decision Tree and Random Forest

# Author - Rohan Kulkarni

In [1]:
import pandas as pd

hd= pd.read_csv(r"C:\Users\Prajwal\Downloads\HeartDisease.csv")

# normal decision tree

In [2]:
from sklearn.model_selection import train_test_split
hd_train, hd_test= train_test_split(hd, test_size=.2)

hd_train_x= hd_train.iloc[:,0:-1]
hd_train_y= hd_train.iloc[:,-1]

hd_test_x= hd_test.iloc[:,0:-1]
hd_test_y= hd_test.iloc[:,-1]
from sklearn.tree import DecisionTreeClassifier

dtree= DecisionTreeClassifier()

dtree.fit(hd_train_x,hd_train_y)

dtree_pred= dtree.predict(hd_test_x)

from sklearn.metrics import confusion_matrix 

tab_dtree= confusion_matrix(dtree_pred, hd_test_y)
print("confusion matrix for normal decision tree is\n",tab_dtree)

overall_acc_dtree= tab_dtree.diagonal().sum()*100/ tab_dtree.sum()
print("overall accuracy for decision tree is---->",overall_acc_dtree)

features1= pd.DataFrame({"importance": dtree.feature_importances_, "variables": hd_train_x.columns})
features1

confusion matrix for normal decision tree is
 [[26  7]
 [ 5 23]]
overall accuracy for decision tree is----> 80.32786885245902


Unnamed: 0,importance,variables
0,0.097767,age
1,0.025848,gender
2,0.32222,chest_pain
3,0.013634,rest_bps
4,0.058872,cholestrol
5,0.005983,fasting_blood_sugar
6,0.016753,rest_ecg
7,0.087435,thalach
8,0.025741,exer_angina
9,0.101889,old_peak


# decision tree auto balanced 

In [3]:
from sklearn.model_selection import train_test_split
hd_train_autobalanced, hd_test_autobalanced= train_test_split(hd, test_size=.2)

hd_train_x_autobalanced= hd_train_autobalanced.iloc[:,0:-1]
hd_train_y_autobalanced= hd_train_autobalanced.iloc[:,-1]

hd_test_x_autobalanced= hd_test.iloc[:,0:-1]
hd_test_y_autobalanced= hd_test.iloc[:,-1]

from sklearn.tree import DecisionTreeClassifier

dtree2= DecisionTreeClassifier(class_weight="balanced")

dtree2.fit(hd_train_x_autobalanced,hd_train_y_autobalanced)

dtree_pred2= dtree2.predict(hd_test_x_autobalanced)

from sklearn.metrics import confusion_matrix 

tab_dtree2= confusion_matrix(dtree_pred2, hd_test_y_autobalanced)
print("confusion matrix for normal decision tree is\n",tab_dtree2)

overall_acc_dtree_autobalanced= tab_dtree2.diagonal().sum()*100/ tab_dtree2.sum()
print("overall accuracy with autobalanced decision tree is---->",overall_acc_dtree_autobalanced)


features2= pd.DataFrame({"importance": dtree.feature_importances_, "variables": hd_train_x.columns})
features2

confusion matrix for normal decision tree is
 [[30  3]
 [ 1 27]]
overall accuracy with autobalanced decision tree is----> 93.44262295081967


Unnamed: 0,importance,variables
0,0.097767,age
1,0.025848,gender
2,0.32222,chest_pain
3,0.013634,rest_bps
4,0.058872,cholestrol
5,0.005983,fasting_blood_sugar
6,0.016753,rest_ecg
7,0.087435,thalach
8,0.025741,exer_angina
9,0.101889,old_peak


# GRID

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier

search_dict={"criterion":("entropy","gini"),
             "max_depth":(3,4,5),
             "min_samples_split":(30,40,50)}


hd_train_grid, hd_test_grid= train_test_split(hd, test_size=.2)

hd_train_x_grid= hd_train_grid.iloc[:,0:-1]
hd_train_y_grid= hd_train_grid.iloc[:,-1]

hd_test_x_grid= hd_test_grid.iloc[:,0:-1]
hd_test_y_grid= hd_test_grid.iloc[:,-1]

dtree3= DecisionTreeClassifier()


grid= GridSearchCV(dtree3, param_grid=search_dict)

grid.fit(hd_train_x_grid , hd_train_y_grid)
print("best hyper-parameters are---->",grid.best_params_)

grid_pred= grid.predict(hd_test_x_grid)

from sklearn.metrics import confusion_matrix 

tab_dtree3= confusion_matrix(grid_pred, hd_test_y_grid)
print("confusion matrix for decision tree using GRID\n",tab_dtree3)

overall_acc_dtree_grid= tab_dtree3.diagonal().sum()*100/ tab_dtree3.sum()
print("overall accuracy of decision tree using GRID---->",overall_acc_dtree_grid)


features3= pd.DataFrame({"importance": dtree.feature_importances_, "variables": hd_train_x.columns})
features3


best hyper-parameters are----> {'criterion': 'entropy', 'max_depth': 4, 'min_samples_split': 30}
confusion matrix for decision tree using GRID
 [[23  6]
 [ 3 29]]
overall accuracy of decision tree using GRID----> 85.24590163934427


Unnamed: 0,importance,variables
0,0.097767,age
1,0.025848,gender
2,0.32222,chest_pain
3,0.013634,rest_bps
4,0.058872,cholestrol
5,0.005983,fasting_blood_sugar
6,0.016753,rest_ecg
7,0.087435,thalach
8,0.025741,exer_angina
9,0.101889,old_peak


# RANDOM FOREST

In [5]:
from sklearn.model_selection import train_test_split
hd_train_rf, hd_test_rf= train_test_split(hd, test_size=.2)

hd_train_rf_x= hd_train_rf.iloc[:,0:-1]
hd_train_rf_y= hd_train_rf.iloc[:,-1]

hd_test_rf_x= hd_test_rf.iloc[:,0:-1]
hd_test_rf_y= hd_test_rf.iloc[:,-1]

from sklearn.ensemble import RandomForestClassifier
rfc= RandomForestClassifier(n_estimators=100)

rfc.fit(hd_train_rf_x,hd_train_rf_y)

pred_rfc= rfc.predict(hd_test_rf_x)

tab4=confusion_matrix(pred_rfc,hd_test_rf_y)
print("confusion matrix for random forest\n",tab4)

overall_acc_raf= tab4.diagonal().sum()*100/ tab4.sum()
print("overall accuracy of random forest---->",overall_acc_raf)


features4= pd.DataFrame({"importance": rfc.feature_importances_, "variables": hd_train_rf_x.columns})
features4




confusion matrix for random forest
 [[21  2]
 [ 5 33]]
overall accuracy of random forest----> 88.52459016393442


Unnamed: 0,importance,variables
0,0.077538,age
1,0.032732,gender
2,0.124357,chest_pain
3,0.087475,rest_bps
4,0.092007,cholestrol
5,0.008122,fasting_blood_sugar
6,0.022575,rest_ecg
7,0.113581,thalach
8,0.033413,exer_angina
9,0.136514,old_peak
