In [32]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.tree import export_graphviz 
from sklearn.metrics import confusion_matrix 

In [4]:
file_url = "http://storage.googleapis.com/download.tensorflow.org/data/heart.csv"
dataframe = pd.read_csv(file_url)

In [5]:
dataframe.head(10)

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,1,145,233,1,2,150,0,2.3,3,0,fixed,0
1,67,1,4,160,286,0,2,108,1,1.5,2,3,normal,1
2,67,1,4,120,229,0,2,129,1,2.6,2,2,reversible,0
3,37,1,3,130,250,0,0,187,0,3.5,3,0,normal,0
4,41,0,2,130,204,0,2,172,0,1.4,1,0,normal,0
5,56,1,2,120,236,0,0,178,0,0.8,1,0,normal,0
6,62,0,4,140,268,0,2,160,0,3.6,3,2,normal,1
7,57,0,4,120,354,0,0,163,1,0.6,1,0,normal,0
8,63,1,4,130,254,0,2,147,0,1.4,2,1,reversible,1
9,53,1,4,140,203,1,2,155,1,3.1,3,0,reversible,0


Check the shape, information on the features, and data types of features

In [6]:
dataframe.shape

(303, 14)

In [7]:
dataframe.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 303 entries, 0 to 302
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       303 non-null    int64  
 1   sex       303 non-null    int64  
 2   cp        303 non-null    int64  
 3   trestbps  303 non-null    int64  
 4   chol      303 non-null    int64  
 5   fbs       303 non-null    int64  
 6   restecg   303 non-null    int64  
 7   thalach   303 non-null    int64  
 8   exang     303 non-null    int64  
 9   oldpeak   303 non-null    float64
 10  slope     303 non-null    int64  
 11  ca        303 non-null    int64  
 12  thal      303 non-null    object 
 13  target    303 non-null    int64  
dtypes: float64(1), int64(12), object(1)
memory usage: 33.3+ KB


Describe the data to understand the average on various variables

In [8]:
dataframe.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
age,303.0,54.594059,9.01637,29.0,48.0,56.0,61.0,77.0
sex,303.0,0.676568,0.46856,0.0,0.0,1.0,1.0,1.0
cp,303.0,3.108911,1.028414,0.0,2.0,3.0,4.0,4.0
trestbps,303.0,131.785479,17.748338,94.0,120.0,130.0,140.0,200.0
chol,303.0,246.547855,52.175933,126.0,211.0,241.0,275.0,564.0
fbs,303.0,0.148515,0.356198,0.0,0.0,0.0,0.0,1.0
restecg,303.0,0.990099,0.988293,0.0,0.0,1.0,2.0,2.0
thalach,303.0,149.194719,23.173368,71.0,132.0,152.0,165.5,202.0
exang,303.0,0.326733,0.469794,0.0,0.0,0.0,1.0,1.0
oldpeak,303.0,1.057756,1.165025,0.0,0.0,0.8,1.6,6.2


Understanding the values of the data and what it means

Resting blood pressure ranging from 94 to 200 with an average of 131.69.


In [9]:
dataframe.trestbps

0      145
1      160
2      120
3      130
4      130
      ... 
298    118
299    132
300    135
301    130
302    150
Name: trestbps, Length: 303, dtype: int64

Cholesterol ranges from 126 to 564. The average is 246.69. The healthy range for total cholesterol is under 200 mg/dL. 

In [10]:
dataframe.chol

0      233
1      286
2      229
3      250
4      204
      ... 
298    186
299    341
300    254
301    256
302    407
Name: chol, Length: 303, dtype: int64

Resting electrocardiographic results: 
- 0:  normal 
- 1:  having ST-T wave abnormality
- 2:  showing probable or definite left ventricular hypertrophy by Estes' criteria

In [11]:
dataframe.restecg

0      2
1      2
2      2
3      0
4      2
      ..
298    2
299    2
300    2
301    2
302    2
Name: restecg, Length: 303, dtype: int64

Thalach results = max heart rate achieved
Ranging from 71 to 202 with an average of 149.6.

In [12]:
dataframe.thalach

0      150
1      108
2      129
3      187
4      172
      ... 
298    190
299    136
300    127
301    150
302    154
Name: thalach, Length: 303, dtype: int64

Meaning of each value for Thalassemia (blood disorder):
- 3.0: normal
- 6.0: fixed defect
- 7.0: reversable defect

In [13]:
dataframe.thal

0           fixed
1          normal
2      reversible
3          normal
4          normal
          ...    
298         fixed
299    reversible
300    reversible
301    reversible
302    reversible
Name: thal, Length: 303, dtype: object

Heart disease diagnosis relies on a blend of observable symptoms and examination findings. The selection of tests is determined by the physician's assessment of the situation. These tests may include electrocardiograms, cardiac CT scans, blood tests, and exercise stress tests.

Risk factors for heart disease include elevated cholesterol, hypertension, diabetes, obesity, familial history, and smoking. Immutable factors such as advancing age and genetic inheritance significantly contribute to the risk. Thalassemia, a hereditary condition, is among the variables in this dataset.

After learning more about the features of the dataset I have decided to change column names to make it easier to understand

In [14]:
dataframe.columns = ["age", "sex", "chest_pain_type", "resting_blood_pressure", "chol", "fasting_blood_sugar", "rest_ecg", "max_heart_rate", "exercise_induced_angina", "st_depression", "st_slope", "num_major_vessels", "thalassemia", "target"]

In [15]:
dataframe['sex'][dataframe['sex'] == 0] = 'female'
dataframe['sex'][dataframe['sex'] == 1] = 'male'

dataframe['chest_pain_type'][dataframe['chest_pain_type'] == 1] = 'typical angina'
dataframe['chest_pain_type'][dataframe['chest_pain_type'] == 2] = 'atypical angina'
dataframe['chest_pain_type'][dataframe['chest_pain_type'] == 3] = 'non-anginal pain'
dataframe['chest_pain_type'][dataframe['chest_pain_type'] == 4] = 'asymptomatic'

dataframe['fasting_blood_sugar'][dataframe['fasting_blood_sugar'] == 0] = 'lower than 120mg/ml'
dataframe['fasting_blood_sugar'][dataframe['fasting_blood_sugar'] == 1] = 'greater than 120mg/ml'

dataframe['rest_ecg'][dataframe['rest_ecg'] == 0] = 'normal'
dataframe['rest_ecg'][dataframe['rest_ecg'] == 1] = 'ST-T wave abnormality'
dataframe['rest_ecg'][dataframe['rest_ecg'] == 2] = 'left ventricular hypertrophy'

dataframe['exercise_induced_angina'][dataframe['exercise_induced_angina'] == 0] = 'no'
dataframe['exercise_induced_angina'][dataframe['exercise_induced_angina'] == 1] = 'yes'

dataframe['st_slope'][dataframe['st_slope'] == 1] = 'upsloping'
dataframe['st_slope'][dataframe['st_slope'] == 2] = 'flat'
dataframe['st_slope'][dataframe['st_slope'] == 3] = 'downsloping'

dataframe['thalassemia'][dataframe['thalassemia'] == 1] = 'normal'
dataframe['thalassemia'][dataframe['thalassemia'] == 2] = 'fixed defect'
dataframe['thalassemia'][dataframe['thalassemia'] == 3] = 'reversible defect'


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataframe['sex'][dataframe['sex'] == 0] = 'female'
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataframe['chest_pain_type'][dataframe['chest_pain_type'] == 1] = 'typical angina'
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataframe['fasting_blood_sugar'][dataframe['fasting_blood_sugar'] == 0] = 'lower than 120mg/ml'
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/sta

In [16]:
dataframe['sex'] = dataframe['sex'].astype('object')
dataframe['chest_pain_type'] = dataframe['chest_pain_type'].astype('object')
dataframe['fasting_blood_sugar'] = dataframe['fasting_blood_sugar'].astype('object')
dataframe['rest_ecg'] = dataframe['rest_ecg'].astype('object')
dataframe['exercise_induced_angina'] = dataframe['exercise_induced_angina'].astype('object')
dataframe['st_slope'] = dataframe['st_slope'].astype('object')
dataframe['thalassemia'] = dataframe['thalassemia'].astype('object')

In [17]:
dataframe.dtypes

age                          int64
sex                         object
chest_pain_type             object
resting_blood_pressure       int64
chol                         int64
fasting_blood_sugar         object
rest_ecg                    object
max_heart_rate               int64
exercise_induced_angina     object
st_depression              float64
st_slope                    object
num_major_vessels            int64
thalassemia                 object
target                       int64
dtype: object

In [18]:
dt = pd.get_dummies(dataframe, drop_first=True)

In [19]:
dt.head()

Unnamed: 0,age,resting_blood_pressure,chol,max_heart_rate,st_depression,num_major_vessels,target,sex_male,chest_pain_type_asymptomatic,chest_pain_type_atypical angina,...,fasting_blood_sugar_lower than 120mg/ml,rest_ecg_left ventricular hypertrophy,rest_ecg_normal,exercise_induced_angina_yes,st_slope_flat,st_slope_upsloping,thalassemia_2,thalassemia_fixed,thalassemia_normal,thalassemia_reversible
0,63,145,233,150,2.3,0,0,1,0,0,...,0,1,0,0,0,0,0,1,0,0
1,67,160,286,108,1.5,3,1,1,1,0,...,1,1,0,1,1,0,0,0,1,0
2,67,120,229,129,2.6,2,0,1,1,0,...,1,1,0,1,1,0,0,0,0,1
3,37,130,250,187,3.5,0,0,1,0,0,...,1,0,1,0,0,0,0,0,1,0
4,41,130,204,172,1.4,0,0,0,0,1,...,1,1,0,0,0,1,0,0,1,0


Modeling

Let's begin modeling!

In [20]:
X_train, X_test, y_train, y_test = train_test_split(dt.drop('target',1), dt['target'], test_size = .2,random_state = 17)

  X_train, X_test, y_train, y_test = train_test_split(dt.drop('target',1), dt['target'], test_size = .2,random_state = 17)


In [21]:
model = RandomForestClassifier(max_depth=5)
model.fit(X_train, y_train)

In [22]:
estimator = model.estimators_[1]
feature_names = [i for i in X_train.columns]

y_train_str = y_train.astype('str')
y_train_str[y_train_str == '0'] = 'no disease'
y_train_str[y_train_str == '1'] = 'disease'
y_train_str = y_train_str.values

In [30]:
y_predict = model.predict(X_test)
y_pred_quant = model.predict_proba(X_test)[:, 1]
y_pred_bin = model.predict(X_test)

In [33]:
confusion_matrix = confusion_matrix(y_test, y_pred_bin)
confusion_matrix

array([[39,  1],
       [ 8, 13]], dtype=int64)

Predicted 39 patients having heart disease when they did actually have it correctly. 
Predicted 13 patients not having heart dieasese when they did not have it correctly.


In [34]:
total = sum(sum(confusion_matrix))

sensitivity = confusion_matrix[0,0]/(confusion_matrix[0,0]+confusion_matrix[1,0])
print('Sensitivity : ', sensitivity )

specificity = confusion_matrix[1,1]/(confusion_matrix[1,1]+confusion_matrix[0,1])
print('Specificity : ', specificity)


Sensitivity :  0.8297872340425532
Specificity :  0.9285714285714286
