# 1. Imports and prep dataset

In [12]:
import os
import pandas as pd
import numpy as np
from scipy import stats
from ydata_profiling import ProfileReport
 

In [13]:
# pip install ucimlrepo
from ucimlrepo import fetch_ucirepo 
  
# fetch dataset 
heart_disease = fetch_ucirepo(id=45) 
  
# data (as pandas dataframes) 
X = heart_disease.data.features 
y = heart_disease.data.targets  

In [14]:
X

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
0,63,1,1,145,233,1,2,150,0,2.3,3,0.0,6.0
1,67,1,4,160,286,0,2,108,1,1.5,2,3.0,3.0
2,67,1,4,120,229,0,2,129,1,2.6,2,2.0,7.0
3,37,1,3,130,250,0,0,187,0,3.5,3,0.0,3.0
4,41,0,2,130,204,0,2,172,0,1.4,1,0.0,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,45,1,1,110,264,0,0,132,0,1.2,2,0.0,7.0
299,68,1,4,144,193,1,0,141,0,3.4,2,2.0,7.0
300,57,1,4,130,131,0,0,115,1,1.2,2,1.0,7.0
301,57,0,2,130,236,0,2,174,0,0.0,2,1.0,3.0


In [15]:
result = pd.concat([X, y], axis=1).reindex(X.index)
X['output'] = y

In [20]:
X['output'] = X['output'].where(X['output'] == 0, 1)

# 2. Explaination of Variables

``` 
•	age: patient age in years
•	sex: patient sex (1 = male, 0 = female)
•	cp: Type of chest pain experienced (1: typical angina, 2: atypical angina, 3: non-anginal pain, 4: asymptomatic)
•	trestbps: patient resting blood pressure (mmHg measured at intake into hospital)
•	chol: patient cholesterol level (mg/dl)
•	fbs: patient fasting blood sugar (> 120 mg/dl, 1 = true; 0 = false)
•	restecg: patient resting electrocardiograph measurement (0 = normal, 1 = having ST-T wave abnormality, 2 = showing probable or definite left ventricular hypertrophy by Estes' criteria)
•	thalach: patient maximum heart rate achieved
•	exang: exercise induced angina (1 = yes; 0 = no)
•	oldpeak: ST depression induced by exercise relative to rest ('ST' relates to positions on the ECG plot)
•	slope: the slope of the peak exercise ST segment (1: upsloping, 2: flat, 3: downsloping)
•	ca: number of major vessels  (0-3)
•	thal: see below (3 = normal; 6 = fixed defect; 7 = reversable defect)
•	target: presence of heart disease (0 = not present, 1 = present.)


Background on Thalium Tracer test (thal)
--Nuclear stress testing requires the injection of a tracer, commonly technicium 99M (Myoview or Cardiolyte), which is then taken up by healthy, viable myocardial cells. A camera (detector) is used afterwards to image the heart and compare segments. A coronary stenosis is detected when a myocardial segment takes up the nuclear tracer at rest, but not during cardiac stress. This is called a "reversible defect." Scarred myocardium from prior infarct will not take up tracer at all and is referred to as a "fixed defect."
```

In [5]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 303 entries, 0 to 302
Data columns (total 13 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       303 non-null    int64  
 1   sex       303 non-null    int64  
 2   cp        303 non-null    int64  
 3   trestbps  303 non-null    int64  
 4   chol      303 non-null    int64  
 5   fbs       303 non-null    int64  
 6   restecg   303 non-null    int64  
 7   thalach   303 non-null    int64  
 8   exang     303 non-null    int64  
 9   oldpeak   303 non-null    float64
 10  slope     303 non-null    int64  
 11  ca        299 non-null    float64
 12  thal      301 non-null    float64
dtypes: float64(3), int64(10)
memory usage: 30.9 KB


In [6]:
X.describe()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
count,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,299.0,301.0
mean,54.438944,0.679868,3.158416,131.689769,246.693069,0.148515,0.990099,149.607261,0.326733,1.039604,1.60066,0.672241,4.734219
std,9.038662,0.467299,0.960126,17.599748,51.776918,0.356198,0.994971,22.875003,0.469794,1.161075,0.616226,0.937438,1.939706
min,29.0,0.0,1.0,94.0,126.0,0.0,0.0,71.0,0.0,0.0,1.0,0.0,3.0
25%,48.0,0.0,3.0,120.0,211.0,0.0,0.0,133.5,0.0,0.0,1.0,0.0,3.0
50%,56.0,1.0,3.0,130.0,241.0,0.0,1.0,153.0,0.0,0.8,2.0,0.0,3.0
75%,61.0,1.0,4.0,140.0,275.0,0.0,2.0,166.0,1.0,1.6,2.0,1.0,7.0
max,77.0,1.0,4.0,200.0,564.0,1.0,2.0,202.0,1.0,6.2,3.0,3.0,7.0


In [7]:
X.isnull().sum()

age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          4
thal        2
dtype: int64

In [8]:
profile = ProfileReport(X, title="Profiling Report")

In [9]:
profile

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]



In [10]:
# entry #93, 139, 164, 165 and 252 are NaNs in the ca column.
# entry #49 and 282 are NaNs in the thal column.

X = X.dropna()

In [11]:
profile_dropna = ProfileReport(X, title="Profiling Report") 
print()




In [12]:
profile_dropna

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]



In [13]:

# Calculate the z-score for values in our dataframe
z = np.abs(stats.zscore(X))
print(z)
# Identify outliers as values with a z-score greater than 3
threshold = 3
outliers = X[z > threshold]

# Print the outliers
# drop rows containing outliers
#X = X.drop(outliers.index)
outliers.isna().sum()

          age       sex        cp  trestbps      chol       fbs   restecg  \
0    0.936181  0.691095  2.240629  0.750380  0.276443  2.430427  1.010199   
1    1.378929  0.691095  0.873880  1.596266  0.744555  0.411450  1.010199   
2    1.378929  0.691095  0.873880  0.659431  0.353500  0.411450  1.010199   
3    1.941680  0.691095  0.164289  0.095506  0.051047  0.411450  1.003419   
4    1.498933  1.446980  1.202459  0.095506  0.835103  0.411450  1.010199   
..        ...       ...       ...       ...       ...       ...       ...   
297  0.272059  1.446980  0.873880  0.468418  0.122330  0.411450  1.003419   
298  1.056185  0.691095  2.240629  1.223355  0.320744  0.411450  1.003419   
299  1.489615  0.691095  0.873880  0.693988  1.047008  2.430427  1.003419   
300  0.272059  0.691095  0.873880  0.095506  2.241384  0.411450  1.003419   
301  0.272059  1.446980  1.202459  0.095506  0.218651  0.411450  1.010199   

      thalach     exang   oldpeak     slope        ca      thal  
0    0.01

age         297
sex         297
cp          297
trestbps    295
chol        293
fbs         297
restecg     297
thalach     296
exang       297
oldpeak     295
slope       297
ca          297
thal        297
dtype: int64

In [14]:
#Remove rows containing outliers (z-score > 3)
outliers_removed = X[(np.abs(stats.zscore(X)) < 3).all(axis=1)]

In [15]:
df_final = outliers_removed.reindex()

In [17]:
# 9 rows removed for outliers
# 4 rows removed for missing variables

print(df_final)

     age  sex  cp  trestbps  chol  fbs  restecg  thalach  exang  oldpeak  \
0     63    1   1       145   233    1        2      150      0      2.3   
1     67    1   4       160   286    0        2      108      1      1.5   
2     67    1   4       120   229    0        2      129      1      2.6   
3     37    1   3       130   250    0        0      187      0      3.5   
4     41    0   2       130   204    0        2      172      0      1.4   
..   ...  ...  ..       ...   ...  ...      ...      ...    ...      ...   
297   57    0   4       140   241    0        0      123      1      0.2   
298   45    1   1       110   264    0        0      132      0      1.2   
299   68    1   4       144   193    1        0      141      0      3.4   
300   57    1   4       130   131    0        0      115      1      1.2   
301   57    0   2       130   236    0        2      174      0      0.0   

     slope   ca  thal  
0        3  0.0   6.0  
1        2  3.0   3.0  
2        2  2.0