# 1. Imports and dataset preparation

In [1]:
import os

import numpy as np
import pandas as pd
from scipy import stats
from scipy.stats import zscore
from ydata_profiling import ProfileReport

In [2]:
# pip install ucimlrepo
from ucimlrepo import fetch_ucirepo

# Fetch the dataset with id 45 from the UCI ML repository
heart_disease = fetch_ucirepo(id=45)

# Data (as pandas dataframes): X holds the features, y holds the targets
X = heart_disease.data.features
y = heart_disease.data.targets

# Save a raw snapshot with features and targets side by side.
# axis=1 joins the two frames column-wise so each CSV row is one complete
# record; pd.concat's default (axis=0) would instead stack y underneath X and
# produce twice as many rows, padded with NaN.
pd.concat([X, y], axis=1).to_csv('heart_disease.csv', index=False)

In [3]:
# Inspect the feature frame as fetched (pandas abbreviates long frames when printing)
print(X)

     age  sex  cp  trestbps  chol  fbs  restecg  thalach  exang  oldpeak  \
0     63    1   1       145   233    1        2      150      0      2.3   
1     67    1   4       160   286    0        2      108      1      1.5   
2     67    1   4       120   229    0        2      129      1      2.6   
3     37    1   3       130   250    0        0      187      0      3.5   
4     41    0   2       130   204    0        2      172      0      1.4   
..   ...  ...  ..       ...   ...  ...      ...      ...    ...      ...   
298   45    1   1       110   264    0        0      132      0      1.2   
299   68    1   4       144   193    1        0      141      0      3.4   
300   57    1   4       130   131    0        0      115      1      1.2   
301   57    0   2       130   236    0        2      174      0      0.0   
302   38    1   3       138   175    0        0      173      0      0.0   

     slope   ca  thal  
0        3  0.0   6.0  
1        2  3.0   3.0  
2        2  2.0

In [4]:
# Attach the target to X as an 'output' column, then binarize it:
# zeros are kept as they are and every nonzero value is recoded to 1.
X['output'] = y
is_nonzero = X['output'] != 0
X['output'] = X['output'].mask(is_nonzero, 1)

In [5]:
# Confirm that the 'output' column now appears at the end of X
print(X)

     age  sex  cp  trestbps  chol  fbs  restecg  thalach  exang  oldpeak  \
0     63    1   1       145   233    1        2      150      0      2.3   
1     67    1   4       160   286    0        2      108      1      1.5   
2     67    1   4       120   229    0        2      129      1      2.6   
3     37    1   3       130   250    0        0      187      0      3.5   
4     41    0   2       130   204    0        2      172      0      1.4   
..   ...  ...  ..       ...   ...  ...      ...      ...    ...      ...   
298   45    1   1       110   264    0        0      132      0      1.2   
299   68    1   4       144   193    1        0      141      0      3.4   
300   57    1   4       130   131    0        0      115      1      1.2   
301   57    0   2       130   236    0        2      174      0      0.0   
302   38    1   3       138   175    0        0      173      0      0.0   

     slope   ca  thal  output  
0        3  0.0   6.0       0  
1        2  3.0   3.0  

# 2. Explanation of Variables

``` 
•	age: patient age in years
•	sex: patient sex (1 = male, 0 = female)
•	cp: Type of chest pain experienced (1: typical angina, 2: atypical angina, 3: non-anginal pain, 4: asymptomatic)
•	trestbps: patient resting blood pressure (mmHg measured at intake into hospital)
•	chol: patient cholesterol level (mg/dl)
•	fbs: patient fasting blood sugar (> 120 mg/dl, 1 = true; 0 = false)
•	restecg: patient resting electrocardiograph measurement (0 = normal, 1 = having ST-T wave abnormality, 2 = showing probable or definite left ventricular hypertrophy by Estes' criteria)
•	thalach: patient maximum heart rate achieved
•	exang: exercise induced angina (1 = yes; 0 = no)
•	oldpeak: ST depression induced by exercise relative to rest ('ST' relates to positions on the ECG plot)
•	slope: the slope of the peak exercise ST segment (1: upsloping, 2: flat, 3: downsloping)
•	ca: number of major vessels (0-3) colored by fluoroscopy
•	thal: see below (3 = normal; 6 = fixed defect; 7 = reversible defect)
•	output (the dataset's target): presence of heart disease (0 = not present, 1 = present; all nonzero target values are recoded to 1)


Background on Thallium Tracer test (thal)
--Nuclear stress testing requires the injection of a tracer, commonly technetium-99m (Myoview or Cardiolite), which is then taken up by healthy, viable myocardial cells. A camera (detector) is used afterwards to image the heart and compare segments. Coronary stenosis is detected when a myocardial segment takes up the nuclear tracer at rest, but not during cardiac stress. This is called a "reversible defect." Scarred myocardium from prior infarct will not take up tracer at all and is referred to as a "fixed defect."
```

# 3. Cleaning the data

In [6]:
# Rename original columns for clarity.
# The mapping is keyed on the source column name, so every label stays attached
# to the right column regardless of column order (assigning a list to
# X.columns is purely positional and would silently mislabel the data if the
# order ever changed).
COLUMN_LABELS = {
    'age': 'Age',
    'sex': 'Sex',
    'cp': 'Chest_Pain_Type',
    'trestbps': 'Resting_BP',
    'chol': 'Cholesterol',
    'fbs': 'Fasting_BS',
    'restecg': 'Resting_ECG',
    'thalach': 'Max_HR',
    'exang': 'Exercise_Induced_Angina',
    'oldpeak': 'ST_Depression',
    'slope': 'ST_Slope',
    'ca': 'Num_Major_Vessels',
    'thal': 'Thal_Test',
    'output': 'Disease',
}
X = X.rename(columns=COLUMN_LABELS)

# rename() skips source names it cannot find, so verify that every label was
# actually applied. Checking the result (not the input) keeps this cell safe
# to re-run after the columns have already been renamed.
missing_labels = set(COLUMN_LABELS.values()) - set(X.columns)
assert not missing_labels, f"columns not renamed: {sorted(missing_labels)}"

In [7]:
# See datatypes and null values of X: info() lists each column's non-null count and dtype
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 303 entries, 0 to 302
Data columns (total 14 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Age                      303 non-null    int64  
 1   Sex                      303 non-null    int64  
 2   Chest_Pain_Type          303 non-null    int64  
 3   Resting_BP               303 non-null    int64  
 4   Cholesterol              303 non-null    int64  
 5   Fasting_BS               303 non-null    int64  
 6   Resting_ECG              303 non-null    int64  
 7   Max_HR                   303 non-null    int64  
 8   Exercise_Induced_Angina  303 non-null    int64  
 9   ST_Depression            303 non-null    float64
 10  ST_Slope                 303 non-null    int64  
 11  Num_Major_Vessels        299 non-null    float64
 12  Thal_Test                301 non-null    float64
 13  Disease                  303 non-null    int64  
dtypes: float64(3), int64(11)
m

In [8]:
# Summary statistics (count, mean, std, min, quartiles, max) for each column
X.describe()

Unnamed: 0,Age,Sex,Chest_Pain_Type,Resting_BP,Cholesterol,Fasting_BS,Resting_ECG,Max_HR,Exercise_Induced_Angina,ST_Depression,ST_Slope,Num_Major_Vessels,Thal_Test,Disease
count,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,299.0,301.0,303.0
mean,54.438944,0.679868,3.158416,131.689769,246.693069,0.148515,0.990099,149.607261,0.326733,1.039604,1.60066,0.672241,4.734219,0.458746
std,9.038662,0.467299,0.960126,17.599748,51.776918,0.356198,0.994971,22.875003,0.469794,1.161075,0.616226,0.937438,1.939706,0.49912
min,29.0,0.0,1.0,94.0,126.0,0.0,0.0,71.0,0.0,0.0,1.0,0.0,3.0,0.0
25%,48.0,0.0,3.0,120.0,211.0,0.0,0.0,133.5,0.0,0.0,1.0,0.0,3.0,0.0
50%,56.0,1.0,3.0,130.0,241.0,0.0,1.0,153.0,0.0,0.8,2.0,0.0,3.0,0.0
75%,61.0,1.0,4.0,140.0,275.0,0.0,2.0,166.0,1.0,1.6,2.0,1.0,7.0,1.0
max,77.0,1.0,4.0,200.0,564.0,1.0,2.0,202.0,1.0,6.2,3.0,3.0,7.0,1.0


In [9]:
# Number of missing values per column
X.isna().sum()

Age                        0
Sex                        0
Chest_Pain_Type            0
Resting_BP                 0
Cholesterol                0
Fasting_BS                 0
Resting_ECG                0
Max_HR                     0
Exercise_Induced_Angina    0
ST_Depression              0
ST_Slope                   0
Num_Major_Vessels          4
Thal_Test                  2
Disease                    0
dtype: int64

In [10]:
# Drop NaNs: discard every row that has a missing value in any column
X = X.dropna(axis='index', how='any')

In [11]:
# Identify outliers

# z-scores are calculated only for the continuous columns that we assume are
# normally distributed.
numeric_cols = ['Age', 'Resting_BP', 'Cholesterol', 'Max_HR']

# A row is kept only if every one of those columns has an absolute z-score
# strictly below this threshold.
Z_SCORE_THRESHOLD = 3

# zscore is imported in the notebook's import cell
z = zscore(X[numeric_cols])
abs_z = np.abs(z)

# Boolean row mask: True where all numeric columns are within the threshold
filtered = (abs_z < Z_SCORE_THRESHOLD).all(axis=1)
X_outliers_removed = X[filtered]
print("\nData after removing outliers: ")
print(X_outliers_removed.describe())



Data after removing outliers: 
              Age         Sex  Chest_Pain_Type  Resting_BP  Cholesterol  \
count  290.000000  290.000000       290.000000  290.000000   290.000000   
mean    54.382759    0.686207         3.151724  131.248276   244.337931   
std      9.063122    0.464836         0.968936   17.069786    46.092313   
min     29.000000    0.000000         1.000000   94.000000   126.000000   
25%     47.250000    0.000000         3.000000  120.000000   211.000000   
50%     55.500000    1.000000         3.000000  130.000000   240.500000   
75%     61.000000    1.000000         4.000000  140.000000   274.000000   
max     77.000000    1.000000         4.000000  180.000000   394.000000   

       Fasting_BS  Resting_ECG      Max_HR  Exercise_Induced_Angina  \
count  290.000000    290.00000  290.000000               290.000000   
mean     0.141379      0.97931  149.693103                 0.327586   
std      0.349015      0.99458   22.560161                 0.470144   
min     

In [12]:
# Renumber the rows from 0 and keep the previous row labels as an 'index' column
df_final = X_outliers_removed.reset_index(drop=False)

In [13]:
# The 'index' column carries each row's label from before the reset; call it 'patient'
df_final = df_final.rename(columns={"index": "patient"})

In [14]:
# 303 -> 290 rows (13 removed in total):
# 6 rows removed for missing values (4 in Num_Major_Vessels, 2 in Thal_Test)
# 7 rows removed for outliers
print(df_final)

     patient  Age  Sex  Chest_Pain_Type  Resting_BP  Cholesterol  Fasting_BS  \
0          0   63    1                1         145          233           1   
1          1   67    1                4         160          286           0   
2          2   67    1                4         120          229           0   
3          3   37    1                3         130          250           0   
4          4   41    0                2         130          204           0   
..       ...  ...  ...              ...         ...          ...         ...   
285      297   57    0                4         140          241           0   
286      298   45    1                1         110          264           0   
287      299   68    1                4         144          193           1   
288      300   57    1                4         130          131           0   
289      301   57    0                2         130          236           0   

     Resting_ECG  Max_HR  Exercise_Indu

In [15]:
# (rows, columns) of the cleaned frame
df_final.shape

(290, 15)

In [16]:
# Write the cleaned frame to disk; index=False keeps the row labels out of the file
df_final.to_csv('df_cleaned.csv', index= False)