# Heart Disease Prediction ML Project: Data Preparation

## Imports and Config

### Imports

In [47]:
from src.config import DIR_RAW_DATA
from src.loading import load_raw_data

### Config

In [48]:
# File names of the two raw splits; each is passed to load_raw_data
# together with DIR_RAW_DATA in the "Load Data" cell.
RAW_TRAIN_DATA = "train.csv"
RAW_TEST_DATA = "test.csv"

## Load Data

In [49]:
# Load the training split first, then the test split, via the project helper.
# NOTE(review): load_raw_data is defined in src.loading and not visible here;
# the results are used as pandas DataFrames in the cells below.
data_train_raw, data_test_raw = (
    load_raw_data(DIR_RAW_DATA, RAW_TRAIN_DATA),
    load_raw_data(DIR_RAW_DATA, RAW_TEST_DATA),
)

## Check Features

In [50]:
# Print the column index of each raw split: training first, then test.
for raw_split in (data_train_raw, data_test_raw):
    print(raw_split.columns)

Index(['id', 'Age', 'Sex', 'Chest pain type', 'BP', 'Cholesterol',
       'FBS over 120', 'EKG results', 'Max HR', 'Exercise angina',
       'ST depression', 'Slope of ST', 'Number of vessels fluro', 'Thallium',
       'Heart Disease'],
      dtype='str')
Index(['id', 'Age', 'Sex', 'Chest pain type', 'BP', 'Cholesterol',
       'FBS over 120', 'EKG results', 'Max HR', 'Exercise angina',
       'ST depression', 'Slope of ST', 'Number of vessels fluro', 'Thallium'],
      dtype='str')


### Feature: Id

In [51]:
# For each split, print descriptive statistics of the id column followed by
# the number of distinct ids.
id_splits = (("Training data", data_train_raw), ("Test data", data_test_raw))
for split_no, (split_label, split_frame) in enumerate(id_splits):
    if split_no > 0:
        print("")  # single blank line between the two splits
    print(split_label)
    print(split_frame["id"].describe())
    distinct_ids = len(split_frame["id"].unique())
    print(f"Unique values: {distinct_ids}")

Training data
count    630000.000000
mean     314999.500000
std      181865.479132
min           0.000000
25%      157499.750000
50%      314999.500000
75%      472499.250000
max      629999.000000
Name: id, dtype: float64
Unique values: 630000

Test data
count    270000.000000
mean     764999.500000
std       77942.430678
min      630000.000000
25%      697499.750000
50%      764999.500000
75%      832499.250000
max      899999.000000
Name: id, dtype: float64
Unique values: 270000


- ID is unique
- ID is a running index: zero-based in the training data (0–629999) and continuing in the test data (630000–899999)
- ID is imported as numeric

### Feature: Age

In [52]:
# Descriptive statistics of Age, printed for the training and then the test split.
for split_no, (split_label, split_frame) in enumerate(
    [("Training data", data_train_raw), ("Test data", data_test_raw)]
):
    if split_no:
        print("\n")  # separator is emitted only between the two splits
    print(split_label)
    print(split_frame["Age"].describe())

Training data
count    630000.000000
mean         54.136706
std           8.256301
min          29.000000
25%          48.000000
50%          54.000000
75%          60.000000
max          77.000000
Name: Age, dtype: float64


Test data
count    270000.000000
mean         54.159870
std           8.255471
min          29.000000
25%          48.000000
50%          54.000000
75%          60.000000
max          77.000000
Name: Age, dtype: float64


- Age contains age in years
- Age is imported as numeric

### Feature: Sex

In [53]:
# Per split: frequency of each Sex code, then the sorted list of distinct codes.
sex_splits = {"Training data": data_train_raw, "Test data": data_test_raw}
for split_label, split_frame in sex_splits.items():
    if split_label == "Test data":
        print("\n")  # gap between the training and test output
    print(split_label)
    sex_column = split_frame["Sex"]
    print(sex_column.value_counts())
    sex_codes = list(sex_column.unique())
    sex_codes.sort()
    print(sex_codes)

Training data
Sex
1    450283
0    179717
Name: count, dtype: int64
[np.int64(0), np.int64(1)]


Test data
Sex
1    193405
0     76595
Name: count, dtype: int64
[np.int64(0), np.int64(1)]


- Sex contains binary sex as female (0) and male (1)
- Sex is imported as numeric

### Feature: Chest Pain Type

In [54]:
# Training split: code frequencies, then the sorted distinct codes.
print("Training data")
chest_pain_train = data_train_raw["Chest pain type"]
print(chest_pain_train.value_counts())
print(sorted(chest_pain_train.unique()))

print("\n")

# Test split: the same two views.
print("Test data")
chest_pain_test = data_test_raw["Chest pain type"]
print(chest_pain_test.value_counts())
print(sorted(chest_pain_test.unique()))

Training data
Chest pain type
4    329179
3    197278
2     74941
1     28602
Name: count, dtype: int64
[np.int64(1), np.int64(2), np.int64(3), np.int64(4)]


Test data
Chest pain type
4    141641
3     84042
2     32072
1     12245
Name: count, dtype: int64
[np.int64(1), np.int64(2), np.int64(3), np.int64(4)]


- Chest pain type contains four types: typical angina (1), atypical angina (2), non-anginal pain (3), and asymptomatic (4)
- Chest pain type is imported as numeric

### Feature: Blood Pressure

In [55]:
# Descriptive statistics of BP per split; each heading and its summary are
# emitted by one print call, separated by a newline.
print("Training data", data_train_raw["BP"].describe(), sep="\n")
print("\n")
print("Test data", data_test_raw["BP"].describe(), sep="\n")

Training data
count    630000.000000
mean        130.497433
std          14.975802
min          94.000000
25%         120.000000
50%         130.000000
75%         140.000000
max         200.000000
Name: BP, dtype: float64


Test data
count    270000.000000
mean        130.555089
std          15.007908
min          94.000000
25%         120.000000
50%         130.000000
75%         140.000000
max         200.000000
Name: BP, dtype: float64


- BP contains resting blood pressure in millimetres of mercury
- BP is imported as numeric

### Feature: Cholesterol

In [56]:
# Descriptive statistics of Cholesterol for the training and the test split.
split_labels = ("Training data", "Test data")
split_frames = (data_train_raw, data_test_raw)
for split_label, split_frame in zip(split_labels, split_frames):
    if split_label == "Test data":
        print("\n")  # gap before the second split only
    print(split_label)
    print(split_frame["Cholesterol"].describe())

Training data
count    630000.000000
mean        245.011814
std          33.681581
min         126.000000
25%         223.000000
50%         243.000000
75%         269.000000
max         564.000000
Name: Cholesterol, dtype: float64


Test data
count    270000.000000
mean        245.045270
std          33.695458
min         126.000000
25%         223.000000
50%         243.000000
75%         269.000000
max         564.000000
Name: Cholesterol, dtype: float64


- Cholesterol contains serum cholesterol level in mg per deciliter
- Cholesterol is imported as numeric

### Feature: Blood Sugar

In [57]:
# Per split: frequency of each "FBS over 120" code and the sorted distinct codes.
for split_no, (split_label, split_frame) in enumerate(
    (("Training data", data_train_raw), ("Test data", data_test_raw))
):
    if split_no > 0:
        print("\n")
    print(split_label)
    fbs_column = split_frame["FBS over 120"]
    print(fbs_column.value_counts())
    print(sorted(fbs_column.unique()))

Training data
FBS over 120
0    579608
1     50392
Name: count, dtype: int64
[np.int64(0), np.int64(1)]


Test data
FBS over 120
0    248386
1     21614
Name: count, dtype: int64
[np.int64(0), np.int64(1)]


- Blood sugar contains level of fasting blood glucose as two categories: below or equal to 120 milligrams of glucose per deciliter of blood (0) and above 120 (1)
- Blood sugar is imported as numeric

### Feature: EKG Results

In [58]:
# Training split: frequency of each EKG result code, then the distinct codes in order.
print("Training data")
ekg_train = data_train_raw["EKG results"]
print(ekg_train.value_counts())
print(sorted(ekg_train.unique()))

print("\n")

# Test split: the same two views.
print("Test data")
ekg_test = data_test_raw["EKG results"]
print(ekg_test.value_counts())
print(sorted(ekg_test.unique()))

Training data
EKG results
0    320116
2    308562
1      1322
Name: count, dtype: int64
[np.int64(0), np.int64(1), np.int64(2)]


Test data
EKG results
0    137611
2    131804
1       585
Name: count, dtype: int64
[np.int64(0), np.int64(1), np.int64(2)]


- EKG results contains resting electrocardiogram results in three categories: normal (0), ST-T wave abnormality (1), and left ventricular hypertrophy (2)
- EKG results is imported as numeric

### Feature: Heart Rate

In [59]:
# Descriptive statistics of Max HR, training split first and test split second.
hr_splits = [("Training data", data_train_raw), ("Test data", data_test_raw)]
for split_no, (split_label, split_frame) in enumerate(hr_splits):
    if split_no > 0:
        print("\n")  # blank gap between the two summaries
    print(split_label)
    print(split_frame["Max HR"].describe())

Training data
count    630000.000000
mean        152.816763
std          19.112927
min          71.000000
25%         142.000000
50%         157.000000
75%         166.000000
max         202.000000
Name: Max HR, dtype: float64


Test data
count    270000.000000
mean        152.783756
std          19.153454
min          71.000000
25%         142.000000
50%         157.000000
75%         166.000000
max         202.000000
Name: Max HR, dtype: float64


- Max HR contains maximum heart rate achieved
- Max HR is imported as numeric

### Feature: Exercise Angina

In [60]:
# Per split: frequency of each Exercise angina code, then the sorted distinct codes.
for split_no, (split_label, split_frame) in enumerate(
    [("Training data", data_train_raw), ("Test data", data_test_raw)]
):
    if split_no:
        print("\n")
    print(split_label)
    angina_column = split_frame["Exercise angina"]
    print(angina_column.value_counts())
    angina_codes = list(angina_column.unique())
    angina_codes.sort()
    print(angina_codes)

Training data
Exercise angina
0    457553
1    172447
Name: count, dtype: int64
[np.int64(0), np.int64(1)]


Test data
Exercise angina
0    195843
1     74157
Name: count, dtype: int64
[np.int64(0), np.int64(1)]


- Exercise angina indicates whether angina was induced by exercise, in two categories: no (0) and yes (1)
- Exercise angina is imported as numeric

### Feature: ST Depression

In [61]:
# Descriptive statistics of ST depression per split; each heading and its
# summary are emitted by one print call, separated by a newline.
print("Training data", data_train_raw["ST depression"].describe(), sep="\n")
print("\n")
print("Test data", data_test_raw["ST depression"].describe(), sep="\n")

Training data
count    630000.000000
mean          0.716028
std           0.948472
min           0.000000
25%           0.000000
50%           0.100000
75%           1.400000
max           6.200000
Name: ST depression, dtype: float64


Test data
count    270000.000000
mean          0.718082
std           0.947417
min           0.000000
25%           0.000000
50%           0.100000
75%           1.400000
max           6.200000
Name: ST depression, dtype: float64


- ST depression contains the ST depression induced by exercise relative to rest
- ST depression is imported as numeric

### Feature: Slope of ST

In [62]:
# Training split: descriptive statistics of Slope of ST.
print("Training data")
slope_train_summary = data_train_raw["Slope of ST"].describe()
print(slope_train_summary)

print("\n")

# Test split: the same summary.
print("Test data")
slope_test_summary = data_test_raw["Slope of ST"].describe()
print(slope_test_summary)

Training data
count    630000.000000
mean          1.455871
std           0.545192
min           1.000000
25%           1.000000
50%           1.000000
75%           2.000000
max           3.000000
Name: Slope of ST, dtype: float64


Test data
count    270000.000000
mean          1.459356
std           0.546186
min           1.000000
25%           1.000000
50%           1.000000
75%           2.000000
max           3.000000
Name: Slope of ST, dtype: float64


- Slope of ST contains slope of the peak exercise ST segment
- Slope of ST is imported as numeric

### Feature: Number of Vessels

In [63]:
# Descriptive statistics of Number of vessels fluro for both splits.
vessel_splits = (("Training data", data_train_raw), ("Test data", data_test_raw))
for split_no, (split_label, split_frame) in enumerate(vessel_splits):
    if split_no > 0:
        print("\n")  # gap before the second split only
    print(split_label)
    print(split_frame["Number of vessels fluro"].describe())

Training data
count    630000.000000
mean          0.451040
std           0.798549
min           0.000000
25%           0.000000
50%           0.000000
75%           1.000000
max           3.000000
Name: Number of vessels fluro, dtype: float64


Test data
count    270000.00000
mean          0.45480
std           0.80127
min           0.00000
25%           0.00000
50%           0.00000
75%           1.00000
max           3.00000
Name: Number of vessels fluro, dtype: float64


- Number of vessels contains number of major vessels (0–3) colored by fluoroscopy
- Number of vessels is imported as numeric

### Feature: Thallium

In [64]:
# Per split: frequency of each Thallium code, then the sorted distinct codes.
thallium_splits = {"Training data": data_train_raw, "Test data": data_test_raw}
for split_label, split_frame in thallium_splits.items():
    if split_label == "Test data":
        print("\n")
    print(split_label)
    thallium_column = split_frame["Thallium"]
    print(thallium_column.value_counts())
    print(sorted(thallium_column.unique()))

Training data
Thallium
3    372286
7    246748
6     10966
Name: count, dtype: int64
[np.int64(3), np.int64(6), np.int64(7)]


Test data
Thallium
3    159498
7    105833
6      4669
Name: count, dtype: int64
[np.int64(3), np.int64(6), np.int64(7)]


- Thallium contains Thallium stress test result as three categories: normal (3), fixed defect (6), and reversible defect (7)
- Thallium is imported as numeric

### Feature: Heart Disease

In [65]:
# Only the training split is inspected here: the "Check Features" output lists
# no "Heart Disease" column for the test split.
print("Training data")
target_column = data_train_raw["Heart Disease"]
print(target_column.value_counts())   # frequency of each label
print(sorted(target_column.unique()))  # distinct labels in sorted order
print(target_column.dtype)             # dtype the column was imported with

Training data
Heart Disease
Absence     347546
Presence    282454
Name: count, dtype: int64
['Absence', 'Presence']
str


- Heart Disease contains the presence of a heart disease as two categories: absence and presence
- Heart Disease is imported as string