In [1]:
import numpy as np
import pandas as pd


## Data Labels

1. **age**: Age in years  
2. **sex**: Sex  
   - `1` = Male  
   - `0` = Female  
3. **cp**: Chest pain type  
   - Value `1`: Typical angina  
   - Value `2`: Atypical angina  
   - Value `3`: Non-anginal pain  
   - Value `4`: Asymptomatic  
4. **trestbps**: Resting blood pressure (in mm Hg on admission to the hospital)  
5. **chol**: Serum cholesterol in mg/dl  
6. **fbs**: Whether fasting blood sugar exceeds 120 mg/dl  
   - `1` = True  
   - `0` = False  
7. **restecg**: Resting electrocardiographic results  
   - Value `0`: Normal  
   - Value `1`: Having ST-T wave abnormality (T wave inversions and/or ST  
     elevation or depression of > 0.05 mV)  
   - Value `2`: Showing probable or definite left ventricular hypertrophy by  
     Estes' criteria  
8. **thalach**: Maximum heart rate achieved  
9. **exang**: Exercise-induced angina  
   - `1` = Yes  
   - `0` = No  
10. **oldpeak**: ST depression induced by exercise relative to rest  
11. **slope**: The slope of the peak exercise ST segment  
    - Value `1`: Upsloping  
    - Value `2`: Flat  
    - Value `3`: Downsloping  
12. **ca**: Number of major vessels (0-3) colored by fluoroscopy  
13. **thal**: Thallium stress test result  
    - `3` = Normal  
    - `6` = Fixed defect  
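    - `7` = Reversible defect
14. **label**: Diagnosis of heart disease (the prediction target)  
    - `0` = No disease  
    - `1`–`4` = Disease present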

In [33]:
# Column names handed to `pd.read_csv(names=...)` for each data file: the 13
# attributes listed under "Data Labels", in order, followed by 'label'.
columns = [
    'age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg',
    'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal',
    'label',
]

In [34]:
# Every site file is read the same way: explicit column names (so no header
# row is expected) and no field promoted to the index.
read_options = dict(index_col=False, names=columns)

hungary_df = pd.read_csv('data/processed.hungarian.data', **read_options)
swiss_df = pd.read_csv('data/processed.switzerland.data', **read_options)
cleveland_df = pd.read_csv('data/processed.cleveland.data', **read_options)
va_df = pd.read_csv('data/processed.va.data', **read_options)

# Each frame carries its own 0..n-1 index, so stacking them as-is would repeat
# row labels (`.loc[0]` would return one row per site). ignore_index=True gives
# the combined frame a single, unique 0..N-1 index.
combined_df = pd.concat(
    [hungary_df, swiss_df, cleveland_df, va_df],
    axis=0,
    ignore_index=True,
)

# Guard against the duplicate-label problem silently coming back.
assert combined_df.index.is_unique


In [35]:
# Preview the first ten rows of the stacked frame.
combined_df.head(n=10)

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,label
0,28.0,1.0,2.0,130,132,0,2,185,0,0.0,?,?,?,0
1,29.0,1.0,2.0,120,243,0,0,160,0,0.0,?,?,?,0
2,29.0,1.0,2.0,140,?,0,0,170,0,0.0,?,?,?,0
3,30.0,0.0,1.0,170,237,0,1,170,0,0.0,?,?,6,0
4,31.0,0.0,2.0,100,219,0,1,150,0,0.0,?,?,?,0
5,32.0,0.0,2.0,105,198,0,0,165,0,0.0,?,?,?,0
6,32.0,1.0,2.0,110,225,0,0,184,0,0.0,?,?,?,0
7,32.0,1.0,2.0,125,254,0,0,155,0,0.0,?,?,?,0
8,33.0,1.0,3.0,120,298,0,0,185,0,0.0,?,?,?,0
9,34.0,0.0,2.0,130,161,0,0,190,0,0.0,?,?,?,0


In [36]:
# The data files mark missing entries with "?". Swap that token for NaN, then
# re-parse every column as numeric: a column that contained "?" was read as
# strings and would otherwise stay object-dtype after the replacement.
# pd.to_numeric raises on any other non-numeric token, so unexpected values
# surface here instead of being hidden. Rebinding the name (rather than
# inplace=True) keeps this cell safe to re-run.
combined_df = combined_df.replace('?', np.nan).apply(pd.to_numeric)

# Share of missing values per feature, in percent.
missing_percentage = (combined_df.isnull().sum() / len(combined_df)) * 100
print(missing_percentage)

age          0.000000
sex          0.000000
cp           0.000000
trestbps     6.413043
chol         3.260870
fbs          9.782609
restecg      0.217391
thalach      5.978261
exang        5.978261
oldpeak      6.739130
slope       33.586957
ca          66.413043
thal        52.826087
label        0.000000
dtype: float64
