In [37]:
# Importing the necessary libraries
import pandas as pd

In [38]:
# Load the raw CSV into a DataFrame
data = pd.read_csv('heart_cleveland_upload.csv')

# Preview the first record, then report the table's dimensions
# (shape is a (rows, columns) tuple)
display(data.head(1))
print(f"Number of Columns: {data.shape[1]}\nNumber of Rows: {data.shape[0]}")

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,condition
0,69,1,0,160,234,1,2,131,0,0.1,1,1,0,0


Number of Columns: 14
Number of Rows: 297


In [39]:
# Inspect the dtype pandas inferred for each column
display(data.dtypes)

age            int64
sex            int64
cp             int64
trestbps       int64
chol           int64
fbs            int64
restecg        int64
thalach        int64
exang          int64
oldpeak      float64
slope          int64
ca             int64
thal           int64
condition      int64
dtype: object

In [40]:
# Count the null entries in each column (a column with 0 has no missing values)
display(data.isna().sum())

age          0
sex          0
cp           0
trestbps     0
chol         0
fbs          0
restecg      0
thalach      0
exang        0
oldpeak      0
slope        0
ca           0
thal         0
condition    0
dtype: int64

In [41]:
# Checking for duplicates: tally rows flagged as duplicates (True)
# against rows that are not (False)
display(data.duplicated().value_counts())

False    297
Name: count, dtype: int64

In [42]:
# Show the first row again to see the current column names and coded values
display(data.head(1))

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,condition
0,69,1,0,160,234,1,2,131,0,0.1,1,1,0,0


In [43]:
# Rename the abbreviated column names to descriptive labels.
#
# Every rename is declared in one mapping and applied in a single pass, and the
# result is assigned back to `data` instead of mutating it through fourteen
# separate `inplace=True` calls. With the default `errors='ignore'`, keys that
# are not present are skipped, so re-running this cell is a harmless no-op.
data = data.rename(columns={
    'age': 'Age',
    # sex: 1 = male; 0 = female
    'sex': 'Sex',
    # Chest Pain Type
    # Value 0: typical angina
    # Value 1: atypical angina
    # Value 2: non-anginal pain
    # Value 3: asymptomatic
    'cp': 'Chest Pain Type',
    'trestbps': 'Resting Blood Pressure',
    # Spelling of 'Cholestoral' is kept as-is so the resulting header is unchanged
    'chol': 'Serum Cholestoral',
    'fbs': 'Fasting Blood Sugar > 120 mg/dl',
    # Resting Electrocardiographic
    # Value 0: normal
    # Value 1: having ST-T wave abnormality (T wave inversions and/or ST elevation or depression of > 0.05 mV)
    # Value 2: showing probable or definite left ventricular hypertrophy by Estes' criteria
    'restecg': 'Resting Electrocardiographic',
    'thalach': 'Maximum Heart Rate',
    'exang': 'Exercise Induced Angina',
    'oldpeak': 'ST Depression',
    # slope: the slope of the peak exercise ST segment (a treadmill ECG stress test)
    # Value 0: upsloping
    # Value 1: flat
    # Value 2: downsloping
    'slope': 'Slope of Peak Exercise ST Segment',
    'ca': 'Number of Major Vessels',
    # thal: a blood disorder called thalassemia
    # 0 = normal
    # 1 = fixed defect
    # 2 = reversable defect
    'thal': 'Thalassemia',
    # condition (the label): 0 = no disease, 1 = disease
    'condition': 'Disease Condition',
})

In [44]:
# Recode the numeric sex flag as text: 0 -> Female, 1 -> Male
data['Sex'] = data['Sex'].replace(
    to_replace={
        0: 'Female',
        1: 'Male',
    }
)

In [45]:
# Recode the chest-pain type codes as descriptive labels:
#   0 = typical angina, 1 = atypical angina,
#   2 = non-anginal pain, 3 = asymptomatic
data['Chest Pain Type'] = data['Chest Pain Type'].replace(
    to_replace={
        0: 'Typical Angina',
        1: 'Atypical Angina',
        2: 'Non-anginal Pain',
        3: 'Asymptomatic',
    }
)

In [46]:
# Recode the fasting-blood-sugar flag as text labels.
# Note: the replacements are the strings 'False' / 'True', not Python booleans.
data['Fasting Blood Sugar > 120 mg/dl'] = data['Fasting Blood Sugar > 120 mg/dl'].replace(
    to_replace={
        0: 'False',
        1: 'True',
    }
)

In [47]:
# Resting Electrocardiographic
# Value 0: normal
# Value 1: having ST-T wave abnormality (T wave inversions and/or ST elevation or depression of > 0.05 mV)
# Value 2: showing probable or definite left ventricular hypertrophy by Estes' criteria
#
# Recode the integer ECG result as a descriptive label. The bare
# `value_counts()` statement that used to precede this assignment was removed:
# it was not the cell's last expression, so its result was never shown or kept.
data['Resting Electrocardiographic'] = data['Resting Electrocardiographic'].replace(
    {
        0: 'Normal ECG',
        1: 'ST-T Wave Abnormalities',
        2: 'Left Ventricular Hypertrophy',
    }
)

In [48]:
# Recode the slope of the peak exercise ST segment as text:
#   0 = upsloping, 1 = flat, 2 = downsloping
data['Slope of Peak Exercise ST Segment'] = data['Slope of Peak Exercise ST Segment'].replace(
    to_replace={
        0: 'Upsloping',
        1: 'Flat',
        2: 'Downsloping',
    }
)

In [49]:
# Recode the thal codes as descriptive labels:
#   0 = normal, 1 = fixed defect, 2 = reversable defect
data['Thalassemia'] = data['Thalassemia'].replace(
    to_replace={
        0: 'Normal',
        1: 'Fixed Defect',
        2: 'Reversable Defect',
    }
)

In [50]:
# Recode the exercise-induced angina flag as text: 0 -> No, 1 -> Yes
data['Exercise Induced Angina'] = data['Exercise Induced Angina'].replace(
    to_replace={
        0: 'No',
        1: 'Yes',
    }
)

In [51]:
# Recode the target label as text: 0 -> No Disease, 1 -> Disease
data['Disease Condition'] = data['Disease Condition'].replace(
    to_replace={
        0: 'No Disease',
        1: 'Disease',
    }
)

In [52]:
# Export the cleaned data to CSV for further processing.
# index=False keeps the DataFrame's row index out of the written file.
data.to_csv('heart_cleaned_data.csv', index=False)

In [53]:
# Re-check the column dtypes after cleaning: columns whose integer codes were
# replaced with text labels should now report as object
display(data.dtypes)

Age                                    int64
Sex                                   object
Chest Pain Type                       object
Resting Blood Pressure                 int64
Serum Cholestoral                      int64
Fasting Blood Sugar > 120 mg/dl       object
Resting Electrocardiographic          object
Maximum Heart Rate                     int64
Exercise Induced Angina               object
ST Depression                        float64
Slope of Peak Exercise ST Segment     object
Number of Major Vessels                int64
Thalassemia                           object
Disease Condition                     object
dtype: object

Insights after the cleaning process


1. There are 297 rows and 14 columns present in the data.

2. There are no missing values or duplicate rows in the data.

3. Several columns have cryptic, abbreviated names (for example `cp`, `trestbps`, `thalach`).

4. Those columns were therefore renamed with descriptive labels.

5. Several columns hold categorical variables encoded as integers.

6. Those integer codes were converted to human-readable labels.