🧼 Data Processing & Cleaning Plan


Load the Data

Check Column Names

Handle Missing Values

Handle Duplicates

Convert Data Types

Encode Categorical Features

Scale/Normalize Numerical Features

Save Cleaned Data



1. Load the Data

In [54]:
import pandas as pd 
import numpy as np 
import matplotlib as plt 
import seaborn as sns 

# Load_datasets
df = pd.read_csv(r"E:\DATA_PROCESSING_PROJECTS\heart_attack_dataset.csv")
print(df.shape)
df.head()

(372974, 32)


Unnamed: 0,Age,Gender,Cholesterol,BloodPressure,HeartRate,BMI,Smoker,Diabetes,Hypertension,FamilyHistory,...,ExerciseInducedAngina,Slope,NumberOfMajorVessels,Thalassemia,PreviousHeartAttack,StrokeHistory,Residence,EmploymentStatus,MaritalStatus,Outcome
0,31,Male,194,162,71,22.9,0,1,0,0,...,Yes,Downsloping,1,Normal,0,0,Suburban,Retired,Single,No Heart Attack
1,69,Male,208,148,93,33.9,1,1,0,0,...,Yes,Upsloping,2,Normal,0,0,Suburban,Unemployed,Married,No Heart Attack
2,34,Female,132,161,94,34.0,0,0,1,1,...,Yes,Upsloping,0,Normal,1,0,Rural,Retired,Single,Heart Attack
3,53,Male,268,134,91,35.0,0,1,1,0,...,Yes,Flat,0,Reversible defect,1,0,Suburban,Retired,Widowed,No Heart Attack
4,57,Female,203,140,75,30.1,0,1,0,0,...,Yes,Flat,0,Fixed defect,1,0,Rural,Retired,Married,Heart Attack


✅ 2. Check Column Names
Let's rename and clean the column names if necessary (spaces, inconsistent casing, etc.):

In [56]:
df.columns

Index(['Age', 'Gender', 'Cholesterol', 'BloodPressure', 'HeartRate', 'BMI',
       'Smoker', 'Diabetes', 'Hypertension', 'FamilyHistory',
       'PhysicalActivity', 'AlcoholConsumption', 'Diet', 'StressLevel',
       'Ethnicity', 'Income', 'EducationLevel', 'Medication', 'ChestPainType',
       'ECGResults', 'MaxHeartRate', 'ST_Depression', 'ExerciseInducedAngina',
       'Slope', 'NumberOfMajorVessels', 'Thalassemia', 'PreviousHeartAttack',
       'StrokeHistory', 'Residence', 'EmploymentStatus', 'MaritalStatus',
       'Outcome'],
      dtype='object')

In [58]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 372974 entries, 0 to 372973
Data columns (total 32 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   Age                    372974 non-null  int64  
 1   Gender                 372974 non-null  object 
 2   Cholesterol            372974 non-null  int64  
 3   BloodPressure          372974 non-null  int64  
 4   HeartRate              372974 non-null  int64  
 5   BMI                    372974 non-null  float64
 6   Smoker                 372974 non-null  int64  
 7   Diabetes               372974 non-null  int64  
 8   Hypertension           372974 non-null  int64  
 9   FamilyHistory          372974 non-null  int64  
 10  PhysicalActivity       372974 non-null  int64  
 11  AlcoholConsumption     372974 non-null  int64  
 12  Diet                   372974 non-null  object 
 13  StressLevel            372974 non-null  int64  
 14  Ethnicity              372974 non-nu

In [60]:


df.columns = df.columns.str.strip().str.replace(" ","_").str.replace(":","").str.replace(".","").str.lower()

In [15]:
df.columns

Index(['age', 'gender', 'cholesterol', 'bloodpressure', 'heartrate', 'bmi',
       'smoker', 'diabetes', 'hypertension', 'familyhistory',
       'physicalactivity', 'alcoholconsumption', 'diet', 'stresslevel',
       'ethnicity', 'income', 'educationlevel', 'medication', 'chestpaintype',
       'ecgresults', 'maxheartrate', 'stdepression', 'exerciseinducedangina',
       'slope', 'numberofmajorvessels', 'thalassemia', 'previousheartattack',
       'strokehistory', 'residence', 'employmentstatus', 'maritalstatus',
       'outcome'],
      dtype='object')

3. Handle Missing Values

In [62]:
# Check missing values
print(df.isnull().sum())

# Option 1: Drop rows with missing values
df = df.dropna()

# Option 2: Fill missing values
# For numerical columns
num_cols = df.select_dtypes(include=['float64', 'int64']).columns
df[num_cols] = df[num_cols].fillna(df[num_cols].mean())

# For categorical columns
cat_cols = df.select_dtypes(include='object').columns
df[cat_cols] = df[cat_cols].fillna(df[cat_cols].mode().iloc[0])


age                      0
gender                   0
cholesterol              0
bloodpressure            0
heartrate                0
bmi                      0
smoker                   0
diabetes                 0
hypertension             0
familyhistory            0
physicalactivity         0
alcoholconsumption       0
diet                     0
stresslevel              0
ethnicity                0
income                   0
educationlevel           0
medication               0
chestpaintype            0
ecgresults               0
maxheartrate             0
st_depression            0
exerciseinducedangina    0
slope                    0
numberofmajorvessels     0
thalassemia              0
previousheartattack      0
strokehistory            0
residence                0
employmentstatus         0
maritalstatus            0
outcome                  0
dtype: int64


4. remove duplicates 


In [64]:
# to check the duplicates 
df.duplicated()

0         False
1         False
2         False
3         False
4         False
          ...  
372969    False
372970    False
372971    False
372972    False
372973    False
Length: 372974, dtype: bool

3 dropiing duplicates 

In [66]:
df = df.drop_duplicates()

5. Convert DataTypes 

In [68]:
df.columns

Index(['age', 'gender', 'cholesterol', 'bloodpressure', 'heartrate', 'bmi',
       'smoker', 'diabetes', 'hypertension', 'familyhistory',
       'physicalactivity', 'alcoholconsumption', 'diet', 'stresslevel',
       'ethnicity', 'income', 'educationlevel', 'medication', 'chestpaintype',
       'ecgresults', 'maxheartrate', 'st_depression', 'exerciseinducedangina',
       'slope', 'numberofmajorvessels', 'thalassemia', 'previousheartattack',
       'strokehistory', 'residence', 'employmentstatus', 'maritalstatus',
       'outcome'],
      dtype='object')

In [74]:
df['outcome'] = df['outcome'].map({'No Heart Attack': 0, 'Heart Attack': 1})


In [76]:
df['age'] = df["age"].astype(int)
df['outcome'] = df['outcome'].astype(int)

In [78]:
df.columns 

Index(['age', 'gender', 'cholesterol', 'bloodpressure', 'heartrate', 'bmi',
       'smoker', 'diabetes', 'hypertension', 'familyhistory',
       'physicalactivity', 'alcoholconsumption', 'diet', 'stresslevel',
       'ethnicity', 'income', 'educationlevel', 'medication', 'chestpaintype',
       'ecgresults', 'maxheartrate', 'st_depression', 'exerciseinducedangina',
       'slope', 'numberofmajorvessels', 'thalassemia', 'previousheartattack',
       'strokehistory', 'residence', 'employmentstatus', 'maritalstatus',
       'outcome'],
      dtype='object')

In [80]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 372974 entries, 0 to 372973
Data columns (total 32 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   age                    372974 non-null  int32  
 1   gender                 372974 non-null  object 
 2   cholesterol            372974 non-null  int64  
 3   bloodpressure          372974 non-null  int64  
 4   heartrate              372974 non-null  int64  
 5   bmi                    372974 non-null  float64
 6   smoker                 372974 non-null  int64  
 7   diabetes               372974 non-null  int64  
 8   hypertension           372974 non-null  int64  
 9   familyhistory          372974 non-null  int64  
 10  physicalactivity       372974 non-null  int64  
 11  alcoholconsumption     372974 non-null  int64  
 12  diet                   372974 non-null  object 
 13  stresslevel            372974 non-null  int64  
 14  ethnicity              372974 non-nu

 6. Encode Categorical Features
Using Label Encoding or One-Hot Encoding depending on the model:

In [84]:
from sklearn.preprocessing import LabelEncoder

# label encode binary categorical variable 
label_enc = LabelEncoder()


In [90]:
df.columns

Index(['age', 'gender', 'cholesterol', 'bloodpressure', 'heartrate', 'bmi',
       'smoker', 'diabetes', 'hypertension', 'familyhistory',
       'physicalactivity', 'alcoholconsumption', 'diet', 'stresslevel',
       'ethnicity', 'income', 'educationlevel', 'medication', 'chestpaintype',
       'ecgresults', 'maxheartrate', 'st_depression', 'exerciseinducedangina',
       'slope', 'numberofmajorvessels', 'thalassemia', 'previousheartattack',
       'strokehistory', 'residence', 'employmentstatus', 'maritalstatus',
       'outcome'],
      dtype='object')

In [92]:
binaray_cols = ['gender','smoker','diabetes','hypertension','familyhistory','residence','maritalstatus']
for col in binaray_cols: 
    df[col] = label_enc.fit_transform(df[col])

# one_hot Encodeing mulicclas categorical variablesabs 
df = pd.get_dummies(df,columns=['chestpaintype','ethnicity','educationlevel','employmentstatus','diet'],drop_first=True)

7. Scale/Normalize Numerical Features

In [95]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
num_cols = ['age', 'cholesterol', 'bloodpressure', 'heartrate', 'bmi', 'maxheartrate', 'st_depression']
df[num_cols] = scaler.fit_transform(df[num_cols])


In [101]:
df.to_csv(r"E:\_cleaned_data_processing.csv", index=False)


In [103]:
df

Unnamed: 0,age,gender,cholesterol,bloodpressure,heartrate,bmi,smoker,diabetes,hypertension,familyhistory,...,ethnicity_Black,ethnicity_Hispanic,ethnicity_Other,ethnicity_White,educationlevel_High School,educationlevel_Postgraduate,employmentstatus_Retired,employmentstatus_Unemployed,diet_Moderate,diet_Unhealthy
0,-1.637055,1,-0.094517,1.058007,-1.066449,-0.962380,0,1,0,0,...,False,True,False,False,True,False,True,False,False,True
1,0.757853,1,0.148019,0.519224,0.202101,0.771146,1,1,0,0,...,False,False,False,False,False,False,False,True,False,True
2,-1.447984,0,-1.168607,1.019522,0.259762,0.786905,0,0,1,1,...,True,False,False,False,False,False,True,False,False,False
3,-0.250530,1,1.187461,-0.019559,0.086778,0.944498,0,1,1,0,...,False,True,False,False,True,False,True,False,False,False
4,0.001566,0,0.061399,0.211348,-0.835804,0.172291,0,1,0,0,...,False,True,False,False,True,False,True,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
372969,-0.502625,0,1.481969,-0.135013,-0.605158,-0.221692,0,1,1,1,...,False,True,False,False,False,True,False,False,False,True
372970,1.262044,1,-0.042545,0.942553,-1.470079,0.708108,0,1,0,1,...,False,False,False,True,False,True,False,False,False,True
372971,-0.250530,0,0.754360,-1.558939,-0.316852,0.897220,1,1,1,0,...,False,False,True,False,False,True,True,False,True,False
372972,1.325068,0,1.585913,-1.597423,-0.662820,0.046217,1,1,0,1,...,True,False,False,False,True,False,False,False,False,False
