## Importing the libraries and loading the data

In [23]:
import pandas as pd

# Read the CSV file into a DataFrame
csv_path = '/content/heart.csv'
df = pd.read_csv(csv_path)

# Count the missing entries in every column and print the per-column totals
missing_per_column = df.isnull().sum()
print(missing_per_column)

Age               0
Sex               0
ChestPainType     0
RestingBP         0
Cholesterol       0
FastingBS         0
RestingECG        0
MaxHR             0
ExerciseAngina    0
Oldpeak           0
ST_Slope          0
HeartDisease      0
dtype: int64


In [24]:
# Preview the first five rows of the freshly loaded frame
df.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0


# Data cleaning and processing

In [2]:
# A value of 0 in Cholesterol or RestingBP is physiologically impossible and is
# treated here as a placeholder for a missing measurement. The fill value is
# the median of the valid (non-zero) readings only: taking the median over the
# whole column would count the placeholder zeros themselves and pull the
# imputed value down.
for zero_coded_col in ('Cholesterol', 'RestingBP'):
    valid_readings = df.loc[df[zero_coded_col] != 0, zero_coded_col]
    if valid_readings.empty:
        # Nothing to estimate a median from; leave the column untouched.
        continue
    df[zero_coded_col] = df[zero_coded_col].replace(0, valid_readings.median())

# Keep only rows whose maximum heart rate lies strictly between 40 and 220 bpm.
# .copy() makes the filtered result an independent frame so that the column
# assignments in later cells do not raise SettingWithCopyWarning.
df = df[(df['MaxHR'] > 40) & (df['MaxHR'] < 220)].copy()


In [3]:
# Tidy the text categories: trim surrounding whitespace everywhere, and
# additionally upper-case the chest-pain codes.
df['ChestPainType'] = df['ChestPainType'].str.strip().str.upper()
for text_col in ('RestingECG', 'ST_Slope'):
    df[text_col] = df[text_col].str.strip()

# Turn the two-valued string columns into 0/1 flags.
# Any label missing from a mapping becomes NaN (Series.map semantics).
binary_codes = {
    'ExerciseAngina': {'N': 0, 'Y': 1},
    'Sex': {'M': 1, 'F': 0},
}
for flag_col, codes in binary_codes.items():
    df[flag_col] = df[flag_col].map(codes)


In [4]:
# Plausible (lower, upper) bounds for every numeric feature
reasonable_ranges = dict(
    Age=(20, 100),
    RestingBP=(80, 200),
    Cholesterol=(100, 600),  # upper bound leaves room for extreme but possible cases
    MaxHR=(40, 220),
    Oldpeak=(0, 6.5),  # ST depression range
)

# Values outside a column's bounds are pulled back to the nearest bound;
# values already inside are left unchanged.
for col in reasonable_ranges:
    lower_bound, upper_bound = reasonable_ranges[col]
    df[col] = df[col].clip(lower=lower_bound, upper=upper_bound)


In [5]:
# Sanity checks on the cleaned frame; a failing assert stops the notebook here.

# 1) No NaN may remain anywhere in the table.
assert not df.isnull().any().any()

# 2) Every categorical column may contain only its known labels.
valid_chest_pain = {'ATA', 'NAP', 'ASY', 'TA'}
valid_ecg = {'Normal', 'ST', 'LVH'}
valid_slope = {'Up', 'Flat', 'Down'}

allowed_labels = (
    ('ChestPainType', valid_chest_pain),
    ('RestingECG', valid_ecg),
    ('ST_Slope', valid_slope),
)
for category_col, allowed in allowed_labels:
    assert set(df[category_col].unique()) <= allowed

In [9]:
# Preview the first five rows after cleaning
df.head(n=5)

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,1,ATA,140,289,0,Normal,172,0,0.0,Up,0
1,49,0,NAP,160,180,0,Normal,156,0,1.0,Flat,1
2,37,1,ATA,130,283,0,ST,98,0,0.0,Up,0
3,48,0,ASY,138,214,0,Normal,108,1,1.5,Flat,1
4,54,1,NAP,150,195,0,Normal,122,0,0.0,Up,0


# Encoding categorical features (binary encoding already done in previous cleaning)

In [16]:
# Sex and ExerciseAngina are converted to 0/1 in the cleaning section above.
# Series.map returns NaN for any value that is not a key of the mapping, so
# re-applying {'N': 0, 'Y': 1} / {'M': 1, 'F': 0} to already-numeric columns
# would wipe them out. Encode a column only while it still holds raw labels,
# which keeps this cell correct on a top-to-bottom run and when re-executed.
binary_label_maps = {
    'ExerciseAngina': {'N': 0, 'Y': 1},
    'Sex': {'M': 1, 'F': 0},
}
for binary_col, label_map in binary_label_maps.items():
    if not pd.api.types.is_numeric_dtype(df[binary_col]):
        df[binary_col] = df[binary_col].map(label_map)

# Ordinal encoding for ST_Slope (ordered categories)
st_slope_order = {'Up': 0, 'Flat': 1, 'Down': 2}
df['ST_Slope_encoded'] = df['ST_Slope'].map(st_slope_order)

# Ordinal encoding for RestingECG
# NOTE(review): this imposes the order Normal < ST < LVH; confirm that an
# ordering is intended for this feature rather than one-hot encoding.
ecg_order = {'Normal': 0, 'ST': 1, 'LVH': 2}
df['RestingECG_encoded'] = df['RestingECG'].map(ecg_order)

# One-hot encoding for ChestPainType (nominal): one indicator column per label,
# named ChestPain_<label>, appended to the right of the existing columns.
chest_pain_dummies = pd.get_dummies(df['ChestPainType'], prefix='ChestPain')
df = pd.concat([df, chest_pain_dummies], axis=1)

# Drop the original categorical columns now that encoded versions exist.
df = df.drop(['ChestPainType', 'RestingECG', 'ST_Slope'], axis=1)

# Verify the transformations
print(df[['ST_Slope_encoded', 'RestingECG_encoded']].head())
print(df.filter(like='ChestPain_').head())

   ST_Slope_encoded  RestingECG_encoded
0                 0                   0
1                 1                   0
2                 0                   1
3                 1                   0
4                 0                   0
   ChestPain_ASY  ChestPain_ATA  ChestPain_NAP  ChestPain_TA
0          False           True          False         False
1          False          False           True         False
2          False           True          False         False
3           True          False          False         False
4          False          False           True         False


In [17]:
# Preview the first five rows after encoding
df.head()

Unnamed: 0,Age,Sex,RestingBP,Cholesterol,FastingBS,MaxHR,ExerciseAngina,Oldpeak,HeartDisease,ST_Slope_encoded,RestingECG_encoded,ChestPain_ASY,ChestPain_ATA,ChestPain_NAP,ChestPain_TA
0,40,1,140,289,0,172,0,0.0,0,0,0,False,True,False,False
1,49,0,160,180,0,156,0,1.0,1,1,0,False,False,True,False
2,37,1,130,283,0,98,0,0.0,0,0,1,False,True,False,False
3,48,0,138,214,0,108,1,1.5,1,1,0,True,False,False,False
4,54,1,150,195,0,122,0,0.0,0,0,0,False,False,True,False


# Convert boolean columns to integer type (0s and 1s)

In [18]:
# Cast the chest-pain indicator columns to integers in a single step,
# so each flag is stored as 0 or 1.
boolean_columns = ['ChestPain_ASY', 'ChestPain_ATA', 'ChestPain_NAP', 'ChestPain_TA']
df[boolean_columns] = df[boolean_columns].astype(int)

# Show the first rows of the converted columns as a check
print(df[boolean_columns].head())

   ChestPain_ASY  ChestPain_ATA  ChestPain_NAP  ChestPain_TA
0              0              1              0             0
1              0              0              1             0
2              0              1              0             0
3              1              0              0             0
4              0              0              1             0


In [19]:
# Preview the first five rows of the final, fully numeric frame
df.head(n=5)

Unnamed: 0,Age,Sex,RestingBP,Cholesterol,FastingBS,MaxHR,ExerciseAngina,Oldpeak,HeartDisease,ST_Slope_encoded,RestingECG_encoded,ChestPain_ASY,ChestPain_ATA,ChestPain_NAP,ChestPain_TA
0,40,1,140,289,0,172,0,0.0,0,0,0,0,1,0,0
1,49,0,160,180,0,156,0,1.0,1,1,0,0,0,1,0
2,37,1,130,283,0,98,0,0.0,0,0,1,0,1,0,0
3,48,0,138,214,0,108,1,1.5,1,1,0,1,0,0,0
4,54,1,150,195,0,122,0,0.0,0,0,0,0,0,1,0


# Saving the clean data


In [20]:
# Write the cleaned and encoded frame to CSV; index=False leaves the row index out of the file
cleaned_csv_path = 'heart_cleaned.csv'
df.to_csv(cleaned_csv_path, index=False)