In [9]:
# Import libraries and load the dataset

import pandas as pd
import numpy as np

# Load dataset
data = pd.read_csv("heart_disease_uci.csv")

# Display first few rows
data.head()


Unnamed: 0,id,age,sex,dataset,cp,trestbps,chol,fbs,restecg,thalch,exang,oldpeak,slope,ca,thal,num
0,1,63,Male,Cleveland,typical angina,145.0,233.0,True,lv hypertrophy,150.0,False,2.3,downsloping,0.0,fixed defect,0
1,2,67,Male,Cleveland,asymptomatic,160.0,286.0,False,lv hypertrophy,108.0,True,1.5,flat,3.0,normal,2
2,3,67,Male,Cleveland,asymptomatic,120.0,229.0,False,lv hypertrophy,129.0,True,2.6,flat,2.0,reversable defect,1
3,4,37,Male,Cleveland,non-anginal,130.0,250.0,False,normal,187.0,False,3.5,downsloping,0.0,normal,0
4,5,41,Female,Cleveland,atypical angina,130.0,204.0,False,lv hypertrophy,172.0,False,1.4,upsloping,0.0,normal,0


In [10]:
# Basic information and stats
data.info()
data.describe()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 920 entries, 0 to 919
Data columns (total 16 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   id        920 non-null    int64  
 1   age       920 non-null    int64  
 2   sex       920 non-null    object 
 3   dataset   920 non-null    object 
 4   cp        920 non-null    object 
 5   trestbps  861 non-null    float64
 6   chol      890 non-null    float64
 7   fbs       830 non-null    object 
 8   restecg   918 non-null    object 
 9   thalch    865 non-null    float64
 10  exang     865 non-null    object 
 11  oldpeak   858 non-null    float64
 12  slope     611 non-null    object 
 13  ca        309 non-null    float64
 14  thal      434 non-null    object 
 15  num       920 non-null    int64  
dtypes: float64(5), int64(3), object(8)
memory usage: 115.1+ KB


Unnamed: 0,id,age,trestbps,chol,thalch,oldpeak,ca,num
count,920.0,920.0,861.0,890.0,865.0,858.0,309.0,920.0
mean,460.5,53.51087,132.132404,199.130337,137.545665,0.878788,0.676375,0.995652
std,265.725422,9.424685,19.06607,110.78081,25.926276,1.091226,0.935653,1.142693
min,1.0,28.0,0.0,0.0,60.0,-2.6,0.0,0.0
25%,230.75,47.0,120.0,175.0,120.0,0.0,0.0,0.0
50%,460.5,54.0,130.0,223.0,140.0,0.5,0.0,1.0
75%,690.25,60.0,140.0,268.0,157.0,1.5,1.0,2.0
max,920.0,77.0,200.0,603.0,202.0,6.2,3.0,4.0


In [None]:
# Explore target variable distribution
data['num'].value_counts()




num
0    411
1    265
2    109
3    107
4     28
Name: count, dtype: int64

In [None]:
# Check for missing values
data.isnull().sum()

id            0
age           0
sex           0
dataset       0
cp            0
trestbps     59
chol         30
fbs          90
restecg       2
thalch       55
exang        55
oldpeak      62
slope       309
ca          611
thal        486
num           0
dtype: int64

In [None]:
# Handle Missing Data (Numeric + Categorical, Proximity-based)
from sklearn.impute import KNNImputer

# Make a copy of the data to work safely
df = data.copy()

# Step 1 — Drop columns with excessive missingness (>50%)
df = df.drop(columns=['ca', 'thal', 'slope'])  # too many NaNs

# Step 2 — KNN Imputation for numeric features
# Select numeric columns
numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()

# Initialize imputer
knn_imputer = KNNImputer(n_neighbors=5)

# Apply KNN imputation
df_numeric_imputed = pd.DataFrame(knn_imputer.fit_transform(df[numeric_cols]), columns=numeric_cols)

# Replace numeric columns with imputed values
df[numeric_cols] = df_numeric_imputed

# Proximity-based + fallback imputation for categorical columns
def fill_categorical_mode_with_fallback(df, target_col, group_cols):
    # Group-based fill using mode (proximity)
    df[target_col] = df.groupby(group_cols)[target_col].transform(
        lambda x: x.fillna(x.mode().iloc[0]) if not x.mode().empty else x
    )
    # Fallback global mode if still missing
    if df[target_col].isnull().any():
        global_mode = df[target_col].mode().iloc[0]
        df[target_col].fillna(global_mode, inplace=True)
    return df

# Handle remaining categorical columns manually
# These are known categorical / binary columns in Heart dataset
categorical_cols = ['sex', 'dataset', 'cp', 'fbs', 'restecg', 'exang']

# Apply proximity-mode imputation
df = fill_categorical_mode_with_fallback(df, 'fbs', ['sex'])
df = fill_categorical_mode_with_fallback(df, 'restecg', ['sex'])
df = fill_categorical_mode_with_fallback(df, 'exang', ['sex', 'cp'])

# Verify all missing values handled
print("Missing values per column after cleaning:\n")
print(df.isnull().sum())

# Final clean dataset
data_cleaned = df.copy()




Missing values per column after cleaning:

id          0
age         0
sex         0
dataset     0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalch      0
exang       0
oldpeak     0
num         0
dtype: int64


  lambda x: x.fillna(x.mode().iloc[0]) if not x.mode().empty else x
  lambda x: x.fillna(x.mode().iloc[0]) if not x.mode().empty else x


In [None]:
# Perform one-hot encoding for categorical variables
data_encoded = pd.get_dummies(data_cleaned, columns=categorical_cols, drop_first=True)

# Display new shape and head
data_encoded.shape, data_encoded.head()


((920, 18),
     id   age  trestbps   chol  thalch  oldpeak  num  sex_Male  \
 0  1.0  63.0     145.0  233.0   150.0      2.3  0.0      True   
 1  2.0  67.0     160.0  286.0   108.0      1.5  2.0      True   
 2  3.0  67.0     120.0  229.0   129.0      2.6  1.0      True   
 3  4.0  37.0     130.0  250.0   187.0      3.5  0.0      True   
 4  5.0  41.0     130.0  204.0   172.0      1.4  0.0     False   
 
    dataset_Hungary  dataset_Switzerland  dataset_VA Long Beach  \
 0            False                False                  False   
 1            False                False                  False   
 2            False                False                  False   
 3            False                False                  False   
 4            False                False                  False   
 
    cp_atypical angina  cp_non-anginal  cp_typical angina  fbs_True  \
 0               False           False               True      True   
 1               False           False      