# Import The Libraries

In [107]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler

# Reading The Data

In [108]:
df = pd.read_csv('/content/heart_disease_uci.csv')
df

Unnamed: 0,id,age,sex,dataset,cp,trestbps,chol,fbs,restecg,thalch,exang,oldpeak,slope,ca,thal,num
0,1,63,Male,Cleveland,typical angina,145.0,233.0,True,lv hypertrophy,150.0,False,2.3,downsloping,0.0,fixed defect,0
1,2,67,Male,Cleveland,asymptomatic,160.0,286.0,False,lv hypertrophy,108.0,True,1.5,flat,3.0,normal,2
2,3,67,Male,Cleveland,asymptomatic,120.0,229.0,False,lv hypertrophy,129.0,True,2.6,flat,2.0,reversable defect,1
3,4,37,Male,Cleveland,non-anginal,130.0,250.0,False,normal,187.0,False,3.5,downsloping,0.0,normal,0
4,5,41,Female,Cleveland,atypical angina,130.0,204.0,False,lv hypertrophy,172.0,False,1.4,upsloping,0.0,normal,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
915,916,54,Female,VA Long Beach,asymptomatic,127.0,333.0,True,st-t abnormality,154.0,False,0.0,,,,1
916,917,62,Male,VA Long Beach,typical angina,,139.0,False,st-t abnormality,,,,,,,0
917,918,55,Male,VA Long Beach,asymptomatic,122.0,223.0,True,st-t abnormality,100.0,False,0.0,,,fixed defect,2
918,919,58,Male,VA Long Beach,asymptomatic,,385.0,True,lv hypertrophy,,,,,,,0


In [109]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 920 entries, 0 to 919
Data columns (total 16 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   id        920 non-null    int64  
 1   age       920 non-null    int64  
 2   sex       920 non-null    object 
 3   dataset   920 non-null    object 
 4   cp        920 non-null    object 
 5   trestbps  861 non-null    float64
 6   chol      890 non-null    float64
 7   fbs       830 non-null    object 
 8   restecg   918 non-null    object 
 9   thalch    865 non-null    float64
 10  exang     865 non-null    object 
 11  oldpeak   858 non-null    float64
 12  slope     611 non-null    object 
 13  ca        309 non-null    float64
 14  thal      434 non-null    object 
 15  num       920 non-null    int64  
dtypes: float64(5), int64(3), object(8)
memory usage: 115.1+ KB


# Preparing The Data

In [110]:
df = df.drop('id', axis= 1)
df = df.drop('dataset' , axis = 1)
df = df.rename(columns={'num': 'disease_state'})

In [111]:
df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalch,exang,oldpeak,slope,ca,thal,disease_state
0,63,Male,typical angina,145.0,233.0,True,lv hypertrophy,150.0,False,2.3,downsloping,0.0,fixed defect,0
1,67,Male,asymptomatic,160.0,286.0,False,lv hypertrophy,108.0,True,1.5,flat,3.0,normal,2
2,67,Male,asymptomatic,120.0,229.0,False,lv hypertrophy,129.0,True,2.6,flat,2.0,reversable defect,1
3,37,Male,non-anginal,130.0,250.0,False,normal,187.0,False,3.5,downsloping,0.0,normal,0
4,41,Female,atypical angina,130.0,204.0,False,lv hypertrophy,172.0,False,1.4,upsloping,0.0,normal,0


# Collecting the Columns according to data type

In [112]:
numeric_columns = df.select_dtypes(include=np.number).columns
categorical_columns = df.select_dtypes(exclude=np.number).columns

In [114]:
print(numeric_columns)
print(categorical_columns)

Index(['age', 'trestbps', 'chol', 'thalch', 'oldpeak', 'ca', 'disease_state'], dtype='object')
['sex', 'cp', 'restecg', 'slope', 'thal']
['fbs', 'exang']


# Prepare The Numerical Data

In [115]:
df['age'] = df['age'].replace(0, np.nan)
df['ca'] = df['ca'].mask(~df['ca'].isin([0, 1, 2, 3]), np.nan)

# Prepare The Categorical Data

In [116]:
df['sex'] = df['sex'].mask(~df['sex'].isin(['Male' , 'Female']), np.nan)
df['cp'] = df['cp'].mask(~df['cp'].isin(['typical angina', 'atypical angina' , 'non-anginal' , 'asymptomatic']), np.nan)
df['restecg'] = df['restecg'].mask(~df['restecg'].isin(['normal', 'stt abnormality', 'lv hypertrophy']), np.nan)
df['slope'] = df['slope'].mask(~df['slope'].isin(['upsloping', 'flat', 'downsloping']), np.nan)
df['thal'] = df['thal'].mask(~df['thal'].isin(['normal', 'fixed defect', 'reversable defect']), np.nan)
df['fbs'] = df['fbs'].mask(~df['fbs'].isin(['True' , 'False']), np.nan)
df['exang'] = df['exang'].mask(~df['exang'].isin(['True' , 'False']), np.nan)

# Split The data For Train and Test

In [118]:
train_data = df.sample(frac=0.8, random_state=42)
test_data = df.drop(train_data.index)

# Handle Missing Values

In [119]:
threshold = 80

# Drop The Columns That Has Nulls Greater Than The Threshold

# For Train Data

In [120]:
for col in train_data.columns:
  col_data = (train_data[col].isna().sum()/ len(train_data)) * 100
  if col_data > threshold:
    print(f"The {col} Has Nulls Greater Than {threshold}%")
    train_data = train_data.drop(col, axis=1)
  else:
    print(f"The {col} Has Nulls = {col_data}%")

The age Has Nulls = 0.0%
The sex Has Nulls = 0.0%
The cp Has Nulls = 0.0%
The trestbps Has Nulls = 6.7934782608695645%
The chol Has Nulls = 3.3967391304347823%
The fbs Has Nulls Greater Than 80%
The restecg Has Nulls = 19.429347826086957%
The thalch Has Nulls = 6.25%
The exang Has Nulls Greater Than 80%
The oldpeak Has Nulls = 7.201086956521739%
The slope Has Nulls = 35.05434782608695%
The ca Has Nulls = 66.57608695652173%
The thal Has Nulls = 51.902173913043484%
The disease_state Has Nulls = 0.0%


# For Test Data

In [121]:
for col in test_data.columns:
  col_data = (test_data[col].isna().sum()/ len(test_data)) * 100
  if col_data > threshold:
    print(f"The {col} Has Nulls Greater Than {threshold}%")
    test_data = test_data.drop(col, axis=1)
  else:
    print(f"The {col} Has Nulls = {col_data}%")

The age Has Nulls = 0.0%
The sex Has Nulls = 0.0%
The cp Has Nulls = 0.0%
The trestbps Has Nulls = 4.891304347826087%
The chol Has Nulls = 2.717391304347826%
The fbs Has Nulls Greater Than 80%
The restecg Has Nulls = 20.652173913043477%
The thalch Has Nulls = 4.891304347826087%
The exang Has Nulls Greater Than 80%
The oldpeak Has Nulls = 4.891304347826087%
The slope Has Nulls = 27.717391304347828%
The ca Has Nulls = 65.76086956521739%
The thal Has Nulls = 56.52173913043478%
The disease_state Has Nulls = 0.0%


# Handle Missing Values In Numerical Columns

In [None]:
def has_outliers(series):
    Q1 = series.quantile(0.25)
    Q3 = series.quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    return ((series < lower_bound) | (series > upper_bound)).any()

for col in numeric_columns:
    skewness = train_data[col].skew()
    outliers_present = has_outliers(train_data[col].dropna())
    if abs(skewness) < 0.5 and not outliers_present:
        train_data[col].fillna(train_data[col].mean(), inplace=True)
        print(f"{col} will be filled with mean")
    else:
        train_data[col].fillna(train_data[col].median(), inplace=True)
        print(f"{col} will be filled with median")

In [None]:
def has_outliers(series):
    Q1 = series.quantile(0.25)
    Q3 = series.quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    return ((series < lower_bound) | (series > upper_bound)).any()

for col in numeric_columns:
    skewness = test_data[col].skew()
    outliers_present = has_outliers(test_data[col].dropna())
    if abs(skewness) < 0.5 and not outliers_present:
        test_data[col].fillna(train_data[col].mean(), inplace=True)
        print(f"{col} will be filled with mean")
    else:
        test_data[col].fillna(train_data[col].median(), inplace=True)
        print(f"{col} will be filled with median")

# Missing Handle Values In Categorical Values

In [None]:
for col in categorical_columns:
    mode_value = train_data[col].mode()[0]
    train_data[col].fillna(mode_value, inplace=True)
    print(f"{col}: Imputed missing values with mode = {mode_value}")

In [None]:
for col in categorical_columns:
    mode_value = train_data[col].mode()[0]
    test_data[col].fillna(mode_value, inplace=True)
    print(f"{col}: Imputed missing values with mode = {mode_value}")

# Apply One Hot Encoding For Categorical Data

In [None]:
train_data = pd.get_dummies(train_data, columns=categorical_columns, drop_first=True)
test_data = pd.get_dummies(test_data, columns=categorical_columns, drop_first=True)

# Apply MinMaxScaler on Numerical data

In [None]:
scaler = StandardScaler()
for col in numeric_columns:
    train_data[col] = scaler.fit_transform(train_data[[col]])
    test_data[col] = scaler.transform(test_data[[col]])

# Conduct Exploratory Data Analysis (EDA)

In [None]:
sns.set(style='whitegrid', palette='pastel')

# 📊 1. Histograms – Visualize Distributions

## For Train Data

In [None]:
train_data.hist(figsize=(15, 12), bins=20, edgecolor='black')
plt.suptitle("Feature Distributions", fontsize=16)
plt.tight_layout()
plt.show()

## For Test Data

In [None]:
test_data.hist(figsize=(15, 12), bins=20, edgecolor='black')
plt.suptitle("Feature Distributions", fontsize=16)
plt.tight_layout()
plt.show()

# 2. Correlation Heatmap – Feature Relationships

## For Train Data

In [None]:
plt.figure(figsize=(14, 10))
correlation_matrix = train_data.corr()
sns.heatmap(correlation_matrix, annot=True, fmt=".2f", cmap="coolwarm", linewidths=0.5)
plt.title("Correlation Heatmap")
plt.show()

## For Test Data

In [None]:
plt.figure(figsize=(14, 10))
correlation_matrix = test_data.corr()
sns.heatmap(correlation_matrix, annot=True, fmt=".2f", cmap="coolwarm", linewidths=0.5)
plt.title("Correlation Heatmap")
plt.show()

# 📦 3. Boxplots – Detect Outliers & Compare Groups

## For Train Data

In [None]:
plt.figure(figsize=(15, 10))
for i, col in enumerate(numeric_columns, 1):
    plt.subplot(2, 3, i)
    sns.boxplot(x='target', y=col, data=train_data)
    plt.title(f"{col} vs Target")
plt.tight_layout()
plt.show()

## For Test Data

In [None]:
plt.figure(figsize=(15, 10))
for i, col in enumerate(numeric_columns, 1):
    plt.subplot(2, 3, i)
    sns.boxplot(x='target', y=col, data=test_data)
    plt.title(f"{col} vs Target")
plt.tight_layout()
plt.show()

# 4. Countplot of Categorical Columns

## For Train Data

In [None]:
plt.figure(figsize=(12, 6))
for i, col in enumerate(categorical_columns, 1):
    plt.subplot(1, 3, i)
    sns.countplot(x=col, hue='target', data=train_data)
    plt.title(f"{col} by Heart Disease")
plt.tight_layout()
plt.show()

## For Test Data

In [None]:
plt.figure(figsize=(12, 6))
for i, col in enumerate(categorical_columns, 1):
    plt.subplot(1, 3, i)
    sns.countplot(x=col, hue='target', data=test_data)
    plt.title(f"{col} by Heart Disease")
plt.tight_layout()
plt.show()