## Importing Python Libraries for EDA Analysis

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import warnings 
warnings.filterwarnings('ignore')

Reading Data From CSV file

In [2]:
df = pd.read_csv('data/heart.csv')
df = pd.DataFrame(df)

Shape of the DataSet

In [3]:
df.shape

(918, 12)

# Attribute Information
~ Age: age of the patient [years] 

~ Sex: sex of the patient [M: Male, F: Female] 

~ ChestPainType: chest pain type [TA: Typical Angina, ATA: Atypical Angina, NAP: Non-Anginal Pain, ASY: Asymptomatic]

~ RestingBP: resting blood pressure [mm Hg]

~ Cholesterol: serum cholesterol [mm/dl]

~ FastingBS: fasting blood sugar [1: if FastingBS > 120 mg/dl, 0: otherwise]

~ RestingECG: resting electrocardiogram results [Normal: Normal, ST: having ST-T wave abnormality (T wave inversions and/or ST elevation or depression of > 0.05 mV), LVH: showing probable or definite left ventricular hypertrophy by Estes' criteria]

~ MaxHR: maximum heart rate achieved [Numeric value between 60 and 202]

~ ExerciseAngina: exercise-induced angina [Y: Yes, N: No]

~ Oldpeak: oldpeak = ST [Numeric value measured in depression]

~ ST_Slope: the slope of the peak exercise ST segment [Up: upsloping, Flat: flat, Down: downsloping]

~ HeartDisease: output class [1: heart disease, 0: Normal]

Checking Missing Values From the DataSet

In [4]:
df.isnull().sum()

Age               0
Sex               0
ChestPainType     0
RestingBP         0
Cholesterol       0
FastingBS         0
RestingECG        0
MaxHR             0
ExerciseAngina    0
Oldpeak           0
ST_Slope          0
HeartDisease      0
dtype: int64

Checking for Duplicate Rows in DataSet

In [5]:
df.duplicated()

0      False
1      False
2      False
3      False
4      False
       ...  
913    False
914    False
915    False
916    False
917    False
Length: 918, dtype: bool

Dropping the Duplicates from teh DataSet and Filling the Null Values

In [6]:
df.drop_duplicates

<bound method DataFrame.drop_duplicates of      Age Sex ChestPainType  RestingBP  Cholesterol  FastingBS RestingECG  \
0     40   M           ATA        140          289          0     Normal   
1     49   F           NAP        160          180          0     Normal   
2     37   M           ATA        130          283          0         ST   
3     48   F           ASY        138          214          0     Normal   
4     54   M           NAP        150          195          0     Normal   
..   ...  ..           ...        ...          ...        ...        ...   
913   45   M            TA        110          264          0     Normal   
914   68   M           ASY        144          193          1     Normal   
915   57   M           ASY        130          131          0     Normal   
916   57   F           ATA        130          236          0        LVH   
917   38   M           NAP        138          175          0     Normal   

     MaxHR ExerciseAngina  Oldpeak ST_Slope 

Checking the Number of Unique Values in DataSet

In [7]:
df.nunique()

Age                50
Sex                 2
ChestPainType       4
RestingBP          67
Cholesterol       222
FastingBS           2
RestingECG          3
MaxHR             119
ExerciseAngina      2
Oldpeak            53
ST_Slope            3
HeartDisease        2
dtype: int64

Checking the Statistics of the DataSet

In [8]:
df.describe()

Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak,HeartDisease
count,918.0,918.0,918.0,,,,


## Removing Outliers from the Dataset

In [9]:
def remove_outliers(df):
    # Select only numeric columns
    numeric_cols = df.select_dtypes(include=['number']).columns
    
    # Skip binary columns (like FastingBS and HeartDisease)
    numeric_cols = [col for col in numeric_cols if df[col].nunique() > 2]
    
    # Create a copy of the dataframe
    df_clean = df.copy()
    
    # Apply IQR method to each numeric column
    for col in numeric_cols:
        Q1 = df[col].quantile(0.25)
        Q3 = df[col].quantile(0.75)
        IQR = Q3 - Q1
        
        # Define bounds
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR
        
        # Filter outliers
        df_clean = df_clean[(df_clean[col] >= lower_bound) & (df_clean[col] <= upper_bound)]
    
    return df_clean

In [10]:
# Apply the outlier removal function
df_no_outliers = remove_outliers(df)

# Print the results
print(f"Original dataset shape: {df.shape}")
print(f"Dataset shape after removing outliers: {df_no_outliers.shape}")
print(f"Number of outliers removed: {df.shape[0] - df_no_outliers.shape[0]}")

Original dataset shape: (918, 12)
Dataset shape after removing outliers: (784, 12)
Number of outliers removed: 134


In [11]:
# Visualize the distribution before and after outlier removal
numeric_cols = df.select_dtypes(include=['number']).columns
numeric_cols = [col for col in numeric_cols if df[col].nunique() > 2]

In [12]:
# Create boxplots to compare before and after
fig, axes = plt.subplots(len(numeric_cols), 2, figsize=(15, 4*len(numeric_cols)))

for i, col in enumerate(numeric_cols):
    # Before outlier removal
    sns.boxplot(x=df[col], ax=axes[i, 0])
    axes[i, 0].set_title(f'{col} - Before Outlier Removal')
    
    # After outlier removal
    sns.boxplot(x=df_no_outliers[col], ax=axes[i, 1])
    axes[i, 1].set_title(f'{col} - After Outlier Removal')

plt.tight_layout()