# Heart Disease Prediction – EDA and Data Preprocessing

This notebook performs exploratory data analysis (EDA), data cleaning, preprocessing, and visualization on the heart disease dataset.

## 📥 Data Loading

In [None]:

import pandas as pd

# Read heart_cleveland_upload.csv from the working directory into `df`.
# If the file is missing, report it and leave `df` as None so that cells
# guarded by `df is not None` skip their work instead of raising.
try:
    df = pd.read_csv('heart_cleveland_upload.csv')
except FileNotFoundError:
    df = None
    print("❌ Error: heart_cleveland_upload.csv not found. Please make sure the file is in the correct directory.")
else:
    print("✅ Dataset loaded successfully.")


## 🧹 Data Cleaning: Initial Checks

In [None]:

# Basic checks: column structure, missing values per column, summary
# statistics and a preview of the first rows.
# Skipped entirely when the dataset failed to load (df is None).
if df is not None:
    print("\n--- Dataset Info ---")
    # DataFrame.info() prints its report itself and returns None, so it is
    # called directly; wrapping it in display() would render a stray "None".
    df.info()

    print("\n--- Missing Values ---")
    display(df.isnull().sum())

    print("\n--- Summary Statistics ---")
    display(df.describe())

    print("\n--- First 5 Rows ---")
    display(df.head())


## 📊 Exploratory Data Analysis (EDA)

### 🔸 Distribution of Heart Disease Cases

In [None]:

import matplotlib.pyplot as plt
import seaborn as sns

# Pie chart showing what share of rows falls under each value of `condition`.
# Drawn only when the dataset loaded and actually has that column.
if (df is not None) and ('condition' in df.columns):
    # Number of rows per distinct `condition` value.
    condition_counts = df['condition'].value_counts()

    plt.figure(figsize=(8, 8))
    plt.pie(
        condition_counts,
        labels=condition_counts.index,
        autopct='%1.1f%%',  # percentage with one decimal on each wedge
        startangle=140,
    )
    plt.title('Distribution of Heart Condition')
    plt.show()


### 🔸 Age vs. Maximum Heart Rate by Condition

In [None]:

# Scatter plot of age against maximum heart rate achieved (`thalach`),
# coloured by `condition` when that column is available.
if df is not None and 'age' in df.columns and 'thalach' in df.columns:
    # The guard above only guarantees 'age' and 'thalach'. Treat the hue
    # column as optional so a frame without 'condition' still yields an
    # (uncoloured) plot instead of raising inside seaborn.
    hue_column = 'condition' if 'condition' in df.columns else None

    plt.figure(figsize=(10, 6))
    sns.scatterplot(data=df, x='age', y='thalach', hue=hue_column)
    plt.title('Scatter Plot of Age vs. Maximum Heart Rate')
    plt.xlabel('Age')
    plt.ylabel('Maximum Heart Rate Achieved')
    plt.show()


### 🔸 Chest Pain Type Count by Condition

In [None]:

# Bar chart of how many rows have each chest pain type (`cp`), split by
# `condition` when that column is available.
if df is not None and 'cp' in df.columns:
    # Category codes and the label shown for each, in plotting order.
    # NOTE(review): assumes `cp` is coded 0-3 with this meaning — confirm
    # against the dataset's data dictionary.
    cp_codes = [0, 1, 2, 3]
    cp_labels = ['Typical Angina', 'Atypical Angina', 'Non-anginal Pain', 'Asymptomatic']

    # The guard above only guarantees 'cp'; make the hue column optional so a
    # frame without 'condition' still plots instead of raising in seaborn.
    hue_column = 'condition' if 'condition' in df.columns else None

    plt.figure(figsize=(10, 6))
    # `order` pins code k to tick position k. Without it, a code that is
    # absent from the data would shift the remaining bars and the fixed
    # labels below would be attached to the wrong categories.
    sns.countplot(data=df, x='cp', hue=hue_column, order=cp_codes)
    plt.title('Count of Chest Pain Types by Condition')
    plt.xlabel('Chest Pain Type')
    plt.ylabel('Count')
    plt.xticks(ticks=cp_codes, labels=cp_labels)
    plt.show()


### 🔸 Correlation Heatmap of Numerical Features

In [None]:

# Annotated heatmap of pairwise correlations between the numeric columns.
# Skipped when the dataset failed to load.
if df is not None:
    # Restrict to numeric dtypes; the correlation is computed on these only.
    numerical_df = df.select_dtypes(include=['number'])

    # Only plot when there are at least two numeric columns to compare.
    if numerical_df.shape[1] >= 2:
        correlation_matrix = numerical_df.corr()

        plt.figure(figsize=(12, 10))
        sns.heatmap(
            correlation_matrix,
            annot=True,       # write the coefficient in every cell
            fmt=".2f",        # two decimal places
            cmap='coolwarm',
        )
        plt.title('Correlation Heatmap')
        plt.show()
