# The Definitive EDA + Modelling Notebook on Heart Disease Data

![](https://media.giphy.com/media/3o6ZtrvsFui01kCPCg/source.gif)


**If you have trouble understanding something, please comment down below!**

In [None]:
from IPython.display import HTML

# Read the custom notebook stylesheet. The context manager closes the file
# handle as soon as the CSS text has been read (the original left it open).
with open("../input/notebookassets/blue.css") as css_file:
    f = css_file.read()

# Last expression of the cell, so the notebook renders the injected <style> tag.
HTML(f"<style>{f}</style>")

## Import Libraries and Data 📚

In [None]:
! pip install -q rich
! pip install -q dabl
! echo "Done Installing."

In [None]:
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import random

from rich.progress import track
from rich import print

import dabl

# Silence library warnings and let pandas print up to 500 rows/columns.
warnings.simplefilter("ignore")
for option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(option, 500)

# Global plot styling: seaborn theme first, then matplotlib defaults.
sns.set_style("darkgrid")
matplotlib.rcParams.update({
    'font.size': 14,
    'figure.figsize': (9, 5),
    'figure.facecolor': '#00000000',
})

# 15 evenly spaced colours sampled from the tab20b colormap.
cmap = plt.get_cmap('tab20b')
colors = list(map(cmap, np.linspace(0, 1, 15)))

# Font settings: hfont is passed to plot titles, shfont to pie-chart text.
hfont = dict(fontname='DejaVu Sans', fontsize=15)
shfont = dict(fontname='DejaVu Sans', fontsize=13)

In [None]:
# Load the heart-disease CSV into a DataFrame and preview its first rows.
data = pd.read_csv("../input/heart-disease-uci/heart.csv")
data.head()

In [None]:
# Column names, dtypes and non-null counts.
data.info()

In [None]:
# Summary statistics for the numeric columns.
data.describe()

#### Check Null Values in the Data

In [None]:
# Report the largest per-column count of missing values (0 means no column has NULLs).
print(f"[black]There are: [/black][bold red]{data.isna().sum().max()}[/bold red][black] maximum [/black][red]NULL[/red][black] values in the dataset[/black]")

## Exploratory Data Analysis 📈

Let's start with Exploratory Data Analysis on this dataset and see what it has to offer!

### Age Column 👴


Let's analyze the age column; later we'll see how it varies in conjunction with other variables.

In [None]:
# Histogram with a KDE overlay of patient ages; the title reports mean and std.
age_values = data['age']
age_mean, age_std = age_values.mean(), age_values.std()

_, age_axes = plt.subplots(figsize=(9, 8))
sns.histplot(age_values, color='#7b00ff', kde=True, ax=age_axes)
age_axes.set_title(f"Distribution of Ages [ μ : {age_mean:.2f}, σ : {age_std:.2f} ]")
age_axes.set_xlabel("Age")
plt.show()

### Sex ♀♂

This is the gender column. Its values are categorical (0: Female, 1: Male).

In [None]:
# value_counts() orders categories by frequency, not by code, so labels and
# colours are looked up from the actual codes (0: Female, 1: Male) instead of
# assuming which sex happens to be more frequent.
sex_counts = data['sex'].value_counts()
sex_names = {1: 'Male', 0: 'Female'}
sex_colors = {1: '#0008ff', 0: '#e833c6'}

labels = [sex_names[code] for code in sex_counts.index]
values = sex_counts.tolist()
pie_colors = [sex_colors[code] for code in sex_counts.index]

plt.figure(figsize=(9, 8))
plt.pie(x=values, labels=labels, autopct="%1.2f%%", colors=pie_colors, textprops=shfont)
plt.title("Gender Distribution", **hfont)
plt.show()

### Chest Pain Type (cp) 

This represents the class (or type) of Chest Pain that the patient has experienced or is experiencing.
This variable is also categorical. It has 4 categories.

In [None]:
# Share of each chest-pain category; counts are computed once and reused.
cp_counts = data['cp'].value_counts()
labels, values = cp_counts.index.tolist(), cp_counts.tolist()
pie_colors = ['#1c1f7a', '#1117b2', '#0008ff', '#5e63f2']

plt.figure(figsize=(9, 8))

# The third item returned by plt.pie holds the percentage text objects.
*_, ats = plt.pie(
    x=values,
    labels=labels,
    autopct="%1.2f%%",
    colors=pie_colors,
    textprops=shfont,
    startangle=90,
)
for pct_text in ats:
    pct_text.set_color('white')  # keep percentages readable on dark wedges

plt.title("Chest Pain Type", **hfont)
plt.show()

### Resting Blood Pressure (trestbps) 🆎

This represents the blood pressure of the patient while he/she was in a state of rest.
This variable is continuous.

In [None]:
# Resting blood pressure distribution; the title reports the column mean and std.
resting_bp = data['trestbps']
bp_title = f"Distribution of Resting BP [ μ : {resting_bp.mean():.2f}, σ : {resting_bp.std():.2f} ]"

plt.figure(figsize=(9, 8))
bp_axes = sns.histplot(resting_bp, color='#3662db', kde=True)
bp_axes.set_title(bp_title)
bp_axes.set_xlabel("Resting BP")
plt.show()

### Cholesterol (chol)

This represents the cholesterol level measured in the patient's blood.

This variable is continuous.

In [None]:
# Cholesterol distribution with KDE overlay; mean and std are shown in the title.
chol_values = data['chol']
chol_mean = chol_values.mean()
chol_std = chol_values.std()

plt.figure(figsize=(9, 8))
chol_axes = sns.histplot(chol_values, kde=True, color='#25058c')
chol_axes.set_title(f"Distribution of Cholestrol [ μ : {chol_mean:.2f}, σ : {chol_std:.2f} ]")
chol_axes.set_xlabel("Cholestrol Level")
plt.show()

### Fasting Blood Sugar (fbs) 🍯

This indicates whether the patient's fasting blood sugar level is > 120 mg/dl.
This is a binary categorical variable (1: true, 0: false).

In [None]:
# value_counts() orders categories by frequency, so labels and colours are
# looked up from the codes instead of assuming which class is the majority.
# NOTE(review): assumes fbs == 1 means "> 120 mg/dl" — confirm against the dataset card.
fbs_counts = data['fbs'].value_counts()
fbs_names = {0: 'fbs < 120 mg/dl', 1: 'fbs > 120 mg/dl'}
fbs_colors = {0: '#d893ed', 1: '#d363f2'}

labels = [fbs_names[code] for code in fbs_counts.index]
values = fbs_counts.tolist()
pie_colors = [fbs_colors[code] for code in fbs_counts.index]

plt.figure(figsize=(9, 8))

_, _, ats = plt.pie(x=values, labels=labels, autopct="%1.2f%%", colors=pie_colors, textprops=shfont, startangle=90)
for at in ats:
    at.set_color('white')  # white percentage text on the coloured wedges

plt.title("Fasting Blood Sugar", **hfont)
plt.show()

### Resting ECG (restecg) 💓

This represents Resting Electrocardiographic Results of a patient.
This variable is Categorical.

In [None]:
# Share of each resting-ECG category; counts are computed once and reused.
restecg_counts = data['restecg'].value_counts()
labels = restecg_counts.index.tolist()
values = restecg_counts.tolist()

plt.figure(figsize=(9, 8))

# explode is sized from the data so the cell keeps working if the number of
# categories differs from three (all zeros: no wedge is pulled out).
_, _, ats = plt.pie(
    x=values,
    labels=labels,
    autopct="%1.2f%%",
    colors=colors,
    explode=[0] * len(values),
    textprops=shfont,
)
for at in ats:
    at.set_color('white')  # white percentage text on the coloured wedges

plt.title("Resting ECG Result Distribution", **hfont)
plt.show()

### Maximum Heart Rate (thalach)

This represents maximum heart rate achieved by the patient.

This is a continuous Variable.

In [None]:
# Maximum heart rate distribution; summary statistics are embedded in the title.
max_hr = data['thalach']
hr_stats = f"μ : {max_hr.mean():.2f}, σ : {max_hr.std():.2f}"

plt.figure(figsize=(9, 8))
hr_axes = sns.histplot(max_hr, kde=True, color='#9f69ea')
hr_axes.set_title(f"Max. Heart Rate Achieved [ {hr_stats} ]")
hr_axes.set_xlabel("Maximum Heart Rate")
plt.show()

### Exercise Induced Angina (exang)

This represents if Angina (which is a type of pain that occurs when not enough blood flows to heart muscles) is present.

This is a Categorical Variable.

In [None]:
# value_counts() orders categories by frequency, so labels and colours are
# looked up from the codes instead of assuming which class is the majority.
# NOTE(review): assumes exang == 1 means angina was recorded — confirm against the dataset card.
exang_counts = data['exang'].value_counts()
exang_names = {0: 'Angina Not Recorded', 1: 'Angina Recorded'}
exang_colors = {0: '#7faaef', 1: '#0061ff'}

labels = [exang_names[code] for code in exang_counts.index]
values = exang_counts.tolist()

plt.figure(figsize=(9, 8))

_, _, ats = plt.pie(
    x=values,
    labels=labels,
    autopct="%1.2f%%",
    colors=[exang_colors[code] for code in exang_counts.index],
    textprops=shfont,
)
for at in ats:
    at.set_color('white')  # white percentage text on the coloured wedges

plt.title("Presence of Exercise Induced Angina", **hfont)
plt.show()

### Old Peak (oldpeak)

This represents the ST depression induced by exercise relative to rest.

This variable is continuous.

In [None]:
# ST depression (oldpeak) distribution with KDE; the title carries mean and std.
st_depression = data['oldpeak']

plt.figure(figsize=(9, 8))
st_axes = sns.histplot(st_depression, kde=True, color='#0026ff')
st_axes.set_title(
    "ST Depression [ μ : {:.2f}, σ : {:.2f} ]".format(st_depression.mean(), st_depression.std())
)
st_axes.set_xlabel("ST Depression")
plt.show()

### Number of Major Vessels (ca)

This represents the number of major vessels in the patient's body colored by fluoroscopy.

This variable is numeric but takes only discrete values in the range (0-4).

In [None]:
# Bar chart of how many patients fall into each `ca` value.
plt.figure(figsize=(9, 8))
# Pass the Series as x= explicitly: newer seaborn releases interpret the first
# positional argument as `data`, which changes how the Series is plotted.
sns.countplot(x=data['ca'], color='#c85ff4')
plt.title(f"Number of Major Vessels [ μ : {data['ca'].mean():.2f}, σ : {data['ca'].std():.2f} ]")
plt.xlabel("Number of Major Vessels")
plt.ylabel("Count")
plt.show()

### Thal

In the original UCI encoding the values map to conditions as: 3 = normal; 6 = fixed defect; 7 = reversible defect.

In this dataset the variable is categorical and stored as discrete codes in the range (0-3).

In [None]:
# Bar chart of how many patients fall into each `thal` code.
plt.figure(figsize=(9, 8))
# Pass the Series as x= explicitly: newer seaborn releases interpret the first
# positional argument as `data`, which changes how the Series is plotted.
sns.countplot(x=data['thal'], color='aqua')
plt.title(f"THAL [ μ : {data['thal'].mean():.2f}, σ : {data['thal'].std():.2f} ]")
plt.xlabel("THAL Result")
plt.ylabel("Count")
plt.show()

### Target

This is our target variable. It represents whether the patient suffered a heart attack (1) or didn't suffer a heart attack (0).

This is categorical and has only 2 possible values.

In [None]:
# value_counts() orders categories by frequency, so labels and colours are
# looked up from the target codes (1: event present, 0: not present) instead
# of assuming which class is the majority.
target_counts = data['target'].value_counts()
target_names = {1: 'Cardiovascular Event Present', 0: 'Cardiovascular Event Not Present'}
target_colors = {1: '#25c9ea', 0: '#5e77f2'}

labels = [target_names[code] for code in target_counts.index]
values = target_counts.tolist()
pie_colors = [target_colors[code] for code in target_counts.index]

plt.figure(figsize=(9, 8))

_, _, ats = plt.pie(x=values, labels=labels, autopct="%1.2f%%", colors=pie_colors, textprops=shfont)
for at in ats:
    at.set_color('white')  # white percentage text on the coloured wedges

plt.title("Target Value Distribution", **hfont)
plt.show()

### Age vs Sex

Let's see the relationship between Age and Sex using different plots.

In [None]:
# Work on a copy so the readable sex labels never touch the modelling data.
data_temp = data.copy(deep=True)
data_temp['sex'] = data['sex'].map({1: 'Male', 0: 'Female'})

fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(9, 5))

# One (plot function, palette, title) entry per panel, drawn left to right.
sex_panels = [
    (sns.boxplot, 'tab20b', "Box Plot of Age vs Sex"),
    (sns.violinplot, 'tab20b_r', "Violin Plot of Age vs Sex"),
]
for panel_axes, (draw_panel, panel_palette, panel_title) in zip(ax, sex_panels):
    draw_panel(x='sex', y='age', data=data_temp, ax=panel_axes, palette=panel_palette)
    panel_axes.set_title(panel_title)
    panel_axes.set_xlabel("Sex")
    panel_axes.set_ylabel("Age")

plt.show()

### Relationship Between Resting BP and Cholesterol

Now let's turn to Resting Blood Pressure and Cholesterol levels.

In [None]:
# Filled bivariate KDE of resting blood pressure against cholesterol.
plt.figure(figsize=(7, 7))
# `fill=` is the current name of the deprecated `shade=` argument.
sns.kdeplot(x=data['trestbps'], y=data['chol'], fill=True, color='blue')
plt.xlabel("Resting BP")
plt.ylabel("Cholestrol Level")
plt.title("KDE Plot between Cholestrol and Resting BP")

plt.show()

In [None]:
# KDE joint plot (joint density plus marginals) of resting BP against cholesterol.
print("[red bold]Joint-Plot between Resting BP and Cholestrol[/red bold]")
sns.jointplot(data=data, x='trestbps', y='chol', kind='kde', color='darkblue')
plt.show()

### Relationship between Age and Cholesterol

Let's now visualize the relationship between Age and Cholesterol levels.

In [None]:
# Joint plot of age against cholesterol, coloured by sex. Uses data_temp so
# the hue legend shows the 'Male' / 'Female' labels rather than 0 / 1.
print("[red bold]Joint-Plot between Cholestrol and Age Separated by Gender[/red bold]")
# `height=` is the current name of jointplot's old `size=` argument; it sets
# the side length of the square figure.
sns.jointplot(x='age', y='chol', data=data_temp, color='purple', hue='sex', height=8)
plt.show()

### Age vs Gender Kernel Density Estimation Plot

In [None]:
# Age samples per sex (data_temp holds the 'Male' / 'Female' labels).
male_ages = data_temp.loc[data_temp['sex'] == 'Male', 'age']
female_ages = data_temp.loc[data_temp['sex'] == 'Female', 'age']
avg_male_age = male_ages.mean()
avg_female_age = female_ages.mean()

plt.figure(figsize=(16, 6))

# Draw KDE plots. `fill=` is the current name of the deprecated `shade=`, and
# the Series is passed as x= because newer seaborn treats the first
# positional argument as `data`.
sns.kdeplot(x=male_ages, label='Male', fill=True, color='darkred')
sns.kdeplot(x=female_ages, label='Female', fill=True, color='darkblue')

# Vertical lines marking the average age of each group.
plt.axvline(avg_male_age, color='aqua', linestyle='-', linewidth=1)
plt.axvline(avg_female_age, color='magenta', linestyle='-', linewidth=1)

# Place both annotations near the top of the axes; the limits are read once,
# after everything that could affect them has been drawn.
min_ylim, max_ylim = plt.ylim()
plt.text(avg_male_age-14, max_ylim*0.93, 'Avg. Male Age: {} yrs'.format(int(avg_male_age)))
plt.text(avg_female_age*1.05, max_ylim*0.93, 'Avg. Female Age: {} yrs'.format(int(avg_female_age)))

plt.title("Age-Gender KDE Plot")
plt.xlabel("Age")
plt.legend()
plt.show()

### Oldpeak vs Cholesterol Levels

In [None]:
# KDE joint plot (joint density plus marginals) of ST depression against cholesterol.
print("[red bold]Joint-Plot between Oldpeak and Cholestrol Levels[/red bold]")
sns.jointplot(data=data, x='oldpeak', y='chol', kind='kde', color='magenta')
plt.show()

### Target vs Age Columns in Box plot and Violin Plot

In [None]:
# Map the labels from the untouched `data` frame rather than from
# data_temp itself, so re-running this cell does not try to re-map the
# already-mapped strings (which fails on astype(int)).
data_temp['target'] = data['target'].astype(int).map({0: 'No Problem', 1: 'Problem'})

# Side-by-side box and violin plots of age for each target class.
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(9, 5))
sns.boxplot(x='target', y='age', data=data_temp, ax=ax[0], palette='tab20')
ax[0].set_title("Box Plot of Age vs Target")
ax[0].set_xlabel("Target")
ax[0].set_ylabel("Age")

sns.violinplot(x='target', y='age', data=data_temp, ax=ax[1], palette='tab20b')
ax[1].set_title("Violin Plot of Age vs Target")
ax[1].set_xlabel("Target")
ax[1].set_ylabel("Age")

plt.show()

### DABL Plot

Let's now use DABL to see different plots.

In [None]:
# Switch matplotlib to the "classic" style; plt.style.use changes global
# settings, so it also affects any figure drawn after this cell.
plt.style.use("classic")
# Let dabl draw its automatic set of plots of the features against `target`.
dabl.plot(data, target_col='target')

## Modelling 📳

Let's now do some basic modelling and see how they perform on this data.

In [None]:
# Now let's split the data
# Separate the feature matrix from the labels.
feats = data.drop(['target'], axis=1).values.astype(float)
targets = data['target'].values

# Number of rows used for training; the remainder forms the validation set.
TRAIN_SIZE = 290

# Shuffle rows (fixed seed for reproducibility) before splitting: slicing an
# unshuffled file leaves the validation tail dominated by whatever ordering
# the CSV has (e.g. rows sorted by label).
split_rng = np.random.default_rng(42)
shuffled_rows = split_rng.permutation(len(feats))
feats, targets = feats[shuffled_rows], targets[shuffled_rows]

# Standardise every feature column separately (axis=0). A single global
# mean/std would let large-valued columns dominate. The statistics come from
# the training rows only, so nothing leaks from the validation set.
feat_mean = feats[:TRAIN_SIZE].mean(axis=0)
feat_std = feats[:TRAIN_SIZE].std(axis=0)
feat_std[feat_std == 0] = 1.0  # constant column: avoid division by zero
feats = (feats - feat_mean) / feat_std

trainX, validX = feats[:TRAIN_SIZE], feats[TRAIN_SIZE:]
trainY, validY = targets[:TRAIN_SIZE], targets[TRAIN_SIZE:]

print(f"[black]Training Data Shapes:[/black] {trainX.shape, trainY.shape}")
print(f"[black]Validation Data Shapes:[/black] {validX.shape, validY.shape}")

### 1. Logistic Regression from Scratch

The code below implements Logistic Regression from scratch using only numpy.

It isn't expected to outperform library implementations; I'm writing it to illustrate the concept.

In [None]:
class LogisticRegression:
    """
    Binary logistic regression trained with batch gradient descent (NumPy only).

    Parameters
    ----------
    learning_rate : float
        Step size applied to every gradient update.
    num_iters : int
        Number of gradient-descent iterations run by ``fit``.
    fit_intercept : bool, default True
        When True, a column of ones is prepended to ``X`` so the first weight
        acts as the bias term.
    verbose : bool, default True
        When True, ``fit`` shows a progress bar and prints the loss after the
        first and the last iteration. When False, training is silent.
    """

    # Probabilities are clipped this far away from 0 and 1 before taking logs,
    # so a saturated sigmoid cannot turn the loss into inf/NaN.
    _EPS = 1e-15

    def __init__(self, learning_rate, num_iters, fit_intercept = True, verbose = True):
        self.learning_rate = learning_rate
        self.num_iters = num_iters
        self.fit_intercept = fit_intercept
        self.verbose = verbose

    def __repr__(self):
        return f"[black]LogisticRegression Object with [lr: {self.learning_rate}, num_iters: {self.num_iters}][/black]"

    def __add_intercept(self, X):
        """Return ``X`` with a leading column of ones (the bias feature)."""
        intercept = np.ones((X.shape[0], 1))
        return np.concatenate((intercept, X), axis=1)

    def __sigmoid(self, z):
        """Element-wise logistic function 1 / (1 + exp(-z))."""
        return 1 / (1 + np.exp(-z))

    def __loss(self, h, y):
        """Mean binary cross-entropy between probabilities ``h`` and labels ``y``."""
        h = np.clip(h, self._EPS, 1 - self._EPS)
        return (-y * np.log(h) - (1 - y) * np.log(1 - h)).mean()

    def fit(self, X, y):
        """
        Learn the weights by batch gradient descent.

        Weights start from uniform random values in [0, 1) and are stored on
        ``self.theta``; the same array is returned.
        """
        if self.fit_intercept:
            X = self.__add_intercept(X)
        self.theta = np.random.rand(X.shape[1])

        # The progress bar is part of the verbose output; a plain range keeps
        # silent training free of any console output.
        if self.verbose:
            steps = track(range(self.num_iters), description="[bold red]Training...")
        else:
            steps = range(self.num_iters)

        last_iter = self.num_iters - 1
        for i in steps:
            h = self.__sigmoid(np.dot(X, self.theta))
            gradient = np.dot(X.T, (h - y)) / y.size
            self.theta -= self.learning_rate * gradient

            # The loss is only reported on the first and last iteration, so
            # the extra forward pass is skipped everywhere else.
            if self.verbose and (i == 0 or i == last_iter):
                loss = self.__loss(self.__sigmoid(np.dot(X, self.theta)), y)
                if i == 0:
                    print(f"[black]Initial loss:[/black]   {loss}")
                if i == last_iter:
                    print(f"[black]Final loss:     [/black]{loss}")

        return self.theta

    def predict_probability(self, X):
        """Return the predicted probability of class 1 for every row of ``X``."""
        if self.fit_intercept:
            X = self.__add_intercept(X)
        return self.__sigmoid(np.dot(X, self.theta))

    def predict(self, X):
        """Return the class predictions (0.0 or 1.0) obtained by rounding the probabilities."""
        return (self.predict_probability(X).round())

In [None]:
# Build the from-scratch model: step size 0.1, 150,000 gradient-descent iterations.
model = LogisticRegression(learning_rate = 0.1, num_iters=150000, verbose=True)
print(model)

In [None]:
# Train the model
# fit() returns the learned weight vector (also stored on model.theta).
model_weights = model.fit(trainX, trainY)

In [None]:
# Predict on the held-out rows and report the share of correct predictions as a percentage.
preds = model.predict(validX)
print(f"[black]Validation Accuracy:[/black] {(validY == preds).mean()*100:.2f} %")

In [None]:
# Placeholder notice: the notebook is still a work in progress.
print("[bold green]UNDER WORK! MORE CONTENT TO BE ADDED SOON![/bold green]")