# Coronary Artery Disease Prediction Using ML 

This notebook walks through the steps for predicting coronary artery disease with three classifiers: the Naive Bayes algorithm, the K-Nearest Neighbours algorithm and the Decision Tree algorithm.

The dataset used in this project is the [Heart Disease Patient Dataset](https://www.kaggle.com/datasets/johnsmith88/heart-disease-dataset).

# Importing Required Libraries
The first step in formulating the solution is to import all the libraries needed for the data handling, visualization and modelling operations carried out during development.

In [None]:
# Importing necessary libraries
import pandas as pd
import numpy as np  
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib import rcParams
from matplotlib.cm import rainbow
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
from sklearn.model_selection import train_test_split,cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix, accuracy_score, roc_curve,auc, classification_report
from collections import Counter

# Importing the Selected Dataset
All the data stored in the dataset's CSV file is imported and explored in the form of a data frame.

In [None]:
# Load the patient records from the CSV file into the DataFrame 'df'
df = pd.read_csv("HeartDisease_Patients.csv")
# Leave 'df' as the last expression so the notebook renders it as the cell output
df

In [None]:
# Re-read the same CSV file (it was already loaded into 'df' in the previous cell)
df = pd.read_csv("HeartDisease_Patients.csv")
# List the column names of the DataFrame
print(df.columns)

# Data Exploration

Data exploration involves checking the dataset information, identifying missing values, and providing a summary of descriptive statistics to gain insights into the structure and characteristics of the data.

In [None]:
df.info() # Summarising the DataFrame: every column with its data type and number of non-null cells

In [None]:
df.isnull().sum() # Counting the missing values in each column of the DataFrame 'df'

In [None]:
df.describe() # Generating descriptive statistics (count, mean, std, min, quartiles, max) for the DataFrame

In [None]:
df.isnull().sum() # Counting missing values per column again (same check as the earlier cell)

In [None]:
# Display the DataFrame as the cell output
df

# Data Visualization
Data visualization includes creating a heatmap to explore correlations among features, generating histograms for individual features, and visualizing the distribution of the target variable through a countplot.

In [None]:
# Correlation heatmap of the attributes in 'df'
# Pairwise correlation matrix of the DataFrame's columns
corrmat = df.corr()
# Every column present in the correlation matrix (kept for any later cell that refers to it)
top_corr_features = corrmat.index
plt.figure(figsize=(20,10))
# Plot the matrix computed above; calling .corr() again on df[top_corr_features]
# would only recompute the same values, because top_corr_features is corrmat's own index
g=sns.heatmap(corrmat,annot=True,cmap="RdYlGn")

In [None]:
df.hist() # Plotting one histogram per numerical column of the DataFrame

In [None]:
# Use the white-grid background for the seaborn plots
sns.set_style('whitegrid')
# Bar chart of how many rows fall into each value of the 'target' column
sns.countplot(data=df, x='target', palette='PuRd')

# Data preprocessing

Data preprocessing involves creating dummy variables for categorical features, standardizing selected numerical features, and transforming the dataset to prepare it for machine learning models.

In [None]:
# One-hot encode the categorical columns; every other column of 'df' is carried over unchanged
categorical_columns = ['gender', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'ca', 'thal']
dataset = pd.get_dummies(df, columns=categorical_columns)

In [None]:
# Standardize the continuous columns of the encoded dataset in place
columns_to_scale = ['age', 'trestbps', 'chol', 'thalach', 'oldpeak']
standardScaler = StandardScaler()
# Fit the scaler on these columns and write the scaled values back over them
scaled_values = standardScaler.fit_transform(dataset[columns_to_scale])
dataset[columns_to_scale] = scaled_values

In [None]:
# Preview the first rows of the encoded and scaled dataset
dataset.head()

In [None]:
# Target column 'y' and feature matrix 'x' (every column except 'target')
# NOTE(review): both are taken from the raw 'df', not from the dummy-encoded 'dataset'
# built in the preprocessing cells - confirm this is intended.
y = df['target']
x = df.drop(columns='target')

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.20, random_state=42
)

# Learn the scaling statistics from the training rows only, then apply them to both splits
scaler = StandardScaler()
scaler.fit(x_train)
x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)

In [None]:
# Show which distinct labels occur in the test split
print(y_test.unique())
# Count how many training rows carry each label
Counter(y_train)

# Naive Bayes

In [None]:
m1 = 'Naive Bayes'
# Gaussian Naive Bayes classifier, fitted on the standardized training split
nb = GaussianNB()
nb.fit(x_train, y_train)
# fit() hands back the estimator itself, so 'model' refers to the fitted classifier
model = nb
# Predicted labels for the standardized test split
nb_predict = nb.predict(x_test)
# Evaluation: overall accuracy and the confusion matrix against the true test labels
nb_acc_score = accuracy_score(y_test, nb_predict)
nb_conf_matrix = confusion_matrix(y_test, nb_predict)
print("Confussion Mtarix")
print(nb_conf_matrix)
print("\n")
print("Accuracy of Naive Bayes:", nb_acc_score * 100, '\n')
# Per-class precision, recall and F1
print(classification_report(y_test, nb_predict))

# KNN Classifier

In [None]:
# Search for a suitable number of neighbours K (1..20) with 10-fold cross-validation.
# The folds are drawn from the standardized training split (x_train, y_train):
# KNN is distance-based, so scoring K on the raw, unscaled 'x' would rank K on different
# data than the final model is trained on, and cross-validating on the full 'x'/'y'
# would let the held-out test rows influence the choice of K.
knn_score = []
for k in range(1, 21):
    knn = KNeighborsClassifier(n_neighbors=k)
    score = cross_val_score(knn, x_train, y_train, cv=10)
    # Keep the mean of the 10 fold scores for this K
    knn_score.append(score.mean())

In [None]:
# Line plot of the mean cross-validation score obtained for each K from 1 to 20
k_values = list(range(1, 21))
fig = plt.figure(figsize=(18, 10))
plt.plot(k_values, knn_score, color='red')
# Annotate every point with its (K, score) pair
for k, mean_score in zip(k_values, knn_score):
    plt.text(k, mean_score, (k, mean_score))
plt.xticks(k_values)
plt.xlabel('Number of Neighbors (K)')
plt.ylabel('score')
plt.title('k Neighbors Classifier Scores for Different K-Values')

In [None]:
# K-Nearest Neighbours classifier with K fixed at 19
m2 = 'K-NeighborsClassifier'
knn = KNeighborsClassifier(n_neighbors=19)
# Fit on the standardized training split
knn.fit(x_train, y_train)
# fit() hands back the estimator itself, so 'model' refers to the fitted classifier
model = knn
# Predicted labels for the standardized test split
knn_predict = knn.predict(x_test)
# Evaluation: overall accuracy and the confusion matrix against the true test labels
knn_acc_score = accuracy_score(y_test, knn_predict)
knn_conf_matrix = confusion_matrix(y_test, knn_predict)
print("Confussion Mtarix")
print(knn_conf_matrix)
print("\n")
print("Accuracy of K-Neighbors Classifier:", knn_acc_score * 100, '\n')
# Per-class precision, recall and F1
print(classification_report(y_test, knn_predict))

# Decision Tree Classifier

In [None]:
# Decision Tree classifier: entropy split criterion, depth capped at 6, fixed seed
m3 = 'DecisionTreeClassifier'
dt = DecisionTreeClassifier(criterion='entropy',random_state=2,max_depth=6)
# Training the Decision Tree Classifier model on the standardized training data
dt.fit(x_train,y_train)
# Predicting the target variable on the standardized test data.
# predict() takes one 2-D array of samples; passing 14 separate scalars raises a TypeError
# and could never yield one prediction per row of y_test for the metrics below.
dt_predict = dt.predict(x_test)
# Calculating the confusion matrix and accuracy for Decision Tree Classifier predictions
dt_conf_matrix = confusion_matrix(y_test,dt_predict)
dt_acc_score = accuracy_score(y_test,dt_predict)
print ("Confussion Mtarix")
print (dt_conf_matrix)
print ("\n")
print("Accuracy of Decision Tree Classifier:",dt_acc_score*100,'\n')
# Per-class precision, recall and F1
print (classification_report(y_test,dt_predict))

In [None]:
# Installing the XGBoost library using pip
!pip install xgboost


In [None]:
# Importing the XGBoost library
import xgboost as xgb


# Feature Importance Visualization

The plot visualizes the importance of attributes in the dataset, showcasing their respective weights derived from a machine learning model.

In [None]:
# Rank the attributes by how often an XGBoost model splits on them.
# The model is fitted on the full dataset under its own names (X_all, y_all) so that the
# y_train produced by train_test_split is not overwritten; rebinding y_train to every row
# would leave it out of step with x_train for any cell that is run afterwards.
X_all = df.drop('target', axis=1)
y_all = df['target']

# Initializing and fitting an XGBoost classifier with its default settings
xgb_model = xgb.XGBClassifier()
xgb_model.fit(X_all, y_all)

# importance_type='weight' gives, per feature, the number of times it is used in a split
feature_importances = xgb_model.get_booster().get_score(importance_type='weight')

# Two-column table (feature, importance), sorted ascending for the horizontal bar chart
imp_feature = pd.DataFrame(list(feature_importances.items()), columns=['feature', 'importance'])
imp_feature = imp_feature.sort_values(by='importance', ascending=True)

# Plot feature importances
fig = plt.figure(figsize=(10, 4))
colors = ["red", "green", "blue", "yellow", "magenta", "cyan"]
plt.title("Important Attributes")
plt.xlabel("Importance")
plt.ylabel("Attributes")
plt.barh(imp_feature['feature'], imp_feature['importance'], color=colors)
plt.show()

# Receiver Operating Characteristic Curves (ROC)

ROC Curves, short for Receiver Operating Characteristic Curves, provide a graphical representation of the performance of a classification model by illustrating the trade-off between sensitivity and specificity at various thresholds.

In [None]:
def _roc_summary(positive_probs):
    """Return (fpr, tpr, thresholds, area under the curve) for the given scores against y_test."""
    fpr, tpr, thresholds = roc_curve(y_test, positive_probs)
    return fpr, tpr, thresholds, auc(fpr, tpr)


# Second column of predict_proba for each fitted model, evaluated on the test split
nb_probs = nb.predict_proba(x_test)[:, 1]
knn_probs = knn.predict_proba(x_test)[:, 1]
dt_probs = dt.predict_proba(x_test)[:, 1]

# ROC points and AUC per model
fpr_nb, tpr_nb, thresholds_nb, roc_auc_nb = _roc_summary(nb_probs)
fpr_knn, tpr_knn, thresholds_knn, roc_auc_knn = _roc_summary(knn_probs)
fpr_dt, tpr_dt, thresholds_dt, roc_auc_dt = _roc_summary(dt_probs)

# Draw the three curves on one figure, each labelled with its AUC
plt.figure(figsize=(10, 6))
curves = (
    (fpr_nb, tpr_nb, 'orange', 'Naive Bayes', roc_auc_nb),
    (fpr_knn, tpr_knn, 'green', 'K-Neighbors', roc_auc_knn),
    (fpr_dt, tpr_dt, 'blue', 'Decision Tree', roc_auc_dt),
)
for fpr, tpr, colour, name, area in curves:
    plt.plot(fpr, tpr, color=colour, lw=2, label=f'{name} (AUC = {area:.2f})')

# Dashed diagonal as the chance-level reference line
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc='lower right')
plt.show()