## Heart Disease Prediction Model.

In [2]:
## Heart Disease Prediction Model.
# Importing necessary libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, roc_auc_score, roc_curve
from imblearn.over_sampling import SMOTE

# Load the dataset
data = pd.read_csv('heart.csv') # Ensure the dataset is in the same directory or specify the correct path

# Display the first few rows of the dataframe
print("Head of the dataset:")
print(data.head())

# Data preprocessing

# Check for missing values
print("\nMissing values:\n", data.isnull().sum())

# Descriptive statistics
print("\nDescriptive statistics:\n", data.describe())

# Check the balance of the target variable
print("\nTarget variable distribution:\n", data['output'].value_counts())

# Feature scaling

numerical_features = ['age', 'trtbps', 'chol', 'thalachh', 'oldpeak']

scaler = StandardScaler()
data[numerical_features] = scaler.fit_transform(data[numerical_features])

print("\nHead of the scaled dataset:")
print(data.head())

# Split data into training and testing sets

X = data.drop('output', axis=1)
y = data['output']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Address class imbalance using SMOTE

smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

print("\nShape of X_train before SMOTE:", X_train.shape)
print("Shape of X_train after SMOTE:", X_train_resampled.shape)
print("Shape of y_train before SMOTE:", y_train.shape)
print("Shape of y_train after SMOTE:", y_train_resampled.shape)

# Model training and evaluation

# Define a function to train and evaluate models
def evaluate_model(model, X_test, y_test):
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    conf_matrix = confusion_matrix(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, model.predict_proba(X_test)[:, 1])

    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}")
    print(f"Confusion Matrix:\n{conf_matrix}")
    print(f"ROC AUC Score: {roc_auc:.4f}")

    return accuracy, precision, recall, f1, conf_matrix, roc_auc

# Gaussian Naive Bayes

gnb = GaussianNB()
gnb.fit(X_train_resampled, y_train_resampled)
print("\nGaussian Naive Bayes Model:")
gnb_accuracy, gnb_precision, gnb_recall, gnb_f1, gnb_conf_matrix, gnb_roc_auc = evaluate_model(gnb, X_test, y_test)

# Support Vector Machine (SVM)

svm = SVC(probability=True, random_state=42)
svm.fit(X_train_resampled, y_train_resampled)
print("\nSupport Vector Machine Model:")
svm_accuracy, svm_precision, svm_recall, svm_f1, svm_conf_matrix, svm_roc_auc = evaluate_model(svm, X_test, y_test)

# Random Forest

rf = RandomForestClassifier(random_state=42)
rf.fit(X_train_resampled, y_train_resampled)
print("\nRandom Forest Model:")
rf_accuracy, rf_precision, rf_recall, rf_f1, rf_conf_matrix, rf_roc_auc = evaluate_model(rf, X_test, y_test)

# Gradient Boosting

gb = GradientBoostingClassifier(random_state=42)
gb.fit(X_train_resampled, y_train_resampled)
print("\nGradient Boosting Model:")
gb_accuracy, gb_precision, gb_recall, gb_f1, gb_conf_matrix, gb_roc_auc = evaluate_model(gb, X_test, y_test)

# Logistic Regression

lr = LogisticRegression(random_state=42)
lr.fit(X_train_resampled, y_train_resampled)
print("\nLogistic Regression Model:")
lr_accuracy, lr_precision, lr_recall, lr_f1, lr_conf_matrix, lr_roc_auc = evaluate_model(lr, X_test, y_test)

# K-Nearest Neighbors (KNN)

knn = KNeighborsClassifier()
knn.fit(X_train_resampled, y_train_resampled)
print("\nK-Nearest Neighbors Model:")
knn_accuracy, knn_precision, knn_recall, knn_f1, knn_conf_matrix, knn_roc_auc = evaluate_model(knn, X_test, y_test)

# Neural Network (MLPClassifier)

mlp = MLPClassifier(random_state=42, max_iter=300)
mlp.fit(X_train_resampled, y_train_resampled)
print("\nNeural Network (MLPClassifier) Model:")
mlp_accuracy, mlp_precision, mlp_recall, mlp_f1, mlp_conf_matrix, mlp_roc_auc = evaluate_model(mlp, X_test, y_test)

# Hyperparameter Tuning using GridSearchCV

# Random Forest
param_grid_rf = {
    'n_estimators': [100, 200, 300],
    'max_depth': [4, 6, 8],
    'min_samples_split': [2, 4, 8],
    'min_samples_leaf': [1, 2, 4]
}

grid_search_rf = GridSearchCV(estimator=RandomForestClassifier(random_state=42),
                           param_grid=param_grid_rf,
                           cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42),
                           scoring='accuracy',
                           n_jobs=-1)

grid_search_rf.fit(X_train_resampled, y_train_resampled)

print("\nBest parameters for Random Forest:", grid_search_rf.best_params_)
best_rf_model = grid_search_rf.best_estimator_

print("\nTuned Random Forest Model:")
best_rf_accuracy, best_rf_precision, best_rf_recall, best_rf_f1, best_rf_conf_matrix, best_rf_roc_auc = evaluate_model(best_rf_model, X_test, y_test)

# Gradient Boosting
param_grid_gb = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 4, 5],
    'min_samples_split': [2, 4],
    'min_samples_leaf': [1, 2]
}

grid_search_gb = GridSearchCV(estimator=GradientBoostingClassifier(random_state=42),
                           param_grid=param_grid_gb,
                           cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42),
                           scoring='accuracy',
                           n_jobs=-1)

grid_search_gb.fit(X_train_resampled, y_train_resampled)

print("\nBest parameters for Gradient Boosting:", grid_search_gb.best_params_)
best_gb_model = grid_search_gb.best_estimator_

print("\nTuned Gradient Boosting Model:")
best_gb_accuracy, best_gb_precision, best_gb_recall, best_gb_f1, best_gb_conf_matrix, best_gb_roc_auc = evaluate_model(best_gb_model, X_test, y_test)

# Logistic Regression
param_grid_lr = {
    'penalty': ['l1', 'l2'],
    'C': [0.001, 0.01, 0.1, 1, 10],
    'solver': ['liblinear']
}

grid_search_lr = GridSearchCV(estimator=LogisticRegression(random_state=42),
                           param_grid=param_grid_lr,
                           cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42),
                           scoring='accuracy',
                           n_jobs=-1)

grid_search_lr.fit(X_train_resampled, y_train_resampled)

print("\nBest parameters for Logistic Regression:", grid_search_lr.best_params_)
best_lr_model = grid_search_lr.best_estimator_

print("\nTuned Logistic Regression Model:")
best_lr_accuracy, best_lr_precision, best_lr_recall, best_lr_f1, best_lr_conf_matrix, best_lr_roc_auc = evaluate_model(best_lr_model, X_test, y_test)

# Ensemble Modeling - VotingClassifier
# Create an ensemble of the best models (tuned)
voting_clf = VotingClassifier(estimators=[('rf', best_rf_model), ('gb', best_gb_model), ('lr', best_lr_model)], voting='soft')
voting_clf.fit(X_train_resampled, y_train_resampled)

print("\nEnsemble Model (VotingClassifier):")
ensemble_accuracy, ensemble_precision, ensemble_recall, ensemble_f1, ensemble_conf_matrix, ensemble_roc_auc = evaluate_model(voting_clf, X_test, y_test)

Head of the dataset:
   age  sex  cp  trtbps  chol  fbs  restecg  thalachh  exng  oldpeak  slp  \
0   63    1   3     145   233    1        0       150     0      2.3    0   
1   37    1   2     130   250    0        1       187     0      3.5    0   
2   41    0   1     130   204    0        0       172     0      1.4    2   
3   56    1   1     120   236    0        1       178     0      0.8    2   
4   57    0   0     120   354    0        1       163     1      0.6    2   

   caa  thall  output  
0    0      1       1  
1    0      2       1  
2    0      2       1  
3    0      2       1  
4    0      2       1  

Missing values:
 age         0
sex         0
cp          0
trtbps      0
chol        0
fbs         0
restecg     0
thalachh    0
exng        0
oldpeak     0
slp         0
caa         0
thall       0
output      0
dtype: int64

Descriptive statistics:
               age         sex          cp      trtbps        chol         fbs  \
count  303.000000  303.000000  303.000




Neural Network (MLPClassifier) Model:
Accuracy: 0.8132
Precision: 0.8367
Recall: 0.8200
F1 Score: 0.8283
Confusion Matrix:
[[33  8]
 [ 9 41]]
ROC AUC Score: 0.8761

Best parameters for Random Forest: {'max_depth': 6, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 100}

Tuned Random Forest Model:
Accuracy: 0.8132
Precision: 0.8235
Recall: 0.8400
F1 Score: 0.8317
Confusion Matrix:
[[32  9]
 [ 8 42]]
ROC AUC Score: 0.9146

Best parameters for Gradient Boosting: {'learning_rate': 0.2, 'max_depth': 3, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 100}

Tuned Gradient Boosting Model:
Accuracy: 0.7912
Precision: 0.8444
Recall: 0.7600
F1 Score: 0.8000
Confusion Matrix:
[[34  7]
 [12 38]]
ROC AUC Score: 0.8815

Best parameters for Logistic Regression: {'C': 10, 'penalty': 'l1', 'solver': 'liblinear'}

Tuned Logistic Regression Model:
Accuracy: 0.8022
Precision: 0.8333
Recall: 0.8000
F1 Score: 0.8163
Confusion Matrix:
[[33  8]
 [10 40]]
ROC AUC Score: 0.8859

En