In [549]:
# Import Liberaries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

# To preprocess the data
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
from sklearn.impute import SimpleImputer, KNNImputer
# import iterative imputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

# machine learning
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
#for classification tasks
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, RandomForestRegressor
from xgboost import XGBClassifier
#metrics
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, mean_absolute_error, mean_squared_error, r2_score

# ignore warnings   
import warnings
warnings.filterwarnings('ignore')


In [550]:
# Load Data
df=pd.read_csv('heart_disease_uci.csv')
df.head()

Unnamed: 0,id,age,sex,dataset,cp,trestbps,chol,fbs,restecg,thalch,exang,oldpeak,slope,ca,thal,num
0,1,63,Male,Cleveland,typical angina,145.0,233.0,True,lv hypertrophy,150.0,False,2.3,downsloping,0.0,fixed defect,0
1,2,67,Male,Cleveland,asymptomatic,160.0,286.0,False,lv hypertrophy,108.0,True,1.5,flat,3.0,normal,2
2,3,67,Male,Cleveland,asymptomatic,120.0,229.0,False,lv hypertrophy,129.0,True,2.6,flat,2.0,reversable defect,1
3,4,37,Male,Cleveland,non-anginal,130.0,250.0,False,normal,187.0,False,3.5,downsloping,0.0,normal,0
4,5,41,Female,Cleveland,atypical angina,130.0,204.0,False,lv hypertrophy,172.0,False,1.4,upsloping,0.0,normal,0


In [551]:
df.shape

(920, 16)

In [552]:
(df.isnull().sum()/len(df)*100).sort_values(ascending=False)

ca          66.413043
thal        52.826087
slope       33.586957
fbs          9.782609
oldpeak      6.739130
trestbps     6.413043
thalch       5.978261
exang        5.978261
chol         3.260870
restecg      0.217391
id           0.000000
age          0.000000
sex          0.000000
dataset      0.000000
cp           0.000000
num          0.000000
dtype: float64

In [537]:
df.isnull().sum()[df.isnull().sum() > 0].sort_values(ascending=False)
missing_data_cols = df.isnull().sum()[df.isnull().sum() > 0].index.tolist()
missing_data_cols

['trestbps',
 'chol',
 'fbs',
 'restecg',
 'thalch',
 'exang',
 'oldpeak',
 'slope',
 'ca',
 'thal']

In [535]:
categorical_cols = ['thal', 'ca', 'slope', 'exang', 'restecg','fbs', 'cp', 'sex', 'num']
bool_cols = ['fbs', 'exang']
numeric_cols = ['oldpeak', 'thalch', 'chol', 'trestbps', 'age']

In [530]:
# define the function to impute the missing values in thal column

def impute_categorical_missing_data(passed_col):
    
    df_null = df[df[passed_col].isnull()]
    df_not_null = df[df[passed_col].notnull()]

    X = df_not_null.drop(passed_col, axis=1)
    y = df_not_null[passed_col]
    
    other_missing_cols = [col for col in missing_data_cols if col != passed_col]
    
    label_encoder = LabelEncoder()

    for col in X.columns:
        if X[col].dtype == 'object' or X[col].dtype == 'category':
            X[col] = label_encoder.fit_transform(X[col])

    if passed_col in bool_cols:
        y = label_encoder.fit_transform(y)
        
    iterative_imputer = IterativeImputer(estimator=RandomForestRegressor(random_state=42), add_indicator=True)

    for col in other_missing_cols:
        if X[col].isnull().sum() > 0:
            col_with_missing_values = X[col].values.reshape(-1, 1)
            imputed_values = iterative_imputer.fit_transform(col_with_missing_values)
            X[col] = imputed_values[:, 0]
        else:
            pass
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    rf_classifier = RandomForestClassifier()

    rf_classifier.fit(X_train, y_train)

    y_pred = rf_classifier.predict(X_test)
    acc_score = accuracy_score(y_test, y_pred)

    print("The feature '"+ passed_col+ "' has been imputed with", round((acc_score * 100), 2), "accuracy\n")

    X = df_null.drop(passed_col, axis=1)

    for col in X.columns:
        if X[col].dtype == 'object' or X[col].dtype == 'category':
            X[col] = label_encoder.fit_transform(X[col])

    for col in other_missing_cols:
        if X[col].isnull().sum() > 0:
            col_with_missing_values = X[col].values.reshape(-1, 1)
            imputed_values = iterative_imputer.fit_transform(col_with_missing_values)
            X[col] = imputed_values[:, 0]
        else:
            pass
                
    if len(df_null) > 0: 
        df_null[passed_col] = rf_classifier.predict(X)
        if passed_col in bool_cols:
            df_null[passed_col] = df_null[passed_col].map({0: False, 1: True})
        else:
            pass
    else:
        pass

    df_combined = pd.concat([df_not_null, df_null])
    
    return df_combined[passed_col]

def impute_continuous_missing_data(passed_col):
    
    df_null = df[df[passed_col].isnull()]
    df_not_null = df[df[passed_col].notnull()]
    X = df_not_null.drop(passed_col, axis=1)
    y = df_not_null[passed_col]
    
    other_missing_cols = [col for col in missing_data_cols if col != passed_col]
    
    label_encoder = LabelEncoder()

    for col in X.columns:
        if X[col].dtype == 'object' or X[col].dtype == 'category':
            X[col] = label_encoder.fit_transform(X[col])
    
    iterative_imputer = IterativeImputer(estimator=RandomForestRegressor(random_state=42), add_indicator=True)

    for col in other_missing_cols:
        if X[col].isnull().sum() > 0:
            col_with_missing_values = X[col].values.reshape(-1, 1)
            imputed_values = iterative_imputer.fit_transform(col_with_missing_values)
            X[col] = imputed_values[:, 0]
        else:
            pass
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    rf_regressor = RandomForestRegressor()

    rf_regressor.fit(X_train, y_train)

    y_pred = rf_regressor.predict(X_test)

    print("MAE =", mean_absolute_error(y_test, y_pred), "\n")
    print("RMSE =", mean_squared_error(y_test, y_pred, squared=False), "\n")
    print("R2 =", r2_score(y_test, y_pred), "\n")

    X = df_null.drop(passed_col, axis=1)

    for col in X.columns:
        if X[col].dtype == 'object' or X[col].dtype == 'category':
            X[col] = label_encoder.fit_transform(X[col])
    for col in other_missing_cols:
        if X[col].isnull().sum() > 0:
            col_with_missing_values = X[col].values.reshape(-1, 1)
            imputed_values = iterative_imputer.fit_transform(col_with_missing_values)
            X[col] = imputed_values[:, 0]
        else:
            pass
                
    if len(df_null) > 0: 
        df_null[passed_col] = rf_regressor.predict(X)
    else:
        pass

    df_combined = pd.concat([df_not_null, df_null])
    
    return df_combined[passed_col]


In [538]:
# remove warning
import warnings
warnings.filterwarnings('ignore')

# impute missing values using our functions
for col in missing_data_cols:
    print("Missing Values", col, ":", str(round((df[col].isnull().sum() / len(df)) * 100, 2))+"%")
    if col in categorical_cols:
        df[col] = impute_categorical_missing_data(col)
    elif col in numeric_cols:
        df[col] = impute_continuous_missing_data(col)
    else:
        pass

Missing Values trestbps : 6.41%
MAE = 13.248265895953756 

RMSE = 17.226749346977325 

R2 = 0.07341670427558655 

Missing Values chol : 3.26%
MAE = 45.25028089887641 

RMSE = 64.31429764415276 

R2 = 0.6723322432525373 

Missing Values fbs : 9.78%
The feature 'fbs' has been imputed with 78.92 accuracy

Missing Values restecg : 0.22%
The feature 'restecg' has been imputed with 64.13 accuracy

Missing Values thalch : 5.98%
MAE = 16.57843930635838 

RMSE = 21.62706136387915 

R2 = 0.31988189113045573 

Missing Values exang : 5.98%
The feature 'exang' has been imputed with 77.46 accuracy

Missing Values oldpeak : 6.74%
MAE = 0.5564941860465116 

RMSE = 0.7790736784872326 

R2 = 0.4217508419082805 

Missing Values slope : 33.59%
The feature 'slope' has been imputed with 65.04 accuracy

Missing Values ca : 66.41%
The feature 'ca' has been imputed with 64.52 accuracy

Missing Values thal : 52.83%
The feature 'thal' has been imputed with 71.26 accuracy



In [539]:
df.isnull().sum()

id          0
age         0
sex         0
dataset     0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalch      0
exang       0
oldpeak     0
slope       0
ca          0
thal        0
num         0
dtype: int64

In [553]:
df['restecg']=df['restecg'].fillna(df['restecg'].mode()[0])
df['chol']=df['chol'].fillna(df['chol'].median())
df['exang']=df['exang'].fillna(df['exang'].median())
df['thalch']=df['thalch'].fillna(df['thalch'].median())
df['trestbps']=df['trestbps'].fillna(df['trestbps'].median())
df['oldpeak']=df['oldpeak'].fillna(df['oldpeak'].median())
df['fbs']=df['fbs'].fillna(df['fbs'].median())
df['ca'].isnull().sum()
df['ca']=df['ca'].fillna(df['ca'].median())
df['ca']=df['ca'].fillna(df['ca'].median())
df['slope']=df['slope'].fillna(df['slope'].mode()[0])
df['thal']=df['thal'].fillna(df['thal'].mode()[0])

In [554]:
df.isnull().sum()

id          0
age         0
sex         0
dataset     0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalch      0
exang       0
oldpeak     0
slope       0
ca          0
thal        0
num         0
dtype: int64

In [555]:
x=df.drop('num',axis=1)
y=df['num']

In [556]:
# print the row from df where trestbps value is 0
df[df['trestbps'] == 0]
# remove this row from data
df = df[df['trestbps'] != 0]

In [557]:
le=LabelEncoder()
for col in x.columns:
    if x[col].dtype=='object' or x[col].dtype=='category':
        x[col]=le.fit_transform(x[col])

In [558]:
x.head()

Unnamed: 0,id,age,sex,dataset,cp,trestbps,chol,fbs,restecg,thalch,exang,oldpeak,slope,ca,thal
0,1,63,1,0,3,145.0,233.0,1,0,150.0,0,2.3,0,0.0,0
1,2,67,1,0,0,160.0,286.0,0,0,108.0,1,1.5,1,3.0,1
2,3,67,1,0,0,120.0,229.0,0,0,129.0,1,2.6,1,2.0,2
3,4,37,1,0,2,130.0,250.0,0,1,187.0,0,3.5,0,0.0,1
4,5,41,0,0,1,130.0,204.0,0,0,172.0,0,1.4,2,0.0,1


In [563]:
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2)

In [442]:
# import all models
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.naive_bayes import GaussianNB
# from lightgbm import LGBMClassifier

# impot pipeline
from sklearn.pipeline import Pipeline

# import metrics
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [564]:
# Create a list of models to evaluate
models = [
    ('Random Forest', RandomForestClassifier(random_state=42)),
    ('Gradient Boosting', GradientBoostingClassifier(random_state=42)),
    ('Support Vector Machine', SVC(random_state=42)),
    ('Logistic Regression', LogisticRegression(random_state=42)),
    ('K-Nearest Neighbors', KNeighborsClassifier()),
    ('Decision Tree', DecisionTreeClassifier(random_state=42)),
    ('Ada Boost', AdaBoostClassifier(random_state=42)),
    ('XG Boost', XGBClassifier(random_state=42)),
    ('Naive Bayes', GaussianNB())
]

best_model = None
best_accuracy = 0.0

# Iterate over the models and evaluate their performance
for name, model in models:
    # Create a pipeline for each model
    pipeline = Pipeline([
        # ('imputer', SimpleImputer(strategy='most_frequent')),
        # ('encoder', OneHotEncoder(handle_unknown='ignore')),
        ('model', model)
    ])
    
    # Perform cross-validation
    scores = cross_val_score(pipeline, x_train, y_train, cv=5)
    
    # Calculate mean accuracy
    mean_accuracy = scores.mean()
    
    # Fit the pipeline on the training data
    pipeline.fit(x_train, y_train)
    
    # Make predictions on the test data
    y_pred = pipeline.predict(x_test)
    
    # Calculate accuracy score
    accuracy = accuracy_score(y_test, y_pred)
    # Print the performance metrics
    print("Model:", name)
    print("Cross-validation Accuracy:", mean_accuracy)
    print("Test Accuracy:", accuracy)
    print()
    
    # Check if the current model has the best accuracy
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_model = pipeline

# Retrieve the best model
print("Best Model:", best_model)

# save the best model
#import pickle
#pickle.dump(best_model, open('heart_disease_model.pkl', 'wb'))

Model: Random Forest
Cross-validation Accuracy: 0.6290770362198934
Test Accuracy: 0.657608695652174

Model: Gradient Boosting
Cross-validation Accuracy: 0.624995403566832
Test Accuracy: 0.6413043478260869

Model: Support Vector Machine
Cross-validation Accuracy: 0.592388306674021
Test Accuracy: 0.6141304347826086

Model: Logistic Regression
Cross-validation Accuracy: 0.520380584666299
Test Accuracy: 0.5543478260869565

Model: K-Nearest Neighbors
Cross-validation Accuracy: 0.5556995771281485
Test Accuracy: 0.6304347826086957

Model: Decision Tree
Cross-validation Accuracy: 0.5665839308696452
Test Accuracy: 0.5815217391304348

Model: Ada Boost
Cross-validation Accuracy: 0.5964515535944107
Test Accuracy: 0.5869565217391305

Model: XG Boost
Cross-validation Accuracy: 0.6222467365324509
Test Accuracy: 0.6358695652173914

Model: Naive Bayes
Cross-validation Accuracy: 0.569277440706012
Test Accuracy: 0.6195652173913043

Best Model: Pipeline(steps=[('model', RandomForestClassifier(random_state

In [546]:
model = {
    'Random Forest': (RandomForestClassifier(random_state=42),{'n_estimators': [10, 100, 1000], 'max_depth': [None, 5, 10]}),
    'Gradient Boosting': (GradientBoostingClassifier(random_state=42),{'n_estimators': [10, 100]}),
    'Support Vector Machine': (SVC(random_state=42),{'kernel': ['rbf', 'poly', 'sigmoid']}),
    'Logistic Regression': (LogisticRegression(random_state=42),{}),
    'K-Nearest Neighbors': (KNeighborsClassifier(),{'n_neighbors': np.arange(3, 100, 2)}),
    'Decision Tree': (DecisionTreeClassifier(random_state=42),{'max_depth': [None, 5, 10], 'splitter': ['best', 'random']}),
    'Ada Boost': (AdaBoostClassifier(random_state=42),{}),
    'XG Boost': (XGBClassifier(random_state=42),{'n_estimators': [10, 100, 1000], 'learning_rate': [0.1, 0.01, 0.001]}),
    'Naive Bayes': (GaussianNB(),{})
}

best_model = None
best_accuracy = 0.0

# Iterate over the models and evaluate their performance
for name, (model,params) in model.items():
    # Create a pipeline for each model
    pipeline = Pipeline([
        # ('imputer', SimpleImputer(strategy='most_frequent')),
        # ('encoder', OneHotEncoder(handle_unknown='ignore')),
        ('model', model)
    ])
    
    # Perform cross-validation
    scores = cross_val_score(pipeline, x_train, y_train, cv=5)
    
    # Calculate mean accuracy
    mean_accuracy = scores.mean()
    
    # Fit the pipeline on the training data
    pipeline.fit(x_train, y_train)
    
    # Make predictions on the test data
    y_pred = pipeline.predict(x_test)

    # Calculate accuracy score
    accuracy = accuracy_score(y_test, y_pred)
    # Print the performance metrics
    print("Model:", name)
    print("Cross-validation Accuracy:", mean_accuracy)
    print("Test Accuracy:", accuracy)
    print()
    
    # Check if the current model has the best accuracy
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_model = pipeline

# Retrieve the best model
print("Best Model:", best_model)

Model: Random Forest
Cross-validation Accuracy: 0.6820187534473249
Test Accuracy: 0.6413043478260869

Model: Gradient Boosting
Cross-validation Accuracy: 0.6792792792792792
Test Accuracy: 0.6304347826086957

Model: Support Vector Machine
Cross-validation Accuracy: 0.5977937120794264
Test Accuracy: 0.5597826086956522

Model: Logistic Regression
Cross-validation Accuracy: 0.5162805662805663
Test Accuracy: 0.4891304347826087

Model: K-Nearest Neighbors
Cross-validation Accuracy: 0.5991634491634492
Test Accuracy: 0.5434782608695652

Model: Decision Tree
Cross-validation Accuracy: 0.620876999448428
Test Accuracy: 0.6032608695652174

Model: Ada Boost
Cross-validation Accuracy: 0.5990990990990991
Test Accuracy: 0.6032608695652174

Model: XG Boost
Cross-validation Accuracy: 0.656186799043942
Test Accuracy: 0.6684782608695652

Model: Naive Bayes
Cross-validation Accuracy: 0.5638168781025924
Test Accuracy: 0.5815217391304348

Best Model: Pipeline(steps=[('model',
                 XGBClassifier(b