In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split, cross_val_predict, cross_val_score, KFold
from sklearn.svm import LinearSVC,SVC
from sklearn.decomposition import PCA
from sklearn.ensemble import AdaBoostClassifier
# IMPORT XGBOOST
from xgboost import XGBClassifier
import warnings
import pickle
from sklearn.preprocessing import OneHotEncoder


In [2]:
def model_classifier(X_train,X_test,y_train,y_test,model):
    mod = model
    mod.fit(X_train, y_train)
    y_pred = mod.predict(X_test)
    
    accuracy = accuracy_score(y_test, y_pred)
    conf_matrix = confusion_matrix(y_test, y_pred)
    report = classification_report(y_test, y_pred)
    
    
    print("Accuracy:", accuracy)
    print("Confusion Matrix:\n", conf_matrix)
    print("Classification Report:\n", report)
    return mod

In [3]:
def model_classifier_kfold(X, y, model, n_splits=3):
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

    y_pred = cross_val_predict(model, X, y, cv=kf)
    
    accuracy = accuracy_score(y, y_pred)
    conf_matrix = confusion_matrix(y, y_pred)
    report = classification_report(y, y_pred)
    
    print("Accuracy (K-Fold):", accuracy)
    print("Confusion Matrix (K-Fold):\n", conf_matrix)
    print("Classification Report (K-Fold):\n", report)
    
    cv_scores = cross_val_score(model, X, y, cv=kf, scoring='accuracy')
    print("Cross-validation scores:", cv_scores)
    print("Mean cross-validation accuracy:", cv_scores.mean())

### Reading the dataset and performing the train test split ###

In [4]:
data = pd.read_csv('dataset\heart_2022_cleaned.csv')
print(data.shape)
data.columns

(354862, 40)


Index(['State', 'Sex', 'GeneralHealth', 'PhysicalHealthDays',
       'MentalHealthDays', 'LastCheckupTime', 'PhysicalActivities',
       'SleepHours', 'RemovedTeeth', 'HadHeartAttack', 'HadAngina',
       'HadStroke', 'HadAsthma', 'HadSkinCancer', 'HadCOPD',
       'HadDepressiveDisorder', 'HadKidneyDisease', 'HadArthritis',
       'HadDiabetes', 'DeafOrHardOfHearing', 'BlindOrVisionDifficulty',
       'DifficultyConcentrating', 'DifficultyWalking',
       'DifficultyDressingBathing', 'DifficultyErrands', 'SmokerStatus',
       'ECigaretteUsage', 'ChestScan', 'RaceEthnicityCategory', 'AgeCategory',
       'HeightInMeters', 'WeightInKilograms', 'BMI', 'AlcoholDrinkers',
       'HIVTesting', 'FluVaxLast12', 'PneumoVaxEver', 'TetanusLast10Tdap',
       'HighRiskLastYear', 'CovidPos'],
      dtype='object')

In [5]:
categorical_features = data.select_dtypes(include=['object', 'category']).columns.tolist()
categorical_features

['State',
 'Sex',
 'GeneralHealth',
 'LastCheckupTime',
 'PhysicalActivities',
 'RemovedTeeth',
 'HadHeartAttack',
 'HadAngina',
 'HadStroke',
 'HadAsthma',
 'HadSkinCancer',
 'HadCOPD',
 'HadDepressiveDisorder',
 'HadKidneyDisease',
 'HadArthritis',
 'HadDiabetes',
 'DeafOrHardOfHearing',
 'BlindOrVisionDifficulty',
 'DifficultyConcentrating',
 'DifficultyWalking',
 'DifficultyDressingBathing',
 'DifficultyErrands',
 'SmokerStatus',
 'ECigaretteUsage',
 'ChestScan',
 'RaceEthnicityCategory',
 'AgeCategory',
 'AlcoholDrinkers',
 'HIVTesting',
 'FluVaxLast12',
 'PneumoVaxEver',
 'TetanusLast10Tdap',
 'HighRiskLastYear',
 'CovidPos']

In [6]:
numerical_columns = data.select_dtypes(include=['number']).columns
numerical_columns

Index(['PhysicalHealthDays', 'MentalHealthDays', 'SleepHours',
       'HeightInMeters', 'WeightInKilograms', 'BMI'],
      dtype='object')

In [7]:
# Perform train test split
# y = data['GeneralHealth']
# X = data.drop('GeneralHealth', axis=1)
standard_scaler = StandardScaler()  
label_encoder = LabelEncoder()


for i in categorical_features:
    data[i] = label_encoder.fit_transform(data[i])

X_train,X_test = train_test_split(data,test_size=0.2, random_state=42)
X_train.shape, X_test.shape


X_train[numerical_columns] = standard_scaler.fit_transform(X_train[numerical_columns])
X_test[numerical_columns] = standard_scaler.transform(X_test[numerical_columns])

Y_test = X_test.pop('GeneralHealth')
Y_train = X_train.pop('GeneralHealth')

X_train.shape, X_test.shape, Y_train.shape, Y_test.shape

((283889, 39), (70973, 39), (283889,), (70973,))

In [8]:
X = data[numerical_columns]
Y = data.pop('GeneralHealth')

### Naive Babes ###

Default

In [9]:
model_naive  = model_classifier(X_train,X_test,Y_train,Y_test,GaussianNB())
pickle.dump(model_naive, open('naive_bayes.pkl', 'wb'))

Accuracy: 0.32570132303833854
Confusion Matrix:
 [[ 9109   344   972    56   910]
 [ 1142  3272  2232  1996   930]
 [ 8979  3722  5249   828  3777]
 [   62   854   262  1934    56]
 [15541  1544  3429   221  3552]]
Classification Report:
               precision    recall  f1-score   support

           0       0.26      0.80      0.39     11391
           1       0.34      0.34      0.34      9572
           2       0.43      0.23      0.30     22555
           3       0.38      0.61      0.47      3168
           4       0.39      0.15      0.21     24287

    accuracy                           0.33     70973
   macro avg       0.36      0.43      0.34     70973
weighted avg       0.37      0.33      0.30     70973



K-Fold

In [11]:
model_naive_kf = model_classifier_kfold(X,Y,GaussianNB()) 
pickle.dump(model_naive_kf, open('naive_bayes_kf.pkl', 'wb'))

Accuracy (K-Fold): 0.3808184590065998
Confusion Matrix (K-Fold):
 [[21743  2395  3689   261 28904]
 [ 3622 14500 10722  5357 13737]
 [17243 13997 23255  2106 56784]
 [  371  6403  1304  6144  1206]
 [29345  7094 14445   739 69496]]
Classification Report (K-Fold):
               precision    recall  f1-score   support

           0       0.30      0.38      0.34     56992
           1       0.33      0.30      0.31     47938
           2       0.44      0.21      0.28    113385
           3       0.42      0.40      0.41     15428
           4       0.41      0.57      0.48    121119

    accuracy                           0.38    354862
   macro avg       0.38      0.37      0.36    354862
weighted avg       0.39      0.38      0.37    354862

Cross-validation scores: [0.38167861 0.38131832 0.37945844]
Mean cross-validation accuracy: 0.38081845658267627


### Logistic Regression ###

Default

In [9]:
model_logistic = model_classifier(X_train,X_test,Y_train,Y_test,LogisticRegression(verbose=True))
pickle.dump(model_logistic, open('models/model_logistic_default.pkl', 'wb'))

Accuracy: 0.45344004057881165
Confusion Matrix:
 [[  888    76  1531    35  8861]
 [   73  2628  4878   691  1302]
 [  526  1277 10634   233  9885]
 [    3  1582   540   959    84]
 [  764   335  6044    71 17073]]
Classification Report:
               precision    recall  f1-score   support

           0       0.39      0.08      0.13     11391
           1       0.45      0.27      0.34      9572
           2       0.45      0.47      0.46     22555
           3       0.48      0.30      0.37      3168
           4       0.46      0.70      0.56     24287

    accuracy                           0.45     70973
   macro avg       0.45      0.37      0.37     70973
weighted avg       0.44      0.45      0.42     70973



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


K-Fold

In [None]:
model_logistic_kf = model_classifier_kfold(X,Y,LogisticRegression(verbose=True))
pickle.dump(model_logistic_kf, open('models/model_logistic_kfold.pkl', 'wb'))

### Random Forest ###

Default

In [None]:
model_rf = model_classifier(X_train,X_test,Y_train,Y_test,RandomForestClassifier(verbose=True,n_jobs=6))
# pickle.dump(model_rf, open('models/model_rf_default.pkl', 'wb'))

[Parallel(n_jobs=6)]: Using backend ThreadingBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:   12.2s
[Parallel(n_jobs=6)]: Done 100 out of 100 | elapsed:   28.3s finished
[Parallel(n_jobs=6)]: Using backend ThreadingBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    0.5s


Accuracy: 0.45393318585941134
Confusion Matrix:
 [[ 2375   117  2199    13  6687]
 [   89  3248  4572   656  1007]
 [  946  1853 11546   160  8050]
 [    1  1526   498  1093    50]
 [ 2298   521  7465    48 13955]]
Classification Report:
               precision    recall  f1-score   support

           0       0.42      0.21      0.28     11391
           1       0.45      0.34      0.39      9572
           2       0.44      0.51      0.47     22555
           3       0.55      0.35      0.43      3168
           4       0.47      0.57      0.52     24287

    accuracy                           0.45     70973
   macro avg       0.47      0.40      0.42     70973
weighted avg       0.45      0.45      0.44     70973



[Parallel(n_jobs=6)]: Done 100 out of 100 | elapsed:    1.3s finished


K-fold

In [None]:
model_rf_kf = model_classifier_kfold(X,Y,RandomForestClassifier(verbose=True,n_jobs=6)) 
# pickle.dump(model_rf_kf, open('models/model_rf_kfold.pkl', 'wb'))

### Decision Tree ###

Default

In [None]:
model_dt = model_classifier(X_train,X_test,Y_train,Y_test,DecisionTreeClassifier())
# pickle.dump(model_dt, open('models/model_dt_default.pkl', 'wb'))

Accuracy: 0.3561917912445578
Confusion Matrix:
 [[3156  530 2817   58 4830]
 [ 621 2797 3284 1121 1749]
 [2963 3229 8425  568 7370]
 [  80 1131  651 1049  257]
 [5034 1791 7370  239 9853]]
Classification Report:
               precision    recall  f1-score   support

           0       0.27      0.28      0.27     11391
           1       0.30      0.29      0.29      9572
           2       0.37      0.37      0.37     22555
           3       0.35      0.33      0.34      3168
           4       0.41      0.41      0.41     24287

    accuracy                           0.36     70973
   macro avg       0.34      0.34      0.34     70973
weighted avg       0.36      0.36      0.36     70973



K Fold

In [None]:
model_dt_kf = model_classifier_kfold(X,Y,DecisionTreeClassifier())
pickle.dump(model_dt_kf, open('models/model_dt_kfold.pkl', 'wb'))

### SVM ###

Default

In [None]:
model_svm = model_classifier(X_train,X_test,Y_train,Y_test,SVC(verbose=True))
pickle(model_svm, open('models/model_svm_default.pkl', 'wb'))

K Fold

In [None]:
model_svm_kf = model_classifier_kfold(X,Y,SVC(verbose=True))
pickle(model_svm, open('models/model_svm_kfold.pkl', 'wb'))

### Ada Boost ###


Default

In [17]:
model_ada = model_classifier(X_train,X_test,Y_train,Y_test,AdaBoostClassifier())
# pickle(model_ada, open('models/model_ada_default.pkl', 'wb'))

Accuracy: 0.45713158525072917
Confusion Matrix:
 [[  499   109  1648    15  9120]
 [   41  3283  4384   649  1215]
 [  358  1913 10432   130  9722]
 [    2  1523   474  1097    72]
 [  420   452  6232    50 17133]]
Classification Report:
               precision    recall  f1-score   support

           0       0.38      0.04      0.08     11391
           1       0.45      0.34      0.39      9572
           2       0.45      0.46      0.46     22555
           3       0.57      0.35      0.43      3168
           4       0.46      0.71      0.56     24287

    accuracy                           0.46     70973
   macro avg       0.46      0.38      0.38     70973
weighted avg       0.45      0.46      0.42     70973



K Fold

In [None]:
model_ada_kf = model_classifier_kfold(X,Y,AdaBoostClassifier())
pickle(model_ada, open('models/model_ada_kfold.pkl', 'wb'))

### XG Boost ###

Default

In [None]:
model_xg = model_classifier(X_train,X_test,Y_train,Y_test,XGBClassifier())
# pickle(model_xg, open('models/model_xg_default.pkl', 'wb'))

K-Fold

In [None]:
model_xg_kf = model_classifier_kfold(X,Y,XGBClassifier())
pickle(model_xg, open('models/model_xg_kfold.pkl', 'wb'))