In [None]:
# Create subplot titles
subplot_titles = numeric_features.columns.to_list()

cols = 4
rows = len(subplot_titles) // cols + 1
total_plots = rows * cols

# Initialize the subplot figure
fig = make_subplots(
    rows=rows,
    cols=cols,
    subplot_titles=['' for _ in subplot_titles],
    horizontal_spacing=0.10,
    vertical_spacing=0.15
)

for idx, feature in enumerate(subplot_titles):
    # Calculate row and column for the current subplot
    row = idx // cols + 1
    col = idx % cols + 1

    # Create a count plot for the feature
    fig.add_trace(
        go.Box(
            x=df['HadHeartAttack'],
            y=df[feature],
            showlegend=False,
        ),
        row=row,
        col=col
    )

    # Update x-axis for the current subplot
    fig.update_xaxes(title_text='HadHeartAttack', row=row, col=col)

    # Update y-axis for the current subplot
    fig.update_yaxes(title_text=feature, row=row, col=col)

# Update the overall layout
fig.update_layout(
    height=rows*300,
    width=cols*300,
    title_text="Numerical Feature Distribution by Target Variable",
    showlegend=False
)

# Display the figure
fig.show()

# Preprocessing

In [63]:
# separate the features and the target variable
X = df.drop('HadHeartAttack', axis=1)
y = df['HadHeartAttack']

# encode categorical variables
X = pd.get_dummies(X, drop_first=True)

# display the first 5 rows of the encoded features
X.head()

Unnamed: 0,PhysicalHealthDays,MentalHealthDays,SleepHours,HeightInMeters,WeightInKilograms,BMI,State_Alaska,State_Arizona,State_Arkansas,State_California,...,AlcoholDrinkers_Yes,HIVTesting_Yes,FluVaxLast12_Yes,PneumoVaxEver_Yes,"TetanusLast10Tdap_Yes, received Tdap","TetanusLast10Tdap_Yes, received tetanus shot but not sure what type","TetanusLast10Tdap_Yes, received tetanus shot, but not Tdap",HighRiskLastYear_Yes,CovidPos_Tested positive using home test without a health professional,CovidPos_Yes
0,4.0,0.0,9.0,1.6,71.67,27.99,False,False,False,False,...,False,False,True,True,True,False,False,False,False,False
1,0.0,0.0,6.0,1.78,95.25,30.13,False,False,False,False,...,False,False,True,True,False,True,False,False,False,False
2,0.0,0.0,8.0,1.85,108.86,31.66,False,False,False,False,...,True,False,False,True,False,False,False,False,False,True
3,5.0,0.0,9.0,1.7,90.72,31.32,False,False,False,False,...,False,False,True,True,False,False,False,False,False,True
4,3.0,15.0,5.0,1.55,79.38,33.07,False,False,False,False,...,False,False,True,True,False,False,False,False,False,False


## Handling Imbalanced Data

In [64]:
# use SMOTE / Nearmiss to balance the dataset
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import NearMiss

# apply SMOTE
smote = SMOTE(sampling_strategy='minority')
X_sm, y_sm = smote.fit_resample(X, y)

# apply NearMiss
nm = NearMiss()
X_nm, y_nm = nm.fit_resample(X, y)

plt = px.bar(y_sm.value_counts(), title='Distribution of the target variable after applying SMOTE')
plt.show()

plt = px.bar(y_nm.value_counts(), title='Distribution of the target variable after applying NearMiss')
plt.show()

## Dataset Splitting

In [65]:
# split the data into training and testing sets
from sklearn.model_selection import train_test_split

# split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_sm, y_sm, test_size=0.3, random_state=42)

# peak into the training set
X_train.head()

Unnamed: 0,PhysicalHealthDays,MentalHealthDays,SleepHours,HeightInMeters,WeightInKilograms,BMI,State_Alaska,State_Arizona,State_Arkansas,State_California,...,AlcoholDrinkers_Yes,HIVTesting_Yes,FluVaxLast12_Yes,PneumoVaxEver_Yes,"TetanusLast10Tdap_Yes, received Tdap","TetanusLast10Tdap_Yes, received tetanus shot but not sure what type","TetanusLast10Tdap_Yes, received tetanus shot, but not Tdap",HighRiskLastYear_Yes,CovidPos_Tested positive using home test without a health professional,CovidPos_Yes
48815,0.0,0.0,8.0,1.88,77.11,21.83,False,False,False,False,...,True,False,False,True,True,False,False,False,True,False
286022,0.0,0.0,7.0,1.762029,88.814885,28.587485,False,False,False,False,...,True,True,True,False,True,True,False,False,False,True
367384,0.0,3.643895,6.643895,1.52,71.797725,30.911119,False,False,False,False,...,True,False,True,True,False,True,False,False,False,False
322905,0.0,0.0,6.086237,1.802587,86.18,26.437047,False,False,False,False,...,True,True,False,False,False,False,False,False,False,False
260043,0.0,0.1886,7.1886,1.67057,77.11,27.768164,False,False,False,False,...,True,False,True,True,False,False,False,False,False,False


# Model Training

## Random Forest

In [66]:
# train a random forest classifier
from sklearn.ensemble import RandomForestClassifier

# instantiate the classifier
rf = RandomForestClassifier()

# train the classifier
rf.fit(X_train, y_train)

# make predictions
y_pred = rf.predict(X_test)

In [67]:
# evaluate the model
from sklearn.metrics import classification_report

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

          No       0.97      0.97      0.97     69906
         Yes       0.97      0.97      0.97     69647

    accuracy                           0.97    139553
   macro avg       0.97      0.97      0.97    139553
weighted avg       0.97      0.97      0.97    139553



In [68]:
# confusion_matrix
from sklearn.metrics import confusion_matrix

confusion_matrix(y_test, y_pred)

array([[67806,  2100],
       [ 1899, 67748]], dtype=int64)

In [69]:
# Random Forest ROC Curve
from sklearn.metrics import roc_curve

# get the probabilities
y_pred_proba = rf.predict_proba(X_test)[:, 1]

# calculate the fpr, tpr and thresholds
fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba, pos_label='Yes')

# use plotly
plt = px.area(x=fpr, y=tpr, title='ROC Curve', labels=dict(x='False Positive Rate', y='True Positive Rate'))
plt.add_shape(type='line', line=dict(dash='dash'), x0=0, x1=1, y0=0, y1=1)
plt.show()



In [70]:
# save the model
import joblib

joblib.dump(rf, 'outputs/random_forest_model.pkl')

['outputs/random_forest_model.pkl']

## Decision Tree

In [71]:
# decision tree training
from sklearn.tree import DecisionTreeClassifier

# instantiate the classifier
dt = DecisionTreeClassifier()

# train the classifier
dt.fit(X_train, y_train)

# make predictions
y_pred_dt = dt.predict(X_test)

In [72]:
# evaluate the model
print(classification_report(y_test, y_pred_dt))

              precision    recall  f1-score   support

          No       0.96      0.94      0.95     69906
         Yes       0.94      0.96      0.95     69647

    accuracy                           0.95    139553
   macro avg       0.95      0.95      0.95    139553
weighted avg       0.95      0.95      0.95    139553



In [73]:
# confusion_matrix
confusion_matrix(y_test, y_pred_dt)

array([[65602,  4304],
       [ 2681, 66966]], dtype=int64)

In [74]:
# Decision Tree ROC Curve
# get the probabilities
y_pred_proba_dt = dt.predict_proba(X_test)[:, 1]

# calculate the fpr, tpr and thresholds
fpr_dt, tpr_dt, thresholds_dt = roc_curve(y_test, y_pred_proba_dt, pos_label='Yes')

# plot the ROC curve
plt = px.area(x=fpr_dt, y=tpr_dt, title='ROC Curve', labels=dict(x='False Positive Rate', y='True Positive Rate'))
plt.add_shape(type='line', line=dict(dash='dash'), x0=0, x1=1, y0=0, y1=1)
plt.show()

In [75]:
# save the model
joblib.dump(dt, 'outputs/decision_tree_model.pkl')

['outputs/decision_tree_model.pkl']

## Ada Boost

In [76]:
# ada boost
from sklearn.ensemble import AdaBoostClassifier

# instantiate the classifier
ada = AdaBoostClassifier()

# train the classifier
ada.fit(X_train, y_train)

# make predictions
y_pred_ada = ada.predict(X_test)

In [77]:
# evaluate the model
print(classification_report(y_test, y_pred_ada))

              precision    recall  f1-score   support

          No       0.93      0.95      0.94     69906
         Yes       0.95      0.93      0.94     69647

    accuracy                           0.94    139553
   macro avg       0.94      0.94      0.94    139553
weighted avg       0.94      0.94      0.94    139553



In [78]:
# confusion_matrix
confusion_matrix(y_test, y_pred_ada)

array([[66396,  3510],
       [ 5076, 64571]], dtype=int64)

In [79]:
# AdaBoost ROC Curve
# get the probabilities
y_pred_proba_ada = ada.predict_proba(X_test)[:, 1]

# calculate the fpr, tpr and thresholds
fpr_ada, tpr_ada, thresholds_ada = roc_curve(y_test, y_pred_proba_ada, pos_label='Yes')

# plot the ROC curve
plt = px.area(x=fpr_ada, y=tpr_ada, title='ROC Curve', labels=dict(x='False Positive Rate', y='True Positive Rate'))
plt.add_shape(type='line', line=dict(dash='dash'), x0=0, x1=1, y0=0, y1=1)
plt.show()

In [80]:
# save the model
joblib.dump(ada, 'outputs/ada_boost_model.pkl')

['outputs/ada_boost_model.pkl']

## Naïve Bayes

In [93]:
# kfold cross validation
from sklearn.model_selection import KFold

# make a 10 fold cross validation
cv = KFold(n_splits=10, random_state=None,shuffle=False) 

In [None]:
# Naïve Bayes
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import GridSearchCV

# instantiate the classifier
_nb = GaussianNB()
nb = GridSearchCV(_nb, param_grid = {'var_smoothing': np.logspace(1,10, num=100)}, cv = cv, verbose = 1)
# train the classifier
nb.fit(X_train, y_train)

# make predictions
y_pred_nb = nb.predict(X_test)

In [95]:
# evaluate the model
print(classification_report(y_test, y_pred_nb))

              precision    recall  f1-score   support

          No       0.91      0.85      0.88     69906
         Yes       0.86      0.92      0.89     69647

    accuracy                           0.88    139553
   macro avg       0.88      0.88      0.88    139553
weighted avg       0.88      0.88      0.88    139553



In [96]:
# confusion_matrix
confusion_matrix(y_test, y_pred_nb)

array([[59184, 10722],
       [ 5639, 64008]], dtype=int64)

In [97]:
# Naïve Bayes ROC Curve
# get the probabilities
y_pred_proba_nb = nb.predict_proba(X_test)[:, 1]

# calculate the fpr, tpr and thresholds
fpr_nb, tpr_nb, thresholds_nb = roc_curve(y_test, y_pred_proba_nb, pos_label='Yes')

# plot the ROC curve
plt = px.area(x=fpr_nb, y=tpr_nb, title='ROC Curve', labels=dict(x='False Positive Rate', y='True Positive Rate'))
plt.add_shape(type='line', line=dict(dash='dash'), x0=0, x1=1, y0=0, y1=1)
plt.show()

In [98]:
# save the model
joblib.dump(nb, 'outputs/naive_bayes_model.pkl')

['outputs/naive_bayes_model.pkl']