In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    classification_report, confusion_matrix, log_loss
)

In [3]:
data = pd.read_csv("../../Data/BRFSS_2024_model_ready_train.csv", low_memory=False)

In [4]:
X_train = data.drop('DIABETE4', axis=1)
y_train = data['DIABETE4'].astype(int)

In [5]:
test_data = pd.read_csv("../../Data/BRFSS_2024_model_ready_test.csv", low_memory=False)
X_test = test_data.drop('DIABETE4', axis=1)
y_test = test_data['DIABETE4'].astype(int)

In [6]:
import numpy as np
from sklearn.model_selection import StratifiedKFold, GridSearchCV

param_grid = {
    "var_smoothing": np.logspace(-12, -6, 13)
}

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

search = GridSearchCV(
    GaussianNB(),
    param_grid=param_grid,
    scoring="f1_macro",
    cv=cv,
    #n_jobs=-1,
    verbose=1,
    return_train_score=True
)

search.fit(X_train, y_train)

print("Best CV macro-F1:", search.best_score_)
print("Best parameters:", search.best_params_)

best_nb = search.best_estimator_

y_pred = best_nb.predict(X_test)
y_proba = best_nb.predict_proba(X_test)

print("\n=== Tuned GaussianNB Results ===")
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Macro Precision:", precision_score(y_test, y_pred, average='macro', zero_division=0))
print("Macro Recall:", recall_score(y_test, y_pred, average='macro', zero_division=0))
print("Macro F1:", f1_score(y_test, y_pred, average='macro', zero_division=0))
print("Log Loss:", log_loss(y_test, y_proba))

print("\nClassification Report:\n", classification_report(y_test, y_pred, zero_division=0))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))

Fitting 5 folds for each of 13 candidates, totalling 65 fits
Best CV macro-F1: 0.5033988454619587
Best parameters: {'var_smoothing': np.float64(1e-06)}

=== Tuned GaussianNB Results ===
Accuracy: 0.45947555957594677
Macro Precision: 0.4129509942854632
Macro Recall: 0.4754725462731961
Macro F1: 0.3420472008370377
Log Loss: 8.189865799395665

Classification Report:
               precision    recall  f1-score   support

           1       0.26      0.60      0.37     13162
           3       0.94      0.44      0.60     75226
           4       0.03      0.39      0.06      2261

    accuracy                           0.46     90649
   macro avg       0.41      0.48      0.34     90649
weighted avg       0.82      0.46      0.55     90649


Confusion Matrix:
 [[ 7880  1626  3656]
 [21048 32888 21290]
 [  934   444   883]]
