In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    classification_report, confusion_matrix, log_loss
)
import numpy as np

In [2]:
# Load datasets
train_data = pd.read_csv("../../Data/BRFSS_2024_model_ready_train.csv", low_memory=False)
test_data  = pd.read_csv("../../Data/BRFSS_2024_model_ready_test.csv", low_memory=False)

# Training data
X = train_data.drop("DIABETE4", axis=1)
y = train_data["DIABETE4"].astype(int)

# Test data
X_test = test_data.drop("DIABETE4", axis=1)
y_test = test_data["DIABETE4"].astype(int)

# Internal 80/20 train-validation split
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print("Train shape:", X_train.shape)
print("Validation shape:", X_val.shape)
print("Test shape:", X_test.shape)

Train shape: (725362, 198)
Validation shape: (181341, 198)
Test shape: (90649, 198)


In [3]:
# Subsample training set for FAST hyperparameter tuning
subsample_fraction = 0.30  

X_train_sub = X_train.sample(frac=subsample_fraction, random_state=42)
y_train_sub = y_train.loc[X_train_sub.index]

print("Subsample size:", X_train_sub.shape)

Subsample size: (217609, 198)


In [4]:
# Hyperparameter tuning for p=2 (FAST MODE)
param_grid_p2 = {
    'n_neighbors': [550, 750, 950, 1150, 1350],
    'weights': ['uniform', 'distance'],
    'leaf_size': [30],
    'metric': ['minkowski'],
    'p': [2]
}

grid_p2 = GridSearchCV(
    estimator=KNeighborsClassifier(n_jobs=-1),
    param_grid=param_grid_p2,
    cv=3,                     # FAST
    scoring='accuracy',
    n_jobs=-1,
    verbose=1,
    refit=True                # trains on X_train_sub automatically
)

# Fit on SUBSAMPLE
grid_p2.fit(X_train_sub, y_train_sub)

print("Best parameters (p=2):", grid_p2.best_params_)
print("Best CV score (p=2):", grid_p2.best_score_)

# Get the trained model directly
knn_p2 = grid_p2.best_estimator_

Fitting 3 folds for each of 10 candidates, totalling 30 fits
Best parameters (p=2): {'leaf_size': 30, 'metric': 'minkowski', 'n_neighbors': 550, 'p': 2, 'weights': 'distance'}
Best CV score (p=2): 0.5394813460902096


In [5]:
# Validation evaluation for p=2
val_pred_p2 = knn_p2.predict(X_val)
val_f1_p2 = f1_score(y_val, val_pred_p2, average='macro')
print("Validation F1 (p=2):", val_f1_p2)

Validation F1 (p=2): 0.5434422331926972


In [6]:
y_pred_test_p2 = knn_p2.predict(X_test)
y_proba_test_p2 = knn_p2.predict_proba(X_test)

accuracy = accuracy_score(y_test, y_pred_test_p2)
precision = precision_score(y_test, y_pred_test_p2, average='macro')
recall = recall_score(y_test, y_pred_test_p2, average='macro')
f1 = f1_score(y_test, y_pred_test_p2, average='macro')
logloss = log_loss(y_test, y_proba_test_p2)

print(f"Accuracy:  {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall:    {recall:.4f}")
print(f"F1 Score:  {f1:.4f}")
print(f"Log Loss:  {logloss:.4f}")

Accuracy:  0.3921
Precision: 0.4223
Recall:    0.4706
F1 Score:  0.3142
Log Loss:  1.1728


In [7]:
# Hyperparameter Tuning for p = 1

param_grid_p1 = {
    'n_neighbors': [550, 750, 950, 1150, 1350],
    'weights': ['uniform', 'distance'],
    'leaf_size': [30],
    'metric': ['minkowski'],
    'p': [1]
}

grid_p1 = GridSearchCV(
    estimator=KNeighborsClassifier(n_jobs=-1),
    param_grid=param_grid_p1,
    cv=3,                     # FAST â€” no KFold overhead
    scoring='accuracy',
    n_jobs=-1,
    verbose=1,
    refit=True                # trains best model on X_train_sub automatically
)

# Fit on SUBSAMPLED data
grid_p1.fit(X_train_sub, y_train_sub)

print("Best parameters (p=1):", grid_p1.best_params_)
print("Best CV score (p=1):", grid_p1.best_score_)

# This is the trained KNN model 
knn_p1 = grid_p1.best_estimator_

Fitting 3 folds for each of 10 candidates, totalling 30 fits
Best parameters (p=1): {'leaf_size': 30, 'metric': 'minkowski', 'n_neighbors': 550, 'p': 1, 'weights': 'distance'}
Best CV score (p=1): 0.615250271978855


In [8]:
# Validation evaluation for p=1

val_pred_p1 = knn_p1.predict(X_val)
val_f1_p1 = f1_score(y_val, val_pred_p1, average='macro')
print("Validation F1 (p=1):", val_f1_p1)

Validation F1 (p=1): 0.6399048383523854


In [9]:
y_pred_test_p1 = knn_p1.predict(X_test)
y_proba_test_p1 = knn_p1.predict_proba(X_test)

accuracy = accuracy_score(y_test, y_pred_test_p1)
precision = precision_score(y_test, y_pred_test_p1, average='macro')
recall = recall_score(y_test, y_pred_test_p1, average='macro')
f1 = f1_score(y_test, y_pred_test_p1, average='macro')
logloss = log_loss(y_test, y_proba_test_p1)

print(f"Accuracy:     {accuracy:.4f}")
print(f"Precision:    {precision:.4f}")
print(f"Recall:       {recall:.4f}")
print(f"F1 Score:     {f1:.4f}")
print(f"Log Loss:     {logloss:.4f}")

print("\nClassification Report:\n", classification_report(y_test, y_pred_test_p1))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred_test_p1))

Accuracy:     0.6171
Precision:    0.4270
Recall:       0.4999
F1 Score:     0.4103
Log Loss:     0.8575

Classification Report:
               precision    recall  f1-score   support

           1       0.30      0.61      0.40     13162
           3       0.94      0.63      0.75     75226
           4       0.04      0.26      0.08      2261

    accuracy                           0.62     90649
   macro avg       0.43      0.50      0.41     90649
weighted avg       0.82      0.62      0.68     90649


Confusion Matrix:
 [[ 7984  2540  2638]
 [17557 47363 10306]
 [  946   719   596]]
