In [1]:
import os
import pickle

import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    classification_report,
    confusion_matrix,
    log_loss,
    roc_auc_score,
)
from sklearn.model_selection import GridSearchCV, StratifiedKFold, train_test_split
from sklearn.svm import LinearSVC, SVC

In [2]:
# Read the separate train / test CSVs and split each into features and the
# integer-cast DIABETE4 label column.
train = pd.read_csv("../../Data/BRFSS_2024_model_ready_train.csv", low_memory=False)
test = pd.read_csv("../../Data/BRFSS_2024_model_ready_test.csv", low_memory=False)

y_train = train["DIABETE4"].astype(int)
X_train = train.drop(columns="DIABETE4")

y_test = test["DIABETE4"].astype(int)
X_test = test.drop(columns="DIABETE4")

print("Train shape:", X_train.shape)
print("Test shape:", X_test.shape)

Train shape: (906703, 198)
Test shape: (90649, 198)


In [3]:
def evaluate_model(clf, X_test, y_test, title="Model"):
    """Print hold-out classification metrics for an already-fitted classifier.

    Reports accuracy, macro-averaged precision / recall / F1, the per-class
    classification report and the confusion matrix. Log loss is added when the
    classifier exposes ``predict_proba``.

    Parameters
    ----------
    clf : fitted estimator
        Must implement ``predict``; ``predict_proba`` is optional.
    X_test : array-like
        Features passed to ``clf.predict`` (and ``clf.predict_proba``).
    y_test : array-like
        True labels aligned with ``X_test``.
    title : str, default "Model"
        Heading printed above the metrics.

    Returns
    -------
    None
        Everything is printed; nothing is returned.
    """
    y_pred = clf.predict(X_test)

    ll = None
    ll_error = None
    if hasattr(clf, "predict_proba"):
        try:
            y_proba = clf.predict_proba(X_test)
            # Pass the model's own class order so the probability columns line up
            # even when y_test is missing one of the classes seen during training.
            ll = log_loss(y_test, y_proba, labels=getattr(clf, "classes_", None))
        except Exception as exc:
            # Log loss stays best-effort, but the reason is reported instead of
            # being swallowed silently.
            ll_error = f"{type(exc).__name__}: {exc}"

    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred, average="macro", zero_division=0)
    rec = recall_score(y_test, y_pred, average="macro", zero_division=0)
    f1 = f1_score(y_test, y_pred, average="macro", zero_division=0)

    print(f"\n=== {title} ===")
    print(f"Accuracy:          {acc:.4f}")
    print(f"Precision (macro): {prec:.4f}")
    print(f"Recall (macro):    {rec:.4f}")
    print(f"F1 Score (macro):  {f1:.4f}")
    if ll is not None:
        print(f"Log Loss:          {ll:.4f}")
    elif ll_error is not None:
        print(f"Log Loss:          unavailable ({ll_error})")

    print("\nClassification Report:\n",
          classification_report(y_test, y_pred, zero_division=0))
    print("\nConfusion Matrix:\n",
          confusion_matrix(y_test, y_pred))


In [4]:
# Baseline Linear SVM: C and tol left at their defaults, fixed seed, 2000-iteration cap.
# fit() returns the estimator itself, so construction and training are chained.
baseline_svm = LinearSVC(max_iter=2000, random_state=42).fit(X_train, y_train)

evaluate_model(
    baseline_svm,
    X_test,
    y_test,
    title="Baseline Linear SVM",
)


=== Baseline Linear SVM ===
Accuracy:          0.6164
Precision (macro): 0.4388
Recall (macro):    0.5302
F1 Score (macro):  0.4199

Classification Report:
               precision    recall  f1-score   support

           1       0.32      0.57      0.41     13162
           3       0.94      0.63      0.76     75226
           4       0.05      0.39      0.09      2261

    accuracy                           0.62     90649
   macro avg       0.44      0.53      0.42     90649
weighted avg       0.83      0.62      0.69     90649


Confusion Matrix:
 [[ 7511  2269  3382]
 [15048 47489 12689]
 [  729   653   879]]


In [5]:
# Tuned Linear SVM: same setup as the baseline, but with C=10 and tol=1e-3
# (the hyperparameters recorded as best; the search that chose them is not in this cell).
tuned_params = {"C": 10, "tol": 1e-3, "max_iter": 2000, "random_state": 42}
tuned_svm = LinearSVC(**tuned_params)
tuned_svm.fit(X_train, y_train)

evaluate_model(
    tuned_svm,
    X_test,
    y_test,
    title="Tuned Linear SVM (C=10, tol=1e-3)",
)


=== Tuned Linear SVM (C=10, tol=1e-3) ===
Accuracy:          0.6171
Precision (macro): 0.4390
Recall (macro):    0.5309
F1 Score (macro):  0.4203

Classification Report:
               precision    recall  f1-score   support

           1       0.32      0.57      0.41     13162
           3       0.94      0.63      0.76     75226
           4       0.05      0.39      0.09      2261

    accuracy                           0.62     90649
   macro avg       0.44      0.53      0.42     90649
weighted avg       0.83      0.62      0.69     90649


Confusion Matrix:
 [[ 7522  2270  3370]
 [15058 47536 12632]
 [  729   652   880]]


In [None]:
# Tuned RBF SVM (grid search over C and gamma).
#
# Kernel SVC fit time grows at least quadratically with the number of rows, and
# probability=True adds an internal cross-validation to every fit. Running the
# 27 CV fits (plus the final refit) on the full training set is therefore
# impractical, so the search runs on a stratified subsample capped at
# RBF_TUNE_MAX_SAMPLES rows. The best estimator is refit on that same subsample.
RBF_TUNE_MAX_SAMPLES = 20_000  # raise for a more faithful (but much slower) search

if len(X_train) > RBF_TUNE_MAX_SAMPLES:
    # Split row positions rather than the frames themselves so the discarded
    # remainder of the training set is never copied.
    tune_idx = train_test_split(
        np.arange(len(X_train)),
        train_size=RBF_TUNE_MAX_SAMPLES,
        stratify=y_train,
        random_state=42,
    )[0]
    X_tune, y_tune = X_train.iloc[tune_idx], y_train.iloc[tune_idx]
else:
    X_tune, y_tune = X_train, y_train

print("RBF tuning subset shape:", X_tune.shape)

rbf_param_grid = {
    "C": [0.1, 1, 10],
    "gamma": ["scale", 0.01, 0.001]
}

# probability=True is kept so evaluate_model can report log loss via predict_proba.
rbf_svm = SVC(kernel="rbf", probability=True, random_state=42)

cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
grid_rbf = GridSearchCV(
    rbf_svm,
    rbf_param_grid,
    scoring="f1_macro",
    cv=cv,
    n_jobs=-1,   # the candidate fits are independent, so run them in parallel
    verbose=2,
)
grid_rbf.fit(X_tune, y_tune)

print("Best RBF params:", grid_rbf.best_params_)
evaluate_model(grid_rbf.best_estimator_, X_test, y_test, title="Tuned RBF SVM")

Fitting 3 folds for each of 9 candidates, totalling 27 fits


In [6]:
# SAVE PICKLE BUNDLE FOR SVM MODEL (TUNED LINEAR SVM)


# 1. Export target: the tuned Linear SVM (must already be fitted by an earlier cell)
final_svm = tuned_svm

# Hard class predictions on the held-out set
y_pred = final_svm.predict(X_test)

# LinearSVC offers no predict_proba; decision_function scores are stored instead.
# The lookup and the call are both best-effort: any failure leaves y_scores as None.
score_fn = getattr(final_svm, "decision_function", None)
try:
    y_scores = None if score_fn is None else score_fn(X_test)
except Exception:
    y_scores = None

cm = confusion_matrix(y_test, y_pred)

# Scalar metrics, computed up front so the bundle assembly below stays declarative
macro_kwargs = {"average": "macro", "zero_division": 0}
test_accuracy = accuracy_score(y_test, y_pred)
test_precision_macro = precision_score(y_test, y_pred, **macro_kwargs)
test_recall_macro = recall_score(y_test, y_pred, **macro_kwargs)
test_f1_macro = f1_score(y_test, y_pred, **macro_kwargs)


# 2. 2D PCA projection + a separate Linear SVM used only to draw a decision boundary
pca = PCA(n_components=2, random_state=42)
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)

# Same C / tol / max_iter as the exported model, but trained on the two PCA axes
svm_pca = LinearSVC(
    C=final_svm.C,
    tol=final_svm.tol,
    max_iter=final_svm.max_iter,
    random_state=42,
).fit(X_train_pca, y_train)

# 200 x 200 grid covering the projected training points with a margin of 1 on each side
pc1, pc2 = X_train_pca[:, 0], X_train_pca[:, 1]
x_min, x_max = pc1.min() - 1, pc1.max() + 1
y_min, y_max = pc2.min() - 1, pc2.max() + 1

xx, yy = np.meshgrid(np.linspace(x_min, x_max, 200), np.linspace(y_min, y_max, 200))

grid_points = np.column_stack((xx.ravel(), yy.ravel()))
grid_pred = svm_pca.predict(grid_points).reshape(xx.shape)


# 3. Assemble the bundle section by section (insertion order is the key order)
if hasattr(X_train, "columns"):
    feature_names = X_train.columns.to_numpy()
else:
    feature_names = None

if hasattr(X_test, "iloc"):
    X_test_sample = X_test.iloc[:2000]
else:
    X_test_sample = X_test[:2000]

bundle = {"model_name": "Linear SVM (tuned)"}

# Core evaluation arrays (confusion matrix, ROC, etc.)
bundle.update({
    "y_test": y_test,
    "y_pred": y_pred,
    "y_scores": y_scores,   # decision scores, intended for ROC curves
    "y_proba": None,        # LinearSVC has no probabilities
    "confusion_matrix": cm,
})

# Scalar performance metrics (for model comparison plots);
# log_loss / AUC are skipped because there are no probabilities
bundle.update({
    "accuracy": test_accuracy,
    "precision_macro": test_precision_macro,
    "recall_macro": test_recall_macro,
    "f1_macro": test_f1_macro,
    "log_loss": None,
    "roc_auc_ovr": None,
})

# Hyperparameters and inputs for the interactive feature explorer
bundle.update({
    "params": final_svm.get_params(),
    "feature_names": feature_names,
    "X_test_sample": X_test_sample,
})

# SVM-specific: PCA-based 2D decision boundary
bundle.update({
    "svm_pca_components": pca.components_,
    "svm_pca_explained_variance_ratio": pca.explained_variance_ratio_,
    "svm_X_test_pca": X_test_pca,
    "svm_y_test": y_test,
    "svm_grid_x": xx,
    "svm_grid_y": yy,
    "svm_grid_pred": grid_pred,
})


# 4. Write the bundle under ../../Results/Visualizations (directory created if missing)
save_path = "../../Results/Visualizations"
os.makedirs(save_path, exist_ok=True)

bundle_filename = os.path.join(save_path, "linear_svm_bundle.pkl")

with open(bundle_filename, "wb") as f:
    pickle.dump(bundle, f)

print(f"\nSVM pickle bundle saved to: {bundle_filename}")
print("Bundle keys:", list(bundle.keys()))


SVM pickle bundle saved to: ../../Results/Visualizations\linear_svm_bundle.pkl
Bundle keys: ['model_name', 'y_test', 'y_pred', 'y_scores', 'y_proba', 'confusion_matrix', 'accuracy', 'precision_macro', 'recall_macro', 'f1_macro', 'log_loss', 'roc_auc_ovr', 'params', 'feature_names', 'X_test_sample', 'svm_pca_components', 'svm_pca_explained_variance_ratio', 'svm_X_test_pca', 'svm_y_test', 'svm_grid_x', 'svm_grid_y', 'svm_grid_pred']


In [None]:
# SAVE PICKLE BUNDLE FOR SVM MODEL (TUNED RBF SVM)


# 1. Use the best RBF SVC found by the grid search as the final model.
#    Requires the grid-search cell to have been run so that grid_rbf is fitted.
final_svm = grid_rbf.best_estimator_

# Predictions on test set
y_pred = final_svm.predict(X_test)

# Decision scores (kept alongside probabilities for ROC-style plots)
y_scores = (
    final_svm.decision_function(X_test)
    if hasattr(final_svm, "decision_function")
    else None
)

# Unlike LinearSVC, an SVC built with probability=True exposes predict_proba, so the
# probability-based fields are filled in here instead of being hard-coded to None.
y_proba = None
test_log_loss = None
test_roc_auc_ovr = None
if hasattr(final_svm, "predict_proba"):
    y_proba = final_svm.predict_proba(X_test)
    # Pass the model's class order so probability columns line up with the labels.
    test_log_loss = log_loss(y_test, y_proba, labels=final_svm.classes_)
    if y_proba.shape[1] == 2:
        # Binary case: roc_auc_score expects the positive-class column only.
        test_roc_auc_ovr = roc_auc_score(y_test, y_proba[:, 1])
    else:
        test_roc_auc_ovr = roc_auc_score(
            y_test, y_proba, multi_class="ovr", labels=final_svm.classes_
        )

cm = confusion_matrix(y_test, y_pred)


# 2. PCA + separate SVM for 2D decision boundary visualization
pca = PCA(n_components=2, random_state=42)
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)

# SVC's max_iter defaults to -1 ("no limit"), which is not a valid iteration cap
# for LinearSVC. Fall back to the cap used for the other LinearSVC models in this
# notebook when the RBF model has no positive limit.
surrogate_max_iter = final_svm.max_iter if final_svm.max_iter > 0 else 2000

# Train a Linear SVM in PCA space (for visualization only).
# NOTE(review): this is a *linear* surrogate sharing C / tol with the RBF model; it
# does not reproduce the RBF model's own (curved) decision boundary. Confirm this is
# what the downstream plot is meant to show.
svm_pca = LinearSVC(
    random_state=42,
    C=final_svm.C,
    tol=final_svm.tol,
    max_iter=surrogate_max_iter,
)
svm_pca.fit(X_train_pca, y_train)

# Meshgrid over PCA space (200 x 200, padded by 1 beyond the training extent)
x_min, x_max = X_train_pca[:, 0].min() - 1, X_train_pca[:, 0].max() + 1
y_min, y_max = X_train_pca[:, 1].min() - 1, X_train_pca[:, 1].max() + 1

xx, yy = np.meshgrid(
    np.linspace(x_min, x_max, 200),
    np.linspace(y_min, y_max, 200),
)

grid_points = np.c_[xx.ravel(), yy.ravel()]
grid_pred = svm_pca.predict(grid_points)
grid_pred = grid_pred.reshape(xx.shape)


# 3. Build SVM bundle (same keys, in the same order, as the linear SVM bundle)
bundle = {
    "model_name": "RBF SVM (tuned)",

    # Core evaluation arrays (for confusion matrix, ROC, etc.)
    "y_test": y_test,
    "y_pred": y_pred,
    "y_scores": y_scores,   # decision_function output
    "y_proba": y_proba,     # class probabilities; None if predict_proba is unavailable

    "confusion_matrix": cm,

    # Scalar performance metrics (for model comparison plots)
    "accuracy": accuracy_score(y_test, y_pred),
    "precision_macro": precision_score(y_test, y_pred, average="macro", zero_division=0),
    "recall_macro": recall_score(y_test, y_pred, average="macro", zero_division=0),
    "f1_macro": f1_score(y_test, y_pred, average="macro", zero_division=0),

    # Probability-based metrics; None if predict_proba is unavailable
    "log_loss": test_log_loss,
    "roc_auc_ovr": test_roc_auc_ovr,

    # Hyperparameters
    "params": final_svm.get_params(),

    # For interactive feature explorer
    "feature_names": X_train.columns.to_numpy() if hasattr(X_train, "columns") else None,
    "X_test_sample": X_test.iloc[:2000] if hasattr(X_test, "iloc") else X_test[:2000],

    # SVM-specific: PCA-based 2D decision boundary
    "svm_pca_components": pca.components_,
    "svm_pca_explained_variance_ratio": pca.explained_variance_ratio_,
    "svm_X_test_pca": X_test_pca,
    "svm_y_test": y_test,
    "svm_grid_x": xx,
    "svm_grid_y": yy,
    "svm_grid_pred": grid_pred,
}


# 4. Save bundle to ../../Results/Visualizations
save_path = "../../Results/Visualizations"
os.makedirs(save_path, exist_ok=True)

bundle_filename = os.path.join(save_path, "rbf_svm_bundle.pkl")

with open(bundle_filename, "wb") as f:
    pickle.dump(bundle, f)

print(f"\nSVM pickle bundle saved to: {bundle_filename}")
print("Bundle keys:", list(bundle.keys()))