In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix, log_loss
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
import numpy as np


In [None]:
import os
from pathlib import Path

# File this cell looks for, and the directory tree it looks in (the resolved
# current working directory and everything below it).
target_name = "BRFSS_2024_model_ready.csv"
search_root = Path().resolve()

print("Current working directory:\n", search_root, "\n")

print(f"Searching for {target_name} ...")
# Comment to Sara: We already split the data into train and test sets, just import BRFSS_2024_model_ready_train.csv
# and BRFSS_2024_model_ready_test.csv directly in the code.
# Collect the hits first so that an empty result can be reported explicitly
# instead of the loop silently printing nothing.
matches = sorted(search_root.rglob(target_name))
for p in matches:
    print("Found at:", p)
if not matches:
    print(f"No {target_name} found under {search_root}")


Current working directory:
 C:\CS506-Project---Analyzing-Lifestyle-and-Demographic-Risk-Factors-of-Diabetes-with-BRFSS-Data\Code\models 

Searching for BRFSS_2024_model_ready.csv ...


In [None]:
# Read the model-ready BRFSS table from the data-cleaning output folder.
# Comment to Sara: We already split the data into train and test sets, just import BRFSS_2024_model_ready_train.csv
# and BRFSS_2024_model_ready_test.csv directly in the code.
data = pd.read_csv("../../Results/Data Cleaning Logs/BRFSS_2024_model_ready.csv", low_memory=False)

# Target is the DIABETE4 column cast to int; every other column is a feature.
y = data["DIABETE4"].astype(int)
X = data.drop(columns="DIABETE4")

# Comment to Sara: You can remove this part since we've already split the data into train and test sets
# Hold out 20% of the rows for testing; stratify on the label and fix the
# seed so the split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=42,
    test_size=0.2,
)

print("Train shape:", X_train.shape, "Test shape:", X_test.shape)


Train shape: (362592, 198) Test shape: (90649, 198)


In [6]:
def evaluate_model(clf, X_test, y_test, title="Model"):
    """Print hold-out classification metrics for a fitted classifier.

    Reports accuracy, macro-averaged precision / recall / F1, the per-class
    classification report and the confusion matrix. Log loss is reported only
    when ``clf`` exposes ``predict_proba``.

    Parameters
    ----------
    clf : fitted estimator
        Must provide ``predict``; ``predict_proba`` is optional.
    X_test : array-like
        Features passed to ``clf.predict`` (and ``clf.predict_proba``).
    y_test : array-like
        True labels corresponding to ``X_test``.
    title : str, default "Model"
        Heading printed above the metrics.

    Returns
    -------
    None
        All results are written to stdout.
    """
    y_pred = clf.predict(X_test)

    ll = None
    if hasattr(clf, "predict_proba"):
        y_proba = clf.predict_proba(X_test)
        # Hand log_loss the classifier's own class list: without it the labels
        # are inferred from y_test, which raises when the test labels do not
        # contain every class the probability columns stand for.
        ll = log_loss(y_test, y_proba, labels=getattr(clf, "classes_", None))

    # Shared settings for the three macro-averaged scores; zero_division=0
    # scores a never-predicted class as 0 instead of emitting a warning.
    macro = {"average": "macro", "zero_division": 0}
    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred, **macro)
    rec = recall_score(y_test, y_pred, **macro)
    f1 = f1_score(y_test, y_pred, **macro)

    print(f"\n=== {title} ===")
    print(f"Accuracy:          {acc:.4f}")
    print(f"Precision (macro): {prec:.4f}")
    print(f"Recall (macro):    {rec:.4f}")
    print(f"F1 Score (macro):  {f1:.4f}")
    if ll is not None:
        print(f"Log Loss:          {ll:.4f}")

    print("\nClassification Report:\n", classification_report(y_test, y_pred, zero_division=0))
    print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))


In [None]:
# Comment to Sara: You can remove this cell since we've already split the data into train and test sets
# Rebuild the target (DIABETE4 as int) and the feature frame (all other columns) from `data`.
y = data["DIABETE4"].astype(int)
X = data.drop(columns="DIABETE4")

# 80/20 stratified split with a fixed seed (same arguments as the split in the loading cell).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=42,
    test_size=0.2,
)

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC

# Comment to Sara: We've already performed data cleaning and preprocessing, so we can directly use the train and test sets. No need to use StandardScaler
# Baseline: scale the features (without centering), then fit a linear SVM
# with a fixed seed. `fit` returns the pipeline itself, so build and train
# in one expression.
baseline_svm = Pipeline(
    steps=[
        ("scaler", StandardScaler(with_mean=False)),
        ("svm", LinearSVC(random_state=42)),
    ]
).fit(X_train, y_train)

# Report hold-out metrics for the fitted baseline.
evaluate_model(clf=baseline_svm, X_test=X_test, y_test=y_test, title="Baseline Linear SVM")


=== Baseline Linear SVM ===
Accuracy:          0.8366
Precision (macro): 0.4768
Recall (macro):    0.3712
F1 Score (macro):  0.3732

Classification Report:
               precision    recall  f1-score   support

           1       0.59      0.13      0.21     13162
           3       0.84      0.99      0.91     75226
           4       0.00      0.00      0.00      2261

    accuracy                           0.84     90649
   macro avg       0.48      0.37      0.37     90649
weighted avg       0.79      0.84      0.79     90649


Confusion Matrix:
 [[ 1681 11481     0]
 [ 1071 74155     0]
 [  118  2143     0]]


In [None]:
from sklearn.model_selection import GridSearchCV

# Comment to Sara: We've already performed data cleaning and preprocessing, so we can directly use the train and test sets. No need to use StandardScaler
# Estimator to tune: scaling (no centering) followed by a seeded linear SVM.
svm_pipe = Pipeline(
    steps=[
        ("scaler", StandardScaler(with_mean=False)),
        ("svm", LinearSVC(random_state=42)),
    ]
)

# Candidate values, addressed through the "svm" step of the pipeline:
# 4 values of C x 2 tolerances = 8 candidates.
param_grid = dict(
    svm__C=[0.01, 0.1, 1, 10],
    svm__tol=[1e-3, 1e-4],
)

print("Starting GridSearchCV...")

# 3-fold cross-validated search scored on macro-F1, using all available cores.
grid_svm = GridSearchCV(
    estimator=svm_pipe,
    param_grid=param_grid,
    scoring="f1_macro",
    cv=3,
    n_jobs=-1,
    verbose=2,
)
grid_svm.fit(X_train, y_train)

print("Best params:", grid_svm.best_params_)
print("Best CV Macro-F1:", grid_svm.best_score_)


Starting GridSearchCV...
Fitting 3 folds for each of 8 candidates, totalling 24 fits
Best params: {'svm__C': 0.1, 'svm__tol': 0.0001}
Best CV Macro-F1: 0.37268136941446756


In [10]:
# Take the best pipeline found by the grid search and score it on the held-out test split.
best_svm = grid_svm.best_estimator_

print("\n=== Tuned Linear SVM Results ===")
evaluate_model(
    clf=best_svm,
    X_test=X_test,
    y_test=y_test,
    title="Tuned Linear SVM",
)


=== Tuned Linear SVM Results ===

=== Tuned Linear SVM ===
Accuracy:          0.8366
Precision (macro): 0.4770
Recall (macro):    0.3712
F1 Score (macro):  0.3732

Classification Report:
               precision    recall  f1-score   support

           1       0.59      0.13      0.21     13162
           3       0.84      0.99      0.91     75226
           4       0.00      0.00      0.00      2261

    accuracy                           0.84     90649
   macro avg       0.48      0.37      0.37     90649
weighted avg       0.79      0.84      0.79     90649


Confusion Matrix:
 [[ 1682 11480     0]
 [ 1070 74156     0]
 [  118  2143     0]]


In [None]:
###############################################
# OPTIONAL — RBF SVM
###############################################

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

# Only construct the pipeline here; this cell does not fit it.
# Steps: scaling (no centering), then an RBF-kernel SVC with probability
# estimates enabled and a fixed seed.
rbf_svm = Pipeline(
    steps=[
        ("scaler", StandardScaler(with_mean=False)),
        ("svm", SVC(kernel="rbf", probability=True, random_state=42)),
    ]
)

print("RBF SVM model created (not fitted).")
print("Prem can run rbf_svm.fit(X_train, y_train) on his machine.")
