In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder, label_binarize
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, classification_report, confusion_matrix, log_loss, roc_curve, auc
)
import matplotlib.pyplot as plt
import numpy as np
from sklearn import tree
import pickle
import os

In [3]:
data = pd.read_csv("../../Data/BRFSS_2024_model_ready.csv", low_memory=False)
X = data.drop('DIABETE4', axis=1)
y = data['DIABETE4'].astype(int)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [4]:
clf = DecisionTreeClassifier(
    criterion='gini',
    max_depth=None,
    random_state=42
)
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)
y_proba = clf.predict_proba(X_test)[:, 1] if len(clf.classes_) == 2 else None

In [5]:
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='macro')
recall = recall_score(y_test, y_pred, average='macro')
f1 = f1_score(y_test, y_pred, average='macro')


print(f"Accuracy:  {accuracy:.4f}")
print(f"Precision (macro): {precision:.4f}")
print(f"Recall (macro):    {recall:.4f}")
print(f"F1 Score (macro):  {f1:.4f}")

print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))

Accuracy:  0.7527
Precision (macro): 0.4009
Recall (macro):    0.4053
F1 Score (macro):  0.4029

Classification Report:
               precision    recall  f1-score   support

           1       0.30      0.32      0.31     13162
           3       0.87      0.85      0.86     75226
           4       0.04      0.04      0.04      2261

    accuracy                           0.75     90649
   macro avg       0.40      0.41      0.40     90649
weighted avg       0.76      0.75      0.76     90649


Confusion Matrix:
 [[ 4238  8342   582]
 [ 9388 63892  1946]
 [  536  1624   101]]


In [6]:
from sklearn.metrics import accuracy_score, f1_score
import pandas as pd

criteria = ['gini', 'entropy', 'log_loss']
depth_values = [4, 6, 8, 10, None]      
min_samples_values = [2, 5, 10, 20]

records = []

for crit in criteria:
    for max_depth in depth_values:
        for min_split in min_samples_values:
            clf_temp = DecisionTreeClassifier(
                criterion=crit,
                max_depth=max_depth,
                min_samples_split=min_split,
                random_state=42
            )

            clf_temp.fit(X_train, y_train)
            y_pred_temp = clf_temp.predict(X_test)

            acc = accuracy_score(y_test, y_pred_temp)
            f1_macro = f1_score(y_test, y_pred_temp, average='macro')

            records.append({
                "Criterion": crit,
                "Max Depth": max_depth,
                "Min Samples Split": min_split,
                "Accuracy": acc,
                "F1_macro": f1_macro
            })

results_df = pd.DataFrame(records)


In [7]:
results_sorted = (
    results_df
    .sort_values(by=["F1_macro", "Accuracy"], ascending=False)
    .reset_index(drop=True)
)

results_sorted.head(20)



Unnamed: 0,Criterion,Max Depth,Min Samples Split,Accuracy,F1_macro
0,gini,,20,0.77724,0.405705
1,entropy,,20,0.779611,0.405317
2,log_loss,,20,0.779611,0.405317
3,entropy,,10,0.75992,0.404346
4,log_loss,,10,0.75992,0.404346
5,gini,,2,0.752694,0.40288
6,gini,,10,0.760968,0.402162
7,gini,,5,0.754162,0.40213
8,entropy,,2,0.753378,0.401919
9,log_loss,,2,0.753378,0.401919


In [8]:
import pandas as pd

best_row = results_sorted.iloc[0]
print("Best hyperparameters from grid:")
print(best_row)

raw_max_depth = best_row["Max Depth"]
if pd.isna(raw_max_depth):
    max_depth_param = None
else:
    max_depth_param = int(raw_max_depth)

best_clf = DecisionTreeClassifier(
    criterion=best_row["Criterion"],
    max_depth=max_depth_param,
    min_samples_split=int(best_row["Min Samples Split"]),
    random_state=42
)

best_clf.fit(X_train, y_train)
y_pred_best = best_clf.predict(X_test)

best_accuracy = accuracy_score(y_test, y_pred_best)
best_precision = precision_score(y_test, y_pred_best, average='macro')
best_recall = recall_score(y_test, y_pred_best, average='macro')
best_f1 = f1_score(y_test, y_pred_best, average='macro')

print("\nTuned Decision Tree Performance:")
print(f"Accuracy:          {best_accuracy:.4f}")
print(f"Precision (macro): {best_precision:.4f}")
print(f"Recall (macro):    {best_recall:.4f}")
print(f"F1 Score (macro):  {best_f1:.4f}")

print("\nClassification Report:\n", classification_report(y_test, y_pred_best))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred_best))


Best hyperparameters from grid:
Criterion                gini
Max Depth                 NaN
Min Samples Split          20
Accuracy              0.77724
F1_macro             0.405705
Name: 0, dtype: object

Tuned Decision Tree Performance:
Accuracy:          0.7772
Precision (macro): 0.4087
Recall (macro):    0.4072
F1 Score (macro):  0.4057

Classification Report:
               precision    recall  f1-score   support

           1       0.32      0.33      0.33     13162
           3       0.86      0.88      0.87     75226
           4       0.04      0.01      0.02      2261

    accuracy                           0.78     90649
   macro avg       0.41      0.41      0.41     90649
weighted avg       0.77      0.78      0.77     90649


Confusion Matrix:
 [[ 4347  8662   153]
 [ 8581 66080   565]
 [  550  1682    29]]


In [9]:
print("Baseline tree (your original clf):")
print(f"  Accuracy: {accuracy:.4f}")
print(f"  F1_macro: {f1:.4f}")

print("\nBest tuned tree (from grid):")
print(f"  Accuracy: {best_accuracy:.4f}")

print(f"  F1_macro: {best_f1:.4f}")


Baseline tree (your original clf):
  Accuracy: 0.7527
  F1_macro: 0.4029

Best tuned tree (from grid):
  Accuracy: 0.7772
  F1_macro: 0.4057


In [11]:
# SAVE PICKLE BUNDLE FOR DECISION TREE MODEL

# Make sure y_proba exists and is a proper array if possible
if 'y_proba' not in globals() or y_proba is None:
    # Try to compute it from the model (clf or best_clf)
    try:
        model_for_proba = clf if 'clf' in globals() else best_clf
        if hasattr(model_for_proba, "predict_proba"):
            y_proba = model_for_proba.predict_proba(X_test)
        else:
            y_proba = None
    except Exception:
        y_proba = None

# 1. Build confusion matrix (if not already built)
cm = confusion_matrix(y_test, y_pred)


# 2. Build full results dictionary ("bundle")
bundle = {
    "model_name": "Decision Tree",

    # Core evaluation arrays
    "y_test": y_test,
    "y_pred": y_pred,
    "y_proba": y_proba,

    # Confusion matrix
    "confusion_matrix": cm,

    # Scalar performance metrics
    "accuracy": accuracy_score(y_test, y_pred),
    "precision_macro": precision_score(y_test, y_pred, average="macro", zero_division=0),
    "recall_macro": recall_score(y_test, y_pred, average="macro", zero_division=0),
    "f1_macro": f1_score(y_test, y_pred, average="macro", zero_division=0),

    # Log loss and AUC (only if y_proba exists)
    "log_loss": log_loss(y_test, y_proba) if y_proba is not None else None,
    "roc_auc_ovr": roc_auc_score(y_test, y_proba, multi_class="ovr") if y_proba is not None else None,

    # Hyperparameters
    "params": clf.get_params() if 'clf' in globals() else best_clf.get_params(),

    
    # MODEL-SPECIFIC VARIABLES
    
    # Required for Decision Tree feature importance visualizations
    "dt_feature_importance": clf.feature_importances_ if 'clf' in globals() else best_clf.feature_importances_,

    # If you used a DataFrame for X, feature names will exist:
    "dt_feature_names": X_train.columns if hasattr(X_train, "columns") else None,

    # --- Common keys for interactive feature explorer ---
    "feature_names": X_train.columns.to_numpy() if hasattr(X_train, "columns") else None,
    "X_test_sample": X_test.iloc[:2000] if hasattr(X_test, "iloc") else X_test[:2000],
}


# 3. Save bundle to specified directory

save_path = "../../Results/Visualizations"
os.makedirs(save_path, exist_ok=True)

bundle_filename = os.path.join(save_path, "decision_tree_bundle.pkl")

with open(bundle_filename, "wb") as f:
    pickle.dump(bundle, f)

print(f"\nDecision Tree pickle bundle saved to:\n{bundle_filename}")
print("Bundle keys:", list(bundle.keys()))



Decision Tree pickle bundle saved to:
../../Results/Visualizations/decision_tree_bundle.pkl
Bundle keys: ['model_name', 'y_test', 'y_pred', 'y_proba', 'confusion_matrix', 'accuracy', 'precision_macro', 'recall_macro', 'f1_macro', 'log_loss', 'roc_auc_ovr', 'params', 'dt_feature_importance', 'dt_feature_names', 'feature_names', 'X_test_sample']
