In [23]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder, label_binarize
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, classification_report, confusion_matrix, roc_curve, auc
)
import matplotlib.pyplot as plt
import numpy as np
from sklearn import tree

In [24]:
data = pd.read_csv("../../Results/BRFSS_2024_model_ready.csv", low_memory=False)
X = data.drop('DIABETE4', axis=1)
y = data['DIABETE4'].astype(int)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [25]:
clf = DecisionTreeClassifier(
    criterion='gini',
    max_depth=None,
    random_state=42
)
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)
y_proba = clf.predict_proba(X_test)[:, 1] if len(clf.classes_) == 2 else None

In [26]:
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='macro')
recall = recall_score(y_test, y_pred, average='macro')
f1 = f1_score(y_test, y_pred, average='macro')


print(f"Accuracy:  {accuracy:.4f}")
print(f"Precision (macro): {precision:.4f}")
print(f"Recall (macro):    {recall:.4f}")
print(f"F1 Score (macro):  {f1:.4f}")

print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))

Accuracy:  0.7530
Precision (macro): 0.4036
Recall (macro):    0.4081
F1 Score (macro):  0.4056

Classification Report:
               precision    recall  f1-score   support

           1       0.30      0.32      0.31     13162
           3       0.86      0.85      0.86     75226
           4       0.05      0.06      0.05      2261

    accuracy                           0.75     90649
   macro avg       0.40      0.41      0.41     90649
weighted avg       0.76      0.75      0.76     90649


Confusion Matrix:
 [[ 4201  8405   556]
 [ 9340 63937  1949]
 [  521  1615   125]]


In [27]:
from sklearn.metrics import accuracy_score, f1_score
import pandas as pd

criteria = ['gini', 'entropy', 'log_loss']
depth_values = [4, 6, 8, 10, None]      
min_samples_values = [2, 5, 10, 20]

records = []

for crit in criteria:
    for max_depth in depth_values:
        for min_split in min_samples_values:
            clf_temp = DecisionTreeClassifier(
                criterion=crit,
                max_depth=max_depth,
                min_samples_split=min_split,
                random_state=42
            )

            clf_temp.fit(X_train, y_train)
            y_pred_temp = clf_temp.predict(X_test)

            acc = accuracy_score(y_test, y_pred_temp)
            f1_macro = f1_score(y_test, y_pred_temp, average='macro')

            records.append({
                "Criterion": crit,
                "Max Depth": max_depth,
                "Min Samples Split": min_split,
                "Accuracy": acc,
                "F1_macro": f1_macro
            })

results_df = pd.DataFrame(records)


In [28]:
results_sorted = (
    results_df
    .sort_values(by=["F1_macro", "Accuracy"], ascending=False)
    .reset_index(drop=True)
)

results_sorted.head(20)



Unnamed: 0,Criterion,Max Depth,Min Samples Split,Accuracy,F1_macro
0,gini,,20,0.77778,0.405825
1,entropy,,20,0.778894,0.405807
2,log_loss,,20,0.778894,0.405807
3,gini,,2,0.753047,0.405634
4,entropy,,10,0.760714,0.404953
5,log_loss,,10,0.760714,0.404953
6,gini,,10,0.761729,0.404738
7,entropy,,5,0.753158,0.403045
8,log_loss,,5,0.753158,0.403045
9,entropy,,2,0.753952,0.402232


In [29]:
import pandas as pd

best_row = results_sorted.iloc[0]
print("Best hyperparameters from grid:")
print(best_row)

raw_max_depth = best_row["Max Depth"]
if pd.isna(raw_max_depth):
    max_depth_param = None
else:
    max_depth_param = int(raw_max_depth)

best_clf = DecisionTreeClassifier(
    criterion=best_row["Criterion"],
    max_depth=max_depth_param,
    min_samples_split=int(best_row["Min Samples Split"]),
    random_state=42
)

best_clf.fit(X_train, y_train)
y_pred_best = best_clf.predict(X_test)

best_accuracy = accuracy_score(y_test, y_pred_best)
best_precision = precision_score(y_test, y_pred_best, average='macro')
best_recall = recall_score(y_test, y_pred_best, average='macro')
best_f1 = f1_score(y_test, y_pred_best, average='macro')

print("\nTuned Decision Tree Performance:")
print(f"Accuracy:          {best_accuracy:.4f}")
print(f"Precision (macro): {best_precision:.4f}")
print(f"Recall (macro):    {best_recall:.4f}")
print(f"F1 Score (macro):  {best_f1:.4f}")

print("\nClassification Report:\n", classification_report(y_test, y_pred_best))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred_best))


Best hyperparameters from grid:
Criterion                gini
Max Depth                 NaN
Min Samples Split          20
Accuracy              0.77778
F1_macro             0.405825
Name: 0, dtype: object

Tuned Decision Tree Performance:
Accuracy:          0.7778
Precision (macro): 0.4082
Recall (macro):    0.4072
F1 Score (macro):  0.4058

Classification Report:
               precision    recall  f1-score   support

           1       0.32      0.33      0.33     13162
           3       0.86      0.88      0.87     75226
           4       0.04      0.01      0.02      2261

    accuracy                           0.78     90649
   macro avg       0.41      0.41      0.41     90649
weighted avg       0.77      0.78      0.77     90649


Confusion Matrix:
 [[ 4346  8644   172]
 [ 8510 66131   585]
 [  554  1679    28]]


In [30]:
print("Baseline tree (your original clf):")
print(f"  Accuracy: {accuracy:.4f}")
print(f"  F1_macro: {f1:.4f}")

print("\nBest tuned tree (from grid):")
print(f"  Accuracy: {best_accuracy:.4f}")
print(f"  F1_macro: {best_f1:.4f}")


Baseline tree (your original clf):
  Accuracy: 0.7530
  F1_macro: 0.4056

Best tuned tree (from grid):
  Accuracy: 0.7778
  F1_macro: 0.4058
