In [2]:
import pandas as pd
from sklearn.dummy import DummyClassifier
from sklearn.metrics import (
    accuracy_score, balanced_accuracy_score,
    precision_score, recall_score, f1_score, roc_auc_score
)
from sklearn.model_selection import train_test_split

In [3]:
# Two CSV variants of the diabetes data, evaluated side by side further down.
# NOTE(review): "NPP"/"PP" presumably mean not-preprocessed / preprocessed and
# the "_DONE" file is presumably the preprocessed one - confirm with the author.
npp_data = pd.read_csv(filepath_or_buffer="diabetes_dataset.csv")
pp_data = pd.read_csv(filepath_or_buffer="diabetes_dataset_DONE.csv")

In [6]:
# Name of the target column handed to run_simple_dummy for every dataset.
LABEL = "diagnosed_diabetes"

def run_simple_dummy(df, label, name, test_size=0.3, random_state=42):
    """Score a majority-class ``DummyClassifier`` on a stratified hold-out split.

    ``df`` is split into train and test parts (stratified on ``label``), a
    ``DummyClassifier(strategy="most_frequent")`` is fitted on the training
    part, and its predictions on the test part are scored.

    Parameters
    ----------
    df : pandas.DataFrame
        Data that contains the ``label`` column. Every other column is handed
        to the classifier as a feature.
    label : str
        Name of the target column in ``df``.
    name : str
        Identifier stored under the ``"dataset"`` key of the result.
    test_size : float or int, default 0.3
        Forwarded unchanged to ``train_test_split``.
    random_state : int or None, default 42
        Forwarded unchanged to ``train_test_split``.

    Returns
    -------
    dict
        One result row with the keys ``dataset``, ``strategy``, ``accuracy``,
        ``balanced_accuracy``, ``precision``, ``recall``, ``f1`` and
        ``roc_auc``. Precision, recall and F1 are computed with the metric
        functions' default settings and ``zero_division=0``. ``roc_auc`` is
        NaN when it cannot be computed because the fitted model or the test
        labels do not contain exactly two classes.

    Raises
    ------
    KeyError
        If ``label`` is not a column of ``df``.
    """
    if label not in df.columns:
        raise KeyError(f"label column {label!r} not found in dataset {name!r}")

    train, test = train_test_split(
        df,
        test_size=test_size,
        random_state=random_state,
        stratify=df[label]
    )

    X_train = train.drop(columns=[label])
    y_train = train[label]
    X_test  = test.drop(columns=[label])
    y_test  = test[label]

    dummy = DummyClassifier(strategy="most_frequent")
    dummy.fit(X_train, y_train)

    y_pred = dummy.predict(X_test)

    # predict_proba returns one column per class seen during fit, so blindly
    # taking column 1 fails when the training labels hold a single class.
    # ROC AUC also needs both classes in y_test; report NaN in either case
    # instead of aborting the whole baseline run.
    proba = dummy.predict_proba(X_test)
    if proba.shape[1] == 2 and y_test.nunique() == 2:
        roc_auc = roc_auc_score(y_test, proba[:, 1])
    else:
        roc_auc = float("nan")

    return {
        "dataset": name,
        # Holds the model name (not the DummyClassifier strategy); kept as-is
        # so existing result tables keep the same values.
        "strategy": "DummyClassifier",
        "accuracy": accuracy_score(y_test, y_pred),
        "balanced_accuracy": balanced_accuracy_score(y_test, y_pred),
        "precision": precision_score(y_test, y_pred, zero_division=0),
        "recall": recall_score(y_test, y_pred, zero_division=0),
        "f1": f1_score(y_test, y_pred, zero_division=0),
        "roc_auc": roc_auc
    }

# Run the dummy baseline once per dataset variant; each call yields one row.
results = [
    run_simple_dummy(frame, LABEL, tag)
    for tag, frame in (("NPP", npp_data), ("PP", pp_data))
]

# Collect the per-dataset rows into a single table and show it.
baseline_df = pd.DataFrame(results)
print(baseline_df)

# =========================
# 4. Save to CSV
# =========================
baseline_df.to_csv(path_or_buf="baseline_dummy_simple.csv", index=False)

  dataset         strategy  accuracy  balanced_accuracy  precision  recall  \
0     NPP  DummyClassifier  0.599967                0.5   0.599967     1.0   
1      PP  DummyClassifier  0.596135                0.5   0.596135     1.0   

         f1  roc_auc  
0  0.749974      0.5  
1  0.746973      0.5  
