In [16]:
import pandas as pd
import os
from config_paths import *
from utils import drop_columns

In [18]:
# -----------------------------------------------------
# Step 1: load the main dataset
# -----------------------------------------------------
# dataset_path comes from the config_paths star import at the top of the
# notebook; the whole CSV is read into a single DataFrame named df.
df = pd.read_csv(filepath_or_buffer=dataset_path)

# Report how many rows were read so a truncated or empty file is noticed early.
print(f"Total rows loaded: {len(df)}")

Total rows loaded: 158020


In [19]:
# =====================================================
# 2. Normalize catheter_present (CRITICAL FIX)
# =====================================================
# The raw column can hold mixed representations: True/False, 1/0,
# "True"/"False" and NaN. Each cell is coerced explicitly so the result is a
# plain bool column regardless of how the CSV was parsed.
def _catheter_flag(value):
    """Coerce one raw ``catheter_present`` cell to a plain ``bool``.

    Accepted inputs:
      * missing values (NaN / None / pd.NA)        -> False
      * booleans and numbers (truncated to int)      -> bool(int(value))
      * the words "true" / "false", any letter case,
        with optional surrounding whitespace         -> True / False
      * numeric strings such as "1", "0", "1.0"      -> bool(int(float(text)))

    Raises:
        ValueError: if ``value`` is a string that is neither a true/false
            word nor a finite number.
    """
    if pd.isna(value):
        return False
    if isinstance(value, str):
        token = value.strip().lower()
        if token in ("true", "false"):
            return token == "true"
        try:
            # Go through float first so "1.0" is accepted; int() then truncates
            # exactly as the numeric path below does.
            return bool(int(float(token)))
        except (ValueError, OverflowError):
            raise ValueError(
                f"Unrecognized catheter_present value: {value!r}"
            ) from None
    # Booleans, Python/numpy ints and floats: truncate toward zero, then test
    # for non-zero.
    return bool(int(value))


# map() calls the helper on every cell, including NaN; the final astype(bool)
# guarantees a bool dtype even when the column is empty.
df["catheter_present"] = df["catheter_present"].map(_catheter_flag).astype(bool)

# Optional sanity check
print("\nCatheter_present distribution:")
print(df["catheter_present"].value_counts())

# =====================================================
# 3. Derive the binary target column y from cauti_type
# =====================================================
# Rule 1 (active): a row is a positive (y = 1) when its cauti_type is one of
# the definite CAUTI categories. Every other row, including rows with a
# missing cauti_type, stays at the default y = 0 (non-CAUTI).
df["y"] = df["cauti_type"].isin(["CAUTI", "CAUTI+OtherUTI"]).astype("int64")

# Rule 2 (disabled): Probable CAUTI with a catheter present would also be
# labeled positive. Kept commented out for reference; it is not applied.
# df.loc[
#     (df["cauti_type"] == "Probable_CAUTI") &
#     (df["catheter_present"]),
#     "y"
# ] = 1

# =====================================================
# 4. Validation (MANDATORY)
# =====================================================
# First the overall class balance of y ...
print("\nLabel Distribution:")
print(df["y"].value_counts())

# ... then y broken down by cauti_type alone and by (cauti_type,
# catheter_present), each with row/column totals, so the labeling rules can
# be checked against the source categories.
for heading, row_keys in (
    ("\nCross-check by cauti_type:", df["cauti_type"]),
    (
        "\nCross-check by cauti_type & catheter_present:",
        [df["cauti_type"], df["catheter_present"]],
    ),
):
    print(heading)
    print(pd.crosstab(row_keys, df["y"], margins=True))

# =====================================================
# 5. Write the labeled frame to disk
# =====================================================
# NOTE: the target is dataset_path, the same name the data was read from in
# step 1, so the source CSV is replaced by the labeled version (row index
# not written).
df.to_csv(path_or_buf=dataset_path, index=False)

print(f"\n✅ Labeled dataset saved at:\n{dataset_path}")



Catheter_present distribution:
catheter_present
True     138524
False     19496
Name: count, dtype: int64

Label Distribution:
y
0    156185
1      1835
Name: count, dtype: int64

Cross-check by cauti_type:
y                    0     1     All
cauti_type                          
CAUTI                0   192     192
CAUTI+OtherUTI       0  1643    1643
No_CAUTI        117182     0  117182
Probable_CAUTI   39003     0   39003
All             156185  1835  158020

Cross-check by cauti_type & catheter_present:
y                                     0     1     All
cauti_type     catheter_present                      
CAUTI          True                   0   192     192
CAUTI+OtherUTI True                   0  1643    1643
No_CAUTI       True              117182     0  117182
Probable_CAUTI False              19496     0   19496
               True               19507     0   19507
All                              156185  1835  158020

✅ Labeled dataset saved at:
C:\Users\Coditas\Desktop\

In [20]:
# Ensure Probable_CAUTI without catheter is always y=0.
# The category is spelled "Probable_CAUTI" in cauti_type; comparing against
# any other casing matches no rows and lets the assertion pass without
# checking anything.
assert not (
    (df["cauti_type"] == "Probable_CAUTI") &
    df["catheter_present"].eq(False) &
    (df["y"] == 1)
).any(), "Logic error: Probable_CAUTI without catheter labeled as y=1"


In [21]:
# Disabled cleanup step, kept for reference only (nothing in this cell runs).
# If re-enabled it would pass the 'y' column name to drop_columns (imported
# from utils; presumably removes the listed columns, confirm against the
# helper) and write the result back to dataset_path.
# cols_to_drop = [
#  'y'
# ]
# df = drop_columns(df, cols_to_drop)
# df.to_csv(dataset_path, index=False)