In [19]:
import ast

import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer

# Location of the input CSV.
# NOTE(review): absolute, machine-specific path — the notebook only runs
# as-is on a machine that has this exact directory layout.
file_path = r"C:\Users\Coditas\Desktop\Projects\Cauti\silver\bronze_binary_encoded.csv"

# Read the whole table into memory, then preview the first five rows as the
# cell's displayed output.
df = pd.read_csv(filepath_or_buffer=file_path)
df.head(n=5)

Unnamed: 0,subject_id,hadm_id,admittime,dischtime,gender,anchor_age,admission_type,admission_location,discharge_location,race,...,age_group,nitrite_tested,nitrite_positive,blood_crp_measured,cfu_count_measured,urine_output_measured,catheter_duration_measured,catheter_size_known,catheter_size_fr,catheter_size_measured
0,10000032,22595853,2180-05-06 22:23:00,2180-05-07 17:15:00,F,52,URGENT,TRANSFER FROM HOSPITAL,HOME,WHITE,...,middle_age,1,0,0,0,0,0,0,,0
1,10000032,29079034,2180-07-23 12:35:00,2180-07-25 17:55:00,F,52,EW EMER.,EMERGENCY ROOM,HOME,WHITE,...,middle_age,0,0,0,0,0,0,0,,0
2,10000032,25742920,2180-08-05 23:44:00,2180-08-07 17:50:00,F,52,EW EMER.,EMERGENCY ROOM,HOSPICE,WHITE,...,middle_age,0,0,0,0,0,0,0,,0
3,10000560,28979390,2189-10-15 10:30:00,2189-10-17 15:00:00,F,53,SURGICAL SAME DAY ADMISSION,PHYSICIAN REFERRAL,HOME,WHITE,...,middle_age,0,0,0,0,0,0,1,6.0,1
4,10000690,25860671,2150-11-02 18:02:00,2150-11-12 13:45:00,F,86,EW EMER.,EMERGENCY ROOM,REHAB,WHITE,...,elderly,1,0,0,0,0,0,0,,0


In [20]:
# Recompute the "measured" flag straight from the size column so the two can
# never disagree: 1 where catheter_size_fr has a value, 0 where it is missing.
df["catheter_size_measured"] = df["catheter_size_fr"].notna().astype(int)

# Drop columns that should not reach the encoded output.
# errors="ignore" keeps this cell re-runnable and tolerant of inputs in which
# some of these columns are already absent.
df = df.drop(
    columns=[
        # Leftover row-index column picked up by read_csv (it is the first
        # column of the loaded frame). If kept, it would be written to the
        # encoded file as a meaningless "unnamed_0" feature.
        "Unnamed: 0",
        "catheter_size_known",
        "age_group",
        "catheter_size_from_notes",
        "race",
        "admittime",
        "dischtime",
    ],
    errors="ignore",
)


In [21]:
# ----------------------------
# 1. Normal categorical columns
# ----------------------------
# Single-valued categorical columns; each one is expanded into 0/1 indicator
# columns named "<column>_<category>".
simple_cat_cols = ["admission_type", "admission_location", "discharge_location", "mobility_status"]
dummy_prefixes = [name.lower() for name in simple_cat_cols]

df = pd.get_dummies(df, columns=simple_cat_cols, prefix=dummy_prefixes, prefix_sep="_", dtype=int)

# Normalise every column name (not only the new dummies) in three passes:
# lowercase, spaces -> underscores, then strip anything that is not a letter,
# digit or underscore.
normalized_names = df.columns.str.lower()
normalized_names = normalized_names.str.replace(" ", "_")
normalized_names = normalized_names.str.replace("[^0-9a-zA-Z_]", "", regex=True)
df.columns = normalized_names


In [22]:
# ----------------------------
# 2. Multi-label column: catheter_type
# ----------------------------

# Parse the stringified lists into real Python objects.
# ast.literal_eval only accepts Python literals, so - unlike eval() - text
# stored in the CSV can never be executed as code.
df['catheter_type'] = df['catheter_type'].apply(
    lambda x: ast.literal_eval(x) if isinstance(x, str) else x
)

# Missing cells (float NaN after read_csv) become an empty label set;
# MultiLabelBinarizer needs an iterable for every row and would otherwise
# fail on the first NaN.
df['catheter_type'] = df['catheter_type'].apply(
    lambda x: [] if isinstance(x, float) and pd.isna(x) else x
)

mlb = MultiLabelBinarizer()

# One 0/1 column per distinct label found across all rows.
cat_matrix = mlb.fit_transform(df['catheter_type'])
cat_labels = mlb.classes_

# Build column names in snake_case
cat_cols = [f"catheter_type_{c.lower().replace(' ', '_')}" for c in cat_labels]

# Reuse df's index so the axis=1 concat below lines rows up by position;
# with a fresh RangeIndex, any non-default index on df would misalign rows
# and introduce NaN-filled ones.
cat_df = pd.DataFrame(cat_matrix, columns=cat_cols, index=df.index)

# Swap the original list column for its indicator columns.
df = pd.concat([df.drop(columns=['catheter_type']), cat_df], axis=1)

In [25]:
from sklearn.preprocessing import MultiLabelBinarizer
import pandas as pd

# Parse the stringified lists into real Python lists.
# ast.literal_eval only accepts Python literals, so - unlike eval() - text
# stored in the CSV can never be executed as code.
df["other_uti"] = df["other_uti"].apply(
    lambda x: ast.literal_eval(x) if isinstance(x, str) else x
)

# Anything that is not a list at this point (NaN, or a value that parsed to
# some other type) is treated as "no labels".
df["other_uti"] = df["other_uti"].apply(
    lambda x: x if isinstance(x, list) else []
)

mlb = MultiLabelBinarizer()

# One 0/1 column per distinct label, named other_uti_<label> in snake_case.
uti_matrix = mlb.fit_transform(df["other_uti"])
uti_cols = [f"other_uti_{c.lower().replace(' ', '_')}" for c in mlb.classes_]

# Keep df's index so the axis=1 concat aligns rows one-to-one.
uti_df = pd.DataFrame(uti_matrix, columns=uti_cols, index=df.index)

# Swap the original list column for its indicator columns.
df = pd.concat([df.drop(columns=["other_uti"]), uti_df], axis=1)


In [27]:
# Binary-encode gender: male -> 1, female -> 0.
# Values are trimmed and upper-cased first so short and long spellings both
# match; any value missing from the mapping comes out of .map() as NaN.
gender_codes = {
    'M': 1,
    'MALE': 1,
    'F': 0,
    'FEMALE': 0
}
gender_normalized = df['gender'].str.strip().str.upper()
df['gender'] = gender_normalized.map(gender_codes)

In [28]:
# Destination for the encoded table.
# NOTE(review): absolute, machine-specific path — confirm before running on
# another machine.
save_path = r"C:\Users\Coditas\Desktop\Projects\Cauti\silver\bronze_one_hot_encoded.csv"

# index=False keeps the DataFrame's row index out of the written file.
df.to_csv(path_or_buf=save_path, index=False)

print("Filtered dataset saved at:", save_path)

Filtered dataset saved at: C:\Users\Coditas\Desktop\Projects\Cauti\silver\bronze_one_hot_encoded.csv
