In [1]:
"""
Feature: steroids_per_admission
Boolean flag indicating if steroids were given during admission
"""
import pandas as pd
import os
import re
from config_paths import *
from utils import drop_columns

In [2]:
# Read dataset
df = pd.read_csv(dataset_path)
initial_row_count = len(df)
print(f"Initial dataset shape: {df.shape}")

# Check for duplicates in dataset (shouldn't happen, but verify)
duplicates = df.duplicated(subset=["subject_id", "hadm_id"]).sum()
if duplicates > 0:
    print(f"Warning: {duplicates} duplicate (subject_id, hadm_id) pairs found in dataset")

# Load prescriptions
prescriptions = pd.read_csv(
    os.path.join(hosp_path, "prescriptions.csv"),
    usecols=["subject_id", "hadm_id", "drug"]
)

# Steroid pattern
steroid_pattern = r"(?:pred|cort|methason|dexam|betameth|beclometh|budeson|momet|triamcinol|fludrocort)"

def drug_flag_regex(df, pattern):
    return df["drug"].str.contains(pattern, case=False, na=False, regex=True)

# Build steroids flag per admission
steroids_flag_df = (
    prescriptions.assign(
        steroids=lambda x: drug_flag_regex(x, steroid_pattern)
    )
    .groupby(["subject_id", "hadm_id"])["steroids"]
    .max()
    .reset_index()
)

# Merge
# Check for duplicates in steroids_flag_df (shouldn't happen after groupby, but verify)
duplicates_flag = steroids_flag_df.duplicated(subset=["subject_id", "hadm_id"]).sum()
if duplicates_flag > 0:
    print(f"Warning: {duplicates_flag} duplicate (subject_id, hadm_id) pairs found in steroids_flag_df")
    # If duplicates exist, take max (shouldn't be needed, but safety check)
    steroids_flag_df = steroids_flag_df.groupby(["subject_id", "hadm_id"])["steroids"].max().reset_index()

df = df.merge(steroids_flag_df, on=["subject_id", "hadm_id"], how="left")

# Verify row count is preserved (left merge should keep all rows from df)
if len(df) != initial_row_count:
    print(f"ERROR: Row count changed during merge! Initial: {initial_row_count}, After merge: {len(df)}")
    print(f"Difference: {len(df) - initial_row_count} rows")

# Verify no new duplicates were created
if len(df) != len(df.drop_duplicates(subset=["subject_id", "hadm_id"])):
    print(f"Warning: Merge created duplicate rows. Before: {len(df.drop_duplicates(subset=['subject_id', 'hadm_id']))}, After: {len(df)}")

# Fill missing with False and convert to bool
# Use fillna first, then convert to bool to avoid nullable boolean issues
df["steroids_per_admission"] = df["steroids"].fillna(False).astype(bool)

# Drop the temporary steroids column
df = df.drop(columns=["steroids"])

# Save
df.to_csv(dataset_path, index=False)
print(f"\nFeature 'steroids_per_admission' added.")
print(f"Value counts:\n{df['steroids_per_admission'].value_counts()}")
print(f"Total rows: {len(df)}")
print(f"True count: {df['steroids_per_admission'].sum()}")
print(f"False count: {(~df['steroids_per_admission']).sum()}")
print(f"Sum check (True + False): {df['steroids_per_admission'].sum() + (~df['steroids_per_admission']).sum()}")
print(f"Expected total: {initial_row_count}")
if len(df) == initial_row_count:
    print(f"✓ Row count matches expected total ({initial_row_count})")
else:
    print(f"⚠ Row count mismatch! Expected {initial_row_count}, got {len(df)} (difference: {len(df) - initial_row_count})")
print(f"Dataset shape: {df.shape}")



Initial dataset shape: (158020, 73)


  df["steroids_per_admission"] = df["steroids"].fillna(False).astype(bool)



Feature 'steroids_per_admission' added.
Value counts:
steroids_per_admission
False    123553
True      34467
Name: count, dtype: int64
Total rows: 158020
True count: 34467
False count: 123553
Sum check (True + False): 158020
Expected total: 158020
✓ Row count matches expected total (158020)
Dataset shape: (158020, 74)


In [3]:
df.columns

Index(['subject_id', 'hadm_id', 'cauti_type', 'cauti_icd_codes',
       'other_uti_icd_codes', 'remaining_icd_codes', 'gender', 'anchor_age',
       'admittime', 'dischtime', 'admission_type', 'admission_location',
       'discharge_location', 'race', 'catheter_procedures',
       'catheter_procedures_ir', 'catheter_used_in_procedures_ir',
       'catheter_insertion_date_ir', 'catheter_removal_date_ir',
       'catheter_outputevents', 'catheter_used_in_output_events',
       'datetimeevents', 'catheter_used_in_datetime_events',
       'insertion_date_from_datetimeevents',
       'removal_date_from_datetimeevents', 'catheter_procedure_events',
       'catheter_used_in_procedure_events', 'catheter_size_from_chartevents',
       'index', 'catheter_clinical_notes', 'catheter_used_in_clinical_notes',
       'catheter_present', 'final_cauti_flag', 'final_insertion_date',
       'final_removal_date', 'catheter_duration_days', 'BMI_in_admission',
       'BMI_computed', 'BMI_last_year', 'BMI', 

In [4]:
# cols_to_drop = [
#  "steroids_per_admission"
# ]
# df = drop_columns(df, cols_to_drop)
# df.to_csv(dataset_path, index=False)