In [1]:
"""
Feature: oliguria
Boolean flag: True if urine_output_ml <= 500 (threshold is inclusive, matching the reference notebook's output)
"""
import pandas as pd
import os
from config_paths import *
from utils import drop_columns

In [2]:
# Oliguria feature
# ----------------
# Adds a boolean column "oliguria" that is True when urine_output_ml is at or
# below the threshold, then writes the dataset back to dataset_path.
#
# Why the comparison is inclusive (<=) rather than strict (<):
#   an earlier comparison against the reference notebook recorded
#   True=1115 / False=18395 for the notebook versus True=1065 / False=18445
#   for a strict "< 500". The gap of 50 equalled the number of rows whose value
#   was exactly 500, which suggests the notebook treats 500 as oliguria.
#   (The reference notebook names the column total_urine_output; in this
#   dataset it is stored as urine_output_ml.)
#
# Missing values: NaN <= threshold evaluates to False, so rows without a urine
# output value are flagged False rather than left missing.
OLIGURIA_THRESHOLD_ML = 500  # inclusive upper bound for the flag
DIAGNOSTIC_MARGIN_ML = 5  # half-width of the band inspected around the threshold

# Read dataset
df = pd.read_csv(dataset_path)
initial_row_count = len(df)

urine_output = df["urine_output_ml"]
band_low = OLIGURIA_THRESHOLD_ML - DIAGNOSTIC_MARGIN_ML
band_high = OLIGURIA_THRESHOLD_ML + DIAGNOSTIC_MARGIN_ML

# Diagnostic: check the dtype and how many values sit on or near the threshold,
# since those rows are the ones affected by the < vs <= choice.
print("\nBefore oliguria calculation:")
print(f"urine_output_ml data type: {urine_output.dtype}")
print(f"Values exactly equal to {OLIGURIA_THRESHOLD_ML}: {(urine_output == OLIGURIA_THRESHOLD_ML).sum()}")
print(f"Values between {band_low} and {band_high}: {((urine_output >= band_low) & (urine_output <= band_high)).sum()}")
print(f"NaN count: {urine_output.isna().sum()}")

# Inclusive comparison; NaN rows become False (see the note at the top).
df["oliguria"] = urine_output <= OLIGURIA_THRESHOLD_ML

# Save (overwrites the dataset at dataset_path with the new column included)
df.to_csv(dataset_path, index=False)

# Diagnostic output: compute each count once and reuse it in the report.
oliguria_true_count = df["oliguria"].sum()
oliguria_false_count = (~df["oliguria"]).sum()
final_row_count = len(df)

print("\nFeature 'oliguria' added.")
print(f"Value counts:\n{df['oliguria'].value_counts()}")
print(f"Total rows: {final_row_count}")
print(f"True count: {oliguria_true_count}")
print(f"False count: {oliguria_false_count}")
print(f"Sum check (True + False): {oliguria_true_count + oliguria_false_count}")
print(f"Expected total: {initial_row_count}")
if final_row_count == initial_row_count:
    print(f"✓ Row count matches expected total ({initial_row_count})")
else:
    print(f"⚠ Row count mismatch! Expected {initial_row_count}, got {final_row_count} (difference: {final_row_count - initial_row_count})")
print(f"Dataset shape: {df.shape}")


Before oliguria calculation:
urine_output_ml data type: float64
Values exactly equal to 500: 597
Values between 495 and 505: 660
NaN count: 137052

Feature 'oliguria' added.
Value counts:
oliguria
False    147305
True      10715
Name: count, dtype: int64
Total rows: 158020
True count: 10715
False count: 147305
Sum check (True + False): 158020
Expected total: 158020
✓ Row count matches expected total (158020)
Dataset shape: (158020, 86)


In [3]:
# Show the column labels of df as this cell's output.
df.columns

Index(['subject_id', 'hadm_id', 'cauti_type', 'cauti_icd_codes',
       'other_uti_icd_codes', 'remaining_icd_codes', 'gender', 'anchor_age',
       'admittime', 'dischtime', 'admission_type', 'admission_location',
       'discharge_location', 'race', 'catheter_procedures',
       'catheter_procedures_ir', 'catheter_used_in_procedures_ir',
       'catheter_insertion_date_ir', 'catheter_removal_date_ir',
       'catheter_outputevents', 'catheter_used_in_output_events',
       'datetimeevents', 'catheter_used_in_datetime_events',
       'insertion_date_from_datetimeevents',
       'removal_date_from_datetimeevents', 'catheter_procedure_events',
       'catheter_used_in_procedure_events', 'catheter_size_from_chartevents',
       'index', 'catheter_clinical_notes', 'catheter_used_in_clinical_notes',
       'catheter_present', 'final_cauti_flag', 'final_insertion_date',
       'final_removal_date', 'catheter_duration_days', 'BMI_in_admission',
       'BMI_computed', 'BMI_last_year', 'BMI', 

In [4]:
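# Rollback (intentionally left disabled): uncomment to pass "oliguria" to
# drop_columns and re-save the dataset to dataset_path.
# cols_to_drop = [
#  "oliguria"
# ]
# df = drop_columns(df, cols_to_drop)
# df.to_csv(dataset_path, index=False)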