In [10]:
"""
Feature : catheter_size
Extract catheter size from discharge notes (e.g., "16F", "16 Fr")
"""
import pandas as pd
import os
import re
from config_paths import *
from utils import drop_columns

In [12]:
# Read dataset
df = pd.read_csv(dataset_path)
initial_row_count = len(df)

# Load discharge notes
discharge = pd.read_csv(
    os.path.join(note_path, "discharge.csv"),
    usecols=["subject_id", "hadm_id", "text"]
)

# Regex to extract catheter size
size_pattern = re.compile(
    r"\b(\d{1,2})\s*(f|fr|fr\.|french)\b",
    flags=re.IGNORECASE
)

def extract_catheter_sizes(text: str):
    """
    Extract catheter sizes from free text as normalized strings like '16 Fr'.
    Returns a list of unique sizes found in the text.
    """
    if not isinstance(text, str):
        return []

    matches = size_pattern.findall(text)
    sizes = []

    for num, unit in matches:
        norm = f"{num} Fr"  # normalize everything to 'NN Fr'
        sizes.append(norm)

    # Return unique sizes, preserving order (not sorted, like notebook)
    seen = set()
    unique_sizes = []
    for s in sizes:
        if s not in seen:
            seen.add(s)
            unique_sizes.append(s)

    return unique_sizes

# Apply extraction to discharge notes
discharge_sizes = discharge.copy()
discharge_sizes["catheter_size_from_notes"] = discharge_sizes["text"].apply(extract_catheter_sizes)

# Aggregate per (subject_id, hadm_id)
def aggregate_sizes(size_lists):
    combined = []
    for lst in size_lists:
        if isinstance(lst, list):
            combined.extend(lst)
    # de-duplicate, preserve order (not sorted, like notebook)
    seen = set()
    result = []
    for s in combined:
        if s not in seen:
            seen.add(s)
            result.append(s)
    return result

catheter_size_adm = (
    discharge_sizes
    .groupby(["subject_id", "hadm_id"], as_index=False)["catheter_size_from_notes"]
    .apply(aggregate_sizes)
)

# Merge
df = df.merge(catheter_size_adm, on=["subject_id", "hadm_id"], how="left")

# Keep as list (like notebook), not convert to string
df["catheter_size_from_notes"] = df["catheter_size_from_notes"].apply(lambda x: x if isinstance(x, list) else [])

# Save
df.to_csv(dataset_path, index=False)



In [13]:
df.columns

Index(['subject_id', 'hadm_id', 'cauti_type', 'cauti_icd_codes',
       'other_uti_icd_codes', 'remaining_icd_codes', 'gender', 'anchor_age',
       'admittime', 'dischtime', 'admission_type', 'admission_location',
       'discharge_location', 'race', 'catheter_procedures',
       'catheter_procedures_ir', 'catheter_used_in_procedures_ir',
       'catheter_insertion_date_ir', 'catheter_removal_date_ir',
       'catheter_outputevents', 'catheter_used_in_output_events',
       'datetimeevents', 'catheter_used_in_datetime_events',
       'insertion_date_from_datetimeevents',
       'removal_date_from_datetimeevents', 'catheter_procedure_events',
       'catheter_used_in_procedure_events', 'catheter_size_from_chartevents',
       'index', 'catheter_clinical_notes', 'catheter_used_in_clinical_notes',
       'catheter_present', 'final_cauti_flag', 'final_insertion_date',
       'final_removal_date', 'catheter_duration_days', 'BMI_in_admission',
       'BMI_computed', 'BMI_last_year', 'BMI', 

In [14]:
df["catheter_size_from_chartevents"].value_counts()

catheter_size_from_chartevents
16 French         26796
14 French         16490
18 French          1130
Coude Catheter      546
Not applicable      297
20 French           208
22 French           145
Name: count, dtype: int64

In [15]:
df["catheter_size_from_notes"].value_counts()

catheter_size_from_notes
[]                          147856
[5 Fr]                        4171
[4 Fr]                        1177
[8 Fr]                         589
[6 Fr]                         543
                             ...  
[1 Fr, 5 Fr, 7 Fr]               1
[6 Fr, 4 Fr, 3 Fr]               1
[5 Fr, 8 Fr, 7 Fr]               1
[0 Fr, 8 Fr, 4 Fr]               1
[4 Fr, 3 Fr, 6 Fr, 7 Fr]         1
Name: count, Length: 261, dtype: int64

In [20]:
# -----------------------------------------
# Helpers
# -----------------------------------------
def normalize_chartevents_size(x):
    """
    Normalize chartevents catheter size to 'NN French'
    Returns None if unusable
    """
    if not isinstance(x, str):
        return None

    x = x.strip().lower()

    # Ignore non-size values
    if x in {"not applicable", "coude catheter"}:
        return None

    m = re.search(r"(\d{1,2})\s*french", x)
    if m:
        return f"{int(m.group(1))} French"

    return None


def normalize_notes_sizes(size_list):
    """
    Convert ['16 Fr', '14 Fr'] -> [16, 14]
    """
    if not isinstance(size_list, list) or len(size_list) == 0:
        return []

    sizes = []
    for s in size_list:
        if isinstance(s, str):
            m = re.search(r"(\d{1,2})\s*fr", s.lower())
            if m:
                sizes.append(int(m.group(1)))
    return sizes


# -----------------------------------------
# Final catheter_size logic
# -----------------------------------------
def derive_final_catheter_size(row):
    """
    Priority:
    1. catheter_size_from_chartevents
    2. max(catheter_size_from_notes)
    """

    # ---- Priority 1: Chartevents ----
    ce_size = normalize_chartevents_size(
        row.get("catheter_size_from_chartevents")
    )
    if ce_size is not None:
        return ce_size

    # ---- Priority 2: Notes (max size) ----
    note_sizes = normalize_notes_sizes(
        row.get("catheter_size_from_notes")
    )
    if note_sizes:
        return f"{max(note_sizes)} French"

    return None
# -----------------------------------------
# Apply
# -----------------------------------------
df["catheter_size"] = df.apply(derive_final_catheter_size, axis=1)

# Save
df.to_csv(dataset_path, index=False)

print("✓ Feature 'catheter_size' created with correct priority & normalization")
print(df["catheter_size"].value_counts(dropna=False).head(10))


✓ Feature 'catheter_size' created with correct priority & normalization
catheter_size
None         105328
16 French     26815
14 French     16510
5 French       3493
18 French      1139
4 French       1074
8 French        561
6 French        493
9 French        448
7 French        438
Name: count, dtype: int64


In [26]:
print(f"\nTotal rows: {len(df)}")

rows_with_size = df["catheter_size"].notna().sum()
rows_without_size = df["catheter_size"].isna().sum()

print(f"Rows with catheter size: {rows_with_size}")
print(f"Rows without catheter size: {rows_without_size}")

print(f"Expected total: {initial_row_count}")

if len(df) == initial_row_count:
    print(f"✓ Row count matches expected total ({initial_row_count})")
else:
    print(
        f"⚠ Row count mismatch! Expected {initial_row_count}, "
        f"got {len(df)} (difference: {len(df) - initial_row_count})"
    )

print(f"Dataset shape: {df.shape}")



Total rows: 158020
Rows with catheter size: 52692
Rows without catheter size: 105328
Expected total: 158020
✓ Row count matches expected total (158020)
Dataset shape: (158020, 59)


In [28]:
# cols_to_drop = [
#  "catheter_size",
#  "catheter_size_from_notes"
# ]
# df = drop_columns(df, cols_to_drop)
# df.to_csv(dataset_path, index=False)