In [1]:
"""
Feature: mobility_status
Extract from discharge notes using regex patterns
"""
import pandas as pd
import os
import re
from config_paths import *
from utils import drop_columns

In [2]:
# Load the working dataset and remember its size so the row count can be
# compared again after the feature has been merged in
df = pd.read_csv(dataset_path)
initial_row_count = df.shape[0]

# Discharge notes: only the identifier columns and the free text are needed
discharge_csv_path = os.path.join(note_path, "discharge.csv")
discharge = pd.read_csv(discharge_csv_path, usecols=["subject_id", "hadm_id", "text"])

def extract_mobility_status(text: str) -> str:
    """Return a broad mobility category for a single discharge note.

    If the note contains an explicit "Activity Status:" entry, only the first
    clause of that entry (up to the first '.', ';' or newline) is classified;
    otherwise the whole note is searched. Categories are tested in a fixed
    priority order and the first one that matches wins.

    Parameters
    ----------
    text : str
        Raw note text. Missing values (None / NaN) are accepted.

    Returns
    -------
    str
        One of "Non-ambulatory / Paralysis", "Bedbound", "Wheelchair",
        "Out of Bed with Assistance", "Assisted Ambulatory",
        "Independent Ambulatory" or "Unknown".
    """
    if pd.isna(text):
        return "Unknown"

    s = str(text)

    # Prefer an explicit "Activity Status:" line if present
    m = re.search(r"Activity Status:\s*(.+)", s, flags=re.I)
    if m:
        # Keep only the first clause so trailing sentences do not leak in
        line = re.split(r"[.\n;]", m.group(1).strip())[0].strip()
    else:
        line = s

    s_lower = line.lower()

    # Non-ambulatory / Paralysis
    # "non-ambulatory" has to be caught here: it contains "ambulatory" and
    # would otherwise fall through to "Independent Ambulatory".
    if re.search(
        r"paraplegic|quadriplegic|paralyzed|spinal\s+injury|dense\s+paralysis|"
        r"\bnon[-\s]?ambulat",
        s_lower,
    ):
        return "Non-ambulatory / Paralysis"

    # Bedbound / bed rest
    if re.search(r"bed\s*bound|bed\s+rest|in\s+bed", s_lower):
        return "Bedbound"

    # Wheelchair
    if re.search(r"wheel\s*chair|wheel-chair", s_lower):
        return "Wheelchair"

    # Out of Bed with Assistance
    if re.search(r"out\s+of\s+bed|oobtc|to\s+chair|hoyer|lift\s+to", s_lower):
        return "Out of Bed with Assistance"

    # Assisted Ambulatory: a walking term together with an aid / assistance term.
    # "rw" (rolling walker) is matched as a whole word only; as a bare substring
    # it also hits words such as "underwent", "forward" or "otherwise".
    if re.search(r"ambulat|walk", s_lower) and re.search(
        r"assist|supervision|walker|cane|crutch|\brw\b|brace|gait\s+belt",
        s_lower,
    ):
        return "Assisted Ambulatory"

    # Independent Ambulatory: a walking term with no aid, wheelchair or bed term
    if re.search(r"ambulatory|ambulating|walks?\b", s_lower) and not re.search(
        r"assist|supervision|walker|cane|crutch|wheel|bed",
        s_lower,
    ):
        return "Independent Ambulatory"

    return "Unknown"

# Classify every note individually
discharge["mobility_status_note"] = discharge["text"].apply(extract_mobility_status)

# One value per admission: the first classified (non-"Unknown") note of each
# hadm_id. Admissions without any classified note are simply absent here and
# receive "Unknown" from the fillna after the merge. Notes lacking an hadm_id
# are dropped so that NaN keys can never join to rows of df.
mobility_by_hadm = (
    discharge.loc[
        discharge["mobility_status_note"] != "Unknown",
        ["hadm_id", "mobility_status_note"],
    ]
    .dropna(subset=["hadm_id"])
    .drop_duplicates(subset="hadm_id", keep="first")
    .rename(columns={"mobility_status_note": "mobility_status"})
)

# dataset_path is overwritten below, so on a re-run df already carries the
# feature. Drop it first; otherwise the merge produces mobility_status_x/_y
# and the fillna below fails with a KeyError.
df = df.drop(columns=["mobility_status"], errors="ignore")

# Merge into main df; validate makes pandas raise if the right side ever has
# duplicate hadm_id values, which would silently multiply rows
df = df.merge(mobility_by_hadm, on="hadm_id", how="left", validate="many_to_one")

# Fill missing values
df["mobility_status"] = df["mobility_status"].fillna("Unknown")

# Save
df.to_csv(dataset_path, index=False)

# Diagnostic output
print(f"\nTotal rows: {len(df)}")
print(f"Expected total: {initial_row_count}")
if len(df) == initial_row_count:
    print(f"✓ Row count matches expected total ({initial_row_count})")
else:
    print(f"⚠ Row count mismatch! Expected {initial_row_count}, got {len(df)} (difference: {len(df) - initial_row_count})")
print(f"Feature 'mobility_status' added. Unique values: {df['mobility_status'].unique()}")
print(f"Dataset shape: {df.shape}")



Total rows: 158020
Expected total: 158020
✓ Row count matches expected total (158020)
Dataset shape: (158020, 56)
Feature 'mobility_status' added. Unique values: ['Independent Ambulatory' 'Unknown' 'Assisted Ambulatory'
 'Out of Bed with Assistance' 'Bedbound' 'Wheelchair'
 'Non-ambulatory / Paralysis']
Dataset shape: (158020, 56)


In [4]:
# Display the column labels of df
df.columns

Index(['subject_id', 'hadm_id', 'cauti_type', 'cauti_icd_codes',
       'other_uti_icd_codes', 'remaining_icd_codes', 'gender', 'anchor_age',
       'admittime', 'dischtime', 'admission_type', 'admission_location',
       'discharge_location', 'race', 'catheter_procedures',
       'catheter_procedures_ir', 'catheter_used_in_procedures_ir',
       'catheter_insertion_date_ir', 'catheter_removal_date_ir',
       'catheter_outputevents', 'catheter_used_in_output_events',
       'datetimeevents', 'catheter_used_in_datetime_events',
       'insertion_date_from_datetimeevents',
       'removal_date_from_datetimeevents', 'catheter_procedure_events',
       'catheter_used_in_procedure_events', 'catheter_size_from_chartevents',
       'index', 'catheter_clinical_notes', 'catheter_used_in_clinical_notes',
       'catheter_present', 'final_cauti_flag', 'final_insertion_date',
       'final_removal_date', 'catheter_duration_days', 'BMI_in_admission',
       'BMI_computed', 'BMI_last_year', 'BMI', 

In [5]:
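# Optional rollback (intentionally left commented out): uncomment to pass
# "mobility_status" to drop_columns and re-save df to dataset_path.
# cols_to_drop = [
#  "mobility_status"
# ]
# df = drop_columns(df, cols_to_drop)
# df.to_csv(dataset_path, index=False)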