In [1]:
"""
Feature: pain_documented
Extract pain documentation from discharge notes
"""
import pandas as pd
import os
import re
from config_paths import *
from utils import drop_columns

In [2]:
# Read dataset
df = pd.read_csv(dataset_path)
initial_row_count = len(df)

# Load discharge notes
discharge = pd.read_csv(
    os.path.join(note_path, "discharge.csv"),
    usecols=["subject_id", "hadm_id", "text"]
)

# Pain keywords
pain_keywords = [
    r"pain", r"discomfort", r"ache", r"soreness", r"tenderness",
    r"cramp|cramps|cramping", r"burning pain", r"sharp pain",
    r"stabbing pain", r"severe pain", r"moderate pain", r"mild pain",
    r"suprapubic pain", r"abdominal pain", r"flank pain", r"pelvic pain",
    r"bladder pain", r"urethral pain", r"dysuria", r"painful urination",
    r"catheter[- ]related discomfort", r"catheter discomfort", r"catheter pain"
]

pain_pattern = "|".join(pain_keywords)

discharge["pain_documented"] = discharge["text"].str.contains(
    pain_pattern, case=False, na=False, regex=True
)

# Aggregate per admission
pain_feature = (
    discharge.groupby(["subject_id", "hadm_id"])["pain_documented"]
    .any()
    .reset_index()
)

# Merge
df = df.merge(pain_feature, on=["subject_id", "hadm_id"], how="left")

# Fill missing with False
df["pain_documented"] = df["pain_documented"].fillna(False).astype(bool)

# Save
df.to_csv(dataset_path, index=False)

# Diagnostic output
print(f"\nTotal rows: {len(df)}")
print(f"Expected total: {initial_row_count}")
if len(df) == initial_row_count:
    print(f"✓ Row count matches expected total ({initial_row_count})")
else:
    print(f"⚠ Row count mismatch! Expected {initial_row_count}, got {len(df)} (difference: {len(df) - initial_row_count})")
print(f"Dataset shape: {df.shape}")
print(f"Feature 'pain_documented' added. True count: {df['pain_documented'].sum()}")
print(f"Dataset shape: {df.shape}")

  df["pain_documented"] = df["pain_documented"].fillna(False).astype(bool)



Total rows: 158020
Expected total: 158020
✓ Row count matches expected total (158020)
Dataset shape: (158020, 102)
Feature 'pain_documented' added. True count: 132764
Dataset shape: (158020, 102)


In [3]:
df.columns

Index(['subject_id', 'hadm_id', 'cauti_type', 'cauti_icd_codes',
       'other_uti_icd_codes', 'remaining_icd_codes', 'gender', 'anchor_age',
       'admittime', 'dischtime',
       ...
       'temperature', 'heart_rate', 'resp_rate', 'o2sat', 'BP_systolic',
       'BP_diastolic', 'other_uti', 'other_uti_present', 'has_cauti_history',
       'pain_documented'],
      dtype='object', length=102)

In [None]:
# cols_to_drop = [
#  "pain_documented"
# ]
# df = drop_columns(df, cols_to_drop)
# df.to_csv(dataset_path, index=False)