This notebook combines Rachel Gold (@rachgoldaz, RG) and Rosie Cima (@cimar, RC) distinct methodologies for flagging potentially pregnant patients among the EMTALA deficiencies reported by CMS.

In [102]:
import pandas

# Stop phrases for @rachgoldaz methodology for detecting pregnant patients. @cimar note: This is @rachgoldaz's attempt to remove boilerplate that contains her keywords. I minimally edited this, but took out "vaginal pain, abdominal bleeding" from this list because it didn't have a corresponding keyword
IGNORE_PHRASES_PASS_RG = ['pregnancy test', 'medical condition, and /or pregnancy within its capabilities', 'medical condition, and/or pregnancy within its capabilities','pregnancy/active labor','hospital does not do ultrasounds except for pregnancy','the hospital does not do ultrasounds except for pregnancy','treatment based on how far along they were in their pregnancy','relates to pregnancy','during her pregnancy could be assessed for labor by a labor and delivery nurse','A preterm or premature baby is delivered before 37 weeks of the pregnancy','medical screening examinations for patients with pregnancy-related conditions under standardized procedures','urine test for pregnancy','urine pregnancy test if potential for pregnancy','except for pregnancy','citizenship, religion, pregnancy','without pregnancy','pregnancy and childbirth','no intrauterine pregnancy','urine pregnancy','Drug screen, Urine, pregnancy','complaints were not related to pregnancy','presenting to ED with pregnancy greater than 20 weeks','complaint is non-pregnancy related','An evaluation sufficient to determine if an emergency medical condition or pregnancy with contractions exists','possible EMCs related to pregnancy','If an emergency medical condition or pregnancy with contractions is present, the hospital must provide such additional medical examination and treatment','A minor who understands the nature and consequences of treatment is capable of consenting if the minor is 18 years of age or older, graduated from high school, has married, has been pregnant, needs diagnosis or treatment of pregnancy or venereal disease, or is 14 years of age or older and requests psychiatric treatment','someone in need of emergency care for a psychiatric or pregnancy-relations condition','discussion with prophylaxis against pregnancy','In pregnancy at-term, stabilization includes delivery of the child and the placenta','Using screen for pregnancy','Abdominal pain - any female of childbearing age requiring diagnostic testing to determine pregnancy','policy when presenting unscheduled for pregnancy related emergency care']

# Keywords for @rachgoldaz methodology for detecting pregnant patients
KEYWORDS_RG = ["weeks pregnant","miscarried","stillborn","water breaking","water broke","weeks gestation","weeks' gestation","weeks with labor","week pregnant","she was pregnant","was pregnant","water had broken","was in labor","was in active labor","was born","was noticeably pregnant","year old pregnant","months pregnant","months gestation","wks (weeks) preg (pregnant)","currently pregnant","weeks of pregnancy","gestational age","leaking amniotic fluid","wks (weeks)","pregnancy"]

# Stop phrases @cimar methodology for detecting pregnant patients
IGNORE_PHRASES_PASS_RC = ["pregnancy test","test for pregnancy","active labor act","active labor (sic) act"]

# Keywords for @cimar methodology for detecting pregnant patients
KEYWORDS_RC = ['gravid','pregnan','eclampsia','caeserian',' c-section',' csection',' c section',' para ','gestation','water break','water broke','active labor','obstetr']

# EMTALA deficiency codes: (2400, 2401, 2402, 2403, 2404, 2405, 2406, 2407, 2408, 2409, 2410, 2411)
EMTALA_RANGE = range(2400,2412)

q3pt1 = pandas.read_excel("data/source/Hospital_2567s_2022Q3/Hospital 2567s - 2022Q3 Part 1.xlsx")
q3pt2 = pandas.read_excel("data/source/Hospital_2567s_2022Q3/Hospital 2567s - 2022Q3 Part 2.xlsx")

q3 = pandas.concat([q3pt1,q3pt2])

def std_text(insp_text, ignore_words):
    std_text = insp_text.lower()
    for p in IGNORE_PHRASES_PASS2:
        std_text = std_text.replace(p,"")
    return std_text

# @rachgoldaz's methodology looks for keywords in the standardized text field (std_text_rg)
def search_kw(std_text,kw):
    for k in kw:
        if k in std_text:
            return True
    return False

def make_key_identifier(row):
    return str(row["deficiency_tag"]).split(".")[0] + " " + str(row["EVENT_ID"])

q3["key_identifier"] = q3.apply(make_key_identifier, axis=1)
q3["std_text_rg"] = q3["inspection_text"].apply(std_text, ignore_words=IGNORE_PHRASES_PASS_RG)
q3["std_text_rc"] = q3["inspection_text"].apply(std_text, ignore_words=IGNORE_PHRASES_PASS_RC)
q3["may_be_pregnant_rg"] = q3["std_text_rg"].apply(search_kw, kw=KEYWORDS_RG)

In [103]:
import re

# The inspection text often has phrases that identify single, anonymous patients. These are the substrings and regex that indicate a patient identifier.
PATIENT_ID_STRS = ["patient #","patient id #", "pt #", "pi #", "(pi) #"]
PATIENT_ID_REGEX = ["patient \\d","patient id \\d","pt \\d", "pi \\d", "(pi) \\d"]

WINDOW_AFTER = 200
WINDOW_BEFORE = 100

# @cimar's methodology is two-part. This function looks for keywords in the vicinity of patient identifiers (defined as substrings or regex). It returns the first keyword found -- if it finds one. Otherwise it returns None.
def may_be_preg_rc_near_text(text):
    for p in PATIENT_ID_STRS:
        inc = 0
        while True:
            ind = text.find(p,inc)
            if ind < 0:
                break
            start = ind - WINDOW_BEFORE
            end = ind + WINDOW_AFTER
            substr = text[start:end]
            k = search_kw(substr,KEYWORDS_RC)
            for k in KEYWORDS_RC:
                if k in substr:
                    return k
            inc = ind+len(p)
    for r in PATIENT_ID_REGEX:
        inc = 0
        while True:
            contains_regex = re.search(r,text[inc:])
            if not contains_regex:
                break
            ind = contains_regex.span()[0]
            start = ind - WINDOW_BEFORE
            end = ind + WINDOW_AFTER
            substr = text[start:end]
            for k in KEYWORDS_RC:
                if k in substr:
                    return k
            inc += contains_regex.span()[1]
    return None

# @cimar's methodology is two-part. This function looks for keywords in the same paragraphs as patient identifiers (defined as substrings or regex). It returns the first keyword found -- if it finds one. Otherwise it returns None.
def may_be_preg_rc_graf(text):
    grafs = text.split("\n")
    for g in grafs:
        for p in PATIENT_ID_STRS:
            if p in g:
                for k in KEYWORDS_RC:
                    if k in g:
                        return k
        for r in PATIENT_ID_REGEX:
            if re.search(r,g):
                for k in KEYWORDS_RC:
                    if k in g:
                        return k
    return None

def is_emtala_deficiency(code):
    if code in EMTALA_RANGE:
        return True
    return False

q3["may_be_pregnant_rc_near_text"] = q3["std_text_rc"].apply(may_be_preg_rc_near_text)
q3["may_be_pregnant_rc_graf"] = q3["std_text_rc"].apply(may_be_preg_rc_graf)

In [109]:
# Passing @rachgoldaz's methodology or either part of @cimar's methodology AND having an EMTALA deficiency code flags the case for manual review.
emtala_may_be_preg = q3[((~q3["may_be_pregnant_rc_graf"].isnull())|(~q3["may_be_pregnant_rc_near_text"].isnull())|(q3["may_be_pregnant_rg"]))&(q3["deficiency_tag"].apply(is_emtala_deficiency))]

# Export for manual review -- there are csv encoding issues with the inspection text field, so I'm exporting to Excel.
emtala_may_be_preg.to_excel("data/processed/emtala_may_be_preg.xlsx",index=False)

emtala_may_be_preg.shape

(916, 19)

In [157]:
# Importing the manually reviewed spreadsheet

manual_review = pandas.read_excel("data/manual/EMTALA_PREGNANT_2011-2022.xlsx")
manual_review_keys = pandas.Series(manual_review["key_identifier"].unique())

emtala_may_be_preg[~(emtala_may_be_preg["key_identifier"].isin(manual_review_keys))]

In [160]:
is_pregnant = emtala_may_be_preg[(emtala_may_be_preg["key_identifier"].isin(manual_review_keys))]

is_pregnant.to_excel("data/processed/manual_recombo.xlsx",index=False)
is_pregnant.shape

(679, 19)

In [139]:
manual_review_keys[~manual_review_keys.isin(emtala_may_be_preg["key_identifier"])]

122            NaN
324     Patient #1
dtype: object

In [153]:
manual_review[(manual_review["key_identifier"].duplicated())&(manual_review["key_identifier"].notna())][["deficiency_tag","EVENT_ID","key_identifier"]]["key_identifier"].tolist()

[' Patient #1',
 '2400 XI4411',
 '2406 XI4411',
 '2407 XI4411',
 '2411 0I8X11',
 '2407 HNOX11']

In [161]:
manual_review["facility_id"].nunique()

426