## sample patients

In [1]:
import pandas as pd

note_index_path = "results/aud_notes_keywords_by_person.csv"
df_note=pd.read_csv(note_index_path)

key_word_2=df_note[df_note['aud_roots_count']>=2].reset_index(drop=True)
key_word_1=df_note[df_note['aud_roots_count']>=1].reset_index(drop=True)

key_word_2=key_word_2.drop_duplicates(subset=['person_id'],keep='first')
key_word_1=key_word_1.drop_duplicates(subset=['person_id'],keep='first')

In [None]:
icd_index=pd.read_csv('results/aud_patients_ICD_AUD_rule.csv')
drug_index=pd.read_csv('results/aud_patients_drug_rule.csv')
icd_1=icd_index[icd_index['inpatient_count']>=1].reset_index(drop=True)
icd_2=icd_index[icd_index['outpatient_count']>=2].reset_index(drop=True)
drug_1=drug_index

In [4]:
# Extract unique person_ids for each dataset
icd_1_a= set(icd_1['person_id'])
icd_2_b= set(icd_2['person_id'])
key_word1_c= set(key_word_1['person_id'])
key_word2_d= set(key_word_2['person_id'])
drug_e= set(drug_1['person_id'])

print(f"icd1: {len(icd_1_a)}")
print(f"icd2: {len(icd_2_b)}")
print(f"key_word1: {len(key_word1_c)}")
print(f"key_word2: {len(key_word2_d)}")
print(f"drug1: {len(drug_e)}")

icd1: 52556
icd2: 90817
key_word1: 128890
key_word2: 74467
drug1: 25423


In [5]:
# Define sets
a = set(icd_1['person_id'])
b = set(icd_2['person_id'])
c = set(key_word_1['person_id'])
d = set(key_word_2['person_id'])
e = set(drug_1['person_id'])

# Compute groups
groups = {
    "a": list(a - b - c - d - e),
    "b": list(b - a - c - d - e),
    "c+d": list((c & d) - a - b - e),
    "c": list(c - d - a - b - e),
    "e": list(e - a - b - c - d),
    "a+b": list((a & b) - c - d - e),
    "a+c": list((a & c) - b - d - e),
    "a+c+d": list((a & c & d) - b - e),
    "a+e": list((a & e) - b - c - d),
    "b+c": list((b & c) - a - d - e),
    "b+c+d": list((b & c & d) - a - e),
    "b+e": list((b & e) - a - c - d),
    "c+e": list((c & e) - a - b - d),
    "c+d+e": list((c & d & e) - a - b),
    "a+b+c": list((a & b & c) - d - e),
    "a+b+e": list((a & b & e) - c - d),
    "b+c+e": list((b & c & e) - a - d),
    "a+b+c+d": list((a & b & c & d) - e),
    "a+b+c+e": list((a & b & c & e) - d),
    "b+c+d+e": list((b & c & d & e) - a),
    "a+b+c+d+e": list(a & b & c & d & e)
}

# Print results
for group, members in groups.items():
    print(f"{group}: {len(members)}")

a: 6583
b: 27205
c+d: 14375
c: 20566
e: 12844
a+b: 709
a+c: 9437
a+c+d: 16297
a+e: 49
b+c: 18479
b+c+d: 22088
b+e: 523
c+e: 2855
c+d+e: 2024
a+b+c: 2081
a+b+e: 35
b+c+e: 764
a+b+c+d: 13595
a+b+c+e: 84
b+c+d+e: 2559
a+b+c+d+e: 2695


In [None]:
def check_category(category, elements_required):
    """
    Check if a category contains any of the required elements
    elements_required: list of elements that satisfy the rule (e.g., ['a', 'b'] means a OR b)
    """
    cat_elements = set(str(category).split('+'))
    return any(elem in cat_elements for elem in elements_required)

In [None]:
rules = [
    # Benchmark
    ('at least 1 inpatient or at least 2 outpatients', 
     lambda cat: check_category(cat, ['a', 'b'])),
    
    # Structure data only
    ('at least 1 inpatient or at least 2 outpatients or at least 1 medication', 
     lambda cat: check_category(cat, ['a', 'b', 'e'])),
    ('at least 1 inpatient', 
     lambda cat: check_category(cat, ['a'])),
    ('at least 2 outpatients', 
     lambda cat: check_category(cat, ['b'])),
    ('at least 1 medication', 
     lambda cat: check_category(cat, ['e'])),
    
    # Unstructure only
    ('at least 1 keyword', 
     lambda cat: check_category(cat, ['c', 'd'])),  # c or d both satisfy "at least 1 keyword"
    ('at least 2 keywords', 
     lambda cat: check_category(cat, ['d'])),  # only d means "at least 2 keywords"
    
    # Structure and unstructure data
    ('(at least 1 inpatient or at least 2 outpatients) AND at least 1 keywords',
     lambda cat: (check_category(cat, ['a', 'b']) and check_category(cat, ['c', 'd']))),  # (a OR b) AND (c OR d)
    ('at least 1 inpatient or at least 2 outpatients or at least 1 keywords', 
     lambda cat: check_category(cat, ['a', 'b', 'c', 'd'])),  # c or d both count as "at least 1 keyword"
    ('at least 1 inpatient or at least 2 outpatients or at least 1 keywords or at least 1 drug', 
     lambda cat: check_category(cat, ['a', 'b', 'c', 'd', 'e'])),
    ('at least 1 inpatient or at least 2 outpatients or at least 2 keywords', 
     lambda cat: check_category(cat, ['a', 'b', 'd'])),  # only d for "at least 2 keywords"
]