In [None]:
import numpy as np
import pandas as pd
import os
import time
import pickle


### Define input and output paths

In [None]:
# Path of the pickle file produced by the patient2dict step (placeholder: replace with the real path).
patients_f = '[the output file in the patient2dict]'
# Directory that receives the labeled-patient pickle written at the end of this notebook (placeholder).
OUTPATH = '[output directory]'

In [None]:
# Load the list of patient dicts. `pickle` is already imported in the first cell.
# WARNING: pickle.load can execute arbitrary code; only open files you produced yourself.
with open(patients_f, 'rb') as f:
    patients = pickle.load(f)

In [None]:
# Number of patient records loaded from the pickle.
len(patients)

### Labeling of related diagnoses (lung cancer, BM and BM facts)

In [None]:
def detect_dischg_disp_code_desc_from_enc(enc, keywords):
    """Return the encounter's discharge date if its disposition text contains a keyword.

    Args:
        enc: encounter mapping; 'dischg_disp_code_desc' and 'discharged_dt_tm'
            are read with ``.get`` so either may be absent.
        keywords: iterable of substrings. They are compared as given against the
            lower-cased description, so they must be lower case to match.

    Returns:
        ``enc.get('discharged_dt_tm')`` when any keyword occurs in the
        description, otherwise None.
    """
    # A missing description becomes the string 'none' after str()/lower().
    disposition = str(enc.get('dischg_disp_code_desc')).lower()
    if any(keyword in disposition for keyword in keywords):
        return enc.get('discharged_dt_tm')
    return None

In [None]:
def detect_medication_from_enc(enc, codes):
    """Return the encounter's discharge date if any medication name contains a code.

    Args:
        enc: encounter mapping; 'MEDICATION' (an iterable of medication
            mappings) and 'discharged_dt_tm' are read with ``.get``.
        codes: iterable of substrings, compared as given against the
            upper-cased generic names, so they must be upper case to match.

    Returns:
        ``enc.get('discharged_dt_tm')`` when some code occurs in some
        medication's generic name, otherwise None.
    """
    meds = enc.get('MEDICATION')
    if meds is None:
        return None
    # Entries with a missing or None generic name are skipped instead of
    # raising; other values are coerced with str(), as the diagnosis helper does.
    med_names = [
        str(name).upper()
        for name in (med.get('generic_name') for med in meds)
        if name is not None
    ]
    if any(code in name for code in codes for name in med_names):
        return enc.get('discharged_dt_tm')
    return None

In [None]:
def detect_diag_code_from_enc(enc, codes):
    """Return the encounter's discharge date if any diagnosis code contains a code.

    Args:
        enc: encounter mapping; 'DIAGNOSIS' (an iterable of mappings that each
            have a 'diagnosis_code' entry) and 'discharged_dt_tm' are read
            with ``.get``.
        codes: iterable of substrings, compared as given against the
            upper-cased, str()-coerced diagnosis codes.

    Returns:
        ``enc.get('discharged_dt_tm')`` on the first match, otherwise None.
    """
    diagnoses = enc.get('DIAGNOSIS')
    if diagnoses is None:
        return None
    recorded = [str(entry['diagnosis_code']).upper() for entry in diagnoses]
    # NOTE(review): matching is by substring anywhere in the code, not by
    # prefix, so '162' also matches a code such as 'S62.162A' - confirm that
    # this is acceptable for the cohort definition.
    for wanted in codes:
        for candidate in recorded:
            if wanted in candidate:
                return enc.get('discharged_dt_tm')
    return None

In [None]:
def detect_hospice_from_enc(enc):
    """Return the encounter's discharge date if its disposition text mentions 'hospice', else None."""
    hospice_keywords = ['hospice']
    return detect_dischg_disp_code_desc_from_enc(enc, hospice_keywords)

In [None]:
def detect_expired_from_enc(enc):
    """Return the encounter's discharge date if its disposition text mentions 'expired', else None."""
    expired_keywords = ['expired']
    return detect_dischg_disp_code_desc_from_enc(enc, expired_keywords)

In [None]:
def detect_lung_from_enc(enc):
    """Return the encounter's discharge date if a diagnosis code contains '162' or 'C34', else None.

    The two codes are presumably the ICD-9 / ICD-10 lung cancer families -
    confirm against the cohort definition.
    """
    lung_codes = ['162', 'C34']
    return detect_diag_code_from_enc(enc, lung_codes)

In [None]:
def detect_bm_from_enc(enc):
    """Return the encounter's discharge date if a diagnosis code contains '198.3' or 'C79.3', else None.

    The two codes are presumably the ICD-9 / ICD-10 brain metastasis (BM)
    codes - confirm against the cohort definition.
    """
    bm_codes = ['198.3', 'C79.3']
    return detect_diag_code_from_enc(enc, bm_codes)

In [None]:
def detect_bm_facts_from_enc(enc):
    """Return the date of a "BM fact" found in this encounter, or None.

    An encounter qualifies when one of the listed diagnosis codes is present
    (they look like paired ICD-9 / ICD-10 codes - confirm against the cohort
    definition). The earlier of the diagnosis date and the medication date is
    returned when both exist.

    NOTE(review): as the helpers in this notebook are written, both lookups
    return the same encounter's 'discharged_dt_tm', so a medication match
    cannot change the result, and an encounter with a medication match but no
    diagnosis match returns None. Confirm whether medication-only encounters
    were meant to count.
    """
    diag_dt = detect_diag_code_from_enc(
        enc,
        ['191.9', 'C71.9', '348.5', 'G93.6', '239.6', 'D49.6',
         '780.39', 'R56.9', 'V10.85', 'Z85.841'],
    )
    med_dt = detect_medication_from_enc(enc, ['LEVETIRACETAM', 'PHENYTOIN'])
    # Identity checks against None instead of ==/!= (PEP 8).
    if diag_dt is None:
        return None
    if med_dt is None or med_dt > diag_dt:
        return diag_dt
    return med_dt

In [None]:
### here we label each patient with the first diagnosis time of lung cancer, BM, BM facts, hospice and expired.

# tqdm.auto picks the notebook widget when available; `tqdm_notebook` is deprecated.
from tqdm.auto import tqdm

# Patient-level key -> per-encounter detector. Each detector returns the
# encounter's discharge date on a hit and None otherwise.
LABEL_DETECTORS = {
    'lung_dt': detect_lung_from_enc,
    'bm_dt': detect_bm_from_enc,
    'bm_facts_dt': detect_bm_facts_from_enc,
    'expired_dt': detect_expired_from_enc,
    'hospice_dt': detect_hospice_from_enc,
}

for d in tqdm(patients):
    encs = d['ENCOUNTER']
    # "First" means first in list order - assumes encs is sorted
    # chronologically (TODO confirm against the patient2dict step).
    first_hit = dict.fromkeys(LABEL_DETECTORS)
    for enc in encs:
        for key, detect in LABEL_DETECTORS.items():
            # Each date is tracked against itself. The previous code tested
            # hospice_dt when storing expired_dt, so expired_dt was dropped for
            # patients with a hospice date and otherwise kept the last hit.
            if first_hit[key] is None:
                first_hit[key] = detect(enc)
    d.update(first_hit)
    # Assumes every patient has at least one encounter; an empty list raises IndexError.
    d['final_dt'] = encs[-1].get('discharged_dt_tm')
    d['first_dt'] = encs[0].get('discharged_dt_tm')

### check labeling result

In [None]:
# Count patients with a lung cancer date, with a BM date, and with a lung
# cancer date strictly before their BM date ("trainable").
bm_ct = 0
lung_ct = 0
trainable_ct = 0
for d in patients:
    has_lung = d['lung_dt'] is not None
    has_bm = d['bm_dt'] is not None
    if has_bm:
        bm_ct += 1
    if has_lung:
        lung_ct += 1
    # Compare only when both dates exist; comparing None with a date raises TypeError.
    if has_lung and has_bm and d['lung_dt'] < d['bm_dt']:
        trainable_ct += 1

In [None]:
# Printed in this order: patients with a lung date, patients with a BM date,
# patients whose lung date precedes their BM date.
print(f'{lung_ct}, {bm_ct}, {trainable_ct}')

In [None]:
# Total number of patients, for comparison with the counts printed above.
len(patients)

### check patients with BM facts before BM diagnosis

In [None]:
# Count patients that have a BM-facts date and either no BM diagnosis date or
# a BM diagnosis date strictly earlier than the BM-facts date.
# NOTE(review): the section heading describes BM facts occurring before the BM
# diagnosis, whereas this comparison counts the BM diagnosis coming first -
# confirm which direction is intended.
ct = sum(
    1
    for d in patients
    if d['bm_facts_dt'] is not None
    and (d['bm_dt'] is None or d['bm_dt'] < d['bm_facts_dt'])
)

print(ct)

### check the expired dt and final dt

In [None]:
# Count patients whose expired date is strictly earlier than the discharge
# date of their last encounter.
ct = sum(
    1
    for d in patients
    if d['expired_dt'] is not None and d['expired_dt'] < d['final_dt']
)

print(ct)

### check the number of patients whose final dt is less than 1 year after the lung cancer dt

In [None]:
# Count patients followed for more than 365 days after the lung cancer date
# who have no BM date or whose last encounter is more than 3*365 days after it.
# NOTE(review): the section heading describes a final dt less than one year
# from the lung cancer dt, whereas this cell counts follow-up longer than one
# year - confirm which is intended.
ct = 0

for d in patients:
    lung_dt = d['lung_dt']
    final_dt = d['final_dt']
    bm_dt = d['bm_dt']
    # Follow-up cannot be measured without both dates; subtracting None raises TypeError.
    if lung_dt is None or final_dt is None:
        continue
    if (final_dt - lung_dt).days > 365:
        if bm_dt is None or (final_dt - bm_dt).days > 365*3:
            ct += 1

print(ct)

### check and label positive patients

In [None]:
# Reset every patient's label so the labeling cells below start from a clean state.
for d in patients:
    d['label'] = None

In [None]:
# A patient is positive (label 1) when all of the following hold:
#   * the BM date is at least MIN_LUNG_TO_BM_DAYS after the lung cancer date,
#   * there is no BM-facts date, or the BM date is strictly before it,
#   * at least MIN_PRIOR_ENCOUNTERS encounters were discharged strictly before the BM date.
MIN_LUNG_TO_BM_DAYS = 90
MIN_PRIOR_ENCOUNTERS = 2

ct = 0      # patients labeled positive
ct_all = 0  # all patients with a BM date
for d in patients:
    bm_dt = d['bm_dt']
    if bm_dt is None:
        continue
    ct_all += 1
    lung_dt = d['lung_dt']
    # The interval is undefined without a lung cancer date; subtracting None raises TypeError.
    if lung_dt is None:
        continue
    if (bm_dt - lung_dt).days < MIN_LUNG_TO_BM_DAYS:
        continue
    if d['bm_facts_dt'] is None or bm_dt < d['bm_facts_dt']:
        # Encounters without a discharge date are skipped instead of raising.
        prior_encounters = sum(
            1
            for enc in d['ENCOUNTER']
            if enc.get('discharged_dt_tm') is not None
            and enc['discharged_dt_tm'] < bm_dt
        )
        if prior_encounters >= MIN_PRIOR_ENCOUNTERS:
            ct += 1
            d['label'] = 1
print(ct)
print(ct_all)
print(ct_all)

### check and label negative patients (control for prediction within a follow-up period)

In [None]:
# A patient is a control (label 0) for the follow-up window when the last
# encounter is at least `timewindow` days after the lung cancer date and each
# of hospice, expired, BM and BM-facts is either absent or at least
# `timewindow` days after the lung cancer date.
# A patient labeled 1 earlier whose BM date is at or beyond the window can be
# relabeled 0 here.
ct = 0
timewindow = 365 * 5
for d in patients:
    lung_dt = d['lung_dt']
    final_dt = d['final_dt']
    # Follow-up cannot be measured without both dates; subtracting None raises TypeError.
    if lung_dt is None or final_dt is None:
        continue
    if (final_dt - lung_dt).days < timewindow:
        continue
    event_dts = (d['hospice_dt'], d['expired_dt'], d['bm_dt'], d['bm_facts_dt'])
    if all(dt is None or (dt - lung_dt).days >= timewindow for dt in event_dts):
        ct += 1
        d['label'] = 0

print(ct)

### check and label negative patients (control for prediction in the whole period)

In [None]:
# A patient is a control (label 0) for the whole period when the last encounter
# is at least `timewindow` days after the lung cancer date and the patient has
# no BM date, no BM-facts date and no hospice date.
# Labels set by earlier cells are not cleared here, so the final controls are
# the union of both control cells.
ct = 0
timewindow = 365*1
for d in patients:
    lung_dt = d['lung_dt']
    final_dt = d['final_dt']
    # Follow-up cannot be measured without both dates; subtracting None raises TypeError.
    if lung_dt is None or final_dt is None:
        continue
    if (final_dt - lung_dt).days < timewindow:
        continue
    if d['bm_dt'] is None and d['bm_facts_dt'] is None and d['hospice_dt'] is None:
        ct += 1
        d['label'] = 0   ### candidate control list

print(ct)

In [None]:
# Keep only patients that received a label (1 = positive, 0 = control);
# patients whose label is still None are dropped.
new_patients = [d for d in patients if d['label'] in [0, 1]]

In [None]:
# Number of labeled patients that will be saved.
len(new_patients)

In [None]:
# Write the labeled patients to OUTPATH/patients_with_label.pickle.
print('Saving data by pickle')

# Create the output directory first so a fresh OUTPATH does not fail the write.
os.makedirs(OUTPATH, exist_ok=True)
with open(os.path.join(OUTPATH, 'patients_with_label.pickle'), 'wb') as f:
    pickle.dump(new_patients, f, protocol=pickle.HIGHEST_PROTOCOL)