### Import libraries

In [None]:
import numpy as np
import pandas as pd
import os
import time
from collections import Counter
import pickle

### Define the data path, input data files, and output directory

In [None]:
# Placeholder: set this to the directory that holds the input files below.
datapath = 'put the directory of your data'
enc_file = 'lung_encall.txt'
diag_file = 'lung_diagnosis.txt'
med_file = 'lung_medication.txt'
surg_file = 'lung_surgical.txt'

# Every derived file is written below <datapath>/matching.
OUTPATH = os.path.join(datapath, 'matching')
# exist_ok=True creates the directory when it is missing and does nothing when
# it already exists, avoiding the check-then-create race of os.path.exists().
os.makedirs(OUTPATH, exist_ok=True)
    

In [None]:
### read encounter
# Load the selected columns of the tab-separated encounter file; the admission
# and discharge timestamps are parsed as datetimes.
enc_df = pd.read_csv(
    os.path.join(datapath, enc_file),
    sep='\t',
    usecols=[
        'patient_sk', 'encounter_id', 'gender', 'race',
        'marital_status', 'yob', 'admitted_dt_tm', 'discharged_dt_tm',
        'patient_type_desc', 'dischg_disp_code_desc', 'hospital_id',
        'census_region', 'census_division', 'bed_size_range',
        'urban_rural_status', 'alt_health_system_id',
    ],
    parse_dates=['admitted_dt_tm', 'discharged_dt_tm'],
)


In [None]:
# Preview the first rows of the encounter table.
enc_df.head()

In [None]:
# Group the hospital_id column by patient so each patient's hospital ids
# can be iterated together.
enc_hospital_groupby = enc_df.groupby('patient_sk')['hospital_id']

In [None]:
# Map every patient to the hospital_id that occurs most often among that
# patient's encounters.
# NOTE(review): missing hospital_id values are not filtered out before
# counting -- confirm that is intended.
psk_hid_dict = {}
for patient_key, hospital_ids in enc_hospital_groupby:
    top = Counter(hospital_ids).most_common(1)
    if not top:
        # Nothing to count for this patient; leave it out of the mapping.
        continue
    psk_hid_dict[patient_key] = top[0][0]

In [None]:
# Number of patients in the patient -> hospital mapping.
len(psk_hid_dict)

In [None]:
# Turn the patient -> hospital mapping into a two-column table
# (patient_sk, hospital_id).
psk_hid_s = pd.Series(psk_hid_dict, name='hospital_id').rename_axis('patient_sk')
psk_hid_df = psk_hid_s.reset_index()

In [None]:
# Write the patient -> hospital table as a tab-separated file, without the
# row index.
psk_hid_df.to_csv(os.path.join(OUTPATH, 'psk_hospit.tsv'), sep='\t', index=False)

In [None]:
### extract hospital information from encounter table
# Keep one row per distinct combination of the hospital attribute columns.
hospital_df = enc_df[[
    'hospital_id',
    'census_region',
    'census_division',
    'bed_size_range',
    'urban_rural_status',
    'alt_health_system_id',
]].drop_duplicates()

In [None]:
# Write the hospital attribute table as a tab-separated file, without the
# row index.
hospital_df.to_csv(os.path.join(OUTPATH, 'hospital_info.tsv'), sep='\t', index=False)

### read diagnosis

In [None]:

# Load the selected columns of the tab-separated diagnosis file.
# The diagnosis_description and diagnosis_type_display columns are left out.
diag_df = pd.read_csv(
    os.path.join(datapath, diag_file),
    sep='\t',
    usecols=[
        'patient_sk', 'encounter_id', 'diagnosis_id',
        'diagnosis_type', 'diagnosis_code',
    ],
)

In [None]:
# (rows, columns) of the diagnosis table.
diag_df.shape

In [None]:
# Discard diagnosis rows that have a missing value in any loaded column.
diag_df = diag_df.dropna()

In [None]:
# (rows, columns) of the diagnosis table at this point in the notebook.
diag_df.shape

In [None]:
### read medication
# Load the selected columns of the tab-separated medication file.
med_df = pd.read_csv(
    os.path.join(datapath, med_file),
    sep='\t',
    usecols=[
        'patient_sk', 'encounter_id', 'medication_id',
        'ndc_code', 'brand_name', 'generic_name',
    ],
)

In [None]:
# Preview the first rows of the medication table.
med_df.head()

In [None]:
### read surgical
# Load the selected columns of the tab-separated surgical-procedure file.
surg_df = pd.read_csv(
    os.path.join(datapath, surg_file),
    sep='\t',
    usecols=[
        'patient_sk', 'encounter_id', 'surgical_procedure_id',
        'surgical_procedure_desc', 'icd9_code',
    ],
)

In [None]:
# Preview the first rows of the surgical-procedure table.
surg_df.head()

In [None]:
### extract demographic from encounter table
# Keep the distinct (patient_sk, gender, race, marital_status, yob) rows.
# NOTE(review): duplicates are judged on all five columns, so a patient_sk
# whose demographic values differ between encounters yields more than one
# row -- confirm that is intended.
demo_df = enc_df[[
    'patient_sk',
    'gender',
    'race',
    'marital_status',
    'yob',
]].drop_duplicates()

In [None]:
# Report the (rows, columns) of the demographic table.
print(demo_df.shape)
#print(demo_hsptl_df.shape)

In [None]:
### convert demographic dataframe to a list of dictionaries (one dict per row,
### keyed by column name)
patients = demo_df.to_dict(orient='records')

In [None]:
# Split each table into per-key sub-frames for dictionary lookup:
# encounters are keyed by patient_sk, the other tables by encounter_id.
enc_dict = dict(iter(enc_df.groupby('patient_sk')))
diag_dict = dict(iter(diag_df.groupby('encounter_id')))
med_dict = dict(iter(med_df.groupby('encounter_id')))
surg_dict = dict(iter(surg_df.groupby('encounter_id')))

### extract features for each patient

In [None]:
# tqdm.auto selects the notebook widget or the console progress bar as
# appropriate; the tqdm_notebook alias used previously is deprecated.
from tqdm.auto import tqdm

# Encounter columns copied into each per-encounter record.
ENCOUNTER_COLUMNS = [
    'encounter_id', 'discharged_dt_tm', 'patient_type_desc',
    'bed_size_range', 'dischg_disp_code_desc',
]

# (output key, per-encounter lookup table, columns kept) for each detail table.
# The order fixes the order in which the keys are added to an encounter record.
ENCOUNTER_SECTIONS = (
    ('DIAGNOSIS', diag_dict,
     ['diagnosis_id', 'diagnosis_type', 'diagnosis_code']),
    ('MEDICATION', med_dict,
     ['medication_id', 'ndc_code', 'brand_name', 'generic_name']),
    ('SURGICAL', surg_dict,
     ['surgical_procedure_id', 'surgical_procedure_desc', 'icd9_code']),
)

# Attach to every patient record an 'ENCOUNTER' list: the patient's encounters
# ordered by discharge time, each carrying its diagnosis / medication /
# surgical rows when the encounter has any.
for patient in tqdm(patients):
    psk = patient['patient_sk']
    encs = (
        enc_dict[psk]
        .sort_values(by='discharged_dt_tm', ascending=True)
        .loc[:, ENCOUNTER_COLUMNS]
        .to_dict(orient='records')
    )
    for enc in encs:
        encid = enc['encounter_id']
        for section, table, columns in ENCOUNTER_SECTIONS:
            rows = table.get(encid)
            # A section key is only added when the encounter has rows in
            # that table; otherwise the key is absent from the record.
            if rows is not None:
                enc[section] = rows.loc[:, columns].to_dict(orient='records')
    patient['ENCOUNTER'] = encs

In [None]:
import pickle

# Serialize the assembled patient records to <OUTPATH>/patients.pickle using
# the highest pickle protocol available.
print('Saving data by pickle')
patients_pickle_path = os.path.join(OUTPATH, 'patients.pickle')
with open(patients_pickle_path, 'wb') as handle:
    pickle.dump(patients, handle, protocol=pickle.HIGHEST_PROTOCOL)
    