In [16]:
# Import libraries

import pandas as pd
import numpy as np
from datetime import datetime as dt
import pickle

In [17]:
# Paths

# Directory the reformatted CSVs are written to (relative path, so it is
# resolved against the notebook's working directory).
ot_path = "../../Data/"
# Directory the input files used by the cells below are read from.
in_path = "/ctao_shared_data/Liver_optum/outputs/"

# Diagnoses

In [18]:
# Read the tab-separated diagnosis extract. Per the original note, it holds the
# patient id, claim date, diagnosis code and ICD code flag.
f = "final_diag_all"
diagnosis = pd.read_csv(f"{in_path}{f}.csv", sep='\t', low_memory=False)

In [19]:
# Encounter key: "<patid>_<fst_dt>".
diagnosis['encounter_id'] = diagnosis['patid'].astype(str) + '_' + diagnosis['fst_dt'].astype(str)

# Remove the dots from the diagnosis codes. This must go through the `.str`
# accessor: Series.replace('.', '') only replaces cells whose *entire* value is
# '.', so dots embedded in a code would be left in place.
diag_nodot = diagnosis['diag'].str.replace('.', '', regex=False)

# Prefix each code with its ICD revision. Any icd_flag other than 10 (including
# a missing flag) falls into the 'D_9_' branch; missing codes stay missing.
diagnosis['modified_diag'] = np.where(
    diagnosis['icd_flag'] == 10,
    'D_10_' + diag_nodot,
    'D_9_' + diag_nodot,
)

In [22]:
# Keep the patient id, the encounter key and the prefixed code, then write the
# result without the index column.
cl = ['patid', 'encounter_id', 'modified_diag']
diagnosis = diagnosis.loc[:, cl]
diagnosis.to_csv(f"{ot_path}mod_diagnosis.csv", index=False)

# Medications

In [23]:
# Read the drugs extract (parsed with the default comma separator)
f = 'final_drug_all'
drug = pd.read_csv(f"{in_path}{f}.csv", low_memory=False)

In [26]:
# NDC (National Drug Code) lookup table: tab-delimited, with the column names
# taken from the second line of the file (header=1).
ndc = (
    pd.read_csv(f"{in_path}lu_ndc.csv", sep='\t', header=1, low_memory=False)
    # drop rows whose AHFSCLSS is 'UNK' and keep only the code ('NDC') and the
    # generic name ('GNRC_NM')
    .loc[lambda tbl: tbl['AHFSCLSS'] != 'UNK', ['NDC', 'GNRC_NM']]
    .drop_duplicates()
    .reset_index(drop=True)
    # remove pairs where either the code or the generic name is missing
    .dropna()
)

In [27]:
# Mapping NDC to generic names
ndc_gnrc_nm = dict(zip(ndc['NDC'], ndc['GNRC_NM']))

# Vectorised dictionary lookup: codes that are not in the dictionary become NaN.
# The previous loop used `ndc` as its loop variable, which overwrote the `ndc`
# lookup DataFrame and made this cell fail when re-run.
drug['gnrc_nm'] = drug['ndc'].map(ndc_gnrc_nm)

# NOTE(review): dropna() without `subset` removes rows with a missing value in
# ANY column, not only the unmapped generic names. Kept as in the original;
# confirm this is intended, otherwise use dropna(subset=['gnrc_nm']).
drug = drug.dropna()

In [28]:
# Build the encounter key and the prefixed medication token, then save the
# final mapped medications file.
drug['encounter_id'] = drug['patid'].astype(str) + '_' + drug['fst_dt'].astype(str)

# Token = 'M_' + lower-cased generic name with the dots removed
drug['mod_gen_drug'] = [
    'M_' + name.lower().replace('.', '')
    for name in drug['gnrc_nm']
]

cl = ['patid', 'encounter_id', 'mod_gen_drug']
drug = drug.loc[:, cl]
drug.to_csv(f"{ot_path}mod_drug.csv", index=False)

# Procedures

In [29]:
# Read the tab-separated procedures extract. Per the original note, it holds the
# patient id, claim date, procedure code and ICD code flag.
f = 'final_proc_all'
proc = pd.read_csv(f"{in_path}{f}.csv", sep='\t', low_memory=False)

In [30]:
# Encounter key: "<patid>_<fst_dt>"
proc['encounter_id'] = proc['patid'].astype(str) + '_' + proc['fst_dt'].astype(str)

In [31]:
# Prefix every procedure code with 'P_I_' after removing the dots. Done with the
# vectorised `.str` accessor, so a missing code stays missing instead of raising
# AttributeError in a Python loop.
proc['mod_proc_code'] = 'P_I_' + proc['proc'].str.replace('.', '', regex=False)

# Keep only the patient id, the encounter key and the prefixed code, matching
# the layout of mod_diagnosis.csv and mod_drug.csv. `cl` was previously defined
# but never applied, so every column of the extract was written out.
cl = ['patid', 'encounter_id', 'mod_proc_code']
proc = proc[cl]
proc.to_csv(ot_path + 'mod_proc.csv', index=False)