In [1]:
from collections import defaultdict

import dill
import numpy as np
import pandas as pd
import nltk
import re
from tqdm import tqdm
import random
tqdm.pandas()

In [2]:
##### process medications #####
# load med data
def med_process(med_file):
    """Load the prescriptions table and reduce it to de-duplicated medication rows.

    Args:
        med_file: Path (or file-like object) handed to ``pd.read_csv``. The file
            must contain every column listed in ``unused_columns`` below plus
            ``subject_id``, ``hadm_id``, ``starttime`` and ``ndc``.

    Returns:
        DataFrame with the columns that survive the drop, sorted by
        ``subject_id``, ``hadm_id`` and ``starttime`` (parsed to datetime),
        without duplicate rows and with a fresh 0..n-1 index.
    """
    unused_columns = ['pharmacy_id', 'poe_id', 'poe_seq', 'order_provider_id',
                      'stoptime', 'drug', 'formulary_drug_cd', 'gsn', 'drug_type',
                      'prod_strength', 'form_rx', 'dose_val_rx', 'dose_unit_rx',
                      'form_val_disp', 'form_unit_disp', 'doses_per_24_hrs', 'route']

    med_pd = pd.read_csv(med_file, dtype={'ndc': 'category'}).drop(columns=unused_columns)

    med_pd = (
        # Rows whose NDC is the literal string '0' are discarded.
        med_pd[med_pd['ndc'] != '0']
        # ``fillna(method='pad')`` is deprecated in pandas 2.x; ``ffill`` is the
        # supported spelling of the same forward fill.
        # NOTE(review): the fill runs over the raw row order, so a gap can be
        # filled from a different patient's row -- confirm this is intended.
        .ffill()
        .dropna()
        .drop_duplicates()
    )
    med_pd = med_pd.assign(
        starttime=pd.to_datetime(med_pd['starttime'], format='%Y-%m-%d %H:%M:%S')
    )

    return (
        med_pd.sort_values(by=['subject_id', 'hadm_id', 'starttime'])
        # Second pass kept from the original: it catches rows that only became
        # identical once ``starttime`` was parsed.
        .drop_duplicates()
        .reset_index(drop=True)
    )


# medication mapping
def ndc2atc4(med_pd, ndc_rxnorm_path=None, ndc2atc_path=None):
    """Translate the ``ndc`` column into 4-character ATC codes.

    The mapping is done in two hops: NDC -> RXCUI through a dict literal stored
    in a text file, then RXCUI -> ``ATC5`` through a CSV table. The ``ATC5``
    value is truncated to its first four characters and exposed as ``ATC4``.

    Args:
        med_pd: DataFrame with an ``ndc`` column. It is not modified.
        ndc_rxnorm_path: Text file containing a Python dict literal mapping NDC
            to RXCUI. Defaults to the notebook-level ``ndc_rxnorm_file``.
        ndc2atc_path: CSV with ``RXCUI``, ``YEAR``, ``MONTH``, ``NDC`` and
            ``ATC5`` columns. Defaults to the notebook-level ``ndc2atc_file``.

    Returns:
        New DataFrame without ``ndc``/``RXCUI``, with an added ``ATC4`` column,
        de-duplicated and re-indexed from 0.
    """
    import ast

    if ndc_rxnorm_path is None:
        ndc_rxnorm_path = ndc_rxnorm_file
    if ndc2atc_path is None:
        ndc2atc_path = ndc2atc_file

    with open(ndc_rxnorm_path, 'r') as f:
        # literal_eval only accepts Python literals, so a tampered mapping file
        # cannot execute code the way eval() would.
        ndc2rxnorm = ast.literal_eval(f.read())

    # ``assign`` builds a new frame, leaving the caller's ``med_pd`` untouched.
    mapped = med_pd.assign(RXCUI=med_pd['ndc'].map(ndc2rxnorm)).dropna()
    # The mapping uses an empty string for NDCs without a known RXCUI.
    mapped = mapped[~mapped['RXCUI'].isin([''])]
    mapped = mapped.assign(RXCUI=mapped['RXCUI'].astype('int64')).reset_index(drop=True)

    rxnorm2atc = (
        pd.read_csv(ndc2atc_path)
        .drop(columns=['YEAR', 'MONTH', 'NDC'])
        # Keep a single ATC code per RXCUI (the first one listed).
        .drop_duplicates(subset=['RXCUI'])
    )

    mapped = (
        mapped.merge(rxnorm2atc, on=['RXCUI'])
        .drop(columns=['ndc', 'RXCUI'])
        .rename(columns={'ATC5': 'ATC4'})
    )
    mapped['ATC4'] = mapped['ATC4'].map(lambda code: code[:4])
    return mapped.drop_duplicates().reset_index(drop=True)


# visit >= 2
def process_visit_lg2(med_pd):
    """Return the patients that have more than one distinct admission.

    Args:
        med_pd: DataFrame with ``subject_id`` and ``hadm_id`` columns.

    Returns:
        DataFrame with one row per qualifying ``subject_id``: the array of its
        distinct ``hadm_id`` values and their count in ``hadm_id_Len``. The
        index is not reset after filtering.
    """
    admissions = (
        med_pd[['subject_id', 'hadm_id']]
        .groupby(by='subject_id')['hadm_id']
        .unique()
        .reset_index()
    )
    admissions['hadm_id_Len'] = admissions['hadm_id'].map(len)
    multi_visit = admissions['hadm_id_Len'] > 1
    return admissions[multi_visit]


# most common medications
def filter_300_most_med(med_pd, top_k=300):
    """Keep only the rows whose ``ATC4`` code is among the most frequent ones.

    Args:
        med_pd: DataFrame with an ``ATC4`` column.
        top_k: Number of most frequent codes to keep. Defaults to 300, the value
            that was previously hard-coded.

    Returns:
        Filtered copy of ``med_pd`` with a fresh 0..n-1 index.

    Raises:
        ValueError: If ``top_k`` is negative.
    """
    if top_k < 0:
        raise ValueError('top_k must be non-negative, got %r' % (top_k,))

    med_count = (
        med_pd.groupby(by=['ATC4']).size()
        .reset_index(name='count')
        .sort_values(by=['count'], ascending=False)
    )
    # First ``top_k`` rows of the descending ranking == labels 0..top_k-1 of the
    # re-indexed ranking used previously.
    frequent_codes = med_count['ATC4'].head(top_k)

    return med_pd[med_pd['ATC4'].isin(frequent_codes)].reset_index(drop=True)


##### process diagnosis #####
def diag_process(diag_file, top_k=2000):
    """Load the diagnoses table and keep the most frequent diagnosis codes.

    Args:
        diag_file: Path (or file-like object) handed to ``pd.read_csv``. Must
            contain ``subject_id``, ``hadm_id``, ``seq_num``, ``icd_code`` and
            ``icd_version``.
        top_k: Number of most frequent ``icd_code`` values to keep. Defaults to
            2000, the value that was previously hard-coded.

    Returns:
        DataFrame with ``subject_id``, ``hadm_id`` and ``icd_code`` (as text),
        sorted by patient and admission, de-duplicated and re-indexed from 0.

    Raises:
        ValueError: If ``top_k`` is negative.
    """
    if top_k < 0:
        raise ValueError('top_k must be non-negative, got %r' % (top_k,))

    # Read the codes as text: with dtype inference an all-digit chunk is parsed
    # as integers and codes such as '0413' lose their leading zero.
    diag_pd = pd.read_csv(diag_file, dtype={'icd_code': str})
    diag_pd = (
        diag_pd.dropna()
        # NOTE(review): dropping icd_version pools codes of every version into
        # one code space -- confirm that is acceptable downstream.
        .drop(columns=['seq_num', 'icd_version'])
        .drop_duplicates()
        .sort_values(by=['subject_id', 'hadm_id'])
        .reset_index(drop=True)
    )

    code_count = (
        diag_pd.groupby(by=['icd_code']).size()
        .reset_index(name='count')
        .sort_values(by=['count'], ascending=False)
    )
    frequent_codes = code_count['icd_code'].head(top_k)

    return diag_pd[diag_pd['icd_code'].isin(frequent_codes)].reset_index(drop=True)


##### process procedure #####
def procedure_process(procedure_file, top_k=1500):
    """Load the procedures table and keep the most frequent procedure codes.

    Args:
        procedure_file: Path (or file-like object) handed to ``pd.read_csv``.
            Must contain ``subject_id``, ``hadm_id``, ``seq_num``,
            ``chartdate``, ``icd_code`` and ``icd_version``.
        top_k: Number of most frequent ``icd_code`` values to keep. Defaults to
            1500, the value that was previously hard-coded.

    Returns:
        DataFrame with ``subject_id``, ``hadm_id`` and ``icd_code`` (as text),
        ordered by patient, admission and original ``seq_num``, de-duplicated
        and re-indexed from 0.

    Raises:
        ValueError: If ``top_k`` is negative.
    """
    if top_k < 0:
        raise ValueError('top_k must be non-negative, got %r' % (top_k,))

    # The dtype used to be declared for 'ICD9_CODE', but every later step reads
    # 'icd_code', so the codes were still type-inferred and all-digit codes
    # could lose leading zeros. Read the real column as text instead.
    proc_pd = pd.read_csv(procedure_file, dtype={'icd_code': str})
    proc_pd = (
        proc_pd.drop(columns=['icd_version', 'chartdate'])
        .drop_duplicates()
        # seq_num is only needed to fix the order of codes within an admission.
        .sort_values(by=['subject_id', 'hadm_id', 'seq_num'])
        .drop(columns=['seq_num'])
        .drop_duplicates()
        .reset_index(drop=True)
    )

    code_count = (
        proc_pd.groupby(by=['icd_code']).size()
        .reset_index(name='count')
        .sort_values(by=['count'], ascending=False)
    )
    frequent_codes = code_count['icd_code'].head(top_k)

    return proc_pd[proc_pd['icd_code'].isin(frequent_codes)].reset_index(drop=True)

def match_symptoms(search):
    """Extract the ``<c>...</c>`` phrases that are known symptoms.

    Args:
        search: Iterable of strings in which candidate phrases are wrapped in
            ``<c>``/``</c>`` markers.

    Returns:
        List of distinct phrases (original casing) whose lower-cased form is in
        the notebook-level ``symptoms_list``. The order is unspecified.
    """
    tag_pattern = r'<c>(.*?)<\/c>'
    hits = [
        phrase
        for item in search
        for phrase in re.findall(tag_pattern, item, flags=0)
        if phrase.lower() in symptoms_list
    ]
    return list(set(hits))


# Chunk grammar applied to POS-tagged sentences; only 'NP' chunks are marked.
_CHUNK_GRAMMAR = """NP: {<V.*>+(<RP?><NN>)?}
                    NP: {(<NN.*><DT>)?(<NN.*><IN>)?<NN.*>?<JJ.>*<NN.*>+}
                    NP: {<V.*>}
                    ENTITY: {<NN.*>}"""

# Lazily built parser shared by every call (see _chunk_parser).
_CHUNK_PARSER = None


def _chunk_parser():
    """Return the shared RegexpParser, building it on first use."""
    global _CHUNK_PARSER
    if _CHUNK_PARSER is None:
        _CHUNK_PARSER = nltk.RegexpParser(_CHUNK_GRAMMAR)
    return _CHUNK_PARSER


def _tag_noun_phrases(tagged_sentence, parser):
    """Re-join a POS-tagged sentence, wrapping each NP chunk in <c>...</c>."""
    tree = parser.parse(tagged_sentence)
    marked = ' '.join(word for word, pos in tagged_sentence)
    for subtree in tree.subtrees():
        if subtree.label() == 'NP':
            phrase = ' '.join(word for word, pos in subtree.leaves())
            # Only the first occurrence is wrapped.
            # NOTE(review): that first occurrence may lie inside text that an
            # earlier chunk already wrapped -- behaviour kept as it was.
            marked = marked.replace(phrase, "<c>" + phrase + "</c>", 1)
    return marked


def main_text(full_text):
    """Split text into sentences and mark the chunked phrases of each one.

    Args:
        full_text: Raw text. Double quotes are removed before tokenising.

    Returns:
        List with one string per sentence: the space-joined tokens of that
        sentence, with every 'NP' chunk wrapped in ``<c>``/``</c>``.
    """
    cleaned = full_text.replace('"', '')

    # The parser used to be rebuilt from the grammar for every sentence; it only
    # depends on the constant grammar, so it is built once and reused.
    parser = _chunk_parser()

    chunk_sent = []
    for raw_sentence in nltk.sent_tokenize(cleaned):
        tagged = nltk.pos_tag(nltk.word_tokenize(raw_sentence))
        chunk_sent.append(_tag_noun_phrases(tagged, parser))
    return chunk_sent

def symptoms_tagger(x):
    """Return the distinct known symptoms mentioned in the text ``x``.

    The text is chunk-marked by ``main_text`` and the marked phrases are
    filtered by ``match_symptoms``. The order of the result is unspecified.
    """
    marked_sentences = main_text(x)
    return list(set(match_symptoms(marked_sentences)))


def text_to_symptom(text):
    """Collect the lower-cased symptoms found anywhere in a multi-line text.

    Args:
        text: Text whose lines (split on ``'\\n'``) are tagged one at a time
            with ``symptoms_tagger``.

    Returns:
        List of distinct lower-cased symptom strings, in unspecified order.
    """
    found = []
    for line in text.split('\n'):
        found += symptoms_tagger(line)
    return list(set([symptom.lower() for symptom in found]))

###### combine three tables #####
def combine_process(med_pd, diag_pd, proc_pd):
    """Join medications, diagnoses and procedures into one row per admission.

    Only admissions (``subject_id``, ``hadm_id``) present in all three tables
    are kept.

    Args:
        med_pd: DataFrame with ``subject_id``, ``hadm_id`` and ``ATC4``.
        diag_pd: DataFrame with ``subject_id``, ``hadm_id`` and ``icd_code``.
        proc_pd: DataFrame with ``subject_id``, ``hadm_id`` and ``icd_code``.

    Returns:
        DataFrame with columns ``subject_id``, ``hadm_id``, ``icd_code`` (array
        of distinct diagnosis codes), ``ATC4`` (list), ``PRO_CODE`` (list) and
        ``ndc_Len`` (number of distinct ATC4 codes of the admission).
    """
    keys = ['subject_id', 'hadm_id']

    # Admissions that appear in every one of the three tables.
    shared = med_pd[keys].drop_duplicates()
    for other in (diag_pd, proc_pd):
        shared = shared.merge(other[keys].drop_duplicates(), on=keys, how='inner')

    diag_kept = diag_pd.merge(shared, on=keys, how='inner')
    med_kept = med_pd.merge(shared, on=keys, how='inner')
    proc_kept = proc_pd.merge(shared, on=keys, how='inner')

    # Collapse each table to one row per admission holding its distinct codes.
    diag_grouped = diag_kept.groupby(by=keys)['icd_code'].unique().reset_index()
    med_grouped = med_kept.groupby(by=keys)['ATC4'].unique().reset_index()
    proc_grouped = (
        proc_kept.groupby(by=keys)['icd_code'].unique().reset_index()
        .rename(columns={'icd_code': 'PRO_CODE'})
    )
    med_grouped['ATC4'] = med_grouped['ATC4'].map(list)
    proc_grouped['PRO_CODE'] = proc_grouped['PRO_CODE'].map(list)

    data = diag_grouped.merge(med_grouped, on=keys, how='inner')
    data = data.merge(proc_grouped, on=keys, how='inner')
    data['ndc_Len'] = data['ATC4'].map(len)

    return data

In [21]:
def statistics(data):
    """Print summary statistics of the combined admission table.

    Args:
        data: DataFrame with ``subject_id`` and the list-valued columns
            ``icd_code``, ``ATC4``, ``PRO_CODE`` and ``SYM_LIST`` (one row per
            admission).

    Prints the number of patients and admissions, the vocabulary size of each
    code type, and per-code-type averages and maxima.

    NOTE(review): the "avg" lines divide the summed number of distinct codes
    *per patient* by the number of *admissions*, while the "max" lines are per
    patient. This is kept as it was; confirm it is the intended definition.
    """
    patient_ids = data['subject_id'].unique()
    print('#patients ', patient_ids.shape)
    print('#clinical events ', len(data))

    code_columns = ('icd_code', 'ATC4', 'PRO_CODE', 'SYM_LIST')

    unique_diag, unique_med, unique_pro, unique_sym = (
        {code for codes in data[column].values for code in list(codes)}
        for column in code_columns
    )

    print('#diagnosis ', len(unique_diag))
    print('#med ', len(unique_med))
    print('#procedure', len(unique_pro))
    print('#symptoms', len(unique_sym))

    totals = [0, 0, 0, 0]   # summed distinct codes per patient, per code type
    maxima = [0, 0, 0, 0]   # largest distinct-code count of any patient
    cnt = 0                 # admissions seen
    max_visit = 0

    # One groupby pass instead of re-filtering the whole frame for every
    # patient (which was quadratic in practice).
    for _, visits in data.groupby('subject_id', sort=False):
        visit_cnt = len(visits)
        cnt += visit_cnt
        max_visit = max(max_visit, visit_cnt)
        for pos, column in enumerate(code_columns):
            seen = set()
            for codes in visits[column]:
                seen.update(codes)
            totals[pos] += len(seen)
            maxima[pos] = max(maxima[pos], len(seen))

    # max(..., 1) keeps an empty table from raising ZeroDivisionError; every
    # numerator is 0 in that case, so the averages print as 0.0.
    visit_denominator = max(cnt, 1)
    patient_denominator = max(len(patient_ids), 1)

    avg_diag, avg_med, avg_pro, avg_sym = totals
    max_diag, max_med, max_pro, max_sym = maxima

    print('#avg of diagnoses ', avg_diag / visit_denominator)
    print('#avg of medicines ', avg_med / visit_denominator)
    print('#avg of procedures ', avg_pro / visit_denominator)
    print('#avg of symptoms ', avg_sym / visit_denominator)
    print('#avg of vists ', cnt / patient_denominator)

    print('#max of diagnoses ', max_diag)
    print('#max of medicines ', max_med)
    print('#max of procedures ', max_pro)
    print('#max of symptoms ', max_sym)
    print('#max of visit ', max_visit)

In [4]:
def get_ddi_matrix(records, med_voc, ddi_file):
    """Build and save the symmetric drug-drug-interaction adjacency matrix.

    Args:
        records: Unused; kept so existing calls keep working.
        med_voc: Vocabulary object exposing ``idx2word`` and ``word2idx`` for
            the medication codes.
        ddi_file: CSV with the columns ``STITCH 1``, ``STITCH 2``,
            ``Polypharmacy Side Effect`` and ``Side Effect Name``.

    Also reads the notebook-level ``cid_atc`` file (one line per CID: the CID
    followed by comma-separated ATC codes) and writes the matrix to
    ``./outputs/ddi_A_final.pkl``.

    Returns:
        ``numpy`` array of shape (vocabulary size, vocabulary size) holding 1
        where two distinct medication codes share a selected interaction.
    """
    TOPK = 40  # number of side-effect types kept (see the note below)
    med_voc_size = len(med_voc.idx2word)
    med_unique_word = [med_voc.idx2word[i] for i in range(med_voc_size)]

    # 4-character prefix -> vocabulary entries starting with that prefix.
    atc3_atc4_dic = defaultdict(set)
    for item in med_unique_word:
        atc3_atc4_dic[item[:4]].add(item)

    # CID -> prefixes (restricted to those present in the vocabulary).
    cid2atc_dic = defaultdict(set)
    with open(cid_atc, 'r') as f:
        for line in f:
            # rstrip instead of line[:-1]: a final line without a trailing
            # newline must not lose its last character.
            line_ls = line.rstrip('\n').split(',')
            cid = line_ls[0]
            for atc in line_ls[1:]:
                prefix = atc[:4]
                # Membership test avoids inserting empty sets into the
                # defaultdict for every unknown prefix.
                if prefix in atc3_atc4_dic:
                    cid2atc_dic[cid].add(prefix)

    # ddi load
    ddi_df = pd.read_csv(ddi_file)
    ddi_most_pd = (
        ddi_df.groupby(by=['Polypharmacy Side Effect', 'Side Effect Name']).size()
        .reset_index(name='count')
        .sort_values(by=['count'], ascending=False)
        .reset_index(drop=True)
    )
    # NOTE(review): with a descending sort, iloc[-TOPK:] keeps the TOPK *least*
    # frequent side-effect types -- kept as is, confirm it is intended.
    ddi_most_pd = ddi_most_pd.iloc[-TOPK:, :]
    fliter_ddi_df = ddi_df.merge(ddi_most_pd[['Side Effect Name']], how='inner', on=['Side Effect Name'])
    ddi_df = fliter_ddi_df[['STITCH 1', 'STITCH 2']].drop_duplicates().reset_index(drop=True)

    word2idx = med_voc.word2idx
    ddi_adj = np.zeros((med_voc_size, med_voc_size))
    for cid1, cid2 in zip(ddi_df['STITCH 1'], ddi_df['STITCH 2']):
        # .get keeps unknown CIDs from being added to the defaultdict.
        for atc_i in cid2atc_dic.get(cid1, ()):
            for atc_j in cid2atc_dic.get(cid2, ()):
                for word_i in atc3_atc4_dic[atc_i]:
                    for word_j in atc3_atc4_dic[atc_j]:
                        idx_i, idx_j = word2idx[word_i], word2idx[word_j]
                        if idx_i != idx_j:
                            ddi_adj[idx_i, idx_j] = 1
                            ddi_adj[idx_j, idx_i] = 1

    # Context manager so the output file is flushed and closed.
    with open('./outputs/ddi_A_final.pkl', 'wb') as out_file:
        dill.dump(ddi_adj, out_file)

    return ddi_adj

def get_ddi_maskH(med_structure_file, med_voc):
    """Build and save the medication-by-substructure mask.

    Args:
        med_structure_file: Pickle (dill) file holding a mapping from a
            medication code to an iterable of SMILES strings.
        med_voc: Vocabulary object exposing ``idx2word``.

    Writes the 0/1 matrix (one row per vocabulary entry, one column per
    distinct BRICS fragment) to ``./outputs/ddi_mask_H.pkl`` and the fragment
    list to ``./outputs/substructure_smiles.pkl``. Returns ``None``.

    Requires ``Chem`` and ``BRICS`` (RDKit) to be in scope.
    """
    # Resolve the RDKit entry points outside the try block: previously a
    # missing import raised NameError inside the bare ``except`` and was
    # swallowed, silently producing an empty mask.
    decompose = BRICS.BRICSDecompose
    mol_from_smiles = Chem.MolFromSmiles

    with open(med_structure_file, 'rb') as in_file:
        NDCList = dill.load(in_file)

    fraction = []
    for k, v in med_voc.idx2word.items():
        tempF = set()

        # .get: vocabulary entries without structures (e.g. the "MASK" token
        # added by create_str_token_mapping) get an all-zero row instead of a
        # KeyError.
        for SMILES in NDCList.get(v, ()):
            try:
                tempF.update(decompose(mol_from_smiles(SMILES)))
            except Exception:
                # Best effort: skip SMILES strings RDKit cannot decompose.
                continue

        fraction.append(tempF)

    fracSet = []
    for fragments in fraction:
        fracSet += fragments
    fracSet = list(set(fracSet))

    # Column lookup table instead of list.index() inside the loop.
    column_of = {frac: col for col, frac in enumerate(fracSet)}

    ddi_matrix = np.zeros((len(med_voc.idx2word), len(fracSet)))
    for i, fracList in enumerate(fraction):
        for frac in fracList:
            ddi_matrix[i, column_of[frac]] = 1

    with open('./outputs/ddi_mask_H.pkl', 'wb') as out_file:
        dill.dump(ddi_matrix, out_file)
    with open('./outputs/substructure_smiles.pkl', 'wb') as out_file:
        dill.dump(fracSet, out_file)

In [29]:
##### indexing file and final record
class Voc(object):
    """Two-way mapping between tokens and consecutive integer ids."""

    def __init__(self):
        self.idx2word = {}
        self.word2idx = {}

    def add_sentence(self, sentence):
        """Register every unseen token of ``sentence`` under the next free id."""
        for word in sentence:
            if word in self.word2idx:
                continue
            next_idx = len(self.word2idx)
            self.idx2word[next_idx] = word
            self.word2idx[word] = next_idx
                
def create_str_token_mapping(df):
    """Build the four code vocabularies from the combined table and save them.

    Args:
        df: DataFrame with the list-valued columns ``icd_code``, ``PRO_CODE``,
            ``SYM_LIST`` and ``ATC4``.

    Each vocabulary receives the token ``"MASK"`` before the codes of the first
    row, so ``"MASK"`` gets id 0 whenever ``df`` is non-empty. The vocabularies
    are written to ``./outputs/voc_final.pkl``.

    Returns:
        Tuple ``(diag_voc, pro_voc, sym_voc, med_voc)`` -- note that this order
        differs from the parameter order of ``create_patient_record``.
    """
    diag_voc = Voc()
    med_voc = Voc()
    pro_voc = Voc()
    sym_voc = Voc()

    # The vocabularies are independent, so each column is walked on its own
    # (plain column iteration is much cheaper than DataFrame.iterrows).
    for voc, column in ((diag_voc, 'icd_code'), (pro_voc, 'PRO_CODE'),
                        (sym_voc, 'SYM_LIST'), (med_voc, 'ATC4')):
        for codes in df[column]:
            voc.add_sentence(["MASK"])
            voc.add_sentence(codes)

    # Context manager so the output file is flushed and closed.
    with open('./outputs/voc_final.pkl', 'wb') as out_file:
        dill.dump(obj={'diag_voc': diag_voc, 'med_voc': med_voc, 'pro_voc': pro_voc, 'sym_voc': sym_voc},
                  file=out_file)
    return diag_voc, pro_voc, sym_voc, med_voc


# create final records
def create_patient_record(df, diag_voc, med_voc, pro_voc, sym_voc):
    """Encode every admission as integer ids, grouped by patient, and save it.

    Args:
        df: DataFrame with ``subject_id`` and the list-valued columns
            ``icd_code``, ``PRO_CODE``, ``SYM_LIST`` and ``ATC4``.
        diag_voc, med_voc, pro_voc, sym_voc: Vocabularies exposing ``word2idx``
            for the respective code type.

    Returns:
        List with one entry per patient (in order of first appearance in
        ``df``). Each patient is a list of admissions and each admission is
        ``[diagnosis ids, procedure ids, symptom ids, medication ids]``.
        The same structure is written to ``./outputs/records_final.pkl``.
    """
    records = []  # (patient, admission, code_kind:4, ids)  kinds: diag, proc, sym, med
    # sort=False keeps patients in order of first appearance, matching the
    # previous ``unique()`` loop without re-filtering the frame per patient.
    # Rows with a missing subject_id are skipped by groupby.
    for _, visits in df.groupby('subject_id', sort=False):
        patient = []
        for diag, proc, sym, med in zip(visits['icd_code'], visits['PRO_CODE'],
                                        visits['SYM_LIST'], visits['ATC4']):
            patient.append([
                [diag_voc.word2idx[code] for code in diag],
                [pro_voc.word2idx[code] for code in proc],
                [sym_voc.word2idx[code] for code in sym],
                [med_voc.word2idx[code] for code in med],
            ])
        records.append(patient)

    # Context manager so the output file is flushed and closed.
    with open('./outputs/records_final.pkl', 'wb') as out_file:
        dill.dump(obj=records, file=out_file)
    return records

In [6]:
# Directory holding the raw MIMIC-IV CSV exports; change this single value to
# run the notebook on another machine.
MIMIC_DIR = '/home/heyichen/dataset/mimiciv'

med_file = MIMIC_DIR + '/prescriptions.csv'
diag_file = MIMIC_DIR + '/diagnoses_icd.csv'
procedure_file = MIMIC_DIR + '/procedures_icd.csv'
# profile_file = '/home/heyichen/dataset/mimiciv//ADMISSIONS.csv'
text_file = MIMIC_DIR + '/discharge.csv'

# auxiliary inputs shipped with the project
symptom_file = './inputs/symptoms_list.pkl'
med_structure_file = './inputs/idx2drug.pkl'

# drug code mapping files
ndc2atc_file = './inputs/ndc2atc_level4.csv'
cid_atc = './inputs/drug-atc.csv'
ndc_rxnorm_file = './inputs/ndc2rxnorm_mapping.txt'

# ddi information
ddi_file = './inputs/drug-DDI.csv'

# output files
ddi_adjacency_file = "./outputs/ddi_A_final.pkl"
ehr_adjacency_file = "./outputs/ehr_adj_final.pkl"
ehr_sequence_file = "./outputs/records_final.pkl"
vocabulary_file = "./outputs/voc_final.pkl"
ddi_mask_H_file = "./outputs/ddi_mask_H.pkl"


In [7]:
# Load and clean the prescriptions, then keep only the rows of patients that
# have more than one distinct admission in the prescriptions table.
# NOTE(review): this filter runs before the tables are combined, so later inner
# joins may still leave some patients with a single admission.
med_pd = med_process(med_file)
med_pd_lg2 = process_visit_lg2(med_pd).reset_index(drop=True)
med_pd = med_pd.merge(med_pd_lg2[['subject_id']], on='subject_id', how='inner').reset_index(drop=True)

  med_pd = pd.read_csv(med_file, dtype={'ndc': 'category'})


In [8]:
# Map NDC codes to ATC4, keep only the codes present in the drug-structure
# mapping, then restrict to the most frequent codes.
med_pd = ndc2atc4(med_pd)
# Context manager so the pickle file handle is closed after loading.
with open(med_structure_file, 'rb') as structure_fh:
    NDCList = dill.load(structure_fh)
med_pd = med_pd[med_pd.ATC4.isin(list(NDCList.keys()))]
med_pd = filter_300_most_med(med_pd)

In [9]:
# Display the mapped medication table (pandas truncates the rendered rows).
med_pd

Unnamed: 0,subject_id,hadm_id,starttime,ATC4
0,10000032,22595853,2180-05-07 00:00:00,B01A
1,10000032,22841357,2180-06-26 22:00:00,B01A
2,10000032,25742920,2180-08-06 03:00:00,B01A
3,10000032,29079034,2180-07-23 15:00:00,B01A
4,10000117,22927623,2181-11-15 13:00:00,B01A
...,...,...,...,...
5472967,19692720,24810109,2160-02-24 20:00:00,N03A
5472968,19692720,22961484,2159-10-20 08:00:00,N03A
5472969,19692720,22961484,2159-10-20 22:00:00,N03A
5472970,19837286,22242249,2118-05-07 01:00:00,H01B


In [10]:
# Load the diagnoses restricted to the most frequent codes, then display them.
diag_pd = diag_process(diag_file)
diag_pd

Unnamed: 0,subject_id,hadm_id,icd_code
0,10000032,22595853,5723
1,10000032,22595853,78959
2,10000032,22595853,5715
3,10000032,22595853,07070
4,10000032,22595853,496
...,...,...,...
4141879,19999987,23865745,41401
4141880,19999987,23865745,78039
4141881,19999987,23865745,0413
4141882,19999987,23865745,36846


In [11]:
# Load the procedures restricted to the most frequent codes, then display them.
pro_pd = procedure_process(procedure_file)
pro_pd

Unnamed: 0,subject_id,hadm_id,icd_code
0,10000032,22595853,5491
1,10000032,22841357,5491
2,10000032,25742920,5491
3,10000068,25022803,8938
4,10000117,27988844,0QS734Z
...,...,...,...
570237,19999840,21033226,0331
570238,19999840,26071774,8891
570239,19999840,26071774,8841
570240,19999987,23865745,8841


In [12]:
# One row per admission that has medications, diagnoses and procedures.
data = combine_process(med_pd, diag_pd, pro_pd)

In [13]:
# Display the combined admission table.
data

Unnamed: 0,subject_id,hadm_id,icd_code,ATC4,PRO_CODE,ndc_Len
0,10000032,22595853,"[5723, 78959, 5715, 07070, 496, 29680, 30981, ...","[B01A, N02B, J05A, A12B, N07B, C03C, R01A]",[5491],7
1,10000032,22841357,"[78959, 2875, 2761, 496, 5715, V08, 3051]","[B01A, N02B, J05A, C03C, A07A, A02A, J01E]",[5491],7
2,10000032,25742920,"[07054, 78959, V462, 5715, 2767, 2761, 496, V0...","[B01A, J05A, C03C, A07A, A02A, D07A, B05C, N06...",[5491],9
3,10000117,27988844,"[W010XXA, K219, E7800, I341, G43909, Z87891, Z...","[N02B, A06A, A12A, A12C]",[0QS734Z],4
4,10000826,20032235,"[5712, 486, 78959, 5723, 5990, 2639, 2761, 511...","[B01A, N07B, C03C, A06A, A12C, N02A, N05B, A11...",[5491],15
...,...,...,...,...,...,...
156805,19999784,29956342,"[Z5111, E876, Z87891, Z8619]","[N02A, N05A, B05C, A02A, A06A]",[3E04305],5
156806,19999828,25744818,"[K632, Y838, Y929, I10, J449, Z794, Z87891, Z9...","[A12A, A12C, A07A, A02B, A12B, A06A, B05C]","[0WPF0JZ, 05HY33Z]",7
156807,19999828,29734428,"[T8131XA, K632, D6851, N390, Y838, Y92018, I48...","[B01A, B05C, A06A, N02B, A12A, A12C, D04A, A10...","[0HBJXZZ, 0HBHXZZ, 02HV33Z, 3E0436Z]",11
156808,19999840,21033226,"[3453, 51881, 5070, 5180, 42741, 43811, 4019, ...","[B01A, A06A, A12A, A12C, A12B, N02B, J01D, C10...","[9604, 9672, 966, 0331]",19


In [14]:
# Load the symptom lexicon (lower-cased, de-duplicated) and the clinical notes.
# Context manager so the pickle file handle is closed after loading.
with open(symptom_file, 'rb') as symptom_fh:
    symptoms_list = dill.load(symptom_fh)
symptoms_list = list(set([sym.lower() for sym in symptoms_list]))
notes = pd.read_csv(text_file, usecols=['subject_id','hadm_id','note_type','text'])

In [15]:
# Keep the notes of type 'DS' (presumably discharge summaries -- confirm) and
# concatenate all notes of one admission into a single text.
notes1 = notes[notes.note_type=='DS'].sort_values(by=['subject_id', 'hadm_id'])
notes2 = notes1.groupby(by=['subject_id','hadm_id'])['text'].apply('\n'.join).reset_index()
# Inner join: admissions without such a note are dropped here.
data1 = data.merge(notes2, on=['subject_id', 'hadm_id'], how='inner')
# Long-running step: every line of every note is tokenised, POS-tagged and
# chunked (the recorded run below took about eight hours).
data1['SYM_LIST'] = data1['text'].progress_apply(text_to_symptom)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 139056/139056 [8:07:10<00:00,  4.76it/s]


In [16]:
# Keep only the admissions for which at least one symptom was found and drop
# the raw note text from the final table.
data1['SYM_len'] = data1['SYM_LIST'].map(len)
has_symptoms = data1['SYM_len'] > 0
data = data1[has_symptoms].reset_index(drop=True).drop(columns=['text'])
print('complete combining')

complete combining


In [17]:
# Display the final admission table including the extracted symptoms.
data

Unnamed: 0,subject_id,hadm_id,icd_code,ATC4,PRO_CODE,ndc_Len,SYM_LIST,SYM_len
0,10000032,22595853,"[5723, 78959, 5715, 07070, 496, 29680, 30981, ...","[B01A, N02B, J05A, A12B, N07B, C03C, R01A]",[5491],7,"[sob, bleeding, hematuria, stomach pain, disco...",13
1,10000032,22841357,"[78959, 2875, 2761, 496, 5715, V08, 3051]","[B01A, N02B, J05A, C03C, A07A, A02A, J01E]",[5491],7,"[nausea, sob, bleeding, fever, hallucinations,...",12
2,10000032,25742920,"[07054, 78959, V462, 5715, 2767, 2761, 496, V0...","[B01A, J05A, C03C, A07A, A02A, D07A, B05C, N06...",[5491],9,"[cough, fever, wheezing, shortness of breath, ...",12
3,10000117,27988844,"[W010XXA, K219, E7800, I341, G43909, Z87891, Z...","[N02B, A06A, A12A, A12C]",[0QS734Z],4,"[weakness, ecchymosis, constipation, pain]",4
4,10000826,20032235,"[5712, 486, 78959, 5723, 5990, 2639, 2761, 511...","[B01A, N07B, C03C, A06A, A12C, N02A, N05B, A11...",[5491],15,"[constipation, flatulence, tremor, fever, back...",7
...,...,...,...,...,...,...,...,...
138819,19999784,29956342,"[Z5111, E876, Z87891, Z8619]","[N02A, N05A, B05C, A02A, A06A]",[3E04305],5,"[nausea, sob, fatigue, vomiting, chest pain, d...",10
138820,19999828,25744818,"[K632, Y838, Y929, I10, J449, Z794, Z87891, Z9...","[A12A, A12C, A07A, A02B, A12B, A06A, B05C]","[0WPF0JZ, 05HY33Z]",7,"[diarrhea, pain]",2
138821,19999828,29734428,"[T8131XA, K632, D6851, N390, Y838, Y92018, I48...","[B01A, B05C, A06A, N02B, A12A, A12C, D04A, A10...","[0HBJXZZ, 0HBHXZZ, 02HV33Z, 3E0436Z]",11,"[nausea, sob, constipation, cough, vomiting, s...",11
138822,19999840,21033226,"[3453, 51881, 5070, 5180, 42741, 43811, 4019, ...","[B01A, A06A, A12A, A12C, A12B, N02B, J01D, C10...","[9604, 9672, 966, 0331]",19,"[sob, constipation, respiratory distress, coug...",10


In [18]:
# Persist the final table so the slow symptom extraction need not be repeated.
data.to_pickle('./outputs/data_final.pkl')

In [22]:
# Print the summary statistics of the final table.
statistics(data)

#patients  (54845,)
#clinical events  138824
#diagnosis  2000
#med  131
#procedure 1500
#symptoms 564
#avg of diagnoses  8.88785080389558
#avg of medicines  7.038667665533337
#avg of procedures  2.1951463723851785
#avg of symptoms  7.222209416239267
#avg of vists  2.5312061263560945
#max of diagnoses  268
#max of medicines  72
#max of procedures  63
#max of symptoms  118
#max of visit  69


In [27]:
# Build and save the vocabularies; the return order is diag, pro, sym, med.
diag_voc, pro_voc, sym_voc, med_voc = create_str_token_mapping(data)

In [30]:
# Encode every admission as vocabulary ids; note the parameter order here is
# diag, med, pro, sym (different from the return order of the cell above).
all_records = create_patient_record(data, diag_voc, med_voc, pro_voc, sym_voc)

In [32]:
# Build and save the drug-drug-interaction adjacency matrix for the medication
# vocabulary (all_records is passed but not used by get_ddi_matrix).
ddi_adj = get_ddi_matrix(all_records, med_voc, ddi_file)