In [None]:
import pandas as pd
import numpy as np

In [None]:
# Root folder holding every input and output of this notebook.
# The trailing slash matters: all other paths are built by appending to it.
path = "/content/drive/MyDrive/MedicalData/"

# --- EHR tables ---
med_file = f"{path}MIMIC-IV/prescriptions_ndc.csv"
diag_file = f"{path}MIMIC-IV/diagnoses_icd.csv"
proc_file = f"{path}MIMIC-IV/procedures_icd.csv"
patients_file = f"{path}MIMIC-IV/patients.csv"

# --- Per-admission tables ---
diag_hadm_file = f"{path}MIMIC-IV/hadm_diagnoses.csv"
med_hadm_file = f"{path}MIMIC-IV/hadm_prescriptions.csv"

# Admission table providing the hadm_id -> hadm_code mapping.
adm_hadm_file = f"{path}hadm_code.csv"

# Drug-drug interaction table.
ddi_file = f"{path}drug-DDI.csv"

# --- ICD code dictionaries ---
diag_icd_file = f"{path}MIMIC-IV/d_icd_diagnoses.csv"
proc_icd_file = f"{path}MIMIC-IV/d_icd_procedures.csv"

# --- Drug code mappings ---
ndc2atc_file = f"{path}Mappings/ndc2atc_level4.csv"
ndc2rxnorm_file = f"{path}Mappings/ndc2rxnorm_mapping.txt"
cid2atc_file = f"{path}Mappings/drug-atc-processed.csv"
drugmap_file = f"{path}Mappings/drug_codes_mapping.csv"
drugmap_atc_file = f"{path}Mappings/drug_codes_mapping_atc.csv"
atc4to3_file = f"{path}Mappings/atc4to3_map.csv"

### **Prescriptions**

In [None]:
def process_med():
    """Build the admission/medication tables at two ATC granularities.

    Reads the prescription table (``med_file``), attaches the columns of the
    admission table (``adm_hadm_file``) via ``hadm_id`` and the drug indices of
    ``drugmap_atc_file`` via ``ndc``, then writes
    ``Triplets/med_triplets.csv`` and ``Triplets/med_triplets_atc3.csv``
    below ``path``.

    Returns:
        tuple: ``(med_pd, med_pd_atc3)``. Both frames are cast to ``int``;
        ``med_pd`` keeps the ``atc_idx`` column, while in ``med_pd_atc3`` the
        ``atc3_idx`` column has been renamed to ``atc_idx``.
    """
    # process prescription dataset
    med_pd = pd.read_csv(med_file, dtype={'ndc': 'category'})
    # Rows whose NDC is the literal string '0' are discarded
    # (presumably a placeholder for "no code" -- TODO confirm).
    med_pd = med_pd.drop(index=med_pd[med_pd['ndc'] == '0'].index)
    # `ffill()` replaces the deprecated `fillna(method='pad')`; same result.
    # NOTE(review): forward filling copies the previous row's value into a
    # missing cell -- confirm this is wanted for prescription rows.
    med_pd = med_pd.ffill().dropna().drop_duplicates().reset_index(drop=True)

    # attach the columns of the admission table (including hadm_code) by hadm_id
    hadm_code_pd = pd.read_csv(adm_hadm_file)
    med_pd = med_pd.merge(hadm_code_pd, how='left', on='hadm_id')

    # convert ndc medical data in prescriptions to atc5 format
    def ndc2atc5(prescriptions):
        """Join the drug mapping on ``ndc`` and de-duplicate per admission."""
        drugmap_pd = pd.read_csv(drugmap_atc_file, dtype={'ndc': 'category'})
        drugmap_pd = drugmap_pd.drop(
            columns=['ndc10', 'rxcui', 'atc4', 'atc4_name', 'in_name', 'atc5', 'atc3']
        )

        mapped = prescriptions.merge(drugmap_pd, how='left', on='ndc')

        # Rows without a mapping (or with any other missing value) are dropped.
        mapped = mapped.dropna()
        mapped = mapped.drop_duplicates(subset=["hadm_id", "ndc"], keep="first")
        mapped = mapped.drop_duplicates(subset=["hadm_id", "atc_idx"], keep="first")

        return mapped.reset_index(drop=True)

    med_pd = ndc2atc5(med_pd)

    # ATC3 view: drop the ATC5 index, keep one row per (hadm_code, atc3_idx).
    med_pd_atc3 = med_pd.drop(columns=['subject_id', 'hadm_id', 'ndc', 'ndc_idx', 'atc_idx'])
    med_pd_atc3 = med_pd_atc3.drop_duplicates(subset=['hadm_code', 'atc3_idx']).reset_index(drop=True)
    med_pd_atc3 = med_pd_atc3.rename(columns={'atc3_idx': 'atc_idx'})

    # ATC5 view: drop the ATC3 index instead.
    med_pd = med_pd.drop(columns=['subject_id', 'hadm_id', 'ndc', 'ndc_idx', 'atc3_idx'])

    med_pd = med_pd.astype(int)
    med_pd_atc3 = med_pd_atc3.astype(int)

    med_pd.to_csv(path + 'Triplets/med_triplets.csv', index=False)
    med_pd_atc3.to_csv(path + 'Triplets/med_triplets_atc3.csv', index=False)

    return med_pd, med_pd_atc3

med_pd, med_pd_atc3 = process_med()

### **Diagnoses**

In [None]:
# process icd diagnoses codes for encoding
# run only once preferably
# icd_code is read as text so that type inference can never turn a numeric-looking
# code into a number (which would drop leading zeros and break the string
# concatenation below).
diag_icd_pd = pd.read_csv(diag_icd_file, dtype={'icd_code': str})
# `ffill` replaces the deprecated `fillna(method='pad')`; the result is the same.
diag_icd_pd.ffill(inplace=True)
diag_icd_pd.dropna(inplace=True)
diag_icd_pd.drop_duplicates(inplace=True)

# "<version>-<code>" is the join key used when encoding the diagnoses table.
diag_icd_pd['icd'] = diag_icd_pd['icd_version'].astype(str) + "-" + diag_icd_pd['icd_code']
# Consecutive integer id for every remaining dictionary row.
diag_icd_pd['icd_idx'] = range(len(diag_icd_pd))

diag_icd_pd.to_csv(path+'icd_diag_codes.csv', index=False)

In [None]:
def process_diag():
    """Encode the diagnoses table as integer (admission, diagnosis index) rows.

    Reads ``diag_file``, attaches the columns of the admission table
    (``adm_hadm_file``) via ``hadm_id``, maps every ``"<icd_version>-<icd_code>"``
    key to the ``icd_idx`` stored in ``icd_diag_codes.csv`` and writes the
    result to ``Triplets/diag_triplets.csv`` below ``path``.

    Returns:
        pandas.DataFrame: the remaining columns cast to ``int``; rows whose
        code has no entry in the dictionary are dropped.
    """
    # process diagnoses dataset
    # icd_code is forced to text so numeric-looking codes keep their leading zeros.
    diag_pd = pd.read_csv(diag_file, dtype={'icd_version': 'category', 'icd_code': str})
    # `ffill()` replaces the deprecated `fillna(method='pad')`; same result.
    diag_pd = diag_pd.ffill().dropna().drop_duplicates().reset_index(drop=True)

    # attach the columns of the admission table (including hadm_code) by hadm_id
    hadm_code_pd = pd.read_csv(adm_hadm_file)
    diag_pd = diag_pd.merge(hadm_code_pd, how='left', on='hadm_id')

    # Same "<version>-<code>" key format as the 'icd' column of icd_diag_codes.csv.
    diag_pd['icd'] = diag_pd['icd_version'].astype(str) + "-" + diag_pd['icd_code']

    diag_icd_pd = pd.read_csv(path+'icd_diag_codes.csv')
    diag_icd_pd = diag_icd_pd.drop(columns=['icd_code', 'icd_version', 'long_title'])

    diag_pd = diag_pd.merge(diag_icd_pd, how='left', on='icd')

    diag_pd = diag_pd.drop(columns=['hadm_id', 'subject_id', 'seq_num', 'icd_code', 'icd_version', 'icd'])
    # Codes missing from the dictionary produce NaN in the left join; drop those rows.
    diag_pd = diag_pd.dropna().reset_index(drop=True)

    diag_pd = diag_pd.astype(int)

    diag_pd.to_csv(path+'Triplets/diag_triplets.csv', index=False)

    return diag_pd

diag_pd = process_diag()

### **Procedures**

In [None]:
# process icd procedure codes for encoding
# run only once preferably
# icd_code is read as text so that type inference can never turn a numeric-looking
# code into a number (which would drop leading zeros and break the string
# concatenation below).
proc_icd_pd = pd.read_csv(proc_icd_file, dtype={'icd_code': str})
# `ffill` replaces the deprecated `fillna(method='pad')`; the result is the same.
proc_icd_pd.ffill(inplace=True)
proc_icd_pd.dropna(inplace=True)
proc_icd_pd.drop_duplicates(inplace=True)

# "<version>-<code>" is the join key used when encoding the procedures table.
proc_icd_pd['icd'] = proc_icd_pd['icd_version'].astype(str) + "-" + proc_icd_pd['icd_code']
# Consecutive integer id for every remaining dictionary row.
proc_icd_pd['icd_idx'] = range(len(proc_icd_pd))

proc_icd_pd.to_csv(path + 'icd_proc_codes.csv', index=False)

In [None]:
def process_proc():
    """Encode the procedures table as integer (admission, procedure index) rows.

    Reads ``proc_file``, attaches ``hadm_code`` from the admission table
    (``adm_hadm_file``) via ``hadm_id``, maps every
    ``"<icd_version>-<icd_code>"`` key to the ``icd_idx`` stored in
    ``icd_proc_codes.csv`` and writes the result to
    ``Triplets/proc_triplets.csv`` below ``path``.

    Returns:
        pandas.DataFrame: the remaining columns cast to ``int``; rows whose
        code has no entry in the dictionary are dropped.
    """
    # process procedures dataset
    # icd_code is forced to text so numeric-looking codes keep their leading zeros.
    proc_pd = pd.read_csv(proc_file, dtype={'icd_version': 'category', 'icd_code': str})
    # `ffill()` replaces the deprecated `fillna(method='pad')`; same result.
    proc_pd = proc_pd.ffill().dropna().drop_duplicates()
    proc_pd = proc_pd.sort_values(by=['hadm_id']).reset_index(drop=True)

    # attach hadm_code by hadm_id; subject_id is removed from the admission
    # table first so the merge does not bring it in
    hadm_code_pd = pd.read_csv(adm_hadm_file)
    hadm_code_pd = hadm_code_pd.drop(columns=["subject_id"])
    proc_pd = proc_pd.merge(hadm_code_pd, how='left', on='hadm_id')

    # Same "<version>-<code>" key format as the 'icd' column of icd_proc_codes.csv.
    proc_pd['icd'] = proc_pd['icd_version'].astype(str) + "-" + proc_pd['icd_code']

    proc_icd_pd = pd.read_csv(path+'icd_proc_codes.csv')
    proc_icd_pd = proc_icd_pd.drop(columns=['icd_code', 'icd_version', 'long_title'])

    proc_pd = proc_pd.merge(proc_icd_pd, how='left', on='icd')

    proc_pd = proc_pd.drop(columns=['hadm_id', 'seq_num', 'icd_code', 'icd_version', 'icd'])
    # Codes missing from the dictionary produce NaN in the left join; drop those rows.
    proc_pd = proc_pd.dropna().reset_index(drop=True)

    proc_pd = proc_pd.astype(int)

    proc_pd.to_csv(path+'Triplets/proc_triplets.csv', index=False)

    return proc_pd

proc_pd = process_proc()

### **Drug-Drug Interactions**

In [None]:
# Drug code mapping table; the NDC column is loaded as a categorical.
drugmap_pd = pd.read_csv(drugmap_atc_file, dtype={'ndc': 'category'})

# Distinct (atc5, atc_idx, atc3, atc3_idx) combinations found in the mapping table.
atc5to3_map = drugmap_pd.loc[:, ['atc5', 'atc_idx', 'atc3', 'atc3_idx']]
atc5to3_map = atc5to3_map.drop_duplicates()
atc5to3_map = atc5to3_map.reset_index(drop=True)
# Optional export, currently disabled:
# atc5to3_map.to_csv(path+"Mappings/atc5to3_map.csv", index=False)

In [None]:
def process_ddi():
  """Build the drug-drug interaction tables at ATC5 and ATC3 granularity.

  Reads ``ddi_file``, keeps the interactions of a selected set of 40 side
  effects, translates both STITCH ids of each pair to ``atc_idx`` with
  ``cid2atc_file`` and then to ``atc3_idx`` with the notebook-level
  ``atc5to3_map`` frame (which must already be defined when this runs).

  Files written below ``path``: ``ddi_info.csv``, ``ddi_info_atc3.csv``,
  ``Triplets/ddi_triplets.csv`` and ``Triplets/ddi_triplets_atc3.csv``.

  Returns:
      tuple: ``(ddi_pd, ddi_pd_atc3)``; in both frames the two drug columns
      are named ``atc_idx-1`` and ``atc_idx-2`` and hold integers.
  """
  ddi_pd = pd.read_csv(ddi_file)

  # Number of rows per side effect, most frequent first.
  ddi_most_pd = (
      ddi_pd.groupby(by=['Polypharmacy Side Effect', 'Side Effect Name'])
      .size()
      .reset_index()
      .rename(columns={0: 'count'})
      .sort_values(by=['count'], ascending=False)
      .reset_index(drop=True)
  )
  # NOTE(review): after the descending sort, the LAST 40 rows are the 40 least
  # frequent side effects although the variable is called "most" -- confirm
  # that this selection is intended before changing it.
  ddi_most_pd = ddi_most_pd.iloc[-40:, :]

  filter_ddi_pd = ddi_pd.merge(ddi_most_pd[['Side Effect Name']], how='inner', on=['Side Effect Name'])
  ddi_pd = filter_ddi_pd.drop_duplicates().reset_index(drop=True)

  cid2atc_pd = pd.read_csv(cid2atc_file)
  cid2atc_pd = cid2atc_pd.drop(columns=["atc5"])

  # Translate the first drug of every pair ...
  ddi_pd = pd.merge(ddi_pd, cid2atc_pd, left_on='STITCH 1', right_on="STITCH", how='left')
  ddi_pd = ddi_pd.rename(columns={'atc_idx': 'atc_idx-1'})

  # ... and the second one; the repeated 'STITCH' column gets the _x/_y suffixes.
  ddi_pd = pd.merge(ddi_pd, cid2atc_pd, left_on='STITCH 2', right_on="STITCH", how='left')
  ddi_pd = ddi_pd.rename(columns={'atc_idx': 'atc_idx-2'})

  ddi_pd = ddi_pd.drop(columns=["STITCH 1", "STITCH 2", "STITCH_x", "STITCH_y"])

  # Pairs with an untranslated drug carry NaN after the left joins.
  ddi_pd = ddi_pd.dropna().drop_duplicates().reset_index(drop=True)

  ddi_pd = ddi_pd.astype({"atc_idx-1": int, "atc_idx-2": int})

  # ATC3 variant: replace each ATC5 index by its atc3_idx, one side at a time.
  ddi_pd_atc3 = ddi_pd
  for side in ("atc_idx-1", "atc_idx-2"):
    ddi_pd_atc3 = pd.merge(ddi_pd_atc3, atc5to3_map, left_on=side, right_on="atc_idx", how='left')
    ddi_pd_atc3 = ddi_pd_atc3.drop(columns=[side, "atc_idx", "atc5", "atc3"])
    ddi_pd_atc3 = ddi_pd_atc3.rename(columns={'atc3_idx': side})

  # An ATC5 index absent from atc5to3_map yields NaN (and a float column) in the
  # left join; such rows cannot be placed in an adjacency matrix, so they are
  # removed and the columns restored to integers, mirroring the ATC5 handling.
  ddi_pd_atc3 = ddi_pd_atc3.dropna(subset=["atc_idx-1", "atc_idx-2"])
  ddi_pd_atc3 = ddi_pd_atc3.astype({"atc_idx-1": int, "atc_idx-2": int})

  ddi_pd_atc3 = ddi_pd_atc3.drop_duplicates().reset_index(drop=True)

  ddi_pd.to_csv(path+"ddi_info.csv", index=False)
  ddi_pd_atc3.to_csv(path+"ddi_info_atc3.csv", index=False)

  ddi_triplets_pd = ddi_pd[["atc_idx-1", "atc_idx-2"]]
  ddi_triplets_pd.to_csv(path+"Triplets/ddi_triplets.csv", index=False)

  # Fixed: the ATC3 file previously received the ATC5 pairs (ddi_triplets_pd).
  ddi_triplets_atc3_pd = ddi_pd_atc3[["atc_idx-1", "atc_idx-2"]]
  ddi_triplets_atc3_pd.to_csv(path+"Triplets/ddi_triplets_atc3.csv", index=False)

  return ddi_pd, ddi_pd_atc3

ddi_pd, ddi_pd_atc3 = process_ddi()

### **EHRs**

In [None]:
# Admission table (uses the subject_id and hadm_code columns).
subj_hadm_pd = pd.read_csv(adm_hadm_file)

# Sorted distinct subject ids; the position of an id in this array is its subject_code.
subj_unique = np.sort(subj_hadm_pd['subject_id'].unique())
subj_unique_range = range(len(subj_unique))
subj_unique_pd = pd.DataFrame({'subject_id': subj_unique, 'subject_code': subj_unique_range})

# Add the subject_code column to every admission row.
subj_hadm_pd = subj_hadm_pd.merge(subj_unique_pd, how="left", on="subject_id")

In [None]:
hadm_unique = list(subj_hadm_pd['hadm_code'])

med_pd_group = med_pd.groupby('hadm_code')
med_pd_atc3_group = med_pd_atc3.groupby('hadm_code')
diag_pd_group = diag_pd.groupby('hadm_code')
proc_pd_group = proc_pd.groupby('hadm_code')

# hadm_code -> subject_code lookup built once, instead of scanning the whole
# admission table for every admission. Each hadm_code must occur exactly once
# (the previous per-row `.item()` call raised ValueError otherwise as well).
hadm_to_subject = subj_hadm_pd.set_index('hadm_code')['subject_code']
if not hadm_to_subject.index.is_unique:
  raise ValueError("hadm_code values in subj_hadm_pd are not unique")
hadm_to_subject = hadm_to_subject.to_dict()


def _unique_codes(grouped, hadm_code, column):
  """Return the unique `column` values of one admission, or None if it has no rows.

  Only the KeyError raised by `get_group` for an absent admission is treated
  as "no data"; any other error propagates instead of being silently hidden.
  """
  try:
    return list(grouped.get_group(hadm_code)[column].unique())
  except KeyError:
    return None


# One entry per admission in every list; a code list is None when the admission
# does not appear in the corresponding table.
ehr_dict = {
    "subject_code": [],
    "hadm_code": [],
    "med_codes": [],
    "med_codes_atc3": [],
    "diag_codes": [],
    "proc_codes": []
}

for hadm_code in hadm_unique:
  ehr_dict["hadm_code"].append(hadm_code)
  ehr_dict["subject_code"].append(hadm_to_subject[hadm_code])

  ehr_dict["med_codes"].append(_unique_codes(med_pd_group, hadm_code, 'atc_idx'))
  ehr_dict["med_codes_atc3"].append(_unique_codes(med_pd_atc3_group, hadm_code, 'atc_idx'))
  ehr_dict["diag_codes"].append(_unique_codes(diag_pd_group, hadm_code, 'icd_idx'))
  ehr_dict["proc_codes"].append(_unique_codes(proc_pd_group, hadm_code, 'icd_idx'))

In [None]:
# Turn the lists collected in ehr_dict into a table: one row per admission,
# one column per ehr_dict key.
ehr_pd = pd.DataFrame(ehr_dict)

In [None]:
# Per-subject visit records. Each entry has the layout
#   [subject_code, [hadm_code, diag_codes, proc_codes, med_codes], ...]
# with one inner list per admission, in the row order of ehr_pd.
# `records` carries the ATC5 medication codes, `records_atc3` the ATC3 ones.
records = []
records_atc3 = []

# Group once instead of filtering the whole table again for every subject.
ehr_by_subject = ehr_pd.groupby('subject_code')

for subject_code in np.sort(ehr_pd['subject_code'].unique()):
  subject_pd = ehr_by_subject.get_group(subject_code)

  subject = [subject_code]
  subject_atc3 = [subject_code]

  admission_rows = zip(
      subject_pd['hadm_code'],
      subject_pd['diag_codes'],
      subject_pd['proc_codes'],
      subject_pd['med_codes'],
      subject_pd['med_codes_atc3'],
  )
  for hadm_code, diag_codes, proc_codes, med_codes, med_codes_atc3 in admission_rows:
    subject.append([hadm_code, diag_codes, proc_codes, med_codes])
    subject_atc3.append([hadm_code, diag_codes, proc_codes, med_codes_atc3])

  records.append(subject)
  records_atc3.append(subject_atc3)

In [None]:
import pickle
# Serialize the ATC3-level records to ehr_atc3.pkl under the data root.
ehr_atc3_path = path + 'ehr_atc3.pkl'
with open(ehr_atc3_path, 'wb') as ehr_file:
  pickle.dump(records_atc3, ehr_file)

## **Adjacency Matrices**

### Diag-Diag

In [None]:
# Sorted distinct diagnosis indices that occur in the encoded diagnoses table.
diag_idx_arr = np.sort(diag_pd['icd_idx'].unique())
diag_idx_arr_len = len(diag_idx_arr)

# Matrix position -> icd_idx.
diag_dict = dict(enumerate(diag_idx_arr))
# icd_idx -> matrix position.
diag_dict_rev = {icd_idx: position for position, icd_idx in enumerate(diag_idx_arr)}

In [None]:
# Diagnoses-Diagnoses Adj Matrix
# diag_adj[x, y] == 1 when the diagnoses at matrix positions x and y occur
# together in at least one admission. The matrix is symmetric and the diagonal
# is left untouched.

diag_adj = np.zeros((diag_idx_arr_len, diag_idx_arr_len))

for subject in records:
  # subject[0] is the subject code; the admissions follow it.
  for admission in subject[1:]:
    diag_set = admission[1]

    # Skip only this admission when it has no diagnoses. The previous `break`
    # also discarded every later admission of the same subject.
    if diag_set is None: continue

    positions = [diag_dict_rev[diag] for diag in diag_set]
    for i, x in enumerate(positions):
      # Visit each unordered pair once and set both directions.
      for y in positions[i + 1:]:
        diag_adj[x, y] = 1
        diag_adj[y, x] = 1

### Proc-Proc

In [None]:
# Sorted distinct procedure indices that occur in the encoded procedures table.
proc_idx_arr = np.sort(proc_pd['icd_idx'].unique())
proc_idx_arr_len = len(proc_idx_arr)

# Matrix position -> icd_idx, and the inverse.
proc_dict = dict(enumerate(proc_idx_arr))
proc_dict_rev = {icd_idx: position for position, icd_idx in enumerate(proc_idx_arr)}

# The same two mappings with every position shifted by 2007.
# NOTE(review): 2007 is hard-coded; presumably it is the number of diagnosis
# positions so that procedures follow diagnoses in a shared index space --
# confirm, and consider deriving it from the data.
proc_dict_adjusted = {2007 + position: icd_idx for position, icd_idx in enumerate(proc_idx_arr)}
proc_dict_rev_adjusted = {icd_idx: position + 2007 for position, icd_idx in enumerate(proc_idx_arr)}

In [None]:
# Procedure-Procedure Adj Matrix
# proc_adj[x, y] == 1 when the procedures at matrix positions x and y occur
# together in at least one admission. The matrix is symmetric and the diagonal
# is left untouched.

proc_adj = np.zeros((proc_idx_arr_len, proc_idx_arr_len))

for subject in records:
  # subject[0] is the subject code; the admissions follow it.
  for admission in subject[1:]:
    proc_set = admission[2]

    # Skip only this admission when it has no procedures. The previous `break`
    # also discarded every later admission of the same subject.
    if proc_set is None: continue

    positions = [proc_dict_rev[proc] for proc in proc_set]
    for i, x in enumerate(positions):
      # Visit each unordered pair once and set both directions.
      for y in positions[i + 1:]:
        proc_adj[x, y] = 1
        proc_adj[y, x] = 1

### Proc-Diag & Diag-Proc

In [None]:
## Procedure & Diagnoses Adj Matrix
# diag_proc_adj[d, p] == 1 (and proc_diag_adj[p, d] == 1) when the diagnosis at
# position d and the procedure at position p occur in the same admission.
diag_proc_adj = np.zeros((diag_idx_arr_len, proc_idx_arr_len))
proc_diag_adj = np.zeros((proc_idx_arr_len, diag_idx_arr_len))

for subject in records:
  # subject[0] is the subject code; the admissions follow it.
  for admission in subject[1:]:
    diag_set = admission[1]
    proc_set = admission[2]

    # Skip only this admission when either list is missing. The previous
    # `break` also discarded every later admission of the same subject.
    if proc_set is None or diag_set is None: continue

    proc_positions = [proc_dict_rev[proc] for proc in proc_set]
    for diag in diag_set:
      x = diag_dict_rev[diag]
      # Diagnoses and procedures are different entities, so EVERY
      # (diagnosis, procedure) pair of the admission is linked. The old
      # `j <= i` skip, copied from the symmetric matrices, compared list
      # positions of unrelated lists and silently dropped valid pairs.
      for y in proc_positions:
        diag_proc_adj[x, y] = 1
        proc_diag_adj[y, x] = 1

### Med-Med

##### ATC5

Adjacency matrices for ATC Level 5 and ATC Level 3 are created.

In [None]:
# Every ATC5 index that appears in the prescriptions or on either side of a DDI pair.
# The set is built from the concatenated lists, so the array order is the set's
# iteration order (it is not explicitly sorted).
atc_idx_arr = np.array(list(set(
    list(med_pd['atc_idx'].unique())
    + list(ddi_pd['atc_idx-1'].unique())
    + list(ddi_pd['atc_idx-2'].unique())
)))
atc_idx_arr_len = len(atc_idx_arr)

# Tabular form of the same mapping: ATC index next to its matrix position.
med_adj_map = {"atc_idx": atc_idx_arr, "adj_idx": np.array(range(atc_idx_arr_len))}
med_adj_map_pd = pd.DataFrame(med_adj_map)

# Matrix position -> atc_idx, and the inverse.
atc_dict = dict(enumerate(atc_idx_arr))
atc_dict_rev = {atc_idx: position for position, atc_idx in enumerate(atc_idx_arr)}

In [None]:
# Adjacency matrix for prescriptions
# prescriptions_adj[x, y] == 1 when the drugs at matrix positions x and y were
# prescribed in the same admission. The matrix is symmetric and the diagonal is
# left untouched.

prescriptions_adj = np.zeros((atc_idx_arr_len, atc_idx_arr_len))

for subject in records:
  # subject[0] is the subject code; the admissions follow it.
  for admission in subject[1:]:
    med_set = admission[3]

    # Skip only this admission when it has no medications. The previous
    # `break` also discarded every later admission of the same subject.
    if med_set is None: continue

    positions = [atc_dict_rev[med] for med in med_set]
    for i, x in enumerate(positions):
      # Visit each unordered pair once and set both directions.
      for y in positions[i + 1:]:
        prescriptions_adj[x, y] = 1
        prescriptions_adj[y, x] = 1

In [None]:
# Adjacency matrix for DDIs
# Symmetric 0/1 matrix over the ATC5 positions: an entry is 1 when the two
# drugs appear together as a pair in ddi_pd.

ddi_adj = np.zeros((atc_idx_arr_len, atc_idx_arr_len))

for first_atc, second_atc in zip(ddi_pd['atc_idx-1'], ddi_pd['atc_idx-2']):
  x = atc_dict_rev[first_atc]
  y = atc_dict_rev[second_atc]
  ddi_adj[x, y] = 1
  ddi_adj[y, x] = 1

##### ATC3

In [None]:
# Every ATC3 index that appears in the prescriptions or on either side of a DDI pair.
# The set is built from the concatenated lists, so the array order is the set's
# iteration order (it is not explicitly sorted).
atc3_idx_arr = np.array(list(set(
    list(med_pd_atc3['atc_idx'].unique())
    + list(ddi_pd_atc3['atc_idx-1'].unique())
    + list(ddi_pd_atc3['atc_idx-2'].unique())
)))
atc3_idx_arr_len = len(atc3_idx_arr)

# Tabular form of the same mapping: ATC3 index next to its matrix position.
med_atc3_adj_map = {"atc_idx": atc3_idx_arr, "adj_idx": np.array(range(atc3_idx_arr_len))}
med_atc3_adj_map_pd = pd.DataFrame(med_atc3_adj_map)

# Matrix position -> ATC3 index, and the inverse.
atc3_dict = dict(enumerate(atc3_idx_arr))
atc3_dict_rev = {atc3_idx: position for position, atc3_idx in enumerate(atc3_idx_arr)}

In [None]:
# Adjacency matrix for prescriptions
# prescriptions_atc3_adj[x, y] == 1 when the ATC3 codes at matrix positions x
# and y were prescribed in the same admission. The matrix is symmetric and the
# diagonal is left untouched.

prescriptions_atc3_adj = np.zeros((atc3_idx_arr_len, atc3_idx_arr_len))

for subject in records_atc3:
  # subject[0] is the subject code; the admissions follow it.
  for admission in subject[1:]:
    med_set = admission[3]

    # Skip only this admission when it has no medications. The previous
    # `break` also discarded every later admission of the same subject.
    if med_set is None: continue

    positions = [atc3_dict_rev[med] for med in med_set]
    for i, x in enumerate(positions):
      # Visit each unordered pair once and set both directions.
      for y in positions[i + 1:]:
        prescriptions_atc3_adj[x, y] = 1
        prescriptions_atc3_adj[y, x] = 1

In [None]:
# Adjacency matrix for DDIs
# Symmetric 0/1 matrix over the ATC3 positions: an entry is 1 when the two
# codes appear together as a pair in ddi_pd_atc3.
# NOTE(review): this matrix has one extra row and column compared with
# prescriptions_atc3_adj; atc3_dict_rev only yields positions below
# atc3_idx_arr_len, so the extra row/column stays zero -- confirm the
# consumer really expects the larger shape.
ddi_atc3_size = atc3_idx_arr_len + 1
ddi_atc3_adj = np.zeros((ddi_atc3_size, ddi_atc3_size))

for first_atc3, second_atc3 in zip(ddi_pd_atc3['atc_idx-1'], ddi_pd_atc3['atc_idx-2']):
  x = atc3_dict_rev[first_atc3]
  y = atc3_dict_rev[second_atc3]
  ddi_atc3_adj[x, y] = 1
  ddi_atc3_adj[y, x] = 1

### Save Adj Matrices and Mapping Dicts

In [None]:
import pickle

# Target file (relative to `path`) and the index mapping pickled into it.
dict_map_outputs = [
    ('DictMaps/diag_map_dict.pkl', diag_dict),
    ('DictMaps/diag_map_rev_dict.pkl', diag_dict_rev),
    ('DictMaps/proc_map_dict.pkl', proc_dict),
    ('DictMaps/proc_map_rev_dict.pkl', proc_dict_rev),
    ('DictMaps/proc_map_adjusted_dict.pkl', proc_dict_adjusted),
    ('DictMaps/proc_map_rev_adjusted_dict.pkl', proc_dict_rev_adjusted),
    ('DictMaps/atc_map_dict.pkl', atc_dict),
    ('DictMaps/atc_map_rev_dict.pkl', atc_dict_rev),
    ('DictMaps/atc3_map_dict.pkl', atc3_dict),
    ('DictMaps/atc3_map_rev_dict.pkl', atc3_dict_rev),
]

# Write every mapping, in the order listed above.
for relative_path, mapping in dict_map_outputs:
  with open(path + relative_path, 'wb') as f:
    pickle.dump(mapping, f)

In [None]:
# Target file (relative to `path`) and the adjacency matrix pickled into it.
adj_matrix_outputs = [
    ('AdjMatrices/diag_adj.pkl', diag_adj),
    ('AdjMatrices/proc_adj.pkl', proc_adj),
    ('AdjMatrices/diag_proc_adj.pkl', diag_proc_adj),
    ('AdjMatrices/proc_diag_adj.pkl', proc_diag_adj),
    ('AdjMatrices/prescriptions_adj.pkl', prescriptions_adj),
    ('AdjMatrices/ddi_adj.pkl', ddi_adj),
    ('AdjMatrices/prescriptions_atc3_adj.pkl', prescriptions_atc3_adj),
    ('AdjMatrices/ddi_atc3_adj.pkl', ddi_atc3_adj),
]

# Write every matrix, in the order listed above.
for relative_path, matrix in adj_matrix_outputs:
  with open(path + relative_path, 'wb') as f:
    pickle.dump(matrix, f)