Classify all diagnoses in the MIMIC dataset according to the ICD-10 chapters

In [None]:
import pandas as pd
import numpy as np
from collections import defaultdict
from tqdm.notebook import tqdm


# Build the ICD-9 -> ICD-10 lookup from the GEM crosswalk CSV.
# dtype=str stops pandas from inferring a numeric dtype for the code columns,
# which would drop leading zeros and break the string handling below.
icd9toicd10 = pd.read_csv('icd9toicd10cmgem.csv', usecols=['icd9cm', 'icd10cm'], dtype=str)
map9to10 = defaultdict(str)
for icd9, icd10 in zip(icd9toicd10['icd9cm'], icd9toicd10['icd10cm']):
    # Keys are stored without leading zeros; lookups must be normalised the same way.
    icd9 = icd9.lstrip('0')
    # Keep the first real mapping seen for a code; only an empty or 'NoDx'
    # entry may be overwritten by a later row.
    if map9to10[icd9] in ('', 'NoDx'):
        map9to10[icd9] = icd10
# The defaultdict is stored as a pickled object array, so reading it back
# needs np.load(..., allow_pickle=True).item().
np.save('icd9toicd10.npy', map9to10)

# Chapter titles for every leading letter that maps to exactly one chapter.
# 'D' and 'H' are split between two chapters and are handled in the function.
_ICD10_CHAPTER_BY_LETTER = {
    'A': 'Certain infectious and parasitic diseases',
    'B': 'Certain infectious and parasitic diseases',
    'C': 'Neoplasms',
    'E': 'Endocrine, nutritional and metabolic diseases',
    'F': 'Mental, Behavioral and Neurodevelopmental disorders',
    'G': 'Diseases of the nervous system',
    'I': 'Diseases of the circulatory system',
    'J': 'Diseases of the respiratory system',
    'K': 'Diseases of the digestive system',
    'L': 'Diseases of the skin and subcutaneous tissue',
    'M': 'Diseases of the musculoskeletal system and connective tissue',
    'N': 'Diseases of the genitourinary system',
    'O': 'Pregnancy, childbirth and the puerperium',
    'P': 'Certain conditions originating in the perinatal period',
    'Q': 'Congenital malformations, deformations and chromosomal abnormalities',
    'R': 'Symptoms, signs and abnormal clinical and laboratory findings, not elsewhere classified',
    'S': 'Injury, poisoning and certain other consequences of external causes',
    'T': 'Injury, poisoning and certain other consequences of external causes',
    'U': 'Codes for special purposes',
    'V': 'External causes of morbidity',
    'W': 'External causes of morbidity',
    'X': 'External causes of morbidity',
    'Y': 'External causes of morbidity',
    'Z': 'Factors influencing health status and contact with health services',
}


def get_icd10_type(id: str) -> str:
    """Return the ICD-10 chapter title for an ICD-10 code.

    The chapter is chosen from the first character of ``id``; for 'D' and 'H'
    the two characters after the letter decide between the two chapters that
    share the letter (D00-D49 vs. the rest, H00-H59 vs. the rest).

    Args:
        id: ICD-10 code whose first character is an upper-case chapter letter.

    Returns:
        The chapter title, or ``None`` when the first character is not one of
        the letters A-Z handled here (e.g. a digit or a lower-case letter).

    Raises:
        IndexError: if ``id`` is empty.
    """
    letter = id[0]
    if letter == 'D':
        category = id[1:3]
        if category.isdecimal():
            in_neoplasms = int(category) <= 49
        else:
            # Alphanumeric categories such as D3A (benign neuroendocrine
            # tumors) sit inside the C00-D49 block; rank them by their
            # leading digit instead of letting them fall into D50-D89.
            tens = id[1:2]
            in_neoplasms = tens.isdecimal() and int(tens) <= 4
        if in_neoplasms:
            return 'Neoplasms'
        return 'Diseases of the blood and blood-forming organs and certain disorders involving the immune mechanism'
    if letter == 'H':
        category = id[1:3]
        if category.isdecimal() and int(category) <= 59:
            return 'Diseases of the eye and adnexa'
        return 'Diseases of the ear and mastoid process'
    return _ICD10_CHAPTER_BY_LETTER.get(letter)

import os

# Group every diagnosis row by ICD-10 chapter and write one CSV per chapter.
# icd_code is read as text so codes keep their leading zeros and are never
# turned into numbers by dtype inference.
diagnosis = pd.read_csv('diagnosis_icd.csv', dtype={'icd_code': str})
all_type = defaultdict(list)
# Iterating the columns directly avoids building a Series per row via iloc.
diagnosis_rows = zip(
    diagnosis['icd_code'],
    diagnosis['icd_version'],
    diagnosis['subject_id'],
    diagnosis['hadm_id'],
)
for code, version, subject_id, hadm_id in tqdm(diagnosis_rows, total=len(diagnosis)):
    if not isinstance(code, str):
        # Missing code (NaN): nothing to classify.
        continue
    if version == 9:
        # Look the code up in the ICD-9 -> ICD-10 dict (map9to10), not in the
        # icd9toicd10 DataFrame. The dict keys were stored without leading
        # zeros, so the code is normalised the same way; .get() avoids adding
        # empty entries for unknown codes.
        code = map9to10.get(code.lstrip('0'), '')
    if len(code) == 0 or code == 'NoDx':
        continue
    all_type[get_icd10_type(code)].append([subject_id, hadm_id])

# to_csv does not create missing directories.
os.makedirs('diagnosis', exist_ok=True)
for k, members in all_type.items():
    df = pd.DataFrame(members, columns=['subject_id', 'hadm_id'])
    df.to_csv(f'diagnosis/{k}.csv', index=False)
    print(k, len(members))

Build DDI.json (drug-drug interactions) from the DrugBank XML file

In [None]:
import xml.etree.ElementTree as et
import json
from collections import defaultdict
from tqdm.notebook import tqdm


# Build DDI: {primary drug id: {interacting drug id: interaction description}}
# from the DrugBank XML file and write it to DDI.json.
# NOTE(review): findall("drug") only matches un-namespaced <drug> children of
# the root - confirm the XML file has had its namespace removed.
tree = et.parse("drugbank_full database.xml")
all_drug = tree.findall("drug")


def proc(text):
    """Remove newlines, tabs and carriage returns from ``text`` and strip it."""
    return text.replace("\n", "").replace("\t", "").replace("\r", "").strip()


DDI = defaultdict(dict)

for drug in tqdm(all_drug):
    # Reset per drug so an entry without a primary id can never inherit the
    # id of the previously processed drug.
    drug_id = None
    for id_elem in drug.findall("drugbank-id"):
        if id_elem.attrib.get("primary") == "true" and id_elem.text:
            drug_id = proc(id_elem.text)
    interactions = drug.find("drug-interactions")
    if drug_id is None or interactions is None:
        continue
    for di in interactions.findall("drug-interaction"):
        partner = di.find("drugbank-id")
        if partner is None or not partner.text:
            continue
        description = di.find("description")
        has_text = description is not None and description.text
        # A missing description is stored as an empty string.
        DDI[drug_id][proc(partner.text)] = proc(description.text) if has_text else ""

DDI_json = json.dumps(DDI, indent=4)
with open("./DDI.json", "w") as f:
    f.write(DDI_json)

Convert DDI.json to links.json

In [None]:
import json
from collections import defaultdict
from tqdm.notebook import tqdm

# Collapse the directed interaction table from DDI.json into undirected links:
# one record per unordered drug pair, keeping the first non-empty description.
with open("./DDI.json", "r") as f:
    ddi: dict = json.load(f)

links = defaultdict(str)
total_len = 0
for head, tails in tqdm(ddi.items()):
    total_len += len(tails)
    for tail, description in tails.items():
        # Sorting the two ids makes (a, b) and (b, a) share one key.
        pair = tuple(sorted((head, tail)))
        # Reading links[pair] registers the pair; an existing non-empty
        # description is kept as it is.
        links[pair] = links[pair] or description
# Ratio of distinct undirected pairs to directed entries.
print(len(links)/total_len)

records = [
    {"entity1": pair[0], "entity2": pair[1], "description": description}
    for pair, description in links.items()
]
with open("./links.json", "w") as f:
    f.write(json.dumps(records))

Build drugs.json with the following structure:

{drug_id: {"names":[names], "description":description}}

In [None]:
import xml.etree.ElementTree as et
import json
from tqdm.notebook import tqdm


# Build drugs.json: {drug_id: {"names": [...], "description": "..."}} from the
# DrugBank XML file. "names" holds product names, English synonyms and the
# drug's own name; "description" joins description, indication, product
# summaries and food interactions, one per line.
# NOTE(review): findall("drug") only matches un-namespaced <drug> children of
# the root - confirm the XML file has had its namespace removed.
tree = et.parse("drugbank_full database.xml")
all_drug = tree.findall("drug")
fail = []
ans = dict()


def proc(text):
    """Remove newlines, tabs and carriage returns from ``text`` and strip it."""
    return text.replace("\n", "").replace("\t", "").replace("\r", "").strip()


def _child_text(parent, tag):
    """Return the cleaned text of ``parent``'s first ``tag`` child.

    Returns None when the child is missing or has no text.
    """
    child = parent.find(tag)
    if child is None or not child.text:
        return None
    return proc(child.text)


def _items(parent, container_tag, item_tag):
    """Return the ``item_tag`` children of ``parent``'s ``container_tag`` child.

    Returns an empty list when the container element is missing.
    """
    container = parent.find(container_tag)
    return [] if container is None else container.findall(item_tag)


for drug in tqdm(all_drug):
    # id
    drug_id = None
    for id_elem in drug.findall("drugbank-id"):
        if id_elem.attrib.get("primary") == "true":
            drug_id = id_elem.text
            break
    # Same cleaning as the DDI.json keys so both files agree on ids.
    drug_id = proc(drug_id) if isinstance(drug_id, str) else ""
    if not drug_id:
        # No usable primary id: record the element and skip it.
        fail.append(drug)
        continue

    names = []
    desc_lines = []

    # desc
    ## description, indication
    for tag in ("description", "indication"):
        text = _child_text(drug, tag)
        if text is not None:
            desc_lines.append(text)

    ## products
    seen_product_names = set()
    products = []
    for product in _items(drug, "products", "product"):
        label = _child_text(product, "name")
        if label is None:
            continue
        if label not in seen_product_names:
            seen_product_names.add(label)
            names.append(label)
        dosage_form = _child_text(product, "dosage-form")
        if dosage_form is not None:
            label += ' {}'.format(dosage_form.replace(",", ""))
        for tag in ("strength", "route"):
            extra = _child_text(product, tag)
            if extra is not None:
                label += ' {}'.format(extra)
        products.append(label)
    if products:
        # dict.fromkeys removes duplicates and keeps first-seen order, so the
        # output is reproducible. join() leaves no trailing separator, so
        # nothing is sliced off the end.
        desc_lines.append("; ".join(dict.fromkeys(products)))

    ## food-interactions
    food_notes = [
        proc(fi.text)
        for fi in _items(drug, "food-interactions", "food-interaction")
        if fi.text
    ]
    if food_notes:
        desc_lines.append("; ".join(food_notes))

    # name
    ## synonyms
    for syn in _items(drug, "synonyms", "synonym"):
        if syn.attrib.get("language") == "english" and syn.text:
            names.append(proc(syn.text))
    ## name
    primary_name = _child_text(drug, "name")
    if primary_name is not None:
        names.append(primary_name)

    # Every section ends with a newline, including the last one.
    desc = "".join(line + "\n" for line in desc_lines)
    ans[drug_id] = {"names": names, "description": desc}

ans = json.dumps(ans, indent=4)
with open("drugs.json", "w", encoding="utf-8") as f:
    f.write(ans)

Find all drugs prescribed for diseases of the circulatory system

In [None]:
from tqdm.notebook import tqdm
import pandas as pd


# Collect every distinct drug prescribed during an admission listed in the
# circulatory-system diagnosis file and write the names to mimic_drugs.json.
pres = pd.read_csv("./data/prescriptions.csv", usecols=["subject_id", 'hadm_id', 'drug'])
# 60271  (original author's note; presumably the row count of dcs - TODO confirm)
dcs = pd.read_csv("./data/Diseases of the circulatory system.csv")

key_columns = ["subject_id", "hadm_id"]
# One inner join replaces a full scan of the prescriptions table per
# diagnosis row. Rows with a missing key are dropped first because merge
# would otherwise pair NaN keys with each other.
admissions = dcs[key_columns].dropna().drop_duplicates()
matched = pres.dropna(subset=key_columns).merge(admissions, on=key_columns, how="inner")

# Missing drug names are dropped so no NaN token ends up in the JSON file;
# sorting makes the output reproducible between runs.
all_drugs = sorted(matched["drug"].dropna().unique().tolist(), key=str)


import json

with open("./data/mimic_drugs.json", "w") as f:
    f.write(json.dumps(all_drugs))