In [68]:
import pandas as pd
import numpy as np
import json
import ast
from tqdm.notebook import tqdm
import math
tqdm.pandas()

# Load MedQA query table

In [128]:
q_path = "/scratch/s190619/Data_etc/MedQA/disorders_table_dev-test.csv"
qs = pd.read_csv(q_path)
qs.head()

Unnamed: 0,qid,query,answer,options,meta_info,Disorder,CUI,TUI,short_category,long_category,description,cui_METAMAP
0,0,A 5-year-old girl is brought to the emergency ...,Cyclic vomiting syndrome,"{'A': 'Cyclic vomiting syndrome', 'B': 'Gastro...",step2&3,True,,,,,,['C0152164']
1,1,A 19-year-old boy presents with confusion and ...,Hypoperfusion,"{'A': 'Hypoperfusion', 'B': 'Hyperglycemia', '...",step1,True,C0442856,T046,DISO,Disorders,Pathologic Function,['C0442856']
2,2,A 41-year-old woman presents to her primary ca...,Iron deficiency,"{'A': 'Vitamin B12 deficiency', 'B': 'Folate d...",step2&3,True,C0240066,T047,DISO,Disorders,Disease or Syndrome,['C0240066']
3,3,A 56-year-old man with known coronary artery d...,Monomorphic ventricular tachycardia,"{'A': 'Premature ventricular contractions', 'B...",step1,True,,,,,,['C0344431']
4,4,A 16-year-old female high school student is br...,Oppositional defiant disorder,"{'A': 'Reactive attachment disorder', 'B': 'Co...",step2&3,True,C0029121,T048,DISO,Disorders,Mental or Behavioral Dysfunction,['C0029121']


# Load FindZebra collection

In [56]:
FZ_path = "/scratch/s190619/Data_etc/FindZebra/FZcorpus_CLEANED.tsv"
fz = pd.read_csv(FZ_path, sep="\t")
fz.head()

Unnamed: 0,apid,title,clean_content,cui,source,source_url,meta-data
0,0,Kernicterus,Kernicterus refers to brain damage that may oc...,C0022610,gard,https://rarediseases.info.nih.gov/diseases/683...,"{""mesh"": [""D007647""], ""umls"": [""C0022610""], ""o..."
1,1,47 XXX syndrome,"47 XXX syndrome , also called trisomy X or tri...",C0221033,gard,https://rarediseases.info.nih.gov/diseases/567...,"{""mesh"": [""C535318""], ""umls"": [""C0221033""], ""o..."
2,2,Pseudoxanthoma elasticum,"Pseudoxanthoma elasticum , PXE, is an inherite...",C0033847,gard,https://rarediseases.info.nih.gov/diseases/964...,"{""mesh"": [""D011561""], ""omim"": [""264800""], ""orp..."
3,3,Smith-Magenis Syndrome,Summary Clinical characteristics. Smith-Mageni...,C0795864,gene_reviews,https://www.ncbi.nlm.nih.gov/books/NBK1310/,"{""mesh"": [""D058496""], ""synonyms"": [""del(17)(p1..."
4,4,Smith-Lemli-Opitz Syndrome,Summary Clinical characteristics. Smith-Lemli-...,C0175694,gene_reviews,https://www.ncbi.nlm.nih.gov/books/NBK1143/,"{""mesh"": [""D019082""], ""synonyms"": [""SLOS""]}"


In [22]:
#ast.literal_eval(fz["meta-data"].iloc[0])["synonyms"]
def get_synonyms(x):
    try:
        return [x.lower() for x in ast.literal_eval(x)["synonyms"]]
    except:
        return []


# Load umls data

In [5]:
semgroups = pd.read_csv("/scratch/s190619/Data_etc/Med_Groups/SemGroups_2018.txt",sep="|",names=["short_category","long_category","tui","desc"])
semgroups = semgroups[semgroups["short_category"] == "DISO"]
tuis = list(semgroups["tui"])

umls_path = "/scratch/s190619/Data_etc/Med_Groups/umls_terms.csv"
umls = pd.read_csv(umls_path, sep=",")
umls = umls[umls["TUI"].isin(tuis)].drop("TUI",axis=1)
umls["STR"] = umls["STR"].apply(lambda x: str(x).lower())
umls.head()

Unnamed: 0,CUI,STR
0,C0026106,mild mental retardation
1,C0026351,moderate mental retardation
2,C0036857,severe mental retardation
3,C0020796,profound mental retardation
4,C0025362,unspecified mental retardation


In [9]:
fz2 = fz.copy()
fz2["synonyms"] = fz2["meta-data"].progress_apply(lambda x: get_synonyms(x))

  0%|          | 0/30658 [00:00<?, ?it/s]

In [23]:
fz3 = fz2[["apid","synonyms"]].copy()
apids = []
syns = []
for i in tqdm(range(fz3.shape[0])):
    for j in range(len(fz3["synonyms"].iloc[i])):
        syns.append(fz3["synonyms"].iloc[i][j])
        apids.append(fz3["apid"].iloc[i])
synonyms = pd.DataFrame(np.array([apids,syns]).T)
synonyms.columns = ["apid","synonym"]

  0%|          | 0/30658 [00:00<?, ?it/s]

In [26]:
synonyms_filtered = synonyms.join(umls.set_index("STR"), on="synonym").dropna()

In [55]:
dense = []
apids = -1
syns = []
cuis = []
for i in tqdm(range(synonyms_filtered.shape[0])):
    if synonyms_filtered["apid"].iloc[i] != apids and apids != -1:
        dense.append([apids, syns, cuis])
        apids = []
        syns = []
        cuis = []
    apids = synonyms_filtered["apid"].iloc[i]
    syns.append(synonyms_filtered["synonym"].iloc[i])
    cuis.append(synonyms_filtered["CUI"].iloc[i])
df = pd.DataFrame(dense)
df.columns = ["apid","synonyms","alt_cuis"]
df = df.drop("synonyms",axis=1)
df["apid"] = df["apid"].astype("int")
    

  0%|          | 0/88 [00:00<?, ?it/s]

In [81]:
new_fz = fz.join(df.set_index("apid"), on="apid")
new_fz["cuis"] = [[] if type(x) != list else x for x in new_fz["alt_cuis"]]
[new_fz["cuis"].iloc[i].append(new_fz["cui"].iloc[i]) for i in range(new_fz.shape[0])]
new_fz = new_fz.drop("alt_cuis",axis=1)

In [82]:
new_fz.head()

Unnamed: 0,apid,title,clean_content,cui,source,source_url,meta-data,cuis
0,0,Kernicterus,Kernicterus refers to brain damage that may oc...,C0022610,gard,https://rarediseases.info.nih.gov/diseases/683...,"{""mesh"": [""D007647""], ""umls"": [""C0022610""], ""o...",[C0022610]
1,1,47 XXX syndrome,"47 XXX syndrome , also called trisomy X or tri...",C0221033,gard,https://rarediseases.info.nih.gov/diseases/567...,"{""mesh"": [""C535318""], ""umls"": [""C0221033""], ""o...",[C0221033]
2,2,Pseudoxanthoma elasticum,"Pseudoxanthoma elasticum , PXE, is an inherite...",C0033847,gard,https://rarediseases.info.nih.gov/diseases/964...,"{""mesh"": [""D011561""], ""omim"": [""264800""], ""orp...",[C0033847]
3,3,Smith-Magenis Syndrome,Summary Clinical characteristics. Smith-Mageni...,C0795864,gene_reviews,https://www.ncbi.nlm.nih.gov/books/NBK1310/,"{""mesh"": [""D058496""], ""synonyms"": [""del(17)(p1...",[C0795864]
4,4,Smith-Lemli-Opitz Syndrome,Summary Clinical characteristics. Smith-Lemli-...,C0175694,gene_reviews,https://www.ncbi.nlm.nih.gov/books/NBK1143/,"{""mesh"": [""D019082""], ""synonyms"": [""SLOS""]}",[C0175694]


# Get list of all unique cuis in FZ

In [95]:
A = []
for cuis in new_fz["cuis"]:
    for cui in cuis:
        if type(cui) == str:
            A.append(cui)
unique_cuis_in_FZ = set(A)
len(unique_cuis_in_FZ)

14986

# Filter MedQA queries for FindZebra (only rare diseases found in FZ)

In [130]:
print("Shape of queries before filtering: ", qs.shape)
qs2 = qs.copy()
all_cuis = []
found_in_FZ = []
for i in range(qs.shape[0]):
    tmp = False
    cuis = []
    if type(qs["CUI"].iloc[i]) == str:
        cui = qs["CUI"].iloc[i]
        cuis.append(cui)
        if cui in unique_cuis_in_FZ:
            tmp = True
    for cui in ast.literal_eval(qs["cui_METAMAP"].iloc[i]):
        cuis.append(cui)
        if cui in unique_cuis_in_FZ:
            tmp = True
    all_cuis.append(cuis)
    found_in_FZ.append(tmp)
qs2["all_cuis"] = all_cuis
qs2["found_in_fz"] = found_in_FZ
qs2 = qs2[qs2["found_in_fz"] == True].drop("found_in_fz",axis=1)
print("Shape of queries after filtering: ", qs2.shape)
qs2.head()

Shape of queries before filtering:  (611, 12)
Shape of queries after filtering:  (394, 13)


Unnamed: 0,qid,query,answer,options,meta_info,Disorder,CUI,TUI,short_category,long_category,description,cui_METAMAP,all_cuis
0,0,A 5-year-old girl is brought to the emergency ...,Cyclic vomiting syndrome,"{'A': 'Cyclic vomiting syndrome', 'B': 'Gastro...",step2&3,True,,,,,,['C0152164'],[C0152164]
2,2,A 41-year-old woman presents to her primary ca...,Iron deficiency,"{'A': 'Vitamin B12 deficiency', 'B': 'Folate d...",step2&3,True,C0240066,T047,DISO,Disorders,Disease or Syndrome,['C0240066'],"[C0240066, C0240066]"
4,4,A 16-year-old female high school student is br...,Oppositional defiant disorder,"{'A': 'Reactive attachment disorder', 'B': 'Co...",step2&3,True,C0029121,T048,DISO,Disorders,Mental or Behavioral Dysfunction,['C0029121'],"[C0029121, C0029121]"
5,5,A 35-year-old male presents to his primary car...,Erythema infectiosum,"{'A': 'Kaposi’s sarcoma', 'B': 'Erythema infec...",step1,True,,,,,,['C0085273'],[C0085273]
7,7,A 59-year-old man presents to his primary care...,Dysthymia,{'A': 'Adjustment disorder with depressive fea...,step1,True,C0013415,T048,DISO,Disorders,Mental or Behavioral Dysfunction,['C0013415'],"[C0013415, C0013415]"


In [131]:
qs2.to_csv("/scratch/s190619/Data_etc/MedQA/disorders_table_dev-test_RARE_FZ.csv")