# Combined Rare Disease data.tsv

- preprocess each dataset (as done in individual GTRx and OOPD benchmarks), loosely following [get_indications.py](https://github.com/TranslatorSRI/Benchmarks/blob/main/config/DrugCentral_creative/get_indications.py)
- combine with only ids and names

In [1]:
import pandas as pd
import requests
from biothings_client import get_client

## OOPD

In [2]:
mc = get_client('chem')
oopd = mc.query(
        '_exists_:fda_orphan_drug', 
        fields='fda_orphan_drug',
        fetch_all=True)

In [3]:
df_oopd = pd.DataFrame(columns=['_id', 'drug_pubchem_cid', 'disease_umls', 'drug_name', 'disease_name', 'status', 'date'])

for drug in oopd:
    
    _id = drug['_id']
    
    # orphan designation 
    for use in drug['fda_orphan_drug']:

        # designation status/approval (may be useful for filtering)
        status = use['designation_status']
        date = use['designated_date']
            

        if isinstance(use['orphan_designation']['umls'], list):
            names = use['orphan_designation']['parsed_text']
            umls = use['orphan_designation']['umls']
        else:
            names = [use['orphan_designation']['parsed_text']]
            umls = [use['orphan_designation']['umls']]
        # for now, skip if there are nested lists of names
        if any(isinstance(el, list) for el in names):
            continue
        name_uml_dict = dict(zip(names, umls))
        
        if 'pubchem_cid' in use.keys():
            if isinstance(use['pubchem_cid'], list):
                continue
            cid = 'PUBCHEM.COMPOUND:' + str(use['pubchem_cid'])
        else:
            cid = ''
            # only include cases where a pubchem cid exists (used for querying)
            continue

        for name, uml in name_uml_dict.items():
            disease_umls = "UMLS:" + uml
            disease_name = name
            new_record = {
                '_id': _id,
                'drug_name': use['generic_name'],
                'drug_pubchem_cid': cid,
                'disease_umls': disease_umls,
                'disease_name': disease_name,
                'status': status,
                'date': date
                }
            #print(new_record)
            df_oopd.loc[len(df_oopd)] = new_record


INFO:biothings.client:Fetching 3661 chem(s) . . .
ERROR:biothings.client:No more results to return.


In [4]:
df_oopd = df_oopd.sort_values('_id')

# restrict to only 1 drug per disease
df_oopd = df_oopd.drop_duplicates('disease_umls', keep=False)

# keep only those drugs whose use is approved
df_oopd = df_oopd.loc[df_oopd['status'] == 'Designated/Approved']

df_oopd = df_oopd.drop(columns=['status', 'date'])
df_oopd.rename(columns={'drug_pubchem_cid': 'drug_id', 'disease_umls': 'disease_id'}, inplace=True)
df_oopd

Unnamed: 0,_id,drug_id,disease_id,drug_name,disease_name
2070,ACTRVOBWPAIOHC-XIXRPRMCSA-N,PUBCHEM.COMPOUND:2724354,UMLS:C0149921,Succimer,lead poisoning in children
1927,APKFDSVGJQXUKY-INPOYWNPSA-N,PUBCHEM.COMPOUND:5280965,UMLS:C0085436,Liposomal amphotericin B,Cryptococcal meningitis
628,APKFDSVGJQXUKY-VTPNKFOWSA-N,PUBCHEM.COMPOUND:118705500,UMLS:C1262313,Amphotericin B lipid complex,Invasive Fungal Infections
1229,AQHHHDLHHXJYJD-UHFFFAOYSA-N,PUBCHEM.COMPOUND:4946,UMLS:C0018916,propranolol,Hemangiomas
49,AUFUWRKPQLGTGF-FMKGYKFTSA-N,PUBCHEM.COMPOUND:20058,UMLS:C0268130,uridine triacetate,OROTIC ACIDURIA
...,...,...,...,...,...
1289,YLMAHDNUQAMNNX-UHFFFAOYSA-N,PUBCHEM.COMPOUND:123596,UMLS:C0346421,Imatinib mesylate,Chronic eosinophilic leukemia
3637,YQNQNVDNTFHQSW-UHFFFAOYSA-N,PUBCHEM.COMPOUND:41684,UMLS:C0010418,Nitazoxanide,Cryptosporidiosis
3029,ZDXPYRJPNDTMRX-VKHMYHEASA-N,PUBCHEM.COMPOUND:5961,UMLS:C0024523,Glutamine,Malabsorption
1903,ZHSKUOZOLHMKEA-UHFFFAOYSA-N,PUBCHEM.COMPOUND:77082,UMLS:C1955690,bendamustine hydrochloride,Extranodal marginal zone B-cell lymphoma


## GTRx

In [5]:
url = 'https://biothings.ncats.io/gtrx/query?q=predicate:treated_by&fetch_all=True'
data = requests.get(url)
gtrx = data.json()
gtrx = gtrx['hits']

In [6]:
df_gtrx = pd.DataFrame(columns=['_id', 'drug_inxight', 'disease_omim', 'drug_name', 'disease_name', 'nref'])

for entry in gtrx:
    # association id
    _id = entry['_id']
    
    # drug info
    # if a drug intervention has multiple entries -> combination therapy, all drug names/ids hyphenated
    drug_name = ''
    drug_inxight = ''
    for drug in entry['object']['intervention']:
        drug_name += (drug['description'] + '-')
        drug_inxight += ('UNII:' + drug['inxight'] + '-')
        
    drug_name = drug_name[:-1]
    drug_inxight = drug_inxight[:-1]
    
    # disease info
    disease_name = entry['subject']['condition_name']
    #disease_name = ','.join(entry['subject']['alternate_names'])
    disease_omim = 'OMIM:' + entry['subject']['omim']
    
    # number of references
    nref = len(entry['references'])
    
    new_record = {
        '_id': _id,
        'drug_name': drug_name,
        'drug_inxight': drug_inxight,
        'disease_name': disease_name,
        'disease_omim': disease_omim,
        'nref': nref
    }
    df_gtrx.loc[len(df_gtrx)] = new_record

In [7]:
# remove interventions with multiple drugs
df_gtrx = df_gtrx[~df_gtrx['drug_inxight'].str.contains('-')]

# restrict to only 1 drug per disease
df_gtrx = df_gtrx.drop_duplicates('disease_omim', keep=False)

df_gtrx = df_gtrx.drop(columns=['nref'])
df_gtrx.rename(columns={'drug_inxight': 'drug_id', 'disease_omim': 'disease_id'}, inplace=True)
df_gtrx

Unnamed: 0,_id,drug_id,disease_id,drug_name,disease_name
4,1037-OMIM:612924-A3ULP0F556,UNII:A3ULP0F556,OMIM:612924,Eculizumab,"HEMOLYTIC UREMIC SYNDROME, ATYPICAL, SUSCEPTIB..."
5,10483-OMIM:145600-F64QU97QCR,UNII:F64QU97QCR,OMIM:145600,Dantrolene,"MALIGNANT HYPERTHERMIA, SUSCEPTIBILITY TO, 1"
35,10591-OMIM:168300-K94FTS1806,UNII:K94FTS1806,OMIM:168300,Flecainide,PARAMYOTONIA CONGENITA OF VON EULENBURG; PMC
52,10593-OMIM:601144-L628TT009W,UNII:L628TT009W,OMIM:601144,Isoprenaline,BRUGADA SYNDROME 1; BRGDA1
56,10593-OMIM:608567-N7Z035406B,UNII:N7Z035406B,OMIM:608567,Cilostazol,SICK SINUS SYNDROME 1; SSS1
...,...,...,...,...,...
660,8979-OMIM:615214-66Y330CJHS,UNII:66Y330CJHS,OMIM:615214,Human immunoglobulin G,"AGAMMAGLOBULINEMIA 7, AUTOSOMAL RECESSIVE"
663,9115-OMIM:212065-O26FZP769L,UNII:O26FZP769L,OMIM:212065,Lorazepam,"CONGENITAL DISORDER OF GLYCOSYLATION, TYPE Ia;..."
670,9413-OMIM:615966-66Y330CJHS,UNII:66Y330CJHS,OMIM:615966,Human immunoglobulin G,IMMUNODEFICIENCY 26 WITH OR WITHOUT NEUROLOGIC...
673,9577-OMIM:614023-452VLY9402,UNII:452VLY9402,OMIM:614023,Serine,PHOSPHOSERINE PHOSPHATASE DEFICIENCY


In [8]:
data_full = pd.concat([df_oopd, df_gtrx])
data_full.to_csv("data_full.tsv", sep="\t", index=False)
data_full

Unnamed: 0,_id,drug_id,disease_id,drug_name,disease_name
2070,ACTRVOBWPAIOHC-XIXRPRMCSA-N,PUBCHEM.COMPOUND:2724354,UMLS:C0149921,Succimer,lead poisoning in children
1927,APKFDSVGJQXUKY-INPOYWNPSA-N,PUBCHEM.COMPOUND:5280965,UMLS:C0085436,Liposomal amphotericin B,Cryptococcal meningitis
628,APKFDSVGJQXUKY-VTPNKFOWSA-N,PUBCHEM.COMPOUND:118705500,UMLS:C1262313,Amphotericin B lipid complex,Invasive Fungal Infections
1229,AQHHHDLHHXJYJD-UHFFFAOYSA-N,PUBCHEM.COMPOUND:4946,UMLS:C0018916,propranolol,Hemangiomas
49,AUFUWRKPQLGTGF-FMKGYKFTSA-N,PUBCHEM.COMPOUND:20058,UMLS:C0268130,uridine triacetate,OROTIC ACIDURIA
...,...,...,...,...,...
660,8979-OMIM:615214-66Y330CJHS,UNII:66Y330CJHS,OMIM:615214,Human immunoglobulin G,"AGAMMAGLOBULINEMIA 7, AUTOSOMAL RECESSIVE"
663,9115-OMIM:212065-O26FZP769L,UNII:O26FZP769L,OMIM:212065,Lorazepam,"CONGENITAL DISORDER OF GLYCOSYLATION, TYPE Ia;..."
670,9413-OMIM:615966-66Y330CJHS,UNII:66Y330CJHS,OMIM:615966,Human immunoglobulin G,IMMUNODEFICIENCY 26 WITH OR WITHOUT NEUROLOGIC...
673,9577-OMIM:614023-452VLY9402,UNII:452VLY9402,OMIM:614023,Serine,PHOSPHOSERINE PHOSPHATASE DEFICIENCY


In [9]:
data_tsv = data_full.sample(n=50, random_state=3000)
data_tsv.to_csv("data.tsv", sep="\t", index=False)
data_tsv

Unnamed: 0,_id,drug_id,disease_id,drug_name,disease_name
312,2519-OMIM:615897-66Y330CJHS,UNII:66Y330CJHS,OMIM:615897,Human immunoglobulin G,IMMUNODEFICIENCY 24
49,AUFUWRKPQLGTGF-FMKGYKFTSA-N,PUBCHEM.COMPOUND:20058,UMLS:C0268130,uridine triacetate,OROTIC ACIDURIA
99,10969-OMIM:212140-0G389FZZ9M,UNII:0G389FZZ9M,OMIM:212140,Levocarnitine,"CARNITINE DEFICIENCY, SYSTEMIC PRIMARY"
2983,FWYSMLBETOMXAG-QHCPKHFHSA-N,PUBCHEM.COMPOUND:45138674,UMLS:C0205725,letermovir,Human cytomegalovirus
3637,YQNQNVDNTFHQSW-UHFFFAOYSA-N,PUBCHEM.COMPOUND:41684,UMLS:C0010418,Nitazoxanide,Cryptosporidiosis
1291,YLMAHDNUQAMNNX-UHFFFAOYSA-N,PUBCHEM.COMPOUND:123596,UMLS:C0221013,Imatinib mesylate,Systemic mastocytosis
544,6269-ORPHA:130-L628TT009W,UNII:L628TT009W,OMIM:6269-ORPHA:130,Isoprenaline,Brugada syndrome
151,1304-OMIM:607398-WI4X0X7BPJ,UNII:WI4X0X7BPJ,OMIM:607398,Hydrocortisone,GLUCOCORTICOID DEFICIENCY 2; GCCD2
1089,LFQSCWFLJHTTHZ-UHFFFAOYSA-N,PUBCHEM.COMPOUND:702,UMLS:C4551472,dehydrated alcohol,Hypertrophic obstructive cardiomyopathy
426,4170-ORPHA:79277-9VU1KI44GP,UNII:9VU1KI44GP,OMIM:4170-ORPHA:79277,Vitamin D,Congenital erythropoietic porphyria
