# OOPD (orphan drugs)
- from API url listed in [github issue](https://github.com/TranslatorSRI/Benchmarks/issues/20) (mychem.info)


In [1]:
import pandas as pd
import requests

In [2]:
from biothings_client import get_client

mc = get_client('chem')
drugs = mc.query(
        '_exists_:fda_orphan_drug', 
        fields='fda_orphan_drug',
        fetch_all=True)

In [3]:
# url = 'https://mychem.info/v1/query?q=_exists_:fda_orphan_drug&fields=fda_orphan_drug&fetch_all=True'
# data = requests.get(url)
# drugs = data.json()
# drugs = drugs['hits']

In [4]:
df = pd.DataFrame(columns=['_id', 'drug_pubchem_cid', 'disease_umls', 'drug_name', 'disease_name', 'status', 'date'])

for drug in drugs:
    
    _id = drug['_id']
    
    # orphan designation 
    for use in drug['fda_orphan_drug']:

        # designation status/approval (may be useful for filtering)
        status = use['designation_status']
        date = use['designated_date']
            

        if isinstance(use['orphan_designation']['umls'], list):
            names = use['orphan_designation']['parsed_text']
            umls = use['orphan_designation']['umls']
        else:
            names = [use['orphan_designation']['parsed_text']]
            umls = [use['orphan_designation']['umls']]
        # for now, skip if there are nested lists of names
        if any(isinstance(el, list) for el in names):
            continue
        name_uml_dict = dict(zip(names, umls))
        
        if 'pubchem_cid' in use.keys():
            if isinstance(use['pubchem_cid'], list):
                continue
            cid = 'PUBCHEM.COMPOUND:' + str(use['pubchem_cid'])
        else:
            cid = ''
            # only include cases where a pubchem cid exists (used for querying)
            continue

        for name, uml in name_uml_dict.items():
            disease_umls = "UMLS:" + uml
            disease_name = name
            new_record = {
                '_id': _id,
                'drug_name': use['generic_name'],
                'drug_pubchem_cid': cid,
                'disease_umls': disease_umls,
                'disease_name': disease_name,
                'status': status,
                'date': date
                }
            #print(new_record)
            df.loc[len(df)] = new_record

df

INFO:biothings.client:Fetching 3661 chem(s) . . .
ERROR:biothings.client:No more results to return.


Unnamed: 0,_id,drug_pubchem_cid,disease_umls,drug_name,disease_name,status,date
0,WXXSNCNJFUAIDG-UHFFFAOYSA-N,PUBCHEM.COMPOUND:11304743,UMLS:C0036421,riociguat,Systemic sclerosis,Designated/Withdrawn,2014-07-24
1,WXXSNCNJFUAIDG-UHFFFAOYSA-N,PUBCHEM.COMPOUND:11304743,UMLS:C3203102,riociguat,PULMONARY ARTERIAL HYPERTENSION,Designated/Approved,2013-09-19
2,WXXSNCNJFUAIDG-UHFFFAOYSA-N,PUBCHEM.COMPOUND:11304743,UMLS:C2363973,riociguat,Chronic thromboembolic pulmonary hypertension,Designated/Approved,2013-09-19
3,WYQPLTPSGFELIB-JTQPXKBDSA-N,PUBCHEM.COMPOUND:443936,UMLS:C0030343,difluprednate,Panuveitis,Designated/Approved,2008-09-30
4,WYQPLTPSGFELIB-JTQPXKBDSA-N,PUBCHEM.COMPOUND:443936,UMLS:C4325554,difluprednate,Traumatic anterior uveitis,Designated/Approved,2008-09-30
...,...,...,...,...,...,...,...
3910,QUIJNHUBAXPXFS-XLJNKUFUSA-N,PUBCHEM.COMPOUND:5388906,UMLS:C0041296,bedaquiline,Active Tuberculosis,Designated/Approved,2005-01-10
3911,QUIJNHUBAXPXFS-XLJNKUFUSA-N,PUBCHEM.COMPOUND:5388906,UMLS:C0041296,bedaquiline,Active Tuberculosis,Designated/Approved,2005-01-10
3912,QUIJNHUBAXPXFS-XLJNKUFUSA-N,PUBCHEM.COMPOUND:5388906,UMLS:C0009450,Bedaquiline,Infection,Designated,2017-07-27
3913,QUIJNHUBAXPXFS-XLJNKUFUSA-N,PUBCHEM.COMPOUND:5388906,UMLS:C0026912,Bedaquiline,Mycobacteria,Designated,2017-07-27


In [5]:
# steps from DrugCentral_creative Benchmark get_indications.py

# sort
df = df.sort_values('_id')

df.to_csv("data_full.tsv", sep="\t", index=False)

# restrict to only 1 drug per disease
df = df.drop_duplicates('disease_umls', keep=False)
df

Unnamed: 0,_id,drug_pubchem_cid,disease_umls,drug_name,disease_name,status,date
1266,ACRHBAYQBXXRTO-OAQYLSRUSA-N,PUBCHEM.COMPOUND:132999,UMLS:C0340515,ivabradine,heart dysfunction,Designated/Withdrawn,2018-07-09
2743,ACSROKXFXFNERX-UHFFFAOYSA-N,PUBCHEM.COMPOUND:51711,UMLS:C0338656,Pramiracetam Sulfate,Cognitive Dysfunction,Designated/Withdrawn,1991-11-04
2070,ACTRVOBWPAIOHC-XIXRPRMCSA-N,PUBCHEM.COMPOUND:2724354,UMLS:C0149921,Succimer,lead poisoning in children,Designated/Approved,1984-05-09
2069,ACTRVOBWPAIOHC-XIXRPRMCSA-N,PUBCHEM.COMPOUND:2724354,UMLS:C0728899,Succimer,Intoxication,Designated,1991-03-22
2000,AKJHMTWEGVYYSE-FXILSDISSA-N,PUBCHEM.COMPOUND:5288209,UMLS:C2700204,fenretinide,Follicular T-cell lymphoma,Designated,2017-12-22
...,...,...,...,...,...,...,...
3640,ZNNLBTZKUZBEKO-UHFFFAOYSA-N,PUBCHEM.COMPOUND:3488,UMLS:C0948008,Glyburide,Ischemic stroke,Designated,2017-01-12
3848,ZSSLCFLHEFXANG-GOSISDBHSA-N,PUBCHEM.COMPOUND:25052630,UMLS:C0020542,Zamicastat,Pulmonary Hypertension,Designated,2019-07-15
3451,ZUJBBVJXXYRPFS-DYKIIFRCSA-N,PUBCHEM.COMPOUND:71722017,UMLS:C1096155,dusquetide,Macrophage Activation Syndrome,Designated,2016-08-10
842,ZXFWJPKXEMFBOG-LWVMDMHWSA-N,PUBCHEM.COMPOUND:118984442,UMLS:C0022660,Anaritide acetate,Acute renal failure,Designated/Withdrawn,1992-08-27


In [6]:
approved_df = df.loc[df['status'] == 'Designated/Approved']

data_tsv = approved_df.sample(50, random_state=3000)
data_tsv.to_csv("data.tsv", sep="\t", index=False)
data_tsv

Unnamed: 0,_id,drug_pubchem_cid,disease_umls,drug_name,disease_name,status,date
1502,KTEIFNKAUNYNJU-GFCCVEGCSA-N,PUBCHEM.COMPOUND:11626560,UMLS:C0024302,crizotinib,large cell lymphoma,Designated/Approved,2012-09-28
3311,JGWRKYUXBBNENE-UHFFFAOYSA-N,PUBCHEM.COMPOUND:25151352,UMLS:C0039106,pexidartinib,Pigmented villonodular synovitis,Designated/Approved,2014-02-14
1409,YFGHCGITMMYXAQ-UHFFFAOYSA-N,PUBCHEM.COMPOUND:4236,UMLS:C4551761,Modafinil,Excessive daytime sleepiness,Designated/Approved,1993-03-15
3862,HXHWSAZORRCQMX-UHFFFAOYSA-N,PUBCHEM.COMPOUND:2082,UMLS:C4553297,Albendazole,Hydatid Disease,Designated/Approved,1996-01-17
3861,HXHWSAZORRCQMX-UHFFFAOYSA-N,PUBCHEM.COMPOUND:2082,UMLS:C0152069,Albendazole,Alveolar echinococcosis,Designated/Approved,1996-01-17
2331,UUVWYPNAQBNQJQ-UHFFFAOYSA-N,PUBCHEM.COMPOUND:2123,UMLS:C5204526,Altretamine,Advanced Adenocarcinoma,Designated/Approved,1984-02-09
2527,CYOHGALHFOKKQC-UHFFFAOYSA-N,PUBCHEM.COMPOUND:10127622,UMLS:C0027831,selumetinib,Neurofibromatosis type 1,Designated/Approved,2018-02-14
361,QXWYKJLNLSIPIN-JGVFFNPUSA-N,PUBCHEM.COMPOUND:92974,UMLS:C1291314,droxidopa,hydroxylase deficiency,Designated/Approved,2007-01-17
1291,YLMAHDNUQAMNNX-UHFFFAOYSA-N,PUBCHEM.COMPOUND:123596,UMLS:C0221013,Imatinib mesylate,Systemic mastocytosis,Designated/Approved,2005-09-09
3700,GAQMWPRWVIGRRV-IVDGIBIRSA-N,PUBCHEM.COMPOUND:70683024,UMLS:C0854467,Pegfilgrastim,Myelosuppression,Designated/Approved,2013-11-20
