In [510]:
import requests

# rate limiting is important to avoid accidental service abuse of the OpenFDA API provider
from ratelimit import limits, sleep_and_retry

# cache API calls in a sqllite file to reduce the number of requests to openfda server
import requests_cache
requests_cache.install_cache('openfda_cache')

OPENFDA_API = "https://api.fda.gov/drug/event.json"

@sleep_and_retry
@limits(calls=40, period=60)
def call_api(params):
    """
    OpenFDA API call. Respects rate limit. Overrides default data limit
    Input: dictionary with API parameters {search: '...', count: '...'}
    Output: nested dictionary representation of the JSON results section
    
    OpenFDA API rate limits:
         With no API key: 40 requests per minute, per IP address. 1000 requests per day, per IP address.
         With an API key: 240 requests per minute, per key. 120000 requests per day, per key.
    """
    if not params:
        params = {}
    params['limit'] = params.get('limit', 1000)
    response = requests.get(OPENFDA_API, params=params)
    print(response.url)

    if response.status_code != 200:
        raise Exception('API response: {}'.format(response.status_code))
    return response.json()['results']

OPENFDA_METADATA_YAML = "https://open.fda.gov/fields/drugevent.yaml"
# munch is a yaml parser with javascript-style object access
from munch import Munch

def api_meta():
    """
    YAML file with field description and other metadata retrieved from the OpenFDA website
    Parses YAML file and provides syntactic sugar for accessing nested dictionaries
    Example: .patient.properties.patientagegroup.possible_values.value
    Note: reserved words, such as count and items still have to be accessed via ['count'], ['items']
    """
    response = requests.get(OPENFDA_METADATA_YAML)
    if response.status_code != 200:
        raise Exception('Could not retrieve YAML file with drug event API fields')
    y = Munch.fromYAML(response.text)
    return y['properties']

In [511]:
api_meta().patient.properties.keys()

dict_keys(['drug', 'patientagegroup', 'patientdeath', 'patientonsetage', 'patientonsetageunit', 'patientsex', 'patientweight', 'reaction', 'summary'])

In [512]:
import pandas as pd
import numpy as np
import datetime

start_date = '20110101'
end_date = '20110304'


# For example, 20100729 is 07/29/2010"
#start_date = input("Enter the beginning of your desired date range: " )
#end_date = input("Enter the end of your desired date range: ")

#country_list = input("Enter the countries you would like to limit your search to: ")

# create a range of all dates between start and end date
my_range = pd.date_range(start=start_date, end=end_date)

# reformat the dates to match FAERS
f_range = []
for dt in my_range:
    y = str(dt)[0:4]
    m = str(dt)[5:7]
    d = str(dt)[8:10]
    new_dt = y + m + d
    f_range.append(new_dt)


In [513]:
#safetyreportid of test entry: 10574310

columbia_api_key = 'Og4jAa0KIhPJkiwaxXVD6VHp3DGqoQf37JFPeRct'

# test strings for API query search fields
date_query = 'patient.summary.narrativeincludeclinical:('

d_sub = '"CASE EVENT DATE:'

# add each date in range to date_query
date_ind = 0
num_dates = len(f_range)
for ymd in f_range:
    date_ind += 1
    if date_ind < num_dates:
        date_query += d_sub + ymd + '"' + " OR "
    else:
        date_query += d_sub + ymd + '"' + ")"
#country_query = 'primarysource.reportercountry:"FR"'
#country_query = '(primarysource.reportercountry:"FR" OR occurcountry:"FR" OR primarysourcecountry:"FR")'
#country_query = 'primarysource.reportercountry:("FR" OR "DE" OR "ES" OR "IT" OR "CH")'
country_query = 'occurcountry:("FR" OR "DE" OR "ES" OR "IT" OR "CH")'
reaction_query = 'patient.reaction.reactionmeddrapt:("Heat exhaustion" OR "Heat stroke")'


#'search': "receivedate:[20040101 TO 20200101] AND patient.reaction.reactionmeddrapt.exact: {}".format(


test_out = call_api({
    'limit': 50,
    'api_key': columbia_api_key,
    #'search': 'transmissiondate:[20040101 TO 20150630]' + ' AND ' + country_query + ' AND ' + date_query
    'search': 'transmissiondate:[20040101 TO 20150630]' + ' AND ' + reaction_query
    #'search': 'transmissiondate:[20150630 TO 20150730]' + ' AND ' + country_query
    #'search': country_query + ' AND ' + date_query
})


https://api.fda.gov/drug/event.json?limit=50&api_key=Og4jAa0KIhPJkiwaxXVD6VHp3DGqoQf37JFPeRct&search=transmissiondate%3A%5B20040101+TO+20150630%5D+AND+patient.reaction.reactionmeddrapt%3A%28%22Heat+exhaustion%22+OR+%22Heat+stroke%22%29


In [514]:
#call_api returns results, which is a list of dictionaries: [ {...}, {...}, {...} ]

write_obj = open('results.txt', 'w')
write_obj.close()
results_obj = open('results.txt', 'a')
#results_obj.write(str(x['patient']['drug']))

safetyreportid_list = []
# iterate through each entry
# store safetyreportid
# TODO: demographic info?
for entry in test_out:
    # safetyreportid
    results_obj.write("report ID: ")
    results_obj.write(str(entry['safetyreportid']) + "\n")
    safetyreportid_list.append(str(entry['safetyreportid']))

    # reportercountry
    results_obj.write("\treportercountry: " + str(entry['primarysource']['reportercountry']) + "\n")

    # sex
    if 'patientsex' in entry['patient']:
        results_obj.write("\tsex: " + str(entry['patient']['patientsex']) + "\n")

    # date
    if 'summary' in entry['patient']:
        results_obj.write("\t" + str(entry['patient']['summary']['narrativeincludeclinical']) + "\n")
    

results_obj.close()

In [515]:

# get lines in file that start with the given string
# used to search standard_case_drug to find all drugs in a given reportid 
#def generate_last_cols_lines_that_equal(string, fp, case):
def generate_lines_that_equal(string, fp, case):
    # iterate through each line in the given file
    yielded = set() # for ensuring only non-duplicate drug concepts ids are yielded
    for line in fp:
        # if there is a match
        if line.startswith(string):
            # split line by tabs
            line_2 = line.strip("\n")
            tab_split = line_2.split('\t')
            # get the last column (concept id) and second to last column (role_cod)
            # non-duplicate drug concepts
            if case == 0:
                lc = tab_split[(len(tab_split)-1)]
                # role_cod
                rc = tab_split[3]
                # ensure no duplicates
                if tuple([lc, rc]) in yielded:
                    continue
                yield [lc, rc]
                yielded.add(tuple([lc, rc]))
            # RxNorm drug names
            elif case == 1:
                lc = tab_split[1]
                yield lc
            # SNOMED outcome concepts from standard_case_outcome
            elif case == 2:
                lc = tab_split.pop()
                yield lc


In [516]:
# dictionary template for a drug
drug_template = { "drug_concept_id": None, "drug_name": None, "role_cod": None, "KEGG": {
     "get_URL": None, "D_number": None, "Classes": [], "Target": [ ], "Pathway": None }  
    }

sub_dict = { "drugs": [ ], "reactions_SNOMED": [], "reactions_MedDRA:": []}
# creates a dictionary entry for each id
results_dict = dict( (safetyreportid, sub_dict.copy()) for safetyreportid in safetyreportid_list )
#print(results_dict)

In [517]:

# adds a drug subdictionary for each drug for each safetyreportid key
def add_AEOLUS_data_to_dictionary(res_dict):
    # search standard_case_drug.tsv for each safetyreportid key
    # adds drug subdictionary for each drug for each report
    with open("doi_10.5061_dryad.8q0s4__v1/aeolus_v1/standard_case_drug.tsv", "r") as fp:
        for pid in results_dict:
            #id_vals = []
            # generate a list of drug_ids associated with each safetyreportID
            for i in generate_lines_that_equal(str(pid), fp, 0):
                template_copy = drug_template.copy()
                # assign drug_concept_ids
                template_copy['drug_concept_id'] = i[0]
                # assign role_cod (primary suspect, secondary suspect, concomitant, etc)
                template_copy['role_cod'] = i[1]
                drugs_sub_dict_copy = results_dict[pid]['drugs'].copy()
                drugs_sub_dict_copy.append(template_copy)
                results_dict[pid]['drugs'] = drugs_sub_dict_copy
            # go back to beginning of file
            fp.seek(0)
    # adds outcome for each safetyreportid key by searching standard_case_outcome file
    with open("doi_10.5061_dryad.8q0s4__v1/aeolus_v1/standard_case_outcome.tsv") as fp:
        for pid in results_dict:
            # generate a list of MedDRA reactions and SNOMED reactions associated with each safetyreportID
            for i in generate_lines_that_equal(str(pid), fp, 2):
                reactions_sub_dict_copy = results_dict[pid]['reactions_SNOMED'].copy()
                reactions_sub_dict_copy.append(i)
                results_dict[pid]['reactions_SNOMED'] = reactions_sub_dict_copy
            # go back to beginning of file
            fp.seek(0)
        
add_AEOLUS_data_to_dictionary(results_dict)

In [None]:
# adds drug names from RxNorm by searching drug concept IDs
for pid in results_dict:
    for drug in results_dict[pid]['drugs']:
        d_id = drug['drug_concept_id']
        drug_name_vals = []
        # search standard_case_drug for ids
        with open("RxNorm_vocab/CONCEPT.csv", "r") as fp:
            for i in generate_lines_that_equal(d_id, fp, 1):
                drug_name_vals.append(i)
                drug['drug_name'] = i


In [None]:
KEGG_find_base = 'https://rest.kegg.jp/find/drug/'
KEGG_get_base = 'https://rest.kegg.jp/get/'


# function to generate the appropriate URLs for KEGG API queries
# uses the find request to search a drug string, then determines the d-number of the drug for the get-url 
def KEGG_find_query(drug_string):
    find_url = KEGG_find_base + drug_string
    find_req = requests.get(find_url)
    a = find_req.text.split("\n")
    b = []
    for l in a:
        b.append(l.split("\t"))
    b.pop()
    # iterate through find entries to find INN names, extract the d number
    # if multiple entries contain INN, store the first one
    # if none, store the first
    for found_entry in b:
        if "INN" in found_entry[1]:
            get_url = KEGG_get_base + found_entry[0]
            x = found_entry[0].split(":")
            d_number = x[1]
            # put d number at end of get URL
            return [find_url, get_url, d_number]
    #print("find str:", b[0])
    temp = b[0][0].split(":")
    d_number = temp[1]
    return [find_url, KEGG_get_base + b[0][0], d_number]


# send get request to Kegg and store drug targets, pathway, and class
def KEGG_get_query(get_URL):
    target = []
    pathway = []
    class_info = []
    get_req = requests.get(get_URL)
    klines = get_req.iter_lines()
    # booleans to keep track of if a line stores info on pathway, class, or target
    is_targ = 0
    is_path = 0
    is_class = 0
    # iterate through get response
    for line in klines:
        dec = line.decode("utf-8")
        # line has class info
        if dec.startswith("CLASS"):
            is_class = 1
        # line does not have class info
        if dec.startswith("REMARK"):
            is_class = 0
        # line has target info
        if dec.startswith("TARGET"):
            is_targ = 1
        # line has pathway info, not target info
        if dec[2:9]=="PATHWAY":
            is_path = 1
            is_targ = 0
        # line no longer has path info
        if dec.startswith("INTERACTION"):
            is_targ = 0
            is_path = 0
        # if the line corresponds to a field of interest, store it
        if is_class == 1: 
            #print(dec)
            # gets the name and dg-number of the most specific KEGG drug groups associated with the given d-number
            class_info.append(dec[12:])
        elif is_targ == 1: target.append(dec[12:])
        elif is_path == 1: pathway.append(dec[12:])
    # store most specific drug groups
    class_list = []
    for ind in range(len(class_info)):
        #print(class_info[ind])
        # add last class line by default
        if ind == len(class_info) - 1:
            split_line = class_info[ind].strip(" ").split("  ")
            class_list.append([split_line[0], split_line[1]])
            break
        if " DG" in class_info[ind]:
            # not the most specific drug group; continue to next line
            if " DG" in class_info[ind+1]: continue
            else: 
                split_line = class_info[ind].strip(" ").split("  ")
                class_list.append([split_line[0], split_line[1]])
    #print("\n", class_list)
    return(target, pathway, class_list)

#x = KEGG_get_query("https://rest.kegg.jp/get/dr:D02714")
#print(safetyreportid_list)
for x in results_dict:
    print(x, results_dict[x])
    print("\n")
          

10020013 {'drugs': [{'drug_concept_id': '1112921', 'drug_name': 'ipratropium', 'role_cod': 'PS', 'KEGG': {'get_URL': 'https://rest.kegg.jp/get/dr:D02212', 'D_number': 'D02212', 'Classes': [['DG01491', 'Muscarinic cholinergic receptor antagonist']], 'Target': ['CHRM [HSA:1128 1129 1131 1132 1133] [KO:K04129 K04130 K04131 K04132 K04133]'], 'Pathway': ['hsa04024(1128+1129)  cAMP signaling pathway', 'hsa04080(1128+1129+1131+1132+1133)  Neuroactive ligand-receptor interaction', 'hsa04725(1128+1129+1131+1132+1133)  Cholinergic synapse']}}, {'drug_concept_id': '1149380', 'drug_name': 'fluticasone', 'role_cod': 'C', 'KEGG': {'get_URL': 'https://rest.kegg.jp/get/dr:D06315', 'D_number': 'D06315', 'Classes': [['DG02068', 'Glucocorticoid'], ['DG02913', 'CYP3A4 substrate']], 'Target': ['NR3C1 (GR) [HSA:2908] [KO:K05771]'], 'Pathway': ['hsa04080(2908)  Neuroactive ligand-receptor interaction', 'Enzyme: CYP3A4 [HSA:1576]']}}, {'drug_concept_id': '1154343', 'drug_name': 'albuterol', 'role_cod': 'C', '

In [None]:
import traceback

# update dictionary with info from KEGG API requests
# copy for testing
results_dict1 = results_dict.copy()
def get_KEGG_info():
    for reportID in results_dict1:
        for drug in results_dict1[reportID]['drugs']:
            if type(drug['KEGG']) == dict:
                a = drug['KEGG'].copy()
            else:
                a = drug['KEGG']
            # try fixes for common problems with find request
            try:
                kegg_find_results = KEGG_find_query(drug['drug_name'])
            except:
                print("\ncatch 1!")
                print("case id:", reportID)
                print(KEGG_find_base + drug['drug_name'])
                print("concept id:", drug['drug_concept_id'])
                # catching exception with drugs with hyphens
                try:
                    # try fix for mixture drugs
                    fix1 = drug['drug_name'].split(" ")
                    print("try2:", KEGG_find_base + fix1[0] + " " + fix1[2])
                    kegg_find_results = KEGG_find_query(fix1[0] + " " + fix1[2])
                    print(kegg_find_results)
                    print("fix1: ", KEGG_find_base + fix1[0] + " " + fix1[2])
                except Exception as e0:
                    print(traceback.format_exc())
                    # try fix for drugs with hyphens
                    try:
                        print("catch 2!")
                        print("error message:", e0)
                        fix2 = drug['drug_name'].replace("-", " ")
                        kegg_find_results = KEGG_find_query(fix2)
                        print("fixed 2:", fix2)
                    except Exception as e1:
                        print(traceback.format_exc())
                        print("catch 3!")
                        print("error message:", e1)
                        print("No KEGG:", drug['drug_name'])
                        a = "No entry"
                        drug['KEGG'] = a
                        continue    
            kegg_get_results = KEGG_get_query(kegg_find_results[1])
            if type(a) == dict:
                a['get_URL'] = kegg_find_results[1]
                a['D_number'] = kegg_find_results[2]
                a['Target'] = kegg_get_results[0]
                a['Pathway'] = kegg_get_results[1]
                a['Classes'] = kegg_get_results[2]
                drug['KEGG'] = a

get_KEGG_info()



catch 1!
case id: 10085688
https://rest.kegg.jp/find/drug/metformin / rosiglitazone Oral Tablet
concept id: 40063306
try2: https://rest.kegg.jp/find/drug/metformin rosiglitazone
['https://rest.kegg.jp/find/drug/metformin rosiglitazone', 'https://rest.kegg.jp/get/dr:D10244', 'D10244']
fix1:  https://rest.kegg.jp/find/drug/metformin rosiglitazone

catch 1!
case id: 10332098
https://rest.kegg.jp/find/drug/calcium chloride / glucose / lactate / magnesium chloride / sodium chloride Intraperitoneal Solution
concept id: 40166942
try2: https://rest.kegg.jp/find/drug/calcium /
['https://rest.kegg.jp/find/drug/calcium /', 'https://rest.kegg.jp/get/dr:D00934', 'D00934']
fix1:  https://rest.kegg.jp/find/drug/calcium /

catch 1!
case id: 10332098
https://rest.kegg.jp/find/drug/calcium chloride / glucose / lactate / magnesium chloride / sodium chloride Intraperitoneal Solution
concept id: 40166942
try2: https://rest.kegg.jp/find/drug/calcium /
['https://rest.kegg.jp/find/drug/calcium /', 'https://r

In [None]:
# TODO: 
# Figure out what to do when there are multiple lines with INN
# Figure out what to do with safetyreportids with no corresponding AEOLUS entries

In [None]:
#print(safetyreportid_list)
for x in results_dict1:
    print(x, results_dict1[x])
    print("\n")


10020013 {'drugs': [{'drug_concept_id': '1112921', 'drug_name': 'ipratropium', 'role_cod': 'PS', 'KEGG': {'get_URL': 'https://rest.kegg.jp/get/dr:D02212', 'D_number': 'D02212', 'Classes': [['DG01491', 'Muscarinic cholinergic receptor antagonist']], 'Target': ['CHRM [HSA:1128 1129 1131 1132 1133] [KO:K04129 K04130 K04131 K04132 K04133]'], 'Pathway': ['hsa04024(1128+1129)  cAMP signaling pathway', 'hsa04080(1128+1129+1131+1132+1133)  Neuroactive ligand-receptor interaction', 'hsa04725(1128+1129+1131+1132+1133)  Cholinergic synapse']}}, {'drug_concept_id': '1149380', 'drug_name': 'fluticasone', 'role_cod': 'C', 'KEGG': {'get_URL': 'https://rest.kegg.jp/get/dr:D06315', 'D_number': 'D06315', 'Classes': [['DG02068', 'Glucocorticoid'], ['DG02913', 'CYP3A4 substrate']], 'Target': ['NR3C1 (GR) [HSA:2908] [KO:K05771]'], 'Pathway': ['hsa04080(2908)  Neuroactive ligand-receptor interaction', 'Enzyme: CYP3A4 [HSA:1576]']}}, {'drug_concept_id': '1154343', 'drug_name': 'albuterol', 'role_cod': 'C', '