# Libraries

In [2]:
import json
import os
from ast import literal_eval

import pandas as pd
import requests

pd.options.display.max_columns = None # never truncate columns when a DataFrame is displayed

# Data from openFDA Drugs@FDA

In [118]:
# Download the Drugs@FDA records from openFDA in pages of 1000 results.
#
# The API key is read from the OPENFDA_API_KEY environment variable so that no
# credential is stored in the notebook; when the variable is unset the requests
# are sent without a key.
api_key = os.environ.get('OPENFDA_API_KEY')

pages = [] # one normalized DataFrame per downloaded page

# 25374 is the number of records this download returned (see the printed length below).
for offset_i in range(0, 25374, 1000):

    payload = {'limit': '1000', 'skip': offset_i}
    if api_key:
        payload['api_key'] = api_key

    r = requests.get('https://api.fda.gov/drug/drugsfda.json?', params=payload, timeout = 20)
    r.raise_for_status() # stop on an HTTP error instead of failing later on a missing 'results' key
    data = r.json()

    pages.append(pd.json_normalize(data['results']))

# Concatenate once at the end: DataFrame.append was removed in pandas 2.0, and
# growing a frame inside the loop re-copies all previous rows on every iteration.
df_openFDA = pd.concat(pages, ignore_index = True)

print(len(df_openFDA))

25374


Saving the raw download as a CSV file so the API requests do not have to be repeated

In [119]:
# Write the downloaded frame to disk, '~'-separated and without the index column.
df_openFDA.to_csv(path_or_buf = 'raw_openFDA_org_data.csv', sep = '~', index = False)

In [3]:
# Load the '~'-separated raw file back into a DataFrame.
df_openFDA = pd.read_csv(filepath_or_buffer = 'raw_openFDA_org_data.csv', sep = '~')

### Data manipulation

Normalizing products and active ingredients

In [4]:
# Expand the 'products' column so that every product becomes its own row,
# carrying the columns of the application row it came from.
df_openFDA = df_openFDA.dropna(subset = ['products'])
df_openFDA['products'] = df_openFDA['products'].apply(literal_eval) # text from the CSV -> Python objects

# Normalize each row's product list and tag it with that row's index label ('id')
# so it can be joined back to the original data. The frames are collected in a
# list and concatenated once: this avoids re-copying the growing result on every
# iteration and does not rely on a row labelled 0 still being present after the
# dropna above.
product_frames = [
    pd.json_normalize(products).assign(id = row_id)
    for row_id, products in df_openFDA['products'].items()
]
df_products_nor = pd.concat(product_frames).set_index('id') # using id as index

df_openFDA = pd.merge(df_openFDA, df_products_nor, left_index = True, right_index = True) # merging normalized product to enrich the orig. data
df_openFDA = df_openFDA.drop(columns=['products'])

Filtering on first product number and removing drugs sold over-the-counter

In [5]:
# Keep only rows whose product number is 1 and whose marketing status is not
# 'Over-the-counter'.
df_openFDA = df_openFDA.assign(product_number = pd.to_numeric(df_openFDA['product_number']))
is_first_product = df_openFDA['product_number'].eq(1)
is_not_otc = df_openFDA['marketing_status'].ne('Over-the-counter')
df_openFDA = df_openFDA.loc[is_first_product & is_not_otc]

Unpacking generic name

In [6]:
# Drop rows without a generic name, parse the stored text with literal_eval,
# then explode the column so each element gets its own row.
df_openFDA = (
    df_openFDA
    .dropna(subset = ['openfda.generic_name'])
    .assign(**{'openfda.generic_name': lambda frame: frame['openfda.generic_name'].apply(literal_eval)})
    .explode('openfda.generic_name')
)

Creating a unique id

In [7]:
# unique_id is "<generic name>-<dosage form>-<route>", with each part cast to str.
name_part = df_openFDA['openfda.generic_name'].astype(str)
form_part = df_openFDA['dosage_form'].astype(str)
route_part = df_openFDA['route'].astype(str)
df_openFDA['unique_id'] = name_part + '-' + form_part + '-' + route_part

Unpacking list for ndc

In [8]:
# Drop rows without a product NDC, parse the stored text with literal_eval,
# then explode the column so each element gets its own row.
df_openFDA = (
    df_openFDA
    .dropna(subset = ['openfda.product_ndc'])
    .assign(**{'openfda.product_ndc': lambda frame: frame['openfda.product_ndc'].apply(literal_eval)})
    .explode('openfda.product_ndc')
)

Splitting ndc into labeler and product code

In [9]:
# Split each NDC at its first '-' into two parts and store both as numbers:
# the part before the dash is the labeler code, the rest is the product code.
ndc_parts = df_openFDA['openfda.product_ndc'].str.split("-", n = 1, expand = True)
df_openFDA['labeler_code'] = pd.to_numeric(ndc_parts[0])
df_openFDA['product_code'] = pd.to_numeric(ndc_parts[1])

Removing duplicates based on unique id, labeler and product code

In [10]:
# Report the row count, keep the first row for every
# (unique_id, labeler_code, product_code) combination, then report again.
dedup_keys = ['unique_id', 'labeler_code', 'product_code']
rows_before = len(df_openFDA)
print('Before:', rows_before)
df_openFDA = df_openFDA.drop_duplicates(subset = dedup_keys, keep = 'first')
rows_after = len(df_openFDA)
print('After:', rows_after)

Before: 34248
After: 32375


# Saving output

In [11]:
# Write the processed frame to disk, '~'-separated and without the index column.
df_openFDA.to_csv(path_or_buf = 'raw_openFDA_data.csv', sep = '~', index = False)