In [2]:
import requests

import numpy as np
import pandas as pd

import datetime as dt

import matplotlib.pyplot as plt
import seaborn as sns

from ast import literal_eval

# Lift pandas' display truncation so frames are rendered in full.
pd.options.display.max_columns = None  # show every column of a DataFrame
pd.options.display.max_rows = None  # show every row of a DataFrame

# Data from openFDA NDC

Load data

In [3]:
# Read the raw openFDA NDC export; the file uses '~' as its field delimiter.
df_openFDA_NDC = pd.read_csv(
    'raw_openFDA_NDC_data.csv',
    sep='~',
)

Clean data and remove duplicates

In [6]:
# Reduce the table to its 'name' column, lower-cased so that names differing
# only in letter case collapse together, then keep one row per distinct name.
df_openFDA_NDC = (
    df_openFDA_NDC['name']
    .str.lower()
    .drop_duplicates()
    .to_frame()
)

# Data from NIH

In [7]:
# Query the NIH RxNav "RxClass" API for the classes of every openFDA drug name.
# Only two kinds of relation are kept per drug:
#   * rela == 'has_epc'   with relaSource == 'DAILYMED'
#   * rela == 'may_treat' with relaSource == 'MEDRT'
# The result is `df_NIH`, the row-wise concatenation of the per-drug frames.
RXCLASS_BY_DRUG_NAME_URL = 'https://rxnav.nlm.nih.gov/REST/rxclass/class/byDrugName.json'

count = 0  # number of drug names processed
nih_frames = []  # one filtered frame per drug whose lookup succeeded
skipped_drugs = []  # names whose request failed or whose response had no class data

for drug_name in df_openFDA_NDC.iloc[:, 0]:  # first column holds the drug names
    count += 1

    try:
        # Passing the name through `params` lets requests URL-encode it, so names
        # containing spaces, '&', '#' or '+' are transmitted intact.
        r = requests.get(
            RXCLASS_BY_DRUG_NAME_URL,
            params={'drugName': str(drug_name)},
            timeout=20,
        )
        r.raise_for_status()
        data_raw = r.json()

        # A response without these keys (KeyError) means no class data for this name.
        data = data_raw['rxclassDrugInfoList']
        df_temp = pd.json_normalize(data['rxclassDrugInfo'])

        df_temp_epc = df_temp.loc[(df_temp['rela'] == 'has_epc') & (df_temp['relaSource'] == 'DAILYMED')]
        df_temp_dis = df_temp.loc[(df_temp['rela'] == 'may_treat') & (df_temp['relaSource'] == 'MEDRT')]

        df_temp = pd.concat([df_temp_epc, df_temp_dis], ignore_index=True)

    except (requests.RequestException, ValueError, KeyError, TypeError):
        # Best-effort lookup: skip this drug but remember it. Only the expected
        # failure modes are caught (network/HTTP error, invalid JSON, missing
        # keys), so KeyboardInterrupt and genuine bugs are no longer swallowed.
        skipped_drugs.append(drug_name)
        continue

    nih_frames.append(df_temp)

if not nih_frames:
    raise RuntimeError('No drug class data could be retrieved from RxNav for any drug name')

# Concatenate once at the end: this avoids re-copying the growing frame on every
# iteration and no longer depends on the very first lookup having succeeded.
df_NIH = pd.concat(nih_frames)

print(f'{len(nih_frames)} of {count} drug names returned class data; {len(skipped_drugs)} skipped')

Removing duplicates based on generic name

In [None]:
# Keep a single row per `minConcept.name` value (the first occurrence wins).
# NOTE(review): this also discards any further class / may_treat rows of the
# same drug, which feeds into the per-class counts below — confirm intended.
df_NIH = df_NIH.drop_duplicates(subset='minConcept.name', keep='first')

# Saving Data to CSV

In [None]:
# Persist the NIH table as a '~'-delimited file, leaving out the index column.
df_NIH.to_csv(
    'nih_data.csv',
    sep='~',
    index=False,
)

# Data Overview

Grouping by disease and drug class

In [None]:
# Number of rows in the NIH table.
df_NIH.shape[0]

In [None]:
# Count the non-null `minConcept.name` entries per (class type, class name)
# pair, order the pairs from most to fewest, and show the top 20.
df_NIH_gro_class = (
    df_NIH
    .groupby(['rxclassMinConceptItem.classType', 'rxclassMinConceptItem.className'])['minConcept.name']
    .count()
    .to_frame()
    .sort_values(by='minConcept.name', ascending=False)
)
df_NIH_gro_class.head(20)