In [1]:
import json
import regex as re

import pandas as pd

# Lift the column cap so DataFrame output shows every column.
pd.options.display.max_columns = None

# Data from OpenFDA NDC

In [2]:
# Read the downloaded JSON file. The encoding is stated explicitly so decoding
# does not depend on the platform's default text encoding.
with open('raw_openFDA_org_NDC_data.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# Build the DataFrame from the 'results' entry of the parsed JSON.
df_openFDA_NDC = pd.DataFrame(data['results'])

# Report how many rows were loaded.
print(len(df_openFDA_NDC))

130115


### Data manipulation

Removing all products that are not human prescription drugs, as well as drugs whose marketing category is one of: EMERGENCY USE AUTHORIZATION, UNAPPROVED DRUG FOR USE IN DRUG SHORTAGE, UNAPPROVED DRUG OTHER

In [3]:
# Keep only the rows whose product_type is exactly 'HUMAN PRESCRIPTION DRUG'.
is_human_rx = df_openFDA_NDC['product_type'] == 'HUMAN PRESCRIPTION DRUG'
df_openFDA_NDC = df_openFDA_NDC[is_human_rx]

In [4]:
# Marketing categories that must not appear in the cleaned dataset.
excluded_marketing_categories = [
    'EMERGENCY USE AUTHORIZATION',
    'UNAPPROVED DRUG FOR USE IN DRUG SHORTAGE',
    'UNAPPROVED DRUG OTHER',
]

# Drop every row whose marketing_category is in the exclusion list, using a single
# membership mask. `isin` is False for missing values, so rows without a
# marketing_category are kept.
df_openFDA_NDC = df_openFDA_NDC.loc[
    ~df_openFDA_NDC['marketing_category'].isin(excluded_marketing_categories)
]

Splitting the product NDC into a labeler code and a product code

In [5]:
# Split 'product_ndc' at its first '-' only: column 0 holds the text before the
# hyphen, column 1 everything after it.
df_temp = df_openFDA_NDC['product_ndc'].str.split("-", n = 1, expand = True)

# Add both codes as numeric columns in one step. `assign` returns a new DataFrame,
# so no column is written into a frame that was derived by filtering another one
# (which is what triggers pandas' SettingWithCopyWarning).
# Note: the numeric conversion does not preserve leading zeros in either segment.
df_openFDA_NDC = df_openFDA_NDC.assign(
    labeler_code = pd.to_numeric(df_temp[0]),
    product_code = pd.to_numeric(df_temp[1]),
)

Removing every character that is not a letter or a digit from route

In [6]:
# Turn each route value into its string form, then delete every run of characters
# other than A-Z, a-z and 0-9.
non_alphanumeric = re.compile('[^A-Za-z0-9]+')
route_text = df_openFDA_NDC['route'].astype(str)
df_openFDA_NDC['route'] = route_text.apply(lambda value: non_alphanumeric.sub('', value))

Creating a unique ID

In [7]:
# unique_id is "<generic_name>-<dosage_form>-<route>", each part converted to text first.
generic_name_text = df_openFDA_NDC['generic_name'].astype(str)
dosage_form_text = df_openFDA_NDC['dosage_form'].astype(str)
route_part_text = df_openFDA_NDC['route'].astype(str)
df_openFDA_NDC['unique_id'] = generic_name_text + '-' + dosage_form_text + '-' + route_part_text

Removing duplicates based on unique id, labeler and product code

In [8]:
# Rows that agree on all three of these columns count as duplicates;
# only the first occurrence of each combination is kept.
dedup_columns = ['unique_id', 'labeler_code', 'product_code']
df_openFDA_NDC = df_openFDA_NDC.drop_duplicates(subset = dedup_columns)

# Show the number of rows remaining.
print(df_openFDA_NDC.shape[0])

48907


# Saving output

In [9]:
# Write the cleaned table to a '~'-delimited file, leaving out the DataFrame index.
df_openFDA_NDC.to_csv(
    'raw_openFDA_NDC_data.csv',
    index = False,
    sep = '~',
)