# Libraries

In [159]:
import json
import regex as re

from ast import literal_eval

import pandas as pd
import numpy as np

import dtale

pd.set_option('display.max_columns', None) # display all columns in DF

# Data from OpenFDA NDC

The file "raw_openFDA_org_NDC_data.json" can be downloaded at openFDA. Use the following link: https://download.open.fda.gov/drug/ndc/drug-ndc-0001-of-0001.json.zip

In [160]:
with open('raw_openFDA_org_NDC_data.json', 'r') as f: # reading download json file
    data = json.load(f)
    
df_openFDA_NDC_org = pd.DataFrame(data['results']) # creating a DataFrame

print(len(df_openFDA_NDC_org))

130115


Removing unwished data

Removing all non human prescription drugs and drugs of the type: EMERGENCY USE AUTHORIZATION, UNAPPROVED DRUG FOR USE IN DRUG SHORTAGE, UNAPPROVED DRUG OTHER. ALso removing not finished drugs as well as marketing category: UNAPPROVED MEDICAL GAS, DRUG FOR FURTHER PROCESSING and UNAPPROVED HOMEOPATHIC

In [161]:
df_openFDA_NDC = df_openFDA_NDC_org
df_openFDA_NDC = df_openFDA_NDC.loc[df_openFDA_NDC['finished'] != 'TRUE']

In [162]:
df_openFDA_NDC = df_openFDA_NDC.loc[df_openFDA_NDC['product_type'] == 'HUMAN PRESCRIPTION DRUG']

In [163]:
df_openFDA_NDC = df_openFDA_NDC.loc[df_openFDA_NDC['marketing_category'] != 'EMERGENCY USE AUTHORIZATION']
df_openFDA_NDC = df_openFDA_NDC.loc[df_openFDA_NDC['marketing_category'] != 'UNAPPROVED DRUG FOR USE IN DRUG SHORTAGE']
df_openFDA_NDC = df_openFDA_NDC.loc[df_openFDA_NDC['marketing_category'] != 'UNAPPROVED DRUG OTHER']
df_openFDA_NDC = df_openFDA_NDC.loc[df_openFDA_NDC['marketing_category'] != 'UNAPPROVED MEDICAL GAS']
df_openFDA_NDC = df_openFDA_NDC.loc[df_openFDA_NDC['marketing_category'] != 'DRUG FOR FURTHER PROCESSING']
df_openFDA_NDC = df_openFDA_NDC.loc[df_openFDA_NDC['marketing_category'] != 'UNAPPROVED HOMEOPATHIC']

Removing drugs without active ingredients

In [164]:
df_openFDA_NDC = df_openFDA_NDC.dropna(subset = ['active_ingredients'])

In [165]:
len(df_openFDA_NDC)

47968

# Unpacking

### Unpacking is original packager from openfda under results

In [166]:
df_openFDA_NDC_openFDA_org_pack = pd.DataFrame(df_openFDA_NDC['openfda'])

df_openFDA_NDC_openFDA_org_pack['org_index'] = df_openFDA_NDC_openFDA_org_pack.index

count = 0

for i, j in zip(df_openFDA_NDC_openFDA_org_pack['openfda'], df_openFDA_NDC_openFDA_org_pack['org_index']): # normalizing product and adding id for merge
    
    count = count + 1
    
    df_temp = pd.DataFrame(pd.json_normalize(i))
    df_temp['org_index'] = j
    
    if count == 1:
        df_openFDA_NDC_openFDA_org_pack_nor = df_temp
    else:
        df_openFDA_NDC_openFDA_org_pack_nor = pd.concat([df_openFDA_NDC_openFDA_org_pack_nor, df_temp])
        
df_openFDA_NDC_openFDA_org_pack_nor = df_openFDA_NDC_openFDA_org_pack_nor.set_index('org_index')

df_openFDA_NDC_openFDA_org_pack_nor = pd.DataFrame(df_openFDA_NDC_openFDA_org_pack_nor['is_original_packager'])

df_openFDA_NDC_openFDA_org_pack_nor = df_openFDA_NDC_openFDA_org_pack_nor.explode('is_original_packager')

df_openFDA_NDC_openFDA_org_pack_nor = df_openFDA_NDC_openFDA_org_pack_nor.dropna()

Merging with the orig. data

In [167]:
df_openFDA_NDC = pd.merge(df_openFDA_NDC, df_openFDA_NDC_openFDA_org_pack_nor,  left_index = True, right_index = True, how = 'left')

In [168]:
len(df_openFDA_NDC)

47968

### Unpacking active ingredients from results

In [169]:
df_openFDA_NDC_active_ingredients = pd.DataFrame(df_openFDA_NDC['active_ingredients'])

df_openFDA_NDC_active_ingredients['org_index'] = df_openFDA_NDC_active_ingredients.index

df_openFDA_NDC_active_ingredients = df_openFDA_NDC_active_ingredients.dropna(subset = ['active_ingredients'])

df_openFDA_NDC_active_ingredients = df_openFDA_NDC_active_ingredients.explode('active_ingredients')

count = 0

for i, j in zip(df_openFDA_NDC_active_ingredients['active_ingredients'], df_openFDA_NDC_active_ingredients['org_index']): # normalizing product and adding id for merge
    
    count = count + 1
    
    df_temp = pd.DataFrame(pd.json_normalize(i))
    df_temp['org_index'] = j
    
    if count == 1:
        df_openFDA_NDC_active_ingredients_nor = df_temp
    else:
        df_openFDA_NDC_active_ingredients_nor = pd.concat([df_openFDA_NDC_active_ingredients_nor, df_temp])
        
df_openFDA_NDC_active_ingredients_nor = df_openFDA_NDC_active_ingredients_nor.set_index('org_index')

Creating a list to take multiple values for the same drug into account

In [170]:
df_openFDA_NDC_active_ingredients_nor = df_openFDA_NDC_active_ingredients_nor.reset_index()

In [171]:
df_openFDA_NDC_active_ingredients_nor_name = pd.DataFrame(df_openFDA_NDC_active_ingredients_nor.groupby('org_index').apply(lambda x: (x['name'].values).tolist()))
df_openFDA_NDC_active_ingredients_nor_strength = pd.DataFrame(df_openFDA_NDC_active_ingredients_nor.groupby('org_index').apply(lambda x: (x['strength'].values).tolist()))

df_openFDA_NDC_active_ingredients_nor_name_stregnth = pd.merge(df_openFDA_NDC_active_ingredients_nor_name, df_openFDA_NDC_active_ingredients_nor_strength,  left_index = True, right_index = True, how = 'inner')
df_openFDA_NDC_active_ingredients_nor_name_stregnth = df_openFDA_NDC_active_ingredients_nor_name_stregnth.rename(columns = {'0_x': 'name', '0_y': 'strength'})

Merging with the orig. data

In [172]:
df_openFDA_NDC = pd.merge(df_openFDA_NDC, df_openFDA_NDC_active_ingredients_nor_name_stregnth,  left_index = True, right_index = True, how = 'left')

In [173]:
len(df_openFDA_NDC)

47968

### Data Manipulation

Splitting ndc into labeler and product code

In [174]:
df_temp = df_openFDA_NDC['product_ndc'].str.split("-", n = 1, expand = True)
df_openFDA_NDC['labeler_code']= df_temp[0]
df_openFDA_NDC['labeler_code'] = pd.to_numeric(df_openFDA_NDC['labeler_code'])
df_openFDA_NDC['product_code']= df_temp[1]
df_openFDA_NDC['product_code'] = pd.to_numeric(df_openFDA_NDC['product_code'])

Clearing route for characters that are not a letters or numbers

In [175]:
df_openFDA_NDC['route'] = df_openFDA_NDC['route'].astype(str).apply(lambda x: re.sub('[^A-Za-z0-9]+', '', x)) 

Creating an unique id

In [176]:
df_openFDA_NDC['generic_name'] = df_openFDA_NDC['generic_name'].str.strip()
df_openFDA_NDC['dosage_form'] = df_openFDA_NDC['dosage_form'].str.strip()
df_openFDA_NDC['route'] = df_openFDA_NDC['route'].str.strip()

df_openFDA_NDC['unique_id'] = df_openFDA_NDC['generic_name'].astype(str) + '-' + df_openFDA_NDC['dosage_form'].astype(str) + '-' + df_openFDA_NDC['route'].astype(str) + '-' +df_openFDA_NDC['strength'].astype(str)
df_openFDA_NDC['unique_id'] = df_openFDA_NDC['unique_id'].str.lower()

Removing duplicates based on unique id, labeler and product code

In [177]:
df_openFDA_NDC = df_openFDA_NDC.drop_duplicates(subset = ['labeler_code', 'product_code'], keep = 'first')

In [178]:
len(df_openFDA_NDC)

47387

# Downloading output

In [None]:
df_openFDA_NDC.to_csv('raw_openFDA_NDC_data.csv', sep = '~', index = False)