### Data Source 1: Dietary Supplement Label Database (DSLD, NIH)
- 125,446 supplements
- At least 35 useful attributes
- *US based database*

In [7]:
import json
import os
from tqdm import tqdm

In [24]:
# Load every raw DSLD record into a list; each file is parsed as one JSON document.
# os.listdir() returns names in arbitrary order, so the names are sorted to keep
# positional lookups such as data[1] reproducible across runs and machines.
data = []
dir_path = 'data/DSLD-full-database-JSON'
for file_name in sorted(os.listdir(dir_path)):
    file_path = os.path.join(dir_path, file_name)
    if not os.path.isfile(file_path):
        # Sub-directories cannot be opened as JSON documents; skip them.
        continue
    with open(file_path, encoding='utf-8') as f:
        data.append(json.load(f))

In [3]:
# Display one raw record to inspect the nested structure of the loaded JSON.
data[1]

{'_index': 'dsldnxt',
 '_type': '_doc',
 '_id': '10000',
 '_version': 1,
 '_seq_no': 1,
 '_primary_term': 3,
 'found': True,
 '_source': {'langualProductType': 'Non-Nutrient/Non-Botanical [A1309]',
  'entryDate': '2012-06-25',
  'labelVersion': '2.0',
  'dsldId': '10000',
  'statementGroups': [{'groupName': 'Attributes, including intended target group(s)',
    'statements': ['Adult (18 - 50 Years)', 'Dairy Free']},
   {'groupName': 'FDA Disclaimer Statement', 'statements': ['NP']},
   {'groupName': 'FDA Statement of Identity',
    'statements': ['A Dietary Supplement']},
   {'groupName': 'Formulation',
    'statements': ['This product contains NO yeast, wheat gluten, soy protein, milk/dairy, corn, sodium, starch, artificial coloring, preservatives or flavoring.']},
   {'groupName': 'Other', 'statements': ['Formula #81322']},
   {'groupName': 'Precautions',
    'statements': ['If you are pregnant, nursing, taking any prescription medication (especially hormone replacement therapy), or h

In [25]:
# Number of records loaded into `data`.
len(data)

125446

In [4]:
# Print the name of every top-level field stored under '_source' for one record.
source_fields = data[1]['_source']
for field_name in source_fields:
    print(field_name)

langualProductType
entryDate
labelVersion
dsldId
statementGroups
ipSymbol
productName
langualClaimsOrUses
dietarySupplementsFacts
langualCodes
langualTargetGroup
statementOfIdentity
netContentQuantities
offMarket
langualSupplementForm
sku
nhanesId
brand
outerPackaging
servingSize
events
contacts
suggestedUse


In [14]:
# Top-level '_source' fields copied into the flat record ('servingSize' is reduced
# to the 'size' of its first entry, every other field is copied unchanged).
_PRIMARY_KEYS = frozenset({
    'langualProductType', 'entryDate', 'langualTargetGroup',
    'statementOfIdentity', 'netContentQuantities', 'langualSupplementForm',
    'servingSize', 'events', 'suggestedUse',
})
# Keys of the supplement-facts panel that are not copied directly.
_SKIPPED_FACT_KEYS = frozenset({'targetGroupId', 'ingredients'})
# Keys copied from the 'data' dict of the first ingredient.
_INGREDIENT_DATA_KEYS = frozenset({
    'ingredientEntryName', 'dvTargetGroupDvGroupName',
    'sfbQuantityColumnColumnLabel', 'sfbQuantityQuantity', 'unitName',
    'labelFullName', 'sfbDvFootnoteFootnoteText', 'ingredientEntryPlantPart',
    'ingredientEntryCategory',
})
# Keys copied from the first ingredient itself.
_INGREDIENT_KEYS = frozenset({'hierarchyIngredient', 'altName'})
# Errors that mean "this optional piece of the record is absent or has an
# unexpected shape". Anything else (e.g. KeyboardInterrupt) must propagate.
_MISSING = (KeyError, IndexError, TypeError)


def format_json(data):
    """Flatten one raw record into a single-level dict.

    Extraction is best-effort: every section except the two required keys is
    optional, and a missing or malformed section is skipped without affecting
    the other sections.

    Args:
        data: A dict with an '_id' entry and a '_source' dict holding the
            label fields.

    Returns:
        A new dict containing, in this order:
          * 'id' and the whitelisted top-level fields ('servingSize' becomes
            the 'size' of its first entry);
          * 'productType', a copy of 'langualProductType' when present;
          * one entry per statement group, keyed by the group's name (a group
            name equal to an existing key overwrites that key);
          * 'productName' and the fields of the FIRST supplement-facts panel
            ('otheringredients' is reduced to its 'text');
          * selected fields of the FIRST ingredient of that panel. Any further
            panels or ingredients are ignored.

    Raises:
        KeyError: If '_id' or '_source' is missing from ``data``.
    """
    formatted_data = {'id': data['_id']}
    source = data['_source']

    # Primary Values
    for key in source:
        if key not in _PRIMARY_KEYS:
            continue
        if key == 'servingSize':
            # An empty or malformed servingSize list must not abort the record.
            try:
                formatted_data[key] = source[key][0]['size']
            except _MISSING:
                pass
        else:
            formatted_data[key] = source[key]
    if 'langualProductType' in source:
        formatted_data['productType'] = source['langualProductType']

    # Statement Groups (optional; entries lacking a name or statements are skipped)
    for group in source.get('statementGroups') or []:
        try:
            formatted_data[group['groupName']] = group['statements']
        except _MISSING:
            continue

    # Supplement Facts: the product name and the facts panel are independent,
    # so a missing name must not discard the panel (and vice versa).
    if 'productName' in source:
        formatted_data['productName'] = source['productName']

    try:
        facts = source['dietarySupplementsFacts'][0]
    except _MISSING:
        facts = None
    if not isinstance(facts, dict):
        return formatted_data

    for key, value in facts.items():
        if key in _SKIPPED_FACT_KEYS:
            continue
        if key == 'otheringredients':
            # Skip only this key when it has no 'text'; keep the other fields.
            try:
                formatted_data[key] = value['text']
            except _MISSING:
                pass
        else:
            formatted_data[key] = value

    # Ingredients (first ingredient only)
    try:
        first_ingredient = facts['ingredients'][0]
    except _MISSING:
        first_ingredient = None
    if not isinstance(first_ingredient, dict):
        return formatted_data

    ingredient_data = first_ingredient.get('data')
    if isinstance(ingredient_data, dict):
        for key, value in ingredient_data.items():
            if key in _INGREDIENT_DATA_KEYS:
                formatted_data[key] = value

    for key, value in first_ingredient.items():
        if key in _INGREDIENT_KEYS:
            formatted_data[key] = value

    return formatted_data

In [38]:
# Flatten a single record and show which keys the flattening produces.
sample = format_json(data[1])
print([*sample])

['id', 'langualProductType', 'entryDate', 'langualTargetGroup', 'statementOfIdentity', 'netContentQuantities', 'langualSupplementForm', 'servingSize', 'events', 'suggestedUse', 'productType', 'Attributes, including intended target group(s)', 'FDA Disclaimer Statement', 'FDA Statement of Identity', 'Formulation', 'Other', 'Precautions', 'Product Specific Information', 'Suggested/Recommended/Usage/Directions', 'productName', 'driFootNotes', 'otherTargetGroups', 'servingsPerContainer', 'labelFootNotes', 'dailyValueTargetGroups', 'servingSizeUnitName', 'servingSizeQuantity', 'otheringredients', 'targetGroupName', 'usageSuggestion', 'ingredientEntryName', 'dvTargetGroupDvGroupName', 'sfbQuantityColumnColumnLabel', 'sfbQuantityQuantity', 'unitName', 'labelFullName', 'sfbDvFootnoteFootnoteText', 'ingredientEntryPlantPart', 'ingredientEntryCategory', 'hierarchyIngredient', 'altName']


In [26]:
# Flatten every raw record, with a progress bar over the whole collection.
processed = [format_json(record) for record in tqdm(data)]

100%|██████████| 125446/125446 [00:09<00:00, 12901.80it/s]


In [34]:
# Print the label name and product type of the first few flattened records.
n = 5
sample = processed[:n]
for i in range(n):
    entry = sample[i]
    print(f"Product: {entry['labelFullName']}, Type: {entry['langualProductType']}")

Product: Coral Calcium, Type: Dietary Supplements, Combination, Other [A1325]
Product: Melatonin 3 mg Sublingual Tablets, Type: Non-Nutrient/Non-Botanical [A1309]
Product: Melatonin 3 mg, Type: Non-Nutrient/Non-Botanical [A1309]
Product: Melatonin 3 mg Sublingual Tablets, Type: Non-Nutrient/Non-Botanical [A1309]
Product: Melatonin 3 mg Sublingual Tablets, Type: Non-Nutrient/Non-Botanical [A1309]


In [9]:
# Keep the first 100 flattened records as a small working sample,
# then display the first of them.
sample = processed[:100]
sample[0]

{'id': '1000',
 'langualProductType': 'Dietary Supplements, Combination, Other [A1325]',
 'entryDate': '2011-10-26',
 'langualTargetGroup': 'Four years and above [P0250]',
 'statementOfIdentity': 'DIETARY SUPPLEMENT',
 'netContentQuantities': '60.0 Capsule(s)\r\n',
 'langualSupplementForm': 'Capsule [E0159]',
 'servingSize': '2.0 Capsule(s)',
 'events': [{'date': 'October 26, 2011', 'name': 'Date - Entered into DSLD'}],
 'suggestedUse': 'Directions: As a dietary supplement, take two capsules daily.\r\n',
 'productType': 'Dietary Supplements, Combination, Other [A1325]',
 'Attributes, including intended target group(s)': ['Adult (18 - 50 Years)'],
 'FDA Disclaimer Statement': ['NP'],
 'FDA Statement of Identity': ['DIETARY SUPPLEMENT'],
 'Formulation': ['optimal 2:1 ratio of calcium and magnesium'],
 'Other': ['ACTUAL SIZE',
  'GNC Coral Calcium naturally supplies calcium, magnesium and numerous other essential trace minerals. Gathered from fossilized coral from above the Okinawan sea, 

In [30]:
import pickle

# Persist the flattened records as a binary pickle file.
with open('supplements_data.pkl', mode='wb') as pickle_file:
    pickle.dump(processed, pickle_file)

In [40]:
# Write the same flattened records out as a JSON file.
with open('supplements_data.json', mode='w') as json_file:
    json.dump(processed, json_file)

In [39]:
# Display the first flattened record.
processed[0]

{'id': '1000',
 'langualProductType': 'Dietary Supplements, Combination, Other [A1325]',
 'entryDate': '2011-10-26',
 'langualTargetGroup': 'Four years and above [P0250]',
 'statementOfIdentity': 'DIETARY SUPPLEMENT',
 'netContentQuantities': '60.0 Capsule(s)\r\n',
 'langualSupplementForm': 'Capsule [E0159]',
 'servingSize': '2.0 Capsule(s)',
 'events': [{'date': 'October 26, 2011', 'name': 'Date - Entered into DSLD'}],
 'suggestedUse': 'Directions: As a dietary supplement, take two capsules daily.\r\n',
 'productType': 'Dietary Supplements, Combination, Other [A1325]',
 'Attributes, including intended target group(s)': ['Adult (18 - 50 Years)'],
 'FDA Disclaimer Statement': ['NP'],
 'FDA Statement of Identity': ['DIETARY SUPPLEMENT'],
 'Formulation': ['optimal 2:1 ratio of calcium and magnesium'],
 'Other': ['ACTUAL SIZE',
  'GNC Coral Calcium naturally supplies calcium, magnesium and numerous other essential trace minerals. Gathered from fossilized coral from above the Okinawan sea, 

## Data
- Merging similar instances of different supplements
- TRC, MedlinePlus (National Library of Medicine)
- Complementary and Alternative Medicines (for natural, regional supplements)

In [8]:
import json
# Reload the flattened records from the exported JSON file so that `data` holds
# them on a fresh kernel run, rather than whatever an earlier cell left in `data`.
with open('supplements_data.json', encoding='utf-8') as f:
    data = json.load(f)

len(data)

In [3]:
# Preview the first five entries of `data`.
data[:5]

[{'id': '1000',
  'langualProductType': 'Dietary Supplements, Combination, Other [A1325]',
  'entryDate': '2011-10-26',
  'langualTargetGroup': 'Four years and above [P0250]',
  'statementOfIdentity': 'DIETARY SUPPLEMENT',
  'netContentQuantities': '60.0 Capsule(s)\r\n',
  'langualSupplementForm': 'Capsule [E0159]',
  'servingSize': '2.0 Capsule(s)',
  'events': [{'date': 'October 26, 2011', 'name': 'Date - Entered into DSLD'}],
  'suggestedUse': 'Directions: As a dietary supplement, take two capsules daily.\r\n',
  'productType': 'Dietary Supplements, Combination, Other [A1325]',
  'Attributes, including intended target group(s)': ['Adult (18 - 50 Years)'],
  'FDA Disclaimer Statement': ['NP'],
  'FDA Statement of Identity': ['DIETARY SUPPLEMENT'],
  'Formulation': ['optimal 2:1 ratio of calcium and magnesium'],
  'Other': ['ACTUAL SIZE',
   'GNC Coral Calcium naturally supplies calcium, magnesium and numerous other essential trace minerals. Gathered from fossilized coral from above t