In [2]:
# Enable IPython's autoreload extension so edits to imported local modules
# are picked up without restarting the kernel (mode 2: reload all modules
# before each cell execution).
%load_ext autoreload
%autoreload 2

**USDA API Food Data Processing** <br>
Getting nutrient data for available branded foods

In [3]:
import requests
import dotenv
import pandas as pd

# Endpoint used by the request cell below to list foods.
url = "https://api.nal.usda.gov/fdc/v1/foods/list"
# Read the API key from the local credentials.env file (entry USDA_FOOD_KEY)
# so it is not hard-coded in the notebook.
# NOTE(review): presumably this is None when the file or entry is missing — confirm and guard if so.
_api_key = dotenv.get_key("credentials.env", "USDA_FOOD_KEY")

In [4]:
# Fetch Foundation foods from the list endpoint. Only the "Foundation" data
# type is requested and results are ordered by data type (not by name).
# NOTE(review): the recorded run returned 50 records, presumably the API's
# default page size — confirm whether further pages need to be requested.
response = requests.get(
    url,
    params={
        "api_key": _api_key,
        "dataType": ["Foundation"],
        "sortBy": "dataType.keyword",
    },
    # Without a timeout, requests can wait indefinitely on a stalled connection.
    timeout=30,
)

# Fail loudly: printing the error and carrying on would leave foods_df
# undefined (or stale from an earlier run), so later cells would either
# crash with a NameError or silently work on old data.
if response.status_code != 200:
    raise RuntimeError(f"Error: {response.status_code} - {response.text}")

foods_data = response.json()
foods_df = pd.DataFrame(foods_data)
print(f"Total records: {len(foods_df)}")



Total records: 50


In [5]:
# Preview only the first rows instead of dumping the whole frame into the output.
foods_df.head(10)

Unnamed: 0,fdcId,description,dataType,publicationDate,ndbNumber,foodNutrients
0,2262074,"Almond butter, creamy",Foundation,2022-04-28,12195,"[{'number': '717', 'name': 'Daidzin', 'amount'..."
1,2257045,"Almond milk, unsweetened, plain, refrigerated",Foundation,2022-04-28,100276,"[{'number': '404', 'name': 'Thiamin', 'amount'..."
2,1999631,"Almond milk, unsweetened, plain, shelf stable",Foundation,2021-10-28,14091,"[{'number': '631', 'name': 'PUFA 22:5 n-3 (DPA..."
3,2003590,"Apple juice, with added vitamin C, from concen...",Foundation,2021-10-28,9400,"[{'number': '303', 'name': 'Iron, Fe', 'amount..."
4,1750340,"Apples, fuji, with skin, raw",Foundation,2020-10-30,9504,"[{'number': '303', 'name': 'Iron, Fe', 'amount..."
5,1750341,"Apples, gala, with skin, raw",Foundation,2020-10-30,9503,"[{'number': '303', 'name': 'Iron, Fe', 'amount..."
6,1750342,"Apples, granny smith, with skin, raw",Foundation,2020-10-30,9502,"[{'number': '303', 'name': 'Iron, Fe', 'amount..."
7,1750343,"Apples, honeycrisp, with skin, raw",Foundation,2020-10-30,9501,"[{'number': '303', 'name': 'Iron, Fe', 'amount..."
8,1750339,"Apples, red delicious, with skin, raw",Foundation,2020-10-30,9500,"[{'number': '303', 'name': 'Iron, Fe', 'amount..."
9,2346414,"Applesauce, unsweetened, with added vitamin C",Foundation,2022-10-28,9401,"[{'number': '303', 'name': 'Iron, Fe', 'amount..."


In [6]:
# Report the frame's dimensions and its column labels.
rows = len(foods_df.index)
cols = len(foods_df.columns)
print(f"Number of rows: {rows}")
print(f"Number of columns: {cols}")
print(foods_df.columns)

Number of rows: 50
Number of columns: 6
Index(['fdcId', 'description', 'dataType', 'publicationDate', 'ndbNumber',
       'foodNutrients'],
      dtype='object')


# CSV Import
The data is split across dozens of CSV files, which need to be joined.

In [7]:
import importlib, food_db_client
importlib.reload(food_db_client)
import re
from food_db_client import FoodDBClient

def filter_rows_by_substrings(df, substrings, column='description'):
    """Return the rows of ``df`` whose ``column`` text contains any substring.

    Matching is case-insensitive and literal: every substring is
    regex-escaped, so characters such as ``+`` or ``(`` match verbatim.
    Rows whose value is missing (NaN/None) never match.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter; must contain ``column`` holding string values.
    substrings : str or iterable of str
        A single substring, or a collection of substrings. A row is kept
        when at least one of them occurs in its ``column`` value.
    column : str, default 'description'
        Name of the text column to search.

    Returns
    -------
    pandas.DataFrame
        The matching rows, with the original index labels and columns.
        Empty (same columns) when ``substrings`` is empty.
    """
    if isinstance(substrings, str):
        substrings = [substrings]
    else:
        # Materialise so tuples, sets and generators are accepted too.
        substrings = list(substrings)

    # Joining zero alternatives yields the pattern '', which matches every
    # row; "any of no substrings" must select nothing instead.
    if not substrings:
        return df.iloc[0:0].copy()

    pattern = '|'.join(re.escape(s) for s in substrings)
    mask = df[column].str.contains(pattern, case=False, na=False, regex=True)
    return df[mask]

# Example usage: keep foods whose description mentions any of these keywords.
macro_keywords = ["protein", "carb", "sugar", "fat"]
filtered_df = filter_rows_by_substrings(foods_df, macro_keywords)
print(filtered_df.shape)
filtered_df

(6, 6)


Unnamed: 0,fdcId,description,dataType,publicationDate,ndbNumber,foodNutrients
37,2644289,"Beans, kidney, dark red, canned, sodium added,...",Foundation,2023-10-26,100318,"[{'number': '303', 'name': 'Iron, Fe', 'amount..."
38,2644290,"Beans, kidney, light red, canned, sodium added...",Foundation,2023-10-26,100319,"[{'number': '303', 'name': 'Iron, Fe', 'amount..."
45,2514744,"Beef, ground, 80% lean meat / 20% fat, raw",Foundation,2023-04-20,23572,"[{'number': '303', 'name': 'Iron, Fe', 'amount..."
46,2514743,"Beef, ground, 90% lean meat / 10% fat, raw",Foundation,2023-04-20,23562,"[{'number': '303', 'name': 'Iron, Fe', 'amount..."
47,746758,"Beef, loin, tenderloin roast, separable lean o...",Foundation,2019-12-16,23377,"[{'number': '204', 'name': 'Total lipid (fat)'..."
48,746759,"Beef, loin, top loin steak, boneless, lip-on, ...",Foundation,2019-12-16,23385,"[{'number': '406', 'name': 'Niacin', 'amount':..."


In [8]:
# Take the fdcId of the first filtered row (by position, not by index label).
first_fdc_id = filtered_df.iloc[0]['fdcId']

# Look that food up through the project's FoodDBClient; the result is shown as the cell output.
FoodDBClient.get_food_by_id(first_fdc_id)

Unnamed: 0,fdcId,description,publicationDate,foodNutrients,dataType,foodClass,inputFoods,foodComponents,foodAttributes,nutrientConversionFactors,ndbNumber,isHistoricalReference,foodCategory.id,foodCategory.code,foodCategory.description
0,2644289,"Beans, kidney, dark red, canned, sodium added,...",10/26/2023,"[{'nutrient': {'id': 2045, 'number': '951', 'n...",Foundation,FinalFood,"[{'id': 123937, 'foodDescription': 'beans, kid...",[],[],"[{'id': 23042, 'value': 6.25, 'type': '.Protei...",100318,False,16,1600,Legumes and Legume Products


In [14]:
# Load the nutrient table via FoodDBClient and display it; the merge cell
# further down joins against its `id` column.
all_nutrients = FoodDBClient.get_all_nutrients()
all_nutrients

Unnamed: 0,id,name,unit_name,nutrient_nbr,rank
0,2047,Energy (Atwater General Factors),KCAL,957.0,280.0
1,2048,Energy (Atwater Specific Factors),KCAL,958.0,290.0
2,1001,Solids,G,201.0,200.0
3,1002,Nitrogen,G,202.0,500.0
4,1003,Protein,G,203.0,600.0
...,...,...,...,...,...
472,2064,Oligosaccharides,MG,,2250.0
473,2065,Low Molecular Weight Dietary Fiber (LMWDF),G,293.4,1306.0
474,2068,Vitamin E,MG,959.0,7810.0
475,2067,Vitamin A,UG,960.0,7430.0


In [16]:
# Fetch the nutrient rows for the selected food and report the untrimmed shape.
food_nutrients_raw = FoodDBClient.get_food_nutrients(first_fdc_id)
print(food_nutrients_raw.shape)

# Remove the columns that are not needed for the nutrient join below.
food_nutrients = food_nutrients_raw.drop(
    ["id", "derivation_id", "footnote", "min_year_acquired"],
    axis="columns",
)
food_nutrients

(20, 11)


  


Unnamed: 0,fdc_id,nutrient_id,amount,data_points,min,max,median
136351,2644289,1071,2.0,8.0,2.0,2.0,2.0
136352,2644289,1093,172.3,8.0,94.7,268.0,153.5
136353,2644289,1009,12.16,8.0,11.43,13.23,12.07
136354,2644289,1090,29.31,8.0,26.7,32.2,29.2
136355,2644289,1087,56.94,8.0,39.4,98.1,47.3
136356,2644289,2033,7.013,8.0,6.4,8.2,6.9
136357,2644289,1091,104.1,8.0,82.0,131.0,98.75
136358,2644289,1051,68.82,8.0,66.77,70.14,68.7
136359,2644289,1004,1.256,8.0,1.06,1.56,1.26
136360,2644289,1095,0.56,8.0,0.454,0.747,0.5225


In [17]:
# Enrich each nutrient row of the food with the nutrient's descriptive fields
# by joining food_nutrients.nutrient_id to all_nutrients.id.
# A left join keeps every row of food_nutrients even if a nutrient id has no
# match. validate='many_to_one' makes pandas raise a MergeError if
# all_nutrients contains duplicate ids, instead of silently duplicating rows.
food_nutrients_details = food_nutrients.merge(
    all_nutrients,
    left_on='nutrient_id',
    right_on='id',
    how='left',
    validate='many_to_one',
)

food_nutrients_details

Unnamed: 0,fdc_id,nutrient_id,amount,data_points,min,max,median,id,name,unit_name,nutrient_nbr,rank
0,2644289,1071,2.0,8.0,2.0,2.0,2.0,1071,Resistant starch,G,283.0,2225.0
1,2644289,1093,172.3,8.0,94.7,268.0,153.5,1093,"Sodium, Na",MG,307.0,5800.0
2,2644289,1009,12.16,8.0,11.43,13.23,12.07,1009,Starch,G,209.0,2200.0
3,2644289,1090,29.31,8.0,26.7,32.2,29.2,1090,"Magnesium, Mg",MG,304.0,5500.0
4,2644289,1087,56.94,8.0,39.4,98.1,47.3,1087,"Calcium, Ca",MG,301.0,5300.0
5,2644289,2033,7.013,8.0,6.4,8.2,6.9,2033,Total dietary fiber (AOAC 2011.25),G,293.0,1300.0
6,2644289,1091,104.1,8.0,82.0,131.0,98.75,1091,"Phosphorus, P",MG,305.0,5600.0
7,2644289,1051,68.82,8.0,66.77,70.14,68.7,1051,Water,G,255.0,100.0
8,2644289,1004,1.256,8.0,1.06,1.56,1.26,1004,Total lipid (fat),G,204.0,800.0
9,2644289,1095,0.56,8.0,0.454,0.747,0.5225,1095,"Zinc, Zn",MG,309.0,5900.0
