## First, we extract the food name from the 'description' column in the raw data

In [10]:
import json
import tqdm

In [11]:
from random import sample


input_file = 'resources/FoodData_Central_foundation_food_json_2025-04-24.json'
with open(input_file, 'r') as f:
    data = json.load(f)
print(f"Loaded {len(data)} records from {input_file}")
# print a sample food
sampled = data['FoundationFoods'][0]
print(json.dumps(sampled, indent=4))

Loaded 1 records from resources/FoodData_Central_foundation_food_json_2025-04-24.json
{
    "foodClass": "FinalFood",
    "description": "Hummus, commercial",
    "foodNutrients": [
        {
            "type": "FoodNutrient",
            "id": 2219707,
            "nutrient": {
                "id": 1120,
                "number": "334",
                "name": "Cryptoxanthin, beta",
                "rank": 7460,
                "unitName": "\u00b5g"
            },
            "dataPoints": 1,
            "foodNutrientDerivation": {
                "code": "A",
                "description": "Analytical",
                "foodNutrientSource": {
                    "id": 1,
                    "code": "1",
                    "description": "Analytical or derived from analytical"
                }
            },
            "median": 3.0,
            "amount": 3.0
        },
        {
            "type": "FoodNutrient",
            "id": 2219708,
            "nutrient": {
            

In [12]:
# show all the keys
print("Keys:")
print(sampled.keys())

Keys:
dict_keys(['foodClass', 'description', 'foodNutrients', 'foodAttributes', 'nutrientConversionFactors', 'isHistoricalReference', 'ndbNumber', 'dataType', 'foodCategory', 'fdcId', 'foodPortions', 'publicationDate', 'inputFoods'])


## Now, we create a set of foundation food that is normalized via lemmatization.

In [13]:
import nltk
from nltk.stem import WordNetLemmatizer

nltk.download('wordnet')
nltk.download('omw-1.4')

lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/shitianhao/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/shitianhao/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [14]:

list_foundation_foods = set()
for item in tqdm.tqdm(data['FoundationFoods']):
    description = item['description']
    food_name = description.split(',')[0].lower()
    if food_name.count('(')>0:continue
    food_name_normalized = lemmatizer.lemmatize(food_name, pos='n')
    list_foundation_foods.add(food_name_normalized)
for food_name in sorted(list_foundation_foods):
    print(food_name)

100%|██████████| 340/340 [00:00<00:00, 7265.90it/s]

almond butter
almond milk
apple
apple juice
applesauce
apricot
arugula
asparagus
avocado
banana
bean
beef
beet
bison
blackberry
blackeye pea
blueberry
bread
broccoli
brussels sprouts
buckwheat
bulgur
butter
buttermilk
cabbage
carrot
cauliflower
celery
cheese
cherry
chia seeds
chicken
chickpea
collard
cooky
corn
corn flour
cottage cheese
cranberry juice
cream
cream cheese
crustacean
cucumber
egg
eggplant
einkorn
farro
fig
fish
flaxseed
flour
fonio
frankfurter
garlic
grape
grape juice
grapefruit juice
green onion
ham
hummus
juice
kale
ketchup
khorasan
kiwifruit
lamb
leek
lentil
lettuce
mandarin
mango
melon
milk
millet
mushroom
mustard
nectarine
nut
oat
oat milk
oil
olive
onion
onion rings
orange
orange juice
pawpaw
pea
peach
peanut
peanut butter
pear
pepper
pickle
pineapple
plantain
plum
pork
potato
raspberry
restaurant
rice
rutabaga
salt
sauce
sausage
seed
sesame butter
shallot
sorghum
sorghum bran
sorghum flour
sorghum grain
soy milk
spinach
squash
strawberry
sugar
sweet potatoes
tomat




## Finally, we save the result to a CSV file.

In [15]:
# save list to file
import csv
output_file = 'resources/clean/foundation_ingredient.csv'
with open(output_file, 'w') as f:
    writer = csv.writer(f)
    for item in sorted(list_foundation_foods):
        writer.writerow([item])


we do some manual changing:
rename: cooky -> cookie
delete: sorghum bran
delete: crustacean
delete: khorasan