In [1]:
import os
import sys
import httpx
from pathlib import Path
from dotenv import load_dotenv
# Read a local .env file (if any) into the environment before looking up the key.
load_dotenv()

# The USDA API key is taken from the environment rather than written in the notebook.
api_key = os.environ.get("USDA_API_KEY")
if api_key is None or api_key == "":
    raise ValueError("USDA_API_KEY not set")

# Ingredient used as the search query in the cells below.
ingredient_name = "chicken"

In [2]:
notebook_dir = Path.cwd()

# Locate the project root by its contents: the current directory or the nearest
# ancestor that contains src/macronome. This works at any nesting depth and does
# not depend on how the directories along the path happen to be named.
project_root = next(
    (
        candidate
        for candidate in (notebook_dir, *notebook_dir.parents)
        if (candidate / "src" / "macronome").is_dir()
    ),
    None,
)

if project_root is None:
    # Fallback heuristic: a notebook living under a directory named exactly
    # "notebooks" is assumed to sit two levels below the root; otherwise the
    # current directory is taken to be the root.
    if "notebooks" in notebook_dir.parts:
        project_root = notebook_dir.parent.parent
    else:
        project_root = notebook_dir

# Make both src/ (so `import macronome` resolves) and the project root importable.
# Entries already present are skipped so re-running this cell does not keep
# growing sys.path; inserting src/ first leaves project_root ahead of it.
for import_root in (project_root / "src", project_root):
    entry = str(import_root)
    if entry not in sys.path:
        sys.path.insert(0, entry)

In [3]:
from macronome.ai.utils.nutrition_calculator import NutritionCalculator

In [4]:
# Endpoint and query parameters for the USDA food search request.
search_url = "https://api.nal.usda.gov/fdc/v1/foods/search"
params = dict(
    api_key=api_key,
    query=ingredient_name,
    pageSize=10,
    dataType=["SR Legacy"],  # Standard Reference
)

In [5]:
# Async HTTP client with a 10-second timeout, shared by the search cells below.
# NOTE(review): no `await client.aclose()` appears in the visible cells — confirm it is closed elsewhere.
client = httpx.AsyncClient(timeout=10.0)

In [6]:
response = await client.get(search_url, params=params)
response.raise_for_status()
data = response.json()

if not data.get("foods"):
    print(f"No USDA data found for: {ingredient_name}")
    raise ValueError(f"No USDA data found for: {ingredient_name}")

In [9]:
# Display the first search hit in full to see which fields a food record carries.
data["foods"][0]

{'fdcId': 173858,
 'description': 'Chicken spread',
 'commonNames': '',
 'additionalDescriptions': '',
 'dataType': 'SR Legacy',
 'ndbNumber': 7018,
 'publishedDate': '2019-04-01',
 'foodCategory': 'Sausages and Luncheon Meats',
 'allHighlightFields': '',
 'score': 240.21654,
 'microbes': [],
 'foodNutrients': [{'nutrientId': 1004,
   'nutrientName': 'Total lipid (fat)',
   'nutrientNumber': '204',
   'unitName': 'G',
   'derivationCode': 'MC',
   'derivationDescription': 'Manufacturer supplied; Calculated by manufacturer or unknown if analytical or calculated',
   'derivationId': 47,
   'value': 17.6,
   'foodNutrientSourceId': 7,
   'foodNutrientSourceCode': '9',
   'foodNutrientSourceDescription': 'Calculated by manufacturer, not adjusted or rounded for NLEA',
   'rank': 800,
   'indentLevel': 1,
   'foodNutrientId': 1810639,
   'dataPoints': 0},
  {'nutrientId': 1051,
   'nutrientName': 'Water',
   'nutrientNumber': '255',
   'unitName': 'G',
   'derivationCode': 'MC',
   'derivati

In [7]:
# Summarise the response: total hit count, then the headline fields of the top five hits.
print(f"Total hits: {data.get('totalHits')}")
print("\nFirst 5 results:\n")

# (printed label, key in the food record) pairs, in display order.
summary_fields = (
    ("Description", "description"),
    ("Category", "foodCategory"),
    ("Score", "score"),
    ("NDB Number", "ndbNumber"),
    ("Data Type", "dataType"),
    ("Common Names", "commonNames"),
)
for rank, hit in enumerate(data["foods"][:5], start=1):
    print(f"\n--- Result {rank} ---")
    for label, key in summary_fields:
        print(f"{label}: {hit.get(key)}")

Total hits: 392

First 5 results:


--- Result 1 ---
Description: Chicken spread
Category: Sausages and Luncheon Meats
Score: 240.21654
NDB Number: 7018
Data Type: SR Legacy
Common Names: 

--- Result 2 ---
Description: Chicken, meatless
Category: Legumes and Legume Products
Score: 240.21654
NDB Number: 43128
Data Type: SR Legacy
Common Names: 

--- Result 3 ---
Description: Fat, chicken
Category: Fats and Oils
Score: 240.21654
NDB Number: 4542
Data Type: SR Legacy
Common Names: 

--- Result 4 ---
Description: Frankfurter, chicken
Category: Sausages and Luncheon Meats
Score: 240.21654
NDB Number: 7024
Data Type: SR Legacy
Common Names: hot dog, wiener, frank

--- Result 5 ---
Description: Fast foods, chicken tenders
Category: Fast Foods
Score: 226.7081
NDB Number: 21401
Data Type: SR Legacy
Common Names: chicken strips


In [8]:
# Step 2: Compare "chicken spread" vs better matches
# What sets "Chicken spread" apart from an entry such as "Chicken, raw"?

# Tally how many hits fall into each food category.
categories = {}
for food in data["foods"]:
    label = food.get("foodCategory", "Unknown")
    categories.setdefault(label, 0)
    categories[label] += 1

print("Categories found:")
# Most frequent category first; equal counts keep first-seen order (stable sort).
for cat in sorted(categories, key=categories.get, reverse=True):
    print(f"  {cat}: {categories[cat]}")

# Show the whitespace-separated word count of the first five descriptions.
print("\nDescription patterns:")
for food in data["foods"][:5]:
    desc = food.get("description", "")
    print(f"  '{desc}' -> {len(desc.split())} words, category: {food.get('foodCategory')}")

Categories found:
  Sausages and Luncheon Meats: 4
  Poultry Products: 3
  Legumes and Legume Products: 1
  Fats and Oils: 1
  Fast Foods: 1

Description patterns:
  'Chicken spread' -> 2 words, category: Sausages and Luncheon Meats
  'Chicken, meatless' -> 2 words, category: Legumes and Legume Products
  'Fat, chicken' -> 2 words, category: Fats and Oils
  'Frankfurter, chicken' -> 2 words, category: Sausages and Luncheon Meats
  'Fast foods, chicken tenders' -> 4 words, category: Fast Foods


In [45]:
def score_food_match(ingredient_name: str, food: dict) -> float:
    """
    Score how well one USDA search hit matches an ingredient name.

    The score is a sum of heuristic bonuses and penalties built only from string
    and structural features (no hardcoded keywords or category names):

    * exact description match, or where the ingredient appears in the
      description (start, after a comma, after another word);
    * shared whole words between ingredient and description / category;
    * fraction of the ingredient's characters that occur in the description;
    * penalties for long, comma-heavy descriptions and for descriptions much
      longer than the ingredient;
    * the USDA relevance score, scaled down by 10.

    The ingredient is only considered "found" when it starts at a word
    boundary, so "apple" is not matched inside "pineapple", while "tomato"
    still matches "tomatoes".

    Args:
        ingredient_name: The ingredient to match (e.g., "chicken", "rice").
        food: USDA food dict; the keys 'description', 'foodCategory' and
            'score' are read. Missing or None values are treated as empty / 0.

    Returns:
        Score (higher = better match). When the ingredient or the description
        is empty there is nothing to compare, and only the scaled USDA score
        is returned.
    """
    import re  # function-scope: the notebook has no top-level `re` import

    # `or` (not a .get default) so keys that are present with a None value are covered.
    ingredient_lower = (ingredient_name or "").lower().strip()
    description = (food.get("description") or "").lower().strip()
    category = (food.get("foodCategory") or "").lower()
    usda_score = food.get("score") or 0

    # An empty string is a substring of everything, so without this guard an
    # empty ingredient or description would collect the match bonuses below.
    if not ingredient_lower or not description:
        return usda_score / 10

    score = 0.0

    # First place where the ingredient starts at a word boundary (-1 if none).
    found = re.search(r"(?<!\w)" + re.escape(ingredient_lower), description)
    position = found.start() if found else -1

    # Punctuation-free tokens, so "chicken," and "chicken" are the same word.
    ingredient_words = set(re.findall(r"\w+", ingredient_lower))
    description_words = set(re.findall(r"\w+", description))

    # 1. Exact match (highest priority)
    if description == ingredient_lower:
        score += 1000
    # 2. Ingredient leads the description: likely the main item
    elif position == 0:
        score += 500
    # 2b. Ingredient appears later: directly after a comma it is likely a
    #     modifier (lower score); after another word it gets the middle ground
    elif position > 0:
        preceding = description[:position].rstrip()
        score += 200 if preceding.endswith(",") else 300
    # 3. Description appears inside the ingredient (less common)
    elif description in ingredient_lower:
        score += 300

    # 4. Word overlap (how many ingredient words appear in the description)
    score += len(ingredient_words & description_words) * 50

    # 5. Fraction of the ingredient's characters that occur anywhere in the description
    chars_in_common = sum(1 for char in ingredient_lower if char in description)
    score += chars_in_common / len(ingredient_lower) * 200

    # 6. Category words shared with the ingredient
    category_words = set(re.findall(r"\w+", category))
    score += len(ingredient_words & category_words) * 100

    # 7. Description complexity: each word beyond three and each comma costs
    #    points; descriptions under three words offset part of the comma cost
    word_count = len(description.split())
    comma_count = description.count(",")
    complexity_penalty = (word_count - 3) * 10 + comma_count * 15
    if complexity_penalty > 0:
        score -= complexity_penalty

    # 8. Position of the ingredient within the description (earlier = better)
    if position >= 0:
        score += (1.0 - position / len(description)) * 100

    # 9. Use USDA relevance score
    score += usda_score / 10

    # 10. Length ratio of description to ingredient
    length_ratio = len(description) / len(ingredient_lower)
    if length_ratio > 5:
        score -= 100
    elif length_ratio < 2:
        score += 50

    # 11. Pattern "X, ingredient": the ingredient modifies X, so penalise in
    #     proportion to how much text precedes it (capped at 200)
    if position > 0:
        before_text = description[:position].strip()
        if "," in before_text:
            score -= min(len(before_text) * 5, 200)

    return score

In [49]:
def select_best_match(ingredient_name: str, foods: list) -> "dict | None":
    """
    Select the best matching food from USDA API results.

    Strategy:
    1. Exact match on the description (fast, deterministic).
    2. Prefix match: the description starts with the ingredient and the next
       character is a word boundary; the candidates are ranked with
       score_food_match.
    3. Otherwise rank every result with score_food_match.

    In steps 1 and 2 a food whose description is missing or None is treated
    as having an empty description.

    Args:
        ingredient_name: The ingredient to match.
        foods: List of food dicts from USDA API.

    Returns:
        Best matching food dict, or None when `foods` is empty. On equal
        scores the food that comes first in `foods` wins.
    """
    if not foods:
        return None

    ingredient_lower = ingredient_name.lower().strip()

    def description_of(food):
        # `or ""` also covers a key that is present with a None value,
        # which a .get() default alone would not.
        return (food.get("description") or "").lower()

    # 1. Exact match (highest priority, fastest)
    for food in foods:
        if description_of(food) == ingredient_lower:
            return food

    # 2. Prefix match - ingredient name is the first word(s) of the description,
    #    e.g. "chicken" matching "Chicken, ground, raw". The character after the
    #    prefix must be a boundary so "chick" does not match "chicken".
    boundary_chars = (",", " ", "(", ":")
    prefix_matches = []
    for food in foods:
        desc_lower = description_of(food)
        if desc_lower.startswith(ingredient_lower):
            remainder = desc_lower[len(ingredient_lower):]
            if not remainder or remainder[0] in boundary_chars:
                prefix_matches.append(food)

    # 3. Rank the prefix matches if there are any, otherwise all results.
    #    max() returns the first of several equally scored foods.
    candidates = prefix_matches or foods
    return max(candidates, key=lambda food: score_food_match(ingredient_name, food))

In [47]:
def select_best_match(ingredient_name: str, foods: list) -> "dict | None":
    """Simplest: best prefix match, ranked by USDA score.

    NOTE: this notebook defines select_best_match twice; when the cells run
    top to bottom this definition replaces the earlier scoring-based one.

    A food is a prefix match when its lower-cased description starts with the
    ingredient and the next character (if any) is a word boundary, so "corn"
    matches "Corn, sweet" but not "Cornstarch". Missing or None descriptions
    and scores are treated as "" and 0.

    Args:
        ingredient_name: The ingredient to match.
        foods: List of food dicts from the USDA API.

    Returns:
        The prefix match with the highest USDA 'score' (first one on ties);
        the first food when nothing prefix-matches; None when `foods` is empty.
    """
    if not foods:
        return None

    ingredient_lower = ingredient_name.lower().strip()

    def is_prefix_match(food):
        desc_lower = (food.get("description") or "").lower()
        if not desc_lower.startswith(ingredient_lower):
            return False
        # Require a boundary after the prefix to reject partial-word matches.
        remainder = desc_lower[len(ingredient_lower):]
        return not remainder or remainder[0] in (",", " ", "(", ":")

    prefix_matches = [f for f in foods if is_prefix_match(f)]

    if prefix_matches:
        # USDA already ranks its results; take the highest-scored prefix match.
        return max(prefix_matches, key=lambda f: f.get("score") or 0)

    return foods[0]

In [53]:
test_ingredient = "rice"

params_test = {
    "api_key": api_key,
    "query": test_ingredient,
    "pageSize": 10,  # Get multiple results
    "dataType": ["SR Legacy"],
}

response = await client.get(search_url, params=params_test)
data = response.json()

if data.get("foods"):
    # Score all results
    scored = [(score_food_match(test_ingredient, food), food) for food in data["foods"]]
    scored.sort(key=lambda x: x[0], reverse=True)
    
    print(f"Top 5 matches for '{test_ingredient}':\n")
    for i, (score, food) in enumerate(scored[:5]):
        print(f"{i+1}. Score: {score:.1f}")
        print(f"   Description: {food.get('description')}")
        print(f"   Category: {food.get('foodCategory')}")
        print(f"   USDA Score: {food.get('score')}")
        print()
    
    # Get best match
    best = select_best_match(test_ingredient, data["foods"])
    print(f"\nBest match: {best.get('description')}")
    print(f"Category: {best.get('foodCategory')}")

Top 5 matches for 'rice':

1. Score: 877.8
   Description: Rice crackers
   Category: Snacks
   USDA Score: 278.1325

2. Score: 696.5
   Description: Rice and vermicelli mix, rice pilaf flavor, unprepared
   Category: Meals, Entrees, and Side Dishes
   USDA Score: 265.2984

3. Score: 361.7
   Description: Snacks, rice cracker brown rice, plain
   Category: Snacks
   USDA Score: 277.801

4. Score: 348.7
   Description: Snacks, rice cakes, brown rice, multigrain
   Category: Snacks
   USDA Score: 277.801

5. Score: 348.3
   Description: Snacks, rice cakes, brown rice, buckwheat
   Category: Snacks
   USDA Score: 277.801


Best match: Rice crackers
Category: Snacks


In [34]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Load lightweight embedding model ('all-MiniLM-L6-v2') via SentenceTransformer.
# NOTE(review): `model` is not referenced by any later cell visible here —
# presumably intended for an embedding-based matcher; confirm or remove.
print("Loading embedding model...")
model = SentenceTransformer('all-MiniLM-L6-v2')
print("Model loaded!")

  from .autonotebook import tqdm as notebook_tqdm


Loading embedding model...
Model loaded!


In [37]:
def select_best_match_simple(ingredient_name: str, foods: list) -> dict:
    """Pick a prefix match by description shape (word and comma counts).

    Candidates are foods whose lower-cased description starts with the
    ingredient followed by end-of-string, a comma, or a space. Among them,
    descriptions with 3-6 words and 1-2 commas rank highest, then those with
    at least 3 words and 1 comma, then everything else; the earliest food wins
    a tie. Returns the first food when there is no candidate, and None when
    `foods` is empty.
    """
    if not foods:
        return None

    needle = ingredient_name.lower().strip()

    def starts_with_ingredient(food):
        text = food.get("description", "").lower()
        if not text.startswith(needle):
            return False
        remainder = text[len(needle):]
        return remainder == "" or remainder[0] in (',', ' ')

    candidates = [food for food in foods if starts_with_ingredient(food)]
    if not candidates:
        return foods[0]

    def shape_rank(food):
        text = food.get("description", "")
        n_words = len(text.split())
        n_commas = text.count(',')
        # Top tier: 3-6 words with 1-2 commas
        if 3 <= n_words <= 6 and 1 <= n_commas <= 2:
            return 1000
        # Middle tier: reasonably descriptive, at least one comma
        if n_words >= 3 and n_commas >= 1:
            return 500
        return 100

    # max() keeps the first of equally ranked candidates, like a stable descending sort.
    return max(candidates, key=shape_rank)

## Checking recipes data

In [14]:
import pandas as pd

# Load the processed recipes table (path is relative to the working directory)
# and preview its first rows.
recipes_path = Path('../../data/recipes/processed/recipes.parquet')
df = pd.read_parquet(recipes_path)
df.head()


Unnamed: 0,id,title,ingredients,directions,ner,source,link
0,recipe_0000000,No-Bake Nut Cookies,"[""1 c. firmly packed brown sugar"", ""1/2 c. eva...","[""In a heavy 2-quart saucepan, mix brown sugar...","[""brown sugar"", ""milk"", ""vanilla"", ""nuts"", ""bu...",Gathered,www.cookbooks.com/Recipe-Details.aspx?id=44874
1,recipe_0000001,Jewell Ball'S Chicken,"[""1 small jar chipped beef, cut up"", ""4 boned ...","[""Place chipped beef on bottom of baking dish....","[""beef"", ""chicken breasts"", ""cream of mushroom...",Gathered,www.cookbooks.com/Recipe-Details.aspx?id=699419
2,recipe_0000002,Creamy Corn,"[""2 (16 oz.) pkg. frozen corn"", ""1 (8 oz.) pkg...","[""In a slow cooker, combine all ingredients. C...","[""frozen corn"", ""cream cheese"", ""butter"", ""gar...",Gathered,www.cookbooks.com/Recipe-Details.aspx?id=10570
3,recipe_0000003,Chicken Funny,"[""1 large whole chicken"", ""2 (10 1/2 oz.) cans...","[""Boil and debone chicken."", ""Put bite size pi...","[""chicken"", ""chicken gravy"", ""cream of mushroo...",Gathered,www.cookbooks.com/Recipe-Details.aspx?id=897570
4,recipe_0000004,Reeses Cups(Candy),"[""1 c. peanut butter"", ""3/4 c. graham cracker ...","[""Combine first four ingredients and press in ...","[""peanut butter"", ""graham cracker crumbs"", ""bu...",Gathered,www.cookbooks.com/Recipe-Details.aspx?id=659239


In [15]:
# Work with the first 100 recipes only.
df_subset = df.head(100)

In [20]:
# Print the raw ingredient list of the first 20 recipes. Iterating positionally
# (rather than looking up the labels 0..19) keeps working when the frame's index
# is not a default 0-based range or holds fewer than 20 rows.
for ingredients in df_subset['ingredients'].head(20):
    print(ingredients)

["1 c. firmly packed brown sugar", "1/2 c. evaporated milk", "1/2 tsp. vanilla", "1/2 c. broken nuts (pecans)", "2 Tbsp. butter or margarine", "3 1/2 c. bite size shredded rice biscuits"]
["1 small jar chipped beef, cut up", "4 boned chicken breasts", "1 can cream of mushroom soup", "1 carton sour cream"]
["2 (16 oz.) pkg. frozen corn", "1 (8 oz.) pkg. cream cheese, cubed", "1/3 c. butter, cubed", "1/2 tsp. garlic powder", "1/2 tsp. salt", "1/4 tsp. pepper"]
["1 large whole chicken", "2 (10 1/2 oz.) cans chicken gravy", "1 (10 1/2 oz.) can cream of mushroom soup", "1 (6 oz.) box Stove Top stuffing", "4 oz. shredded cheese"]
["1 c. peanut butter", "3/4 c. graham cracker crumbs", "1 c. melted butter", "1 lb. (3 1/2 c.) powdered sugar", "1 large pkg. chocolate chips"]
["6 baking potatoes", "1 lb. of extra lean ground beef", "2/3 c. butter or margarine", "6 c. milk", "3/4 tsp. salt", "1/2 tsp. pepper", "1 1/2 c (6 oz.) shredded Cheddar cheese, divided", "12 sliced bacon, cooked, crumbled a