# Data Collection

We use the Edamam API to collect data. Here we explore its functionality and the kinds of data, and the response formats, that are available on the free tier.

Visit this link for API keys: https://developer.edamam.com/admin/applications.

In [22]:
import os
import requests
import pandas as pd

# Credentials used by the recipe requests below. Indexing os.environ raises
# KeyError if either variable is unset.
edamam_api_id = os.environ["EDAMAM_API_ID"]
edamam_api_key = os.environ["EDAMAM_API_KEY"]

In [7]:
def recipe_search(ingredient, from_index=0, to_index=10):
    """Query the Edamam ``/search`` endpoint and return the matching hits.

    Args:
        ingredient: Free-text query sent as the ``q`` parameter.
        from_index: Value sent as the ``from`` parameter.
        to_index: Value sent as the ``to`` parameter; must not exceed 100.

    Returns:
        The value stored under ``'hits'`` in the JSON response.

    Raises:
        ValueError: If ``to_index`` is greater than 100.
        requests.HTTPError: If the API responds with an error status.
    """
    if to_index > 100:
        raise ValueError("to_index must be 100 at maximum")

    # Let requests build the query string so that spaces, '&', '#', etc. in
    # the ingredient are percent-encoded instead of corrupting the URL.
    params = {
        'q': ingredient,
        'app_id': edamam_api_id,
        'app_key': edamam_api_key,
        'from': from_index,
        'to': to_index,
    }
    # Without a timeout an unresponsive server would block the call forever.
    result = requests.get(
        'https://api.edamam.com/search',
        params=params,
        timeout=30,
    )
    # Surface HTTP failures directly rather than as a KeyError on 'hits'.
    result.raise_for_status()
    return result.json()['hits']

def get_recipe_df(recipes):
    """Build a DataFrame with one row per recipe from a collection of hits.

    Args:
        recipes: Iterable of hit mappings, each holding its recipe data under
            the ``'recipe'`` key (the shape returned by ``recipe_search``).

    Returns:
        A ``pandas.DataFrame`` built from the extracted ``'recipe'`` values.

    Raises:
        KeyError: If a hit has no ``'recipe'`` key.
    """
    # Iterate the hits directly so any iterable works, not only sequences
    # that support len() and integer indexing.
    return pd.DataFrame([hit['recipe'] for hit in recipes])

In [8]:
# Fetch hits for "broccoli" using recipe_search's default from/to values (0 and 10).
recipes = recipe_search("broccoli")

In [12]:
# Display the 'recipe' payload of the first hit.
recipes[0]['recipe']

{'uri': 'http://www.edamam.com/ontologies/edamam.owl#recipe_543023fc1fef4a1448c2acd81155f12c',
 'label': 'Steamed Broccoli',
 'image': 'https://edamam-product-images.s3.amazonaws.com/web-img/5d4/5d4b2cfd62d545fbd0d9bc9b89f96530?X-Amz-Security-Token=IQoJb3JpZ2luX2VjED0aCXVzLWVhc3QtMSJGMEQCIAWX3KTA1ApTQ780MrFvNBm9Wv2rYlmT5IBi5zNMjabhAiA8HfPx6v66rA7u440QinAUbNpXWtw3ak%2BnriRMqTgyyyq5BQhGEAAaDDE4NzAxNzE1MDk4NiIMr3N%2F%2F8J5NCew8u0DKpYF%2BUDUk53O1Py6kdq%2FA0neYoP6tVxA%2FSxXb2h73Sld8SAppr6B40qcMabAY1gihHpEtcbqtUf9mTEIGtRRcdvYigNynsIrtNgMM8TzrFtlaxu4YqtzPEecBh%2F9iiGWAyjfs0eOomlte9FUNyRJKbSAUpgnKKTWgaKBQJqUcJFpcyWJ2UaWOpgMhcGiaEnPKdto9n1TL6prHUQHDugcA75xS9DHkgEbNfJzpdGry6QHqSR4h9Vdt8Kvd%2B76%2FN8bwSjRPLGSXx9kcs%2BZPRt%2BpWLUHaRQL9nFAHXZSKXfhkpYOl9ZZI%2F4%2FJSnuVk2PsJCE6i0iK1BMQtivHM92uK1ejmFYts87W5yPVt%2FtvFtLYlJKCB4u0iBBcrDj%2F60N80K1doS7ODcweKGJ73s4BKpbZQJJiixUGIiJ6g07VZrHQHGjqNCzmGrPm1IR73hPGL%2FPqjXP0dsd1BXt0JlHTQIj9xtizfRPSNLHWRHTo2vq3h4jB5rGOuQ5AU1YmlvZf9UsXbS2HQThC42hs1g1W0xN0qKpxJj4Ar

In [15]:
def get_recipes(from_index=0, to_index=10):
    """Request recipes from the Edamam Recipe API v2 with ``random`` enabled.

    The request matches every recipe (``q='*'``) and asks only for the
    ``ingredientLines`` and ``calories`` fields.

    Args:
        from_index: Currently unused; kept so existing calls keep working.
        to_index: Currently unused; kept so existing calls keep working.

    Returns:
        The value under ``'hits'`` in the JSON response, or an empty list
        when the response has no ``'hits'`` key.
    """
    params = {
        'type': 'public',
        'app_id': edamam_api_id,
        'app_key': edamam_api_key,
        'q': '*',  # Search all recipes
        'imageSize': 'THUMBNAIL',
        'random': 'true',
        'field': ['ingredientLines', 'calories'],
    }

    # Without a timeout an unresponsive server would block the call forever.
    response = requests.get(
        'https://api.edamam.com/api/recipes/v2',
        params=params,
        timeout=30,
    )
    # Stay best-effort: a payload without 'hits' yields an empty list.
    return response.json().get('hits', [])

In [17]:
# Display the first hit returned by get_recipes().
get_recipes()[0]

{'recipe': {'ingredientLines': ['1/2 medium sized pineapple, cut into triangles',
   '4 ounces goat cheese',
   '1 small lime, juiced',
   '1/4 cup freshly chopped cilantro'],
  'calories': 541.6159642000001},
 '_links': {'self': {'href': 'https://api.edamam.com/api/recipes/v2/64b1b6b9db872dbb3c8b9b7396b0734f?app_id=<REDACTED>&app_key=<REDACTED>',
   'title': 'Self'}}}

We can also search for food by name, but the nutrient data does not seem to include a clear indicator of serving size.

In [23]:
from dotenv import load_dotenv

# Load variables from a .env file into the environment before reading them.
load_dotenv()

# Credentials used by the food search below. Indexing os.environ raises
# KeyError if either variable is unset.
edamam_food_api_id = os.environ["EDAMAM_FOOD_API_ID"]
edamam_food_api_key = os.environ["EDAMAM_FOOD_API_KEY"]

In [24]:
def search_food(query):
    """Look up a food by name via the Edamam Food Database parser endpoint.

    Args:
        query: Text sent as the ``ingr`` parameter.

    Returns:
        The decoded JSON response body.
    """
    params = {
        "app_id": edamam_food_api_id,
        "app_key": edamam_food_api_key,
        "ingr": query,
    }

    # Without a timeout an unresponsive server would block the call forever.
    response = requests.get(
        "https://api.edamam.com/api/food-database/v2/parser",
        params=params,
        timeout=30,
    )

    return response.json()

In [25]:
# Display the full parser response for the query "broccoli".
search_food("broccoli")

{'text': 'broccoli',
 'count': 3753,
 'parsed': [{'food': {'foodId': 'food_aahw0jha9f8337ajbopx9aec6z7i',
    'label': 'Broccoli',
    'knownAs': 'broccoli',
    'nutrients': {'ENERC_KCAL': 34.0,
     'PROCNT': 2.82,
     'FAT': 0.37,
     'CHOCDF': 6.64,
     'FIBTG': 2.6},
    'category': 'Generic foods',
    'categoryLabel': 'food',
    'image': 'https://www.edamam.com/food-img/3e4/3e47317a3dd54dc911b9c44122285df1.jpg'}}],
 'hints': [{'food': {'foodId': 'food_aahw0jha9f8337ajbopx9aec6z7i',
    'label': 'Broccoli',
    'knownAs': 'broccoli',
    'nutrients': {'ENERC_KCAL': 34.0,
     'PROCNT': 2.82,
     'FAT': 0.37,
     'CHOCDF': 6.64,
     'FIBTG': 2.6},
    'category': 'Generic foods',
    'categoryLabel': 'food',
    'image': 'https://www.edamam.com/food-img/3e4/3e47317a3dd54dc911b9c44122285df1.jpg'},
   'measures': [{'uri': 'http://www.edamam.com/ontologies/edamam.owl#Measure_serving',
     'label': 'Serving',
     'weight': 148.0},
    {'uri': 'http://www.edamam.com/ontologies

## Data Pipeline

### Data Collection & Storage

- Use Recipe API v2 to gather diverse recipes
- Store in PostgreSQL: ingredients, calories, metadata
- Create embeddings for data quality checks

### Feature Engineering & Model

- Process ingredient lists (tokenization, embeddings)
- Train regression model (XGBoost/LightGBM)
- Validate with cross-validation

### AWS Pipeline

- Vision LLM extracts ingredients
- Quality check against stored embeddings
- Calorie prediction
- LLM generates recipe name
- Store results in RDS
