# Data Collection

We are using the EDAMAM API to collect data. Here we explore its functionality and what kind of data, and types of format, we can get using the free tier.

Visit this link for API keys: https://developer.edamam.com/admin/applications.

In [1]:
import os
import requests
import pandas as pd

edamam_api_id = os.environ["EDAMAM_API_ID"]
edamam_api_key = os.environ["EDAMAM_API_KEY"]

## Initial Exploration

In [7]:
def recipe_search(ingredient, from_index=0, to_index=10):
    if to_index > 100:
        raise ValueError("to_index must be 100 at maximum")
    
    app_id = edamam_api_id  # Replace with your Edamam API app ID
    app_key = edamam_api_key  # Replace with your Edamam API app key
    result = requests.get(
        'https://api.edamam.com/search?q={}&app_id={}&app_key={}&from={}&to={}'.format(
            ingredient, app_id, app_key, from_index, to_index
        )
    )
    data = result.json()
    return data['hits']

def get_recipe_df(recipes):
    recipes_lst = [recipes[i]['recipe'] for i in range(len(recipes))]
    return pd.DataFrame(recipes_lst)

In [8]:
recipes = recipe_search("broccoli")

In [12]:
recipes[0]['recipe']

{'uri': 'http://www.edamam.com/ontologies/edamam.owl#recipe_543023fc1fef4a1448c2acd81155f12c',
 'label': 'Steamed Broccoli',
 'image': 'https://edamam-product-images.s3.amazonaws.com/web-img/5d4/5d4b2cfd62d545fbd0d9bc9b89f96530?X-Amz-Security-Token=IQoJb3JpZ2luX2VjED0aCXVzLWVhc3QtMSJGMEQCIAWX3KTA1ApTQ780MrFvNBm9Wv2rYlmT5IBi5zNMjabhAiA8HfPx6v66rA7u440QinAUbNpXWtw3ak%2BnriRMqTgyyyq5BQhGEAAaDDE4NzAxNzE1MDk4NiIMr3N%2F%2F8J5NCew8u0DKpYF%2BUDUk53O1Py6kdq%2FA0neYoP6tVxA%2FSxXb2h73Sld8SAppr6B40qcMabAY1gihHpEtcbqtUf9mTEIGtRRcdvYigNynsIrtNgMM8TzrFtlaxu4YqtzPEecBh%2F9iiGWAyjfs0eOomlte9FUNyRJKbSAUpgnKKTWgaKBQJqUcJFpcyWJ2UaWOpgMhcGiaEnPKdto9n1TL6prHUQHDugcA75xS9DHkgEbNfJzpdGry6QHqSR4h9Vdt8Kvd%2B76%2FN8bwSjRPLGSXx9kcs%2BZPRt%2BpWLUHaRQL9nFAHXZSKXfhkpYOl9ZZI%2F4%2FJSnuVk2PsJCE6i0iK1BMQtivHM92uK1ejmFYts87W5yPVt%2FtvFtLYlJKCB4u0iBBcrDj%2F60N80K1doS7ODcweKGJ73s4BKpbZQJJiixUGIiJ6g07VZrHQHGjqNCzmGrPm1IR73hPGL%2FPqjXP0dsd1BXt0JlHTQIj9xtizfRPSNLHWRHTo2vq3h4jB5rGOuQ5AU1YmlvZf9UsXbS2HQThC42hs1g1W0xN0qKpxJj4Ar

In [15]:
def get_recipes(from_index=0, to_index=10):
    app_id = edamam_api_id
    app_key = edamam_api_key
    
    params = {
        'type': 'public',
        'app_id': app_id,
        'app_key': app_key,
        'q': '*',  # Search all recipes
        'imageSize': 'THUMBNAIL',
        'random': 'true',
        'field': ['ingredientLines', 'calories']
    }
    
    response = requests.get('https://api.edamam.com/api/recipes/v2', params=params)
    data = response.json()
    return data['hits'] if 'hits' in data else []

In [17]:
get_recipes()[0]

{'recipe': {'ingredientLines': ['1/2 medium sized pineapple, cut into triangles',
   '4 ounces goat cheese',
   '1 small lime, juiced',
   '1/4 cup freshly chopped cilantro'],
  'calories': 541.6159642000001},
 '_links': {'self': {'href': 'https://api.edamam.com/api/recipes/v2/64b1b6b9db872dbb3c8b9b7396b0734f?app_id=213b4d83&app_key=eb0bcd72cc7e2adc795cca5d7b6b4b37',
   'title': 'Self'}}}

We can also search for food by name but there doesn't seem to be a clear indicator of serving size when looking at the nutrient data.

In [23]:
from dotenv import load_dotenv

load_dotenv()

edamam_food_api_id = os.environ["EDAMAM_FOOD_API_ID"]
edamam_food_api_key = os.environ["EDAMAM_FOOD_API_KEY"]

In [24]:
def search_food(query):
    app_id = edamam_food_api_id
    app_key = edamam_food_api_key
   
    params = {
        "app_id": app_id,
        "app_key": app_key,
        "ingr": query
    }

    response = requests.get(
        "https://api.edamam.com/api/food-database/v2/parser",
        params=params
    )

    return response.json()

In [25]:
search_food("broccoli")

{'text': 'broccoli',
 'count': 3753,
 'parsed': [{'food': {'foodId': 'food_aahw0jha9f8337ajbopx9aec6z7i',
    'label': 'Broccoli',
    'knownAs': 'broccoli',
    'nutrients': {'ENERC_KCAL': 34.0,
     'PROCNT': 2.82,
     'FAT': 0.37,
     'CHOCDF': 6.64,
     'FIBTG': 2.6},
    'category': 'Generic foods',
    'categoryLabel': 'food',
    'image': 'https://www.edamam.com/food-img/3e4/3e47317a3dd54dc911b9c44122285df1.jpg'}}],
 'hints': [{'food': {'foodId': 'food_aahw0jha9f8337ajbopx9aec6z7i',
    'label': 'Broccoli',
    'knownAs': 'broccoli',
    'nutrients': {'ENERC_KCAL': 34.0,
     'PROCNT': 2.82,
     'FAT': 0.37,
     'CHOCDF': 6.64,
     'FIBTG': 2.6},
    'category': 'Generic foods',
    'categoryLabel': 'food',
    'image': 'https://www.edamam.com/food-img/3e4/3e47317a3dd54dc911b9c44122285df1.jpg'},
   'measures': [{'uri': 'http://www.edamam.com/ontologies/edamam.owl#Measure_serving',
     'label': 'Serving',
     'weight': 148.0},
    {'uri': 'http://www.edamam.com/ontologies

## Data Collection Pipeline

**Extract from Edamam API - Setup and Data Retrieval**
- Set up API client with proper error handling
- Define what fields you need from the API
- Implement rate limiting and pagination
- Basic validation of API responses

**Store Raw Data in PostgreSQL - Database Setup and Data Storage**
- Create `raw_recipes` table with JSONB column
- Set up database connection
- Implement batch inserts for efficiency
- Add metadata (timestamp, source, etc.)

**Clean and Transform - Data Preprocessing and Validation**
- Parse ingredient strings
- Normalize units
- Standardize nutritional values
- Remove duplicates
- Validate data quality (your embedding-based checks)
- Handle missing values

**Store Cleaned Data - Normalized Data Storage**
- Create normalized table schema (recipes, ingredients, nutritional_facts)
- Implement foreign key relationships
- Store with references to raw data
- Add indexes for common queries

**Additional Considerations**
- Add logging throughout the pipeline
- Implement error handling for each step
- Add basic data quality metrics
- Make the pipeline idempotent (can safely re-run)

Note: later we can also create embeddings for data quality checks (embed ingredient lists and do semantic search with LLM output to find similar recipes. Use a threshold - if there is no recipe similar to it then the LLM output may not be good). Also, use embeddings to find similar recipe names, which can be given to another LLM prompt to generate the most relevant one.

In [2]:
import os
import requests
import pandas as pd

edamam_api_id = os.environ["EDAMAM_API_ID"]
edamam_api_key = os.environ["EDAMAM_API_KEY"]

Note that this is the version 2 of the EDAMAM API.

In [3]:
def get_recipes(query='*', from_index=0, to_index=10):
    app_id = edamam_api_id
    app_key = edamam_api_key
    
    params = {
        'type': 'public',
        'app_id': app_id,
        'app_key': app_key,
        'q': query,
        'imageSize': 'THUMBNAIL',
        'random': 'true',
        'field': [
            'label',                # Recipe name
            'ingredientLines',      # List of ingredients
            'ingredients',          # Detailed ingredient info
            'calories',            
            'totalNutrients',       # Detailed nutrition
            'totalWeight',
            'dietLabels',
            'healthLabels',
            'cuisineType',
            'mealType',
            'dishType',
            'yield',               # Number of servings
            'totalTime',           # Cooking time
            'instructions',        # Cooking steps
            'source',              # Recipe source
            'uri',                 # Unique identifier
            'images'               # Recipe images
        ]
    }
    
    response = requests.get('https://api.edamam.com/api/recipes/v2', params=params)
    data = response.json()
    return data['hits'] if 'hits' in data else []

In [4]:
sample_recipes = get_recipes()

In [5]:
sample_recipes[0].keys()

dict_keys(['recipe', '_links'])

In [6]:
sample_recipes[0]['recipe'].keys()

dict_keys(['uri', 'label', 'images', 'source', 'yield', 'dietLabels', 'healthLabels', 'ingredientLines', 'ingredients', 'calories', 'totalWeight', 'totalTime', 'cuisineType', 'mealType', 'dishType', 'totalNutrients'])

In [7]:
sample_recipes[0]['recipe']['healthLabels']

['Low Potassium',
 'Kidney-Friendly',
 'Vegetarian',
 'Pescatarian',
 'Peanut-Free',
 'Tree-Nut-Free',
 'Soy-Free',
 'Fish-Free',
 'Shellfish-Free',
 'Pork-Free',
 'Red-Meat-Free',
 'Crustacean-Free',
 'Celery-Free',
 'Mustard-Free',
 'Sesame-Free',
 'Lupine-Free',
 'Mollusk-Free',
 'Kosher']

The totalNutrients dictionary contains all the nutritional information for the recipe, including macronutrients that we are interested in.

In [8]:
sample_recipes[0]['recipe']['totalNutrients']

{'ENERC_KCAL': {'label': 'Energy', 'quantity': 3669.086688, 'unit': 'kcal'},
 'FAT': {'label': 'Fat', 'quantity': 218.9987755, 'unit': 'g'},
 'FASAT': {'label': 'Saturated', 'quantity': 130.19909093, 'unit': 'g'},
 'FATRN': {'label': 'Trans', 'quantity': 0.04902, 'unit': 'g'},
 'FAMS': {'label': 'Monounsaturated', 'quantity': 67.1051496445, 'unit': 'g'},
 'FAPU': {'label': 'Polyunsaturated',
  'quantity': 10.273119147100001,
  'unit': 'g'},
 'CHOCDF': {'label': 'Carbs', 'quantity': 417.197362215, 'unit': 'g'},
 'CHOCDF.net': {'label': 'Carbohydrates (net)',
  'quantity': 400.4413873,
  'unit': 'g'},
 'FIBTG': {'label': 'Fiber', 'quantity': 16.755974915000003, 'unit': 'g'},
 'SUGAR': {'label': 'Sugars', 'quantity': 295.17462082500003, 'unit': 'g'},
 'SUGAR.added': {'label': 'Sugars, added',
  'quantity': 293.728920825,
  'unit': 'g'},
 'PROCNT': {'label': 'Protein', 'quantity': 40.73118977, 'unit': 'g'},
 'CHOLE': {'label': 'Cholesterol',
  'quantity': 844.3050000000001,
  'unit': 'mg'}

In order to collect a diverse dataset, we should query for all recipes with combinations of the various fields/metadata available to us. More specifically, we should be able to query for all recipes that have a certain cuisine type, meal type, dish type, etc.

The different values we can search for, per metadata, are things you can find in the documentation. Lists of each are below.

Also some things to note: label is the meal name, totalTime is in minutes, calories are in kcal, yield is number of servings, and 

In [9]:
diet_labels = [
    "balanced",      # Protein/Fat/Carb values in 15/35/50 ratio
    "high-fiber",    # More than 5g fiber per serving
    "high-protein",  # More than 50% of total calories from proteins
    "low-carb",      # Less than 20% of total calories from carbs
    "low-fat",       # Less than 15% of total calories from fat
    "low-sodium"     # Less than 140mg Na per serving
]

In [10]:
health_labels = [
    "alcohol-cocktail",    # Describes an alcoholic cocktail
    "alcohol-free",        # No alcohol used or contained
    "celery-free",        # Does not contain celery or derivatives
    "crustacean-free",    # Does not contain crustaceans
    "dairy-free",         # No dairy; no lactose
    "DASH",               # Dietary Approaches to Stop Hypertension diet
    "egg-free",           # No eggs or products containing eggs
    "fish-free",          # No fish or fish derivatives
    "fodmap-free",        # Does not contain FODMAP foods
    "gluten-free",        # No ingredients containing gluten
    "immuno-supportive",  # Science-based immune system strengthening
    "keto-friendly",      # Maximum 7 grams of net carbs per serving
    "kidney-friendly",    # Restricted phosphorus, potassium, and sodium
    "kosher",             # Contains only kosher-allowed ingredients
    "low-potassium",      # Less than 150mg per serving
    "low-sugar",          # No simple sugars
    "lupine-free",        # Does not contain lupine or derivatives
    "Mediterranean",      # Mediterranean diet
    "mollusk-free",       # No mollusks
    "mustard-free",       # Does not contain mustard or derivatives
    "No-oil-added",       # No oil added except in basic ingredients
    "paleo",              # Excludes agricultural products
    "peanut-free",        # No peanuts or products containing peanuts
    "pecatarian",         # No meat, can contain dairy and fish
    "pork-free",          # Does not contain pork or derivatives
    "red-meat-free",      # No red meat or products containing red meat
    "sesame-free",        # Does not contain sesame seed or derivatives
    "shellfish-free",     # No shellfish or shellfish derivatives
    "soy-free",           # No soy or products containing soy
    "sugar-conscious",    # Less than 4g of sugar per serving
    "sulfite-free",       # No Sulfites
    "tree-nut-free",      # No tree nuts or products containing tree nuts
    "vegan",              # No animal products
    "vegetarian",         # No meat, poultry, or fish
    "wheat-free"          # No wheat, can have gluten though
]

In [11]:
meal_types = [
    "breakfast",
    "brunch",
    "lunch", # lunch/dinner are the same according to the API, so we only need one of these labels
    "snack",
    "teatime"
]

In [12]:
dish_types = [
    "alcohol cocktail",
    "biscuits and cookies",
    "bread",
    "cereals",
    "condiments and sauces",
    "desserts",
    "drinks",
    "egg",
    "ice cream and custard",
    "main course",
    "pancake",
    "pasta",
    "pastry",
    "pies and tarts",
    "pizza",
    "preps",
    "preserve",
    "salad",
    "sandwiches",
    "seafood",
    "side dish",
    "soup",
    "special occasions",
    "starter",
    "sweets"
]

In [13]:
cuisine_types = [
    "american",
    "asian",
    "british",
    "caribbean",
    "central europe",
    "chinese",
    "eastern europe",
    "french",
    "greek",
    "indian",
    "italian",
    "japanese",
    "korean",
    "kosher",
    "mediterranean",
    "mexican",
    "middle eastern",
    "nordic",
    "south american",
    "south east asian",
    "world" # International cuisine/Other
]

In [14]:
def get_recipes(diet_labels, health_labels, meal_types, dish_types, cuisine_types, query='*', from_index=0, to_index=1000):
    app_id = edamam_api_id
    app_key = edamam_api_key
    
    params = {
        'type': 'public',
        'app_id': app_id,
        'app_key': app_key,
        'q': query,
        'imageSize': 'THUMBNAIL',
        'random': 'true',
        'field': [
            'label',               
            'ingredientLines',     
            'ingredients',         
            'totalNutrients',      
            'totalWeight',
            'dietLabels',
            'healthLabels',
            'cuisineType',
            'mealType',
            'dishType',
            'yield',              
            'totalTime',           
            'source',              
            'uri',                
            'images'               
        ]
    }
    
    # Only add parameters if they're not empty
    if diet_labels:
        params['diet'] = diet_labels if isinstance(diet_labels, list) else [diet_labels]
    if health_labels:
        params['health'] = health_labels if isinstance(health_labels, list) else [health_labels]
    if meal_types:
        params['mealType'] = meal_types if isinstance(meal_types, list) else [meal_types]
    if dish_types:
        params['dishType'] = dish_types if isinstance(dish_types, list) else [dish_types]
    if cuisine_types:
        params['cuisineType'] = cuisine_types if isinstance(cuisine_types, list) else [cuisine_types]
    
    response = requests.get('https://api.edamam.com/api/recipes/v2', params=params)
    
    # Debug information
    print(f"URL: {response.url}")
    print(f"Status Code: {response.status_code}")
    print(f"Response: {response.json()}")
    
    data = response.json()
    return data['hits'] if 'hits' in data else []

# Try with a single parameter first to isolate issues
sample_recipes = get_recipes(
    diet_labels=['balanced'],
    health_labels=['egg-free'],
    meal_types=['dinner'],
    dish_types=['salad'],
    cuisine_types=['american']
)

URL: https://api.edamam.com/api/recipes/v2?type=public&app_id=213b4d83&app_key=eb0bcd72cc7e2adc795cca5d7b6b4b37&q=%2A&imageSize=THUMBNAIL&random=true&field=label&field=ingredientLines&field=ingredients&field=totalNutrients&field=totalWeight&field=dietLabels&field=healthLabels&field=cuisineType&field=mealType&field=dishType&field=yield&field=totalTime&field=source&field=uri&field=images&diet=balanced&health=egg-free&mealType=dinner&dishType=salad&cuisineType=american
Status Code: 200
Response: {'from': 1, 'to': 20, 'count': 2789, '_links': {}, 'hits': [{'recipe': {'uri': 'http://www.edamam.com/ontologies/edamam.owl#recipe_620c4f9adcb7509cbfbb6828ce8fc679', 'label': 'Lime Salad Spritz', 'images': {'THUMBNAIL': {'url': 'https://www.edamam.com/web-img/not-found.jpg', 'width': 100, 'height': 100}, 'SMALL': {'url': 'https://www.edamam.com/web-img/not-found.jpg', 'width': 200, 'height': 200}, 'REGULAR': {'url': 'https://www.edamam.com/web-img/not-found.jpg', 'width': 300, 'height': 300}}, 'so

In [90]:
len(sample_recipes)

20

In [94]:
sample_recipes[0]['recipe']['mealType']

['lunch/dinner']

Testing out another example.

In [97]:
sample_recipes_2 = get_recipes(
    diet_labels=['balanced'],
    health_labels=['sugar-conscious'],
    meal_types=['breakfast'],
    dish_types=['egg'],
    cuisine_types=['american']
)
len(sample_recipes_2)

URL: https://api.edamam.com/api/recipes/v2?type=public&app_id=213b4d83&app_key=eb0bcd72cc7e2adc795cca5d7b6b4b37&q=%2A&imageSize=THUMBNAIL&random=true&field=label&field=ingredientLines&field=ingredients&field=totalNutrients&field=totalWeight&field=dietLabels&field=healthLabels&field=cuisineType&field=mealType&field=dishType&field=yield&field=totalTime&field=source&field=uri&field=images&diet=balanced&health=sugar-conscious&mealType=breakfast&dishType=egg&cuisineType=american
Status Code: 200
Response: {'from': 1, 'to': 9, 'count': 9, '_links': {}, 'hits': [{'recipe': {'uri': 'http://www.edamam.com/ontologies/edamam.owl#recipe_149fd7fcf00a4ec1904359195733dcfd', 'label': 'Arepas de Huevo (Egg-Stuffed Corn Cakes)', 'images': {'THUMBNAIL': {'url': 'https://www.edamam.com/web-img/not-found.jpg', 'width': 100, 'height': 100}, 'SMALL': {'url': 'https://www.edamam.com/web-img/not-found.jpg', 'width': 200, 'height': 200}, 'REGULAR': {'url': 'https://www.edamam.com/web-img/not-found.jpg', 'width'

9

So when setting all these parameters, we tend not to get too many results as multiple parameters could be contradictory and result in not recipes that meet our criteria. So we need to decide on combinations of parameters that are good (will return non-empty lists at least, but preferably lists with a large number of recipes). We could do this manually, but instead we can do this with adaptive sampling.

In [15]:
from pydantic import BaseModel, Field
import random
import time
from collections import Counter

In [18]:
class RecipeParameters(BaseModel):
    """Stores all valid parameter values for recipe searches"""
    diet_labels: list[str] = Field(
        description="List of valid diet labels (e.g. 'balanced', 'high-protein', 'low-fat')",
    )
    health_labels: list[str] = Field(
        description="List of valid health labels (e.g. 'egg-free', 'gluten-free', 'peanut-free')",
    )
    meal_types: list[str] = Field(
        description="List of valid meal types (e.g. 'breakfast', 'lunch', 'dinner')",
    )
    dish_types: list[str] = Field(
        description="List of valid dish types (e.g. 'salad', 'main course', 'dessert')",
    )
    cuisine_types: list[str] = Field(
        description="List of valid cuisine types (e.g. 'american', 'italian', 'chinese')",
    )
    
    class Config:
        json_schema_extra = {
            "example": {
                "diet_labels": ["balanced", "high-protein"],
                "health_labels": ["egg-free", "gluten-free"],
                "meal_types": ["breakfast", "lunch"],
                "dish_types": ["salad", "main course"],
                "cuisine_types": ["american", "italian"]
            }
        }

In [19]:
class ParameterStats(BaseModel):
    """Statistics for parameter success rates"""
    success: int = Field(default=0, description="Number of successful queries")
    total: int = Field(default=0, description="Total number of queries")

In [25]:
from typing import Any

class RecipeDataCollector:
    def __init__(self,
                 params: RecipeParameters,
                 api_client: Any,
                 min_recipes_per_category: int = 100,
                 max_retries: int = 3,
                 rate_limit: float = 0.5):
        self.params = params
        self.api_client = api_client
        self.min_recipes = min_recipes_per_category
        self.max_retries = max_retries
        self.rate_limit = rate_limit
        self.collected_recipes: dict[str, dict] = {}
        self.parameter_stats: dict[str, ParameterStats] = {}
        
    def _update_stats(self, params: dict[str, list[str]], success: bool):
        """Update success statistics for parameter combinations"""
        for param_type in params:
            if param_type not in self.parameter_stats:
                self.parameter_stats[param_type] = ParameterStats()
                
            stats = self.parameter_stats[param_type]
            stats.total += 1
            if success:
                stats.success += 1
                
    def _store_recipe(self, recipe: dict):
        """Store recipe with additional metadata."""
        recipe_id = recipe['uri']
        self.collected_recipes[recipe_id] = {
            **recipe,
            'collection_timestamp': time.time(),
            'parameter_stats': self.parameter_stats.copy()
        }
    
    def _get_param_success_rate(self, param_type: str) -> float:
        """Calculate success rate for a parameter type"""
        if param_type not in self.parameter_stats:
            return 0.5
        
        stats = self.parameter_stats[param_type]
        return stats.success / stats.total if stats.total > 0 else 0.5
    
    def _generate_parameter_combination(self) -> dict[str, list[str]]:
            """
            Generate a smart parameter combination based on historical success rates.
            
            Returns:
                Dictionary of parameters to use in the next query
            """
            params = {}
            
            # Available parameter types
            param_types = ['diet_labels', 'health_labels', 'meal_types', 
                        'dish_types', 'cuisine_types']
            
            # Start with one random parameter type
            primary_param = random.choice(param_types)
            param_values = getattr(self.params, primary_param)
            params[primary_param] = [random.choice(param_values)]
            
            # Maybe add more parameters based on their success rates
            for param_type in param_types:
                if param_type != primary_param:
                    success_rate = self._get_param_success_rate(param_type)
                    if random.random() < success_rate:
                        param_values = getattr(self.params, param_type)
                        params[param_type] = [random.choice(param_values)]
                        
            return params
        
    async def collect_recipes(self, target_recipes: int = 1000) -> list[dict]:
        """
        Collect recipes using adaptive parameter sampling.
        
        Args:
            target_recipes: Number of unique recipes to collect
            
        Returns:
            List of collected recipes
        """
        unique_recipes = set()
        retries = 0
        
        while len(unique_recipes) < target_recipes and retries < self.max_retries:
            try:
                # Generate parameter combination
                params = self._generate_parameter_combination()
                print(f"Trying parameters: {params}")
                
                # Rate limiting
                time.sleep(self.rate_limit)
                
                # Get recipes
                recipes = await self.api_client.get_recipes(**params)
                
                if recipes:
                    # Update stats for successful combination
                    self._update_stats(params, success=True)
                    
                    # Store new unique recipes
                    for recipe in recipes:
                        recipe_id = recipe['uri']
                        if recipe_id not in unique_recipes:
                            unique_recipes.add(recipe_id)
                            self._store_recipe(recipe)
                else:
                    # Update stats for unsuccessful combination
                    self._update_stats(params, success=False)
                    
                print(f"Collected {len(unique_recipes)} unique recipes")
                
            except Exception as e:
                print(f"Error collecting recipes: {str(e)}")
                retries += 1
                
        return list(self.collected_recipes.values())

In [41]:
def _get_param_success_rate(self, param_type: str) -> float:
    """Calculate success rate for a parameter type"""
    if param_type not in self.parameter_stats:
        return 0.5
    
    stats = self.parameter_stats[param_type]
    return stats.success / stats.total if stats.total > 0 else 0.5

In [48]:
params = {}

# Available parameter types
param_types = ['diet_labels', 'health_labels', 'meal_types', 
            'dish_types', 'cuisine_types']

# Start with one random parameter type
primary_param = random.choice(param_types)
param_values = getattr(RecipeParameters, primary_param)
params[primary_param] = [random.choice(param_values)]

# Maybe add more parameters based on their success rates
for param_type in param_types:
    if param_type != primary_param:
        success_rate = _get_param_success_rate(param_type)
        if random.random() < success_rate:
            param_values = getattr(RecipeParameters, param_type)
            params[param_type] = [random.choice(param_values)]

AttributeError: diet_labels

In [36]:
# First, import everything we need
from pydantic import BaseModel, Field
import asyncio
import aiohttp
import time
import random
from typing import Any

# We already have our RecipeParameters and ParameterStats classes defined

# Let's create a simple API client for Edamam
class EdamamClient:
    def __init__(self, app_id: str, app_key: str):
        self.app_id = app_id
        self.app_key = app_key
        
    async def get_recipes(self, **params) -> list[dict]:
        """Get recipes from Edamam API"""
        base_params = {
            'type': 'public',
            'app_id': self.app_id,
            'app_key': self.app_key,
            'imageSize': 'THUMBNAIL',
            'random': 'true'
        }
        
        # Merge base params with provided params
        params = {**base_params, **params}
        
        async with aiohttp.ClientSession() as session:
            async with session.get('https://api.edamam.com/api/recipes/v2', params=params) as response:
                data = await response.json()
                return [hit['recipe'] for hit in data.get('hits', [])]

# Now let's use it:
async def main():
    # Initialize parameters with our lists from earlier
    params = RecipeParameters(
        diet_labels=diet_labels,
        health_labels=health_labels,
        meal_types=meal_types,
        dish_types=dish_types,
        cuisine_types=cuisine_types
    )

    # Create API client
    api_client = EdamamClient(
        app_id=edamam_api_id,
        app_key=edamam_api_key
    )

    # Create collector
    collector = RecipeDataCollector(
        params=params,
        api_client=api_client,
        min_recipes_per_category=10,  # Start small for testing
        max_retries=3,
        rate_limit=1.0  # 1 second between requests
    )

    # Collect recipes
    recipes = await collector.collect_recipes(target_recipes=100)  # Start with a small number
    
    # Print stats
    print(f"Collected {len(recipes)} recipes")
    print("\nParameter success rates:")
    for param_type, stats in collector.parameter_stats.items():
        success_rate = stats.success / stats.total if stats.total > 0 else 0
        print(f"{param_type}: {success_rate:.2%} ({stats.success}/{stats.total})")

    return recipes

# In Jupyter, run it like this:
recipes = await main()

Trying parameters: {'health_labels': ['mustard-free'], 'diet_labels': ['high-fiber'], 'dish_types': ['egg']}
Collected 20 unique recipes
Trying parameters: {'dish_types': ['alcohol cocktail'], 'diet_labels': ['low-carb'], 'health_labels': ['mustard-free'], 'meal_types': ['lunch']}
Collected 40 unique recipes
Trying parameters: {'dish_types': ['side dish'], 'diet_labels': ['low-fat'], 'health_labels': ['fodmap-free'], 'meal_types': ['breakfast'], 'cuisine_types': ['central europe']}
Collected 60 unique recipes
Trying parameters: {'dish_types': ['pizza'], 'diet_labels': ['high-fiber'], 'health_labels': ['low-potassium'], 'meal_types': ['brunch'], 'cuisine_types': ['japanese']}
Collected 80 unique recipes
Trying parameters: {'dish_types': ['biscuits and cookies'], 'diet_labels': ['low-carb'], 'health_labels': ['DASH'], 'meal_types': ['lunch'], 'cuisine_types': ['world']}
Collected 100 unique recipes
Collected 100 recipes

Parameter success rates:
health_labels: 100.00% (5/5)
diet_labels: 

In [78]:

api_client = EdamamClient(
    app_id=edamam_api_id,
    app_key=edamam_api_key
)

params = RecipeParameters(
    diet_labels=diet_labels,
    health_labels=health_labels,
    meal_types=meal_types,
    dish_types=dish_types,
    cuisine_types=cuisine_types
)

collector = RecipeDataCollector(
    params=params,
    api_client=api_client,
    min_recipes_per_category=10,  # Start small for testing
    max_retries=3,
    rate_limit=1.0  # 1 second between requests
)

In [37]:
if recipes:
    first_recipe = recipes[0]
    print("\nExample recipe:")
    print(f"Name: {first_recipe['label']}")
    print(f"Cuisine: {first_recipe.get('cuisineType', 'Not specified')}")
    print(f"Meal Type: {first_recipe.get('mealType', 'Not specified')}")
    print(f"Diet Labels: {first_recipe.get('dietLabels', 'None')}")


Example recipe:
Name: Mom’s Swedish Potatoes recipes
Cuisine: ['nordic']
Meal Type: ['lunch/dinner']
Diet Labels: []


In [39]:
for recipe in recipes:
    print(f"\nRecipe: {recipe['label']}")
    print("Parameters used:")
    print(f"- Cuisine: {recipe.get('cuisineType', 'Not specified')}")
    print(f"- Diet Labels: {recipe.get('dietLabels', 'None')}")
    print(f"- Health Labels: {recipe.get('healthLabels', 'None')}")
    print(f"- Meal Type: {recipe.get('mealType', 'Not specified')}")
    print(f"- Dish Type: {recipe.get('dishType', 'Not specified')}")


Recipe: Mom’s Swedish Potatoes recipes
Parameters used:
- Cuisine: ['nordic']
- Diet Labels: []
- Health Labels: ['Sugar-Conscious', 'Vegetarian', 'Pescatarian', 'Egg-Free', 'Peanut-Free', 'Tree-Nut-Free', 'Soy-Free', 'Fish-Free', 'Shellfish-Free', 'Pork-Free', 'Red-Meat-Free', 'Crustacean-Free', 'Celery-Free', 'Mustard-Free', 'Sesame-Free', 'Lupine-Free', 'Mollusk-Free', 'Alcohol-Free', 'Sulfite-Free', 'Kosher', 'Immuno-Supportive']
- Meal Type: ['lunch/dinner']
- Dish Type: ['condiments and sauces']

Recipe: Soft Chocolate Chip Cookies
Parameters used:
- Cuisine: ['american']
- Diet Labels: ['Low-Sodium']
- Health Labels: ['Low Potassium', 'Kidney-Friendly', 'Vegetarian', 'Pescatarian', 'Peanut-Free', 'Soy-Free', 'Fish-Free', 'Shellfish-Free', 'Pork-Free', 'Red-Meat-Free', 'Crustacean-Free', 'Celery-Free', 'Mustard-Free', 'Sesame-Free', 'Lupine-Free', 'Mollusk-Free', 'Alcohol-Free', 'Kosher']
- Meal Type: ['teatime']
- Dish Type: ['biscuits and cookies']

Recipe: Zucchini Bread with

### Data Storage

We will use PostgreSQL for storing our raw recipe data, however later on we will want to do it in AWS (for production).

In [137]:
from sqlalchemy import create_engine, Column, Integer, String, JSON, Float
from sqlalchemy.orm import sessionmaker, declarative_base

Let's test putting in a sample set of recipes into the database.

In [138]:
recipes[0].keys()

dict_keys(['uri', 'label', 'image', 'images', 'source', 'url', 'shareAs', 'yield', 'dietLabels', 'healthLabels', 'cautions', 'ingredientLines', 'ingredients', 'calories', 'totalCO2Emissions', 'co2EmissionsClass', 'totalWeight', 'totalTime', 'cuisineType', 'mealType', 'dishType', 'totalNutrients', 'totalDaily', 'digest', 'tags', 'collection_timestamp', 'parameter_stats'])

In [155]:
Base = declarative_base()

class Raw_Recipe(Base):
    __tablename__ = 'raw_recipes'
    
    id = Column(Integer, primary_key=True)
    uri = Column(String, unique=True)
    label = Column(String)
    url = Column(String)
    yield_ = Column(Integer)
    dietLabels = Column(JSON)
    healthLabels = Column(JSON)
    cautions = Column(JSON)
    ingredientLines = Column(JSON)
    ingredients = Column(JSON)
    calories = Column(Float)
    totalWeight = Column(Float)
    totalTime = Column(Integer)
    cuisineType = Column(JSON)
    mealType = Column(JSON)
    dishType = Column(JSON)
    totalNutrients = Column(JSON)
    totalDaily = Column(JSON)
    digest = Column(JSON)
    tags = Column(JSON)

In [156]:
import os
from dotenv import load_dotenv

load_dotenv()

postgresql_password = os.environ["POSTGRESQL_IIFYMATE_PASSWORD"]

In [157]:
engine = create_engine(f'postgresql://iifymate:{postgresql_password}@localhost/raw_recipes')
Base.metadata.create_all(engine)
Session = sessionmaker(bind=engine)

In [176]:
recipes[0]['yield']

4.0

In [177]:
def create_raw_recipe(recipe_data):
    return Raw_Recipe(
        uri=recipe_data.get('uri', ''),
        label=recipe_data.get('label', ''),
        url=recipe_data.get('url', ''),
        yield_=recipe_data.get('yield', 0.0),
        dietLabels=recipe_data.get('dietLabels', []),
        healthLabels=recipe_data.get('healthLabels', []),
        cautions=recipe_data.get('cautions', []),
        ingredientLines=recipe_data.get('ingredientLines', []),
        ingredients=recipe_data.get('ingredients', []),
        calories=recipe_data.get('calories', 0.0),
        totalWeight=recipe_data.get('totalWeight', 0.0),
        totalTime=recipe_data.get('totalTime', 0.0),
        cuisineType=recipe_data.get('cuisineType', []),
        mealType=recipe_data.get('mealType', []),
        dishType=recipe_data.get('dishType', []),
        totalNutrients=recipe_data.get('totalNutrients', {}),
        totalDaily=recipe_data.get('totalDaily', {}),
        digest=recipe_data.get('digest', []),
        tags=recipe_data.get('tags', [])
    )

In [178]:
session = Session()

for i, recipe in enumerate(recipes):
    try:
        session.add(create_raw_recipe(recipe))
    except Exception as e:
        print(f"Failed at index {i}: {e}")
        break

In [179]:
session.commit()
session.close()