In [3]:
# BakingLLM - Structured Recipe Conversion System
import torch
from transformers import AutoTokenizer, DistilBertForTokenClassification, pipeline
from pydantic import BaseModel
from typing import List, Literal
import google.generativeai as genai
import json
import os
from dotenv import load_dotenv

# Configuration
load_dotenv()
genai.configure(api_key=os.getenv("GEMINI_API_KEY"))
gemini_client = genai.GenerativeModel("gemini-2.0-flash")

# FoodBERT Model Setup
FOODBERT_CHECKPOINT = "chambliss/distilbert-for-food-extraction"
tokenizer = AutoTokenizer.from_pretrained(FOODBERT_CHECKPOINT)
model = DistilBertForTokenClassification.from_pretrained(FOODBERT_CHECKPOINT)
LABEL_MAP = {0: "B-INGREDIENT", 1: "I-INGREDIENT", 2: "B-AMOUNT", 
             3: "I-AMOUNT", 4: "B-UNIT", 5: "I-UNIT", 6: "O"}

# Data Models
class Ingredient(BaseModel):
    name: str
    amount: str
    unit: str
    type: Literal["dry", "liquid"]

class Recipe(BaseModel):
    ingredients: List[Ingredient]

# Ingredient Type Lookup
INGREDIENT_TYPE_LOOKUP = {
    "flour": "dry", "sugar": "dry", "salt": "dry", "baking": "dry",
    "milk": "liquid", "water": "liquid", "oil": "liquid", "vanilla": "liquid"
}

def ingredient_type_lookup(name: str) -> str:
    lower_name = name.lower()
    return next((v for k, v in INGREDIENT_TYPE_LOOKUP.items() if k in lower_name), "dry")

# Extraction Pipeline
def parse_entities(text: str) -> Recipe:
    ner_pipeline = pipeline("token-classification", model=model, tokenizer=tokenizer, 
                          aggregation_strategy="simple")
    entities = ner_pipeline(text)
    ingredients = []
    current = {"name": "", "amount": "", "unit": ""}
    
    for entity in entities:
        token = text[entity["start"]:entity["end"]]
        # Use entity_group instead of entity[-1]
        label = entity["entity_group"]
        
        if label == "B-INGREDIENT":
            if current["name"]:
                ingredients.append(current)
                current = {"name": "", "amount": "", "unit": ""}
            current["name"] = token
        elif label == "I-INGREDIENT":
            current["name"] += " " + token
        elif label.startswith("B-AMOUNT"):
            current["amount"] = token
        elif label.startswith("B-UNIT"):
            current["unit"] = token
    
    if current["name"]:
        ingredients.append(current)
    
    return Recipe(ingredients=[
        Ingredient(
            name=ing["name"].strip(),
            amount=ing["amount"],
            unit=ing["unit"],
            type=ingredient_type_lookup(ing["name"])
        ) for ing in ingredients
    ])
# Conversion Engine
CONVERSION_PROMPT = """You are a precise measurement converter for baking ingredients. 
For DRY ingredients return weights in grams/ounces, LIQUID in milliliters/fluid ounces.
Return ONLY JSON in format: {"ingredient": {"original": "...", "type": "...", "metric": "...", "imperial": "..."}}"""

def convert_measurements(recipe: Recipe) -> dict:
    ingredients_str = ", ".join(f"{i.amount} {i.unit} {i.name} ({i.type})" for i in recipe.ingredients)
    response = gemini_client.generate_content(CONVERSION_PROMPT + f"\nConvert: {ingredients_str}")
    try:
        return json.loads(response.text)
    except json.JSONDecodeError:
        print("Conversion failed, using empty dict")
        return {}

# Execution
if __name__ == "__main__":
    recipe_text = "2 cups all-purpose flour, 1.5 cups sugar, 3/4 cup milk, 1 tsp vanilla extract"
    extracted = parse_entities(recipe_text)
    print("Extracted:\n", extracted)#.model_dump_json(indent=2))
    
    conversions = convert_measurements(extracted)
    print("\nConverted:\n", json.dumps(conversions, indent=2))
    
    with open("conversions.json", "w") as f:
        json.dump(conversions, f, indent=2)
    print("\nSaved to conversions.json")


Extracted:
 ingredients=[]
Conversion failed, using empty dict

Converted:
 {}

Saved to conversions.json
