In [2]:
# BakingLLM - Structured Recipe Conversion System (Fixed Version)
import torch
from transformers import AutoTokenizer, DistilBertForTokenClassification, pipeline
from pydantic import BaseModel
from typing import List, Literal
import google.generativeai as genai
import json
import os
from dotenv import load_dotenv

# Configuration
load_dotenv()
genai.configure(api_key=os.getenv("GEMINI_API_KEY"))
gemini_client = genai.GenerativeModel("gemini-2.0-flash")

# FoodBERT Model Setup (Search Result 4,7)
# FoodBERT Model Setup
FOODBERT_CHECKPOINT = "alexdseo/RecipeBERT"
tokenizer = AutoTokenizer.from_pretrained(FOODBERT_CHECKPOINT)
model = DistilBertForTokenClassification.from_pretrained(FOODBERT_CHECKPOINT)


# Data Models (Search Result 1)
class Ingredient(BaseModel):
    name: str
    amount: str
    unit: str
    type: Literal["dry", "liquid"]

class Recipe(BaseModel):
    ingredients: List[Ingredient]

# Enhanced Ingredient Type Classifier (Search Result 6)
INGREDIENT_TYPE_LOOKUP = {
    # Dry ingredients
    "flour": "dry", "sugar": "dry", "baking powder": "dry", "salt": "dry",
    # Liquid ingredients
    "milk": "liquid", "water": "liquid", "oil": "liquid", "vanilla": "liquid"
}

def ingredient_type_lookup(name: str) -> str:
    lower_name = name.lower()
    return next((v for k, v in INGREDIENT_TYPE_LOOKUP.items() if k in lower_name), "dry")

# Fixed Extraction Pipeline (Search Result 4,7)
def parse_entities(text: str) -> Recipe:
    # Use pipeline for feature extraction
    extractor = pipeline('feature-extraction', model=FOODBERT_CHECKPOINT, tokenizer=tokenizer)
    
    # Extract features
    features = extractor(text, return_tensors='pt')
    
    # Process features to extract ingredients (this part needs custom logic)
    # For now, let's use a simple splitting approach as a placeholder
    ingredients = [ingredient.strip() for ingredient in text.split(',')]
    
    return Recipe(ingredients=[
        Ingredient(
            name=ing,
            amount="",  # We need more sophisticated parsing for amount and unit
            unit="",
            type=ingredient_type_lookup(ing)
        ) for ing in ingredients
    ])

# Enhanced Conversion Engine (Search Result 2,5)
def convert_measurements(recipe: Recipe) -> dict:
    CONVERSION_PROMPT = """You are a precision baking measurement converter. 
    For DRY ingredients (flour, sugar), return weights in grams/ounces.
    For LIQUID ingredients (milk, oil), return volumes in milliliters/fluid ounces.
    Return ONLY JSON format: {"ingredient": {"original": "...", "type": "...", "metric": "...", "imperial": "..."}}"""
    
    ingredients_str = "\n".join(
        f"- {i.amount} {i.unit} {i.name} ({i.type})" 
        for i in recipe.ingredients
    )
    
    response = gemini_client.generate_content(
        f"{CONVERSION_PROMPT}\nConvert these ingredients:\n{ingredients_str}"
    )
    
    try:
        return json.loads(response.text)
    except json.JSONDecodeError:
        print("Failed to parse response, using Gemini's token count API")  # Search Result 5
        usage = gemini_client.count_tokens(CONVERSION_PROMPT + ingredients_str)
        print(f"Token usage: {usage.total_tokens}")
        return {}

# Execution Flow with Error Handling
if __name__ == "__main__":
    recipe_text = "2 cups all-purpose flour, 1.5 cups sugar, 3/4 cup milk, 1 tsp vanilla extract"
    
    try:
        extracted = parse_entities(recipe_text)
        print("Extracted Ingredients:")
        print(extracted.model_dump_json(indent=2))
        
        conversions = convert_measurements(extracted)
        print("\nConverted Measurements:")
        print(json.dumps(conversions, indent=2))
        
        with open("conversions.json", "w") as f:
            json.dump(conversions, f, indent=2)
        print("\nSaved to conversions.json")
    
    except Exception as e:
        print(f"Error: {str(e)}")
        print("Falling back to Gemini extraction")  # Search Result 1
        backup_response = gemini_client.generate_content(f"Extract ingredients from: {recipe_text}")
        print(backup_response.text)


tokenizer_config.json:   0%|          | 0.00/1.19k [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/664 [00:00<?, ?B/s]

You are using a model of type bert to instantiate a model of type distilbert. This is not supported for all configurations of models and can yield errors.


model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at alexdseo/RecipeBERT and are newly initialized: ['classifier.bias', 'classifier.weight', 'embeddings.LayerNorm.bias', 'embeddings.LayerNorm.weight', 'embeddings.position_embeddings.weight', 'embeddings.word_embeddings.weight', 'transformer.layer.0.attention.k_lin.bias', 'transformer.layer.0.attention.k_lin.weight', 'transformer.layer.0.attention.out_lin.bias', 'transformer.layer.0.attention.out_lin.weight', 'transformer.layer.0.attention.q_lin.bias', 'transformer.layer.0.attention.q_lin.weight', 'transformer.layer.0.attention.v_lin.bias', 'transformer.layer.0.attention.v_lin.weight', 'transformer.layer.0.ffn.lin1.bias', 'transformer.layer.0.ffn.lin1.weight', 'transformer.layer.0.ffn.lin2.bias', 'transformer.layer.0.ffn.lin2.weight', 'transformer.layer.0.output_layer_norm.bias', 'transformer.layer.0.output_layer_norm.weight', 'transformer.layer.0.sa_layer_norm.bias', 'transformer.layer.0.sa




Some weights of BertModel were not initialized from the model checkpoint at alexdseo/RecipeBERT and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Extracted Ingredients:
{
  "ingredients": [
    {
      "name": "2 cups all-purpose flour",
      "amount": "",
      "unit": "",
      "type": "dry"
    },
    {
      "name": "1.5 cups sugar",
      "amount": "",
      "unit": "",
      "type": "dry"
    },
    {
      "name": "3/4 cup milk",
      "amount": "",
      "unit": "",
      "type": "liquid"
    },
    {
      "name": "1 tsp vanilla extract",
      "amount": "",
      "unit": "",
      "type": "liquid"
    }
  ]
}
Failed to parse response, using Gemini's token count API
Token usage: 117

Converted Measurements:
{}

Saved to conversions.json
