# Update the extraction results

In [1]:
import os
import datetime
import json
from pathlib import Path

import numpy as np
import pandas as pd
from pydantic import BaseModel, ValidationError
from amplifai import Amplifier
from langchain_openai import ChatOpenAI
from langchain_mistralai import ChatMistralAI
from dotenv import load_dotenv
from baker.schemas.ingredient import Ingredient
from baker.schemas.recipe import Recipe, ParsedRecipe
from baker.schemas.step import Step

## Initialization

### Config

In [2]:
# Load environment variables with python-dotenv so the os.getenv() lookups
# further down can find the API keys (presumably kept in a local .env file —
# confirm). The cell output is load_dotenv()'s return value.
load_dotenv()

True

In [3]:
# Directory layout used by this notebook, resolved from the current working
# directory: <parent of cwd>/data/{input,output}/...
cwd = os.getcwd()
path_to_data = Path(cwd).parent.joinpath("data")
path_to_input = path_to_data.joinpath("input")
path_to_output = path_to_data.joinpath("output")

# Everything this notebook writes goes below this folder
path_to_2_update_the_extraction_results = path_to_output.joinpath(
    "2_update_the_extraction_results"
)

# Source file holding the v1 recipes that get re-parsed here
path_to_recipes_v1 = path_to_input.joinpath("recipes_v1.json")

In [4]:
# Provider credentials are read from the environment; each lookup yields None
# when the variable is not set.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
MISTRALAI_API_KEY = os.environ.get("MISTRAL_API_KEY")

### Data Load

In [5]:
# Read the v1 recipes JSON file into a DataFrame and preview its first rows
df_recipes_v1 = pd.read_json(path_to_recipes_v1)
df_recipes_v1.head()

Unnamed: 0,title,date,tags,introduction,ingredients,direction,output
0,Creamy Mashed Potatoes,2021-03-12,"[potato, side, cheesefare]",![Creamy Mashed Potatoes](/pix/creamy-mashed-p...,The quantities here are for about four adult p...,1. Peel and cut the potatoes into medium sized...,"{'title': 'Creamy Mashed Potatoes', 'ingredien..."
1,Red Sauce (Ragu all'Italiana),2021-03-16,"[italian, sauce]",My great-grandma's red sauce. All purpose: goo...,- 1/3 lb salt pork - 2 lb chuck roast - 3 Clov...,"1. Mince the garlic, peel the carrot, peel and...","{'title': 'Red Sauce (Ragu all'Italiana)', 'in..."
2,Turmeric Flatbread,2022-08-09,"[bread, turkish]",A great companion to Turkish Red Lentil Soup. ...,- 1 cup Wheat Flour (white or whole) - 1/2 Tbs...,"1. Combine flour, turmeric, salt and baking po...","{'title': 'Turmeric Flatbread', 'ingredients':..."
3,Zurich-Style Meat Saute,2023-01-06,"[beef, cream, swiss, quick]","Originally called ""Züri Gschnätzlets"" (Zurich ...","- 600g\tBeef, finely sliced - 250g\tMushrooms,...",1. Fry the meat in butter at high heat until i...,"{'title': 'Zurich-Style Meat Saute', 'ingredie..."
4,Tuscan Style Pork Roast,2021-03-10,"[italian, pork, roast]",![tuscan-style-pork-roast](/pix/tuscan-style-p...,- 1 pork Roast - 2-3 Tbsp fresh rosemary - 8 c...,1.\tPreheat oven to 275°F (135°C) 2.\tButterfl...,"{'title': 'Tuscan Style Pork Roast', 'ingredie..."


## More preparation work

### Preparing the data

In [6]:
# Every recipe as a list of row dicts (one dict per DataFrame row); this is
# the input of the main run.
recipes_v1 = df_recipes_v1.to_dict(orient="records")

# Small sample used for the trial run. The seed is fixed so that
# Restart & Run All always picks the same recipes, and the sample size is
# capped at the number of available rows so a short input file cannot raise.
RANDOM_SAMPLE_SIZE = 10
RANDOM_SAMPLE_SEED = 42
random_recipes = df_recipes_v1.sample(
    n=min(RANDOM_SAMPLE_SIZE, len(df_recipes_v1)),
    random_state=RANDOM_SAMPLE_SEED,
).to_dict(orient="records")

### Generating the extraction pipeline

In [7]:
# Chat-model clients for the two providers tried in this notebook, both
# configured with the same temperature.
# NOTE(review): a non-zero temperature (0.7) generally makes LLM output vary
# between runs; confirm this is intended for a structured-extraction task.
openai_llm = ChatOpenAI(api_key=OPENAI_API_KEY, temperature=0.7, model="gpt-4o")
mistral_llm = ChatMistralAI(
    api_key=MISTRALAI_API_KEY, temperature=0.7, model="mistral-large-latest"
)

In [8]:
# Prompt template handed to both amplifiers. The placeholder names
# (introduction, ingredients, steps) match the keyword arguments passed to
# denoise() in the cells below.
human_prompt = (
    "Given the following recipe introduction, ingredients and steps. "
    "Please structure the recipe as a NoSQL document.\n"
    "\n"
    "Introduction:\n"
    "{introduction}\n"
    "\n"
    "Ingredients:\n"
    "{ingredients}\n"
    "\n"
    "Steps:\n"
    "{steps}\n"
    "\n"
)

In [9]:
# One Amplifier per provider. Both are parameterised with the chat-model class
# and ParsedRecipe, and both share the same prompt template, so their results
# can be compared on identical input.
# NOTE(review): ParsedRecipe is presumably the structured output schema the
# amplifier returns — confirm against the amplifai documentation.
openai_amplifier = Amplifier[ChatOpenAI, ParsedRecipe](
    llm=openai_llm,
    human_prompt=human_prompt,
)

mistral_amplifier = Amplifier[ChatMistralAI, ParsedRecipe](
    llm=mistral_llm,
    human_prompt=human_prompt,
)

## Testing

### On first row

In [None]:
# Use the first recipe record as a single trial input
first_recipe = recipes_v1[0]

In [None]:
# Run the OpenAI-backed amplifier on the first recipe. The record's
# "direction" field is passed as the prompt's `steps` placeholder.
parsed_recipe = openai_amplifier.denoise(
    introduction=first_recipe["introduction"],
    ingredients=first_recipe["ingredients"],
    steps=first_recipe["direction"],
)
parsed_recipe

In [None]:
# Show the OpenAI result via model_dump() for easier inspection
parsed_recipe.model_dump()

In [None]:
# Same trial with the Mistral-backed amplifier, on the same recipe and with
# the same field mapping, so the two outputs can be compared side by side.
mistral_parsed_recipe = mistral_amplifier.denoise(
    introduction=first_recipe["introduction"],
    ingredients=first_recipe["ingredients"],
    steps=first_recipe["direction"],
)
mistral_parsed_recipe.model_dump()

### On random sample

In [None]:
# Trial run on the random sample: parse each recipe with the OpenAI amplifier
# and merge the parsed fields with the untouched source fields into a Recipe.
parsed_recipes = []
for recipe in random_recipes:
    # Source columns carried over unchanged; the two raw text columns are
    # stored under their *_source_text names.
    source_fields = {
        "title": recipe["title"],
        "date": recipe["date"],
        "tags": recipe["tags"],
        "introduction": recipe["introduction"],
        "ingredients_source_text": recipe["ingredients"],
        "directions_source_text": recipe["direction"],
    }

    parsed_recipe = openai_amplifier.denoise(
        introduction=source_fields["introduction"],
        ingredients=source_fields["ingredients_source_text"],
        steps=source_fields["directions_source_text"],
    )

    new_recipe = Recipe(**source_fields, **parsed_recipe.model_dump())
    parsed_recipes.append(new_recipe)

In [None]:
# Display the Recipe objects built from the random sample
parsed_recipes

In [None]:
# Dump each Recipe in "json" mode so the result can be passed to json.dump
parsed_recipes_json = [
    parsed.model_dump(mode="json")
    for parsed in parsed_recipes
]
parsed_recipes_json

In [None]:
# Persist the trial-run results as one JSON file.
path_to_random_recipes_parsed = (
    path_to_2_update_the_extraction_results / "10_random_recipes_parsed.json"
)

# Create the output folder if needed, so the results of the LLM calls above
# are not lost to a FileNotFoundError at write time.
path_to_random_recipes_parsed.parent.mkdir(parents=True, exist_ok=True)

# Explicit encoding so the file does not depend on the platform default
with open(path_to_random_recipes_parsed, "w", encoding="utf-8") as f:
    json.dump(parsed_recipes_json, f, indent=4)

In [None]:
# Spot-check one recipe from the sample run (index 3) via model_dump()
parsed_recipes[3].model_dump()

## Main Run

In [10]:
# Folder holding one "<index>.json" checkpoint file per recipe of the main run
path_to_main_run = path_to_2_update_the_extraction_results / "main_run"

# Create it up front: if it were missing, every write in the main run would
# fail only after its LLM call had already been made.
path_to_main_run.mkdir(parents=True, exist_ok=True)
path_to_main_run.exists()

True

In [11]:
# Main run: parse every recipe with the OpenAI amplifier and write one JSON
# checkpoint file per recipe ("<index>.json" in path_to_main_run). Recipes
# whose checkpoint file already exists are skipped, so the cell can be re-run
# after a failure without repeating the LLM calls that already succeeded.
#
# parsed_recipes collects only the recipes processed during this execution;
# failed_recipes collects the indices that raised, so they are easy to retry.

# Make sure the checkpoint folder exists before any LLM call is made
path_to_main_run.mkdir(parents=True, exist_ok=True)

parsed_recipes = []
failed_recipes = []
for i, recipe in enumerate(recipes_v1):
    path_to_recipe = path_to_main_run / f"{i}.json"
    if path_to_recipe.exists():
        print(f"Recipe {i} already processed")
        continue

    try:
        parsed_recipe = openai_amplifier.denoise(
            introduction=recipe["introduction"],
            ingredients=recipe["ingredients"],
            steps=recipe["direction"],
        )

        new_recipe = Recipe(
            title=recipe["title"],
            date=recipe["date"],
            tags=recipe["tags"],
            introduction=recipe["introduction"],
            ingredients_source_text=recipe["ingredients"],
            directions_source_text=recipe["direction"],
            **parsed_recipe.model_dump(),
        )
        new_recipe_json = new_recipe.model_dump(mode="json")

        # Serialise fully before touching the disk: a serialisation error then
        # cannot leave behind a truncated file that a later run would treat
        # as "already processed".
        serialized_recipe = json.dumps(new_recipe_json, indent=4)
        path_to_recipe.write_text(serialized_recipe, encoding="utf-8")

        # Only count the recipe as done once its checkpoint is on disk
        parsed_recipes.append(new_recipe)

    except Exception as err:
        # Best effort: keep going, but say which recipe failed and why
        failed_recipes.append(i)
        print(f"Recipe {i} ({recipe.get('title')!r}) failed: {err!r}")

if failed_recipes:
    print(f"{len(failed_recipes)} recipe(s) failed: {failed_recipes}")

Recipe 0 already processed
Recipe 1 already processed
Recipe 2 already processed
Recipe 3 already processed
Recipe 4 already processed
Recipe 5 already processed
Recipe 6 already processed
Recipe 7 already processed
Recipe 8 already processed
Recipe 9 already processed
Recipe 10 already processed
Recipe 11 already processed
Recipe 12 already processed
Recipe 13 already processed
Recipe 15 already processed
Recipe 16 already processed
