<h1><center>The rising use case of LLMs: Structuring unstructured data</center></h1>

## Demo Start

In [1]:
import asyncio
import json
import os
import time
from pathlib import Path

import pandas as pd
from dotenv import load_dotenv
from langchain.output_parsers import PydanticOutputParser
from langchain_mistralai.chat_models import ChatMistralAI

from core import run
from prompt import DEFAULT_BASE_PROMPT, create_prompt
from schemas import Recipe

In [2]:
# Pull variables from a local .env file (if present) into the environment,
# then read the Mistral API key from there so it is never hard-coded.
load_dotenv()
MISTRAL_API_KEY = os.getenv("MISTRAL_API_KEY")

# Fail fast with an actionable message instead of handing `None` to the client
# and only discovering the problem on the first API call.
if not MISTRAL_API_KEY:
    raise RuntimeError(
        "MISTRAL_API_KEY is not set; export it or add it to a .env file before running this notebook."
    )

In [3]:
# Input recipes are read from ./data/input relative to the current working directory.
path_to_data = Path.cwd().joinpath("data", "input")
df = pd.read_json(path_to_data.joinpath("recipes_v1.json"))

# Quick look at the first rows to check the load.
df.head()

Unnamed: 0,title,date,tags,introduction,ingredients,direction,output
0,Creamy Mashed Potatoes,2021-03-12,"[potato, side, cheesefare]",![Creamy Mashed Potatoes](/pix/creamy-mashed-p...,The quantities here are for about four adult p...,1. Peel and cut the potatoes into medium sized...,"{'title': 'Creamy Mashed Potatoes', 'ingredien..."
1,Red Sauce (Ragu all'Italiana),2021-03-16,"[italian, sauce]",My great-grandma's red sauce. All purpose: goo...,- 1/3 lb salt pork - 2 lb chuck roast - 3 Clov...,"1. Mince the garlic, peel the carrot, peel and...","{'title': 'Red Sauce (Ragu all'Italiana)', 'in..."
2,Turmeric Flatbread,2022-08-09,"[bread, turkish]",A great companion to Turkish Red Lentil Soup. ...,- 1 cup Wheat Flour (white or whole) - 1/2 Tbs...,"1. Combine flour, turmeric, salt and baking po...","{'title': 'Turmeric Flatbread', 'ingredients':..."
3,Zurich-Style Meat Saute,2023-01-06,"[beef, cream, swiss, quick]","Originally called ""Züri Gschnätzlets"" (Zurich ...","- 600g\tBeef, finely sliced - 250g\tMushrooms,...",1. Fry the meat in butter at high heat until i...,"{'title': 'Zurich-Style Meat Saute', 'ingredie..."
4,Tuscan Style Pork Roast,2021-03-10,"[italian, pork, roast]",![tuscan-style-pork-roast](/pix/tuscan-style-p...,- 1 pork Roast - 2-3 Tbsp fresh rosemary - 8 c...,1.\tPreheat oven to 275°F (135°C) 2.\tButterfl...,"{'title': 'Tuscan Style Pork Roast', 'ingredie..."


In [4]:
# Model settings. Alternative model name kept for reference: "open-mixtral-8x22b".
model_name = "open-mixtral-8x7b"
temperature = 0

# Parser built from the Recipe schema; it is passed to both create_prompt and run.
parser = PydanticOutputParser(pydantic_object=Recipe)

llm = ChatMistralAI(
    api_key=MISTRAL_API_KEY,
    model_name=model_name,
    temperature=temperature,
)

# Demo prompt built from the ingredients and directions of the row labelled 0.
prompt = create_prompt(
    DEFAULT_BASE_PROMPT,
    parser,
    df["ingredients"][0],
    df["direction"][0],
)
prompt

[HumanMessage(content='\nWhat are the ingredients and their associated quantities as well as the steps to make the recipe described by the following The quantities here are for about four adult portions. If you are planning on eating this as a side dish, it might be more like 6-8 portions. * 1kg potatoes * 200ml milk* * 200ml mayonnaise* * ~100g cheese * Garlic powder * 12-16 strips of bacon * Butter * 3-4 green onions * Black pepper * Salt  *You can play with the proportions depending on how creamy or dry you want the mashed potatoes to be. and 1. Peel and cut the potatoes into medium sized pieces. 2. Put the potatoes in a pot with some water so that it covers the potatoes and   boil them for about 20-30 minutes, or until the potatoes are soft. 3. About ten minutes before removing the potatoes from the boiling water, cut   the bacon into little pieces and fry it. 4. Warm up the milk and mayonnaise. 5. Shred the cheese. 6. When the potatoes are done, remove all water from the pot, add 

In [5]:
# print() shows the first message's text as-is, rather than the escaped repr
# that displaying the `prompt` list produces.
print(prompt[0].content)


What are the ingredients and their associated quantities as well as the steps to make the recipe described by the following The quantities here are for about four adult portions. If you are planning on eating this as a side dish, it might be more like 6-8 portions. * 1kg potatoes * 200ml milk* * 200ml mayonnaise* * ~100g cheese * Garlic powder * 12-16 strips of bacon * Butter * 3-4 green onions * Black pepper * Salt  *You can play with the proportions depending on how creamy or dry you want the mashed potatoes to be. and 1. Peel and cut the potatoes into medium sized pieces. 2. Put the potatoes in a pot with some water so that it covers the potatoes and   boil them for about 20-30 minutes, or until the potatoes are soft. 3. About ten minutes before removing the potatoes from the boiling water, cut   the bacon into little pieces and fry it. 4. Warm up the milk and mayonnaise. 5. Shred the cheese. 6. When the potatoes are done, remove all water from the pot, add the warm milk   and mayo

In [6]:
# Accumulators filled by the sampling loop further down:
#   raw_outputs - whatever `run` returned for each recipe, unmodified
#   outputs     - one dict per recipe (identifiers plus parsed fields or the raw output)
raw_outputs, outputs = [], []

In [None]:
# Demo: send the single prompt built earlier through `run` and display the result.
# `run` is awaited, so this cell relies on the notebook's top-level `await` support.
example = await run(llm, prompt, parser)
example

## End of Demo

## Trial and run on the entire dataset

In [7]:
# Fixed size and seed so the trial run evaluates the same recipes on every execution.
SAMPLE_SIZE = 10
SAMPLE_SEED = 42

# Cap at the dataset size: sampling without replacement raises if n exceeds the row count.
df_sample = df.sample(n=min(SAMPLE_SIZE, len(df)), random_state=SAMPLE_SEED)

In [8]:
df_sample= df_sample.reset_index(drop=True)
for i in range(df_sample.shape[0]):
  
    prompt = create_prompt(DEFAULT_BASE_PROMPT, parser, df_sample["ingredients"][i], df_sample["direction"][i])
    recipe = await run(llm, prompt, parser)
    raw_outputs.append(recipe)
    
    output_map = {
        "recipe_id":i,
        "original_title":df_sample["title"][i],
    }

    if isinstance(recipe, Recipe):
        output =  output_map | recipe.dict() 
        outputs.append(output)
    else:
        output_map["raw_llm_output"] = recipe
        outputs.append(output_map)

    time.sleep(1)
   
        

In [9]:
# Preview only the first couple of results; the full nested list is long and is
# written to a JSON file in a later cell.
outputs[:2]

[{'recipe_id': 0,
  'original_title': 'Cuca Italiana',
  'name': 'Brazilian Cheese Bread with Crumbly Top',
  'serving_size': 12,
  'ingredients': [{'id': 1,
    'name': 'all-purpose flour',
    'quantity': 3.0,
    'unit': 'cups'},
   {'id': 2, 'name': 'sugar', 'quantity': 1.5, 'unit': 'cups'},
   {'id': 3, 'name': 'warm water', 'quantity': 0.75, 'unit': 'cups'},
   {'id': 4, 'name': 'eggs', 'quantity': 2.0, 'unit': None},
   {'id': 5, 'name': 'milk', 'quantity': 6.0, 'unit': 'tablespoons'},
   {'id': 6, 'name': 'lard or butter', 'quantity': 2.0, 'unit': 'tablespoons'},
   {'id': 7,
    'name': 'instant dry yeast',
    'quantity': 1.0,
    'unit': 'tablespoon'},
   {'id': 8, 'name': 'nutmeg', 'quantity': 0.25, 'unit': None},
   {'id': 9, 'name': 'vanilla essence', 'quantity': 1.0, 'unit': 'teaspoon'},
   {'id': 10, 'name': 'fennel seeds', 'quantity': 0.25, 'unit': 'teaspoon'},
   {'id': 11, 'name': 'cinnamon', 'quantity': 0.25, 'unit': 'teaspoon'},
   {'id': 12, 'name': 'cloves', 'qua

In [10]:
# Persist the trial results. ensure_ascii=False writes non-ASCII characters verbatim,
# so the file encoding is pinned to UTF-8 instead of depending on the platform default.
with open("sample_8x7b_with_temperature_0_less_constraints.json", "w", encoding="utf-8") as f:
    json.dump(outputs, f, indent=4, ensure_ascii=False)

In [None]:
# Flatten the result dicts into a table; nested dict keys are joined with "." in column names.
df_parsed = pd.json_normalize(data=outputs, sep=".")

# List the resulting columns.
df_parsed.columns

In [None]:
# Display the flattened results table.
df_parsed