# General Setup

In [55]:
!export HF_HOME="/scratch/ssd004/scratch/lfy"

In [56]:
!pip install llama-cpp-python --quiet
!pip install python-terrier --quiet

In [57]:
from llama_cpp import Llama, LlamaGrammar
import json

In [58]:
import pyterrier as pt
import pandas as pd
import csv
import re

In [59]:
def seed_everything(seed: int):
    import random, os
    import numpy as np
    import torch

    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = True

seed_everything(42)

# Obtain recipe ideas

In [60]:
llm = Llama.from_pretrained(
	repo_id="QuantFactory/Meta-Llama-3.1-8B-Instruct-GGUF",
	filename="Meta-Llama-3.1-8B-Instruct.Q2_K.gguf",
  n_ctx=5000,
)

llama_model_loader: loaded meta data with 27 key-value pairs and 291 tensors from /root/.cache/huggingface/hub/models--QuantFactory--Meta-Llama-3.1-8B-Instruct-GGUF/snapshots/b6d5cca03f341fd97b7657420bd60e070835b7e5/./Meta-Llama-3.1-8B-Instruct.Q2_K.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.type str              = model
llama_model_loader: - kv   2:                               general.name str              = Models
llama_model_loader: - kv   3:                         general.size_label str              = 8.0B
llama_model_loader: - kv   4:                            general.license str              = llama3.1
llama_model_loader: - kv   5:                               general.tags arr[str,6]       = ["facebook", "meta", "pytorch", "ll

In [61]:
'''llm = Llama.from_pretrained(
    repo_id="QuantFactory/Meta-Llama-3.1-8B-Instruct-GGUF",
    filename="*Q4_K_M.gguf",
    n_gpu_layers=-1,
    n_ctx=128000,
    verbose=True,
    cache_dir="/checkpoint/lfy/13870962"
)'''

'llm = Llama.from_pretrained(\n    repo_id="QuantFactory/Meta-Llama-3.1-8B-Instruct-GGUF",\n    filename="*Q4_K_M.gguf",\n    n_gpu_layers=-1,\n    n_ctx=128000,\n    verbose=True,\n    cache_dir="/checkpoint/lfy/13870962"\n)'

In [62]:
ask_food_prompt_template = "The user is living in {user_location}. The user is looking for a {user_query}. Suggest 5 {user_query} which can be cooked by the user, without an actual recipe."

In [63]:
user_location = "California"
user_query = "vegan korean food"
ask_food_prompt = ask_food_prompt_template.format(user_location=user_location, user_query=user_query)

In [64]:
schema_food_name = r'''
root ::= (
    "{" newline
        doublespace "\"food_names\":" space listofstring newline
    "}"
)
newline ::= "\n"
doublespace ::= "  "
number ::= [0-9]+   "."?   [0-9]*
boolean ::= "true" | "false"
char ::= [^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4})
space ::= | " " | "\n" [ \t]{0,20}
string ::= "\"" char* "\"" space
listofstring ::= ("[" space (string ("," space string){4})? "]")
'''

# Creating a LlamaGrammar object with schema string
# Set verbose=False to not print the grammar, set to True for debugging
grammar_food_name = LlamaGrammar.from_string(grammar=schema_food_name, verbose=False)

In [65]:
result = llm.create_chat_completion(
    messages=[
        {
            "role": "system",
            "content": "You are a helpful assistant that outputs in JSON.",
        },
        {"role": "user", "content": ask_food_prompt},
    ],
    grammar=grammar_food_name,
    temperature=0.7,
)

llama_perf_context_print:        load time =   38395.91 ms
llama_perf_context_print: prompt eval time =       0.00 ms /    64 tokens (    0.00 ms per token,      inf tokens per second)
llama_perf_context_print:        eval time =       0.00 ms /    51 runs   (    0.00 ms per token,      inf tokens per second)
llama_perf_context_print:       total time =   89587.15 ms /   115 tokens


In [66]:
result_json = json.loads(result['choices'][0]['message']['content'])
result_json

{'food_names': ['Vegan Bibimbap',
  'Vegan Japchae',
  'Vegan Ramyeon',
  'Vegan Mandu',
  'Vegan Kimbop']}

# IR for recipes

In [67]:
recipes_full = pd.read_csv('recipes.csv', encoding='utf-8', on_bad_lines='skip')
recipes_full = recipes_full.fillna(value={'RecipeServings': 1.0})

In [68]:
recipes_full.head()

Unnamed: 0,RecipeId,Name,AuthorId,AuthorName,CookTime,PrepTime,TotalTime,DatePublished,Description,Images,...,SaturatedFatContent,CholesterolContent,SodiumContent,CarbohydrateContent,FiberContent,SugarContent,ProteinContent,RecipeServings,RecipeYield,RecipeInstructions
0,38,Low-Fat Berry Blue Frozen Dessert,1533,Dancer,PT24H,PT45M,PT24H45M,1999-08-09T21:46:00Z,Make and share this Low-Fat Berry Blue Frozen ...,"c(""https://img.sndimg.com/food/image/upload/w_...",...,1.3,8.0,29.8,37.1,3.6,30.2,3.2,4.0,,"c(""Toss 2 cups berries with sugar."", ""Let stan..."
1,39,Biryani,1567,elly9812,PT25M,PT4H,PT4H25M,1999-08-29T13:12:00Z,Make and share this Biryani recipe from Food.com.,"c(""https://img.sndimg.com/food/image/upload/w_...",...,16.6,372.8,368.4,84.4,9.0,20.4,63.4,6.0,,"c(""Soak saffron in warm milk for 5 minutes and..."
2,40,Best Lemonade,1566,Stephen Little,PT5M,PT30M,PT35M,1999-09-05T19:52:00Z,This is from one of my first Good House Keepi...,"c(""https://img.sndimg.com/food/image/upload/w_...",...,0.0,0.0,1.8,81.5,0.4,77.2,0.3,4.0,,"c(""Into a 1 quart Jar with tight fitting lid, ..."
3,41,Carina's Tofu-Vegetable Kebabs,1586,Cyclopz,PT20M,PT24H,PT24H20M,1999-09-03T14:54:00Z,This dish is best prepared a day in advance to...,"c(""https://img.sndimg.com/food/image/upload/w_...",...,3.8,0.0,1558.6,64.2,17.3,32.1,29.3,2.0,4 kebabs,"c(""Drain the tofu, carefully squeezing out exc..."
4,42,Cabbage Soup,1538,Duckie067,PT30M,PT20M,PT50M,1999-09-19T06:19:00Z,Make and share this Cabbage Soup recipe from F...,"""https://img.sndimg.com/food/image/upload/w_55...",...,0.1,0.0,959.3,25.1,4.8,17.7,4.3,4.0,,"c(""Mix everything together and bring to a boil..."


In [69]:
recipe_cols = ["Name", "Description", "RecipeIngredientQuantities", "RecipeIngredientParts", "RecipeInstructions"]
recipes_full["full_document"] = recipes_full[recipe_cols].apply(lambda row: ' '.join(row.values.astype(str)), axis=1)
recipes_full = recipes_full.astype({'RecipeId': 'string'})

In [70]:
docno = len(recipes_full)

In [71]:
docs_df = recipes_full[['RecipeId', 'full_document']].copy()
docs_df = docs_df.rename(columns={"RecipeId": "docno", "full_document": "text"})
docs_df.head()

Unnamed: 0,docno,text
0,38,Low-Fat Berry Blue Frozen Dessert Make and sha...
1,39,Biryani Make and share this Biryani recipe fro...
2,40,Best Lemonade This is from one of my first Go...
3,41,Carina's Tofu-Vegetable Kebabs This dish is be...
4,42,Cabbage Soup Make and share this Cabbage Soup ...


In [72]:
index_dir = './recipes_index'
indexer = pt.DFIndexer(index_dir, overwrite=True, )
index_ref = indexer.index(docs_df["text"], docs_df["docno"])
index_ref.toString()
index = pt.IndexFactory.of(index_ref)
br = pt.BatchRetrieve(index, wmodel="Tf")

  indexer = pt.DFIndexer(index_dir, overwrite=True, )
  br = pt.BatchRetrieve(index, wmodel="Tf")


In [73]:
query_set = result_json['food_names']

desc = ['vegan', 'Vegan', 'Vegetarian', 'vegetarian']

for d in desc:
  query_set = [s.strip(d) for s in query_set]
query_results = br.transform(query_set)

  return fn(*args, **kwargs)


In [74]:
query_results.head()

Unnamed: 0,qid,docid,docno,rank,score,query
0,1,495983,514183,0,4.0,Bibimbap
1,1,513744,532287,1,4.0,Bibimbap
2,1,520988,539710,2,4.0,Bibimbap
3,1,508482,526967,3,3.0,Bibimbap
4,1,509646,528152,4,3.0,Bibimbap


In [75]:
result_cols = ["RecipeId", "Name", "Description", "RecipeIngredientQuantities", "RecipeIngredientParts",
               "RecipeInstructions", "Calories", "FatContent", "SaturatedFatContent", "CholesterolContent",
               "SodiumContent", "CarbohydrateContent", "FiberContent", "SugarContent", "ProteinContent", "RecipeServings"]

In [76]:
results = pd.merge(query_results[["docno", "rank", "query"]],
                         recipes_full[result_cols],
                  right_on='RecipeId', left_on='docno', how='left')

In [77]:
results.head()

Unnamed: 0,docno,rank,query,RecipeId,Name,Description,RecipeIngredientQuantities,RecipeIngredientParts,RecipeInstructions,Calories,FatContent,SaturatedFatContent,CholesterolContent,SodiumContent,CarbohydrateContent,FiberContent,SugarContent,ProteinContent,RecipeServings
0,514183,0,Bibimbap,514183,Bibimbap (Korean Vegetarian Noodles),This recipe is from a chef who operated the re...,"c(""3/4"", ""6"", ""3"", ""2"", ""2"", ""2"", ""2"", ""1"", ""1...","c(""miso"", ""corn syrup"", ""garlic cloves"", ""fres...","c(""In a bowl, whisk gochujang, lemon-lime soda...",1295.1,48.1,6.5,186.0,174.6,187.2,11.3,10.4,31.4,4.0
1,532287,1,Bibimbap,532287,Roasted Vegetable Bibimbap,Cook the rice in a skillet to get the characte...,"c(""4"", ""1"", ""1 1/2"", ""1 1/2"", ""1"", NA, ""2"", ""1...","c(""cucumbers"", ""sugar"", ""salt"", ""rice vinegar""...","c(""First, make the pickles:"", ""Slice the cucum...",578.8,6.1,1.9,186.0,1468.6,116.7,8.7,19.3,19.1,4.0
2,539710,2,Bibimbap,539710,Korean Rice Bowl (Dolsot Bibimbap),Make and share this Korean Rice Bowl (Dolsot B...,"c(""1"", ""2"", ""1 1/2"", ""1"", ""4"", ""1/4"", ""3"", ""2""...","c(""cider vinegar"", ""sugar"", ""salt"", ""cucumber""...","c(""For the pickles: Whisk vinegar, sugar and ...",861.6,28.9,4.9,186.0,2241.3,127.9,9.1,17.7,21.2,4.0
3,526967,3,Bibimbap,526967,Pork Bibimbap,"Bibimbap isn't easy to make, but it's certainl...","c(""1"", ""8"", ""6"", ""8"", ""2"", ""4"", ""3"", ""3"", ""3"",...","c(""Brussels sprouts"", ""bean sprouts"", ""shiitak...","c(""Create gochujang paste: Mix gochujang with ...",510.9,37.8,6.7,235.2,142.4,19.1,4.1,11.0,25.9,6.0
4,528152,4,Bibimbap,528152,Bibimbap Breakfast Bowl,Make and share this Bibimbap Breakfast Bowl re...,"c(""1"", ""1"", ""1"", ""2"", ""1/2"", ""1"", ""1"")","c(""olive oil"", ""egg"", ""spinach"", ""water"", ""bea...","c(""Add the olive oil to a skillet and warm it ...",230.4,19.9,3.7,186.0,99.3,5.2,1.9,2.5,9.2,1.0


In [78]:
results[results["query"] == query_set[0]].head()[['query', 'Name', 'RecipeInstructions']]

Unnamed: 0,query,Name,RecipeInstructions
0,Bibimbap,Bibimbap (Korean Vegetarian Noodles),"c(""In a bowl, whisk gochujang, lemon-lime soda..."
1,Bibimbap,Roasted Vegetable Bibimbap,"c(""First, make the pickles:"", ""Slice the cucum..."
2,Bibimbap,Korean Rice Bowl (Dolsot Bibimbap),"c(""For the pickles: Whisk vinegar, sugar and ..."
3,Bibimbap,Pork Bibimbap,"c(""Create gochujang paste: Mix gochujang with ..."
4,Bibimbap,Bibimbap Breakfast Bowl,"c(""Add the olive oil to a skillet and warm it ..."


# Generate new recipes

In [79]:
query_res = []

for query in query_set:
  res = results[results["query"] == query].head()[['query', 'Name', 'RecipeInstructions']]
  #print(res.to_dict('records'))

  #query_res['index'] = res['index']
  query_res.extend(res.to_dict('records'))

print(query_res)

[{'query': ' Bibimbap', 'Name': 'Bibimbap (Korean Vegetarian Noodles)', 'RecipeInstructions': 'c("In a bowl, whisk gochujang, lemon-lime soda, miso, corn syrup, sesame oil, garlic, ginger, vinegar and sesame seeds until smooth. Set aside.", "Bring a 4-qt saucepan of water to a boil and add sprouts. Cook until crisp tender (30 seconds). Transfer to a bowl of ice water, drain and dry with paper towels. Set aside.", "Repeat procedure with spinach (squeeze out as much liquid as possible when draining). When finished, pour boiling water into a bowl and add mushrooms. Let soften for 30 minutes. Drain, remove stems, and slice 1/4 inch thick. Set aside.", \n"Heat 1 tbsp canola oil and 1/2 tsp sesame oil in a 10 inch nonstick skillet over medium heat. Add 1 tsp garlic, 1/2 tsp ginger and mushrooms. Season with salt and pepper. Cook until hot (2 minutes). Transfer to a bowl. Set aside.", "Repeat procedure, using same amounts of canola oil, sesame oil, garlic, and ginger with the gosari, squash, 

In [80]:
extracted_recipe = {}

for recipe in query_res:
    recipe_name = recipe['Name']
    recipe_instructions = recipe["RecipeInstructions"]
    matches = re.findall(r'"(.*?)"', recipe_instructions)
    recipe_instructions = tuple(matches)
    steps_dict = {f"Step {i+1}": step for i, step in enumerate(recipe_instructions)}
    steps_json = json.dumps(steps_dict, indent=2)
    extracted_recipe[recipe_name] = {} # Overwrite recipe with same name, no duplicate
    extracted_recipe[recipe_name]['instructions'] = steps_json

In [81]:
print(extracted_recipe.keys())

dict_keys(['Bibimbap (Korean Vegetarian Noodles)', 'Roasted Vegetable Bibimbap', 'Korean Rice Bowl (Dolsot Bibimbap)', 'Pork Bibimbap', 'Bibimbap Breakfast Bowl', 'Mandu-Pi/Dumpling Wrappers', 'Yaki-Mandu (Korean Egg Roll)', 'Mandu (Korean Pot Stickers)', 'FUSF Mandu', 'My Korean Dipping Sauce'])


In [82]:
ask_recipe_template = "The user is living in {user_location}. The user is looking for a {user_query}. Here are few potential recipes: {retrieved_recipes}. Based on such recipe, generate a new recipe that satisfy user nutrition requirements. You can reuse the existing recipe, or you can modify it or create a new recipe."

In [83]:
ask_recipe_prompt = ask_recipe_template.format(user_location=user_location, user_query=user_query, retrieved_recipes=extracted_recipe)

In [84]:
schema_recipe = r'''
root ::= (
    "{" newline
        doublespace "\"Description\":" space string "," newline
        doublespace "\"Cooking instructions\":" cookinstructs "," space
        doublespace "\"Ingredients\":" space listofstring newline
    "}"
)
newline ::= "\n"
doublespace ::= "  "
number ::= [0-9]+   "."?   [0-9]*
integer ::= [0-9]*
boolean ::= "true" | "false"
char ::= [^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4})
space ::= | " " | "\n" [ \t]{0,20}
string ::= "\"" char* "\"" space
sentence ::= char* space
listofstring ::= ("[" space (string ("," space string)*)? "]")
cookstep ::= ("\"Step" space integer "\"" ":" space string)
cookinstructs ::= (space "{" space (cookstep ("," space cookstep){10})? "}")
'''

grammar_recipe = LlamaGrammar.from_string(grammar=schema_recipe, verbose=False)

In [85]:
result = llm.create_chat_completion(
    messages=[
        {
            "role": "system",
            "content": "You are a helpful assistant that outputs in JSON. You must not include nutrition information or any notes. Stop generating instructions when the food is ready to serve. Do not put notes or nutrition in cooking instructions.",
        },
        {"role": "user", "content": ask_recipe_prompt},
    ],
    grammar=grammar_recipe,
    temperature=0.7,
    stream=True
)

for chunk in result:
    delta = chunk['choices'][0]['delta']
    if 'role' in delta:
        print(delta['role'], end=': ')
    elif 'content' in delta:
        print(delta['content'], end='')


Llama.generate: 15 prefix-match hit, remaining 4129 prompt tokens to eval


assistant: {
  "Description": "Bibimbap is considered Korean therapy food, making sense given that the notion of food as medicine is a fundamental one in Korean cooking and the stirring helps to relieve stress.",
  "Cooking instructions": {
    "Step 1": "For the pickles: Whisk vinegar, sugar and salt together in medium bowl. Add cucumber and bean sprouts and toss to combine. Gently press on vegetables to submerge. Cover and refrigerate for at least 30 minutes or up to 24 hours.",
    "Step 2": "For the chile sauce: Whisk gochujang, water, oil, and sugar together in small bowl. Cover and set aside.",
    "Step 3": "For the rice: Bring rice, water, and salt to boil in medium saucepan over high heat. Cover, reduce heat to low, and cook for 7 minutes. Remove rice from heat and let sit, covered, until tender, about 15 minutes.",
    "Step 4": "For the vegetables: While the rice cooks, stir together water, scallions, soy sauce, garlic and sugar. Heat 1 teaspoon oil in Dutch oven over high h

llama_perf_context_print:        load time =   38395.91 ms
llama_perf_context_print: prompt eval time =       0.00 ms /  4129 tokens (    0.00 ms per token,      inf tokens per second)
llama_perf_context_print:        eval time =       0.00 ms /   558 runs   (    0.00 ms per token,      inf tokens per second)
llama_perf_context_print:       total time = 2953680.91 ms /  4687 tokens
