# Imports

In [1]:
import os
import json
import utils
import tqdm.notebook as tqdm

import pandas as pd
import textwrap

import torch
from transformers import (
    LlamaTokenizer,
    LlamaForCausalLM,
    GenerationConfig,
    pipeline,
    AutoTokenizer,
    AutoModelForCausalLM,
)

from langchain.document_loaders import DataFrameLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.llms import HuggingFacePipeline
from langchain.vectorstores import Chroma
from langchain import PromptTemplate
from langchain.chains import RetrievalQA, ConversationalRetrievalChain

In [2]:
import warnings

warnings.simplefilter("ignore")

# Importing and parsing the dataset

In [3]:
project_root_directory = utils.find_root_directory()
DATA_DIR = os.path.join(project_root_directory, "data")

In [4]:
recipes_df = pd.read_json(os.path.join(DATA_DIR, "all-recipes.json"))
recipes_df.head()

Unnamed: 0,url,text
0,https://en.wikibooks.org/wiki/Cookbook:Aubergi...,Cookbook:Aubergine and Onion Vegetable Pie\nde...
1,https://en.wikibooks.org/wiki/Cookbook:Palatsc...,Cookbook:Palatschinken (Czech/Austrian Crepes)...
2,https://en.wikibooks.org/wiki/Cookbook:Austria...,Cookbook:Austrian Napkin Dumpling (Serviettenk...
3,https://en.wikibooks.org/wiki/Cookbook:Asparag...,Cookbook:Asparagus with Sesame Seeds and Soy S...
4,https://en.wikibooks.org/wiki/Cookbook:Atomic_...,Cookbook:Atomic Fudge Cake\ndesc\nA delicious ...


In [5]:
loader = DataFrameLoader(recipes_df, page_content_column="text")
data = loader.load()
print(data[0])

page_content='Cookbook:Aubergine and Onion Vegetable Pie\ndesc\nThis is a vegetable pie using tomatoes, aubergines (eggplant), onions, and mushrooms. You can make vegetable pie with many kinds of vegetables.\n\nIngredients\nCrust\n250 g (1 cup) flour\n100 g (0.4 cups) medium-soft butter (leave it out for a half-hour before making the crust)\n1 egg\nLukewarm water\n1 pinch of salt\nFilling\n1-2 aubergines (eggplants)\n1-2 onions\n4 tomatoes\n150 g (0.6 cups) of mushrooms\nGrated cheese\nMilk\n1 egg yolk\nFresh cream\nProcedure\nCrust\nAdd the flour to a large bowl.\nAdd the butter, and rub it into the dough with your fingers until the pastry is nearly consistent in texture.\nAdd the egg, then some water bit by bit while working the pastry with your hands until it becomes soft and smooth. It should not be sticky, and it should look shiny and medium-yellow.\nFilling\nCut the vegetables into thin slices.\nBrown the vegetables in oil or butter or steam them, starting with the onions and aub

In [6]:
text_splitter = CharacterTextSplitter(
    separator="\n", chunk_size=4000, chunk_overlap=500
)
texts = text_splitter.split_documents(data)

In [11]:
def wrap_text_preserve_newlines(text, width=110):
    lines = text.split("\n")
    wrapped_lines = [textwrap.fill(line, width=width) for line in lines]
    wrapped_text = "\n".join(wrapped_lines)

    return wrapped_text


def process_llm_response(llm_response):
    print(wrap_text_preserve_newlines(llm_response["result"]))
    # print('\n\nSources:')
    # for source in llm_response["source_documents"]:
    # print(source.metadata['source'])


def converse(qa, query):
    text = qa({"question": query, "chat_history": []}, return_only_outputs=True)
    lines = text["answer"].split("\n")
    wrapped_lines = [textwrap.fill(line, width=110) for line in lines]
    wrapped_text = "\n".join(wrapped_lines)
    # print(wrapped_text)
    index = wrapped_text.find("Helpful Answer:")

    # Check if "Helpful answer:" is present in the input string
    if index != -1:
        # Extract everything after "Helpful answer:"
        result = wrapped_text[index + len("Helpful Answer:") :]
        print(result)  # Remove leading and trailing whitespace characters
    else:
        print(wrapped_text)


def get_qa_pipeline(
    model,
    tokenizer,
    embeddings,
    pipeline_kwargs={
        "max_new_tokens": 1024,
        "temperature": 0,
        "top_p": 0.98,
        "repetition_penalty": 1.15,
},
    search_kwargs={"k": 2},
    chain_kwargs={},
):
    pipe = pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        # max_length=3000,
        **pipeline_kwargs
    )

    local_llm = HuggingFacePipeline(pipeline=pipe)
    persist_directory = "db2"
    vectordb = Chroma.from_documents(
        documents=texts, embedding=embeddings, persist_directory=persist_directory
    )
    retriever = vectordb.as_retriever(search_kwargs=search_kwargs)
    # qa_chain = RetrievalQA.from_chain_type(
    #     llm=local_llm,
    #     chain_type="stuff",
    #     retriever=retriever,
    #     return_source_documents=False,
    #     **chain_kwargs
    # )
    qa = ConversationalRetrievalChain.from_llm(local_llm, retriever)
    return qa

# Experimenting with various models

Model sizes to be experimented with:
- 1B
- 3B
- 7B

## 1B `TinyLlama/TinyLlama-1.1B-Chat-v1.0`

In [8]:
tokenizer = AutoTokenizer.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0")
model = AutoModelForCausalLM.from_pretrained(
    "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
    load_in_8bit=True,
    device_map="auto",
    torch_dtype=torch.float16,
    low_cpu_mem_usage=True,
    temperature=0,
)

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


In [9]:
instructor_embeddings = HuggingFaceEmbeddings(
    model_name="embaas/sentence-transformers-e5-large-v2",
    model_kwargs={"device": "cuda"},
)

In [12]:
qa = get_qa_pipeline(model, tokenizer, instructor_embeddings)

In [35]:
converse(qa, "Give the cooking procedure (with ingredients and instructions) for a chicken curry dish in the cookbook.")


Cookbook: Spicy Chilli Chicken
Desc
Chile chicken, cooked Kerala style.
Ingredients
1 pound boneless chicken meat
1 tbsp masala
1 cup lemon juice
Salt to taste
3 tbsp canola oil
1 1/2 ea. red onion, chopped
3/4 tbsp red chile powder
3/4 tsp turmeric powder
1 tsp smoked paprika
1 tsp coriander
3 medium tomatoes, chopped
3/4 tbsp minced ginger
2 tbsp minced garlic
6 tbsp tomato ketchup
8 tbsp soy sauce
4 Indian green chiles, chopped
2 green bell peppers, sliced
Procedure
Marinate the chicken with lemon juice, masala, and salt for 1 hour.
In a large pot, fry the marinated chicken in canola oil until golden brown (5-7 minutes).
Remove the chicken from the pot and cut into pieces.
Into the remaining oil, add the onions, red chili powder, turmeric, paprika, and coriander powder, and saute
until wilted.
Add the tomatoes, ginger, and garlic, and cook till the tomatoes are softened and breaking apart.
Put the ketchup, soy sauce, fried chicken pieces, green chiles, and green peppers in the pot,

In [36]:
converse(qa, "Give the ingredients for an Asian spicy dish in the cookbook.")

 In the cookbook, the recipe calls for 2 tbsp coconut oil, 1 big onion, 3 cloves garlic, 2 cm
piece ginger, 2 scallions, 50 g red chile, 1 cup basmati rice, 30 g chile powder, 3 tbsp light soy sauce, 2
tbsp fish sauce, 1 tbsp chicken powder, 1 tbsp salt, and 1 tbsp pepper.


In [19]:
converse(qa, "List 10 recipes in the cookbook that use pork.")

 Here are ten recipes in the cookbook that use pork:
1. Baked Pork Roast
2. Grilled Pork Tenderloin with Mango Salsa
3. Pork Loin with Cranberry Sauce
4. Pork Chop with Mustard Glaze
5. Pork Ribs with Barbecue Sauce
6. Pork Chops with Apple and Onion Stuffing
7. Pork Chops with Caramelized Onions and Bacon
8. Pork Chops with Rice Pilaf and Herb Butter
9. Pork Chops with Spicy Peach Glaze
10. Pork Chops with Maple Ginger Glaze


In [None]:
converse(qa, "Give the ingredients and the cooking procedure for Baked Pork Roast.")

## 3B `stabilityai/stablelm-zephyr-3b`

In [25]:
tokenizer = AutoTokenizer.from_pretrained("stabilityai/stablelm-zephyr-3b")
model = AutoModelForCausalLM.from_pretrained(
    "stabilityai/stablelm-zephyr-3b",
    load_in_8bit=True,
    device_map="auto",
    torch_dtype=torch.float16,
    low_cpu_mem_usage=True,
    temperature=0,
)

tokenizer_config.json:   0%|          | 0.00/5.21k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/587 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


config.json:   0%|          | 0.00/599 [00:00<?, ?B/s]

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


model.safetensors:   0%|          | 0.00/5.59G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

In [27]:
qa_3b = get_qa_pipeline(model, tokenizer, instructor_embeddings)

In [32]:
converse(qa_3b, "Give the cooking procedure (with ingredients and instructions) for a chicken curry dish in the cookbook.")

 The cooking procedure for a chicken curry dish in the Spicy Chilli Chicken cookbook involves
the following steps using the listed ingredients:

1. Marinate boneless chicken meat with lemon juice, masala, and salt for 1 hour.
2. Fry the marinated chicken in canola oil until golden brown. Remove the chicken from the pot and cut it into
pieces.
3. In the same oil, add red onions, red chili powder, turmeric powder, smoked paprika, and coriander powder.
Sauté them until they become wilted.
4. Add chopped tomatoes, ginger, and garlic, and cook until the tomatoes soften and break apart.
5. Mix tomato ketchup, soy sauce, fried chicken pieces, Indian green chiles, and green bell peppers into the
pot. Simmer everything together until the peppers become soft and the chicken is fully cooked.
6. Serve the spicy chicken curry over steamed rice.


In [33]:
converse(qa_3b, "Give the ingredients for an Asian spicy dish in the cookbook.")

 The ingredients for an Asian spicy dish in the cookbook include 2 tablespoons of coconut oil,
1 big onion, 3 cloves of garlic, 2 centimeters of ginger, 2 scallions, 50 grams of red chilies, 1 cup of
basmati rice, 30 grams of chili powder, 3 tablespoons of light soy sauce, 2 tablespoons of fish sauce, 1
tablespoon of chicken powder, 1 tablespoon of salt, and 1 tablespoon of pepper.


In [30]:
converse(qa_3b, "List 10 recipes in the cookbook that use pork.")

 1. Baked Pork Chops
2. Slow Cooker Pulled Pork
3. Pan-Seared Pork Tenderloin
4. Grilled Pork Ribs
5. Smoked Pork Shoulders
6. Roasted Pork Loin
7. Braised Pork Shanks
8. Stuffed Pork Tenderloin
9. Creamy Garlic Pork Spaghetti
10. Honey Mustard Glazed Pork Chop


Note that these recipes do exist in the cookbook. For the 1b model they do not.

In [34]:
converse(qa_3b, "Give the ingredients and the cooking procedure for Baked Pork Chops.")

 The ingredients include 3-4 bone-in pork chops, 1 medium apple, 1 medium onion, 1 tablespoon
honey, 1 tablespoon caraway seed, garlic salt, black pepper, and Dijon mustard. The cooking procedure involves
layering the sliced onions and apples in a baking dish, drizzling honey and half the caraway seeds over them,
seasoning the pork chops with garlic salt and black pepper, spreading Dijon mustard on top of the pork chops,
sprinkling the remaining caraway seeds, covering and baking the dish for one hour at 350 degrees Fahrenheit
(180 degrees Celsius).


## 7B `TheBloke/wizardLM-7B-HF`

In [40]:
tokenizer = LlamaTokenizer.from_pretrained("TheBloke/wizardLM-7B-HF")
model = LlamaForCausalLM.from_pretrained(
    "TheBloke/wizardLM-7B-HF",
    load_in_8bit=True,
    device_map="auto",
    torch_dtype=torch.float16,
    low_cpu_mem_usage=True,
    temperature=0,
)

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [42]:
qa_7b = get_qa_pipeline(model, tokenizer, instructor_embeddings)

In [48]:
converse(qa_7b, "Give the cooking procedure (with ingredients and instructions) for any chicken  dish.")

Sure! Here is the recipe for Chicken Wings Asian Style from the cookbook you provided:
Ingredients:
- 24 chicken wingettes
- Poultry shake, as needed
- Cornmeal, as needed
- 2 eggs, beaten
- 6 tbsp hoisin sauce
- 1 tbsp dark soy sauce
- 2 tbsp hot sauce
- 1/2 tbsp minced garlic
- 1/2 tbsp minced ginger
- Spicy garlic oil for deep frying
Instructions:
1. Coat the chicken wingettes with egg. Season them liberally with poultry shake.
2. In a separate bowl, mix together the cornmeal, salt, and black pepper.
3. Dip each chicken wingette into the cornmeal mixture, making sure they are fully coated.
4. Heat up the spicy garlic oil in a deep fryer or a large pot over medium heat.
5. Once the oil is heated through, add the chicken wingettes in batches, making sure not to overcrowd the pan.
6. Cook the chicken for 12 minutes, or until they are golden brown and crispy on the outside.
7. Remove the chicken from the oil using a slotted spoon and transfer them onto a wire rack to drain excess
oil.
8

In [44]:
converse(qa_7b, "Give the ingredients for an Asian spicy dish in the cookbook.")

The ingredients for an Asian spicy dish in the cookbook are:
- 2 tbsp coconut oil
- 1 big onion, diced
- 3 cloves garlic, crushed
- 2 cm piece ginger, minced
- 2 scallions, cut into 1.5 cm pieces
- 50 g red chile, cut into 1 cm pieces
- 1 cup (900 g) basmati rice, cooked
- 30 g chile powder
- 3 tbsp light soy sauce
- 2 tbsp fish sauce
- 1 tbsp chicken powder
- 1 tbsp salt
- 1 tbsp pepper


In [None]:
converse(qa_7b, "List 10 recipes that use pork from the cookbooks.")