In [2]:
import kagglehub
from kagglehub import KaggleDatasetAdapter

# Download the dataset
path = kagglehub.dataset_download("paultimothymooney/recipenlg")

# Load it with the correct compression setting
df = kagglehub.dataset_load(
    adapter=KaggleDatasetAdapter.PANDAS,
    handle="paultimothymooney/recipenlg",
    path="RecipeNLG_dataset.csv",
)

print(df.head())

  from .autonotebook import tqdm as notebook_tqdm


   Unnamed: 0                  title  \
0           0    No-Bake Nut Cookies   
1           1  Jewell Ball'S Chicken   
2           2            Creamy Corn   
3           3          Chicken Funny   
4           4   Reeses Cups(Candy)     

                                         ingredients  \
0  ["1 c. firmly packed brown sugar", "1/2 c. eva...   
1  ["1 small jar chipped beef, cut up", "4 boned ...   
2  ["2 (16 oz.) pkg. frozen corn", "1 (8 oz.) pkg...   
3  ["1 large whole chicken", "2 (10 1/2 oz.) cans...   
4  ["1 c. peanut butter", "3/4 c. graham cracker ...   

                                          directions  \
0  ["In a heavy 2-quart saucepan, mix brown sugar...   
1  ["Place chipped beef on bottom of baking dish....   
2  ["In a slow cooker, combine all ingredients. C...   
3  ["Boil and debone chicken.", "Put bite size pi...   
4  ["Combine first four ingredients and press in ...   

                                              link    source  \
0   www.cookbooks.com

In [3]:
df.rename(columns={"Unnamed: 0": "index"}, inplace=True)

df.columns

Index(['index', 'title', 'ingredients', 'directions', 'link', 'source', 'NER'], dtype='object')

## Setup Milvus

This will be using Milvus SDK for better control over the setup of the Milvus collections, schema, and databases instead of using the Langchain Milvus SDK


In [91]:
from pymilvus import MilvusClient, DataType, IndexType, Function, FunctionType

db_name = "chat_man_db"
collection_name = "recipe_rag"
milvus_uri = "http://windows-server:19530"

client = MilvusClient(uri=milvus_uri)

if db_name not in client.list_databases():
    print("[INFO] Database does not exist. Creating new database...")
    client.create_database(db_name)

print("[INFO] Using database: ", db_name)

client.use_database(db_name)

client.drop_collection(collection_name=collection_name)

if not client.has_collection(collection_name):
    print("[INFO] Collection does not exist. Creating new collection...")

    print("[INFO] Defining schema for collection")
    schema = client.create_schema(auto_id=True, enable_dynamic_field=True)
    schema.add_field(field_name="id", datatype=DataType.INT64, is_primary=True)
    schema.add_field(field_name="vector", datatype=DataType.FLOAT_VECTOR, dim=1536)
    schema.add_field(
        field_name="text",
        datatype=DataType.VARCHAR,
        max_length=2000,
        enable_analyzer=True,
    )
    schema.add_field(field_name="sparse_vector", datatype=DataType.SPARSE_FLOAT_VECTOR)

    sparse_ef = Function(
        name="text_bm25_emb",
        input_field_names=["text"],
        output_field_names=["sparse_vector"],
        function_type=FunctionType.BM25,
    )
    schema.add_function(sparse_ef)

    print("[INFO] Defining indexing parameters")
    index_params = client.prepare_index_params()
    index_params.add_index(field_name="id", index_type="AUTOINDEX")
    index_params.add_index(
        field_name="vector", index_type=IndexType.HNSW, metric_type="COSINE"
    )
    index_params.add_index(
        field_name="sparse_vector", index_type="SPARSE_INVERTED_INDEX", metric_type="BM25"
    )

    client.create_collection(
        collection_name=collection_name, schema=schema, index_params=index_params
    )

res = client.get_load_state(collection_name=collection_name)
print(res)

[INFO] Using database:  chat_man_db
[INFO] Collection does not exist. Creating new collection...
[INFO] Defining schema for collection
[INFO] Defining indexing parameters
{'state': <LoadState: Loaded>}


## Markdown Generation


In [48]:
from langchain_openai import OpenAIEmbeddings
import ast

# we will be combining the title, ingredients, and directions to come up with a single definition of what the food is
# instead of relying on the actual description of the food

doc_markdown_tmplt = """
# {food_name}

## Ingredients:
{ingredients}

## Directions:
{directions}
"""

# ingredients = ast.literal_eval(food_1["ingredients"])
# ingredients = "\n".join(ingredients)

# directions = ast.literal_eval(food_1["directions"])
# directions = "\n".join(directions)

# new_markdown = doc_markdown_tmplt.format(
#     food_name=food_1["title"], ingredients=ingredients, directions=directions
# )

# print(new_markdown)


# No-Bake Nut Cookies

## Ingredients:
1 c. firmly packed brown sugar
1/2 c. evaporated milk
1/2 tsp. vanilla
1/2 c. broken nuts (pecans)
2 Tbsp. butter or margarine
3 1/2 c. bite size shredded rice biscuits

## Directions:
In a heavy 2-quart saucepan, mix brown sugar, nuts, evaporated milk and butter or margarine.
Stir over medium heat until mixture bubbles all over top.
Boil and stir 5 minutes more. Take off heat.
Stir in vanilla and cereal; mix well.
Using 2 teaspoons, drop and shape into 30 clusters on wax paper.
Let stand until firm, about 30 minutes.



## Embedding


In [49]:
import os
from dotenv import load_dotenv

load_dotenv()

OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY", "")

# defaults to `text-embedding-ada-002`
embed = OpenAIEmbeddings(dimensions=1536, model="text-embedding-3-small")

## Document Generation

Data Shaping to fit the needs of Langchain and Milvus


In [89]:
from langchain_text_splitters import (
    MarkdownTextSplitter,
)
from langchain_core.documents import Document

raw_data = df.values.tolist()

raw_docs: list[Document] = []
final_docs: list[dict] = []

for i, row in enumerate(raw_data):
    if i > 50:
        break

    title = row[1]
    link = row[4]
    source = row[5]
    ner = row[6]
    # convert list of type strings into list
    ingredients = ast.literal_eval(row[2])
    directions = ast.literal_eval(row[3])
    str_ingredients = "\n".join(ingredients)
    str_directions = "\n".join(directions)

    text_md = doc_markdown_tmplt.format(
        ingredients=ingredients, directions=directions, food_name=title
    )

    doc = Document(
        page_content=text_md.replace("\n", "\\n"),
        metadata={"food_name": title, "link": link, "source": source, "raw_ner": ner},
    )
    raw_docs.append(doc)

# chunking
# chunker = RecursiveCharacterTextSplitter(chunk_size=100, chunk_overlap=10)
# chunked_docs = chunker.split_documents([doc])

chunker1 = MarkdownTextSplitter()
chunked_docs_2 = chunker1.split_documents(raw_docs)

for doc in chunked_docs_2:
    data = {
        "text": doc.page_content,
        "vector": embed.embed_query(doc.page_content),
        **doc.metadata,
    }
    final_docs.append(data)

for doc in final_docs:
    print(doc)
print(len(final_docs))


{'text': "\\n# No-Bake Nut Cookies\\n\\n## Ingredients:\\n['1 c. firmly packed brown sugar', '1/2 c. evaporated milk', '1/2 tsp. vanilla', '1/2 c. broken nuts (pecans)', '2 Tbsp. butter or margarine', '3 1/2 c. bite size shredded rice biscuits']\\n\\n## Directions:\\n['In a heavy 2-quart saucepan, mix brown sugar, nuts, evaporated milk and butter or margarine.', 'Stir over medium heat until mixture bubbles all over top.', 'Boil and stir 5 minutes more. Take off heat.', 'Stir in vanilla and cereal; mix well.', 'Using 2 teaspoons, drop and shape into 30 clusters on wax paper.', 'Let stand until firm, about 30 minutes.']\\n", 'vector': [0.005020566750317812, 0.006637465674430132, 0.00836132001131773, -0.008499731309711933, 0.01533852331340313, -0.07786282151937485, -0.004932486452162266, -0.004017082508653402, -0.017049793154001236, -0.009298743680119514, 0.015804089605808258, -0.030198896303772926, 0.020862404257059097, -0.0005155046237632632, -0.02459951862692833, 0.047311604022979736, 

## Chunking


In [45]:
from langchain_text_splitters import MarkdownHeaderTextSplitter, MarkdownTextSplitter

headers_to_split = [("#", "Heading 1"), ("##", "Heading 2"), ("###", "Heading 3")]

chunker = MarkdownHeaderTextSplitter(
    headers_to_split_on=headers_to_split,
)

chunker_1 = MarkdownTextSplitter()

chunks = chunker_1.split_text(markdown_template)
print(chunks)


NameError: name 'markdown_template' is not defined

## Indexing / Storage


In [92]:
client.insert(
    collection_name=collection_name,
    data=final_docs
)

{'insert_count': 51, 'ids': [463500220456063896, 463500220456063897, 463500220456063898, 463500220456063899, 463500220456063900, 463500220456063901, 463500220456063902, 463500220456063903, 463500220456063904, 463500220456063905, 463500220456063906, 463500220456063907, 463500220456063908, 463500220456063909, 463500220456063910, 463500220456063911, 463500220456063912, 463500220456063913, 463500220456063914, 463500220456063915, 463500220456063916, 463500220456063917, 463500220456063918, 463500220456063919, 463500220456063920, 463500220456063921, 463500220456063922, 463500220456063923, 463500220456063924, 463500220456063925, 463500220456063926, 463500220456063927, 463500220456063928, 463500220456063929, 463500220456063930, 463500220456063931, 463500220456063932, 463500220456063933, 463500220456063934, 463500220456063935, 463500220456063936, 463500220456063937, 463500220456063938, 463500220456063939, 463500220456063940, 463500220456063941, 463500220456063942, 463500220456063943, 46350022045

In [100]:
results = client.search(
    collection_name=collection_name,
    data=[embed.embed_query("good for the gut")],
    anns_field="vector",
    limit=3,
    output_fields=["text"],
)
print(len(results))
for res in results:
    print(res[0].entity.get("text"))

1
\n# Broccoli Dip For Crackers\n\n## Ingredients:\n['16 oz. sour cream', '1 pkg. dry vegetable soup mix', '10 oz. pkg. frozen chopped broccoli, thawed and drained', '4 to 6 oz. Cheddar cheese, grated']\n\n## Directions:\n['Mix together sour cream, soup mix, broccoli and half of cheese.', 'Sprinkle remaining cheese on top.', 'Bake at 350Â° for 30 minutes, uncovered.', 'Serve hot with vegetable crackers.']\n
