In [1]:
# Install dependencies as needed:
# pip install kagglehub[pandas-datasets]
import kagglehub
from kagglehub import KaggleDatasetAdapter
from pandas import DataFrame

# Name of the file inside the Kaggle dataset that should be loaded
file_path = "RecipeNLG_dataset.csv"

# Load the latest version of the dataset file through the pandas adapter.
# Additional arguments such as `sql_query` or `pandas_kwargs` may be passed
# after `file_path`; see the documentation for more information:
# https://github.com/Kaggle/kagglehub/blob/main/README.md#kaggledatasetadapterpandas
df: DataFrame = kagglehub.dataset_load(
    KaggleDatasetAdapter.PANDAS,
    "paultimothymooney/recipenlg",
    file_path,
)

# Quick sanity check of what was loaded
print("First 5 records:", df.head())

  from .autonotebook import tqdm as notebook_tqdm


First 5 records:    Unnamed: 0                  title  \
0           0    No-Bake Nut Cookies   
1           1  Jewell Ball'S Chicken   
2           2            Creamy Corn   
3           3          Chicken Funny   
4           4   Reeses Cups(Candy)     

                                         ingredients  \
0  ["1 c. firmly packed brown sugar", "1/2 c. eva...   
1  ["1 small jar chipped beef, cut up", "4 boned ...   
2  ["2 (16 oz.) pkg. frozen corn", "1 (8 oz.) pkg...   
3  ["1 large whole chicken", "2 (10 1/2 oz.) cans...   
4  ["1 c. peanut butter", "3/4 c. graham cracker ...   

                                          directions  \
0  ["In a heavy 2-quart saucepan, mix brown sugar...   
1  ["Place chipped beef on bottom of baking dish....   
2  ["In a slow cooker, combine all ingredients. C...   
3  ["Boil and debone chicken.", "Put bite size pi...   
4  ["Combine first four ingredients and press in ...   

                                              link    source  \
0   

In [2]:
# Give the "Unnamed: 0" column the explicit name "index".
# The rename is applied in place, so `df` itself is modified.
df.rename({"Unnamed: 0": "index"}, axis="columns", inplace=True)

# Show the resulting column labels
df.columns

Index(['index', 'title', 'ingredients', 'directions', 'link', 'source', 'NER'], dtype='object')

## Markdown Generation

In [3]:
from langchain_openai import OpenAIEmbeddings
import ast

# The title, ingredients, and directions are combined into a single markdown
# document that defines what the food is, instead of relying on an actual
# description of the food.

# Work with the first recipe only
food_1 = df.iloc[0]

# Both columns hold a list written out as a string; parse each one and put
# every entry on its own line.
ingredients = "\n".join(ast.literal_eval(food_1["ingredients"]))
directions = "\n".join(ast.literal_eval(food_1["directions"]))

# NOTE: after this statement `markdown_template` holds the rendered markdown
# for `food_1`, not the template with its placeholders.
markdown_template = """
# {food_name}

## Ingredients:
{ingredients}

## Directions:
{directions}
""".format(
    food_name=food_1["title"],
    ingredients=ingredients,
    directions=directions,
)

print(markdown_template)


# No-Bake Nut Cookies

## Ingredients:
1 c. firmly packed brown sugar
1/2 c. evaporated milk
1/2 tsp. vanilla
1/2 c. broken nuts (pecans)
2 Tbsp. butter or margarine
3 1/2 c. bite size shredded rice biscuits

## Directions:
In a heavy 2-quart saucepan, mix brown sugar, nuts, evaporated milk and butter or margarine.
Stir over medium heat until mixture bubbles all over top.
Boil and stir 5 minutes more. Take off heat.
Stir in vanilla and cereal; mix well.
Using 2 teaspoons, drop and shape into 30 clusters on wax paper.
Let stand until firm, about 30 minutes.



## Chunking

In [4]:
from langchain_text_splitters import MarkdownHeaderTextSplitter, MarkdownTextSplitter

# (marker, label) pairs for header levels 1-3:
# ("#", "Heading 1"), ("##", "Heading 2"), ("###", "Heading 3").
# The label is the metadata key under which a chunk's header text is stored.
headers_to_split = [("#" * level, f"Heading {level}") for level in range(1, 4)]

chunker = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split)

# Split the rendered recipe markdown into one chunk per header section
chunks = chunker.split_text(markdown_template)
print(chunks)


[Document(metadata={'Heading 1': 'No-Bake Nut Cookies', 'Heading 2': 'Ingredients:'}, page_content='1 c. firmly packed brown sugar\n1/2 c. evaporated milk\n1/2 tsp. vanilla\n1/2 c. broken nuts (pecans)\n2 Tbsp. butter or margarine\n3 1/2 c. bite size shredded rice biscuits'), Document(metadata={'Heading 1': 'No-Bake Nut Cookies', 'Heading 2': 'Directions:'}, page_content='In a heavy 2-quart saucepan, mix brown sugar, nuts, evaporated milk and butter or margarine.\nStir over medium heat until mixture bubbles all over top.\nBoil and stir 5 minutes more. Take off heat.\nStir in vanilla and cereal; mix well.\nUsing 2 teaspoons, drop and shape into 30 clusters on wax paper.\nLet stand until firm, about 30 minutes.')]


## Embedding

In [None]:
from langchain_openai import OpenAIEmbeddings
import os
from dotenv import load_dotenv

# Read environment variables from a .env file, if one is present
load_dotenv()

# Fail fast with a clear message when the key is missing, instead of letting
# the OpenAI client fail later with a less obvious error.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY", "")
if not OPENAI_API_KEY:
    raise RuntimeError(
        "OPENAI_API_KEY is not set. Export it in the environment or add it "
        "to a .env file before running this cell."
    )

# defaults to `text-embedding-ada-002`
embed = OpenAIEmbeddings()

# Only the text of each chunk is embedded; the header metadata is not included
content_chunks = [x.page_content for x in chunks]
print(content_chunks)

# One embedding vector per chunk, in the same order as `content_chunks`
vectors = embed.embed_documents(content_chunks)

# Spot check: print a single component of the first vector
print(vectors[0][4])

['1 c. firmly packed brown sugar\n1/2 c. evaporated milk\n1/2 tsp. vanilla\n1/2 c. broken nuts (pecans)\n2 Tbsp. butter or margarine\n3 1/2 c. bite size shredded rice biscuits', 'In a heavy 2-quart saucepan, mix brown sugar, nuts, evaporated milk and butter or margarine.\nStir over medium heat until mixture bubbles all over top.\nBoil and stir 5 minutes more. Take off heat.\nStir in vanilla and cereal; mix well.\nUsing 2 teaspoons, drop and shape into 30 clusters on wax paper.\nLet stand until firm, about 30 minutes.']
0.0026362028438597918


## Setup Milvus
This section uses the Milvus SDK directly, rather than the LangChain Milvus integration, for finer control over the setup of Milvus collections, schemas, and databases.

## Indexing / Storage

In [None]:
from langchain_community.vectorstores import Milvus

# Build a Milvus vector store from the chunk documents, using `embed` to
# produce their embeddings.
# NOTE(review): no connection arguments are passed here, so the Milvus
# instance that is targeted depends on the library defaults; confirm.
vector_store = Milvus.from_documents(
    documents=chunks,
    embedding=embed,
)

SyntaxError: expected argument value expression (2901785019.py, line 3)