In [46]:
import pandas as pd
import numpy as np

In [47]:
data = pd.read_csv(
    "data/Food Ingredients and Recipe Dataset with Image Name Mapping.csv"
)
data.head()

Unnamed: 0.1,Unnamed: 0,Title,Ingredients,Instructions,Image_Name,Cleaned_Ingredients
0,0,Miso-Butter Roast Chicken With Acorn Squash Pa...,"['1 (3½–4-lb.) whole chicken', '2¾ tsp. kosher...","Pat chicken dry with paper towels, season all ...",miso-butter-roast-chicken-acorn-squash-panzanella,"['1 (3½–4-lb.) whole chicken', '2¾ tsp. kosher..."
1,1,Crispy Salt and Pepper Potatoes,"['2 large egg whites', '1 pound new potatoes (...",Preheat oven to 400°F and line a rimmed baking...,crispy-salt-and-pepper-potatoes-dan-kluger,"['2 large egg whites', '1 pound new potatoes (..."
2,2,Thanksgiving Mac and Cheese,"['1 cup evaporated milk', '1 cup whole milk', ...",Place a rack in middle of oven; preheat to 400...,thanksgiving-mac-and-cheese-erick-williams,"['1 cup evaporated milk', '1 cup whole milk', ..."
3,3,Italian Sausage and Bread Stuffing,"['1 (¾- to 1-pound) round Italian loaf, cut in...",Preheat oven to 350°F with rack in middle. Gen...,italian-sausage-and-bread-stuffing-240559,"['1 (¾- to 1-pound) round Italian loaf, cut in..."
4,4,Newton's Law,"['1 teaspoon dark brown sugar', '1 teaspoon ho...",Stir together brown sugar and hot water in a c...,newtons-law-apple-bourbon-cocktail,"['1 teaspoon dark brown sugar', '1 teaspoon ho..."


In [48]:
# compute the length of each instruction
data["Instructions_length"] = data["Instructions"].str.len()


# print the stats of the column Instructions_length
print(data["Instructions_length"].describe())

count    13493.000000
mean      1040.674201
std        710.946428
min         40.000000
25%        569.000000
50%        890.000000
75%       1345.000000
max      13952.000000
Name: Instructions_length, dtype: float64


In [49]:
from qdrant_client import QdrantClient
from qdrant_client.http.models import Distance, VectorParams

client = QdrantClient("localhost", port=6333)

collection_name = "recipe_title_collection"
vector_dimension = 384

qdrant_collections = client.get_collections()

# If no collections exist or if the index_name is not present in the collections, create the collection
if len(qdrant_collections.collections) == 0 or not any(
    collection_name in collection.name for collection in qdrant_collections.collections
):
    client.recreate_collection(
        collection_name=collection_name,
        vectors_config=VectorParams(size=vector_dimension, distance=Distance.COSINE),
    )

    collection_info = client.get_collection(collection_name=collection_name)

In [50]:
import os

from langchain.embeddings import OpenAIEmbeddings
from langchain.text_splitter import TokenTextSplitter
from qdrant_client.http.models import PointStruct
import uuid

# text splitter config
text_splitter = TokenTextSplitter(chunk_size=250, chunk_overlap=25)

# embedding config - using All MiniLM L6 v2
os.environ["OPENAI_API_KEY"] = "random-string"
embeddings = OpenAIEmbeddings(openai_api_base="http://localhost:8444/v1")


# vector and payloads as points for qdrant
all_points = []


# go through the first 2 rows of the dataset,
# in the column Instructions, split the text into chunks and add the chunks
# along with the text in column Title as payload to the list all_points

# for index, row in data.iloc[:5].iterrows():

for index, row in data.iterrows():
    chunks = text_splitter.split_text(row["Instructions"])
    for chunk in chunks:
        chunk_embedding = embeddings.embed_query(chunk)
        vector_id = uuid.uuid4().hex
        # print("Vector ID: ", vector_id)
        all_points.append(
            PointStruct(
                id=vector_id, vector=chunk_embedding, payload={"title": row["Title"]}
            )
        )

    # print('Title: ', row['Title'])
    # print('Cleaned Ingredients: ', row['Cleaned_Ingredients'])
    # print('Chunks: ', chunks)
    # print('Total chunks: ', len(chunks))


TypeError: expected string or buffer

In [None]:
# do a batch upsert of all the points in tranches of 1000
for i in range(0, len(all_points), 1000):
    print("Upserting points from ", i, " to ", i + 1000)
    operation_info = client.upsert(
        collection_name=collection_name,
        wait=True,
        points=all_points[i : i + 1000],
    )

    from qdrant_client.http.models import UpdateStatus

    assert operation_info.status == UpdateStatus.COMPLETED

In [None]:
# Random Vector - An array of 384 random numbers
random_vector = np.random.rand(384)

search_result = client.search(
    collection_name=collection_name, query_vector=random_vector, limit=3
)
print(search_result)

[ScoredPoint(id='4dfb2e93-3b60-4846-9669-9a7a957f261a', version=0, score=0.048540186, payload={'title': 'Miso-Butter Roast Chicken With Acorn Squash Panzanella'}, vector=None), ScoredPoint(id='8accaa64-29bd-4aac-b331-5230abba3416', version=0, score=0.034743235, payload={'title': 'Miso-Butter Roast Chicken With Acorn Squash Panzanella'}, vector=None), ScoredPoint(id='5ddfd78a-aed9-4c70-a08e-635eb3c69406', version=0, score=0.026305828, payload={'title': 'Italian Sausage and Bread Stuffing'}, vector=None)]


In [None]:
client.delete_collection(collection_name=collection_name)

True