In [77]:
import pandas as pd
import numpy as np

In [78]:
data = pd.read_csv(
    "data/Food Ingredients and Recipe Dataset with Image Name Mapping.csv"
)
data.head()

Unnamed: 0.1,Unnamed: 0,Title,Ingredients,Instructions,Image_Name,Cleaned_Ingredients
0,0,Miso-Butter Roast Chicken With Acorn Squash Pa...,"['1 (3½–4-lb.) whole chicken', '2¾ tsp. kosher...","Pat chicken dry with paper towels, season all ...",miso-butter-roast-chicken-acorn-squash-panzanella,"['1 (3½–4-lb.) whole chicken', '2¾ tsp. kosher..."
1,1,Crispy Salt and Pepper Potatoes,"['2 large egg whites', '1 pound new potatoes (...",Preheat oven to 400°F and line a rimmed baking...,crispy-salt-and-pepper-potatoes-dan-kluger,"['2 large egg whites', '1 pound new potatoes (..."
2,2,Thanksgiving Mac and Cheese,"['1 cup evaporated milk', '1 cup whole milk', ...",Place a rack in middle of oven; preheat to 400...,thanksgiving-mac-and-cheese-erick-williams,"['1 cup evaporated milk', '1 cup whole milk', ..."
3,3,Italian Sausage and Bread Stuffing,"['1 (¾- to 1-pound) round Italian loaf, cut in...",Preheat oven to 350°F with rack in middle. Gen...,italian-sausage-and-bread-stuffing-240559,"['1 (¾- to 1-pound) round Italian loaf, cut in..."
4,4,Newton's Law,"['1 teaspoon dark brown sugar', '1 teaspoon ho...",Stir together brown sugar and hot water in a c...,newtons-law-apple-bourbon-cocktail,"['1 teaspoon dark brown sugar', '1 teaspoon ho..."


In [79]:
# compute the length of each instruction
data["Instructions_length"] = data["Instructions"].str.len()

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')

# computer the tokenized length of each instruction
data["Instructions_tokenized_length"] = data["Instructions"].apply(lambda x: len(tokenizer.tokenize(str(x))))

# print the stats of the column Instructions_length
print(data["Instructions_tokenized_length"].describe())


# drop the rows where the Instructions_tokenized_length is greater than 2000 and greater than 1
data = data[(data["Instructions_tokenized_length"] < 2000) & (data["Instructions_tokenized_length"] > 1)]

# print the stats of the column Instructions_length again
print(data["Instructions_tokenized_length"].describe())

Token indices sequence length is longer than the specified maximum sequence length for this model (583 > 512). Running this sequence through the model will result in indexing errors


count    13501.000000
mean       248.650174
std        167.257353
min          1.000000
25%        138.000000
50%        214.000000
75%        320.000000
max       3378.000000
Name: Instructions_tokenized_length, dtype: float64
count    13491.000000
mean       248.345193
std        163.037067
min          8.000000
25%        138.000000
50%        214.000000
75%        320.000000
max       1953.000000
Name: Instructions_tokenized_length, dtype: float64


In [80]:
from qdrant_client import QdrantClient
from qdrant_client.http.models import Distance, VectorParams

client = QdrantClient("localhost", port=6333)

collection_name = "recipe_title_collection"
vector_dimension = 384

qdrant_collections = client.get_collections()

# If no collections exist or if the index_name is not present in the collections, create the collection
if len(qdrant_collections.collections) == 0 or not any(
    collection_name in collection.name for collection in qdrant_collections.collections
):
    client.recreate_collection(
        collection_name=collection_name,
        vectors_config=VectorParams(size=vector_dimension, distance=Distance.COSINE),
    )

    collection_info = client.get_collection(collection_name=collection_name)

In [81]:
import os

from langchain.embeddings import OpenAIEmbeddings
from langchain.text_splitter import TokenTextSplitter
from qdrant_client.http.models import PointStruct
import uuid

# text splitter config
text_splitter = TokenTextSplitter(chunk_size=250, chunk_overlap=25)

# embedding config - using All MiniLM L6 v2
os.environ["OPENAI_API_KEY"] = "random-string"
embeddings = OpenAIEmbeddings(openai_api_base="http://localhost:8444/v1")


# vector and payloads as points for qdrant
all_points = []

# iterate over the rows of the dataframe
for index, row in data.iterrows():
    vector_id = uuid.uuid4().hex
    vector_embedding = embeddings.embed_query(row["Title"])
    all_points.append(PointStruct(
                 id=vector_id, vector=vector_embedding, payload={"title": row["Title"], "recipe": row["Instructions"], "image": f"{row['Image_Name']}.jpg"}
             ))

In [82]:
from qdrant_client.http.models import UpdateStatus

# do a batch upsert of all the points in tranches of 1000
for i in range(0, len(all_points), 1000):
    print("Upserting points from ", i, " to ", i + 1000)
    operation_info = client.upsert(
        collection_name=collection_name,
        wait=True,
        points=all_points[i : i + 1000],
    )
    assert operation_info.status == UpdateStatus.COMPLETED

Upserting points from  0  to  1000
Upserting points from  1000  to  2000
Upserting points from  2000  to  3000
Upserting points from  3000  to  4000
Upserting points from  4000  to  5000
Upserting points from  5000  to  6000
Upserting points from  6000  to  7000
Upserting points from  7000  to  8000
Upserting points from  8000  to  9000
Upserting points from  9000  to  10000
Upserting points from  10000  to  11000
Upserting points from  11000  to  12000
Upserting points from  12000  to  13000
Upserting points from  13000  to  14000


In [83]:
# Random Vector - An array of 384 random numbers
random_vector = np.random.rand(384)

search_result = client.search(
    collection_name=collection_name, query_vector=random_vector, limit=3
)
print(search_result)

[ScoredPoint(id='218c746e-b679-4213-9212-f914caac58e3', version=9, score=0.116038635, payload={'image': 'sage-stuffing-350603.jpg', 'recipe': 'Preheat oven to 400°F with rack in lower third. Butter a 11/2-qt shallow baking dish or gratin dish.\nCook onion and celery in 6 tablespoon butter with 1/2 teaspoon salt and 1/4 teaspoon pepper in a large heavy skillet over medium heat, stirring occasionally, until softened, 8 to 10 minutes. Transfer to a bowl and toss with bread cubes, celery leaves, and sage, then cool 5 minutes. Whisk together stock and egg (if stock is hot, gradually whisk into egg), then toss with bread mixture until absorbed. Transfer to baking dish and dot top with remaining tablespoon butter.\nBake, covered with foil, 30 minutes, then uncover and bake until top is golden, about 10 minutes more.', 'title': 'Sage Stuffing'}, vector=None), ScoredPoint(id='c1787c84-5d91-4bc8-a1d9-5b7086915a4b', version=10, score=0.11008572, payload={'image': 'clay-pot-miso-chicken-240262.jpg