# Install and Import Libraries

In [None]:
!pip install peft sentence-transformers tqdm redis

In [None]:
import os
from ast import literal_eval

import numpy as np
import pandas as pd
import redis
from peft import PeftModel
from redis.commands.search.field import TextField, VectorField
from redis.commands.search.index_definition import IndexDefinition, IndexType
from redis.commands.search.query import Query
from sentence_transformers import SentenceTransformer
from tqdm import tqdm

# Create Global Variables

In [None]:
# Hugging Face identifiers: the PEFT adapter repository and the base
# sentence-transformers checkpoint the adapter is loaded on top of.
HF_MODEL_ID = "carlosalvarezg/all-mpnet-base-v2"
BASE_MODEL_ID = "all-mpnet-base-v2"

# Redis connection settings. Each value can be supplied through an
# environment variable of the same name so that real credentials never have
# to be written into the notebook; the literals are placeholder defaults.
REDIS_HOST = os.environ.get("REDIS_HOST", "redis-host")
REDIS_PORT = int(os.environ.get("REDIS_PORT", "12345"))
REDIS_USERNAME = os.environ.get("REDIS_USERNAME", "redis-username")
REDIS_PASSWORD = os.environ.get("REDIS_PASSWORD", "redis-password")

Reads and cleans the dataframe

In [None]:
# Load the zipped recipes CSV and clean it in a single chain:
#   1. drop the columns that are not used further on,
#   2. discard every row that contains a missing value,
#   3. order the rows by link and renumber the index from 0.
DATAFRAME = (
    pd.read_csv("recipes_data.csv.zip", compression='zip')
    .drop(columns=["site", "source", "NER", "directions"])
    .dropna()
    .sort_values(by="link")
    .reset_index(drop=True)
)

# The ingredients column is read as text; parse each cell back into the
# Python literal it spells out (presumably a list of strings, since the
# values are joined with " " later on).
DATAFRAME["ingredients"] = DATAFRAME["ingredients"].apply(literal_eval)

Creates list of strings containing titles and ingredients for each recipe

In [None]:
# One text per recipe: the title, a single space, then the recipe's
# ingredients joined by spaces.
TITLES_AND_INGREDIENTS = [
    title + " " + " ".join(ingredients)
    for title, ingredients in zip(DATAFRAME["title"], DATAFRAME["ingredients"])
]

Creates list of keys for dataframe

In [None]:
# One key per dataframe row: the row's index value zero-padded to 7 digits.
KEYS = [format(row_index, "07") for row_index in DATAFRAME.index]

Loads model

In [None]:
# Load the base sentence-transformers model on the GPU and wrap it with the
# PEFT adapter published under HF_MODEL_ID. The base checkpoint name lives in
# BASE_MODEL_ID (BASE_MODEL_NAME is not defined anywhere in this notebook).
EMBEDDINGS_MODEL = PeftModel.from_pretrained(
    SentenceTransformer(BASE_MODEL_ID, device="cuda"),
    HF_MODEL_ID,
)
# Switch the wrapped model to evaluation mode before it is used for encoding.
EMBEDDINGS_MODEL.eval()

Creates Redis client

In [None]:
# Client for the Redis instance described by the REDIS_* constants.
# decode_responses=True asks redis-py to decode replies to str.
redisClient = redis.Redis(
    host=REDIS_HOST,
    port=REDIS_PORT,
    username=REDIS_USERNAME,
    password=REDIS_PASSWORD,
    decode_responses=True,
)

# Upload Recipe Data

Adds the title and link of each recipe to the database


In [None]:
# Store every recipe as a JSON document {"title": ..., "link": ...} under its
# zero-padded row index. Commands are queued on a pipeline and sent in
# batches instead of one request per recipe.
batch_size = 10_000
pipeline = redisClient.pipeline()
# Iterate over the columns directly: iterrows() builds a Series for every row,
# which is very slow on a large dataframe.
rows = zip(DATAFRAME.index, DATAFRAME["title"], DATAFRAME["link"])
for queued, (index, title, link) in enumerate(tqdm(rows, total=len(DATAFRAME)), start=1):
  redis_key = f"{index:07}"
  pipeline.json().set(redis_key, "$", {"title": title, "link": link})
  # Flush as soon as a full batch has been queued; execute() empties the
  # pipeline, so the same object can be reused for the next batch.
  if queued % batch_size == 0:
    pipeline.execute()
# Send whatever is left from the last, partial batch.
pipeline.execute()

Adds vector embeddings to database

In [None]:
# Embed the recipe texts in batches and attach each vector to the matching
# recipe document under the "embedding" field.
batch_size = 512
total = len(TITLES_AND_INGREDIENTS)
for index in tqdm(range(0, total, batch_size)):
  # Slicing clamps at the end of the list, so the last batch may be shorter
  # without any explicit min().
  cur_keys = KEYS[index:index + batch_size]
  cur_texts = TITLES_AND_INGREDIENTS[index:index + batch_size]
  # Cast to float16 first, then convert to plain Python lists so the vectors
  # can be stored as JSON arrays.
  embeddings = EMBEDDINGS_MODEL.encode(cur_texts).astype(np.float16).tolist()
  pipeline = redisClient.pipeline()
  for key, embedding in zip(cur_keys, embeddings):
    pipeline.json().set(key, "$.embedding", embedding)
  # Every batch is flushed here, so no extra execute() is needed after the loop.
  pipeline.execute()

Checks recipe in database

In [None]:
# Fetch the complete JSON document ("$" is the root path) stored under the
# first key, to eyeball what was uploaded.
first_recipe_key = KEYS[0]
redisClient.json().get(first_recipe_key, "$")

# Creates vector index using embedding field

In [None]:
# Index schema over the JSON documents: two text fields (stemming disabled)
# plus the embedding, exposed under the alias "vector".
schema = (
    TextField("$.title", as_name="title", no_stem=True),
    TextField("$.link", as_name="link", no_stem=True),
    VectorField(
        name="$.embedding",
        algorithm="FLAT",
        attributes={
            "TYPE": "FLOAT16",
            "DIM": 768,
            "DISTANCE_METRIC": "COSINE",
        },
        as_name="vector",
    ),
)

# An empty prefix places no restriction on key names; documents are JSON.
definition = IndexDefinition(index_type=IndexType.JSON, prefix=[""])

In [None]:
# Create the "idx:recipes" search index from the schema and definition above.
response = redisClient.ft("idx:recipes").create_index(
    definition=definition,
    fields=schema,
)

Checks the index information reported by the database

In [None]:
# Ask Redis for the index statistics and report how many documents were
# indexed and how many failed to index.
info = redisClient.ft("idx:recipes").info()
num_docs, indexing_failures = info["num_docs"], info["hash_indexing_failures"]
print(
    f"Number of documents: {num_docs}",
    f"Number of indexing failures: {indexing_failures}",
    sep="\n",
)

Tests schema with example query

In [None]:
# Embed a few example queries and search with the first one.
# Uses EMBEDDINGS_MODEL (the name the model was loaded under), the real
# encode() method, and ndarray.tolist().
example_queries = ["chocolate cake", "spaghetti carbonara", "chicken curry"]
encoded_query = EMBEDDINGS_MODEL.encode(example_queries).tolist()[0]

# KNN query: the 3 nearest recipes to $query_vector, ordered by distance.
query = (
    Query('(*)=>[KNN 3 @vector $query_vector AS vector_score]')
     .sort_by('vector_score')
     .return_fields('vector_score', 'id', 'title', 'link')
     .dialect(2)
)
response = redisClient.ft('idx:recipes').search(
    query,
    {
      # The index declares the vector field as FLOAT16, so the query blob must
      # be serialised as float16 too; a float32 blob is twice the expected size.
      'query_vector': np.array(encoded_query, dtype=np.float16).tobytes()
    }
).docs
print(response)