In [1]:
import pandas as pd
import json

In [2]:
restaurant_details = pd.read_csv("data/restaurant_details_openai.csv")

In [3]:
restaurant_details.head()

Unnamed: 0,restaurantname,item_type,item_name,descriptiom,ingredients,cost,preparation_time
0,The Culinary Haven,Starters,Golden Corn Soup,"A creamy, comforting blend of sweet corn and h...","Sweet corn, Vegetable stock, Cream, Herbs",5.99,10
1,The Culinary Haven,Starters,Chicken Satay Skewers,Grilled chicken skewers marinated in a blend o...,"Chicken, Soy sauce, Peanut sauce, Spices",7.99,15
2,The Culinary Haven,Starters,Beef Carpaccio,"Thinly sliced raw beef dressed with olive oil,...","Beef, Olive oil, Lemon juice, Capers",9.99,10
3,The Culinary Haven,Starters,Crispy Calamari Rings,Fried calamari rings served with a tangy marin...,"Calamari, Flour, Marinara sauce, Lemon",8.5,12
4,The Culinary Haven,Starters,Veggie Spring Rolls,Crispy rolls filled with fresh vegetables and ...,"Cabbage, Carrot, Glass noodles, Spring roll wr...",6.5,15


In [4]:
out = []
for idx, row in restaurant_details.iterrows():
    data = {}
    data["restaurant_name"] = row["restaurantname"]
    data["item_type"] = row["item_type"]
    data["item_name"] = row["item_name"]
    data["ingredients"] = row["ingredients"]
    data["cost"] = row["cost"]
    data["preparation_time"] = row["preparation_time"]
    data["description"] = row["descriptiom"]
    out.append({
        "page_content": json.dumps(data),
        "metadata": dict((key, value) for key, value in data.items() if key!="description")
    })


In [5]:
out[0]

{'page_content': '{"restaurant_name": "The Culinary Haven", "item_type": "Starters", "item_name": "Golden Corn Soup", "ingredients": "Sweet corn, Vegetable stock, Cream, Herbs", "cost": 5.99, "preparation_time": 10, "description": "A creamy, comforting blend of sweet corn and herbs."}',
 'metadata': {'restaurant_name': 'The Culinary Haven',
  'item_type': 'Starters',
  'item_name': 'Golden Corn Soup',
  'ingredients': 'Sweet corn, Vegetable stock, Cream, Herbs',
  'cost': 5.99,
  'preparation_time': 10}}

In [6]:
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_core.documents import Document
from langchain_community.vectorstores import Chroma

In [7]:
model_kwargs = {
    "trust_remote_code": True,
    # "device": "cpu"
    }
encode_kwargs={"normalize_embeddings": True}
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2", model_kwargs=model_kwargs, encode_kwargs=encode_kwargs)

In [8]:
docs = [
    Document(
        page_content=doc["page_content"],
        metadata=doc["metadata"]
    )
    for doc in out
]
# vectorstore = Chroma.from_documents(docs, embeddings, collection_name="course_info", persist_directory="../data/chromadb")
vectorstore = Chroma.from_documents(docs, embeddings)

In [9]:
vectorstore._collection.count()

45

In [11]:
from langchain.chains.query_constructor.base import AttributeInfo
from langchain.retrievers.self_query.base import SelfQueryRetriever
from langchain_openai import ChatOpenAI
from langchain.retrievers.self_query.chroma import ChromaTranslator

metadata_field_info = [
    AttributeInfo(
        name="restaurant_name",
        description="Name of the restaurant",
        type="string",
    ),
    AttributeInfo(
        name="item_type",
        description="Type of the food dish[Startes, Main Course, Desserts]",
        type="string",
    ),
    AttributeInfo(
        name="item_name",
        description="The name of a food dish",
        type="string",
    ),
    AttributeInfo(
        name="ingredients",
        description="The ingredients present in the food. All values are comma seperated. Use contains query to filter values for this attribute",
        type="string",
    ),
    AttributeInfo(
        name="cost",
        description="Cost of the food dish in dollars",
        type="float",
    ),
    AttributeInfo(
        name="preparation_time",
        description="Time taken to prepare the food in miuntes",
        type="integer",
    ),
]
document_content_description = "Information regarding the course"
llm = ChatOpenAI(model="gpt-4-1106-preview", temperature=0)
retriever = SelfQueryRetriever.from_llm(
    llm,
    vectorstore,
    document_content_description,
    metadata_field_info,
    structured_query_translator=ChromaTranslator(),
    enable_limit=True
)

In [36]:
user_query = "Could you provide me with a list of starters?"

In [37]:
question = f"""
```{user_query}```
Sample filters are:
and(eq("restaurant_name", "Spice Symphony"),  contain("ingredients", "Chicken"))
or(gte("cost", 5), eq("preparation_time", 10))
"""

In [38]:
retriever_prompt = f"""{question}""".strip()

In [39]:
from langchain.chains.query_constructor.base import (
    StructuredQueryOutputParser,
    get_query_constructor_prompt,
)
prompt = get_query_constructor_prompt(
    document_content_description,
    metadata_field_info,
)

output_parser = StructuredQueryOutputParser.from_components()
query_constructor = prompt | llm | output_parser

# print(prompt.format(query=retriever_prompt))
print(query_constructor.invoke(question))

query='starters' filter=Operation(operator=<Operator.OR: 'or'>, arguments=[Operation(operator=<Operator.AND: 'and'>, arguments=[Comparison(comparator=<Comparator.EQ: 'eq'>, attribute='restaurant_name', value='Spice Symphony'), Comparison(comparator=<Comparator.CONTAIN: 'contain'>, attribute='ingredients', value='Chicken')]), Operation(operator=<Operator.OR: 'or'>, arguments=[Comparison(comparator=<Comparator.GTE: 'gte'>, attribute='cost', value=5), Comparison(comparator=<Comparator.EQ: 'eq'>, attribute='preparation_time', value=10)])]) limit=None
