In [2]:
import instructor
from openai import OpenAI
from typing import Iterable, Literal, List, Optional, Tuple
from pydantic import BaseModel, Field 
import pandas as pd

In [3]:
# Build the OpenAI client (no credentials are passed explicitly here) and wrap
# it with instructor in TOOLS mode. The annotation helpers in this notebook
# call `client.chat.completions.create(..., response_model=...)`.
openai_client = OpenAI()
client = instructor.from_openai(openai_client, mode=instructor.Mode.TOOLS)

# One search filter as a name plus its values; `QueryAnnotation.facets` holds a
# list of these. NOTE: the class docstring and the Field descriptions below are
# presumably serialized into the schema sent to the model — treat their text as
# runtime data and edit with care.
class Facet(BaseModel):
    """
    Facet is a search filter e.g. color, size, price, etc.
    """
    # Filter name; `...` as the default marks the field as required.
    name: str = Field(..., description="Common filters in search e.g. color, size, price, gender, brand, fit, cut,datetime, season, pattern, material, dimension, etc.")
    # Values for this filter, as a list of strings (required).
    values: List[str] = Field(..., description="Values of the facet")

# Structured annotation of a single search query, used as the `response_model`
# element type by the annotation helpers in this notebook.
# Every field uses `...` as its default, so it is required; the `Optional[...]`
# fields may still be null (the sample outputs show `null` for
# `translated_query` and `iso_language_code`).
# No class docstring is added on purpose: the field descriptions are consumed
# at runtime, and a class docstring would presumably alter the generated schema.
class QueryAnnotation(BaseModel):
    # The query text being annotated.
    query: str = Field(..., description="Query to search for relevant content")
    # Normalized rewrite of the query (see the examples in the description).
    simplified_query: str = Field(..., description="Simplified query e.g. wristwatches for men -> men's wristwatches, red dress shirt for husband -> men's red dress shirt")
    # Generated product descriptions that would satisfy the query; nullable.
    hypothetical_product_description: Optional[List[str]] = Field(..., description="Given a query, generate unique and diverse product descriptions that satisfy the query")
    # Keywords taken from the query.
    keywords: List[str] = Field(..., description="Keywords to search for")
    # Synonyms / related terms that broaden `keywords`.
    expanded_keywords: List[str] = Field(..., description="Expanded keywords to search for e.g. synonyms, related words, etc.")
    # Restricted to exactly these three intent labels.
    intent: Literal["informational", "navigational", "transactional"] = Field(..., description="Intent of the user")
    # English rendering of the query; nullable.
    translated_query: Optional[str] = Field(..., description="English query to search for")
    # Language/locale tag of the query; nullable.
    # NOTE(review): "in-IN" in the example does not look like a valid tag
    # (e.g. "hi-IN" / "en-IN") — confirm the intended example before changing it.
    iso_language_code: Optional[str] = Field(..., description="ISO language code to search e.g. en-US, in-IN")
    # Filters extracted from the query, one `Facet` per filter.
    facets: List[Facet] = Field(..., description="Facets which are used to filter the search results")

    async def execute(self):
        """Print a one-line summary of how the query was decomposed.

        The previous message formatted ``self.semantic_query``, which is not a
        field of this model, so awaiting this coroutine raised
        ``AttributeError``. The summary now reports the existing ``keywords``
        field instead.
        """
        print(
            f"Decomposing query `{self.query}` into `{self.simplified_query}` and keywords {self.keywords}"
        )

In [4]:
# Read the JSON Lines query file (one record per line) and preview the first rows.
query_info_path = "amzn_esci_train_query_info.jsonl"
df = pd.read_json(query_info_path, lines=True)
df.head()

Unnamed: 0,qid,q_text
0,1,!awnmower tires without rims
1,5,# 10 self-seal envelopes without window
2,6,# 2 pencils not sharpened
3,9,# mom life
4,11,#1 best and not expensive bath back brush crea...


query = f(queries, products, user_actions)

In [14]:
def query_annotation(data: str) -> Iterable[QueryAnnotation]:
    """Ask the model to annotate a text query.

    Args:
        data: Raw user query; interpolated verbatim into the prompt.

    Returns:
        The result of ``client.chat.completions.create`` called with
        ``response_model=Iterable[QueryAnnotation]`` (per the annotation, an
        iterable of ``QueryAnnotation`` objects).
    """
    prompt = f"Consider the user query below: '\n{data}' and annotate the query"
    user_message = {"role": "user", "content": prompt}
    # Fixed model, token budget, temperature and seed, exactly as before.
    request = dict(
        model="gpt-4o",
        response_model=Iterable[QueryAnnotation],
        messages=[user_message],
        max_tokens=1000,
        temperature=0.0,
        seed=42,
    )
    return client.chat.completions.create(**request)

def image_query_annotation(url: str, query: str) -> Iterable[QueryAnnotation]:
    """Ask the model to annotate a query that is accompanied by an image.

    Args:
        url: Image URL, sent as an ``image_url`` content part.
        query: Query text, interpolated verbatim into the text content part.

    Returns:
        The result of ``client.chat.completions.create`` called with
        ``response_model=Iterable[QueryAnnotation]`` (per the annotation, an
        iterable of ``QueryAnnotation`` objects).
    """
    # A single user message carries both parts: the image first, then the text.
    image_part = {"type": "image_url", "image_url": {"url": url}}
    text_part = {
        "type": "text",
        "text": f"Analyze the image and annotate the query: {query}",
    }
    user_message = {"role": "user", "content": [image_part, text_part]}
    return client.chat.completions.create(
        model="gpt-4o",
        response_model=Iterable[QueryAnnotation],
        messages=[user_message],
        max_tokens=1000,
        temperature=0.0,
        seed=42,
    )

# Annotate one example query and print every returned annotation as indented JSON.
example_query = "I'm looking for a red dress shirt for my husband for a Bertem wedding"
for query in query_annotation(example_query):
    print(query.model_dump_json(indent=2))

{
  "query": "red dress shirt for my husband for a Bertem wedding",
  "simplified_query": "men's red dress shirt",
  "hypothetical_product_description": [
    "A stylish men's red dress shirt made from high-quality cotton, perfect for formal occasions such as weddings. It features a slim fit design, button-down collar, and long sleeves.",
    "A vibrant red dress shirt for men, crafted from breathable fabric, ideal for weddings and other formal events. It comes with a classic fit, spread collar, and adjustable cuffs."
  ],
  "keywords": [
    "red dress shirt",
    "men's dress shirt",
    "wedding shirt"
  ],
  "expanded_keywords": [
    "men's red dress shirt",
    "formal red shirt",
    "wedding dress shirt",
    "husband's red dress shirt"
  ],
  "intent": "transactional",
  "translated_query": null,
  "iso_language_code": null,
  "facets": [
    {
      "name": "color",
      "values": [
        "red"
      ]
    },
    {
      "name": "gender",
      "values": [
        "men"
  

In [15]:
from IPython.display import Image
from IPython.core.display import HTML

def display_image_from_url(url: str):
    """Build an IPython ``Image`` for the given URL.

    Args:
        url: Location of the image, passed to ``Image`` as ``url=``.

    Returns:
        The ``Image`` object; the caller decides whether to ``display`` it.
    """
    remote_image = Image(url=url)
    return remote_image

# The same image is both shown in the notebook and sent with the query, so the
# URL is bound once and reused.
sample_image_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/d/d5/Blue_sari_2.jpg/920px-Blue_sari_2.jpg"

# Display the image under a heading.
display(HTML("<h3>Sample Image</h3>"))
display(display_image_from_url(sample_image_url))


# Annotate the query together with the image; print each annotation as indented JSON.
sample_image_query = "I'm looking for a kanchipuram silk saree"
for query in image_query_annotation(sample_image_url, sample_image_query):
    print(query.model_dump_json(indent=2))

{
  "query": "I'm looking for a kachipuram silk saree",
  "simplified_query": "kachipuram silk saree",
  "hypothetical_product_description": [
    "This exquisite Kachipuram silk saree features intricate zari work and a rich, vibrant color palette, perfect for weddings and special occasions.",
    "A traditional Kachipuram silk saree with a beautiful temple border and pallu, showcasing the craftsmanship of South Indian weavers.",
    "Elegant Kachipuram silk saree in a deep maroon shade, adorned with gold motifs and a contrasting border, ideal for festive celebrations."
  ],
  "keywords": [
    "kachipuram silk saree",
    "kachipuram saree",
    "silk saree",
    "traditional saree"
  ],
  "expanded_keywords": [
    "kachipuram silk saree",
    "kachipuram saree",
    "silk saree",
    "traditional saree",
    "south indian saree",
    "zari work saree",
    "wedding saree",
    "festive saree"
  ],
  "intent": "transactional",
  "translated_query": null,
  "iso_language_code": null,


In [16]:
def display_annotated_queries(queries: List[str]):
    """Annotate each query and print it next to every annotation returned.

    Args:
        queries: Raw query strings; each one is passed to ``query_annotation``
            on its own.

    For every annotation the query text is printed first, then the annotation
    as indented JSON, then a blank separator.
    """
    for raw_query in queries:
        for annotation in query_annotation(raw_query):
            print(f"Query: {raw_query}")
            annotation_json = annotation.model_dump_json(indent=2)
            print(f"Annotated Query: {annotation_json}")
            print("\n")

# Draw 10 queries from the `q_text` column (random_state pins the draw so the
# sample is repeatable), then annotate and print them.
seed = 69
sampled_q_text = df["q_text"].sample(n=10, random_state=seed)
queries = sampled_q_text.tolist()
display_annotated_queries(queries)

Query: waterproof kid gloves small
Annotated Query: {
  "id": "1",
  "query": "waterproof kid gloves small",
  "simplified_query": "small waterproof kid gloves",
  "hypothetical_product_description": [
    "These small waterproof kid gloves are perfect for keeping little hands dry and warm during rainy or snowy weather. Made with durable, water-resistant materials, they offer excellent protection against the elements while ensuring comfort and flexibility for active kids. Available in various colors and designs to suit every child's preference."
  ],
  "type": "web",
  "rag_query_type": "lookup",
  "keywords": [
    "waterproof",
    "kid gloves",
    "small"
  ],
  "expanded_keywords": [
    "water-resistant",
    "children's gloves",
    "small size",
    "youth gloves",
    "rain gloves",
    "snow gloves"
  ],
  "intent": "transactional",
  "translated_query": null,
  "iso_language_code": null,
  "facets": [
    {
      "name": "size",
      "values": [
        "small"
      ]
    