In [25]:
import instructor
from openai import OpenAI
from typing import Iterable, Literal, List, Optional, Tuple
from pydantic import BaseModel, Field 
import pandas as pd

In [26]:
# Wrap the OpenAI client with instructor. The wrapped client is the one the
# later cells call with the `response_model` keyword to request structured
# (pydantic-typed) output instead of raw chat text.
# NOTE(review): OpenAI() is constructed with no arguments, so credentials are
# presumably picked up from the environment -- confirm before running.
client = instructor.from_openai(OpenAI())

class Facet(BaseModel):
    """
    Facet is a search filter e.g. color, size, price, etc.
    """

    # Label of the filter dimension; the description lists typical examples.
    name: str = Field(
        ...,
        description="Common filters in search e.g. color, size, price, gender, datetime, season, pattern, material, dimension, etc.",
    )
    # Values attached to this facet (required, like every field here).
    values: List[str] = Field(
        ...,
        description="Values of the facet",
    )

class QueryAnnotation(BaseModel):
    # Structured annotation of a single search query: rewrites of the query,
    # classification labels, keywords, language information and facets.
    #
    # NOTE: this class is documented with comments instead of a docstring on
    # purpose. pydantic copies a model's docstring into its JSON schema
    # description, so adding one would change the schema handed to the model.
    #
    # Every field is declared with `Field(...)`, i.e. without a default, so all
    # of them are required; the `Optional[...]` ones may be null but must still
    # be present.

    # --- identity and query rewrites ---
    id: str = Field(..., description="Unique Query ID")
    query: str = Field(..., description="Query to search for relevant content")
    simplified_query: str = Field(..., description="Simplified query e.g. wristwatches for men -> men's wristwatches, red dress shirt for husband -> men's red dress shirt")
    hypothetical_document_query: Optional[str] = Field(..., description="Given a question, generate possible product descriptions that satisfy the query")

    # --- classification labels (closed vocabularies) ---
    type: Literal["web", "image", "video"] = Field(..., description="Document type to search for")
    rag_query_type: Literal["lookup", "synthesis", "inference", "temporal", "comparative", "hypothetical", "multihop_query", "unanswerable", "recommendation"] = Field(..., description="RAG query type")

    # --- keywords ---
    keywords: List[str] = Field(..., description="Keywords to search for")
    expanded_keywords: List[str] = Field(..., description="Expanded keywords to search for")
    intent: Literal["informational", "navigational", "transactional"] = Field(..., description="Intent of the user")

    # --- language ---
    translated_query: Optional[str] = Field(..., description="English query to search for")
    iso_language_code: Optional[str] = Field(..., description="ISO language code to search e.g. en-US, in-IN")

    # --- filters ---
    facets: List[Facet] = Field(..., description="Facets which are used to filter the search results")

    async def execute(self):
        """Print a one-line summary of how the query was decomposed.

        The summary shows the original query, its simplified form and the
        hypothetical-document form. The method is a coroutine, so it must be
        awaited; it returns ``None``.
        """
        # The model declares no `semantic_query` field, so reading it raised
        # AttributeError. `hypothetical_document_query` is the declared field
        # that plays that role (it may be None, which prints as "None").
        print(
            f"Decomposing query `{self.query}` into `{self.simplified_query}` and `{self.hypothetical_document_query}`"
        )

In [27]:
# The TSV is read with explicit column names; header=None spells out what
# passing `names` alone already implies (the first line is treated as data).
df = pd.read_csv(
    "queries_dev.tsv",
    sep="\t",
    header=None,
    names=["qid", "query", "region"],
)
df.head()

Unnamed: 0,qid,query,region
0,1,+www.govictory.com/flashpoint,en-US
1,2,022 orange pill round,"en-US, fr-FR, en-AU, en-US"
2,3,1 – (adaptado enem 2011).um dos processos usad...,pt-BR
3,4,14 day weather forecast chicago,"en-IE, en-US, fr-FR, en-CA, en-GB, en-US"
4,5,15034 appleton port charlotte fl,en-US


In [22]:
def query_annotation(data: str) -> Iterable[QueryAnnotation]:
    """Request annotations for a free-text user query from the chat model.

    Args:
        data: Raw query text; it is interpolated verbatim into the user prompt.

    Returns:
        The result of ``client.chat.completions.create`` called with
        ``response_model=Iterable[QueryAnnotation]``; the notebook iterates
        over it to get ``QueryAnnotation`` objects.
    """
    prompt = f"Consider the user query below: '\n{data}' and annotate the query"
    user_message = {"role": "user", "content": prompt}

    # Fixed temperature and seed are passed on every call, together with a
    # 1000-token cap on the completion.
    annotations = client.chat.completions.create(
        model="gpt-4o-mini",
        response_model=Iterable[QueryAnnotation],
        messages=[user_message],
        max_tokens=1000,
        temperature=0.0,
        seed=42,
    )
    return annotations

# Smoke test: annotate one conversational shopping request and pretty-print
# every annotation that comes back.
for query in query_annotation(
    "I'm a little confused. I'm looking for a red dress shirt for my husband. I'm not sure if I should be looking for a solid color or one with some pattern. What do you think?"
):
    print(query.model_dump_json(indent=2))

{
  "id": "1",
  "query": "red dress shirt for husband",
  "simplified_query": "men's red dress shirt",
  "hypothetical_document_query": "A stylish men's red dress shirt, available in both solid and patterned options, perfect for formal or casual occasions.",
  "type": "web",
  "rag_query_type": "recommendation",
  "keywords": [
    "red",
    "dress shirt",
    "husband",
    "men's clothing"
  ],
  "expanded_keywords": [
    "red dress shirt",
    "men's red dress shirt",
    "red shirt for husband",
    "dress shirt for men",
    "patterned dress shirt"
  ],
  "intent": "transactional",
  "translated_query": "red dress shirt for husband",
  "iso_language_code": "en-US",
  "facets": [
    {
      "name": "color",
      "values": [
        "red"
      ]
    },
    {
      "name": "gender",
      "values": [
        "men"
      ]
    },
    {
      "name": "type",
      "values": [
        "dress shirt"
      ]
    },
    {
      "name": "pattern",
      "values": [
        "solid",
  

In [23]:
# Pull 10 random queries from the df, annotate each one, and display the results
def display_annotated_queries(queries: List[str]):
    """Annotate each query and print every resulting annotation as JSON.

    Args:
        queries: Raw query strings; each one is passed to ``query_annotation``.

    Returns:
        None. Each annotation is printed via ``model_dump_json(indent=2)``,
        in input order.
    """
    for raw_query in queries:
        for annotation in query_annotation(raw_query):
            print(annotation.model_dump_json(indent=2))

# A fixed seed keeps the sampled queries the same across runs.
seed = 37
queries = (
    df["query"]
    .sample(n=10, random_state=seed)
    .tolist()
)
display_annotated_queries(queries)

{
  "id": "1",
  "query": "disfrutar vacaciones tras it",
  "simplified_query": "disfrutar vacaciones",
  "hypothetical_document_query": "¿Cómo disfrutar de unas vacaciones tras un viaje a Italia?",
  "type": "web",
  "rag_query_type": "synthesis",
  "keywords": [
    "disfrutar",
    "vacaciones",
    "tras",
    "it"
  ],
  "expanded_keywords": [
    "disfrutar vacaciones",
    "vacaciones tras it",
    "vacaciones en Italia"
  ],
  "intent": "informational",
  "translated_query": "enjoy vacations after it",
  "iso_language_code": "es-ES",
  "facets": [
    {
      "name": "destination",
      "values": [
        "Italia"
      ]
    },
    {
      "name": "activity",
      "values": [
        "vacaciones",
        "disfrutar"
      ]
    }
  ]
}
{
  "id": "1",
  "query": "is vanity fair biased",
  "simplified_query": "is Vanity Fair biased",
  "hypothetical_document_query": "An analysis of Vanity Fair's editorial stance and potential biases in its reporting.",
  "type": "web",
  "ra