In [1]:
import instructor
from openai import OpenAI
from typing import Iterable, Literal, List, Optional, Tuple
from pydantic import BaseModel, Field 
import pandas as pd

In [23]:
# Wrap the OpenAI client with instructor so that chat-completion calls
# accept the `response_model` keyword used by the annotation helpers.
client = instructor.from_openai(
    OpenAI(),
    mode=instructor.Mode.TOOLS,
)

# class Facet(BaseModel):
#     """
#     Facet is a search filter e.g. color, size, price, etc.
#     """
#     name: str = Field(..., description="Common filters in search e.g. color, size, price, gender, brand, fit, cut,datetime, season, pattern, material, dimension, etc.")
#     values: List[str] = Field(..., description="Values of the facet")

# Structured annotation requested from the LLM for a single user query.
#
# NOTE: documented with `#` comments rather than a class docstring on purpose:
# pydantic copies a model's docstring into the generated JSON schema
# description, which would change the schema that is sent to the LLM.
class QueryAnnotation(BaseModel):
    # id: str = Field(..., description="Unique Query ID")

    # --- Query text -------------------------------------------------------
    query: str = Field(..., description="Query to search for relevant content")
    simplified_query: str = Field(..., description="Simplified query")
    keywords: List[str] = Field(..., description="Keywords to search for")

    # --- Nullable fields --------------------------------------------------
    # `Optional[...]` combined with `...` means the key is still REQUIRED in
    # the payload, but its value is allowed to be null.
    translated_query: Optional[str] = Field(..., description="English query to search for")
    iso_language_code: Optional[str] = Field(..., description="ISO language code to search e.g. en-US, in-IN")
    duration: Optional[str] = Field(..., description="Duration of the instrument")
    amount: Optional[str] = Field(..., description="Amount of the instrument")

    # --- Closed-vocabulary labels -----------------------------------------
    intent: Literal["informational", "navigational", "transactional"] = Field(..., description="Intent of the user")
    # NOTE(review): the dataset's "product" column spells two of these labels
    # "Xpressway" and "FASTtag"; normalise case before comparing predictions
    # against dataset labels -- confirm which spelling is canonical.
    product_category: Literal["GIGA", "FixedDeposit", "HomeLoan", "CreditCard", "PersonalLoan", "SavingsAccount", "CarLoan", "XpressWay", "Fasttag"] = Field(..., description="Category of the product")
    escalation_matrix: Literal["self_serve", "support_agent", "branch", "branch_manager", "regional_team", "compliance", "fraud", "it_team"] = Field(..., description="Escalation matrix for the product")

    async def execute(self):
        """Print a one-line summary of how the query was decomposed.

        The message shows the raw query, its simplified form and the
        extracted keywords. Returns None.
        """
        # Only declared fields are referenced here: the model has no
        # `semantic_query` field, so reading it would raise AttributeError.
        print(
            f"Decomposing query `{self.query}` into `{self.simplified_query}` and `{self.keywords}`"
        )

In [24]:
# Load the line-delimited JSON file of queries (one JSON record per line),
# then show how many rows there are per value of the "product" column.
df = pd.read_json("hdfc_services_queries.jsonl", lines=True)
df["product"].value_counts()

product
GIGA              11
FixedDeposit      11
HomeLoan          11
CreditCard        11
PersonalLoan      11
SavingsAccount    11
CarLoan           11
Xpressway         11
FASTtag           11
Name: count, dtype: int64

In [25]:
def query_annotation(data: str) -> Iterable[QueryAnnotation]:
    """Ask the model to annotate a free-text user query.

    Args:
        data: The user query text; it is interpolated into the prompt.

    Returns:
        The result of `client.chat.completions.create` called with
        `response_model=Iterable[QueryAnnotation]`.
    """
    prompt = f"Consider the user query below: '\n{data}' and annotate the query"
    user_message = {"role": "user", "content": prompt}

    # temperature=0.0 together with a fixed seed is used on every call.
    return client.chat.completions.create(
        model="gpt-4o",
        response_model=Iterable[QueryAnnotation],
        messages=[user_message],
        max_tokens=1000,
        temperature=0.0,
        seed=42,
    )

def image_query_annotation(url: str) -> Iterable[QueryAnnotation]:
    """Ask the model to annotate the query shown in an image.

    Args:
        url: Value placed in the `image_url` part of the user message.

    Returns:
        The result of `client.chat.completions.create` called with
        `response_model=Iterable[QueryAnnotation]`.
    """
    # The user message has two content parts: the image first, then the
    # text instruction.
    image_part = {"type": "image_url", "image_url": {"url": url}}
    instruction_part = {
        "type": "text",
        "text": "Analyze the image and annotate the query",
    }
    user_message = {"role": "user", "content": [image_part, instruction_part]}

    return client.chat.completions.create(
        model="gpt-4o",
        response_model=Iterable[QueryAnnotation],
        messages=[user_message],
        max_tokens=1000,
        temperature=0.0,
        seed=42,
    )

# Smoke test: annotate one example query and pretty-print every returned
# annotation as indented JSON.
for query in query_annotation("what's the status of my loan application?"):
    print(query.model_dump_json(indent=2))

{
  "query": "what's the status of my loan application?",
  "simplified_query": "status of loan application",
  "keywords": [
    "status",
    "loan application"
  ],
  "translated_query": "what's the status of my loan application?",
  "iso_language_code": "en-US",
  "duration": null,
  "amount": null,
  "intent": "informational",
  "product_category": "HomeLoan",
  "escalation_matrix": "support_agent"
}


In [26]:
# Helper that prints each query next to the annotation(s) returned for it.
def display_annotated_queries(queries: List[str]):
    """Annotate every query and print the query with each annotation.

    Args:
        queries: Query strings; each is passed to `query_annotation`.

    For every annotation returned, prints the original query, the annotation
    as indented JSON, and a blank separator. Returns None.
    """
    for raw_query in queries:
        for annotation in query_annotation(raw_query):
            print(f"Query: {raw_query}")
            print(f"Annotated Query: {annotation.model_dump_json(indent=2)}")
            print("\n")

# Seed passed as `random_state` below so the sample can be reproduced.
seed = 69
# Draw 10 random entries from the "query" column and print their annotations.
queries = df["query"].sample(10, random_state=seed).tolist()
display_annotated_queries(queries)

Query: Balloon payment in car loans
Annotated Query: {
  "query": "Balloon payment in car loans",
  "simplified_query": "Balloon payment in car loans",
  "keywords": [
    "Balloon payment",
    "car loans"
  ],
  "translated_query": null,
  "iso_language_code": null,
  "duration": null,
  "amount": null,
  "intent": "informational",
  "product_category": "CarLoan",
  "escalation_matrix": "self_serve"
}


Query: Joint home loan tax benefits
Annotated Query: {
  "query": "Joint home loan tax benefits",
  "simplified_query": "joint home loan tax benefits",
  "keywords": [
    "joint",
    "home loan",
    "tax benefits"
  ],
  "translated_query": null,
  "iso_language_code": null,
  "duration": null,
  "amount": null,
  "intent": "informational",
  "product_category": "HomeLoan",
  "escalation_matrix": "self_serve"
}


Query: FD vs mutual funds
Annotated Query: {
  "query": "FD vs mutual funds",
  "simplified_query": "FD vs mutual funds",
  "keywords": [
    "FD",
    "mutual funds"
  ],