In [3]:
import polars as pl
import numpy as np

In [4]:
import dataclasses
from typing import Dict, Any

@dataclasses.dataclass
class NERAgentConfig:
    instruction: str = """
        # Define your task:
        Extract the most relevant keywords from the following sentence: '{query}'. 
        Focus on important nouns that convey the core meaning. 
        Detect any words related to dates such as tomorrow, today, last week, next year, so on, following the example below.
        Help me convert the date range to the format of YYYY-MM-DD to YYYY-MM-DD.
        
        # For date range, please help me convert it to from_date and to_date in DD/MM/YYYY format following these cases:

        1. If a single date is mentioned (e.g. "day 10"):
           - Use current month and year {current_year} and {current_month}
           - Set both from_date and to_date to that date
           Example: "day 10" -> from_date: 10/{current_month}/{current_year}, to_date: 10/{current_month}/{current_year}

        2. If a date range is specified (e.g. "01/02/2024 to 15/02/2024"):
            - Note that in this case, the date range derived from user's query must be DD/MM/YYYY format
            - Keep the dates as specified in DD/MM/YYYY format
           Example: "01/02/2024 to 15/02/2024" -> from_date: 01/02/2024, to_date: 15/02/2024

        3. If relative dates are mentioned:
           - "today" -> Use {current_date} for both
           - "yesterday" -> Use yesterday's date for both from current date {current_date}
           - "last week" -> from_date is 7 days ago, to_date is today from current date {current_date}
           - "last month" -> from_date is 1st of previous month, to_date is last day of previous month from current date {current_date}
           - "last year" -> from_date is Jan 1st of previous year, to_date is Dec 31st of previous year from current date {current_date}
           - "this week" -> from_date is Monday of current week, to_date is today from current date {current_date}
           - "this month" -> from_date is 1st of current month, to_date is today from current date {current_date}
           - "this year" -> from_date is Jan 1st of current year, to_date is today from current date {current_date}
           
        4. If a month range is specified (e.g. "1/1 to 31/1"):
           - Use current year {current_year}
           - Set from_date to first day of specified month 
           - Set to_date to last day of specified month
           Example: "1/1 to 31/1" in {current_year} -> from_date: 01/01/{current_year}, to_date: 31/01/{current_year}
           
        5. If no date is specified:
           - Set date_range as "N/A"
           - Set both from_date and to_date as "N/A"
           
        If no relevant keywords are detected, return 'All' (except for dates, you must fill 'N/A').
        If the date range is not specified, please return 'N/A' for date_range.
        If the product is not specified, please return 'All' for product.
        If the product detail is not specified, please return 'All' for product_detail.
        If the level is not specified, please return 'All' for level.
        If the user is not specified, please return 'N/A' for user.
        
        Here is the list of product and product detail you should detect:
        {parameter_properties}
    """
    few_shot: str = """
        # Example 1:
        ## User: Get me a Win Loss Detail Report on day 10
        ## Output:
        {{
            "date_range": "day 10",
            "from_date": "10/{current_month}/{current_year}",
            "to_date": "10/{current_month}/{current_year}",
            "product": "All",
            "product_detail": "All",
            "level": "All",
            "user": "N/A"
        }}
        
        Example 2:
        ## User: Get me a Win Loss Detail Report for Direct Member who played Product Detail Sportsbook in Sportsbook Product from 01/02/2024 to 15/02/2024
        ## Output:
        {{
            "date_range": "01/02/2024 to 15/02/2024",
            "from_date": "01/02/2024",
            "to_date": "15/02/2024",
            "product": "Sportsbook",
            "product_detail": "Sportsbook",
            "level": "Direct Member",
            "user": "N/A"
        }}
        
        Example 3:
        ## User: Get me a Win Loss Detail Report for Super Agent who played Product Detail SABA Basketball in SABA Basketball Product from 01/02/2024 to 15/02/2024
        ## Output:
        {{
            "date_range": "01/02/2024 to 15/02/2024",
            "from_date": "01/02/2024",
            "to_date": "15/02/2024",
            "product": "SABA Basketball",
            "product_detail": "SABA Basketball",
            "level": "Super Agent",
            "user": "N/A"
        }}
        
        Example 4:
        ## User: Win/Loss details for Product Sportsbook
        ## Output:
        {{
            "date_range": "N/A",
            "from_date": "N/A",
            "to_date": "N/A",
            "product": "Sportsbook",
            "product_detail": "All",
            "level": "All",
            "user": "N/A"
        }}
    """
    system_prompt: str = """
    You are an AI assistant majoring for Named Entity Recognition trained to extract entity and categorize queries for Winlost Report Detail
    """
    user_prompt: str = """

        User request: {query}
    
        Current date: {current_date}
        Current year: {current_year}
        Current month: {current_month}

        {instruction}

        {few_shot}
    """
    format_schema: Dict[str, Any] = dataclasses.field(default_factory=lambda: {
        "type": "object",
        "properties": {
            "date_range": {"type": "string"},
            "from_date": {"type": "string"},
            "to_date": {"type": "string"},
            "product": {"type": "string"},
            "product_detail": {"type": "string"},
            "level": {"type": "string"},
            "user": {"type": "string"}
        },
        "required": ["date_range", "from_date", "to_date", "product", "product_detail", "level", "user"]
    })

In [6]:
current_date = '2525'
current_year = '2525'   
current_month = '2525'
query = 'Get me a Win Loss Detail Report on day 10'
parameter_properties = 'All'
user_prompt = NERAgentConfig().user_prompt.format(
    query=query, 
    current_date=current_date,
    current_year=current_year,
    current_month=current_month,
    instruction=NERAgentConfig().instruction.format(
        query=query,
        current_date=current_date,
        current_year=current_year,
        current_month=current_month,
        parameter_properties=parameter_properties
    ), 
    few_shot=NERAgentConfig().few_shot.format(
        current_month=current_month,
        current_year=current_year,
    )
)

In [6]:
data = pl.read_csv("./output_hybrib_search.csv", encoding='iso-8859-1', use_pyarrow=True)

In [42]:
from ollama import chat
from pydantic import BaseModel
from typing import List
import tqdm

class Verify(BaseModel):
    matching: int

verify = []

for i in tqdm.tqdm(range(data.shape[0]), colour='green'):
    
    query = data['Question'][i]
    ground_truth = data['Sample'][i]
    hybrid_search = data['AnswerbyHybribsearch'][i]
    
    
    context = f''' 
        I have a question: {query}

        This is the correct answer: {ground_truth}

        This is the predicted answer: {hybrid_search}

        Please help me verify whether the correct answer and the predicted answer match based on the keywords in both type of answers.

        If they match, return 1; otherwise, return 0.
    '''

    response = chat(
        messages=[
            {
                'role': 'user',
                'content': context,
            }
        ],
        model='llama3.2:latest',
        format=Verify.model_json_schema(),
    )

    result = Verify.model_validate_json(response.message.content)
    verify.append(result.matching)
    
    df_results = pl.DataFrame({
        'query': data['Question'][:i+1],
        'ground_truth': data['Sample'][:i+1],
        'predict_from_hybrid_search': data['AnswerbyHybribsearch'][:i+1],
        'score_matching': verify
    })

    
    df_results.write_csv("./hani_hybrid_verified.csv")


100%|[32m██████████[0m| 1744/1744 [1:15:22<00:00,  2.59s/it]


In [46]:
df_results.filter(
    pl.col('score_matching').is_in([0, 1])
)['score_matching'].value_counts()

score_matching,count
i64,u32
1,457
0,1024


In [26]:
import requests
import json

def extract_keyword_by_llama32(query:str):
  
  
  url = 'http://localhost:11434/api/chat'

  context = f'''
  Extract the most relevant keywords from the following sentence: '{query}'. 
  Focus on important nouns, verbs, and adjectives that convey the core meaning. 
  Additionally, categorize the query into a relevant topic or section (e.g., Technology, Business, Health, Education, etc.).
  For each extracted keyword, provide a list of synonyms that accurately reflect its meaning in the given context. 
  Ensure that the synonyms include common alternatives, domain-specific terms (if applicable), 
  and variations in different registers (formal, informal, technical, etc.).
  '''

  headers = {
      "Content-Type": "application/json"
  }

  messages = [
      {"role": "system", "content": "You are an AI assistant trained to extract keywords and categorize queries."},
      {"role": "user", "content": context}
  ]

  model='llama3.2:latest'

  format={
      "type": "object",
      "properties": {
        "keywords": {
          "type": "array",
          'items': {
            'type': 'string'
          }
        }
      },
      "required": [
        "keywords"
      ]
    }

  data = {
      "model": model,
      "messages": messages,
      "format": format,
      "stream": False
  }

  response = requests.post(url, headers=headers, data=json.dumps(data))

  if response.status_code == 200:
      try:
          print(response.json())
      except json.JSONDecodeError:
          print("Invalid JSON response:", response.text)
  else:
      print(f"Error: {response.status_code}, {response.text}")
      
  keywords = json.loads(response.json()['message']['content'])['keywords']
  
  return keywords


In [27]:
t = extract_keyword_by_llama32(query = 'How to become a software engineer')

{'model': 'llama3.2:latest', 'created_at': '2025-03-03T07:10:19.530484Z', 'message': {'role': 'assistant', 'content': '{ "keywords": ["software engineer", "become", "How to"] }'}, 'done_reason': 'stop', 'done': True, 'total_duration': 921803200, 'load_duration': 19863300, 'prompt_eval_count': 162, 'prompt_eval_duration': 43000000, 'eval_count': 18, 'eval_duration': 857000000}


In [29]:
t

['software engineer', 'become', 'How to']