# Hotpot QA

In this task, the LLM must answer a question given a pre-configured context. The answer usually has to be concise, and accuracy is measured by calculating the token overlap (F1) and the exact match between the predicted answer and the ground-truth answer.

Refer to https://hotpotqa.github.io/ for more details on the dataset.

### Data Preparation

In [11]:
# Download Hotpot Data
!curl -O http://curtis.ml.cmu.edu/datasets/hotpot/hotpot_dev_distractor_v1.json
!curl -O http://curtis.ml.cmu.edu/datasets/hotpot/hotpot_dev_fullwiki_v1.json
!curl -O http://curtis.ml.cmu.edu/datasets/hotpot/hotpot_train_v1.1.json

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 44.1M  100 44.1M    0     0  8831k      0  0:00:05  0:00:05 --:--:-- 9083k0:00:05 5574k
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 45.2M  100 45.2M    0     0  8746k      0  0:00:05  0:00:05 --:--:-- 9948k260k
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  540M  100  540M    0     0  12.5M      0  0:00:43  0:00:43 --:--:-- 14.8M    0  0:00:44  0:00:21  0:00:23 10.9M


In [3]:
import json
from rich.pretty import pprint

# Load both HotpotQA dev splits into memory. The encoding is passed explicitly
# so that decoding does not depend on the platform's default text encoding.
with open('./hotpot_dev_distractor_v1.json', 'r', encoding='utf-8') as f:
    dev_distractor_data = json.load(f)
    # Uncomment to inspect the first example's paragraph titles / full context:
    # pprint([x[0] for x in dev_distractor_data[0]['context']])
    # pprint(dev_distractor_data[0]['context'])


with open('./hotpot_dev_fullwiki_v1.json', 'r', encoding='utf-8') as f:
    dev_fullwiki_data = json.load(f)
    # Uncomment to inspect the first example's paragraph titles / full context:
    # pprint([x[0] for x in dev_fullwiki_data[0]['context']])
    # pprint(dev_fullwiki_data[0]['context'])

In [2]:
# Report how many examples each dev split holds (full-wiki first, then distractor).
print(*map(len, (dev_fullwiki_data, dev_distractor_data)))

7405 7405


### Llama Stack Initialization

In [4]:
from llama_stack_client import LlamaStackClient

# Client used for all inference and agent calls below; the URL points at a
# server on localhost, port 8321.
client = LlamaStackClient(base_url="http://localhost:8321")

### Vanilla LLM

Here, we use the vanilla LLM with no supplied context to answer the question. 

In [5]:
"""
Utils from https://github.com/hotpotqa/hotpot/blob/master/hotpot_evaluate_v1.py
"""

import re
import string
from typing import Tuple
from collections import Counter

def normalize_answer(s: str) -> str:
    """Canonicalize an answer string before it is scored.

    Steps, in order: lowercase the text; drop every character found in
    ``string.punctuation``; replace the standalone words "a", "an" and "the"
    with a space; collapse whitespace runs to single spaces and trim the ends.
    """
    lowered = s.lower()

    punctuation = set(string.punctuation)
    without_punctuation = "".join(
        char for char in lowered if char not in punctuation
    )

    # Articles are replaced by a space (not removed) so neighbouring words
    # never get glued together; the final split/join cleans up the gaps.
    without_articles = re.sub(r"\b(a|an|the)\b", " ", without_punctuation)

    return " ".join(without_articles.split())


def f1_score(prediction: str, ground_truth: str) -> Tuple[float, float, float]:
    """Score a prediction against the reference answer by token overlap.

    Both strings are passed through ``normalize_answer`` first.

    Returns:
        An ``(f1, precision, recall)`` tuple. The result is ``(0, 0, 0)`` when
        a "yes"/"no"/"noanswer" string on one side does not occur in the other
        side, or when the two answers share no tokens.
    """
    pred_norm = normalize_answer(prediction)
    truth_norm = normalize_answer(ground_truth)

    no_score = (0, 0, 0)
    special_answers = ["yes", "no", "noanswer"]

    # A yes/no/noanswer string must occur (as a substring) in the other side;
    # otherwise the pair scores zero without any token comparison.
    if pred_norm in special_answers and pred_norm not in truth_norm:
        return no_score
    if truth_norm in special_answers and truth_norm not in pred_norm:
        return no_score

    pred_tokens = pred_norm.split()
    truth_tokens = truth_norm.split()

    # Multiset intersection: a repeated token counts only as often as it
    # appears on both sides.
    shared = Counter(pred_tokens) & Counter(truth_tokens)
    overlap = sum(shared.values())
    if overlap == 0:
        return no_score

    precision = 1.0 * overlap / len(pred_tokens)
    recall = 1.0 * overlap / len(truth_tokens)
    harmonic_mean = (2 * precision * recall) / (precision + recall)
    return harmonic_mean, precision, recall


def exact_match_score(prediction: str, ground_truth: str) -> bool:
    """Return True when both answers are identical after ``normalize_answer``."""
    predicted = normalize_answer(prediction)
    expected = normalize_answer(ground_truth)
    return predicted == expected

In [6]:
from llama_stack_client.lib.agents.agent import Agent
from llama_stack_client.lib.agents.react.agent import ReActAgent
from llama_stack_client.lib.agents.client_tool import ClientTool
import uuid

def evaluate_hotpotqa(
    model_id: str = "",
    agent: Agent = None,
    query_template: str = "{question}",
    show_result: bool = True,
    num_examples: int = 5,
    retrieval_tool: ClientTool = None,
):
    """Answer the first examples of ``dev_distractor_data`` and print EM / F1.

    Each question is rendered through ``query_template``, answered either by a
    plain chat completion (``model_id``) or by an agent turn (``agent``), and
    scored against the example's ``'answer'`` with ``f1_score`` and
    ``exact_match_score``. The averaged scores are printed at the end; nothing
    is returned.

    Args:
        model_id: When non-empty, answers come from
            ``client.inference.chat_completion`` with this model. Takes
            precedence over ``agent`` when both are given.
        agent: Used only when ``model_id`` is empty. A new session is created
            for every question.
        query_template: ``str.format`` template. It receives ``question`` and,
            only on the ``model_id`` path with a ``retrieval_tool``, also
            ``context``.
        show_result: Print the question, both answers and the per-example
            metrics.
        num_examples: Upper bound on how many examples are taken from the start
            of ``dev_distractor_data``.
        retrieval_tool: Optional tool whose ``run_impl`` is called with the raw
            question to obtain ``context``. Ignored on the ``agent`` path.

    Raises:
        ValueError: If neither ``model_id`` nor ``agent`` is provided.
    """
    # Fail before any work is done rather than on the first loop iteration.
    if not model_id and agent is None:
        raise ValueError("No model_id or agent provided")

    examples = dev_distractor_data[:num_examples]
    # Pulls the final answer out of the ReAct agent's JSON-formatted output.
    react_answer_pattern = re.compile(r'"answer":\s*"([^"]*)"')

    scores = {"exact_match": 0.0, "f1": 0.0}
    for query in examples:
        expected_answer = query['answer']

        if model_id:
            template_fields = {"question": query['question']}
            if retrieval_tool is not None:
                # The context is fetched with the untouched dataset question.
                template_fields["context"] = retrieval_tool.run_impl(query['question'])
            question = query_template.format(**template_fields)
            response = client.inference.chat_completion(
                model_id=model_id,
                messages=[
                    {"role": "user", "content": question}
                ],
                stream=False
            )
            generated_answer = response.completion_message.content
        else:
            question = query_template.format(question=query['question'])
            session_id = agent.create_session(f"test-session-{uuid.uuid4().hex}")
            response = agent.create_turn(
                messages=[
                    {"role": "user", "content": question}
                ],
                session_id=session_id,
                stream=False
            )
            if isinstance(agent, ReActAgent):
                answer_match = react_answer_pattern.search(response.output_message.content)
                generated_answer = answer_match.group(1) if answer_match else "No answer found"
            else:
                generated_answer = response.output_message.content

        f1, precision, recall = f1_score(generated_answer, expected_answer)
        exact_match = exact_match_score(generated_answer, expected_answer)

        if show_result:
            print("Question: ", question)
            print("Generated Answer: ", generated_answer)
            print("Expected Answer: ", expected_answer)
            print(f"F1: {f1}, Precision: {precision}, Recall: {recall}, Exact Match: {exact_match}")
            print("-"*100)

        scores["exact_match"] += exact_match
        scores["f1"] += f1

    # Average over the examples actually evaluated: the slice can be shorter
    # than num_examples, and may be empty (in which case the zeros are kept).
    evaluated = len(examples)
    if evaluated:
        for score in scores:
            scores[score] /= evaluated

    print("Scores: ", scores)

In [7]:
# Baseline run: the model answers each question without any supplied context.
model_id = "meta-llama/Llama-3.1-8B-Instruct"
# The suffix after {question} asks the model for a terse answer.
query_template = "{question} Give a short factoid answer (as few words as possible)."
show_result = True

evaluate_hotpotqa(
    model_id=model_id,
    query_template=query_template,
    show_result=show_result,
    num_examples=5,
)

Question:  Were Scott Derrickson and Ed Wood of the same nationality? Give a short factoid answer (as few words as possible).
Generated Answer:  No, they were not.
Expected Answer:  yes
F1: 0, Precision: 0, Recall: 0, Exact Match: False
----------------------------------------------------------------------------------------------------
Question:  What government position was held by the woman who portrayed Corliss Archer in the film Kiss and Tell? Give a short factoid answer (as few words as possible).
Generated Answer:  Loretta Young was a U.S. Delegate to the United Nations.
Expected Answer:  Chief of Protocol
F1: 0, Precision: 0, Recall: 0, Exact Match: False
----------------------------------------------------------------------------------------------------
Question:  What science fantasy young adult series, told in first person, has a set of companion books narrating the stories of enslaved worlds and alien species? Give a short factoid answer (as few words as possible).
Generated

### (+ RAG) ReAct

- The agent will have access to a retrieval tool that can provide context for the question. 

- Note that EM and F1 improve compared with the vanilla LLM.

In [40]:
from llama_stack_client.lib.agents.client_tool import ClientTool
from llama_stack_client.types.tool_def_param import Parameter
from typing import List, Dict

class HotpotQAMockRetrievalTool(ClientTool):
    """Mock retrieval tool backed by a HotpotQA-style dataset.

    The tool is a plain lookup table from each example's ``'question'`` string
    to that example's ``'context'`` entry; no search or ranking is performed,
    so only a verbatim dataset question yields a context.
    """

    def __init__(self, dataset: List[Dict]):
        """Index ``dataset`` by question.

        Args:
            dataset: Examples, each a dict with ``'question'`` and
                ``'context'`` keys.
        """
        super().__init__()
        self.dataset = dataset
        self._queries2context = {x['question']: x['context'] for x in self.dataset}

    def get_name(self) -> str:
        """Return the name under which the tool is exposed."""
        return "retrieve_context"

    def get_description(self) -> str:
        """Return the human-readable description of the tool."""
        return "Provides context about the query to better answer the question."

    def get_params_definition(self) -> Dict[str, Parameter]:
        """Describe the single required string parameter, ``query``."""
        return {
            "query": Parameter(
                name="query",
                parameter_type="str",
                description="The query to retrieve context for.",
                required=True,
            )
        }

    def run_impl(self, query: str) -> str:
        """Return the stored context for ``query`` as a string.

        Args:
            query: Must equal a dataset question exactly to produce a hit.

        Returns:
            ``str()`` of the stored context on a hit; otherwise a short message
            stating that nothing was found, instead of raising ``KeyError``.
        """
        print("Running Mock Retrieval Tool on query: ", query)
        try:
            context = self._queries2context[query]
        except KeyError:
            # Rephrased queries never match the table; report the miss in the
            # tool result so the caller can see why no context came back.
            return (
                "No context found for this query. "
                "The query must match a dataset question exactly."
            )
        return str(context)

In [41]:
from llama_stack_client.lib.agents.react.agent import ReActAgent

# Serve the dev-distractor contexts through the mock retrieval tool.
retrieval_tool = HotpotQAMockRetrievalTool(dev_distractor_data)

# ReAct agent with the retrieval tool registered as a client tool.
agent = ReActAgent(client=client, model=model_id, client_tools=[retrieval_tool])

evaluate_hotpotqa(
    agent=agent,
    query_template=query_template,
    show_result=show_result,
    num_examples=5,
)

Error parsing action: 1 validation error for ReActOutput
  Invalid JSON: trailing characters at line 9 column 1 [type=json_invalid, input_value='{\n    "thought": "I wil...n    "answer": "Yes"\n}', input_type=str]
    For further information visit https://errors.pydantic.dev/2.10/v/json_invalid
Question:  Were Scott Derrickson and Ed Wood of the same nationality? Give a short factoid answer (as few words as possible).
Generated Answer:  Yes
Expected Answer:  yes
F1: 1.0, Precision: 1.0, Recall: 1.0, Exact Match: True
----------------------------------------------------------------------------------------------------
Running Mock Retrieval Tool on query:  Corliss Archer film Kiss and Tell government position
Running Mock Retrieval Tool on query:  Corliss Archer actress government position
Running Mock Retrieval Tool on query:  Corliss Archer film actress government position
Running Mock Retrieval Tool on query:  actress who played Corliss Archer in film Kiss and Tell government position

### (+RAG) Force Retrieval

- Notice that the agent sometimes does not call the retrieval tool at all, and when it does, its query is often not an exact match for a question stored in the mock retrieval tool.
- Because the mock retrieval tool can only return the context for a verbatim dataset question, we can test an alternative approach: call the retrieval tool ourselves with the original question, so that the correct context is always retrieved.

- Note that F1 improved from 0.35 to 0.46 compared with the ReAct agent.

In [48]:
# Same instruction as before, plus a {context} slot for the retrieved context.
query_template = "{question} Give a short factoid answer (as few words as possible). Use the following context to answer the question: {context}"

evaluate_hotpotqa(
    model_id=model_id,
    retrieval_tool=retrieval_tool,
    query_template=query_template,
    show_result=show_result,
    num_examples=5,
)

Running Mock Retrieval Tool on query:  Were Scott Derrickson and Ed Wood of the same nationality?
Question:  Were Scott Derrickson and Ed Wood of the same nationality? Give a short factoid answer (as few words as possible). Use the following context to answer the question: [['Ed Wood (film)', ['Ed Wood is a 1994 American biographical period comedy-drama film directed and produced by Tim Burton, and starring Johnny Depp as cult filmmaker Ed Wood.', " The film concerns the period in Wood's life when he made his best-known films as well as his relationship with actor Bela Lugosi, played by Martin Landau.", ' Sarah Jessica Parker, Patricia Arquette, Jeffrey Jones, Lisa Marie, and Bill Murray are among the supporting cast.']], ['Scott Derrickson', ['Scott Derrickson (born July 16, 1966) is an American director, screenwriter and producer.', ' He lives in Los Angeles, California.', ' He is best known for directing horror films such as "Sinister", "The Exorcism of Emily Rose", and "Deliver Us 

### (+RAG) MockRetrieval -> WebSearch

- Now, the mock retrieval tool will return noisy distractor context. 
- Let's try an approach where we use websearch to get the correct context. 

In [10]:
from llama_stack_client.types.agent_create_params import AgentConfig
# Back to the context-free template: this agent is instructed to search the web.
query_template = "{question} Give a short factoid answer (as few words as possible)."

# Agent configuration with the built-in web search tool group and no shields.
agent_config = AgentConfig(
    model=model_id,
    instructions="You are a helpful assistant that can answer questions by searching the web. Please use the websearch tool to answer the question.",
    sampling_params={"strategy": {"type": "top_p", "temperature": 1.0, "top_p": 0.9}},
    toolgroups=["builtin::websearch"],
    tool_choice="auto",
    tool_prompt_format="json",
    input_shields=[],
    output_shields=[],
    enable_session_persistence=False,
)

agent = Agent(client=client, agent_config=agent_config)

evaluate_hotpotqa(
    agent=agent,
    query_template=query_template,
    show_result=show_result,
    num_examples=5,
)

Question:  Were Scott Derrickson and Ed Wood of the same nationality? Give a short factoid answer (as few words as possible).
Generated Answer:  Yes, Scott Derrickson and Ed Wood were of the same nationality. They were both American filmmakers.
Expected Answer:  yes
F1: 0.125, Precision: 0.06666666666666667, Recall: 1.0, Exact Match: False
----------------------------------------------------------------------------------------------------
Question:  What government position was held by the woman who portrayed Corliss Archer in the film Kiss and Tell? Give a short factoid answer (as few words as possible).
Generated Answer:  The woman who portrayed Corliss Archer in the film "Kiss and Tell" held the government position of Chief of Protocol.
Expected Answer:  Chief of Protocol
F1: 0.3, Precision: 0.17647058823529413, Recall: 1.0, Exact Match: False
----------------------------------------------------------------------------------------------------
Question:  What science fantasy young ad

### (+RAG) Reranking on retrieved context

- Try adding a reranking step on top of the retrieved context.