# Evaluation for Assistant API

## Introduction

The dataset used is the well-known `hotpotqa`, which is commonly used to evaluate question answering (QA) and context understanding.

This notebook has the following goals:

1. Investigate the performance of an open-source solution that uses `mixtral-8x7b` as the model and `LLMCompiler` as the function-calling strategy.
2. Compare the above solution with the official OpenAI Assistants API (with gpt-3.5-turbo).


In [None]:
import time
!pip install datasets numpy langchain

In [None]:
%reload_ext autoreload
%autoreload 2

## Prepare dataset

Only hard-level questions in the [validation split](https://huggingface.co/datasets/scholarly-shadows-syndicate/hotpotqa_with_qa_gpt35/viewer/default/validation) are used in this notebook.

In [None]:
from datasets import load_dataset

# Load the dataset by its Hugging Face Hub name; the result is indexed by
# split name (see the "validation" lookup below and in later cells).
dataset = load_dataset("scholarly-shadows-syndicate/hotpotqa_with_qa_gpt35")
# Last expression of the cell: shows the first validation record so its
# fields can be inspected.
dataset["validation"][0]

In [None]:
from openai import OpenAI
from langchain.utilities.tavily_search import TavilySearchAPIWrapper
from langchain.tools.tavily_search import TavilySearchResults
from langchain_core.utils.function_calling import convert_to_openai_function
import json
import os
import re
import string
import logging
import numpy as np


def normalize_answer(s):
    """Return a canonical form of ``s`` for lenient answer comparison.

    The steps are applied in this order:

    1. lowercase the text;
    2. drop every character found in ``string.punctuation``;
    3. replace the standalone words "a", "an" and "the" with a space;
    4. collapse runs of whitespace and strip both ends.
    """
    punctuation = set(string.punctuation)

    lowered = s.lower()
    without_punctuation = "".join(
        char for char in lowered if char not in punctuation
    )
    without_articles = re.sub(r"\b(a|an|the)\b", " ", without_punctuation)
    return " ".join(without_articles.split())


def is_number(s):
    """Return True when ``float(s)`` succeeds, False otherwise.

    Besides unparsable strings (``ValueError``), values that ``float`` cannot
    accept at all, such as ``None`` or a list (``TypeError``), are reported as
    "not a number" instead of raising.
    """
    try:
        float(s)
    except (TypeError, ValueError):
        return False
    return True


def compare_answer(answer: str, label: str):
    """Compare the answer (from Agent) and label (GT).

    Label can be either a string or a number.
    If label is a finite number, the answer must also parse as a number and
    lie within a 10% margin of the label (inclusive).
    Otherwise, we do the best-effort string matching on normalized text.

    Args:
        answer: The agent's answer; ``None`` never matches.
        label: The ground-truth answer.

    Returns:
        True when the answer is considered equal to the label, else False.
    """
    import math  # function-scope import: only needed for the finiteness check

    if answer is None:
        return False

    # see if label is a number, e.g. "1.0" or "1"
    if is_number(label):
        expected = float(label)
        # "inf"/"infinity"/"nan" parse as floats but cannot be compared with a
        # margin; treat such labels as plain text (e.g. a title "Infinity").
        if math.isfinite(expected):
            # try cast answer to float and return false if it fails
            try:
                actual = float(answer)
            except (TypeError, ValueError):
                return False
            # Symmetric 10% margin: unlike `label * 0.9 < answer < label * 1.1`
            # this also works for negative labels and for a label of 0.
            return abs(actual - expected) <= 0.1 * abs(expected)

    return normalize_answer(answer) == normalize_answer(label)


class BenchmarkRunner:
    """Run the hard-level validation questions through an Assistants-style API.

    For every hard question in ``dataset["validation"]`` (``dataset`` is the
    module-level object loaded earlier in this file) the runner creates a
    thread, polls the run until it finishes, executes Tavily search tool calls
    requested by the run, and records the final answer and latency. Results
    are persisted to ``output_file_path`` after each question.
    """

    # Kept at class level for backward compatibility; every instance gets its
    # own list in __init__ so that runners do not share history.
    thread_history = []
    logger = logging.getLogger("BenchmarkRunner")

    def __init__(
        self,
        client: OpenAI,
        output_file_path: str = "output/hotqa_result.json",
        model: "str | None" = None,
    ):
        """Create the assistant and load previously saved results, if any.

        Args:
            client: OpenAI-compatible client used for all API calls.
            output_file_path: JSON file that results are read from and
                written to.
            model: Optional model name forwarded to assistant creation. It is
                omitted from the request when ``None`` (the original
                behaviour). NOTE(review): the official OpenAI API requires a
                model here - confirm what each backend expects.
        """
        super().__init__()
        self.client = client
        self.output_file_path = output_file_path
        self.thread_history = []
        self.tavily_tool = TavilySearchResults(api_wrapper=TavilySearchAPIWrapper())

        assistant_kwargs = {
            "name": "benchmark-runner",
            # Assistants tools are typed objects; a bare function schema has
            # no "type" field, so wrap it as a function tool.
            "tools": [
                {
                    "type": "function",
                    "function": convert_to_openai_function(self.tavily_tool),
                }
            ],
        }
        if model is not None:
            assistant_kwargs["model"] = model
        self.assistant = client.beta.assistants.create(**assistant_kwargs)

        # NOTE(review): earlier results are kept and new ones are appended, so
        # re-running adds duplicate ids - confirm whether that is intended.
        if os.path.exists(output_file_path):
            with open(output_file_path, "r", encoding="utf-8") as result_file:
                self.result = json.load(result_file)
        else:
            self.result = []

    def run(self):
        """Evaluate every hard-level validation question and save the results."""
        validation = dataset["validation"]
        self.logger.info(f"run started, validation set size {len(validation)}")
        for item in validation:
            # Only hard-level questions are part of this benchmark.
            if item["level"] != "hard":
                continue

            # Start timing before the run is created so that "rt" covers the
            # whole question, not just the last polling iteration.
            started_at = time.time()
            run = self.client.beta.threads.create_and_run(
                assistant_id=self.assistant.id,
                thread={
                    "messages": [
                        {"role": "user", "content": item["question"]}
                    ]
                },
                stream=False)

            self.thread_history.append(run.thread_id)
            result_item = {
                "ok": False,
                "answer": "",
                "truth": item["answer"],
                "id": item["id"],
                "rt": 0
            }
            self._wait_for_run(run, result_item)

            result_item["rt"] = time.time() - started_at
            self.result.append(result_item)
            self.logger.info(f"id={result_item['id']}, ok={result_item['ok']}")

            # write down the result
            self._save_results()

    def _wait_for_run(self, run, result_item):
        """Poll ``run`` until it stops, filling ``result_item`` on success."""
        while True:
            run = self.client.beta.threads.runs.retrieve(thread_id=run.thread_id, run_id=run.id)
            if run.status in ("queued", "in_progress"):
                time.sleep(1)
            elif run.status == "requires_action":
                run = self._submit_tool_outputs(run)
                self.logger.info(f"run object after submit: {run.to_json()}")
            elif run.status == "completed":
                # `.data` is the list of messages on the returned page; the
                # page object itself is not indexable.
                messages = self.client.beta.threads.messages.list(thread_id=run.thread_id, order="asc").data
                result_item["ok"] = True
                if messages:
                    result_item["answer"] = self._message_text(messages[-1])
                self.logger.info("begin printing trajectory =============================")
                for message in messages:
                    self.logger.info(f"{message.role}: {self._message_text(message)}")
                self.logger.info("finish printing trajectory =============================")
                return
            else:
                self.logger.error(f"run is in other terminal status: {run.to_json()}")
                return

    def _submit_tool_outputs(self, run):
        """Execute the tool calls requested by ``run`` and submit their outputs."""
        tool_messages = []
        for call in run.required_action.submit_tool_outputs.tool_calls:
            self.logger.info(f"got tool call: {call.to_json()}")
            if call.type == "function" and call.function.name == "tavily_search_results_json":
                # Arguments arrive as a JSON string; decode them so the tool
                # receives named arguments. Fall back to the raw string when
                # the model produced invalid JSON.
                try:
                    tool_input = json.loads(call.function.arguments)
                except (TypeError, json.JSONDecodeError):
                    tool_input = call.function.arguments
                tool_result = self.tavily_tool.invoke(tool_input)
                # Tool outputs are submitted as strings.
                output = tool_result if isinstance(tool_result, str) else json.dumps(tool_result)
            else:
                self.logger.error(f"Unknown tool call occurred, function name {call.function.name}")
                # Still answer the call so that every requested tool call has
                # an output in the single submission below.
                output = "Error: unknown tool call"
            tool_messages.append({"tool_call_id": call.id, "output": output})
        return self.client.beta.threads.runs.submit_tool_outputs(thread_id=run.thread_id, run_id=run.id, tool_outputs=tool_messages)

    @staticmethod
    def _message_text(message):
        """Return the first text part of ``message``, or "" when it has none."""
        for part in message.content or []:
            text = getattr(part, "text", None)
            if text is not None:
                return text.value
        return ""

    def _save_results(self):
        """Write all collected results to ``output_file_path`` as JSON."""
        directory = os.path.dirname(self.output_file_path)
        if directory:
            os.makedirs(directory, exist_ok=True)
        # json.dump writes str, so the file must be opened in text mode.
        with open(self.output_file_path, "w", encoding="utf-8") as output_json:
            json.dump(self.result, output_json)

    def get_metrics(self):
        """Compute and log metrics from the saved results file.

        Returns:
            A tuple ``(success_rate, acc, rt_avg, rt_std)``: the share of runs
            that completed, the share of answers matching the ground truth
            according to ``compare_answer``, and the mean and standard
            deviation of the recorded latencies in seconds.
        """
        with open(self.output_file_path, "r", encoding="utf-8") as result_file:
            result = json.load(result_file)

        latencies = [item["rt"] for item in result]
        acc = np.average([compare_answer(item["answer"], item["truth"]) for item in result])
        rt_avg = np.average(latencies)
        rt_std = np.std(latencies)
        success_rate = np.average([1 if item["ok"] else 0 for item in result])

        self.logger.info(f"Success rate: {success_rate}")
        self.logger.info(f"Accuracy: {acc}")
        self.logger.info(f"Latency: {rt_avg} +/- {rt_std}")

        return success_rate, acc, rt_avg, rt_std
            

# Benchmarks


## With `mini-assistant`

Start the mini-assistant server.

* `llm_compiler` is used for agent execution.
* `mixtral-8x7b` is hosted by vLLM. Make sure the `HUGGING_FACE_HUB_TOKEN` environment variable is set for vLLM.

vLLM shell command using docker:

```shell
docker run --runtime nvidia --gpus all \
    -v /workspace/dropbox/huggingface_models:/root/.cache/huggingface \
    --env "HUGGING_FACE_HUB_TOKEN=${HUGGING_FACE_HUB_TOKEN}" \
    -p 8000:8000 \
    --ipc=host \
    vllm/vllm-openai:latest \
    --model TheBloke/Mixtral-8x7B-Instruct-v0.1-GPTQ \
    --quantization marlin \
    --dtype=float16
```

mini-assistant shell command:

```shell
mini-assistant --db_file_path /tmp/assistant_eval.db \
   --file_store_path /tmp/mini-assistant-files \
   --agent-executor-type llm_compiler \
  --model_provider=openai \
  --openai_port=8000 \
  --openai_host=192.168.0.134 \
  --openai_protocol=http \
  --port=9091 \
  --verbose
```

Adjust `--openai_host`, `--openai_port` and `--openai_protocol` to match your own vLLM setup.


Then kick off the benchmark from Python:

In [None]:
# Benchmark the mini-assistant server. Port 9091 matches the `--port` flag of
# the mini-assistant command shown above. Change `True` to `False` to skip
# this cell.
if True:
    client = OpenAI(base_url="http://localhost:9091")
    benchmark_runner = BenchmarkRunner(client=client, output_file_path="output/miniassistant_result.json")
    benchmark_runner.run()
    # Reads the results file written by run() and logs the metrics.
    benchmark_runner.get_metrics()
    

## With OpenAI's offering

Make sure `OPENAI_API_KEY` is set in your environment.


In [None]:
# Benchmark with a default-configured OpenAI client. Change `True` to `False`
# to skip this cell.
# NOTE(review): results go to "openai_result.json" in the working directory,
# while the mini-assistant cell writes under "output/" - confirm this is
# intended.
if True:
    client = OpenAI()
    benchmark_runner = BenchmarkRunner(client=client, output_file_path="openai_result.json")
    benchmark_runner.run()
    # Reads the results file written by run() and logs the metrics.
    benchmark_runner.get_metrics()