# Hybrid search

To address the limitations of the original embedding search method, we propose a hybrid search method that combines the embedding search with `ElasticSearch` via `XDD Articles API v2`. 

## Hybrid API workflow

1. Perform `ElasticSearch` on the query string and return the top 100 results at the article level
2. Query vector store with:
    - pre-filter among the top 100 results from elastic search
    - (optional) any additional term-based pre-filtering
    - embedding search on the query string
3. Return relevant documents


## Usage

- New API route: `/hybrid`
- [swagger doc](http://cosmos0001.chtc.wisc.edu:4502/docs#/default/hybrid_get_docs_hybrid_post)

In [None]:
import requests
import os
from dotenv import load_dotenv
from askem._experimental.testset_ta1 import load_testset, gpt_eval
from typing import Optional, List
import pandas as pd
from tqdm.auto import tqdm

load_dotenv()

In [None]:
# Evaluation questions returned by askem's load_testset()
# (presumably a pandas DataFrame, since .query()/.sample() are used on it -- confirm).
TESTSET = load_testset()
# Base URL of the retriever service that query_retriever posts to.
RETRIEVER_URL = "http://retriever:4502"
# Show three sampled rows as a quick look at the data.
TESTSET.sample(3)

### Can the new implementation help with our worst-performing segment?

Queries with key terms are among the worst-performing items in the v0 retriever, so we target these queries for evaluation.

In [None]:
from askem.terms_extractor import MoreThanOneCapStrategy, get_blacklist

# Keep only the test questions flagged as keyword queries; work on a copy so
# the column added below is not written into TESTSET.
df = TESTSET.query("is_keyword == 1").copy()
# Term extractor configured with the "covid" blacklist.
# NOTE(review): the exact meaning of min_length / min_occurrence / top_k is
# defined in askem.terms_extractor -- confirm there.
get_terms = MoreThanOneCapStrategy(
    min_length=3, min_occurrence=1, top_k=3, blacklist=get_blacklist("covid")
)
# Store the terms extracted from each question; `compare` later sends them as
# `paragraph_terms` for term-based pre-filtering.
df["terms"] = df["question"].apply(get_terms.extract_terms)
df

## Evaluation

We submit the same queries to the new API and compare the results with previous versions.

1. XDD Articles V2
2. Retriever V0
3. Retriever Hybrid
4. Retriever Hybrid + term-based pre-filtering

In [None]:
def query_xdd(
    query: str,
    dataset: str = "xdd-covid-19",
    max_results: int = 1,
    timeout: float = 60.0,
) -> List[str]:
    """Query the XDD Articles API v2 and return the abstracts of the hits.

    Args:
        query: Search string, sent as the ``term`` parameter.
        dataset: XDD dataset to search.
        max_results: Value sent as the API's ``max`` parameter.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The ``abstract`` field of every entry in ``success.data``.

    Raises:
        requests.HTTPError: If the API answers with an error status.
        requests.Timeout: If no response arrives within ``timeout``.
    """
    url = "https://xdd.wisc.edu/api/v2/articles"

    # Options tried but left off: include_highlights, facets, per_page.
    params = {
        "term": query,
        "dataset": dataset,
        "include_score": True,
        "additional_fields": "title,abstract",
        "match": "true",
        "max": max_results,
    }

    # Without a timeout, requests waits indefinitely on a stalled connection.
    response = requests.get(url, params=params, timeout=timeout)
    response.raise_for_status()
    return [r["abstract"] for r in response.json()["success"]["data"]]


def query_retriever(
    body: dict, endpoint: Optional[str] = None, timeout: float = 60.0
) -> List[str]:
    """Simulate Terarium query.

    Args:
        body: JSON payload for the retriever. ``top_k`` defaults to 1 when the
            caller does not set it; the caller's dict is left untouched.
        endpoint: Optional route appended to ``RETRIEVER_URL`` (for example
            ``"hybrid"``); the base URL is used when omitted.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The ``text`` field of every item in the JSON response.

    Raises:
        requests.HTTPError: If the retriever answers with an error status.
        requests.Timeout: If no response arrives within ``timeout``.
    """
    url = f"{RETRIEVER_URL}/{endpoint}" if endpoint else RETRIEVER_URL

    # Work on a copy so the default is not injected into the caller's dict.
    payload = dict(body)
    payload.setdefault("top_k", 1)

    headers = {
        "Content-Type": "application/json",
        "Api-Key": os.getenv("RETRIEVER_APIKEY"),
    }
    # Without a timeout, requests waits indefinitely on a stalled connection.
    response = requests.post(url, json=payload, headers=headers, timeout=timeout)
    response.raise_for_status()
    return [r["text"] for r in response.json()]


def compare(case: pd.Series, skip_term_based: bool = False) -> str:
    """Compare the results of different API versions.

    Sends ``case.question`` to the XDD articles API, the V0 retriever route and
    the ``hybrid`` route, and formats the answers as one plain-text report.

    Args:
        case: Test-set row; must expose ``question`` and, unless
            ``skip_term_based`` is set, ``terms``.
        skip_term_based: When True, leave out the hybrid query that sends
            ``case.terms`` as ``paragraph_terms``.

    Returns:
        The report text, one labelled section per API.
    """
    question = case.question
    xdd_hits = query_xdd(question)
    v0_hits = query_retriever({"question": question})
    hybrid_hits = query_retriever({"question": question}, "hybrid")

    # Assemble the report from unindented pieces. Dedenting afterwards with
    # str.replace("    ", "") would also delete four-space runs that occur
    # inside the question or the retrieved passages.
    output = (
        f"Question: {question}\n"
        "\n"
        "XDD API (article level, showing abstract only):\n"
        f"{xdd_hits}\n"
        "\n"
        "V0 retrieve API:\n"
        f"{v0_hits}\n"
        "\n"
        "Hybrid API (No terms pre-filtering):\n"
        f"{hybrid_hits}\n"
        "\n"
    )

    if not skip_term_based:
        term_hits = query_retriever(
            {"question": question, "paragraph_terms": case.terms}, "hybrid"
        )
        output += f"\nHybrid API (With terms pre-filtering):\n{term_hits}\n"

    return output

Show one example:

In [None]:
y = compare(df.iloc[0])
print(y)

### Lazy quantification with GPT-4

Comparing the performance is difficult without a ground truth. But we can bypass this problem with a lazy quantification method. We use GPT-4 prompt engineering to compare the performance of each API endpoint. It only serves as a rough quantification, but it is better than nothing. Manual examination is still required to confirm the results.

```python

# Prompt used
system_message = {
    "role": "system",
    "content": "You are a expert in epidemiology. Given the following evaluation results, select the best API for the given question. Tie is allowed. You organize your output like this: ['API1', 'API2', 'API3'] returning one or more best APIs. Return 'None' if you think none of the APIs are good.",
}

user_message = {
    "role": "user",
    "content": f"Given this results: {result}, which API is the best?",
}

```

In [None]:
def eval_chain(case: pd.Series, skip_term_based=False) -> dict:
    """Build the API comparison for one case and evaluate it with GPT.

    The report produced by ``compare`` is passed to ``gpt_eval`` with the
    ``gpt-4`` model. The returned dict holds the question (``question``), the
    report (``raw_eval``) and the ``gpt_eval`` output (``gpt_eval``).
    """
    comparison = compare(case, skip_term_based=skip_term_based)
    verdict = gpt_eval(comparison, model="gpt-4")
    return dict(question=case.question, raw_eval=comparison, gpt_eval=verdict)

Run all comparisons on the queries with `terms`

In [None]:
# NOTE: in this notebook `results` is first assigned in the next cell, so on a
# fresh kernel this cell raises NameError unless that cell has been run.
[r["question"] for r in results]

In [None]:
results = []
# Questions already evaluated, kept in a set so each duplicate check is a
# constant-time lookup instead of rebuilding a list on every iteration.
seen_questions = set()

# `total` lets tqdm show progress against the number of rows.
for _, case in tqdm(df.iterrows(), total=len(df)):
    if case.question in seen_questions:
        continue
    results.append(eval_chain(case))
    # Record the question only after a successful evaluation.
    seen_questions.add(case.question)

In [None]:
# Write the term-query evaluation results to results.json

import json

with open("results.json", mode="w") as out_file:
    json.dump(results, fp=out_file)

### How many votes are cast for each API?

In [None]:
def count_votes(results: list) -> dict:
    """Print every case with its votes and tally the votes per API.

    Args:
        results: Evaluation records, each a dict with a ``question`` and a
            ``gpt_eval`` entry. ``gpt_eval`` is normally an iterable of API
            names; the string ``"None"`` (or ``None``) means no API was picked
            and is skipped.

    Returns:
        A dict mapping each API name to its number of votes, ordered from the
        most to the least voted (ties keep first-seen order).
    """
    from collections import Counter

    # print cases
    for i, result in enumerate(results):
        print(f"{i}: {result['question']} --- {result['gpt_eval']}")

    # Tally in a single pass instead of calling list.count once per vote.
    tally = Counter()
    for result in results:
        vote = result["gpt_eval"]
        if vote is None or vote == "None":
            continue
        if isinstance(vote, str):
            # A bare string is one vote; iterating it would count characters.
            tally[vote] += 1
        else:
            tally.update(vote)

    # most_common() sorts by count, descending, and keeps ties in insertion order.
    return dict(tally.most_common())


count_votes(results)

Examine tie situations

In [None]:
# Print each case's index, question and GPT votes so ties can be looked up by index.
for i, result in enumerate(results):
    print(f"{i}: {result['question']} --- {result['gpt_eval']}")

In [None]:
def examine(results: list, i: int) -> None:
    """Print the raw comparison and the GPT votes of the ``i``-th result.

    The pieces are printed space-separated, with a newline piece between them,
    and end with a 160-character ``=`` rule.
    """
    record = results[i]
    pieces = [
        record["raw_eval"],
        "\n",
        f"Votes: {record['gpt_eval']}",
        "\n",
        "=" * 160,
    ]
    print(*pieces)

#### Finding 1: Hybrid API perhaps works better than other reference APIs

In [None]:
examine(results, 1)

In [None]:
examine(results, 11)

#### Finding 2: Term-based pre-filtering can be too strict sometimes

In some cases, term-based pre-filtering is too stringent, and an empty result may be returned.

In [None]:
examine(results, 18)

In [None]:
examine(results, 15)

In [None]:
examine(results, 1)

#### Finding 3: Occasionally all APIs work well

In [None]:
examine(results, 6)

#### Finding 4: Although the XDD comparison is not fair (it only uses the abstract), XDD still wins occasionally, but this is hard to quantify further.

It is difficult to tell.

In [None]:
examine(results, 9)

#### Finding 5: We still have room for improvement, where none of the APIs works well (cases 3, 4, 13, 16)

In [None]:
examine(results, 3)

In [None]:
examine(results, 4)

In [None]:
examine(results, 13)

- difficult question... probably should improve the question itself...

In [None]:
examine(results, 16)

## Can the hybrid search work in non-term-based scenarios?

In [None]:
df = TESTSET.query("is_keyword == 0").copy()
df

In [None]:
results_non_term = []

In [None]:
from time import sleep

# Questions already present in results_non_term, so re-running this cell
# resumes instead of re-evaluating; a set keeps each check constant-time.
seen_questions = {r["question"] for r in results_non_term}
evaluated = 0

# `total` lets tqdm show progress against the number of rows.
for _, case in tqdm(df.iterrows(), total=len(df)):
    if case.question in seen_questions:
        continue
    results_non_term.append(eval_chain(case, skip_term_based=True))
    seen_questions.add(case.question)
    evaluated += 1
    # Pause for a minute after every 10 evaluated cases (presumably to stay
    # under an API rate limit -- confirm). The count is kept here because the
    # DataFrame index label is not contiguous after filtering with .query().
    if evaluated % 10 == 0:
        sleep(60)

In [None]:
# Write the non-term evaluation results to results_non_term.json
with open("results_non_term.json", mode="w") as out_file:
    json.dump(results_non_term, fp=out_file)

In [None]:
count_votes(results_non_term)

In [None]:
def examine_all(results: list) -> None:
    """Print every result in order by calling ``examine`` on each index."""
    # A plain loop: the calls are made only for their printing side effect, so
    # there is no list worth building.
    for i, _ in enumerate(results):
        examine(results, i)

In [None]:
examine_all(results_non_term)