## Run Testset and Just look at the results

Procedure summary:
1. Get article and paragraph level capitalized terms
2. Append top 10 `terms` and top 3 `terms` to the item's metadata in the vector store
3. Use Hackathon testset to compare 2 new search strategies (article level and paragraph level `term` filtering + embedding search) 

Findings:
1. New term filtering strategy is better than the old one in some cases
2. Paragraph level term filtering is better than article level term filtering in some cases
3. Have to address missing terms
    - `SpaCy` proper nouns?
    - Words that contain multiple capital letters
    - Allow hyphenated words



In [None]:
from dotenv import load_dotenv
import os
import gspread
import pandas as pd
import requests
from typing import List, Optional
from askem.preprocessing import get_all_cap_words

load_dotenv()

In [None]:
def load_testset() -> pd.DataFrame:
    """Fetch the test questions from the shared Google Sheet.

    The service-account key file is located through the
    ``GCP_SECRET_FILE_PATH`` environment variable. The ``questions``
    worksheet of the ``ASKEM-TA1-testset`` spreadsheet is read in full; its
    first row is used as the header, with each label lower-cased and spaces
    replaced by underscores.

    Returns:
        A DataFrame restricted to the ``source``, ``target_type``,
        ``is_keyword`` and ``question`` columns.
    """
    client = gspread.service_account(filename=os.getenv("GCP_SECRET_FILE_PATH"))
    worksheet = client.open("ASKEM-TA1-testset").worksheet("questions")

    rows = worksheet.get_values()

    # Row 0 holds the human-readable column titles; normalise them to
    # snake_case so they can be used as plain column names.
    columns = [title.lower().replace(" ", "_") for title in rows[0]]
    frame = pd.DataFrame.from_records(rows[1:], columns=columns)

    wanted = ["source", "target_type", "is_keyword", "question"]
    return frame[wanted]

In [None]:
# Load the test questions, then run get_all_cap_words on every question and
# store what it returns in a new "terms" column.
df = load_testset()
df["terms"] = df["question"].apply(get_all_cap_words)

In [None]:
# Show the extracted terms next to the "is_keyword" column for a quick look.
df[["terms", "is_keyword"]]

It seems that we should not include some really common terms like: `COVID`

In [None]:
# Terms that should never be used as filters (see the note above this cell:
# they are considered too common to be useful).
BLACKLIST_COMMON_TERMS = ["COVID19", "COVID"]


def get_better_terms(
    text: str, blacklist: Optional[List[str]] = None
) -> Optional[List[str]]:
    """Extract terms from ``text`` and drop the blacklisted ones.

    Args:
        text: The text to extract terms from (passed to
            ``get_all_cap_words``).
        blacklist: Terms to exclude. Defaults to ``BLACKLIST_COMMON_TERMS``
            when ``None``.

    Returns:
        The remaining terms in their original order, or ``None`` when no
        terms were found or every term was blacklisted.
    """
    if blacklist is None:
        blacklist = BLACKLIST_COMMON_TERMS

    terms = get_all_cap_words(text)
    if not terms:
        return None

    better_terms = [term for term in terms if term not in blacklist]

    # Return None instead of an empty list so that rows without usable terms
    # can be filtered out with a null check on the "terms" column.
    return better_terms or None

In [None]:
# Recompute the "terms" column with get_better_terms (blacklist applied), then
# show the terms alongside the keyword flag and the question text.
df["terms"] = df["question"].apply(get_better_terms)
df[["terms", "is_keyword", "question"]]

Note: `COmplexVID` is not recognized as a term. TODO: we may need to address this later.

In [None]:
def eval_ta1(
    question: str,
    article_terms: Optional[List[str]] = None,
    paragraph_terms: Optional[List[str]] = None,
) -> List[dict]:
    """Send a question to the retriever API and return its decoded response.

    The endpoint is read from the ``RETRIEVER_URL`` environment variable and
    the API key from ``RETRIEVER_APIKEY``. The request always asks for the
    top 3 results of type ``paragraph``.

    Args:
        question: The question text to search for.
        article_terms: Optional terms sent as ``article_terms``; omitted
            from the request when ``None`` or empty.
        paragraph_terms: Optional terms sent as ``paragraph_terms``; omitted
            from the request when ``None`` or empty.

    Returns:
        The JSON-decoded response body. NOTE(review): presumably a list of
        result dicts, each with a ``"text"`` key -- confirm against the API.

    Raises:
        requests.HTTPError: If the API answers with a 4xx/5xx status.
    """
    url = os.getenv("RETRIEVER_URL")
    api_key = os.getenv("RETRIEVER_APIKEY")

    headers = {"Content-Type": "application/json", "Api-Key": api_key}
    # Named ``payload`` rather than ``json`` to avoid shadowing the stdlib
    # module name.
    payload = {"question": question, "top_k": 3, "doc_type": "paragraph"}

    # Only send a filter when it actually contains terms.
    if article_terms:
        payload["article_terms"] = article_terms

    if paragraph_terms:
        payload["paragraph_terms"] = paragraph_terms

    response = requests.post(url, headers=headers, json=payload)
    # Fail here on an HTTP error instead of returning an error body that
    # would be mistaken for search results further down the pipeline.
    response.raise_for_status()
    return response.json()

In [None]:
# Keep only the questions for which terms were found, then query the
# retriever three times per question: without a filter, with the terms as an
# article-level filter, and with the terms as a paragraph-level filter.

df_with_terms = df.query("terms.notnull()")

results = []
for question, terms in zip(df_with_terms["question"], df_with_terms["terms"]):
    entry = {
        "question": question,
        "terms": terms,
        "results_original": eval_ta1(question),
        "results_with_article_level_filter": eval_ta1(
            question, article_terms=terms
        ),
        "results_with_paragraph_level_filter": eval_ta1(
            question, paragraph_terms=terms
        ),
    }
    results.append(entry)

### Make a proper df

In [None]:
def flatten(result: dict) -> dict:
    """Flatten one evaluation record into a single-level dict.

    Copies ``question`` and ``terms`` and, for each search strategy, the
    ``"text"`` of up to the first three hits into ``<prefix>_top_<rank>``
    keys (prefixes: ``original``, ``article``, ``paragraph``).

    Args:
        result: A record with the keys ``question``, ``terms``,
            ``results_original``, ``results_with_article_level_filter`` and
            ``results_with_paragraph_level_filter``; each ``results_*`` value
            is an indexable sequence of dicts carrying a ``"text"`` key.

    Returns:
        The flattened record. When a strategy returned fewer than three
        hits, the keys for the missing ranks are simply absent.
    """
    output = {
        "question": result["question"],
        "terms": result["terms"],
    }

    # (output key prefix, key in ``result``) in the order the columns
    # should appear.
    strategies = (
        ("original", "results_original"),
        ("article", "results_with_article_level_filter"),
        ("paragraph", "results_with_paragraph_level_filter"),
    )

    for prefix, source_key in strategies:
        hits = result[source_key]
        for rank in range(3):
            try:
                output[f"{prefix}_top_{rank + 1}"] = hits[rank]["text"]
            except IndexError:
                # Fewer hits than requested: keep what we have. This now
                # applies to the unfiltered results too, which previously
                # raised when fewer than three hits came back.
                break

    return output

In [None]:
# Flatten every evaluation record into one row and write the table to CSV.
flat_rows = list(map(flatten, results))
df_results = pd.DataFrame.from_records(flat_rows)
df_results.to_csv("ta1_eval.csv", index=False)

Results moved to [shared drive](https://docs.google.com/spreadsheets/d/1TJjtPoCaxIWaMDR_yTDka72uzOglDJis-hxNU78b9AA/edit#gid=95932881) for manual examination.