Article-level BM25
1. Obtain the article ranking from the Elasticsearch service
2. Use the results of step 1 as a top-n pre-filter
3. Perform vector search on the pre-filtered articles


Terarium search box defaults: 

- https://xdd.wisc.edu/articles/get_articles?term=SIR%20model&dataset=xdd-covid-19&include_highlights=true&include_score=true&facets=true&additional_fields=title,abstract&match=true&known_entities=url_extractions,askem_object&max=20&per_page=20

In [None]:
from typing import List
import requests

In [None]:
def query(q: str, top_k: int, timeout: float = 30.0) -> dict:
    """Simulate a Terarium search against the xdd articles API.

    Sends a GET request for `q`, restricted to the ``xdd-covid-19`` dataset,
    and returns the decoded JSON body.

    Args:
        q: Search term(s), sent as the ``term`` parameter.
        top_k: Maximum number of articles to request (``max`` parameter).
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, `requests` may block indefinitely on a stalled
            connection.

    Returns:
        The parsed JSON response.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not respond within `timeout`.
    """

    url = "https://xdd.wisc.edu/api/articles"

    # The Terarium search box also sends the commented-out options below;
    # they are intentionally not sent here.
    params = {
        "term": q,
        "dataset": "xdd-covid-19",
        # 'include_highlights': True,
        # 'include_score': True,
        # 'facets': True,
        # 'additional_fields': 'title,abstract',
        "match": "true",
        "max": top_k,
        # 'per_page': 20
    }

    response = requests.get(url, params=params, timeout=timeout)
    # Surface HTTP errors here instead of failing later on a bad payload.
    response.raise_for_status()
    return response.json()


def get_contents(response: dict, path: list, field: str) -> List[str]:
    """Collect one field from every hit located under a nested path.

    Args:
        response: Nested container (e.g. a decoded JSON response).
        path: Keys to follow, in order, from `response` down to the
            iterable of hits.
        field: Key to read from each hit.

    Returns:
        The value of `field` for each hit, in the order the hits appear.
    """
    # Walk down the nested structure one key at a time.
    hits = response
    for step in path:
        hits = hits[step]

    values = []
    for hit in hits:
        values.append(hit[field])
    return values

In [None]:
# Request up to 50 articles for the example search term.
response = query(q="SIR model and COVID-19", top_k=50)

In [None]:
# Show the `_gddid` of the first ten hits.
get_contents(response, path=["success", "data"], field="_gddid")[:10]

In [None]:
# Show the title of the first ten hits.
get_contents(response, path=["success", "data"], field="title")[:10]

Warning: Ranking is not identical to Terarium search box defaults

Potentially useful IDs:
- `_gddid`: found at `response['success']['data'][i]['_gddid']`; corresponds to `paper_id` in Weaviate

Test API call to Weaviate

In [None]:
import os
import weaviate
from dotenv import load_dotenv

# Read the Weaviate credentials and endpoint from the environment
# (load_dotenv() populates it from a local .env file when one exists).
load_dotenv()
weaviate_apikey = os.getenv("WEAVIATE_APIKEY")
url = os.getenv("WEAVIATE_URL")
client = weaviate.Client(
    url,
    auth_client_secret=weaviate.auth.AuthApiKey(weaviate_apikey),
)

# Count the passages stored in the database.
# The class name is spelled "Passage" to match the other queries in this notebook.
client.query.aggregate("Passage").with_meta_count().do()

In [None]:
# Count the passages whose `paper_id` matches this article id.
(
    client.query.aggregate("Passage")
    .with_where(
        dict(
            path="paper_id",
            operator="ContainsAny",
            valueText=["5fe28aeaea8bd37226bbb965"],
        )
    )
    .with_meta_count()
    .do()
)

In [None]:
# Count the passages whose `paper_id` matches this article id.
(
    client.query.aggregate("Passage")
    .with_where(
        dict(
            path="paper_id",
            operator="ContainsAny",
            valueText=["5f21d318a58f1dfd52105648"],
        )
    )
    .with_meta_count()
    .do()
)

In [None]:
# The two article ids queried individually above, now queried together.
two_papers = [
    "5fe28aeaea8bd37226bbb965",
    "5f21d318a58f1dfd52105648",
]

# Count the passages whose `paper_id` matches any of the listed ids.
(
    client.query.aggregate("Passage")
    .with_where(
        dict(path="paper_id", operator="ContainsAny", valueText=two_papers)
    )
    .with_meta_count()
    .do()
)

In [None]:
# Count the passages whose `paper_id` matches this article id.
(
    client.query.aggregate("Passage")
    .with_where(
        dict(
            path="paper_id",
            operator="ContainsAny",
            valueText=["5ec5647d998e17af826f5499"],
        )
    )
    .with_meta_count()
    .do()
)

Some IDs are not in the Weaviate DB, e.g., `5ec5647d998e17af826f5499`,
but others are, e.g., `5fe28aeaea8bd37226bbb965`.