### spaCy proper nouns

In [None]:
from dotenv import load_dotenv
import os
import weaviate
import logging

from askem.terms_extractor import (
    CapitalizedWordsStrategy,
    MoreThanOneCapStrategy,
    ProperNounStrategy,
)

# Load variables from a .env file (python-dotenv) before reading the environment.
load_dotenv()
WEAVIATE_URL = "http://cosmos0001.chtc.wisc.edu:8080"
# os.getenv returns None when the variable is unset; that value is later passed
# unchanged to weaviate.AuthApiKey when the client is created.
WEAVIATE_APIKEY = os.getenv("WEAVIATE_APIKEY")
print(WEAVIATE_URL)

logging.basicConfig(level=logging.WARNING)

# Surface a missing key here, where the cause is obvious, rather than leaving it
# to show up as an unrelated-looking failure in a later cell.
if not WEAVIATE_APIKEY:
    logging.warning(
        "WEAVIATE_APIKEY is not set (checked the environment and .env); "
        "connecting to Weaviate will likely fail."
    )

In [None]:
# Shared Weaviate client used by every query in this notebook; authenticates
# with the API key read from the environment above.
client = weaviate.Client(
    url=WEAVIATE_URL,
    auth_client_secret=weaviate.AuthApiKey(api_key=WEAVIATE_APIKEY),
)

In [None]:
# Sanity check: count the objects in the "Passage" class (spelled with the same
# capitalisation as the Get query in sample_near).
client.query.aggregate("Passage").with_meta_count().do()

In [None]:
def sample_near(query: str) -> str:
    """Return the text of the top passage for a near-text search on ``query``.

    Runs a Get query against the ``Passage`` class using the module-level
    ``client``, limited to a single result, and returns that result's
    ``text_content`` field.

    Args:
        query: Free-text concept passed to the near-text search.

    Returns:
        The ``text_content`` of the first passage in the response.

    Raises:
        RuntimeError: The response carries no passage list (for example when
            it only contains an ``errors`` entry); the message includes any
            errors found in the response.
        IndexError: The query succeeded but matched no passage.
    """
    response = (
        client.query.get("Passage", ["text_content"])
        .with_near_text({"concepts": [query]})
        .with_limit(1)
        .do()
    )

    # Walk the payload defensively: a failed query may omit "data" entirely or
    # set the class entry to None instead of a list.
    data = response.get("data") or {}
    passages = (data.get("Get") or {}).get("Passage")

    if passages is None:
        raise RuntimeError(
            f"Weaviate query for {query!r} returned no data; "
            f"errors: {response.get('errors')}"
        )
    if not passages:
        # Same exception type the bare `[0]` lookup raised, now with context.
        raise IndexError(f"No passage found near {query!r}")

    return passages[0]["text_content"]

In [None]:
# Build one instance of each strategy, all with the same settings, in the
# order: capitalized words, more-than-one-capital, proper noun.
cws, mto, ppn = (
    strategy_cls(min_length=3, top_k=10, min_occurrence=1)
    for strategy_cls in (
        CapitalizedWordsStrategy,
        MoreThanOneCapStrategy,
        ProperNounStrategy,
    )
)


def test(query: str) -> None:
    """Print a sampled passage and the terms each strategy extracts from it.

    Fetches one passage with ``sample_near(query)``, prints it, then prints a
    headed section for each of the module-level strategies ``cws``, ``mto``
    and ``ppn`` showing the result of ``extract_terms`` on that passage.
    Sections are separated by a blank line.

    Args:
        query: Free-text query forwarded to ``sample_near``.
    """
    text = sample_near(query)
    print(f"{text=}")

    sections = (
        ("========== CapitalizedWordsStrategy ==========", cws),
        ("========== MoreThanOneCapStrategy ==========", mto),
        ("========== ProperNounStrategy ==========", ppn),
    )
    for position, (header, extractor) in enumerate(sections):
        # Blank line between sections only: none before the first or after the last.
        if position > 0:
            print()
        print(header)
        print(extractor.extract_terms(text))

In [None]:
# Print the passage sampled for this query and the terms each of the three
# strategies extracts from it.
test("SIR model for COVID")

In [None]:
# Full-sentence query; note that the string literal ends with a trailing space.
test("How masking affects transmission rate in SEIRD model ")

- spaCy breaks hyphenated words into two words, which is not good for our case.
- Also, 'high' seems to be misclassified as a proper noun.

In [None]:
# Query naming two models (SIR, SIDARTHE) and containing the hyphenated term COVID-19.
test("Alternative models to the SIR and SIDARTHE for studying COVID-19.")

In [None]:
# Query with a mixed-case name (MechBayes) and an all-caps acronym (SEIRHD);
# the string literal ends with a trailing space.
test("Explain: MechBayes SEIRHD model ")

- The extracted terms include citations and misclassified words.

In [None]:
# Short query built around a three-letter acronym (PIS).
test("Alternative to PIS")

In [None]:
# Query whose model name mixes capital letters and digits (SV2AIR3).
test("SV2AIR3 model formula")

- Overall, `MoreThanOneCapStrategy` seems to work best for our case.