In [None]:
from embedding_search.vector_store import MiniStore
import logging

# Configure the root logger at DEBUG level (basicConfig does nothing if the
# root logger already has handlers).
logging.basicConfig(level=logging.DEBUG)
# Create the store and run its build step; the resulting `store` object is
# what the search cells below call into.
store = MiniStore()
store.build()

In [None]:
# Weighted author search for a free-text query.
# `top_k=3` presumably caps the number of authors returned — TODO confirm.
authors = store.weighted_search_author(query="Reading and writing", top_k=3)

In [None]:
# Print each returned author next to its weighted score.
# A plain loop is used instead of a list comprehension so the cell does not
# build (and echo as its output) a throwaway list of None values.
for author in authors:
    print(author, author.weighted_score)

In [None]:
# Print the score of every key, in the order given by `sorted_orcids`.
# NOTE(review): neither `scores` nor `sorted_orcids` is defined in any cell
# visible above this one (`sorted_orcids` is assigned in the next cell), so
# this cell fails on a fresh kernel run top-to-bottom — confirm intended order.
for s in sorted_orcids:
    print(scores[s])

In [None]:
from embedding_search.utils import sort_key_by_value

# Order the keys of `scores` by their values; `reversed=True` presumably means
# descending order — TODO confirm against sort_key_by_value.
# NOTE(review): `scores` is not assigned in any cell visible in this notebook;
# it must be defined before this cell can run.
sorted_orcids = sort_key_by_value(scores, reversed=True)

# Patch community name

In [None]:
from embedding_search.community_map import download_datafile

from pathlib import Path

# Download the community table and keep a local parquet snapshot of it.
df = download_datafile()
# to_parquet does not create missing parent directories, so make sure the
# target directory exists before writing.
Path("tmp").mkdir(parents=True, exist_ok=True)
df.to_parquet("tmp/community.parquet")

# Lookup tables keyed by the `orcid` column. If an orcid appears more than
# once, the last row wins (same as the previous row-by-row construction).
id_to_community_name = dict(zip(df["orcid"], df["community_name"]))
id_to_email = dict(zip(df["orcid"], df["email"]))

In [None]:
# Show every value of the `orcid` column as a plain Python list.
df.orcid.tolist()

In [None]:
import json
from pathlib import Path
from embedding_search.data_model import Author
from tqdm import tqdm

# Directory containing the author JSON files that get patched below.
AUTHORS_DIR = Path("authors/")
# Path.glob returns a lazy, single-use iterator: once it has been consumed,
# this cell must be re-run before iterating over the files again.
authors_path = AUTHORS_DIR.glob("*.json")


def patch_author(json_path: Path) -> None:
    """Add the community name and email to one stored author record.

    Reads the JSON file at ``json_path``, builds an ``Author`` from its
    contents, fills in ``email`` and ``community_name`` from the module-level
    ``id_to_email`` / ``id_to_community_name`` lookups (keyed by the author's
    ``orcid``), and saves the result to ``AUTHORS_DIR/<orcid>.json``.

    Raises:
        KeyError: if the author's orcid is missing from either lookup.
    """
    with open(json_path, "r") as handle:
        record = Author(**json.load(handle))

    orcid = record.orcid
    record.email = id_to_email[orcid]
    record.community_name = id_to_community_name[orcid]
    record.save(AUTHORS_DIR / f"{orcid}.json")


# Snapshot the file list before patching: patch_author rewrites files inside
# the directory being globbed, and directory iteration is unspecified when
# entries change underneath it. A list also lets tqdm show a total and an ETA.
for file in tqdm(list(authors_path)):
    patch_author(file)

# Patch cited-by in the database

In [None]:
from pathlib import Path
from tqdm import tqdm
from embedding_search.crossref import query_crossref
from embedding_search.vector_store import get_author

authors_path = Path("./authors/").glob("*.json")
last_processed = "0000-0002-5769-7094"

# Resume support: keep the file whose stem equals `last_processed` and every
# file listed after it, in the order the glob produced them. If no file
# matches, the short list is empty.
# NOTE(review): this relies on the glob order matching the order of the
# previous run — confirm, since directory listing order is not guaranteed.
author_files = list(authors_path)
resume_index = next(
    (
        position
        for position, candidate in enumerate(author_files)
        if candidate.stem == last_processed
    ),
    len(author_files),
)
short_listed_authors = author_files[resume_index:]

In [None]:
# Display the files selected for processing before running the update loop
# in the next cell.
short_listed_authors

In [None]:
# For each short-listed author file: load the author, query Crossref for every
# article, update `cited_by` where a value came back, then save the author.
for author_file in tqdm(short_listed_authors):
    # Print the current file; presumably so the last file reached can be used
    # as `last_processed` when resuming after a failure — TODO confirm.
    print(author_file)
    # The file stem is passed as the author identifier.
    author = get_author(author_file.stem)

    for article in author.articles:
        # Only the second element of the returned pair is used here.
        _, cited_by = query_crossref(article.doi)
        # Falsy results (e.g. None or 0) leave the stored value untouched.
        if cited_by:
            article.cited_by = cited_by

    # Save once per author, after all of its articles have been queried.
    author.save(author_file)

In [None]:
from embedding_search.vector_store import MiniStore
import logging

# Configure logging at INFO level for the demo cells below.
logging.basicConfig(level=logging.INFO)
# basicConfig silently does nothing when the root logger already has handlers
# (for example after the DEBUG configuration earlier in this notebook ran in
# the same kernel), so also set the root logger level explicitly.
logging.getLogger().setLevel(logging.INFO)
# Create a fresh store and run its build step; the cells below use this
# `store` for the search examples and the plot.
store = MiniStore()
store.build()

# Feature 1: Search related articles

In [None]:
# Article search: fetch the articles matching the query, then print one title
# per line.
articles = store.search("pandemic resilience", type="article")
# A plain loop avoids building (and echoing as the cell output) a list of
# None values, which is what a print-only list comprehension produces.
for article in articles:
    print(article.title)

In [None]:
# This search is based on the centroid of the author's articles, i.e. the
# average of all the articles' embeddings. It therefore reflects relevancy
# only and is not weighted by the number of articles.
authors = store.search("higgs boson", type="author")
# Plain loop instead of a print-only list comprehension, so the cell does not
# echo a list of None values.
for author in authors:
    print(author)

In [None]:
# This search is weighted by the articles related to the query.
authors = store.weighted_search_author("Higgs field", n_pool=100)
# Plain loop instead of a print-only list comprehension, so the cell does not
# echo a list of None values.
for author in authors:
    print(author)

# Network graph
The purpose of this network graph prototype is to visualize an author and their works.

In [None]:
from embedding_search.visualize import EmbeddingsProcessor, QueryPlotter

In [None]:
STORE = store  # alias of the store built above; plot() below reads it on every call (original note: "cache this...")


def plot(query: str):
    """Build the plot for ``query`` from the shared ``STORE``.

    A new ``EmbeddingsProcessor`` wrapping ``STORE`` and a new ``QueryPlotter``
    around it are created on every call; whatever ``QueryPlotter.plot``
    returns is handed straight back to the caller.
    """
    return QueryPlotter(EmbeddingsProcessor(STORE)).plot(query)

In [None]:
# Plot one example query and save the result to an HTML file named after it.
query = "higgs boson"
p = plot(query)
# NOTE(review): assumes the `plots/` directory already exists — confirm
# whether `save` creates missing directories.
p.save(f"plots/{query}.html")