In [1]:
import os, sys
import json
import requests
from local.caching import load, save, DictCache
from local.web import ncbi_search, chain_get
from local.constants import WORKSPACE_ROOT

In [2]:
# Run the PubMed query through the project helper. The two-element result is
# unpacked into ERR and RESPONSES (presumably errors and parsed records —
# confirm against local.web.ncbi_search).
ERR, RESPONSES = ncbi_search(
    "cyanobacteria genomic models",
    "pubmed",
    "efetch",
    search_params=[("retmax", "10000")],
    response_params=[("rettype", "abstract")],
)

fetching result 1246 of 1246

In [3]:
# Every key other than "#text" met inside structured abstract sections.
# Populated as a side effect of get_summary so the skipped keys can be
# inspected after a run.
_kk = set()


def get_summary(data: dict):
    """Extract ``(doi, title, abstract)`` from one parsed PubMed record.

    Parameters
    ----------
    data : dict
        One parsed response with a top-level ``"PubmedArticleSet"`` key.
        Nested fields are read through ``chain_get``; this code treats its
        result as either ``None`` or a sized, iterable collection of matches
        (TODO confirm against local.web.chain_get).

    Returns
    -------
    tuple of (str, str, str) or None
        ``(doi, title, abstract)``. ``doi`` is ``""`` when no DOI id is
        present; ``abstract`` is the collected sections joined by newlines
        (``""`` when there is no abstract). Returns ``None`` (after printing
        "skipping book") when the record is a ``PubmedBookArticle``.

    Raises
    ------
    AssertionError
        If the title lookup does not yield exactly one match.
    """
    if "PubmedBookArticle" in data["PubmedArticleSet"]:
        print("skipping book")
        return None

    titles = chain_get(data, "PubmedArticleSet, PubmedArticle, MedlineCitation, Article, ArticleTitle")
    # Guard against a missing path explicitly so the failure is the assertion
    # itself rather than a TypeError from len(None).
    assert titles is not None and len(titles) == 1, titles
    title = titles[0]
    if isinstance(title, dict):
        # A title with inline markup arrives as a dict; flatten its values.
        title = "\n".join(
            part if isinstance(part, str) else " ".join(part)
            for part in title.values()
        )

    sections = chain_get(data, "PubmedArticleSet, PubmedArticle, MedlineCitation, Article, Abstract, AbstractText")
    abstract = []
    if sections is not None:
        for section in sections:
            if isinstance(section, str):
                abstract.append(section)
                continue
            for key, value in section.items():
                if key != "#text":
                    # Only the text payload is kept; remember what was skipped.
                    _kk.add(key)
                    continue
                try:
                    if isinstance(value, str):
                        abstract.append(value)
                    elif isinstance(value, list):
                        # De-duplicate while keeping first-seen order. A plain
                        # set() would reorder the fragments differently on
                        # each interpreter run (string hashing is randomized),
                        # making the text — and anything keyed on it, such as
                        # an embedding cache — unstable between runs.
                        abstract.append(" ".join(dict.fromkeys(value)))
                    elif isinstance(value, dict):
                        abstract.append(" ".join(value.values()))
                except TypeError:
                    # Unhashable or non-string fragments: fall back to a JSON
                    # dump so the content is not silently lost.
                    abstract.append(json.dumps(value))

    id_entries = chain_get(data, "PubmedArticleSet, PubmedArticle, PubmedData, ArticleIdList, ArticleId")
    doi = None
    if id_entries is not None:
        for entry in id_entries:
            # An entry counts as a DOI when any of its values is "doi";
            # if several match, the last one wins.
            if "doi" in entry.values():
                doi = entry["#text"]
    if doi is None:
        doi = ""

    return doi, title, "\n".join(abstract)

In [4]:
# Write every title+abstract to a text file and keep (doi, text) pairs for the
# embedding step. The encoding is pinned so non-ASCII characters in the
# abstracts are written identically regardless of the platform's default
# locale encoding.
with open("./cache/abstracts.txt", "w", encoding="utf-8") as f:
    all_entries = []
    for response in RESPONSES:
        summary = get_summary(response)
        if summary is None:
            continue  # book records yield no summary
        for field in summary:
            assert isinstance(field, str), field
        doi, title, abstract = summary

        # Title and abstract are concatenated with no separator. This exact
        # string is what is embedded (and used as the cache key) later on, so
        # its form is deliberately left unchanged.
        entry = f"{title}{abstract}"
        f.write(entry + "\n\n")
        all_entries.append((doi, entry))

len(all_entries)

skipping book


1245

In [5]:
# Open the "ada_embeddings" cache and walk every item once without using the
# values; nothing is printed or collected.
with DictCache("ada_embeddings") as cache:
    for _key, _value in cache.items():
        continue

In [6]:
# Read the API key from the first line of the workspace secrets file.
# readline() keeps the trailing newline; strip it so the key can be placed in
# an HTTP Authorization header without embedding a line break in the value.
with open(WORKSPACE_ROOT.joinpath("secrets/openai_key")) as s:
    OPENAI_KEY = s.readline().strip()
def get_embedding(entry: str, max_chars: int = 8191, timeout: float = 60.0):
    """Return the embeddings-API response for ``entry``, using a local cache.

    Parameters
    ----------
    entry : str
        Text to embed. Anything beyond ``max_chars`` characters is cut off
        (a notice is printed) before the cache lookup and the request.
    max_chars : int, optional
        Maximum number of characters sent. NOTE(review): the default 8191
        looks like the model's token limit applied to characters — confirm
        this is intended.
    timeout : float, optional
        Seconds to wait for the HTTP request before ``requests`` raises a
        timeout error, so a stalled connection cannot hang the notebook.

    Returns
    -------
    dict
        The decoded JSON body of the response. Successful (HTTP 200)
        responses are stored in the "ada_embeddings" cache keyed by the
        (possibly truncated) text; error bodies are returned but not cached,
        so callers should check for the expected keys.
    """
    if len(entry) > max_chars:
        text = entry[:max_chars]
        print(f"truncated to {max_chars} from {len(entry)}")
    else:
        text = entry

    with DictCache("ada_embeddings") as cache:
        if text in cache:
            return cache[text]

        response = requests.post(
            url="https://api.openai.com/v1/embeddings",
            headers={
                "Content-Type": "application/json",
                "Authorization": f"Bearer {OPENAI_KEY}",
            },
            json={
                "model": "text-embedding-ada-002",
                "input": text,
            },
            timeout=timeout,
        )
        data = response.json()
        # Only cache successes so that failed requests are retried next time.
        if response.status_code == 200:
            cache[text] = data
        return data
        
# Embed every entry in order; embeddings[k] corresponds to all_entries[k].
embeddings = []
for position, (_doi, text) in enumerate(all_entries, start=1):
    # "\r" rewrites the same output line to act as a progress counter.
    print(f"\r{position} of {len(all_entries)}", end="")
    embeddings.append(get_embedding(text))

1245 of 1245

In [7]:
import numpy as np

# Re-derive the summaries and pair each with its embedding by position.
# Skipped (None) summaries do not consume an embedding slot, so the index of
# the next embedding always equals the number of rows collected so far.
mapping = []
for response in RESPONSES:
    summary = get_summary(response)
    if summary is None:
        continue
    doi, title, abstract = summary
    vector = embeddings[len(mapping)]['data'][0]['embedding']
    mapping.append((doi, title, abstract, np.array(vector, dtype=np.float64)))

skipping book


In [8]:
# Persist the (doi, title, abstract, embedding) rows via the project cache helper.
save("1k_cy_mapping", mapping)

compressing & caching data to [{WORKSPACE}/main/scratch/cache/1k_cy_mapping.pkl.gz]
