In [1]:
%load_ext autoreload
%autoreload 2

import sys
sys.path.append("../src")

from tqdm.notebook import tqdm
import pymed
from pymed import PubMed
import time

# from scrape import scrape_s2

## Scrape PubMed

In [2]:
pubmed = PubMed(tool="GPTAreaOverview", email="vp988+pubmed@pm.me")

In [34]:
max_results = 9999
article_ids = pubmed._getArticleIds("scRNA-seq", max_results=max_results)

In [51]:
step = 250
all_arts = []
for aids in tqdm(pymed.helpers.batches(article_ids, step), total=(max_results // step)):
    for _ in range(5):
        try:
            arts = list(pubmed._getArticles(aids))
            all_arts.extend(arts)
            break
        except Exception as e:
            print("Error in request: ", e, ' ; retrying...')
            time.sleep(5)
    print('Ignoring one query...')

  0%|          | 0/39 [00:00<?, ?it/s]

Error in request:  ("Connection broken: InvalidChunkLength(got length b'', 0 bytes read)", InvalidChunkLength(got length b'', 0 bytes read))  ; retrying...
Ignoring one query...


In [73]:
import pandas as pd
from pandas import DataFrame, Series
import json

In [88]:
rem_columns = ['xml', 'isbn', 'language', 'publication_type', 'publisher', 'publisher_location', 'sections']
art_df = DataFrame([json.loads(a.toJSON()) for a in all_arts]).drop(columns=rem_columns)
art_df.to_parquet("../data/scrna_seq.pq")

## Get vector info from OpenAI

In [144]:
import openai
import numpy as np

import os
from time import sleep
from dotenv import load_dotenv
load_dotenv()

openai.api_key = os.getenv("OPENAI_API_KEY")

In [103]:
art_df = art_df[~art_df.abstract.isna()]
art_df = art_df[art_df.abstract.map(str.strip).map(len) > 10]

In [147]:
embs = []
total_tokens = 0
pbar = tqdm(art_df.title + "\n" + art_df.abstract)
for text in pbar:
    pbar.set_description(f"Total tokens: {total_tokens}, price: ${total_tokens * 0.0004 / 1000:0.3g}")
    fail = True
    for _ in range(3):
        try:
            response = openai.Embedding.create(input=text, model="text-embedding-ada-002")
            total_tokens += response["usage"]["total_tokens"]
            embs.append(response['data'][0]['embedding'])
            fail = False
            break
        except Exception as e:
            print("Error in request: ", e)
            sleep(3)
    if fail:
        embs.append(None)

  0%|          | 0/9653 [00:00<?, ?it/s]

Error in request:  This model's maximum context length is 8191 tokens, however you requested 18682 tokens (18682 in your prompt; 0 for the completion). Please reduce your prompt; or completion length.
Error in request:  This model's maximum context length is 8191 tokens, however you requested 18682 tokens (18682 in your prompt; 0 for the completion). Please reduce your prompt; or completion length.
Error in request:  This model's maximum context length is 8191 tokens, however you requested 18682 tokens (18682 in your prompt; 0 for the completion). Please reduce your prompt; or completion length.
Error in request:  The server experienced an error while processing your request. Sorry about that! You can retry your request, or contact us through our help center at help.openai.com if the error persists. {
  "error": {
    "message": "The server experienced an error while processing your request. Sorry about that! You can retry your request, or contact us through our help center at help.ope

In [174]:
last_none = False
embs_filt = []
for e in embs:
    if e is None:
        if last_none:
            embs_filt.append(e)
        last_none = True
    else:
        embs_filt.append(e)
        last_none = False

In [176]:
art_df["embedding"] = embs_filt

In [181]:
art_df.to_parquet("../data/scrna_seq_with_embs.pq")

Save for annotation:

In [194]:
label_df = DataFrame({'text': art_df.title + "\n" + art_df.abstract.values, 'embedding': art_df.embedding})
with open("../data/train_embeddings.json", "w") as f:
    print(label_df.to_json(orient='records'), file=f)