In [1]:
from pprint import pprint
import pandas as pd

In [2]:
from requests_futures.sessions import FuturesSession

In [3]:
from tracking_grants import articles_f, email, ncbi_api_key, tool_name

In [4]:
from ratelimit import limits, sleep_and_retry
from tqdm.auto import tqdm
import re

In [5]:
articles = pd.read_csv(articles_f)

In [6]:
dois = articles.DOI.sample(100).unique().tolist()

### Async calls with request futures

In [79]:
NCBI_CALLS_PER_SEC = 3

class Eutils:
    def __init__(self, tool, email, api_key):
        self.search_api = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
        self.session = FuturesSession()
        self.session.hooks['response'] = self.response_hook
        
        self.params = {
            "tool": tool,
            "email": email,
            "api_key": api_key,
            "db": "pubmed",
            "retmax": 1,
        }

    @sleep_and_retry
    @limits(calls=NCBI_CALLS_PER_SEC, period=1)
    def search(self, doi):
        params = self.params
        params["term"] = doi

        future = self.session.get(self.search_api, params=params)
        return future
    
    @staticmethod
    def response_hook(resp, *args, **kwargs):
        if resp.status_code != 200:
            resp.data = f"Error:{response.status_code}"
        else:
            text = resp.text
            resp.data = None

            # Only return DOI has been found in the text
            if 'PhraseNotFound' not in text:
                count = int(re.search(r"<Count>(\d+)<\/Count>", text).group(1))
                # Only return unique matches
                if count == 1:
                    match = re.search(r"<Id>(\d+)<\/Id>", text)
                    if match:
                        resp.data =  match.group(1)

In [80]:
eutils = Eutils(tool_name, email, ncbi_api_key)

In [81]:
futures = []
for doi in tqdm(dois):
    futures.append((doi, eutils.search(doi)))

HBox(children=(FloatProgress(value=0.0), HTML(value='')))




In [82]:
results = {}
for doi, future in tqdm(futures):
    response = future.result()
    results[doi] = response.data

HBox(children=(FloatProgress(value=0.0), HTML(value='')))




### Async Code with aiohttp

In [7]:
import aiohttp
import asyncio
from asyncio_throttle import Throttler

In [8]:
import time

In [9]:
COUNT_REGEX = r"<Count>(\d+)<\/Count>"
ID_REGEX = r"<Id>(\d+)<\/Id>"

class Eutils():
    def __init__(self, tool, email, api_key, loop, calls_per_sec=3):
        self.baseurl = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
        self.params = {
            "tool": tool,
            "email": email,
            "api_key": api_key,
            "db": "pubmed",
            "retmax": 1,
        }
        
        self.loop = loop
        self.throttler = Throttler(rate_limit=calls_per_sec, period=1)
    
    async def __fetch(self, session, params):
        async with self.throttler:
            async with session.get(self.baseurl, params=params) as resp:
                await asyncio.sleep(1)
                return await resp.text()
#         await asyncio.sleep(0.05)
                
    async def __parse(self, text):
        pmid = None
        if 'PhraseNotFound' not in text:
            count = int(re.search(COUNT_REGEX, text).group(1))
            # Only return unique matches
            if count == 1:
                match = re.search(ID_REGEX, text)
                if match:
                    pmid = match.group(1)
        return pmid

    async def get_pmid(self, session, doi):
        params = self.params
        params['term'] = doi
        
        text = await self.__fetch(session, params)
        pmid = await self.__parse(text)
        return (doi, pmid)

    async def run(self, dois):
        tasks = []
        async with aiohttp.ClientSession(loop=self.loop) as session:
            # Create tasks with DOI
            tasks = [asyncio.ensure_future(self.get_pmid(session, doi)) for doi in dois]  

            # Await tasks and print progress
            responses = [await t for t in tqdm(asyncio.as_completed(tasks), total=len(dois))]

            return responses

In [10]:
loop = asyncio.get_event_loop()
eutils = Eutils(tool_name, email, ncbi_api_key, loop)

In [11]:
dois = articles.DOI.sample(10).unique().tolist()

In [13]:
results = await eutils.run(dois)

HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))




In [17]:
pd.DataFrame(results, columns=["DOI", "pmid"]).to_csv(pmid_f)

Unnamed: 0,DOI,pmid
0,10.1016/j.clgc.2014.08.012,25450037
1,10.1210/endo.143.6.8846,12021174
2,10.1124/mol.107.036681,17507690
3,10.3748/wjg.v13.i11.1659,12506112
4,10.1200/jco.2015.65.3154,12506112
5,10.1371/journal.pone.0071709,12506112
6,10.1002/nbm.973,12506112
7,10.2144/000114063,12506112
8,10.4161/cbt.12.9.17682,12506112
9,10.1074/jbc.m207637200,12506112
