In [21]:
import re
import ads
import time
import tqdm
import json
import requests
from collections import Counter

In [2]:
# Read the BibTeX file and pull out the bibcode that follows every
# "http://adsabs.harvard.edu/abs/" URL. The context manager closes the file
# handle as soon as the text has been read.
with open("test_data/celerite.bib", "r") as bib_file:
    txt = bib_file.read()
# [^}\n]+ stops at the first closing brace on the line, so a match cannot
# swallow additional fields that happen to share the line with the URL.
bibcodes = re.findall(r"http://adsabs\.harvard\.edu/abs/([^}\n]+)}", txt)
print(len(bibcodes))

50


In [5]:
def get_all_bibcodes(q):
    """Return the bibcodes of all articles matched by the ADS query ``q``.

    The query is executed repeatedly and the bibcodes of each response are
    accumulated; this relies on every ``execute()`` call yielding the next
    page of results. Paging stops at the first page that holds fewer than
    ``page_size`` articles.

    Between pages the rate-limit information of the last response is
    inspected. When no requests remain, the function sleeps until the
    advertised reset time before asking for the next page.

    Parameters
    ----------
    q : str
        Query string handed to ``ads.SearchQuery``.

    Returns
    -------
    list
        Bibcodes in the order they were returned (sorted "bibcode desc").
    """
    # Requested explicitly so the "short page" test below cannot drift away
    # from the number of rows actually asked for.
    page_size = 50
    sort = "bibcode desc"
    query = ads.SearchQuery(q=q, sort=sort, fl=["bibcode"], rows=page_size)
    bibcodes = []
    while True:
        query.execute()
        new_bibcodes = [a.bibcode for a in query.response.articles]
        bibcodes += new_bibcodes
        if len(new_bibcodes) < page_size:
            break

        # Check rate limits
        limits = query.response.get_ratelimits()
        if int(limits["remaining"]) <= 0:
            # The reset time may already have passed; time.sleep() rejects
            # negative durations, so never wait less than zero seconds.
            wait = max(0.0, int(limits["reset"]) - time.time())
            print("Request has been rate limited. Resets in {0} minutes".format(wait/60.0))
            time.sleep(wait)
    return bibcodes

def get_refs_and_cites(bibcode, ref_cache, cite_cache):
    """Return the references of ``bibcode`` followed by its citations.

    Each list is fetched with ``get_all_bibcodes`` only when ``bibcode`` is
    not yet a key of the matching cache; the fetched list is stored in that
    cache (the dictionaries are modified in place).
    """
    pending = (
        (ref_cache, "references(bibcode:{0})"),
        (cite_cache, "citations(bibcode:{0})"),
    )
    for cache, template in pending:
        if bibcode in cache:
            continue
        cache[bibcode] = get_all_bibcodes(template.format(bibcode))
    return ref_cache[bibcode] + cite_cache[bibcode]

def get_connections(bibcode_list, ref_cache=None, cite_cache=None):
    """Gather the references and citations of every bibcode in a list.

    A tqdm progress bar shows the bibcode currently being processed and the
    running number of collected connections.

    Returns a tuple ``(connections, ref_cache, cite_cache)``. ``connections``
    is one flat list (duplicates are kept). The two caches are the
    dictionaries passed in, or fresh ones when the arguments were ``None``.
    """
    if ref_cache is None:
        ref_cache = {}
    if cite_cache is None:
        cite_cache = {}

    linked = []
    with tqdm.tqdm(bibcode_list) as progress:
        for current in progress:
            progress.set_description(current)
            linked.extend(get_refs_and_cites(current, ref_cache, cite_cache))
            progress.set_postfix(num=len(linked))

    return linked, ref_cache, cite_cache

In [6]:
# Collect the connections of every seed bibcode; `results` is the tuple
# (connections, ref_cache, cite_cache) returned by get_connections.
results = get_connections(bibcode_list=bibcodes)

2014ApJS..211...24M: 100%|██████████| 50/50 [06:48<00:00,  4.02s/it, num=2.44e+04]


In [10]:
# Write the reference cache (results[1]) and the citation cache (results[2])
# to JSON files, one file per cache.
cache_files = (
    ("test_data/ref_cache.json", results[1]),
    ("test_data/cite_cache.json", results[2]),
)
for cache_path, cache in cache_files:
    with open(cache_path, "w") as f:
        json.dump(cache, f)

In [44]:
# Count how often each bibcode occurs in the connection list, ignoring the
# seed bibcodes themselves, and keep the 100 most frequent ones.
#
# The counter is filled by walking the list in order rather than by iterating
# a set difference: set iteration order of strings changes between kernel
# runs, which would make the order of equally frequent entries (and which of
# them survive the top-100 cut) irreproducible.
seed_bibcodes = set(bibcodes)
hist = Counter(b for b in results[0] if b not in seed_bibcodes)
sorted_entries = [bibcode for bibcode, _count in hist.most_common(100)]

In [45]:
# POST the selected bibcodes to the ADS biblib "libraries" endpoint under the
# name "citebot-test-3" (presumably this creates a new library -- confirm
# against the ADS API documentation). The API token is taken from the ads
# client so it does not have to be hard-coded here.
token = ads.base.BaseQuery().token
headers = {
    "Authorization": "Bearer {}".format(token),
    "User-Agent": "citebot",
    "Content-Type": "application/json",
}
data = dict(name="citebot-test-3", bibcode=sorted_entries)

# Without a timeout requests waits forever on an unresponsive server, which
# would leave the cell hanging; fail after 60 seconds instead.
r = requests.post("https://api.adsabs.harvard.edu/v1/biblib/libraries",
                  json=data, headers=headers, timeout=60)
# Raise on any 4xx/5xx answer rather than continuing silently.
r.raise_for_status()