In [1]:
# -*- coding: utf-8 -*-
import pandas as pd
import numpy as np
from habanero import Crossref
from ratelimit import limits, sleep_and_retry
from tracking_grants import articles_f, cr_metadata_f, email, tool_name
from tracking_grants.utils.logging import logger
import json

In [2]:
from tqdm.auto import tqdm

In [3]:
# Read the article table; the "article_id" column becomes the row index.
articles = pd.read_csv(
    articles_f,
    index_col="article_id",
)

In [4]:
# Crossref client; the contact e-mail and tool name are handed to habanero
# through its `mailto` and `ua_string` arguments.
cr = Crossref(
    mailto=email,
    ua_string=tool_name,
)

In [7]:
# Spot check: fetch the Crossref record for a single DOI and display the raw
# response as the cell output.
cr.works(ids="10.1120/jacmp.2022.25308")

{'status': 'ok',
 'message-type': 'work',
 'message-version': '1.0.0',
 'message': {'indexed': {'date-parts': [[2020, 4, 10]],
   'date-time': '2020-04-10T00:56:17Z',
   'timestamp': 1586480177188},
  'reference-count': 0,
  'publisher': 'Wiley',
  'issue': '4',
  'content-domain': {'domain': [], 'crossmark-restriction': False},
  'short-container-title': ['J. Appl. Clin. Med. Phys.'],
  'published-print': {'date-parts': [[2004, 10, 1]]},
  'DOI': '10.1120/jacmp.2022.25308',
  'type': 'journal-article',
  'created': {'date-parts': [[2006, 3, 5]],
   'date-time': '2006-03-05T15:35:54Z',
   'timestamp': 1141572954000},
  'page': '29-45',
  'source': 'Crossref',
  'is-referenced-by-count': 2,
  'title': ['Theoretical foundation for real-time prostate localization using an inductively coupled transmitter and a superconducting quantum interference device (SQUID) magnetometer system'],
  'prefix': '10.1002',
  'volume': '5',
  'author': [{'given': 'John E.',
    'family': 'McGary',
    'sequ

In [5]:
@sleep_and_retry
@limits(calls=10, period=1)
def query_crossref(cr, dois):
    """Forward ``dois`` to ``cr.works`` and return its response unchanged.

    The ``limits`` decorator is configured with ``calls=10, period=1`` and is
    wrapped in ``sleep_and_retry``, so calls beyond that budget wait and are
    retried instead of failing.

    NOTE(review): the budget counts invocations of this function, not HTTP
    requests. If ``cr.works`` issues one request per DOI, the effective
    request rate is a multiple of the configured limit - confirm against the
    habanero documentation.

    Parameters
    ----------
    cr
        Client object exposing a ``works`` method (a habanero ``Crossref``
        instance in this notebook).
    dois
        Value passed as the first positional argument of ``cr.works``.

    Returns
    -------
    Whatever ``cr.works`` returns for ``dois``.
    """
    response = cr.works(dois)
    return response

In [16]:
# Plain Python list of the values in the "DOI" column, in row order.
dois = articles["DOI"].tolist()

In [6]:

# Number of DOIs handed to Crossref in a single `query_crossref` call.
dois_per_call = 10

# Start offset of every batch. Slicing past the end of `dois` just yields a
# shorter final batch, so no explicit end marker is needed.
batch_starts = range(0, len(dois), dois_per_call)

results = []
for start in tqdm(batch_starts, total=len(batch_starts)):
    response = query_crossref(cr, dois[start:start + dois_per_call])
    # habanero hands back a bare dict (not a list) when a batch produces a
    # single record, e.g. a final batch holding one DOI. Wrap it so `extend`
    # appends the record itself rather than the dict's keys.
    if isinstance(response, dict):
        response = [response]
    results.extend(response)

HBox(children=(FloatProgress(value=0.0, max=2.0), HTML(value='')))




In [92]:
# Persist the raw Crossref responses as JSON so they can be reloaded later
# without querying the API again.
with open(cr_metadata_f, mode="w") as metadata_file:
    json.dump(results, metadata_file)

## Processing data

In [5]:
# Reload the cached Crossref responses. The context manager closes the file
# handle as soon as parsing is done instead of leaving it to the garbage
# collector.
with open(cr_metadata_f, "r") as metadata_file:
    results = json.load(metadata_file)

In [7]:
# Crossref message keys whose values are copied into the table unchanged.
direct_fields = [
    "ISSN",
    "container-title",
    "publisher",
    "is-referenced-by-count",
    "references-count",
    "subject",
]

# Columns derived from the message rather than copied from it.
transform_fields = ["authors_count"]

# Crossref date keys; each one carries a nested "date-parts" structure.
date_fields = [
    "created",
    "deposited",
    "indexed",
    "published-online",
    "issued",
]

In [9]:
# Empty table: one row per article DOI, one column per metadata field
# (direct, derived, and date fields, in that order).
df = pd.DataFrame(
    columns=[*direct_fields, *transform_fields, *date_fields],
    index=articles["DOI"].tolist(),
)

In [10]:
# Copy the fields of interest from every Crossref response into `df`, one row
# per DOI.
for response in tqdm(results, total=len(results)):
    # Deliberately not named `json`: rebinding that name would shadow the
    # imported json module and break any later json.dump / json.load call.
    message = response['message']
    doi = message['DOI']

    # Fields copied verbatim when Crossref provides them.
    for direct_f in direct_fields:
        if direct_f in message:
            df.loc[doi, direct_f] = message[direct_f]

    # authors
    if 'author' in message:
        df.loc[doi, 'authors_count'] = len(message['author'])

    # Dates: keep only the first element of the first "date-parts" entry.
    for date_f in date_fields:
        if date_f in message:
            try:
                first_date_part = message[date_f]['date-parts'][0][0]
            except (KeyError, IndexError, TypeError):
                # Missing or malformed date structure: report it and leave the
                # cell empty. Only lookup errors are caught, so the loop can
                # still be interrupted and unrelated failures stay visible.
                print(date_f, message[date_f])
            else:
                df.loc[doi, date_f] = first_date_part

HBox(children=(FloatProgress(value=0.0, max=6711.0), HTML(value='')))




In [117]:
# Turn zeros into missing values, then rename the Crossref keys to the column
# names used in the output table.
df = (
    df
    .replace(to_replace=0, value=np.nan)
    .rename(
        columns={
            "container-title": "journal_name",
            "is-referenced-by-count": "coci_citations",
            "references-count": "references",
            "subject": "cr_subject",
        }
    )
)

Unnamed: 0,ISSN,journal_name,publisher,coci_citations,references,cr_subject,authors_count,created,deposited,indexed,published-online,issued
10.1016/j.nbd.2007.07.015,[0969-9961],[Neurobiology of Disease],Elsevier BV,80.0,36.0,[Neurology],11.0,2007,2019,2020,,2007.0
10.1212/01.wnl.0000158653.81008.49,"[0028-3878, 1526-632X]",[Neurology],Ovid Technologies (Wolters Kluwer Health),35.0,,[Clinical Neurology],4.0,2011,2018,2020,2005.0,2005.0
10.1128/mcb.25.8.3151-3162.2005,"[0270-7306, 1098-5549]",[Molecular and Cellular Biology],American Society for Microbiology,21.0,79.0,"[Cell Biology, Molecular Biology]",6.0,2005,2020,2020,,2005.0
10.1111/j.1528-1167.2005.00289.x,"[0013-9580, 1528-1167]",[Epilepsia],Wiley,79.0,60.0,"[Neurology, Clinical Neurology]",5.0,2005,2020,2020,,2005.0
10.1002/glia.10324,"[0894-1491, 1098-1136]",[Glia],Wiley,27.0,48.0,"[Cellular and Molecular Neuroscience, Neurology]",7.0,2004,2020,2020,2004.0,2004.0
...,...,...,...,...,...,...,...,...,...,...,...,...
10.1089/wound.2016.0719,"[2162-1918, 2162-1934]",[Advances in Wound Care],Mary Ann Liebert Inc,7.0,14.0,,6.0,2017,2018,2020,,2017.0
10.1089/wound.2017.0737,"[2162-1918, 2162-1934]",[Advances in Wound Care],Mary Ann Liebert Inc,2.0,21.0,,12.0,2017,2018,2020,,2017.0
10.1115/1.4039342,"[1942-4302, 1942-4310]",[Journal of Mechanisms and Robotics],ASME International,,11.0,,5.0,2018,2019,2020,2018.0,2018.0
10.1109/tnsre.2018.2848845,"[1534-4320, 1558-0210]",[IEEE Transactions on Neural Systems and Rehab...,Institute of Electrical and Electronics Engine...,1.0,,,2.0,2018,2018,2020,,2018.0


In [121]:
# Attach the Crossref metadata to each article by DOI and write the combined
# table to the articles file.
# NOTE(review): this overwrites the file `articles` was read from, so running
# the notebook again would merge onto already-merged columns - confirm that
# is intended.
articles_with_metadata = articles.merge(df, left_on="DOI", right_index=True)
articles_with_metadata.to_csv(articles_f)