In [1]:
# -*- coding: utf-8 -*-
import pandas as pd
import numpy as np
from habanero import Crossref
from ratelimit import limits, sleep_and_retry
from tracking_grants import articles_f, cr_metadata_f, email, tool_name
from tracking_grants.utils.logging import logger
import json

In [2]:
from tqdm.auto import tqdm

In [4]:
# Read the article table from the CSV path exported by `tracking_grants`.
articles = pd.read_csv(filepath_or_buffer=articles_f)

In [4]:
# Crossref client; the contact e-mail and user-agent string come from the
# `tracking_grants` package configuration.
cr = Crossref(
    ua_string=tool_name,
    mailto=email,
)

In [5]:
@sleep_and_retry
@limits(calls=10, period=1)
def query_crossref(cr, dois):
    """Look up works on Crossref for the given DOI(s).

    The call is rate limited by the decorators: at most 10 invocations per
    1-second period (`limits`), and instead of failing when the limit is hit
    the call sleeps and is retried (`sleep_and_retry`).

    Args:
        cr: client object exposing a ``works`` method (a habanero ``Crossref``
            instance in this notebook).
        dois: DOI or list of DOIs, forwarded unchanged as the ``ids`` argument.

    Returns:
        Whatever ``cr.works`` returns for these DOIs.
    """
    return cr.works(ids=dois)

In [16]:
# Plain Python list of the DOI column, in the row order of `articles`.
dois = articles["DOI"].tolist()

In [6]:

# Fetch Crossref metadata for every DOI, `dois_per_call` DOIs per rate-limited call.
dois_per_call = 10

results = []
# Step through `dois` in slices of `dois_per_call`; the final slice may be shorter.
for start in tqdm(range(0, len(dois), dois_per_call)):
    response = query_crossref(cr, dois[start:start + dois_per_call])
    # habanero hands back a bare dict (not a one-element list) when only a single
    # DOI was requested, which happens whenever the last batch holds one DOI.
    # Extending with a dict would append its *keys*, so wrap it first.
    if isinstance(response, dict):
        response = [response]
    results.extend(response)

HBox(children=(FloatProgress(value=0.0, max=2.0), HTML(value='')))




In [92]:
# Cache the raw Crossref responses as JSON so the processing cells below can
# reload them from disk.
with open(cr_metadata_f, mode='w') as metadata_file:
    json.dump(results, metadata_file)

## Processing data

In [5]:
# Reload the cached Crossref responses from disk. The context manager
# guarantees the file handle is closed once parsing is done.
with open(cr_metadata_f, "r") as f:
    results = json.load(f)

In [6]:
# Crossref message keys whose values are copied over as-is (stringified).
direct_fields = [
    "ISSN",
    "container-title",
    "publisher",
    "is-referenced-by-count",
    "references-count",
    "subject",
]
# Columns that are computed from the message rather than copied.
transform_fields = ["authors_count"]
# Crossref message keys holding date objects.
date_fields = [
    "created",
    "deposited",
    "indexed",
    "published-online",
    "issued",
]

In [7]:
# Empty frame with one row per entry of the articles' DOI column and one
# column per metadata field to be filled in from the Crossref responses.
metadata_columns = direct_fields + transform_fields + date_fields
df = pd.DataFrame(columns=metadata_columns, index=articles.DOI.tolist())

In [18]:
# Preview the first five rows of the metadata frame.
df.head(5)

Unnamed: 0,ISSN,container-title,publisher,is-referenced-by-count,references-count,subject,authors_count,created,deposited,indexed,published-online,issued
10.2144/04365st01,"[0736-6205, 1940-9818]",[BioTechniques],Future Science Ltd,42.0,12.0,,6.0,2018.0,2019.0,2020.0,,2004.0
10.1002/jcb.10712,"[0730-2312, 1097-4644]",[Journal of Cellular Biochemistry],Wiley,11.0,25.0,"[Cell Biology, Biochemistry, Molecular Biology]",7.0,2003.0,2018.0,2020.0,2003.0,2003.0
10.1081/cnv-120025093,0735-7907,Cancer Investigation,Informa UK Limited,12.0,157.0,,,,,,,
10.1097/00129492-200409000-00027,,,,,,,,,,,,
10.1097/00129492-200307000-00022,,,,,,,,,,,,


In [20]:
# Copy the fields of interest from every Crossref response into `df`,
# addressing rows by the DOI reported in the response.
for r in tqdm(results, total=len(results)):
    # Deliberately NOT named `json`: that would shadow the `json` module imported
    # at the top and break the json.dump / json.load cells on any later re-run.
    message = r['message']
    doi = message['DOI']

    # Values are stored via str(); in the displayed output list-valued fields
    # such as ISSN appear as their string representation.
    for direct_f in direct_fields:
        if direct_f in message:
            df.loc[doi, direct_f] = str(message[direct_f])

    # authors: only the number of listed authors is kept
    if 'author' in message:
        df.loc[doi, 'authors_count'] = len(message['author'])

    # dates: keep the first element of the first date-part
    # (the year in Crossref's [[year, month, day]] layout)
    for date_f in date_fields:
        if date_f in message:
            try:
                df.loc[doi, date_f] = message[date_f]['date-parts'][0][0]
            except (KeyError, IndexError, TypeError):
                # Malformed or empty date object: report it and move on.
                # Only lookup failures are caught, so KeyboardInterrupt and
                # unrelated errors still propagate.
                print(date_f, message[date_f])

HBox(children=(FloatProgress(value=0.0, max=8595.0), HTML(value='')))




In [24]:
# Keep exactly one metadata row per DOI. The index was built from the articles'
# DOI column, which repeats a DOI when it belongs to several grants.
# `drop_duplicates()` is not used here because it compares column values and
# ignores the index, so it would also collapse *different* DOIs that happen to
# share identical metadata (including every row that is entirely empty).
df = df[~df.index.duplicated(keep="first")]

In [25]:
# Map of Crossref field names to the column names used in the rest of the analysis.
column_renames = {
    'container-title': 'journal_name',
    'is-referenced-by-count': 'coci_citations',
    'references-count': 'references',
    'subject': 'cr_subject',
}
# Turn numeric zeros into missing values, then apply the friendlier column names.
df = df.replace(to_replace=0, value=np.nan).rename(columns=column_renames)

In [29]:
# Number of rows in the article table.
articles.shape[0]

8595

Unnamed: 0,reference_id,grant_id,program,score,DOI,pmid
0,0,NF000002,NFRP,0.442644,10.2144/04365st01,10.2144/04365ST01
1,2,NF000010,NFRP,0.433704,10.1002/jcb.10712,10.1002/JCB.10712
2,3,NF000014,NFRP,0.498568,10.1081/cnv-120025093,10.1081/CNV-120025093
3,4,NF000015,NFRP,0.937954,10.1097/00129492-200409000-00027,10.1097/00129492-200409000-00027
4,5,NF000015,NFRP,0.531327,10.1097/00129492-200307000-00022,10.1097/00129492-200307000-00022
...,...,...,...,...,...,...
8590,9520,OR150169,PRORP,0.932630,10.1089/wound.2017.0737,10.1089/WOUND.2017.0737
8591,9521,OR150169,PRORP,0.940352,10.1089/wound.2016.0719,10.1089/WOUND.2016.0719
8592,9522,OR160120,PRORP,0.790298,10.1115/1.4039342,10.1115/1.4039342
8593,9523,OR160120,PRORP,0.939744,10.1109/tnsre.2018.2848845,10.1109/TNSRE.2018.2848845


In [28]:
# Attach the Crossref metadata to each article row: the articles' DOI column is
# matched against the DOI index of `df` (default inner join).
pd.merge(articles, df, left_on="DOI", right_index=True)

Unnamed: 0,reference_id,grant_id,program,score,DOI,ISSN,journal_name,publisher,coci_citations,references,cr_subject,authors_count,created,deposited,indexed,published-online,issued
0,0,NF000002,NFRP,0.442644,10.2144/04365st01,"['0736-6205', '1940-9818']",['BioTechniques'],Future Science Ltd,42,12,,6,2018,2019,2020,,2004
1,2,NF000010,NFRP,0.433704,10.1002/jcb.10712,"['0730-2312', '1097-4644']",['Journal of Cellular Biochemistry'],Wiley,11,25,"['Cell Biology', 'Biochemistry', 'Molecular Bi...",7,2003,2018,2020,2003,2003
2,3,NF000014,NFRP,0.498568,10.1081/cnv-120025093,"['0735-7907', '1532-4192']",['Cancer Investigation'],Informa UK Limited,12,157,"['Cancer Research', 'Oncology', 'General Medic...",1,2003,2020,2020,2003,2003
592,616,NF990031,NFRP,0.855856,10.1081/cnv-120025093,"['0735-7907', '1532-4192']",['Cancer Investigation'],Informa UK Limited,12,157,"['Cancer Research', 'Oncology', 'General Medic...",1,2003,2020,2020,2003,2003
3,4,NF000015,NFRP,0.937954,10.1097/00129492-200409000-00027,['1531-7129'],['Otology & Neurotology'],Ovid Technologies (Wolters Kluwer Health),56,28,"['Sensory Systems', 'Otorhinolaryngology', 'Cl...",4,2004,2018,2020,,2004
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8590,9520,OR150169,PRORP,0.932630,10.1089/wound.2017.0737,"['2162-1918', '2162-1934']",['Advances in Wound Care'],Mary Ann Liebert Inc,2,21,,12,2017,2018,2020,,2017
8591,9521,OR150169,PRORP,0.940352,10.1089/wound.2016.0719,"['2162-1918', '2162-1934']",['Advances in Wound Care'],Mary Ann Liebert Inc,7,14,,6,2017,2018,2020,,2017
8592,9522,OR160120,PRORP,0.790298,10.1115/1.4039342,"['1942-4302', '1942-4310']",['Journal of Mechanisms and Robotics'],ASME International,0,11,,5,2018,2019,2020,2018,2018
8593,9523,OR160120,PRORP,0.939744,10.1109/tnsre.2018.2848845,"['1534-4320', '1558-0210']",['IEEE Transactions on Neural Systems and Reha...,Institute of Electrical and Electronics Engine...,1,0,,2,2018,2018,2020,,2018
