Last updated: July 5, 2023

Last run: June 2, 2023

**Data filtering and augmenting with dates from Crossref and ArXiv**

## Unreviewed science in the news: The evolution of preprint media coverage from 2014-2021


Juan Pablo Alperin

**Related Publication:**
Fleerackers, A., Shores, K., Chtena, N. & Alperin, J.P. (2023). Unreviewed science in the news: The evolution of preprint media coverage from 2014-2021. *bioRxiv*.

**Related Dataset:**
Alperin, Juan Pablo; Fleerackers; Shores, 2023, "Data for: Unreviewed science in the news", https://doi.org/10.7910/DVN/ZHQUFD, *Harvard Dataverse*

N.B. Some of the code below may not run perfectly. Requests to the APIs would sometimes time out or stop, so some restarting and merging of partial results was necessary at the time. The code below should be the final working version, but your mileage may vary if you try to reuse it as-is. Also note that API responses will change on future runs.

In [3]:
import numpy as np
import pandas as pd
import time

from habanero import Crossref
# Shared Crossref client; get_crossref() below calls cr.works() on it.
cr = Crossref()

import arxiv
# NOTE(review): this Client instance is not bound to a name, and get_arxiv()
# below queries through arxiv.Search(...).results(), so these page-size/delay/
# retry settings presumably have no effect -- confirm against the installed
# version of the arxiv package.
arxiv.Client(
  page_size = 100,
  delay_seconds = 3,
  num_retries = 3
)

from tqdm.auto import tqdm
# Enables the .progress_apply() calls used by the Crossref cells below.
tqdm.pandas()

import json

# When True, the arXiv cell fetches ids that are missing from its cache and the
# Crossref DOI cell re-queries DOIs whose stored response is 'JSONDecodeError'.
RETRY_QUERIES = False

In [2]:
# Load the mention-level table that the remaining cells augment with dates.
input_file = 'data/top outlets/preprints_mention_details.csv'
df = pd.read_csv(filepath_or_buffer=input_file)

In [23]:
def get_crossref(doi):
    """Fetch the Crossref work record for ``doi`` as a JSON string.

    Returns the ``message`` part of the response serialised with
    ``json.dumps``. If the lookup raises any ``Exception``, the DOI and the
    exception's class name are printed and the class name is returned instead.
    ``KeyboardInterrupt`` is not an ``Exception`` subclass, so it propagates.
    """
    try:
        response = cr.works(ids=doi)
        payload = json.dumps(response['message'])
    except Exception as err:
        error_name = type(err).__name__
        print(doi, error_name)
        return error_name
    return payload
    
def get_arxiv(arxiv_id):
    """Look up a single arXiv record and return the fields used downstream.

    On success returns a dict with the keys ``arxiv_title``,
    ``arxiv_published_date``, ``arxiv_update_date`` and ``doi`` taken from the
    first search result. If anything raises an ``Exception`` (including an
    empty result set), the id and the exception's class name are printed and
    the class name is returned as a string. ``KeyboardInterrupt`` propagates.
    """
    try:
        query = arxiv.Search(id_list=[arxiv_id])
        paper = next(query.results())
        return {
            'arxiv_title': paper.title,
            'arxiv_published_date': paper.published,
            'arxiv_update_date': paper.updated,
            'doi': paper.doi,
        }
    except Exception as err:
        error_name = type(err).__name__
        print(arxiv_id, error_name)
        return error_name
    
def parse_crossref(cr):
    """Extract the fields this notebook uses from one Crossref work record.

    Parameters:
        cr: decoded Crossref ``message`` dict. (Inside this function the name
            shadows the module-level Crossref client of the same name.)

    Returns:
        dict with, in this order:
        ``crossref_create_date``  - ``cr['created']['date-time']``;
        ``crossref_publish_date`` - ``'Y-M-D'`` built from the first entry of
            ``cr['published']['date-parts']``, or None unless that entry has
            exactly three parts;
        ``crossref_title``        - first element of ``cr['title']``, or None;
        ``crossref_published_doi``- id of the first ``is-preprint-of``
            relation, or None.

    Raises:
        KeyError / TypeError: if ``cr`` has no ``created.date-time`` entry.
        The creation date is treated as mandatory, so this is not swallowed.
    """
    # What a failed lookup can raise: missing key, empty list, or a None/str
    # sitting where a container was expected.
    lookup_errors = (KeyError, IndexError, TypeError)

    ret = {}

    # Mandatory field: let the lookup error reach the caller.
    ret['crossref_create_date'] = cr['created']['date-time']

    try:
        publish_date = cr['published']['date-parts'][0]
        # Explicit length check instead of `assert`, which is removed under
        # `python -O` and would then let partial dates (year only) through.
        if len(publish_date) == 3:
            ret['crossref_publish_date'] = '-'.join(map(str, publish_date))
        else:
            ret['crossref_publish_date'] = None
    except lookup_errors:
        ret['crossref_publish_date'] = None

    try:
        ret['crossref_title'] = cr['title'][0]
    except lookup_errors:
        ret['crossref_title'] = None

    try:
        ret['crossref_published_doi'] = cr['relation']['is-preprint-of'][0]['id']
    except lookup_errors:
        ret['crossref_published_doi'] = None

    return ret

In [24]:
## Cached arXiv API responses are read from the 'API responses' folder.
## NOTE(review): the original comment here was cut off ("API responses folder is
## not"); presumably that folder is not distributed with the notebook -- confirm.
try:
    arxivs = pd.read_csv('data/top outlets/API responses/arxivs_full.csv', index_col='arxiv_id')
except (OSError, ValueError) as err:
    # Missing or unreadable cache. Start from an empty table with the columns
    # get_arxiv() returns, so the rename below (and later merges) still work
    # when RETRY_QUERIES is False instead of failing on an undefined name.
    print('No usable arXiv cache:', type(err).__name__)
    arxivs = pd.DataFrame(columns=['arxiv_title', 'arxiv_published_date', 'arxiv_update_date', 'doi'])
    arxivs.index.name = 'arxiv_id'


if RETRY_QUERIES:
    wanted_ids = df[df.arxiv_id.notnull()].arxiv_id.drop_duplicates().to_list()
    # Only query ids that are not in the cache yet (input order is kept).
    cached_ids = set(arxivs.index)
    arxiv_ids = [arxiv_id for arxiv_id in wanted_ids if arxiv_id not in cached_ids]

    fetched = []
    for arxiv_id in tqdm(arxiv_ids):
        paper = get_arxiv(arxiv_id)
        # get_arxiv() returns the error's class name (a str) on failure and has
        # already printed it; skip those ids so a later run can retry them.
        if isinstance(paper, dict):
            fetched.append(pd.DataFrame(paper, index=[arxiv_id]))
        time.sleep(3)

    if fetched:
        # One concat instead of DataFrame.append per row (append no longer
        # exists in pandas 2.x and copied the whole table on every call).
        frames = fetched if arxivs.empty else [arxivs] + fetched
        arxivs = pd.concat(frames)
        # Name the index so the saved file can be read back with index_col='arxiv_id'.
        arxivs.index.name = 'arxiv_id'

    # NOTE(review): written one folder above where the cache is read from;
    # presumably the file was moved into 'API responses/' by hand -- confirm.
    arxivs.to_csv('data/top outlets/arxivs_full.csv')

arxivs.rename(columns={'doi': 'arxiv_published_doi'}, inplace=True)

In [25]:
# Attach the arXiv fields to every mention; `arxivs` is matched on its index.
df = pd.merge(df, arxivs, how='left', left_on='arxiv_id', right_index=True)

In [26]:
# Crossref records for the DOIs that arXiv lists as the published version,
# one row per arXiv id.
# NOTE(review): the cache is read from 'API responses/' but a freshly built
# table is written one folder up; presumably files were moved by hand -- confirm.
try:
    arxiv_dois = pd.read_csv('data/top outlets/API responses/arxiv_doi_responses.csv', index_col='arxiv_id')
except (OSError, ValueError):
    # No usable cache: query Crossref once per distinct published DOI.
    arxiv_dois = arxivs[arxivs.arxiv_published_doi.notnull()].arxiv_published_doi.drop_duplicates().to_frame()
    arxiv_dois['crossref_response'] = arxiv_dois.arxiv_published_doi.progress_apply(get_crossref)
    arxiv_dois.to_csv('data/top outlets/arxiv_doi_responses.csv')

# A successful response is a JSON object, so it starts with '{'. Everything else
# is an error marker: the quoted '"JSONDecodeError"' / '"TypeError"' entries the
# original filter matched, but also the bare class names get_crossref() returns,
# which json.loads would choke on. Drop them all before decoding.
is_record = arxiv_dois.crossref_response.astype(str).str.startswith('{')
# .copy() so the assignment below writes to its own table, not a slice.
arxiv_dois = arxiv_dois[is_record].copy()
arxiv_dois['crossref_response'] = arxiv_dois.crossref_response.map(json.loads)



In [27]:
# Remove columns left behind by an earlier run of this cell before re-merging.
stale_columns = [name for name in df.columns if name.startswith('arxiv_published_doi_')]
df.drop(columns=stale_columns, inplace=True)

# One row of parsed Crossref fields per arXiv id, keeping arxiv_dois' index.
parsed = arxiv_dois.crossref_response.map(parse_crossref)
df2 = pd.json_normalize(parsed).set_index(parsed.index)
# these are the DOIs from arxiv, so shouldn't be a preprint of anything else
df2 = df2.drop(columns='crossref_published_doi')
df2 = df2.add_prefix('arxiv_published_doi_')

df = df.merge(df2, how="left", left_on='arxiv_id', right_index=True)
del df2, parsed, stale_columns

In [28]:
# Crossref records for each mention's own DOI, keyed by DOI.
# NOTE(review): the cache is read from 'API responses/' but written one folder
# up; presumably files were moved by hand -- confirm.
try:
    dois = pd.read_csv('data/top outlets/API responses/crossref_responses.csv', index_col='doi')
except (OSError, ValueError):
    # No usable cache: query Crossref once per distinct DOI.
    dois = df[df.doi.notnull()].doi.drop_duplicates().to_frame()
    dois['crossref_response'] = dois.doi.progress_apply(get_crossref)
    # Key by DOI so a freshly built table has the same layout as the cached
    # one; the merges on this table match DOIs against its index.
    dois.set_index('doi', inplace=True)
    dois.to_csv('data/top outlets/crossref_responses.csv')

if RETRY_QUERIES:
    # Re-query only the DOIs whose stored response is a JSON decoding failure.
    dois.reset_index(inplace=True)
    needs_retry = dois.crossref_response == 'JSONDecodeError'
    dois.loc[needs_retry, 'crossref_response'] = dois.loc[needs_retry, 'doi'].progress_apply(get_crossref)
    dois.set_index('doi', inplace=True)
    dois.to_csv('data/top outlets/crossref_responses.csv')


def _decode_crossref_response(raw):
    """Decode one stored response; return it unchanged if it is not JSON.

    get_crossref() stores a bare exception class name for failed lookups,
    which is not valid JSON. Keeping it as a plain string lets later cells
    tell error rows (str) apart from decoded records (dict).
    """
    try:
        return json.loads(raw)
    except (TypeError, ValueError):
        return raw


# .copy() so the assignment below writes to its own table, not a slice.
dois = dois[dois.crossref_response != 'JSONDecodeError'].copy()
dois['crossref_response'] = dois.crossref_response.map(_decode_crossref_response)



In [29]:
# Remove the Crossref columns left behind by an earlier run before re-merging.
stale_columns = [c for c in df.columns if c.startswith('crossref')]
df.drop(columns=stale_columns, inplace=True)
del stale_columns

# NOTE(review): this cell used to evaluate
#     df.merge(dois, how='left', left_on='doi', right_index=True)
# under the comment "add in the DOI error messages", but the result was never
# assigned, so it did not change df. The dead call is dropped here. If the raw
# responses are wanted as a column, assign that merge back to df.

# add in the DOI fields (only rows whose response decoded to a record;
# error markers are strings and are skipped)
is_record = dois.crossref_response.map(lambda response: isinstance(response, dict))
df2 = dois.crossref_response[is_record].map(parse_crossref)
df2 = pd.json_normalize(df2).set_index(df2.index)
df = df.merge(df2, how='left', left_on='doi', right_index=True)
del df2

In [30]:
# Crossref records for the DOIs that Crossref lists as the published version of
# each preprint, keyed by that DOI.
# NOTE(review): the cache is read from 'API responses/' but written one folder
# up; presumably files were moved by hand -- confirm.
try:
    cr_published_dois = pd.read_csv('data/top outlets/API responses/cr_published_doi_responses.csv', index_col='crossref_published_doi')
except (OSError, ValueError):
    # No usable cache: query Crossref once per distinct published DOI.
    cr_published_dois = df[df.crossref_published_doi.notnull()].crossref_published_doi.drop_duplicates().to_frame()
    cr_published_dois['crossref_response'] = cr_published_dois.crossref_published_doi.progress_apply(get_crossref)
    cr_published_dois.set_index('crossref_published_doi', inplace=True)
    cr_published_dois.to_csv('data/top outlets/cr_published_doi_responses.csv')

# A successful response is a JSON object, so it starts with '{'. Anything else is
# an error marker from get_crossref() (e.g. 'JSONDecodeError'); json.loads or
# parse_crossref() would fail on those, so drop them before decoding.
is_record = cr_published_dois.crossref_response.astype(str).str.startswith('{')
# .copy() so the assignment below writes to its own table, not a slice.
cr_published_dois = cr_published_dois[is_record].copy()
cr_published_dois['crossref_response'] = cr_published_dois.crossref_response.map(json.loads)



In [31]:
# Remove columns left behind by an earlier run of this cell before re-merging.
stale_columns = [name for name in df.columns if name.startswith('crossref_published_doi_')]
df.drop(columns=stale_columns, inplace=True)

# One row of parsed Crossref fields per published DOI, keeping the table's index.
parsed = cr_published_dois.crossref_response.map(parse_crossref)
df2 = pd.json_normalize(parsed).set_index(parsed.index)
# The 'is-preprint-of' DOI of these published records is not used downstream.
df2 = df2.drop(columns='crossref_published_doi')
# s[9:] cuts the first nine characters (the 'crossref_' prefix that
# parse_crossref() puts on its keys) before the new prefix is added.
df2 = df2.rename(columns=lambda s: 'crossref_published_doi_%s' % s[9:])

df = df.merge(df2, how="left", left_on='crossref_published_doi', right_index=True)
del df2, parsed, stale_columns

In [32]:
# Columns to treat as timestamps: the two mention dates plus every column whose
# name ends in 'date' (which also matches 'pubdate').
dates = ['first_seen_on', 'posted_on'] + [c for c in df.columns if c.endswith('date')]

for d in dates:
    # Replace the column outright instead of writing through df.loc[:, d]:
    # since pandas 2.0 the .loc form sets values in place, so an object column
    # would keep its object dtype. Unparseable values become NaT.
    df[d] = pd.to_datetime(df[d], errors='coerce')

### code below assigns the best guess as to when a preprint was published

In [33]:
def best_preprint_pub_date(row):
    """Return the best guess for when the preprint in ``row`` was posted.

    Parameters:
        row: mapping (e.g. a DataFrame row) with the keys ``server``,
            ``arxiv_published_date``, ``crossref_create_date``,
            ``crossref_publish_date``, ``pubdate`` and ``first_seen_on``.

    Returns:
        * ``first_seen_on`` when none of ``arxiv_published_date``,
          ``crossref_create_date`` and ``pubdate`` is set (last resort);
        * for ``arxiv``: ``arxiv_published_date`` as is;
        * for ``ssrn``: the earlier of ``crossref_create_date`` / ``pubdate``;
        * for ``biorxiv`` / ``medrxiv``: the earlier of
          ``crossref_create_date`` / ``crossref_publish_date``;
        * ``first_seen_on`` when the columns for the row's server are all
          null (this case used to raise ValueError from ``min([])``);
        * None for any other server.
    """
    def present(*columns):
        # Values of the given columns that are not null/NaT.
        return [row[c] for c in columns if pd.notnull(row[c])]

    if not present('arxiv_published_date', 'crossref_create_date', 'pubdate'):
        # only use this as a last resort
        return row['first_seen_on']

    server = row['server']
    if server == 'arxiv':
        return row['arxiv_published_date']

    if server == 'ssrn':
        candidates = present('crossref_create_date', 'pubdate')
    elif server in ('biorxiv', 'medrxiv'):
        candidates = present('crossref_create_date', 'crossref_publish_date')
    else:
        # Unchanged from the original: unrecognised servers get no date.
        return None

    # Some date exists, but none of the ones relevant to this server: fall back
    # to the last-resort date rather than calling min() on an empty list.
    return min(candidates) if candidates else row['first_seen_on']

# timezone fixing: re-parse every date column as UTC so the columns can be
# compared with each other
df = df.assign(**{column: pd.to_datetime(df[column], utc=True) for column in dates})

df.loc[:, 'best_preprint_pub_date'] = df.apply(best_preprint_pub_date, axis='columns')

### code below assigns the best guess as to when the peer-reviewed version was published

In [34]:
def best_published_pub_date(row):
    """Return the best guess for when the peer-reviewed version was published.

    Parameters:
        row: mapping (e.g. a DataFrame row) with the keys
            ``arxiv_published_doi_crossref_create_date`` and
            ``crossref_published_doi_create_date``.

    Returns:
        The first of those two values that is not null, in that order, or
        None when both are null.
    """
    # The date of the DOI reported by arXiv is preferred over the one reached
    # through Crossref's is-preprint-of relation. This keeps the precedence of
    # the original if/elif; the min() over both dates that followed it could
    # never run with a non-empty list and has been removed.
    preferred_order = (
        'arxiv_published_doi_crossref_create_date',
        'crossref_published_doi_create_date',
    )
    for column in preferred_order:
        if pd.notnull(row[column]):
            return row[column]
    return None

# Row-wise best guess for the publication date of the peer-reviewed version.
df.loc[:, 'best_published_pub_date'] = df.apply(best_published_pub_date, axis='columns')


In [35]:
def _whole_days(delta):
    """Return the ``days`` attribute of one time difference."""
    return delta.days


# Days elapsed between each publication-date estimate and the mention's
# posted_on date.
for target, reference in (
    ('days_since_preprint', 'best_preprint_pub_date'),
    ('days_since_publication', 'best_published_pub_date'),
):
    df.loc[:, target] = (df['posted_on'] - df[reference]).map(_whole_days)

In [36]:
# Write the augmented table next to the input file, without the row index.
output_file = 'data/top outlets/preprints_mention_details_with_dates.csv'
df.to_csv(output_file, index=False)