<a href="https://colab.research.google.com/github/jlee2843/Peer-Review-22-23/blob/Albert-2022/API_Biorvix.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import sys
print("version:", sys.version)

In [None]:
from google.colab import drive

drive.mount('/content/gdrive')

**bioRxiv API** information is found [here](https://api.biorxiv.org/details/medrxiv/help).<br>
**Final edit:** December 02, 2022.<br>
**NB:** Using Jenny's notebook as a template.
<br>
tqdm is used to visualize the progress of the data processing. (Its manual can be found at https://github.com/tqdm/tqdm#manual)

# Common functions

In [None]:
from pandas.core.internals.ops import Iterator
#from tqdm.notebook import tqdm, trange
from typing import Union, List
from concurrent.futures import ThreadPoolExecutor
from datetime import datetime
from threading import current_thread
import math
import pandas as pd
import time
import numpy as np
import requests
import json
import urllib, urllib.request
import pathlib
import tqdm.contrib.concurrent as tq

# common helper function
def get_total(url) -> int:
    """Return the record count the API reports for ``url``.

    The count is read from the ``total`` field of the first entry in the
    ``messages`` list of the JSON document returned by ``get_json_data``.
    """
    first_message = get_json_data(url)["messages"][0]
    return first_message["total"]

def get_json_data(url, retries: int = 1, wait: float = 120, timeout: float = 60):
    """Download ``url`` and return its body decoded from JSON.

    A failed request (connection error, timeout) or a body that is not valid
    JSON is retried after sleeping ``wait`` seconds. Once ``retries`` extra
    attempts have been used up, the last error propagates to the caller.
    The defaults keep the previous policy: one retry after 120 seconds.

    Args:
        url: Address to fetch.
        retries: Number of additional attempts after the first failure.
        wait: Seconds to sleep before each retry.
        timeout: Seconds ``requests`` may wait on the server; without it a
            stalled connection would block a worker thread indefinitely.

    Returns:
        The decoded JSON document.

    Raises:
        requests.RequestException: The final attempt could not be completed.
        ValueError: The final response body was not valid JSON.
    """
    attempts_left = retries
    while True:
        try:
            response = requests.get(url, timeout=timeout)
            return json.loads(response.text)
        except (requests.RequestException, ValueError):
            if attempts_left <= 0:
                raise
            attempts_left -= 1
            time.sleep(wait)

def get_data(url:str, loop_range:range, disable:bool):
    """Fetch ``{url}/{cursor}`` for every cursor in ``loop_range`` on a thread pool.

    Args:
        url: Prefix shared by every request.
        loop_range: Sized iterable of values appended to ``url``, one request each.
        disable: Forwarded to tqdm; ``True`` hides the progress bar.

    Returns:
        Whatever ``tqdm.contrib.concurrent.thread_map`` returns for
        ``get_json_data`` over the generated URLs.
    """
    cursor_urls = (f'{url}/{cursor}' for cursor in loop_range)
    return tq.thread_map(get_json_data, cursor_urls,
                         desc=f'get_json_data {current_thread().name}',
                         total=len(loop_range),
                         disable=disable, leave=False)

#def get_article_detail(url, articles):
#    result = get_data(url,articles)
#
#    return result

def process_data(json_info, keys:List[str], cursor:int, disable:bool) -> List:
    """Convert one API response into a list of rows.

    Each entry of ``json_info["collection"]`` becomes one row. The first
    element of a row is ``cursor`` plus the entry's position in the
    collection; the remaining elements are the entry's values for ``keys``,
    looked up with ``getValue``.

    When ``disable`` is ``False`` the function sleeps 1 ms per row so that
    the progress bar stays visible.
    """
    rows = []
    for offset, journal in enumerate(json_info["collection"]):
        row = [offset + cursor]
        row.extend(getValue(journal, key) for key in keys)
        rows.append(row)

    if disable is False:
        time.sleep(0.001 * len(rows))
    return rows

def query_to_df(results, keys:List[str], col_names:List[str], loop_list:range, disable:bool) -> pd.DataFrame:
    """Build a DataFrame from a batch of API responses.

    Each response in ``results`` is paired with the cursor at the same
    position in ``loop_list`` (pairing stops at the shorter of the two) and
    turned into rows by ``process_data`` on a thread pool. The rows of all
    responses are merged and sorted by ``flatten`` and handed to
    ``create_df``, which uses the first element of each row as the index and
    ``col_names`` as the column labels.
    """
    jobs = [(payload, keys, cursor, disable)
            for payload, cursor in zip(results, loop_list)]
    rows_per_response = tq.thread_map(lambda job: process_data(*job), jobs,
                                      desc=f'processing data {current_thread().name}',
                                      total=len(jobs), leave=False, disable=disable)

    table = np.array(flatten(rows_per_response))
    return create_df(table, col_names)

# prepublish helper function
def _count_authors(authors) -> int:
    """Count the non-empty ``;``-separated names in an author string.

    Empty segments are skipped so that a trailing separator
    (``"A. Author;B. Author;"``) does not count as an extra author.
    Non-string values count as zero authors.
    """
    if not isinstance(authors, str):
        return 0
    return sum(1 for name in authors.split(';') if name.strip())


def _parse_date_column(column: pd.Series) -> pd.Series:
    """Parse a column of API date strings; unparseable values become NaT."""
    return pd.to_datetime(column.map(convert_date), errors='coerce')


def _report_format_error(error: Exception) -> None:
    """Print the error summary followed by the full traceback."""
    import traceback  # only needed on the error path

    print(f'Error in data format:{error.args}\n')
    print(''.join(traceback.format_exception(type(error), error, error.__traceback__)))


def create_prepublish_df(df:pd.DataFrame) -> pd.DataFrame:
    """Normalise the columns of a preprint ("details") table in place.

    Adds ``Num_of_Authors``, strips whitespace from the text columns,
    upper-cases ``Institution``, lower-cases ``Type``, title-cases
    ``Category`` (all three stored as categoricals) and parses ``Date``.
    Any other columns are left exactly as loaded.

    The conversion is best-effort: if a step fails, the error and its
    traceback are printed and ``df`` is returned with the steps completed so
    far.

    Returns:
        The same ``df`` object that was passed in.
    """
    try:
        df['Num_of_Authors'] = df['Authors'].apply(_count_authors)
        df['DOI'] = df['DOI'].astype('str')
        df['Title'] = df['Title'].astype('str').str.strip()
        df['Authors'] = df['Authors'].astype('str').str.replace("'", '', regex=False).str.strip()
        df['Corresponding_Authors'] = df['Corresponding_Authors'].astype('str').str.strip()
        df['Institution'] = df['Institution'].str.strip().str.upper().astype('category')
        df['Date'] = _parse_date_column(df['Date'])
        df['Type'] = df['Type'].str.strip().str.lower().astype('category')
        df['Category'] = df['Category'].str.strip().str.title().astype('category')
    except Exception as e:
        _report_format_error(e)

    return df

# helper functions for published articles
# pub = ["preprint_doi", "published_doi", "preprint_title", "preprint_authors", "preprint_author_corresponding", "preprint_author_corresponding_institution", "preprint_category", "published_journal", "preprint_date", "published_date"]
def create_published_df(df:pd.DataFrame) ->pd.DataFrame:
    """Normalise the columns of a published-articles ("pubs") table in place.

    Adds ``Num_of_Authors``, strips whitespace from the text columns,
    upper-cases ``Institution``, title-cases ``Category`` and ``Journal``
    (all three stored as categoricals) and parses ``Preprint_Date`` and
    ``Published_Date``.

    The conversion is best-effort: if a step fails, the error and its
    traceback are printed and ``df`` is returned with the steps completed so
    far.

    Returns:
        The same ``df`` object that was passed in.
    """
    try:
        df['Num_of_Authors'] = df['Authors'].apply(_count_authors)
        df['DOI'] = df['DOI'].astype('str')
        df['pub_DOI'] = df['pub_DOI'].astype('str')
        df['Title'] = df['Title'].astype('str').str.strip()
        df['Authors'] = df['Authors'].astype('str').str.replace("'", '', regex=False).str.strip()
        df['Corresponding_Authors'] = df['Corresponding_Authors'].astype('str').str.strip()
        df['Institution'] = df['Institution'].str.strip().str.upper().astype('category')
        df['Category'] = df['Category'].str.strip().str.title().astype('category')
        df['Journal'] = df['Journal'].astype('str').str.strip().str.title().astype('category')
        df['Preprint_Date'] = _parse_date_column(df['Preprint_Date'])
        df['Published_Date'] = _parse_date_column(df['Published_Date'])
    except Exception as e:
        _report_format_error(e)

    return df

def convert_date(value:str) -> datetime:
    """Parse a ``YYYY-MM-DD`` date string, ignoring anything after a ``:``.

    Values such as ``'2018-08-30:e12744'`` are cut at the first colon before
    parsing. A value that is already a ``datetime`` is returned unchanged, so
    running the clean-up twice on the same table does not erase its dates.

    Returns:
        The parsed ``datetime``, or ``pd.NaT`` (after printing the reason)
        when the value cannot be parsed.
    """
    if isinstance(value, datetime):
        return value
    try:
        return datetime.strptime(value.strip().split(':')[0], '%Y-%m-%d')
    except (AttributeError, TypeError, ValueError) as e:
        print(e)
        return pd.NaT

# Now you can use `progress_apply` instead of `apply`
# and `progress_map` instead of `map`
        #df.Published_Date = datetime.strptime(df.Published_Date.str.strip().str.split(':')[0], '%Y-%m-%d')

def getValue(journal, key):
    """Return ``journal[key]``, or NaN when the lookup fails.

    A failed lookup is reported on stdout and replaced by ``np.nan`` so that
    one incomplete record does not abort the whole batch. (The earlier
    version looked as if it re-raised the error, but its ``return`` inside
    ``finally`` discarded the exception, so NaN was always the effective
    result.)
    """
    try:
        return journal[key]
    except (KeyError, IndexError, TypeError) as e:
        print (f'key: {key} journal: {journal}\n{e}')
        return np.nan
  
def get_big_data(path:str, url:str, cursor:int, json_keys:List[str], col_names:List[str], step:int, disable:bool):
    """Download the page at ``{url}/{cursor}`` and store it as one parquet file.

    The page is converted with ``query_to_df`` and written to
    ``{path}/{cursor zero-padded to 9 digits}.parquet``. Naming the file
    after the cursor (instead of the current time) means two worker threads
    can never pick the same name, and re-fetching a page overwrites it
    rather than adding a duplicate.

    Args:
        path: Existing directory that receives the parquet file.
        url: Query URL without the trailing cursor.
        cursor: Offset of the page to download.
        json_keys: Keys to extract from every record.
        col_names: Column labels, in the same order as ``json_keys``.
        step: Page size; only used to build the one-element cursor range.
        disable: ``True`` hides the inner progress bars.
    """
    page = get_json_data(f'{url}/{cursor}')
    df = query_to_df([page], json_keys, col_names, range(cursor, cursor + step, step), disable)
    df.to_parquet(pathlib.Path(path) / f'{cursor:09d}.parquet')

def multithread_processor(path:str, url:str, json_keys:List[str], col_names:List[str], step:int, loop_range:range, disable:bool):
    """Run ``get_big_data`` once per cursor in ``loop_range`` on a thread pool.

    The cursor values and the complete argument list are printed before the
    downloads start. Every job shares ``path``, ``url``, ``json_keys``,
    ``col_names``, ``step`` and ``disable`` and differs only in its cursor.
    """
    print(f"values: {list(loop_range)}")
    jobs = [(path, url, offset, json_keys, col_names, step, disable)
            for offset in loop_range]
    print(f'args: {len(jobs)}\n{jobs}')
    tq.thread_map(lambda job: get_big_data(*job), jobs,
                  desc='get_big_data', total=len(jobs))

def process_doi_data(path:str, url:str, doi:pd.Series, meta: List, col_names: List, item:int, disable:bool = False, chunk:Union[int, None] = None):
    """Fetch one chunk of DOIs and store the result as one parquet file.

    The DOIs ``doi[item:item + chunk]`` are requested as ``{url}/{doi}`` via
    ``get_data``, converted with ``query_to_df`` and written to
    ``{path}/{item zero-padded to 9 digits}.parquet``. Naming the file after
    the chunk offset means two worker threads can never pick the same name.

    Args:
        path: Existing directory that receives the parquet file.
        url: Query URL without the trailing DOI.
        doi: DOIs to look up; sliced by position.
        meta: Keys to extract from every record.
        col_names: Column labels, in the same order as ``meta``.
        item: Offset of the first DOI of this chunk.
        disable: ``True`` hides the inner progress bars.
        chunk: Number of DOIs per chunk. ``None`` (the default) falls back
            to the notebook-level ``step`` variable, which is what this
            function always used before the parameter existed.
    """
    size = step if chunk is None else chunk
    responses = list(get_data(url, doi[item:item + size], disable))
    # NOTE(review): the k-th response is given cursor ``item + k * size``, so
    # row labels of neighbouring chunks can coincide — confirm whether
    # ``item + k`` was intended.
    df = query_to_df(responses, meta, col_names,
                     range(item, item + (len(responses) * size), size), disable)
    df.to_parquet(pathlib.Path(path) / f'{item:09d}.parquet')

def freq_count(x, y):
    """Return how often each distinct value occurs in column ``y`` of ``x``."""
    column = x[y]
    return column.value_counts()

def flatten(y):
    """Merge a list of row lists into one list ordered by each row's first element."""
    merged = []
    for chunk in y:
        merged.extend(chunk)
    # list.sort is stable, like sorted(): rows with equal keys keep their order.
    merged.sort(key=lambda row: row[0])
    return merged

def create_df(x, y):
    """Build a DataFrame from a 2-D array: column 0 is the index, the rest is data labelled ``y``."""
    row_labels = x[:, 0]
    cell_values = x[:, 1:]
    return pd.DataFrame(data=cell_values, index=row_labels, columns=y)


## Pre-publish Data query for given timeframe

In [None]:
# "global" variables
# Settings shared by the pre-publish query cells that follow.
base_url: str = 'https://api.biorxiv.org'  # root of the API
query_type: str = 'details'                # endpoint segment placed after base_url
server: str = 'biorxiv'                    # server segment placed after the endpoint
start_interval: str = '2011-01-01'         # first date of the queried interval
end_interval: str = '2020-12-31'           # last date of the queried interval
step = 100                                 # cursor increment between successive requests


### bioRxiv Server

In [None]:
# "local" variables
url: str = f'{base_url}/{query_type}/{server}/{start_interval}/{end_interval}'
path: str = f'prepub-{query_type}-{server}-{start_interval}!{end_interval}-{datetime.now()}'
step = 100
journal_df = None

#finding the number of records for a given time period
#tmp = f"{base_url}/{start_interval}/{end_interval}"
#tmp = 'https://api.biorxiv.org/details/biorxiv/2020-08-21/2020-08-28'

#create directory
pathlib.Path(path).mkdir(parents=True, exist_ok=True)

multithread_processor(path, url, 
                      ["doi", "title", "authors", "author_corresponding", "author_corresponding_institution", "date", "version", "type", "category", "jatsxml", "published"],
                      ["DOI", "Title", "Authors", "Corresponding_Authors", "Institution", "Date", "Version", "Type", "Category", "Xml", "Published"],
                      step, range(0, get_total(url), step), True)



values: [0, 100, 200, 300, 400, 500, 600, 700, 800, 900, 1000, 1100, 1200, 1300, 1400, 1500, 1600, 1700, 1800, 1900, 2000, 2100, 2200, 2300, 2400, 2500, 2600, 2700, 2800, 2900, 3000, 3100, 3200, 3300, 3400, 3500, 3600, 3700, 3800, 3900, 4000, 4100, 4200, 4300, 4400, 4500, 4600, 4700, 4800, 4900, 5000, 5100, 5200, 5300, 5400, 5500, 5600, 5700, 5800, 5900, 6000, 6100, 6200, 6300, 6400, 6500, 6600, 6700, 6800, 6900, 7000, 7100, 7200, 7300, 7400, 7500, 7600, 7700, 7800, 7900, 8000, 8100, 8200, 8300, 8400, 8500, 8600, 8700, 8800, 8900, 9000, 9100, 9200, 9300, 9400, 9500, 9600, 9700, 9800, 9900, 10000, 10100, 10200, 10300, 10400, 10500, 10600, 10700, 10800, 10900, 11000, 11100, 11200, 11300, 11400, 11500, 11600, 11700, 11800, 11900, 12000, 12100, 12200, 12300, 12400, 12500, 12600, 12700, 12800, 12900, 13000, 13100, 13200, 13300, 13400, 13500, 13600, 13700, 13800, 13900, 14000, 14100, 14200, 14300, 14400, 14500, 14600, 14700, 14800, 14900, 15000, 15100, 15200, 15300, 15400, 15500, 15600, 1570

get_big_data:   0%|          | 0/1481 [00:00<?, ?it/s]

In [None]:
journal_df = pd.read_parquet(pathlib.Path(path))
journal_df = create_prepublish_df(journal_df)

display(journal_df)

Error in data format:("'DataFrame' object has no attribute 'map'",)

<built-in method with_traceback of AttributeError object at 0x7fe915f0ce50>


Unnamed: 0,DOI,Title,Authors,Corresponding_Authors,Institution,Date,Version,Type,Category,Xml,Published,Num_of_Authors
400,10.1101/003194,Phylogenetic tree shapes resolve disease trans...,Jennifer Gardy;Caroline Colijn;,Caroline Colijn,IMPERIAL COLLEGE LONDON,2014-03-05,1,New Results,Evolutionary Biology,https://www.biorxiv.org/content/early/2014/03/...,10.1093/emph/eou018,3
401,10.1101/003194,Phylogenetic tree shapes resolve disease trans...,Jennifer Gardy;Caroline Colijn;,Caroline Colijn,IMPERIAL COLLEGE LONDON,2014-05-28,2,New Results,Evolutionary Biology,https://www.biorxiv.org/content/early/2014/05/...,10.1093/emph/eou018,3
402,10.1101/003202,TCF7L2 is a master regulator of insulin produc...,Yuedan Zhou;Soo-Young Park;Jing Su;Kathleen Ba...,Ola Hansson,LUND UNIVERSITY,2014-03-05,1,New Results,Cell Biology,https://www.biorxiv.org/content/early/2014/03/...,10.1093/hmg/ddu359,21
403,10.1101/003236,Characterization of directed differentiation b...,Magali Soumillon;Davide Cacchiarelli;Stefan Se...,Tarjei S Mikkelsen,BROAD INSTITUTE,2014-03-05,1,New Results,Genomics,https://www.biorxiv.org/content/early/2014/03/...,,6
404,10.1101/003244,An Analysis of Cochlear Implant Distortion fro...,Barry David Jacobson;,Barry David Jacobson,MASSACHUSETTS INSTITUTE OF TECHNOLOGY,2014-03-06,1,New Results,Bioengineering,https://www.biorxiv.org/content/early/2014/03/...,,2
...,...,...,...,...,...,...,...,...,...,...,...,...
148040,10.1101/2020.12.30.424890,Sequencing of clinical samples reveals that ad...,"Li, L.; Ma, J.; Li, J.; Yuan, J.; Su, W.; Jin,...",Liqiang Li,BGI-SHENZHEN,2020-12-31,1,new results,microbiology,https://www.biorxiv.org/content/early/2020/12/...,,32
148041,10.1101/2020.12.30.424885,Evolution of lbx spinal cord expression and fu...,"Juarez-Morales, J. L.; Weierud, F.; England, S...",Katharine Lewis,SYRACUSE UNIVERSITY,2020-12-31,1,new results,developmental biology,https://www.biorxiv.org/content/early/2020/12/...,10.1111/ede.12387,7
148042,10.1101/2020.12.30.424879,Cheese or cheese infusions - Ecological traps ...,"Peach, D.; Almond, M.; Meraj, S.; Ko, E.; Grie...",Dan Peach,THE UNIVERSITY OF BRITISH COLUMBIA,2020-12-31,1,new results,ecology,https://www.biorxiv.org/content/early/2020/12/...,10.1002/ps.6603,6
148043,10.1101/2020.12.30.424842,Computational quantification of global effects...,"Sommariva, S.; Caviglia, G.; Ravera, S.; Frass...",Sara Sommariva,UNIVERSITÀ DEGLI STUDI DI GENOVA,2020-12-31,1,new results,cancer biology,https://www.biorxiv.org/content/early/2020/12/...,10.1038/s41598-021-99073-7,9


In [None]:
# Collect every row of journal_df that has a gap in at least one column.
# A gap is a missing value (NaN/None/NaT), an infinity, or an empty string.
# The test is written out explicitly instead of switching on the deprecated
# pandas option `mode.use_inf_as_na`, which never covered empty strings.
# Each affected row is listed once, even if several of its columns have gaps.
gap_mask = journal_df.isna() | journal_df.isin(['', np.inf, -np.inf])
jl_fix_df = journal_df.loc[gap_mask.any(axis=1).to_numpy()]

display(jl_fix_df)

Unnamed: 0,DOI,Title,Authors,Corresponding_Authors,Institution,Date,Version,Type,Category,Xml,Published,Num_of_Authors
12153,10.1101/092171,Darwin: A Hardware-acceleration Framework for ...,"Turakhia, Y.; Zheng, K. J.; Bejerano, G.; Dall...",,STANFORD UNIVERSITY,2017-01-15,1,new results,genomics,https://www.biorxiv.org/content/early/2017/01/...,10.1109/MM.2019.2910009,4
12154,10.1101/092171,Darwin: A Hardware-acceleration Framework for ...,"Turakhia, Y.; Zheng, K. J.; Bejerano, G.; Dall...",,STANFORD UNIVERSITY,2017-01-24,2,new results,genomics,https://www.biorxiv.org/content/early/2017/01/...,10.1109/MM.2019.2910009,4
12673,10.1101/104778,Engaging narratives evoke similar neural activ...,"Cohen, S. S.; Henin, S.; Parra, L. C.",,THE CITY COLLEGE OF THE CITY UNIVERSITY OF NEW...,2017-01-31,1,new results,neuroscience,https://www.biorxiv.org/content/early/2017/01/...,10.1038/s41598-017-04402-4,3
80783,10.1101/708800,CLEC-2 suppresses calcification in cultured os...,"Kanai, T.; Sawa, Y.; Takara, K.; Kajiwara, K.;...",,OKAYAMA UNIVERSITY GRADUATE SCHOOL OF MEDICINE...,2019-07-19,1,new results,cell biology,https://www.biorxiv.org/content/early/2019/07/...,,8
94799,10.1101/843763,STRESS-INDUCED GENETIC CHANGE IN FLAX REVEALS ...,"Li, X.",,CASE WESTERN RESERVE UNIVERSITY,2019-11-15,1,new results,genomics,https://www.biorxiv.org/content/early/2019/11/...,,1
...,...,...,...,...,...,...,...,...,...,...,...,...
9463,10.1101/073999,Calculation of a distribution free estimate of...,Joachim Goedhart;,Joachim Goedhart,UNIVERSITY OF AMSTERDAM,2016-09-10,2,New Results,,https://www.biorxiv.org/content/early/2016/09/...,,2
9466,10.1101/070631,Gender disparity in computational biology rese...,Kevin S. Bonham;Melanie I. Stefan;,Kevin S. Bonham,"CURRICULUM FELLOWS PROGRAM, HARVARD MEDICAL SC...",2016-08-26,1,New Results,,https://www.biorxiv.org/content/early/2016/08/...,10.1371/journal.pcbi.1005134,3
9467,10.1101/070631,Gender disparity in computational biology rese...,Kevin S. Bonham;Melanie I. Stefan;,Kevin S. Bonham,"CURRICULUM FELLOWS PROGRAM, HARVARD MEDICAL SC...",2016-09-09,2,New Results,,https://www.biorxiv.org/content/early/2016/09/...,10.1371/journal.pcbi.1005134,3
9469,10.1101/069468,How do Research Faculty in the Biosciences Eva...,Timothy Kassis;,Timothy Kassis,MASSACHUSETTS INSTITUTE OF TECHNOLOGY,2016-08-14,1,New Results,,https://www.biorxiv.org/content/early/2016/08/...,10.1371/journal.pone.0183632,2


### Publication data for Prepublication data for the given time period

In [None]:
#from tqdm import tqdm_notebook, tnrange
from tqdm.notebook import tqdm_notebook
import time
import math

#variables
#https://api.biorxiv.org/pubs/biorvix/10.1101/759530
base_url: str = 'https://api.biorxiv.org'
query_type: str = 'pubs'
server: str ='biorxiv'
url: str = f'{base_url}/{query_type}/{server}'
path: str = f'pub_journal-{query_type}-{server}-{datetime.now()}'
step = 100
#doi = ['10.1101/856302']
doi = journal_df[journal_df.Published != 'NA'].Published

#create directory
pathlib.Path(path).mkdir(parents=True, exist_ok=True)
total = len(doi)
#def process_doi_data(path:str, url:str, doi:pd.Series, meta: List, col_names: List, item:int, loop_list, disable:bool = False):
                     
args = [(path, url, doi, 
         ["preprint_doi", "published_doi", "preprint_title", "preprint_authors", "preprint_author_corresponding", "preprint_author_corresponding_institution", "preprint_category", "published_journal", "preprint_date", "published_date"],
         ["DOI", "pub_DOI", "Title", "Authors", "Corresponding_Authors", "Institution", "Category", "Journal", "Preprint_Date", "Published_Date"],
         item, True) for item in range(0, total, step)]
print(f'total doi: {total} iter: {total/step}')
tq.thread_map(lambda p: process_doi_data(*p), args, desc='process_doi_data', max_workers=5, total=len(args))
#tq.thread_map(lambda p: process_doi_data(*p), args, desc='process_doi_data', total=len(args))


total doi: 102771 iter: 1027.71


process_doi_data:   0%|          | 0/1028 [00:00<?, ?it/s]

[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,

In [None]:
# Load the parquet output stored under `path` into one table.
filtered_df = pd.read_parquet(pathlib.Path(path))
# List the published dates that are not exactly 10 characters long, i.e. that
# are not a plain YYYY-MM-DD string.
print([item for item in filtered_df.Published_Date if len(item.strip()) != 10])
# NOTE(review): reindex() is called without arguments, which looks like a
# no-op — confirm whether reset_index() was intended.
filtered_df = create_published_df(filtered_df).reindex()
# Rename the columns to 'Published' and 'Date'.
filtered_df.rename(columns={'pub_DOI':'Published', 'Preprint_Date':'Date'}, inplace=True)


['2017-11-3', '2017-12-4', '2017-12-4', '2017-11-9', '2017-11-8', '2017-11-8', '2017-11-8', '2018-08-30:e12744', '2017-12-6', '2017-12-6', '2017-12-8', '2018-05-15:1-12', '2018-03-21:1-11', '2018-05-15:fj2018002', '2018-11-28:1-14', '2018-11-28:1-14', '2018-02-6', '2018-05-1', '2018-10-24:mbcE18080', '2018-10-24:mbcE18080', '2018-10-24:mbcE18080', '2019-05-16:e1800100', '2019-05-14:1-8', '2019-07-4:1-9', '-00-00', '-00-00', '-00-00', '-00-00', '-00-00', '-00-00', '-00-00', '-00-00', '-00-00', '-00-00']
time data '-00-00' does not match format '%Y-%m-%d'
time data '-00-00' does not match format '%Y-%m-%d'
time data '-00-00' does not match format '%Y-%m-%d'
time data '-00-00' does not match format '%Y-%m-%d'
time data '-00-00' does not match format '%Y-%m-%d'
time data '-00-00' does not match format '%Y-%m-%d'
time data '-00-00' does not match format '%Y-%m-%d'
time data '-00-00' does not match format '%Y-%m-%d'
time data '-00-00' does not match format '%Y-%m-%d'
time data '-00-00' does 

Finding and cleaning missing data

In [None]:
# Collect every row of filtered_df that has a gap in at least one column.
# A gap is a missing value (NaN/None/NaT), an infinity, or an empty string.
# The test is written out explicitly instead of switching on the deprecated
# pandas option `mode.use_inf_as_na`, which never covered empty strings.
# Each affected row is listed once, even if several of its columns have gaps.
gap_mask = filtered_df.isna() | filtered_df.isin(['', np.inf, -np.inf])
fix_df = filtered_df.loc[gap_mask.any(axis=1).to_numpy()]

display("Rows with incomplete data:", fix_df)

'Rows with incomplete data:'

Unnamed: 0,DOI,Published,Title,Authors,Corresponding_Authors,Institution,Category,Journal,Date,Published_Date,Num_of_Authors
13400,10.1101/092171,10.1109/MM.2019.2910009,Darwin: A Hardware-acceleration Framework for ...,"Turakhia, Y.; Zheng, K. J.; Bejerano, G.; Dall...",,STANFORD UNIVERSITY,Genomics,Ieee Micro,2017-01-15,2019-04-11,4
13500,10.1101/092171,10.1109/MM.2019.2910009,Darwin: A Hardware-acceleration Framework for ...,"Turakhia, Y.; Zheng, K. J.; Bejerano, G.; Dall...",,STANFORD UNIVERSITY,Genomics,Ieee Micro,2017-01-15,2019-04-11,4
16800,10.1101/104778,10.1038/s41598-017-04402-4,Engaging narratives evoke similar neural activ...,"Cohen, S. S.; Henin, S.; Parra, L. C.",,THE CITY COLLEGE OF THE CITY UNIVERSITY OF NEW...,Neuroscience,Scientific Reports,2017-01-31,2017-07-04,3
79400,10.1101/2020.01.16.908798,10.1371/journal.pone.0228429,Pterostilbene Protects Cochlea from Ototoxicit...,"ÖZDAS, S.; TASTEKIN, B.; GÜRGEN, S. G.; ÖZD...",,"HACETTEPE UNIVERSITY, FACULTY OF SCIENCE",Neuroscience,Plos One,2020-01-16,2020-07-28,8
92000,10.1101/2020.05.10.084335,10.1126/sciadv.abd2781,Notch1 switches progenitor competence in induc...,"Tiberi, L.",,UNIVERSITY OF TRENTO,Cancer Biology,Science Advances,2020-05-10,2021-06-23,1
...,...,...,...,...,...,...,...,...,...,...,...
95900,10.1101/2020.06.18.159608,10.1038/s41467-021-22110-6,"Structure of the complete, membrane-assembled ...","Hutchings, J.; Stancheva, V. G.; Brown, N. R.;...",Giulia Zanetti,"ISMB, BIRKBECK COLLEGE",Cell Biology,,2020-06-19,NaT,6
96000,10.1101/2020.06.18.159608,10.1038/s41467-021-22110-6,"Structure of the complete, membrane-assembled ...","Hutchings, J.; Stancheva, V. G.; Brown, N. R.;...",Giulia Zanetti,"ISMB, BIRKBECK COLLEGE",Cell Biology,,2020-06-19,NaT,6
96100,10.1101/2020.06.18.159608,10.1038/s41467-021-22110-6,"Structure of the complete, membrane-assembled ...","Hutchings, J.; Stancheva, V. G.; Brown, N. R.;...",Giulia Zanetti,"ISMB, BIRKBECK COLLEGE",Cell Biology,,2020-06-19,NaT,6
94100,10.1101/2020.08.28.273003,10.1002/glia.23946,A Novel Factor in Olfactory Ensheathing Cell-A...,"Saglam, A.; Calof, A.; Wray, S.",Susan Wray,NATIONAL INSTITUTES OF HEALTH,Neuroscience,,2020-08-28,NaT,3


In [None]:
print(f'\nPrepublished Artices w/ Publication Info that have missing info: {filtered_df.isnull().sum()}\n')
#print('Published data:\n',[(name, journal_df[name].isnull().sum()) for name in journal_df.columns if journal_df[name].isnull().values.any()])
#print('Prepublish data:\n',[(name, filtered_df[name].isnull().sum()) for name in filtered_df.columns if filtered_df[name].isnull().values.any()])
fix_df = filtered_df[filtered_df['Published_Date'].isna()]
display("Rows with NaN:", fix_df)

In [None]:
filtered_df.info()
clean_df = filtered_df.dropna()
merged_df = pd.merge(journal_df, clean_df, how='right', on=['DOI','Published','Title', 'Authors','Corresponding_Authors','Institution','Category','Date','Num_of_Authors'])

In [None]:
pd.options.mode.use_inf_as_na = True
print(f'Prepublished Artices w/ Publication Info: {merged_df.isnull().sum()}')
display(merged_df)

#### General Descriptive Analysis

In [None]:
#summary of table
journal_df.describe(include='all')


In [None]:
#freq count of Num_of_Authors
freq_count(journal_df,'Num_of_Authors')

In [None]:
#freq count of Corresponding_Authors
freq_count(journal_df, 'Corresponding_Authors')

In [None]:
#freq count of Institution
freq_count(journal_df, 'Institution')

In [None]:
#freq count of Date
freq_count(journal_df, 'Date')

In [None]:
#freq count of number of version
freq_count(journal_df, 'Version')

In [None]:
#freq count of Type
freq_count(journal_df, 'Type')

In [None]:
#freq count Category
freq_count(journal_df, 'Category')

#### Check

In [None]:
# `url` was reassigned to the pubs endpoint by the publication-data cell, so
# the details URL for the queried interval is rebuilt here before comparing
# the API's record count with the number of rows that were loaded.
details_url = f'{base_url}/details/{server}/{start_interval}/{end_interval}'
assert get_total(details_url) == journal_df.shape[0], \
    "Total number of papers submitted and the length of the dataframe do not match."
"Total number of papers submitted and the length of the dataframe match."

The above code runs through all papers that were submitted within the selected time frame. The error discussed during the previous meeting has been fixed.

----

## Published Articles query for a given timeframe

In [None]:
# "global" variables
#https://api.biorxiv.org/pubs/biorvix/2018-08-21/2018-08-28
base_url: str = 'https://api.biorxiv.org'
query_type: str = 'pubs'
server: str = 'biorxiv'
start_interval: str = '2011-01-01'
end_interval: str = '2020-12-31'


### bioRxiv Server

In [None]:
# "local" variables
url: str = f'{base_url}/{query_type}/{server}/{start_interval}/{end_interval}'
path: str = f'pub-{query_type}-{server}-{start_interval}!{end_interval}-{datetime.now()}'
step = 100
pubs_df = None

#finding the number of records for a given time period

pathlib.Path(path).mkdir(parents=True, exist_ok=True)
print(url)

multithread_processor(path, url, 
                      ["preprint_doi", "published_doi", "preprint_title", "preprint_authors", "preprint_author_corresponding", "preprint_author_corresponding_institution", "preprint_category", "published_journal", "preprint_date", "published_date"],
                      ["DOI", "pub_DOI", "Title", "Authors", "Corresponding_Authors", "Institution", "Category", "Journal", "Preprint_Date", "Published_Date"],
                      step, range(0, get_total(url), 100), True)


In [None]:
pubs_df = pd.read_parquet(pathlib.Path(path)).sort_index(kind='mergesort', key=lambda x: x.astype(int))

In [None]:
pubs_df.head(1)
#print([(index, value, datetime.strptime(value.split(':')[0], '%Y-%m-%d').date()) for index, value in enumerate(pubs_df.Published_Date) if len(value.strip()) != 10])
print([(index, value) for index, value in enumerate(pubs_df.Published_Date) if len(value.strip()) != 10])



In [None]:

pubs_df = create_published_df(pubs_df)

#display(pubs_df)
#pubs_df.shape


In [None]:
pubs_df.info()
print(pubs_df.Published_Date.isna().values.any(), pubs_df.Preprint_Date.isna().values.any())
print(pubs_df.Published_Date.subtract(pubs_df.Preprint_Date))
#for index in range(pubs_df.shape[0]):
#    try:
#        pubs_df.loc(index, 'Published_Date') - pubs_df.loc(index, 'Preprint_Date')
#    except Exception as e:
        #print(e)
        #print (f'index: {index} Published_Date: {')

#### General Descriptive Analysis

In [None]:
pubs_df.info()

In [None]:
#summary of table
pubs_df.describe(include='all')


In [None]:
#freq count of Num_of_Authors
freq_count(pubs_df,'Num_of_Authors')

In [None]:
#freq count "Corresponding_Authors"
freq_count(pubs_df, 'Corresponding_Authors')

In [None]:
#freq_count Institution
freq_count(pubs_df, 'Institution')

In [None]:
#freq_count Category
freq_count(pubs_df, 'Category')

In [None]:
#freq count Journal
freq_count(pubs_df, 'Journal')

In [None]:
#freq count Preprint_Date
freq_count(pubs_df, 'Preprint_Date')

In [None]:
#freq count Published_date
freq_count(pubs_df, 'Published_Date')

#### Check

In [None]:
'''
                      columns=["pre_DOI", "pub_DOI",
                               "Title", "Authors", "Corresponding_Authors",
                               "Institution",
                               "Category", "Journal", "Preprint_Date", "Published_Date"])
'''
#assert get_total(f'{base_url}{query_type}{server}{start_interval}{end_interval}') == pubs_df.shape[0]
"Total number of papers published and the length of the dataframe does match."

----

## Detailed analysis of published articles

In [None]:
from multipledispatch import dispatch
# get_values is overloaded on the types of its arguments (multipledispatch):
#   (df, row, col)   -> list of lists of single cell values
#   (df, col | cols) -> the selected column(s)
# A single row or column is wrapped in a list and forwarded to the
# (list, list) overload, so cell lookups always return a list of lists.
@dispatch(pd.DataFrame, int, str)
def get_values(df: pd.DataFrame, row: int, col: str):
    """Return ``[[value]]`` for the cell at (``row``, ``col``)."""
    return get_values(df, [row], [col])

@dispatch(pd.DataFrame, int, list)
def get_values(df: pd.DataFrame, row: int, cols: List[str]):
    """Return one inner list with the values of ``cols`` in ``row``."""
    return get_values(df, [row], cols)

@dispatch(pd.DataFrame, list, str)
def get_values(df: pd.DataFrame, rows: List[int], col: str):
    """Return one single-element inner list per row, holding its ``col`` value."""
    return get_values(df, rows, [col])

@dispatch(pd.DataFrame, list, list)
def get_values(df: pd.DataFrame, rows: List[int], cols: List[str]) -> List[List[str]]:
    """Return the values of ``cols`` for each row label in ``rows``.

    ``DataFrame.at`` is an indexer and fetches one scalar per lookup, so it
    is subscripted (``df.at[row, col]``) once per cell.
    """
    return [[df.at[row, col] for col in cols] for row in rows]

@dispatch(pd.DataFrame, str)
def get_values(df: pd.DataFrame, col: str):
    """Return the column ``col``."""
    return df[col]

@dispatch(pd.DataFrame, list)
def get_values(df: pd.DataFrame, cols: List[str]):
    """Return the sub-frame made of the columns ``cols``."""
    return df[cols]

### bioRxiv Service

In [None]:
#from tqdm import tqdm_notebook, tnrange
from tqdm.notebook import tqdm_notebook
import time
import math

#variables
#https://api.biorxiv.org/pubs/biorvix/2018-08-21/2018-08-28
#https://api.biorxiv.org/details/biorxiv/10.1101/759530
#https://api.biorxiv.org/details/biorvix/10.1101/759530
base_url: str = 'https://api.biorxiv.org'
query_type: str = 'details'
server: str ='biorxiv'
url: str = f'{base_url}/{query_type}/{server}'
path: str = f'prepub-{query_type}-{server}-{datetime.now()}'
step = 100
#doi = ['10.1101/856302']
doi = pubs_df.DOI
prepub_df = None

'''
for item in tqdm_notebook(range(0, total, step), desc='get_prepublish_data', 
                 total=math.ceil(total/step)):
    #time.sleep(0.005)
    results = get_data(None, 
                       url, 
                       doi[item:item+step])
    tmp = list(results)
    prepub_df = query_to_df([r for r in tmp], 
                            ["doi", "title", "authors", "author_corresponding", "author_corresponding_institution", "date", "version", "type", "category", "published"],
                            range(item, item + (len(tmp) * step), step),
#                            range(item, (item + ((len(tmp[counter]) - 1) * step)) * 10, step),
                            ["DOI", "Title", "Authors", "Corresponding_Authors", "Institution", "Date", "Version", "Type", "Category", "pub_DOI"], False)
    #prepub_df = create_prepublish_df(prepub_df)
    prepub_df.to_parquet(pathlib.Path(f'{path}/{datetime.utcnow().timestamp()}.parquet'))
'''

#create directory
pathlib.Path(path).mkdir(parents=True, exist_ok=True)
total = len(doi)
# process_doi_data takes (path, url, doi, meta, col_names, item, disable), so
# the JSON keys and their column labels must be part of every argument tuple.
# 'published' is stored as 'pub_DOI'.
meta = ["doi", "title", "authors", "author_corresponding", "author_corresponding_institution", "date", "version", "type", "category", "published"]
col_names = ["DOI", "Title", "Authors", "Corresponding_Authors", "Institution", "Date", "Version", "Type", "Category", "pub_DOI"]
args = [(path, url, doi, meta, col_names, item, True) for item in range(0, total, step)]
#print(f'args: {len(args)}\n{args}')
print(url, f'total doi: {total} iter: {total/step}')
#tq.thread_map(lambda p: process_doi_data(*p), args, desc='process_doi_data', max_workers=5, total=len(args))
tq.thread_map(lambda p: process_doi_data(*p), args, desc='process_doi_data', total=len(args))


In [None]:
prepub_df = create_prepublish_df(pd.read_parquet(pathlib.Path(path)))

prepub_df

In [None]:
print('Published data:\n',[(name, pubs_df[name].isnull().sum()) for name in pubs_df.columns if pubs_df[name].isnull().values.any()])
print('Prepublish data:\n',[(name, prepub_df[name].isnull().sum()) for name in prepub_df.columns if prepub_df[name].isnull().values.any()])

In [None]:
# Outer-join the published and preprint tables on their shared columns.
combined_df = pd.merge(pubs_df,prepub_df, how="outer", on=['DOI','Title', 'Authors', 'Corresponding_Authors', 'Institution', 'Category', 'Num_of_Authors', 'pub_DOI'])
#combined_df = pd.merge(pubs_df,prepub_df, how="right", on=['DOI','Title', 'Authors', 'Corresponding_Authors', 'Institution', 'Category', 'Num_of_Authors'])
# Time from preprint to publication in average months. One average month is
# 365.2425 / 12 = 30.436875 days; it is spelled out as a Timedelta because
# pandas 2 no longer accepts the ambiguous 'M' timedelta unit.
combined_df['Time_month'] = (combined_df['Published_Date'] - combined_df['Preprint_Date']) / pd.Timedelta(days=30.436875)
# Flag rows whose publication date is later than the preprint date.
combined_df['Keep'] = combined_df.Published_Date > combined_df.Preprint_Date
#ref = combined_df.set_index(['pub_DOI'], inplace=False)
combined_df.head(1)
#display(combined_df.groupby('DOI').count())
#display(combined_df.groupby('DOI').count())

In [None]:
print([(index, value) for index, value in enumerate(combined_df.groupby(['DOI'])['Time_month']) ])


In [None]:
# checking for NA
# Number of missing values for every column that has at least one.
print([(name, combined_df[name].isnull().sum()) for name in combined_df.columns if combined_df[name].isnull().values.any()])
# Group sizes per column, first with NaN kept as its own group, then with NaN
# dropped. size() has to be called; printing the bare attribute only shows
# the bound method.
print([(name, combined_df.groupby([name], dropna=False).size()) for name in combined_df.columns if (combined_df.groupby([name], dropna=False)[name].count() > 0).any()])
print([(name, combined_df.groupby([name], dropna=True).size()) for name in combined_df.columns if (combined_df.groupby([name], dropna=False)[name].count() > 0).any()])

In [None]:
#%%capture cap
# Grouping of the merged table by preprint DOI; bound to `test` because the
# cells further down (shape prints, title-change export) read it.
test = combined_df.groupby(['DOI'], as_index=True)

# Shapes of the two inputs, of the merged table, and of the per-pub_DOI counts.
display(
    prepub_df.shape,
    pubs_df.shape,
    combined_df.shape,
    combined_df.groupby(['pub_DOI'], as_index=True).count().shape,
)

# Note: df.at only returns a single value, so a list comprehension is needed
# to pull several columns out of one particular row.

# Distinct titles recorded for each published DOI.
combined_df.groupby(['pub_DOI'], as_index=False)['Title'].unique()

In [None]:
# Summary statistics of Time_month for every (Category, Keep) combination.
combined_df.groupby(['Category', 'Keep'], as_index=True)['Time_month'].describe()

**NB:** The table above has some issues, because 29 publications have more than one entry.

In [None]:
# Keep one row per published DOI: the row with the largest Time_month.
# Rows without a Time_month are removed before ranking, because idxmax yields a
# missing label for a group that contains only NaN, and the row lookup that
# follows cannot resolve such a label.
_with_delay = combined_df.reset_index(drop=True).dropna(subset=['Time_month'])
_longest = _with_delay.groupby(['pub_DOI'])['Time_month'].idxmax()

# reset_index makes the labels positional, so the rows are selected by position.
df = combined_df.iloc[_longest.to_numpy(dtype='int64')]


In [None]:
# Summary statistics of Time_month per Category for the reduced table `df`.
display(df.groupby(['Category'])['Time_month'].describe())

In [None]:
# Describe pub_DOI within each Institution and write the summary to general.csv.
(
    combined_df
    .groupby(['Institution'], as_index=True)['pub_DOI']
    .describe()
    .to_csv("general.csv")
)

In [None]:
# Summary statistics of Time_month for every (Institution, Category) combination.
combined_df.groupby(['Institution', 'Category'], as_index=True)['Time_month'].describe()

In [None]:
# Restrict the merged table to the rows whose Keep flag equals True, then
# summarise Time_month for each (Category, Keep) combination of that subset.
reduce_df = combined_df[combined_df['Keep'].eq(True)]
reduce_df.groupby(['Category', 'Keep'], as_index=True)['Time_month'].describe()

In [None]:
# One shape per line: published table, preprint table, per-DOI counts, and the
# per-DOI "number of distinct titles minus one" series.
print(
    pubs_df.shape,
    prepub_df.shape,
    test.count().shape,
    (test.Title.nunique() - 1).shape,
    sep='\n',
)

In [None]:
# Rows belonging to a DOI group that carries more than one distinct title.
_changed_titles = test.filter(lambda group: group['Title'].nunique() > 1)
_changed_titles.to_csv('title-change.csv')

# Dump the preprint and published tables as well.
for _table, _csv_name in ((prepub_df, 'pre&pub-data.csv'), (pubs_df, 'pub_data.csv')):
    _table.to_csv(_csv_name)

# Junk Code
Please disregard the code below.

In [None]:
# Disabled draft of the preprint download loop, parked inside a string literal
# so that none of it executes. NOTE(review): it calls get_data with None as the
# first argument and uses tqdm_notebook, whose import is not visible in this
# part of the file — confirm both before reviving it.
'''
for item in tqdm_notebook(range(0, total, step), desc='get_prepublish_data', 
                 total=math.ceil(total/step)):
    #time.sleep(0.005)
    results = get_data(None, 
                       url, 
                       doi[item:item+step])
    tmp = list(results)
    prepub_df = query_to_df([r for r in tmp], 
                            ["doi", "title", "authors", "author_corresponding", "author_corresponding_institution", "date", "version", "type", "category", "published"],
                            range(item, item + (len(tmp) * step), step),
#                            range(item, (item + ((len(tmp[counter]) - 1) * step)) * 10, step),
                            ["DOI", "Title", "Authors", "Corresponding_Authors", "Institution", "Date", "Version", "Type", "Category", "pub_DOI"], False)
    #prepub_df = create_prepublish_df(prepub_df)
    prepub_df.to_parquet(pathlib.Path(f'{path}/{datetime.utcnow().timestamp()}.parquet'))
'''



In [None]:
# Draft cell: download the published-article records for one interval, store
# them as parquet chunks under `path`, then reload them into pubs_df.
# "local" variables
url: str = f'{base_url}/{query_type}/{server}/{start_interval}/{end_interval}'
# NOTE(review): the import visible at the top of the file is
# `from datetime import datetime`; with that binding `datetime.datetime` does
# not exist — confirm which import is in effect when this cell runs.
path: str = f'pub-{query_type}-{server}-{start_interval}!{end_interval}-{datetime.datetime.now()}'
step = 100
pubs_df = None

#finding the number of records for a given time period

# Create the output directory (no error if it already exists).
pathlib.Path(path).mkdir(parents=True, exist_ok=True)
print(url)
# NOTE(review): get_data is called with None as its first argument, which does
# not match the (url, loop_range, disable) signature defined near the top of
# the file — confirm the intended arguments.
results = get_data(None, 
         url, 
         range(0, get_total(url), 100))

tmp = list(results)
# get_total(url) is requested a second time here (each call issues a request).
for item in range (0, get_total(url), step):
    # NOTE(review): `tmp` is sliced with the same offsets that were used as
    # cursors above — verify that this is the intended chunking.
    result_list = tmp[item:(item+step)]
    pubs_df = query_to_df(result_list, 
                          ["preprint_doi", "published_doi", "preprint_title", "preprint_authors", "preprint_author_corresponding", "preprint_author_corresponding_institution", "preprint_category", "published_journal", "preprint_date", "published_date"],
                          range(item, len(result_list) * step, step),
                          ["DOI", "pub_DOI", "Title", "Authors", "Corresponding_Authors", "Institution", "Category", "Journal", "Preprint_Date", "Published_Date"])
    # Each chunk is written to its own file, named after the current UTC timestamp.
    pubs_df.to_parquet(pathlib.Path(f'{path}/{datetime.datetime.utcnow().timestamp()}.parquet'))
# pubs_df is rebound on every iteration above; the full table is rebuilt here
# from what was written under `path`.
pubs_df = pd.read_parquet(pathlib.Path(path))
pubs_df = create_published_df(pubs_df)

display(pubs_df)

In [None]:
def get_publisher_prefix(doi: pd.Series) -> pd.Series:
    """Return the part of each DOI that precedes the first "/".

    Args:
        doi: Series of DOI strings, e.g. "10.1101/339747".

    Returns:
        A Series with the same index and name as ``doi``. String entries are
        replaced by their prefix ("10.1101" in the example above); a string
        without "/" is returned whole. Non-string entries (None, NaN) are
        passed through unchanged instead of raising.
    """
    def _prefix(value):
        # Missing DOIs are not strings and have no .split, so leave them as is.
        return value.split("/")[0] if isinstance(value, str) else value

    return doi.apply(_prefix)

# Point query_type at the publisher endpoint (this rebinds the notebook-level name).
query_type='/publisher'

# NOTE(review): unfinished draft. The `def` line below has no trailing colon
# and no indented body, and the call after it lacks a comma between the URL
# f-string and the prefix list, so this cell does not parse as written.
def get_publisher_data(url)
# NOTE(review): None is passed as the first argument, which does not match the
# (url, loop_range, disable) signature of get_data defined near the top of the
# file — confirm the intended arguments before completing this.
get_data(None,f'{base_url}{query_type}'
         get_publisher_prefix(pubs_df.pub_DOI).unique())

**Journal API**<br>
https://www.nature.com/opensearch/<br>
https://www.biorxiv.org/content/10.1101/339747v4<br>
https://www.biorxiv.org/content/10.1101/339747v4.full.pdf<br>
https://api.biorxiv.org/details/biorxiv/10.1101/099697


In [None]:
# Collect one row per record in json_info["collection"], keeping only the
# fields listed below, in this order.
# `license`, `abstract`, and `server` are excluded from the metrics.
journal_list = []
for journal in json_info["collection"]:
    journal_list.append([
        journal[field]
        for field in (
            "doi", "title", "authors",
            "author_corresponding",
            "author_corresponding_institution",
            "date", "version", "type",
            "category", "jatsxml", "published",
        )
    ])
    

In [None]:
# Tabulate the collected rows; the column labels follow the order in which the
# fields were appended to each row of journal_list.
journal_df = pd.DataFrame(
    journal_list,
    columns=[
        "DOI",
        "Title",
        "Authors",
        "Corresponding Authors",
        "Institution",
        "Date",
        "Version",
        "Type",
        "Category",
        "Xml",
        "Published",
    ],
)
journal_df.head()