In [1]:
import pandas as pd
import numpy as np
from datetime import datetime

In [26]:
# Directory that holds every input and output file of this notebook.
# The trailing slash matters: all paths below are formed by appending a file name.
# NOTE: this is an absolute, machine-specific location.
inoutpath = '/home/xiaopengxu/Desktop/data-covid-review/2021-05-11/'

# Input files.
compdata_path = f'{inoutpath}features.ori_doc2vec_deepwalk.csv'  # alternative input: 'compdata_ext.csv'
CSCoV_scores_path = f'{inoutpath}CSCoV_scores.csv'
pubmed_path = f'{inoutpath}PubMed.csv'
arxiv_path = f'{inoutpath}aRxiv.csv'
biomedrxiv_path = f'{inoutpath}biomedRxiv.csv'

# Output files written by the final cell.
train_data_path = f'{inoutpath}features.ori_doc2vec_deepwalk_scores_train.csv'
test_data_path = f'{inoutpath}features.ori_doc2vec_deepwalk_scores_test.csv'

# Not read by any other cell visible in this notebook.
is_balanced = False

# Load data and add author and article metrics
## Load data

In [3]:
def load_data(compdata_path):
    """Read a CSV file into a DataFrame, discarding a saved index column if present.

    A timestamped progress message is printed before the file is read.

    Parameters
    ----------
    compdata_path : str
        Path of the CSV file to read.

    Returns
    -------
    pandas.DataFrame
        The file contents without an 'Unnamed: 0' column.
    """
    print(datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f') + ": Loading data ...")
    # index_col=False: never promote the first column to the index.
    papers = pd.read_csv(compdata_path, index_col=False)

    # 'Unnamed: 0' is the header pandas gives to an unnamed first column, which is
    # what DataFrame.to_csv produces when it writes the index. errors='ignore'
    # keeps files without that column loadable instead of raising KeyError.
    return papers.drop(columns=['Unnamed: 0'], errors='ignore')

# Read the base feature table from compdata_path.
papers = load_data(compdata_path)

2021-05-14 16:05:22.164426: Loading data ...


  if (await self.run_code(code, result,  async_=asy)):


## Add Author and Article metrics

In [4]:
# Read the CSCoV score table and append its columns to the feature table.
# NOTE(review): an axis=1 concat aligns rows on the index, so this presumably relies
# on both CSV files listing the same papers in the same order — confirm.
CSCoV_scores = load_data(CSCoV_scores_path)
papers_amet = pd.concat([papers, CSCoV_scores], axis=1)

2021-05-14 16:05:22.895632: Loading data ...


# Add subDate, acptDate and pubDate
## Add subDate from archives (ArXiv, BioRxiv and MedRxiv)

In [5]:
# Read the two preprint exports; their columns are harmonised in the next cell.
arxiv = load_data(arxiv_path)
biomedrxiv = load_data(biomedrxiv_path)

2021-05-14 16:05:22.963681: Loading data ...
2021-05-14 16:05:23.041650: Loading data ...


In [6]:
# Give both preprint tables the same three columns: doi, subDate and a lower-cased title.
arxiv['subDate'] = pd.to_datetime(arxiv['submitted'])
arxiv['title'] = arxiv['title'].str.lower()

biomedrxiv['subDate'] = pd.to_datetime(biomedrxiv['rel_date'])
biomedrxiv['doi'] = biomedrxiv['rel_doi']
biomedrxiv['title'] = biomedrxiv['rel_title'].str.lower()

# Stack the two sources on the shared columns.
all_rxiv = pd.concat([
    arxiv[['doi', 'subDate', 'title']],
    biomedrxiv[['doi', 'subDate', 'title']],
])

# Keep rows that have both a doi and a title, then keep the first row per doi
# and, among those, the first row per title.
all_rxiv_redup = (
    all_rxiv
    .dropna(subset=['doi', 'title'])
    .drop_duplicates(subset=['doi'])
    .drop_duplicates(subset=['title'])
)

In [7]:
# Lower-case titles so they can be matched against the preprint and PubMed tables.
papers_amet['title'] = papers_amet['title'].str.lower()

# Keep the first row per title, then the first row per DOI.
# NOTE(review): drop_duplicates treats missing values as equal, so if DOI has nulls
# only the first such row survives — confirm that this is intended.
papers_redup = (
    papers_amet
    .drop_duplicates(subset=['title'])
    .drop_duplicates(subset=['DOI'])
)

In [8]:
# Attach the preprint submission date to each paper by exact (lower-cased) title match;
# papers without a matching preprint keep a missing subDate.
papers_subDate = papers_redup.merge(
    all_rxiv_redup[['subDate', 'title']],
    on='title',
    how='left',
)
# NOTE: the datetime_is_numeric argument was removed in pandas 2.0; this call needs pandas 1.1-1.5.
papers_subDate[['subDate']].describe(datetime_is_numeric=True)

Unnamed: 0,subDate
count,3548
mean,2020-08-30 06:40:12.908962816
min,2020-01-19 00:00:00
25%,2020-05-17 00:00:00
50%,2020-08-14 00:00:00
75%,2020-12-15 00:00:00
max,2021-04-08 00:00:00


## Add acptDate and subDate from PubMed

In [9]:
def _dates_from_parts(frame, year_col, month_col, day_col):
    """Combine integer year/month/day columns of ``frame`` into one datetime Series."""
    parts = pd.DataFrame({
        'year': frame[year_col],
        'month': frame[month_col],
        'day': frame[day_col],
    })
    # pd.to_datetime assembles dates from columns named year/month/day in one
    # vectorised step; no per-row string joining is needed.
    return pd.to_datetime(parts)


def concate_dates_and_redup(pubmed):
    """Build pubDate/acptDate columns for PubMed records and drop duplicate rows.

    The input frame is left untouched; all work happens on a copy, so the
    function can be called repeatedly on the same frame.

    Caution: missing date parts are replaced so that a fully missing date
    becomes the placeholder 2028-03-01 in both pubDate and acptDate.

    Parameters
    ----------
    pubmed : pandas.DataFrame
        Must contain the columns pubYear, pubMonth, pubDay, acptYear,
        acptMonth, acptDay, title and doi. pubMonth values must be keys of the
        month map below (two-digit strings '01'..'12' or three-letter English
        abbreviations 'Jan'..'Dec').

    Returns
    -------
    pandas.DataFrame
        A new frame with integer date parts, datetime columns ``pubDate`` and
        ``acptDate``, lower-cased ``title``, keeping the first row per title
        and then the first row per doi.

    Raises
    ------
    KeyError
        If a pubMonth value is not a key of the month map.
    """
    print(datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f') + ": Processing pubmed data ...")
    monthMap = {'01': 1, '02': 2, '03': 3, '04': 4, '05': 5, '06': 6, '07': 7, '08': 8,
                '09': 9, '10': 10, '11': 11, '12': 12,
                'Jan': 1, 'Feb': 2, 'Mar': 3, 'Apr': 4, 'May': 5, 'Jun': 6, 'Jul': 7,
                'Aug': 8, 'Sep': 9, 'Oct': 10, 'Nov': 11, 'Dec': 12}

    # Work on a copy: rewriting the caller's columns in place made a second call
    # fail, because pubMonth would already hold integers instead of map keys.
    records = pubmed.copy()

    # Caution: null values are replaced to 2028-03-01 in pubDate and acptDate
    records['pubYear'] = records['pubYear'].fillna(2028).astype('int64')
    records['pubMonth'] = (records['pubMonth'].fillna('03')
                           .apply(lambda month: monthMap[month])
                           .astype('int64'))
    records['pubDay'] = records['pubDay'].fillna(1).astype('int64')
    records['pubDate'] = _dates_from_parts(records, 'pubYear', 'pubMonth', 'pubDay')

    records['acptYear'] = records['acptYear'].fillna(2028).astype('int64')
    records['acptMonth'] = records['acptMonth'].fillna(3).astype('int64')
    records['acptDay'] = records['acptDay'].fillna(1).astype('int64')
    records['acptDate'] = _dates_from_parts(records, 'acptYear', 'acptMonth', 'acptDay')

    records['title'] = records['title'].str.lower()
    # NOTE(review): drop_duplicates treats missing values as equal, so rows with a
    # null title (or null doi) collapse to the first such row — confirm intended.
    pubmed_redup = records.drop_duplicates(subset=['title']).drop_duplicates(subset=['doi'])

    print(datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f') + ": Processing pubmed data finished.")

    return pubmed_redup

In [10]:
# Read the PubMed export, then derive pubDate/acptDate and drop duplicate titles/DOIs.
pubmed = load_data(pubmed_path)
pubmed_day = concate_dates_and_redup(pubmed)

2021-05-14 16:05:23.757410: Loading data ...
2021-05-14 16:05:25.031555: Processing pubmed data ...
2021-05-14 16:05:32.296802: Processing pubmed data finished.


In [11]:
# Attach the PubMed acceptance and publication dates to each paper by exact
# (lower-cased) title match; papers without a PubMed match keep missing dates.
papers_dates = papers_subDate.merge(
    pubmed_day[['title', 'acptDate', 'pubDate']],
    on='title',
    how='left',
)
# NOTE: the datetime_is_numeric argument was removed in pandas 2.0; this call needs pandas 1.1-1.5.
papers_dates[['subDate', 'acptDate', 'pubDate']].describe(datetime_is_numeric=True)

Unnamed: 0,subDate,acptDate,pubDate
count,3548,9769,9769
mean,2020-08-30 06:40:12.908962816,2022-09-13 10:02:35.512335104,2020-12-26 05:36:13.856075264
min,2020-01-19 00:00:00,2020-01-14 00:00:00,2020-01-01 00:00:00
25%,2020-05-17 00:00:00,2020-08-31 00:00:00,2020-08-24 00:00:00
50%,2020-08-14 00:00:00,2020-12-10 00:00:00,2020-12-01 00:00:00
75%,2020-12-15 00:00:00,2028-03-01 00:00:00,2021-02-25 00:00:00
max,2021-04-08 00:00:00,2028-03-01 00:00:00,2028-03-01 00:00:00


# Add publishing labels
## Add age to preprints

In [12]:
def add_age(papers):
    """Add a ``preprint_age`` column: time between each submission and the newest one.

    Parameters
    ----------
    papers : pandas.DataFrame
        Must contain a datetime column ``subDate``. The frame is modified in
        place (the new column is added to it).

    Returns
    -------
    pandas.DataFrame
        The same ``papers`` object, now with a timedelta column
        ``preprint_age``; rows with a missing subDate get a missing age.
    """
    # Use the frame that was passed in (not a notebook-level global) so the
    # function works for any input and on a fresh kernel.
    latest = papers.subDate.max()

    # max() is NaT when no row has a subDate; NaT cannot be formatted with strftime.
    latest_text = latest.strftime('%Y-%m-%d') if pd.notna(latest) else 'n/a'
    print("Lastest preprint submission time: " + latest_text + ".")

    papers['preprint_age'] = (latest - papers.subDate) # 3548 preprints

    return papers

# add_age adds the column to papers_dates itself and returns that same object,
# so papers_age and papers_dates refer to one DataFrame from here on.
papers_age = add_age(papers_dates)

Lastest preprint submission time: 2021-04-08.


## Mark preprints vs published articles

In [13]:
# A row is flagged as a preprint when any of these holds:
#   - its collection is one of the preprint servers,
#   - its journal field names a preprint server,
#   - it matched a preprint record by title (subDate is present).
papers_age['is_preprint'] = (
    papers_age['collection'].isin(['arxiv', 'biorxiv', 'medrxiv'])
    | papers_age['journal'].isin(['ArXiv', 'medRxiv', 'bioRxiv', 'ChemRxiv'])
    | papers_age['subDate'].notna()
)  # 9468 published papers, 5294 preprints(4897 in arxiv, 371 in pubmed, 26 subDate not null)

## Add publish labels

In [14]:
# Lower-cased titles of preprints that are flagged as published by hand;
# they are matched against the lower-cased title column.
pub_preprints_manual_added = [
    'a rigorous evaluation of optimal peptide targets for ms-based clinical diagnostics of coronavirus disease 2019 (covid-19).',
    'vaccine optimization for covid-19, who to vaccinate first?',
    'identification of vulnerable populations and areas at higher risk of covid-19 related mortality in the u.s.',
    'accommodating individual travel history, global mobility, and unsampled diversity in phylogeography: a sars-cov-2 case study.',
    'sars-cov2 (covid-19) structural/evolution dynamicome: insights into functional evolution and human genomics.',
]

In [15]:
# 9468 pubmed papers published, 813 preprints published,(including 778 published match, 31 pubDate match, 5 manual added)
# Rule, as written below:
#   - every non-preprint row counts as published;
#   - a preprint counts as published when any of these holds:
#       * its `published` column is not null;
#       * it has a pubDate (it matched a PubMed record by title) AND either its acptDate is
#         before 2028-03-01, or its acptDate is on/after 2028-03-01 while both subDate and
#         journal are present;
#       * its title is listed in pub_preprints_manual_added.
# 2028-03-01 is the placeholder that concate_dates_and_redup substitutes for missing dates,
# so "acptDate < 2028-03-01" selects rows with a real acceptance date.
papers_age['is_published'] = ~papers_age.is_preprint | \
                            (papers_age.is_preprint & (~papers_age.published.isna()
                                                       | (~papers_age.pubDate.isna() & ((papers_age.acptDate < "2028-03-01") | 
                                                                                      ((papers_age.acptDate >= "2028-03-01") & ~papers_age.subDate.isna() & ~papers_age.journal.isna())))
                                                       | papers_age.title.isin(pub_preprints_manual_added)))

In [16]:
# Number of rows flagged as published (preprints and non-preprints together).
len(papers_age[papers_age.is_published])

10281

In [17]:
# Number of rows flagged both as preprint and as published.
len(papers_age[papers_age.is_preprint & papers_age.is_published])

813

## Separate training, validation, and testing datasets

In [18]:
# Held-out set for searching promising papers: preprints whose age (relative to
# the newest submission) is at most 30 days. Rows without an age are not selected.
papers_age['preprint_test'] = papers_age['preprint_age'] <= pd.Timedelta(days=30)  # 255

In [19]:
# Publication status of the held-out (preprint_test) rows.
papers_age[papers_age.preprint_test].is_published.value_counts()

False    253
True       2
Name: is_published, dtype: int64

In [20]:
# Training preprints: every preprint that is not in the held-out preprint_test set.
papers_age['preprint_train'] = papers_age['is_preprint'] & ~papers_age['preprint_test']

In [21]:
# Every row that is not flagged as a preprint is used for training.
papers_age['pubmed_train'] = ~papers_age['is_preprint']

# Combine features and save

In [24]:
def save_features(filepath, papers):
    """Write ``papers`` to ``filepath`` as CSV after printing a timestamped message.

    The DataFrame index is written too, as an unnamed first column.

    Parameters
    ----------
    filepath : str
        Destination path of the CSV file.
    papers : pandas.DataFrame
        Table to write.
    """
    stamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')
    print(stamp + ": Save features ...")

    papers.to_csv(path_or_buf=filepath)

In [27]:
# Training file: rows flagged preprint_train or pubmed_train.
save_features(
    train_data_path,
    papers_age.loc[papers_age['preprint_train'] | papers_age['pubmed_train']],
)
# Test file: the held-out rows flagged preprint_test.
save_features(test_data_path, papers_age.loc[papers_age['preprint_test']])

2021-05-14 16:07:53.128267: Save features ...
2021-05-14 16:07:55.554537: Save features ...
