In [1]:
import pandas as pd
import numpy as np
from datetime import datetime

In [2]:
# Directory that holds every CSV this notebook reads or writes.
inoutpath = 'data/'

# Input tables.
compdata_path = f'{inoutpath}features.ori_doc2vec_deepwalk.csv'  # 'compdata_ext.csv' was left here as a commented alternative
CSCoV_scores_path = f'{inoutpath}CSCoV_scores.csv'
pubmed_path = f'{inoutpath}PubMed.csv'
arxiv_path = f'{inoutpath}aRxiv.csv'
biomedrxiv_path = f'{inoutpath}biomedRxiv.csv'

# Output tables: training split, testing split and the complete labelled set.
train_data_path = f'{inoutpath}features.ori_doc2vec_deepwalk_scores_train.csv'
test_data_path = f'{inoutpath}features.ori_doc2vec_deepwalk_scores_test.csv'
all_data_path = f'{inoutpath}features.ori_doc2vec_deepwalk_scores_all.csv'

# NOTE(review): not read anywhere in the visible part of the notebook.
is_balanced = False

# Load data and add author and article metrics
## Load data

In [3]:
def load_data(compdata_path):
    """Read a CSV file into a DataFrame, discarding a saved index column if present.

    Parameters
    ----------
    compdata_path : str
        Path of the CSV file to read.

    Returns
    -------
    pandas.DataFrame
        The file contents without the ``'Unnamed: 0'`` column.
    """
    print(datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f') + ": Loading data ...")
    papers = pd.read_csv(compdata_path, index_col=False)

    # 'Unnamed: 0' only exists when the file was written together with its
    # index (the default of DataFrame.to_csv). Files saved without an index do
    # not have it, so ignore its absence instead of raising KeyError.
    return papers.drop(columns=['Unnamed: 0'], errors='ignore')

# Load the feature table stored at compdata_path.
papers = load_data(compdata_path)

2021-07-26 00:04:13.480187: Loading data ...


  if (await self.run_code(code, result,  async_=asy)):


In [4]:
# Number of rows in the loaded feature table.
papers.shape[0]

18575

## Add Author and Article metrics

In [5]:
# Load the CSCoV score table and append its columns to the feature table.
CSCoV_scores = load_data(CSCoV_scores_path)
# axis=1 concatenation pairs rows by index label, not by a key column.
# NOTE(review): this presumes both files list the same papers in the same row
# order and have the same length - confirm against how the files are produced.
papers_amet = pd.concat([papers, CSCoV_scores], axis=1)

2021-07-26 00:04:14.429953: Loading data ...


# Add subDate, acptDate and pubDate
## Add subDate from archives (ArXiv, BioRxiv and MedRxiv)

In [6]:
# Load the two preprint listings (same order as before: arxiv first).
arxiv, biomedrxiv = (load_data(path) for path in (arxiv_path, biomedrxiv_path))

2021-07-26 00:04:14.548689: Loading data ...
2021-07-26 00:04:14.635342: Loading data ...


In [7]:
# Give both preprint tables the same three columns: doi, subDate, title.
arxiv['subDate'] = pd.to_datetime(arxiv['submitted'])
arxiv['title'] = arxiv['title'].str.lower()

biomedrxiv['subDate'] = pd.to_datetime(biomedrxiv['rel_date'])
biomedrxiv['doi'] = biomedrxiv['rel_doi']
biomedrxiv['title'] = biomedrxiv['rel_title'].str.lower()

all_rxiv = pd.concat([
    arxiv[['doi', 'subDate', 'title']],
    biomedrxiv[['doi', 'subDate', 'title']],
])

# Remove rows lacking a doi or a title, then keep the first row per doi and
# the first row per title.
all_rxiv_redup = (
    all_rxiv[all_rxiv['doi'].notna() & all_rxiv['title'].notna()]
    .drop_duplicates(subset=['doi'])
    .drop_duplicates(subset=['title'])
)

In [8]:
# Lower-case the titles (the preprint and PubMed titles are lower-cased too and
# later merged on), then keep the first row per title and per DOI.
papers_amet['title'] = papers_amet['title'].str.lower()
papers_redup = (
    papers_amet
    .drop_duplicates(subset=['title'])
    .drop_duplicates(subset=['DOI'])
)

In [9]:
# Left-join the preprint submission date onto the papers by lower-cased title;
# papers without a matching preprint keep a missing subDate.
papers_subDate = papers_redup.merge(
    all_rxiv_redup[['subDate', 'title']],
    on='title',
    how='left',
)
papers_subDate[['subDate']].describe(datetime_is_numeric=True)

Unnamed: 0,subDate
count,3978
mean,2020-09-30 11:18:26.947963904
min,2019-10-16 23:06:34
25%,2020-05-24 00:00:00
50%,2020-09-10 00:00:00
75%,2021-02-02 00:00:00
max,2021-06-22 00:00:00


## Add acptDate and pubDate from PubMed

In [10]:
def concate_dates_and_redup(pubmed):
    """Build ``pubDate``/``acptDate`` columns from PubMed date parts and de-duplicate.

    Parameters
    ----------
    pubmed : pandas.DataFrame
        Needs the columns ``pubYear``, ``pubMonth``, ``pubDay``, ``acptYear``,
        ``acptMonth``, ``acptDay``, ``title`` and ``doi``. ``pubMonth`` may be a
        month number (``3``, ``'3'``, ``'03'``) or an English three-letter
        abbreviation (``'Mar'``). The frame itself is left untouched.

    Returns
    -------
    pandas.DataFrame
        A new frame with integer date parts, datetime ``pubDate`` and
        ``acptDate`` columns, lower-cased titles, and only the first row kept
        per title and then per doi.

    Raises
    ------
    KeyError
        If a ``pubMonth`` value is neither a month number 1-12 nor a known
        abbreviation.

    Notes
    -----
    Caution: missing date parts are filled with year 2028, month 3 and day 1,
    so a completely missing date becomes the placeholder 2028-03-01 in both
    ``pubDate`` and ``acptDate``.
    """
    print(datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f') + ": Processing pubmed data ...")
    month_abbr = ('Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul',
                  'Aug', 'Sep', 'Oct', 'Nov', 'Dec')
    monthMap = {abbr: number for number, abbr in enumerate(month_abbr, start=1)}

    def _month_number(value):
        # Turn 'Mar', '03', '3', 3 or 3.0 into 3; a missing month becomes 3.
        if pd.isna(value):
            return 3
        text = str(value).strip()
        if text in monthMap:
            return monthMap[text]
        try:
            number = int(float(text))
        except (ValueError, OverflowError):
            raise KeyError(value) from None
        if not 1 <= number <= 12:
            raise KeyError(value)
        return number

    def _assemble(frame, prefix):
        # pd.to_datetime builds dates directly from year/month/day columns,
        # which avoids joining strings row by row.
        parts = frame[[prefix + 'Year', prefix + 'Month', prefix + 'Day']].rename(
            columns={prefix + 'Year': 'year', prefix + 'Month': 'month', prefix + 'Day': 'day'})
        return pd.to_datetime(parts)

    # Work on a copy: rewriting the caller's columns in place made a second
    # call on the same frame fail, because the months were already integers.
    result = pubmed.copy()

    # Caution: null values are replaced to 2028-03-01 in pubDate and acptDate
    result['pubYear'] = result['pubYear'].fillna(2028).astype('int64')
    result['pubMonth'] = result['pubMonth'].apply(_month_number)
    result['pubDay'] = result['pubDay'].fillna(1).astype('int64')
    result['pubDate'] = _assemble(result, 'pub')

    result['acptYear'] = result['acptYear'].fillna(2028).astype('int64')
    result['acptMonth'] = result['acptMonth'].fillna(3).astype('int64')
    result['acptDay'] = result['acptDay'].fillna(1).astype('int64')
    result['acptDate'] = _assemble(result, 'acpt')

    result['title'] = result['title'].str.lower()
    # NOTE(review): drop_duplicates treats missing values as equal, so rows
    # without a title (or without a doi) collapse to a single row - confirm
    # that this is intended.
    pubmed_redup = result.drop_duplicates(subset=['title']).drop_duplicates(subset=['doi'])

    print(datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f') + ": Processing pubmed data finished.")

    return pubmed_redup

In [11]:
# Load the PubMed table, then derive pubDate/acptDate and drop duplicate
# titles and dois.
pubmed = load_data(pubmed_path)
pubmed_day = concate_dates_and_redup(pubmed)

2021-07-26 00:04:15.458450: Loading data ...
2021-07-26 00:04:17.108786: Processing pubmed data ...
2021-07-26 00:04:26.044653: Processing pubmed data finished.


In [12]:
# Left-join the PubMed acceptance and publication dates by lower-cased title;
# papers without a PubMed match keep missing dates.
papers_dates = papers_subDate.merge(
    pubmed_day[['title', 'acptDate', 'pubDate']],
    on='title',
    how='left',
)
papers_dates[['subDate', 'acptDate', 'pubDate']].describe(datetime_is_numeric=True)

Unnamed: 0,subDate,acptDate,pubDate
count,3978,12565,12565
mean,2020-09-30 11:18:26.947963904,2022-09-08 08:15:32.877039360,2021-02-13 00:59:49.399124224
min,2019-10-16 23:06:34,2020-01-14 00:00:00,2020-01-01 00:00:00
25%,2020-05-24 00:00:00,2020-09-30 00:00:00,2020-10-01 00:00:00
50%,2020-09-10 00:00:00,2021-02-01 00:00:00,2021-02-01 00:00:00
75%,2021-02-02 00:00:00,2021-06-05 00:00:00,2021-04-01 00:00:00
max,2021-06-22 00:00:00,2028-03-01 00:00:00,2028-03-01 00:00:00


# Add publishing labels
## Add age to preprints

In [13]:
def add_age(papers):
    """Add a ``preprint_age`` column measured from the latest submission date.

    Parameters
    ----------
    papers : pandas.DataFrame
        Must contain a datetime ``subDate`` column.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in; ``preprint_age`` is written into
        it in place as ``max(subDate) - subDate``. Rows without a ``subDate``
        get a missing age.
    """
    # Use the frame that was passed in. Reading the notebook-level
    # `papers_dates` here reported the wrong date (or raised NameError) as
    # soon as the function was called with any other frame.
    latest = papers.subDate.max()
    print("Lastest preprint submission time: " + latest.strftime('%Y-%m-%d') + ".")
    papers['preprint_age'] = latest - papers.subDate  # author's note: 3548 preprints

    return papers

# add_age writes the new column into the frame it receives and returns that
# same frame, so papers_age and papers_dates are one and the same DataFrame.
papers_age = add_age(papers_dates)

Lastest preprint submission time: 2021-06-22.


## Mark preprints vs published articles

In [14]:
# A paper is flagged as a preprint when its collection or its journal names a
# preprint server, or when a preprint submission date was matched for it.
# Author's counts: 11192(9468) published papers, 5703(5294) preprints(5272(4897) in arxiv, 399(371) in pubmed, 32(26) subDate not null)
papers_age['is_preprint'] = (
    papers_age['collection'].isin(['arxiv', 'biorxiv', 'medrxiv'])
    | papers_age['journal'].isin(['ArXiv', 'medRxiv', 'bioRxiv', 'ChemRxiv'])
    | papers_age['subDate'].notna()
)

In [15]:
# Number of papers left after de-duplication and the date merges.
papers_age.shape[0]

18175

## Add publish labels

In [16]:
# Lower-cased titles of preprints that are labelled as published by hand; the
# is_published rule checks paper titles against this list.
pub_preprints_manual_added = ['a rigorous evaluation of optimal peptide targets for ms-based clinical diagnostics of coronavirus disease 2019 (covid-19).',
                              'vaccine optimization for covid-19, who to vaccinate first?',
                              'identification of vulnerable populations and areas at higher risk of covid-19 related mortality in the u.s.',
                              'accommodating individual travel history, global mobility, and unsampled diversity in phylogeography: a sars-cov-2 case study.',
                              'sars-cov2 (covid-19) structural/evolution dynamicome: insights into functional evolution and human genomics.',
                              'dynamics of b-cell repertoires and emergence of cross-reactive responses in covid-19 patients with different disease severity.',
                              'fast identification of possible drug treatment of coronavirus disease -19 (covid-19) through computational drug repurposing study.',
                              'supercomputer-based ensemble docking drug discovery pipeline with application to covid-19.',
                              'adaptive evolution of peptide inhibitors for mutating sars-cov-2.',
                              'comparative multiplexed interactomics of sars-cov-2 and homologous coronavirus non-structural proteins identifies unique and shared host-cell dependencies.',
                              'conserved genomic terminals of sars-cov-2 as co-evolving functional elements and potential therapeutic targets.',
                              'female reproductive tract has low concentration of sars-cov2 receptors.',
                              'single-cell longitudinal analysis of sars-cov-2 infection in human bronchial epithelial cells.',
                              'a transcriptional regulatory atlas of coronavirus infection of human cells.',
                              'an insertion unique to sars-cov-2 exhibits superantigenic character strengthened by recent mutations.',
                              'rna-gps predicts sars-cov-2 rna localization to host mitochondria and nucleolus.',
                              'targeting the sars-cov-2 main protease to repurpose drugs for covid-19.',
                              'a modular framework for multiscale multicellular spatial modeling of viral infection, immune response and drug therapy timing and efficacy in epithelial tissues: a multiscale model of viral infection in epithelial tissues.']
# Display how many titles were added by hand.
len(pub_preprints_manual_added)

18

In [17]:
# 11192(9468) pubmed papers published, 879(813) preprints published,(including 826(778) published match, 37(31) pubDate match, 18(5) manual added)
#
# A paper is labelled published when it is not a preprint, or when it is a
# preprint and at least one of the following holds:
#   * its `published` field is filled;
#   * it has a pubDate (its title matched a PubMed row) and either a real
#     acptDate (earlier than 2028-03-01) or - when acptDate is the 2028-03-01
#     placeholder that stands for a missing acceptance date - both a subDate
#     and a journal;
#   * its title is in pub_preprints_manual_added.
papers_age['is_published'] = ~papers_age.is_preprint | \
                            (papers_age.is_preprint & (~papers_age.published.isna()
                                                       | (~papers_age.pubDate.isna() & ((papers_age.acptDate < "2028-03-01") | 
                                                                                      ((papers_age.acptDate >= "2028-03-01") & ~papers_age.subDate.isna() & ~papers_age.journal.isna())))
                                                       | papers_age.title.isin(pub_preprints_manual_added)))

In [18]:
# Do not truncate long cell values (such as titles) when frames are displayed.
pd.set_option('display.max_colwidth', None)
# Inspection only - nothing is assigned here. Selects preprints that
#   * have no `published` value,
#   * fail the date-based part of the is_published rule,
#   * nevertheless have a pubDate but no subDate,
#   * carry journal == 'medRxiv' and a pubDate before 2021-01-01,
# and summarises their date columns. The trailing .head/.tail fragment is a
# leftover for paging through the rows by hand.
papers_age[papers_age.is_preprint & papers_age.published.isna() 
           & ~(~papers_age.pubDate.isna() & ((papers_age.acptDate < "2028-03-01") | 
                                             ((papers_age.acptDate >= "2028-03-01") & ~papers_age.subDate.isna() & ~papers_age.journal.isna())))
           & ~papers_age.pubDate.isna() &  papers_age.subDate.isna() & (papers_age.journal == 'medRxiv') & (papers_age.pubDate < "2021-01-01")
                                                                       ][['title', 'subDate', 'acptDate', 'pubDate', 'journal', 'published']].describe(datetime_is_numeric=True)#.head(140).tail(10)#

# Author's counts:
# subDate is not null: 16,
# * pubDate is not null: 375, subDate is null: 359 (151 bioRxiv, 197 medRxiv, 3 ArXiv, 8 ChemRxiv). Some articles are not correctly labeled.

Unnamed: 0,subDate,acptDate,pubDate
count,0,110,110
mean,NaT,2028-03-01 00:00:00,2020-08-07 15:03:16.363636480
min,NaT,2028-03-01 00:00:00,2020-02-17 00:00:00
25%,NaT,2028-03-01 00:00:00,2020-05-18 06:00:00
50%,NaT,2028-03-01 00:00:00,2020-07-27 12:00:00
75%,NaT,2028-03-01 00:00:00,2020-11-03 00:00:00
max,NaT,2028-03-01 00:00:00,2020-12-29 00:00:00


In [19]:
# Papers labelled as published.
papers_age.loc[papers_age['is_published']].shape[0]

13176

In [20]:
# Papers not labelled as published.
papers_age.loc[~papers_age['is_published']].shape[0]

4999

In [21]:
# Preprints that are also labelled as published.
papers_age.loc[papers_age['is_preprint'] & papers_age['is_published']].shape[0]

920

## Separate training and testing datasets

In [22]:
# Papers in last X days for searching promising papers:
#  X=30: 255-vs-2, *X=90: 680-vs-14, X=180: 1254-vs-102,
# Rows without a preprint_age compare as False and therefore stay out of the test set.
papers_age['preprint_test'] = papers_age['preprint_age'] <= pd.Timedelta(days=90)  # 255

# Published vs. unpublished inside the test window.
papers_age.loc[papers_age['preprint_test'], 'is_published'].value_counts()  # 05-31: 669-vs-12, 07-11: 646 (unpub)-vs-10 (pub)

False    646
True      10
Name: is_published, dtype: int64

In [23]:
# Training preprints: every preprint that is not in the test window.
papers_age['preprint_train'] = papers_age['is_preprint'] & ~papers_age['preprint_test']

In [24]:
# Every paper that is not flagged as a preprint is used for training.
papers_age['pubmed_train'] = ~papers_age['is_preprint']

In [25]:
# Alternative count kept from the original cell:
# len(papers_age[papers_age.preprint_train & ~papers_age.is_published])

# Published papers in the training data.
papers_age.loc[
    (papers_age['preprint_train'] | papers_age['pubmed_train']) & papers_age['is_published']
].shape[0]

13166

In [26]:
# Size of the test set.
papers_age.loc[papers_age['preprint_test']].shape[0]

656

# Combine features and save

In [27]:
def save_features(filepath, papers):
    """Write ``papers`` to ``filepath`` as CSV after printing a timestamped message.

    The frame's index is written as the first, unnamed column.
    """
    stamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')
    print(f"{stamp}: Save features ...")

    papers.to_csv(filepath)

In [28]:
# Total number of rows about to be written out.
papers_age.shape[0]

18175

In [29]:
# Training data: preprints outside the test window plus all non-preprints.
save_features(
    train_data_path,
    papers_age.loc[papers_age['preprint_train'] | papers_age['pubmed_train']],
)

# Testing data: papers inside the test window.
save_features(test_data_path, papers_age.loc[papers_age['preprint_test']])

2021-07-26 00:04:27.987118: Save features ...
2021-07-26 00:04:31.407481: Save features ...


In [30]:
# All data: every row of papers_age, including the label and split columns
# added above.
save_features(all_data_path, papers_age)

2021-07-26 00:04:31.533194: Save features ...


# Check distribution of omics papers

In [31]:
# Papers whose keywords contain 'genom', counted per topic.
# na=False treats a missing keywords entry as "no match"; without it the mask
# holds NaN for such rows and the boolean row selection raises.
papers_age[papers_age.keywords.str.contains('genom', na=False)][['title','topic']].groupby('topic').count()

Unnamed: 0_level_0,title
topic,Unnamed: 1_level_1
Clinics,85
Drug discovery,382
Epidemiology,75
Genomics,1307
Healthcare,41
Imaging,5


In [32]:
# Papers whose keywords contain 'transcriptom', counted per topic.
# na=False treats a missing keywords entry as "no match"; without it the mask
# holds NaN for such rows and the boolean row selection raises.
papers_age[papers_age.keywords.str.contains('transcriptom', na=False)][['title','topic']].groupby('topic').count()

Unnamed: 0_level_0,title
topic,Unnamed: 1_level_1
Clinics,46
Drug discovery,494
Genomics,56
Healthcare,4
Imaging,2


In [33]:
# Papers whose keywords contain 'proteom', counted per topic.
# na=False treats a missing keywords entry as "no match"; without it the mask
# holds NaN for such rows and the boolean row selection raises.
papers_age[papers_age.keywords.str.contains('proteom', na=False)][['title','topic']].groupby('topic').count()

Unnamed: 0_level_0,title
topic,Unnamed: 1_level_1
Clinics,66
Drug discovery,372
Epidemiology,4
Genomics,65
Healthcare,7
Imaging,2


In [34]:
# Papers whose keywords contain 'metabolom', counted per topic.
# na=False treats a missing keywords entry as "no match"; without it the mask
# holds NaN for such rows and the boolean row selection raises.
papers_age[papers_age.keywords.str.contains('metabolom', na=False)][['title','topic']].groupby('topic').count()

Unnamed: 0_level_0,title
topic,Unnamed: 1_level_1
Clinics,37
Drug discovery,52
Epidemiology,1
Genomics,7
Healthcare,4
Imaging,1


In [35]:
# Papers whose keywords contain 'interactom', counted per topic.
# na=False treats a missing keywords entry as "no match"; without it the mask
# holds NaN for such rows and the boolean row selection raises.
papers_age[papers_age.keywords.str.contains('interactom', na=False)][['title','topic']].groupby('topic').count()

Unnamed: 0_level_0,title
topic,Unnamed: 1_level_1
Clinics,1
Drug discovery,110
Genomics,3
Healthcare,2
Imaging,2


In [36]:
# Papers whose keywords contain 'phenom', counted per topic.
# na=False treats a missing keywords entry as "no match"; without it the mask
# holds NaN for such rows and the boolean row selection raises.
papers_age[papers_age.keywords.str.contains('phenom', na=False)][['title','topic']].groupby('topic').count()

Unnamed: 0_level_0,title
topic,Unnamed: 1_level_1
Clinics,2
Drug discovery,2
Genomics,1


In [37]:
# Papers whose keywords contain 'radiom', counted per topic.
# na=False treats a missing keywords entry as "no match"; without it the mask
# holds NaN for such rows and the boolean row selection raises.
papers_age[papers_age.keywords.str.contains('radiom', na=False)][['title','topic']].groupby('topic').count()

Unnamed: 0_level_0,title
topic,Unnamed: 1_level_1
Clinics,17
Drug discovery,1
Epidemiology,12
Genomics,1
Imaging,61


In [38]:
# Papers whose keywords contain 'epigenom', counted per topic (this cell
# counts the keywords column rather than the title column).
# na=False treats a missing keywords entry as "no match"; without it the mask
# holds NaN for such rows and the boolean row selection raises.
papers_age[papers_age.keywords.str.contains('epigenom', na=False)][['keywords','topic']].groupby('topic').count()

Unnamed: 0_level_0,keywords
topic,Unnamed: 1_level_1
