Some exploration and preprocessing to create one dataframe and export it as .csv.

Adjusted to data from 2020-05-01.

# Load Packages

In [1]:
import numpy as np 
import pandas as pd

import glob
import json

# Load and Prepare Data

To read the JSON files we follow [COVID EDA: Initial Exploration Tool](https://www.kaggle.com/ivanegapratama/covid-eda-initial-exploration-tool).

In [2]:
root_path = '/kaggle/input/CORD-19-research-challenge/'
metadata_path = f'{root_path}/metadata.csv'
meta_df = pd.read_csv(metadata_path, dtype={
    'pubmed_id': str,
    'Microsoft Academic Paper ID': str, 
    'doi': str
})
meta_df.head(2)

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,cord_uid,sha,source_x,title,doi,pmcid,pubmed_id,license,abstract,publish_time,authors,journal,Microsoft Academic Paper ID,WHO #Covidence,arxiv_id,has_pdf_parse,has_pmc_xml_parse,full_text_file,url
0,zjufx4fo,b2897e1277f56641193a6db73825f707eed3e4c9,PMC,Sequence requirements for RNA strand transfer ...,10.1093/emboj/20.24.7220,PMC125340,11742998,unk,Nidovirus subgenomic mRNAs contain a leader se...,2001-12-17,"Pasternak, Alexander O.; van den Born, Erwin; ...",The EMBO Journal,,,,True,True,custom_license,http://europepmc.org/articles/pmc125340?pdf=re...
1,ymceytj3,e3d0d482ebd9a8ba81c254cc433f314142e72174,PMC,"Crystal structure of murine sCEACAM1a[1,4]: a ...",10.1093/emboj/21.9.2076,PMC125375,11980704,unk,CEACAM1 is a member of the carcinoembryonic an...,2002-05-01,"Tan, Kemin; Zelus, Bruce D.; Meijers, Rob; Liu...",The EMBO Journal,,,,True,True,custom_license,http://europepmc.org/articles/pmc125375?pdf=re...


In [3]:
len(meta_df) - meta_df.has_pdf_parse.sum()

14124

In [4]:
meta_df.isnull().sum()

cord_uid                           0
sha                            14124
source_x                           0
title                            163
doi                             4086
pmcid                          11748
pubmed_id                      18559
license                            0
abstract                       11130
publish_time                       8
authors                         2591
journal                         7059
Microsoft Academic Paper ID    58923
WHO #Covidence                 58119
arxiv_id                       59211
has_pdf_parse                      0
has_pmc_xml_parse                  0
full_text_file                  8461
url                              440
dtype: int64

In [5]:
meta_df.cord_uid.nunique()

59851

In [6]:
meta_df.sha.nunique()

45748

In [7]:
meta_df.title.nunique()

58676

In [8]:
all_json = glob.glob(f'{root_path}/**/pdf_json/*.json', recursive=True)
len(all_json)

48409

In [9]:
all_json_pmc = glob.glob(f'{root_path}/**/pmc_json/*.json', recursive=True)
len(all_json_pmc)

22851

# pdf_json

In [10]:
methods = ['methods','method','statistical methods','materials','materials and methods',
                'data collection','the study','study design','experimental design','objective',
                'objectives','procedures','data collection and analysis', 'methodology',
                'material and methods','the model','experimental procedures','main text']

In [11]:
# [''.join(x.lower() for x in m if x.isalpha()) for m in methods]

# for m in methods:
#     print(''.join(x.lower() for x in m if x.isalpha()))

In [12]:
class FileReader:
    def __init__(self, file_path):
        with open(file_path) as file:
            content = json.load(file)
            self.paper_id = content['paper_id']
            self.abstract = []
            self.body_text = []
            self.methods = []
            self.results = []

            # Abstract
            for entry in content['abstract']:
                self.abstract.append(entry['text'])
            # Body text
            for entry in content['body_text']:
                self.body_text.append(entry['text'])
            # Methods
            methods = ['methods','method','statistical methods','materials','materials and methods',
                'data collection','the study','study design','experimental design','objective',
                'objectives','procedures','data collection and analysis', 'methodology',
                'material and methods','the model','experimental procedures','main text']
            for entry in content['body_text']:
                section_title = ''.join(x.lower() for x in entry['section'] if x.isalpha()) #remove numbers and spaces
                if any(m in section_title for m in [''.join(x.lower() for x in m if x.isalpha()) for m in methods]) : 
                    self.methods.append(entry['text'])
            # Results
            results_synonyms = ['result']
            for entry in content['body_text']:
                section_title = ''.join(x.lower() for x in entry['section'] if x.isalpha())
                if any(r in section_title for r in results_synonyms) :
                    self.results.append(entry['text'])
                    
            self.abstract = '\n'.join(self.abstract)
            self.body_text = '\n'.join(self.body_text)
            self.methods = '\n'.join(self.methods)
            self.results = '\n'.join(self.results)

    def __repr__(self):
        return f'{self.paper_id}: {self.abstract[:200]}... {self.body_text[:200]}...'
first_row = FileReader(all_json[0])
print(first_row)

3cdc48bb9e40afd30a59463b7872761a726998c8: House ßies, Musca domestica L. (Diptera: Muscidae), were examined for their ability to harbor and transmit Newcastle disease virus (family Paramyxoviridae, genus Avulavirus, NDV) by using a mesogenic ... Newcastle disease (ND) is an emerging disease affecting the poultry industry worldwide. The nature of the international poultry industry facilitates the movement of poultry products, equipment, and pe...


In [13]:
dict_ = {'paper_id': [], 'abstract': [], 'body_text': [], 'methods': [], 'results': []}
for idx, entry in enumerate(all_json):
    if idx % (len(all_json) // 10) == 0:
        print(f'Processing index: {idx} of {len(all_json)}')
    content = FileReader(entry)
    dict_['paper_id'].append(content.paper_id)
    dict_['abstract'].append(content.abstract)
    dict_['body_text'].append(content.body_text)
    dict_['methods'].append(content.methods)
    dict_['results'].append(content.results)

Processing index: 0 of 48409
Processing index: 4840 of 48409
Processing index: 9680 of 48409
Processing index: 14520 of 48409
Processing index: 19360 of 48409
Processing index: 24200 of 48409
Processing index: 29040 of 48409
Processing index: 33880 of 48409
Processing index: 38720 of 48409
Processing index: 43560 of 48409
Processing index: 48400 of 48409


In [14]:
papers = pd.DataFrame(dict_, columns=['paper_id', 'abstract', 'body_text', 'methods', 'results'])
papers.head()

Unnamed: 0,paper_id,abstract,body_text,methods,results
0,3cdc48bb9e40afd30a59463b7872761a726998c8,"House ßies, Musca domestica L. (Diptera: Musci...",Newcastle disease (ND) is an emerging disease ...,ND Virus Culture. NDV (Roakin strain) was obta...,Experiment 1. Adult house ßies harbored Newcas...
1,d99acb4e99be7852aa61a688c9fbd38d44b5a252,Live attenuated recombinant measles vaccine vi...,Live attenuated viruses have been developed an...,Viruses and cells. RSV A2 strain was obtained ...,
2,748d4c57fe1acc8d9d97cf574f7dea5296f9386c,Ebola virus (EBOV) makes extensive and intrica...,occurs primarily through a macropinocytosis-li...,Cells. U2OS human osteosarcoma cells were cult...,
3,b891efc6e1419713b05ff7d89b26d260478c28df,China has the world's second largest tuberculo...,The goal of the present study was to investiga...,,
4,353852971069ad5794445e5c1ab6077ce23da75d,,Coronavirus disease 2019 (COVID-19) has spread...,,


In [15]:
papers[(papers.results.str.len() != 0) | (papers.methods.str.len() != 0)].shape

(18591, 5)

In [16]:
df = pd.merge(papers, meta_df, left_on='paper_id', right_on='sha', how='left').drop('sha', axis=1)

In [17]:
df.columns

Index(['paper_id', 'abstract_x', 'body_text', 'methods', 'results', 'cord_uid',
       'source_x', 'title', 'doi', 'pmcid', 'pubmed_id', 'license',
       'abstract_y', 'publish_time', 'authors', 'journal',
       'Microsoft Academic Paper ID', 'WHO #Covidence', 'arxiv_id',
       'has_pdf_parse', 'has_pmc_xml_parse', 'full_text_file', 'url'],
      dtype='object')

# pmc_json

This only contains the full text - no abstracts!

In [18]:
class FileReader:
    def __init__(self, file_path):
        with open(file_path) as file:
            content = json.load(file)
            self.paper_id = content['paper_id']
            self.body_text = []
            self.methods = []
            self.results = []

            # Body text
            for entry in content['body_text']:
                self.body_text.append(entry['text'])
            # Methods
            methods = ['methods','method','statistical methods','materials','materials and methods',
                'data collection','the study','study design','experimental design','objective',
                'objectives','procedures','data collection and analysis', 'methodology',
                'material and methods','the model','experimental procedures','main text']
            for entry in content['body_text']:
                section_title = ''.join(x.lower() for x in entry['section'] if x.isalpha()) #remove numbers and spaces
                if any(m in section_title for m in [''.join(x.lower() for x in m if x.isalpha()) for m in methods]) : 
                    self.methods.append(entry['text'])
            # Results
            results_synonyms = ['result']
            for entry in content['body_text']:
                section_title = ''.join(x.lower() for x in entry['section'] if x.isalpha())
                if any(r in section_title for r in results_synonyms) :
                    self.results.append(entry['text'])
                    
            self.body_text = '\n'.join(self.body_text)
            self.methods = '\n'.join(self.methods)
            self.results = '\n'.join(self.results)

    def __repr__(self):
        return f'{self.paper_id}: {self.body_text[:200]}...'
first_row = FileReader(all_json_pmc[0])
print(first_row)

PMC6254639: Human rhinoviruses (HRV) are positive single-stranded RNA viruses belonging to the family Piconaviridae. HRV infection in humans usually causes common cold and mild illnesses, but is sometimes associa...


In [19]:
dict_ = {'paper_id': [], 'body_text': [], 'methods': [], 'results': []}
for idx, entry in enumerate(all_json_pmc):
    if idx % (len(all_json_pmc) // 10) == 0:
        print(f'Processing index: {idx} of {len(all_json_pmc)}')
    content = FileReader(entry)
    dict_['paper_id'].append(content.paper_id)
    dict_['body_text'].append(content.body_text)
    dict_['methods'].append(content.methods)
    dict_['results'].append(content.results)

Processing index: 0 of 22851
Processing index: 2285 of 22851
Processing index: 4570 of 22851
Processing index: 6855 of 22851
Processing index: 9140 of 22851
Processing index: 11425 of 22851
Processing index: 13710 of 22851
Processing index: 15995 of 22851
Processing index: 18280 of 22851
Processing index: 20565 of 22851
Processing index: 22850 of 22851


In [20]:
pmc_text = pd.DataFrame(dict_, columns=['paper_id', 'body_text', 'methods', 'results'])
pmc_text.head()

Unnamed: 0,paper_id,body_text,methods,results
0,PMC6254639,Human rhinoviruses (HRV) are positive single-s...,Pochonin D was synthesized and purified as rec...,To identify the anti-HRV1B effect of pochonin ...
1,PMC4713849,"On May 20, 2015, Middle East respiratory syndr...",The study population consisted of clinical lab...,The number of MERS-CoV rRT-PCR tests performed...
2,PMC6294899,The first quantities generally investigated in...,The model considers a weighted multiplex netwo...,The first quantities generally investigated in...
3,PMC7142029,Fever is a common cause of patient visits in p...,Ethical approval for this study was obtained f...,"During the study period, 610 febrile infants a..."
4,PMC5404251,Multiple stockpiles and releasing models were ...,,


In [21]:
pmc_text.shape

(22851, 4)

Careful, some of the new texts are empty strings!

In [22]:
pmc_text[pmc_text.body_text == '']

Unnamed: 0,paper_id,body_text,methods,results
12,PMC6735695,,,
43,PMC2115513,,,
50,PMC7088696,,,
61,PMC2119890,,,
77,PMC2200014,,,
...,...,...,...,...
22376,PMC3680036,,,
22464,PMC7117791,,,
22696,PMC7089550,,,
22715,PMC1808101,,,


In [23]:
pmc_text = pmc_text[pmc_text.body_text != '']

In [24]:
pmc_text.shape

(20440, 4)

In [25]:
df.head()

Unnamed: 0,paper_id,abstract_x,body_text,methods,results,cord_uid,source_x,title,doi,pmcid,...,publish_time,authors,journal,Microsoft Academic Paper ID,WHO #Covidence,arxiv_id,has_pdf_parse,has_pmc_xml_parse,full_text_file,url
0,3cdc48bb9e40afd30a59463b7872761a726998c8,"House ßies, Musca domestica L. (Diptera: Musci...",Newcastle disease (ND) is an emerging disease ...,ND Virus Culture. NDV (Roakin strain) was obta...,Experiment 1. Adult house ßies harbored Newcas...,66ecksxm,PMC,Experimental Evaluation of Musca domestica (Di...,10.1093/jmedent/44.4.666,PMC7107465,...,2007-07-01,"Watson, D. Wes; Niño, Elina L.; Rochon, Katery...",J Med Entomol,,,,True,True,noncomm_use_subset,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7...
1,d99acb4e99be7852aa61a688c9fbd38d44b5a252,Live attenuated recombinant measles vaccine vi...,Live attenuated viruses have been developed an...,Viruses and cells. RSV A2 strain was obtained ...,,3qdjmb2j,PMC,Evaluation of Measles Vaccine Virus as a Vecto...,10.2174/1874357901206010012,PMC3286841,...,2012-02-16,"Mok, Hoyin; Cheng, Xing; Xu, Qi; Zengel, James...",Open Virol J,,,,True,True,noncomm_use_subset,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3...
2,748d4c57fe1acc8d9d97cf574f7dea5296f9386c,Ebola virus (EBOV) makes extensive and intrica...,occurs primarily through a macropinocytosis-li...,Cells. U2OS human osteosarcoma cells were cult...,,tnaizwxo,PMC,Direct Visualization of Ebola Virus Fusion Tri...,10.1128/mbio.01857-15,PMC4752599,...,2016-02-09,"Spence, Jennifer S.; Krause, Tyler B.; Mittler...",mBio,,,,True,True,noncomm_use_subset,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4...
3,b891efc6e1419713b05ff7d89b26d260478c28df,China has the world's second largest tuberculo...,The goal of the present study was to investiga...,,,qxsj2zud,PMC,Tuberculosis prevention in healthcare workers ...,10.1183/23120541.00015-2015,PMC5005135,...,2015-08-21,"Deng, Yunfeng; Li, Yan; Wang, Fengtian; Gao, D...",ERJ Open Res,,,,True,True,noncomm_use_subset,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5...
4,353852971069ad5794445e5c1ab6077ce23da75d,,Coronavirus disease 2019 (COVID-19) has spread...,,,,,,,,...,,,,,,,,,,


In [26]:
df = pd.merge(df, pmc_text, left_on='pmcid', right_on='paper_id', how='left').drop('paper_id_y', axis=1)

In [27]:
df.head(3)

Unnamed: 0,paper_id_x,abstract_x,body_text_x,methods_x,results_x,cord_uid,source_x,title,doi,pmcid,...,Microsoft Academic Paper ID,WHO #Covidence,arxiv_id,has_pdf_parse,has_pmc_xml_parse,full_text_file,url,body_text_y,methods_y,results_y
0,3cdc48bb9e40afd30a59463b7872761a726998c8,"House ßies, Musca domestica L. (Diptera: Musci...",Newcastle disease (ND) is an emerging disease ...,ND Virus Culture. NDV (Roakin strain) was obta...,Experiment 1. Adult house ßies harbored Newcas...,66ecksxm,PMC,Experimental Evaluation of Musca domestica (Di...,10.1093/jmedent/44.4.666,PMC7107465,...,,,,True,True,noncomm_use_subset,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7...,NDV (Roakin strain) was obtained from Dr. D. J...,NDV (Roakin strain) was obtained from Dr. D. J...,Adult house flies harbored Newcastle Disease v...
1,d99acb4e99be7852aa61a688c9fbd38d44b5a252,Live attenuated recombinant measles vaccine vi...,Live attenuated viruses have been developed an...,Viruses and cells. RSV A2 strain was obtained ...,,3qdjmb2j,PMC,Evaluation of Measles Vaccine Virus as a Vecto...,10.2174/1874357901206010012,PMC3286841,...,,,,True,True,noncomm_use_subset,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3...,Live attenuated viruses have been developed an...,RSV A2 strain was obtained from ATCC (Manassas...,The reverse genetics system for measles Edmons...
2,748d4c57fe1acc8d9d97cf574f7dea5296f9386c,Ebola virus (EBOV) makes extensive and intrica...,occurs primarily through a macropinocytosis-li...,Cells. U2OS human osteosarcoma cells were cult...,,tnaizwxo,PMC,Direct Visualization of Ebola Virus Fusion Tri...,10.1128/mbio.01857-15,PMC4752599,...,,,,True,True,noncomm_use_subset,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4...,Ebola virus (EBOV) and other members of the fa...,U2OS human osteosarcoma cells were cultured in...,For evaluating EBOV GP triggering under biosaf...


In [28]:
df.columns

Index(['paper_id_x', 'abstract_x', 'body_text_x', 'methods_x', 'results_x',
       'cord_uid', 'source_x', 'title', 'doi', 'pmcid', 'pubmed_id', 'license',
       'abstract_y', 'publish_time', 'authors', 'journal',
       'Microsoft Academic Paper ID', 'WHO #Covidence', 'arxiv_id',
       'has_pdf_parse', 'has_pmc_xml_parse', 'full_text_file', 'url',
       'body_text_y', 'methods_y', 'results_y'],
      dtype='object')

In [29]:
df.drop(columns=['has_pdf_parse', 'has_pmc_xml_parse', 'Microsoft Academic Paper ID', 'WHO #Covidence'], inplace=True)

In [30]:
df.head(2)

Unnamed: 0,paper_id_x,abstract_x,body_text_x,methods_x,results_x,cord_uid,source_x,title,doi,pmcid,...,abstract_y,publish_time,authors,journal,arxiv_id,full_text_file,url,body_text_y,methods_y,results_y
0,3cdc48bb9e40afd30a59463b7872761a726998c8,"House ßies, Musca domestica L. (Diptera: Musci...",Newcastle disease (ND) is an emerging disease ...,ND Virus Culture. NDV (Roakin strain) was obta...,Experiment 1. Adult house ßies harbored Newcas...,66ecksxm,PMC,Experimental Evaluation of Musca domestica (Di...,10.1093/jmedent/44.4.666,PMC7107465,...,"House flies, Musca domestica L. (Diptera: Musc...",2007-07-01,"Watson, D. Wes; Niño, Elina L.; Rochon, Katery...",J Med Entomol,,noncomm_use_subset,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7...,NDV (Roakin strain) was obtained from Dr. D. J...,NDV (Roakin strain) was obtained from Dr. D. J...,Adult house flies harbored Newcastle Disease v...
1,d99acb4e99be7852aa61a688c9fbd38d44b5a252,Live attenuated recombinant measles vaccine vi...,Live attenuated viruses have been developed an...,Viruses and cells. RSV A2 strain was obtained ...,,3qdjmb2j,PMC,Evaluation of Measles Vaccine Virus as a Vecto...,10.2174/1874357901206010012,PMC3286841,...,Live attenuated recombinant measles vaccine vi...,2012-02-16,"Mok, Hoyin; Cheng, Xing; Xu, Qi; Zengel, James...",Open Virol J,,noncomm_use_subset,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3...,Live attenuated viruses have been developed an...,RSV A2 strain was obtained from ATCC (Manassas...,The reverse genetics system for measles Edmons...


# Exploration/Cleaning

### Different Abstract in Metadata and JSON files

abstract_x from json, abstract_y from metadata

In [31]:
df[df.abstract_x != df.abstract_y].shape

(44677, 22)

In [32]:
df[df.abstract_x != df.abstract_y][['abstract_x', 'abstract_y', 'url']].tail(10)

Unnamed: 0,abstract_x,abstract_y,url
48412,The very first case of corona-virus illness wa...,The very first case of corona-virus illness wa...,
48413,"COVID-19 has infected more than 823,000 people...","COVID-19 has infected more than 823,000 people...",https://arxiv.org/abs/2003.07353
48414,The COVID-19 epidemic was listed as a public h...,The COVID-19 epidemic was listed as a public h...,https://arxiv.org/abs/2004.01479
48415,,COVID-19 has spread from China across Europe a...,
48416,The SARS-CoV-2 infectious outbreak has rapidly...,The SARS-CoV-2 infectious outbreak has rapidly...,https://arxiv.org/abs/2004.10376
48418,"Presently, India bears amongst the highest bur...","Presently, India bears amongst the highest bur...",https://arxiv.org/abs/2004.06361
48419,This paper describes how mobile phone data can...,This paper describes how mobile phone data can...,https://arxiv.org/abs/2003.12347
48420,The coronavirus disease 2019 (COVID-19) caused...,The coronavirus disease 2019 (COVID-19) caused...,https://arxiv.org/abs/2003.00163
48421,"Disease outbreaks, such as those of Severe Acu...",,
48423,We consider an age-structured epidemic model w...,We consider an age-structured epidemic model w...,https://arxiv.org/abs/1312.2120


In [33]:
df[df.abstract_x != df.abstract_y][['abstract_x', 'abstract_y', 'url']].url.iloc[-1]

'https://arxiv.org/abs/1312.2120'

Checking some of the files online, it seems that where the abstract is missing in the metadata, the abstract in the JSON file is simply the beginning of the text.

In [34]:
df[df.abstract_x != df.abstract_y][['abstract_x', 'abstract_y', 'url', 'body_text_x', 'body_text_y']][
    (df.abstract_y.isnull()) & (df.abstract_x != '') & (~df.url.isnull())]

  


Unnamed: 0,abstract_x,abstract_y,url,body_text_x,body_text_y
256,"Citation Alagaili AN, Briese T, Karesh WB, Das...",,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4...,W e thank Samara and Abdoun for this opportuni...,We thank Samara and Abdoun for this opportunit...
403,In the recently published paper by Zhang et al...,,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4...,In the recently published paper by Zhang et al...,Dear editor\nIn the recently published paper b...
413,"To cite: Rabinowitz PMacG, Pappaioanou M, Bard...",,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6...,"In 2015, the Rockefeller Foundation-Lancet Com...","In 2015, the Rockefeller Foundation-Lancet Com..."
453,The past few months have yielded disconcerting...,,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3...,Virologists have been surprised by a recent re...,Virologists have been surprised by a recent re...
545,Citation Fouchier RAM. 2015. Studies on influe...,,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4...,L ipsitch and Inglesby recently estimated the ...,Initial calculations of the potential risks as...
...,...,...,...,...,...
46875,Positive-strand (+)RNA viruses are important a...,,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5...,"All +RNA viruses (e.g., poliovirus, hepatitis ...","All +RNA viruses (e.g., poliovirus, hepatitis ..."
47054,We read with great interest the article by Kha...,,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5...,of ACE2 via the MAS1 receptor (and not via the...,We read with great interest the article by Kha...
47253,"Highly mutable, infinitely malleable, and all-...",,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5...,"Highly mutable, infinitely malleable, and all-...","Highly mutable, infinitely malleable, and all-..."
47277,Health-Emergency Disaster Risk Management (Hea...,,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7...,Health-Emergency Disaster Risk Management (Hea...,Health-Emergency Disaster Risk Management (Hea...


body_text_x is from pdf, body_text_y from pmc

In [35]:
df.shape

(48424, 22)

In [36]:
df.abstract_x.isnull().sum(), (df.abstract_x =='').sum() # missing abstracts in json files

(0, 13736)

In [37]:
df.abstract_y.isnull().sum(), (df.abstract_y=='').sum() # missing abstracts in metadata

(10992, 0)

Since the abstracts from the metadata seem more reliable we generally use these, but fill the missing values with the abstract from the extracted values from the JSON file.

In [38]:
df.loc[df.abstract_y.isnull() & (df.abstract_x != ''), 'abstract_y'] = df[(df.abstract_y.isnull()) & (df.abstract_x != '')].abstract_x

In [39]:
df.abstract_y.isnull().sum()

6514

the remaining missing values are also empty in the json files

In [40]:
(df.abstract_y.isnull() & (df.abstract_x!='')).sum()

0

In [41]:
df.rename(columns = {'abstract_y': 'abstract'}, inplace=True)
df.drop('abstract_x', axis=1, inplace=True)

In [42]:
df.columns

Index(['paper_id_x', 'body_text_x', 'methods_x', 'results_x', 'cord_uid',
       'source_x', 'title', 'doi', 'pmcid', 'pubmed_id', 'license', 'abstract',
       'publish_time', 'authors', 'journal', 'arxiv_id', 'full_text_file',
       'url', 'body_text_y', 'methods_y', 'results_y'],
      dtype='object')

We still have to compare the text body from pdf and pmc files.

In [43]:
df.shape

(48424, 21)

# Quick comparison of both texts

In [44]:
df.shape

(48424, 21)

In [45]:
(df.body_text_x != df.body_text_y).sum()

48424

In [46]:
df[(df.body_text_x != df.body_text_y) & df.body_text_y.notnull()][['body_text_x', 'body_text_y']].head(10).iloc[2].values[0][:500]

'occurs primarily through a macropinocytosis-like process, and the virus traffics through the endocytic pathway (16) (17) (18) . Proteolytic cleavage of GP1 by a class of low-pH-activated proteases, cysteine cathepsins, removes the C-terminal glycan cap and mucin domain sequences (19) to reveal a receptor-binding domain. This newly exposed GP1 domain interacts with Niemann-Pick C1 (NPC1), which serves as an obligate intracellular receptor for filoviruses (20, 21) . While cathepsin cleavage and su'

In [47]:
df[(df.body_text_x != df.body_text_y) & df.body_text_y.notnull()][['body_text_x', 'body_text_y']].head(10).iloc[2].values[1][:500]

'Ebola virus (EBOV) and other members of the family Filoviridae of enveloped, negative-strand RNA viruses are associated with highly lethal disease for which no FDA-approved vaccines or treatments exist. Filovirus particles are characterized by a thread-like morphology, with infectious virions averaging 800 to 1,000 nm in length by 100 nm in diameter (1). The viral genome is encapsidated by several proteins, the nucleoprotein (NP), VP24, VP30, and VP35, to form the ribonucleocapsid, along with th'

In [48]:
df[df.body_text_x != df.body_text_y].head()

Unnamed: 0,paper_id_x,body_text_x,methods_x,results_x,cord_uid,source_x,title,doi,pmcid,pubmed_id,...,abstract,publish_time,authors,journal,arxiv_id,full_text_file,url,body_text_y,methods_y,results_y
0,3cdc48bb9e40afd30a59463b7872761a726998c8,Newcastle disease (ND) is an emerging disease ...,ND Virus Culture. NDV (Roakin strain) was obta...,Experiment 1. Adult house ßies harbored Newcas...,66ecksxm,PMC,Experimental Evaluation of Musca domestica (Di...,10.1093/jmedent/44.4.666,PMC7107465,17695023.0,...,"House flies, Musca domestica L. (Diptera: Musc...",2007-07-01,"Watson, D. Wes; Niño, Elina L.; Rochon, Katery...",J Med Entomol,,noncomm_use_subset,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7...,NDV (Roakin strain) was obtained from Dr. D. J...,NDV (Roakin strain) was obtained from Dr. D. J...,Adult house flies harbored Newcastle Disease v...
1,d99acb4e99be7852aa61a688c9fbd38d44b5a252,Live attenuated viruses have been developed an...,Viruses and cells. RSV A2 strain was obtained ...,,3qdjmb2j,PMC,Evaluation of Measles Vaccine Virus as a Vecto...,10.2174/1874357901206010012,PMC3286841,22383906.0,...,Live attenuated recombinant measles vaccine vi...,2012-02-16,"Mok, Hoyin; Cheng, Xing; Xu, Qi; Zengel, James...",Open Virol J,,noncomm_use_subset,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3...,Live attenuated viruses have been developed an...,RSV A2 strain was obtained from ATCC (Manassas...,The reverse genetics system for measles Edmons...
2,748d4c57fe1acc8d9d97cf574f7dea5296f9386c,occurs primarily through a macropinocytosis-li...,Cells. U2OS human osteosarcoma cells were cult...,,tnaizwxo,PMC,Direct Visualization of Ebola Virus Fusion Tri...,10.1128/mbio.01857-15,PMC4752599,26861015.0,...,Ebola virus (EBOV) makes extensive and intrica...,2016-02-09,"Spence, Jennifer S.; Krause, Tyler B.; Mittler...",mBio,,noncomm_use_subset,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4...,Ebola virus (EBOV) and other members of the fa...,U2OS human osteosarcoma cells were cultured in...,For evaluating EBOV GP triggering under biosaf...
3,b891efc6e1419713b05ff7d89b26d260478c28df,The goal of the present study was to investiga...,,,qxsj2zud,PMC,Tuberculosis prevention in healthcare workers ...,10.1183/23120541.00015-2015,PMC5005135,27730135.0,...,BSL3 and respiratory isolation wards protect h...,2015-08-21,"Deng, Yunfeng; Li, Yan; Wang, Fengtian; Gao, D...",ERJ Open Res,,noncomm_use_subset,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5...,To the Editor:\nChina has the world's second l...,,
4,353852971069ad5794445e5c1ab6077ce23da75d,Coronavirus disease 2019 (COVID-19) has spread...,,,,,,,,,...,,,,,,,,,,


In [49]:
df.iloc[34885].body_text_x[:500]

'Canine infectious respiratory disease (CIRD) is a disease complex which occurs frequently in densely housed dog populations such as in re-homing, training and boarding kennels. The disease is characterised by a dry hacking cough with a recovery period of 1-3 weeks, but can progress to a severe bronchopneumonia which may be fatal (Appel and Binn, 1987) . Many microbiological agents have been associated with CIRD; including Bordetella bronchiseptica, canine parainfluenza virus; canine adenovirus-2'

In [50]:
df.iloc[34885].body_text_y[:500]

TypeError: 'float' object is not subscriptable

In [51]:
df.iloc[34885].url

'https://doi.org/10.1016/j.jviromet.2008.10.008'

In [52]:
df.iloc[34888].body_text_x[:500]

'The origin of genetic variations in clinical expression of viral diseases is of major biological importance but is far from being completely understood. Experimental diseases induced in mice by coronaviruses allow a convenient approach to study this question, since the induced pathologies depend upon the genetic origin of the infected mice [12, 20] . Thus, intraperitoneal injection of murine hepatitis virus type 3 (MHV 3) into adult mice results either in early death (at day 3 to 4 post infectio'

In [53]:
df.iloc[34888].body_text_y[:500]

TypeError: 'float' object is not subscriptable

In [54]:
df.iloc[34888].url

'https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7086966/'

In [55]:
df.iloc[1337].body_text_x[:500]

'Acute respiratory distress syndrome (ARDS) is a clinically and biologically heterogeneous syndrome with a severe lung inflammation disorder that presents as rapidly progressive hypoxemia and dyspnea. 1 ARDS is principally associated with direct injury to the lung, such as infectious pneumonia, or indirect injury to the lung resulting from systemic inflammation, such as non-pulmonary sepsis, trauma, and surgery. 2 Pneumonia remains the predominant cause of death in children under 5 years of age 3'

In [56]:
df.iloc[1337].body_text_y[:500]

'Acute respiratory distress syndrome (ARDS) is a clinically and biologically heterogeneous syndrome with a severe lung inflammation disorder that presents as rapidly progressive hypoxemia and dyspnea.1 ARDS is principally associated with direct injury to the lung, such as infectious pneumonia, or indirect injury to the lung resulting from systemic inflammation, such as non‐pulmonary sepsis, trauma, and surgery.2 Pneumonia remains the predominant cause of death in children under 5 years of age3 an'

In [57]:
df.iloc[1337].url

'https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5697698/'

In [58]:
df.iloc[1242].body_text_x[:500]

'The global prevalence of the dengue virus (DENV) has grown dramatically in recent decades, and it is now endemic in .100 countries, with some 2.5 billion people at risk of infection. Dengue is an arthropod-borne flavivirus that can be subdivided into the 4 major serotypes (DEN-1-DEN-4). Most dengue infections either are asymptomatic or lead to a self-limiting febrile illness, dengue fever. In some cases the illness is more severe, leading to dengue hemorrhagic fever (DHF) with severe plasma leak'

In [59]:
df.iloc[1242].body_text_y[:500]

'The dengue-2 strain, 16681, was grown in C6/36 cells, Vero cells, and monocyte-derived dendritic cells (DCs). Cell-free supernatants were used either neat or after concentration by ultracentrifugation at 45,000 rpm for 4 h at 4°C, and the virus pellet was resuspended in 1.5% fetal bovine serum (FBS)/Leibovitz L-15. To concentrate large volumes of low-titer DENV supernatant, DENV were precipitated with 10% Polyethylene glycol 8000 (Sigma) before ultracentrifugation. U937 were maintained in 10% FB'

In [60]:
df.iloc[1242].url

'https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3100511/'

Where available we use the text from the pmc file (body_text_y), trusting the statement that it is of higher quality.

In [61]:
df.body_text_x.isnull().sum(), df.body_text_y.isnull().sum()

(0, 30108)

In [62]:
(df.body_text_x == '').sum(), (df.body_text_y == '').sum()

(0, 0)

In [63]:
df.loc[df.body_text_y.notnull(), 'body_text_x'] = df.loc[df.body_text_y.notnull(), 'body_text_y']

In [64]:
df.body_text_x.isnull().sum()

0

In [65]:
df.rename(columns = {'body_text_x': 'body_text'}, inplace=True)
df.drop('body_text_y', axis=1, inplace=True)

In [66]:
df.columns

Index(['paper_id_x', 'body_text', 'methods_x', 'results_x', 'cord_uid',
       'source_x', 'title', 'doi', 'pmcid', 'pubmed_id', 'license', 'abstract',
       'publish_time', 'authors', 'journal', 'arxiv_id', 'full_text_file',
       'url', 'methods_y', 'results_y'],
      dtype='object')

In [67]:
df[['methods_x', 'methods_y', 'url']][df.methods_y.notnull()]

Unnamed: 0,methods_x,methods_y,url
0,ND Virus Culture. NDV (Roakin strain) was obta...,NDV (Roakin strain) was obtained from Dr. D. J...,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7...
1,Viruses and cells. RSV A2 strain was obtained ...,RSV A2 strain was obtained from ATCC (Manassas...,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3...
2,Cells. U2OS human osteosarcoma cells were cult...,U2OS human osteosarcoma cells were cultured in...,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4...
3,,,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5...
5,Microarray design. National Center for Biotech...,National Center for Biotechnology Information ...,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4...
...,...,...,...
47632,To evaluate the specificity of the utilized Re...,Enzymes including Bst2.0 DNA polymerase (8000 ...,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5...
47633,The sitting-drop vapor diffusion method was us...,"The peptide, NP44 (CTELKLSDY) derived from the...",https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6...
47634,"Endocrine disruption, cancer, birth defects, n...",,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4...
47635,Medical charts were reviewed for demographic c...,This retrospective study included children wit...,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6...


In [68]:
(df.methods_x == '').sum(), df.methods_x.isnull().sum()

(34707, 0)

In [69]:
(df.methods_y == '').sum(), df.methods_y.isnull().sum()

(6622, 30108)

In [70]:
# use methods_y (from pmc) when it's available
mask = (df.methods_y.notnull()) & (df.methods_y != '')
df.loc[mask, 'methods_x'] = df.loc[mask, 'methods_y']

# same for results
mask = (df.results_y.notnull()) & (df.results_y != '')
df.loc[mask, 'results_x'] = df.loc[mask, 'results_y']

In [71]:
(df.results_x == '').sum(), df.results_x.isnull().sum()

(31384, 0)

In [72]:
(df.results_y == '').sum(), df.results_y.isnull().sum()

(7408, 30108)

In [73]:
df.rename(columns = {'methods_x': 'methods', 'results_x': 'results'}, inplace=True)
df.drop(columns=['methods_y', 'results_y'], inplace=True)

In [74]:
df.rename(columns = {'paper_id_x': 'paper_id', 'source_x': 'source'}, inplace=True)

In [75]:
df.columns

Index(['paper_id', 'body_text', 'methods', 'results', 'cord_uid', 'source',
       'title', 'doi', 'pmcid', 'pubmed_id', 'license', 'abstract',
       'publish_time', 'authors', 'journal', 'arxiv_id', 'full_text_file',
       'url'],
      dtype='object')

In [76]:
df.head()

Unnamed: 0,paper_id,body_text,methods,results,cord_uid,source,title,doi,pmcid,pubmed_id,license,abstract,publish_time,authors,journal,arxiv_id,full_text_file,url
0,3cdc48bb9e40afd30a59463b7872761a726998c8,NDV (Roakin strain) was obtained from Dr. D. J...,NDV (Roakin strain) was obtained from Dr. D. J...,Adult house flies harbored Newcastle Disease v...,66ecksxm,PMC,Experimental Evaluation of Musca domestica (Di...,10.1093/jmedent/44.4.666,PMC7107465,17695023.0,cc-by-nc,"House flies, Musca domestica L. (Diptera: Musc...",2007-07-01,"Watson, D. Wes; Niño, Elina L.; Rochon, Katery...",J Med Entomol,,noncomm_use_subset,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7...
1,d99acb4e99be7852aa61a688c9fbd38d44b5a252,Live attenuated viruses have been developed an...,RSV A2 strain was obtained from ATCC (Manassas...,The reverse genetics system for measles Edmons...,3qdjmb2j,PMC,Evaluation of Measles Vaccine Virus as a Vecto...,10.2174/1874357901206010012,PMC3286841,22383906.0,cc-by-nc,Live attenuated recombinant measles vaccine vi...,2012-02-16,"Mok, Hoyin; Cheng, Xing; Xu, Qi; Zengel, James...",Open Virol J,,noncomm_use_subset,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3...
2,748d4c57fe1acc8d9d97cf574f7dea5296f9386c,Ebola virus (EBOV) and other members of the fa...,U2OS human osteosarcoma cells were cultured in...,For evaluating EBOV GP triggering under biosaf...,tnaizwxo,PMC,Direct Visualization of Ebola Virus Fusion Tri...,10.1128/mbio.01857-15,PMC4752599,26861015.0,cc-by-nc-sa,Ebola virus (EBOV) makes extensive and intrica...,2016-02-09,"Spence, Jennifer S.; Krause, Tyler B.; Mittler...",mBio,,noncomm_use_subset,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4...
3,b891efc6e1419713b05ff7d89b26d260478c28df,To the Editor:\nChina has the world's second l...,,,qxsj2zud,PMC,Tuberculosis prevention in healthcare workers ...,10.1183/23120541.00015-2015,PMC5005135,27730135.0,cc-by-nc,BSL3 and respiratory isolation wards protect h...,2015-08-21,"Deng, Yunfeng; Li, Yan; Wang, Fengtian; Gao, D...",ERJ Open Res,,noncomm_use_subset,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5...
4,353852971069ad5794445e5c1ab6077ce23da75d,Coronavirus disease 2019 (COVID-19) has spread...,,,,,,,,,,,,,,,,


# Duplicates

Some paper ids are duplicated

In [77]:
len(df)

48424

In [78]:
df.paper_id.nunique()

48409

In [79]:
df[df.duplicated(subset=['paper_id'], keep=False)][['paper_id', 'body_text']]

Unnamed: 0,paper_id,body_text
435,58be092086c74c58e9067121a6ba4836468e7ec3,"administration reduces viral load. Also, the a..."
436,58be092086c74c58e9067121a6ba4836468e7ec3,I am grateful that Dr. Lim and his colleagues ...
15230,77943f83d13697f86b9d1eb3cfa86581ed9965e6,Respect for patient privacy and confidentialit...
15231,77943f83d13697f86b9d1eb3cfa86581ed9965e6,Respect for patient privacy and confidentialit...
15333,bd3945ec343cc4535650c9425aa143f71716de7c,"Until the late 1970s, papillomaviruses attract..."
15334,bd3945ec343cc4535650c9425aa143f71716de7c,"Until the late 1970s, papillomaviruses attract..."
15644,4644c32551fb23aa873a7738ecc8d777bd49877e,"S15-S61 A. Carhan *, N. Albayrak, A.B. Altas, ..."
15645,4644c32551fb23aa873a7738ecc8d777bd49877e,"S15-S61 A. Carhan *, N. Albayrak, A.B. Altas, ..."
15646,4644c32551fb23aa873a7738ecc8d777bd49877e,"S15-S61 A. Carhan *, N. Albayrak, A.B. Altas, ..."
15647,4644c32551fb23aa873a7738ecc8d777bd49877e,"S15-S61 A. Carhan *, N. Albayrak, A.B. Altas, ..."


But luckily they also have the same text body. So we will just keep one article per paper_id.
Check for example [https://www.sciencedirect.com/science/article/pii/S1386653209701295?via%3Dihub](https://www.sciencedirect.com/science/article/pii/S1386653209701295?via%3Dihub) and [https://www.sciencedirect.com/science/article/pii/S1386653209701325?via%3Dihub](https://www.sciencedirect.com/science/article/pii/S1386653209701325?via%3Dihub) - they have the same content.

In [80]:
df[df.duplicated(subset=['paper_id', 'body_text'], keep=False)].shape

(16, 18)

In [81]:
df.drop_duplicates(['paper_id', 'body_text'], inplace=True)

In [82]:
len(df)

48414

In [83]:
df[df.duplicated(['paper_id'], keep=False)].head(2)

Unnamed: 0,paper_id,body_text,methods,results,cord_uid,source,title,doi,pmcid,pubmed_id,license,abstract,publish_time,authors,journal,arxiv_id,full_text_file,url
435,58be092086c74c58e9067121a6ba4836468e7ec3,"administration reduces viral load. Also, the a...",,,fjmchbew,CZI,The Author's Response: Case of the Index Patie...,10.3346/jkms.2020.35.e89,,32080993,cc-by-nc,,2020,"Lim, Jaegyun; Jeon, Seunghyun; Shin, Hyun Youn...",J Korean Med Sci,,noncomm_use_subset,https://doi.org/10.3346/jkms.2020.35.e89
436,58be092086c74c58e9067121a6ba4836468e7ec3,I am grateful that Dr. Lim and his colleagues ...,,,u4taazr3,PMC,Letter to the Editor: Case of the Index Patien...,10.3346/jkms.2020.35.e88,PMC7036343,32080992,cc-by-nc,,2020-02-20,"Kim, Jin Yong",J Korean Med Sci,,noncomm_use_subset,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7...


In [84]:
df.drop_duplicates(['paper_id'], inplace=True)

In [85]:
df.paper_id.nunique()

48409

In [86]:
df.shape

(48409, 18)

Now the paper_id is unique.

In [87]:
df.isnull().sum()

paper_id              0
body_text             0
methods               0
results               0
cord_uid           5079
source             5079
title              5117
doi                6143
pmcid             13345
pubmed_id         18358
license            5079
abstract           6503
publish_time       5079
authors            5775
journal            8798
arxiv_id          47780
full_text_file     5079
url                5199
dtype: int64

# Some new columns for convenience

In [88]:
# some new columns for convenience
df['publish_year'] = df.publish_time.str[:4].fillna(-1).astype(int) # 360 times None
# df['link'] = 'http://dx.doi.org/' + df.doi #dataset now has url column

In [89]:
df['is_covid19'] = df.body_text.str.contains('COVID-19|covid|sar cov 2|SARS-CoV-2|2019-nCov|2019 ncov|SARS Coronavirus 2|2019 Novel Coronavirus|coronavirus 2019| Wuhan coronavirus|wuhan pneumonia|wuhan virus', case=False)

In [90]:
df.is_covid19.sum()

6695

# Language Detection to remove non-english articles and abstracts

In [91]:
from IPython.utils import io

with io.capture_output() as captured:
    !pip install scispacy
    !pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.2.4/en_core_sci_lg-0.2.4.tar.gz
    !pip install spacy-langdetect
    !pip install spac scispacy spacy_langdetect https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.2.3/en_core_sci_lg-0.2.3.tar.gz

In [92]:
import scispacy
import spacy
import en_core_sci_lg
from spacy_langdetect import LanguageDetector

In [93]:
# medium model
nlp = en_core_sci_lg.load(disable=["tagger", "ner"])
nlp.max_length = 2000000
nlp.add_pipe(LanguageDetector(), name='language_detector', last=True)

In [94]:
doc = nlp('This is some English text. Das ist ein Haus. This is a house.')
doc._.language

{'language': 'en', 'score': 0.9999965388666541}

In [95]:
for s in doc.sents:
    print(s._.language)

{'language': 'en', 'score': 0.9999972741074902}
{'language': 'de', 'score': 0.9999974580281228}
{'language': 'en', 'score': 0.9999968742655584}


In [96]:
doc = nlp(df[df.paper_id == '1a8a4dbbaa94ced4ef6af69ec7a09d3fa4c0eece'].body_text.iloc[0])

In [97]:
doc[:500]

Flu Influenza Influenza virus Infection of the respiratory tract Acute infection a b s t r a c t Virus: Influenza virus is the only human-pathogenic member of orthomyxovirus, which during evolution acquired the capability to infect the cells of various mammals and birds. Besides man common hosts are domestic animals as horse, swine, cat and poultry, among the birds especially goose and duck, and nearly all mammals living in water. The influenza virus A and B genome is segmented, when two different viruses infect one cell whole genome-segments can be exchanged (reassortment). This capability leads to a high variability of the envelope surface proteins H (haemagglutinin) and N (neuraminidase), as well as the polymerase inside the virus particle. An influenza virus equipped with epitopes on its surface that are unknown by the human and animal immune system has the potential to initiate a new epidemic. Transmission: for influenza virus main route is aerogene, especially by small droplets w

In [98]:
doc_engl = ''
for s in doc.sents:
    if (s._.language['language'] == 'en'):
        doc_engl += s.text 

In [99]:
doc_engl[:2000]

'Flu Influenza Influenza virus Infection of the respiratory tract Acute infection a b s t r a c t Virus: Influenza virus is the only human-pathogenic member of orthomyxovirus, which during evolution acquired the capability to infect the cells of various mammals and birds.Besides man common hosts are domestic animals as horse, swine, cat and poultry, among the birds especially goose and duck, and nearly all mammals living in water.The influenza virus A and B genome is segmented, when two different viruses infect one cell whole genome-segments can be exchanged (reassortment).This capability leads to a high variability of the envelope surface proteins H (haemagglutinin) and N (neuraminidase), as well as the polymerase inside the virus particle.An influenza virus equipped with epitopes on its surface that are unknown by the human and animal immune system has the potential to initiate a new epidemic.Transmission: for influenza virus main route is aerogene, especially by small droplets while

Check language of each text body (only use the first 2000 characters).

In [100]:
df['text_language'] = df.body_text.apply(lambda x: nlp(str(x[:2000]))._.language['language'])

df.text_language.value_counts()

en         47110
fr           432
es           404
nl           188
de           167
it            24
pt            16
UNKNOWN       13
cy            12
ca             8
zh-cn          5
ko             3
ru             3
et             3
da             3
id             3
so             3
no             2
lt             2
tl             2
ro             2
pl             2
af             1
sq             1
Name: text_language, dtype: int64

## Number of non-english texts to drop.

In [101]:
df.loc[df[df.text_language != 'en'].index].shape

(1299, 21)

In [102]:
df = df.drop(df[df.text_language != 'en'].index)

In [103]:
# Check language of all abstracts

# df['abstract_lang'] = df.abstract.apply(lambda x: nlp(str(x))._.language['language'])

#  df[df.abstract.isnull()]

In [104]:
# Number of non-english abstracts

# df[(df.abstract_lang != 'en') & (df.abstract.notnull())].abstract_lang.value_counts()

# Keep all english abstracts and those without abstract

# df = df[(df.abstract_lang == 'en') | (df.abstract.isnull())]

# df.shape

# df.paper_id.nunique()

# Analyze title/text body of the papers without abstract

# temp = df[df.abstract.isnull()].copy()

# def remove_non_english_sentences(doc):
#     doc = nlp(doc)
#     doc_engl = ''
#     for s in doc.sents:
#         if (s._.language['language'] == 'en'):
#             doc_engl += s.text 
#     return doc_engl

# remove_non_english_sentences(df[df.paper_id == '1a8a4dbbaa94ced4ef6af69ec7a09d3fa4c0eece'].body_text.iloc[0])

# temp['text_length'] = temp.body_text.apply(lambda x: len(x))

# temp['english_text'] = temp.body_text.apply(remove_non_english_sentences)

# temp['english_length'] = temp.english_text.apply(lambda x: len(x))

# temp.to_csv('df_english.csv', index=False)

# (temp.english_length/temp.text_length).hist()

# ((temp.english_length/temp.text_length)<0.8).sum()

# temp[((temp.english_length/temp.text_length)<0.8)].head()

# temp[temp.paper_id == '7925057cfe0cb75ae6079879cb2d22d23e42dfa5'].body_text.values[0][:500]

# temp[temp.paper_id == '617197cc751a9208cb0af1b4e31baeddc8d2e985'].body_text.values[0]

# temp[temp.paper_id == 'ca51b53fa512085e1aa166d5308602ff1666a90c'].body_text.values[0][:500]

# df = df.drop(temp[((temp.english_length/temp.text_length)<0.8)].index)

In [105]:
# temp['title_lang'] = df.title.apply(lambda x: nlp(str(x))._.language['language'])

# temp.title_lang.value_counts()

# Too many false-positves. 

# temp[temp.paper_id == '6f6b7b1efffae7f3765f29fe801ab63dd35110bb'].body_text.values[0]

# temp[temp.title_lang == 'de']

# We check the beginning of each text body instead.

# temp['text_lang'] = df.body_text.apply(lambda x: nlp(str(x[:2000]))._.language['language'])

# temp.text_lang.value_counts()

# Number of non-english texts to drop.

# df.loc[temp[temp.text_lang != 'en'].index].shape

# df = df.drop(temp[temp.text_lang != 'en'].index)

# Extract Study Design/ Methodological Keywords

In [106]:
# filter_dict = {
#     "discussion": ["conclusions","conclusion",'| discussion', "discussion",  'concluding remarks',
#                    'discussion and conclusions','conclusion:', 'discussion and conclusion',
#                    'conclusions:', 'outcomes', 'conclusions and perspectives', 
#                    'conclusions and future perspectives', 'conclusions and future directions'],
#     "results": ['executive summary', 'result', 'summary','results','results and discussion','results:',
#                 'comment',"findings"],
#     "introduction": ['introduction', 'background', 'i. introduction','supporting information','| introduction'],
#     "methods": ['methods','method','statistical methods','materials','materials and methods',
#                 'data collection','the study','study design','experimental design','objective',
#                 'objectives','procedures','data collection and analysis', 'methodology',
#                 'material and methods','the model','experimental procedures','main text',],
#     "statistics": ['data analysis','statistical analysis', 'analysis','statistical analyses', 
#                    'statistics','data','measures'],
#     "clinical": ['diagnosis', 'diagnostic features', "differential diagnoses", 'classical signs','prognosis', 'clinical signs', 'pathogenesis',
#                  'etiology','differential diagnosis','clinical features', 'case report', 'clinical findings',
#                  'clinical presentation'],
#     'treatment': ['treatment', 'interventions'],
#     "prevention": ['epidemiology','risk factors'],
#     "subjects": ['demographics','samples','subjects', 'study population','control','patients', 
#                'participants','patient characteristics'],
#     "animals": ['animals','animal models'],
#     "abstract": ["abstract", 'a b s t r a c t','author summary'], 
#     "review": ['review','literature review','keywords']}

In [107]:
study_designs = {'RCT': ['RCT', 'randomized controlled trial', 'randomised controlled trial', 'randomized control trial', 'randomised control trial',
                         'randomized clinical trial','randomised clinical trial'], 
                'time series analysis': ['time series analysis', 'time series', 'survival analysis'],
                'retrospective cohort': ['retrospective cohort'],
                'cross-sectional case-control': ['cross-sectional case-control', 'cross sectional case control', 'cross-sectional case control'],
                'prospective case-control': ['prospective case-control', 'prospective case control'],
                'matched case-control': ['matched case-control', 'matched case control'],
                'medical records review': ['medical records review'],
                'prevalence survey': ['prevalence survey'],
                'syndromic surveillance': ['syndromic surveillance'],
                'systematic review': ['systematic review'],
                'meta-analysis': ['meta-analysis', 'meta analysis', 'meta-syntheses'],
                'interventional study': ['interventional study'],
                'association': ['association', 'associated with'],
                 'p-value': ['p-value', 'p value'],
                 'pseudo-randomized controlled trial': ['pseudo-randomized controlled trial', 'pseudo-randomised controlled trial']
                }

Keywords from  [https://docs.google.com/spreadsheets/d/1t2e3CHGxHJBiFgHeW0dfwtvCG4x0CDCzcTFX7yz9Z2E/edit#gid=1217643351](https://docs.google.com/spreadsheets/d/1t2e3CHGxHJBiFgHeW0dfwtvCG4x0CDCzcTFX7yz9Z2E/edit#gid=1217643351)

In [108]:
generic_keywords = ['estimation',
 'prevalence survey',
 'response rate',
 'incidence',
 'psychometric evaluation of instrument',
 'median time to event',
 'pooled OR',
 'd-pooled',
 'randomized controlled trial',
 'non-randomized',
 'allocation method',
 'Cochrane review',
 'Cox proportional hazards',
 'gamma',
 'Weibull',
 'pseudo-randomised',
 'chart review',
 'log odds',
 'surveillance',
 'time-to-event analysis',
 'pooled adjusted odds ratio',
 'pooled relative risk',
 'data abstraction forms',
 'frequency',
 'etiology logistic regression',
 'exclusion criteria',
 'eligibility criteria',
 'right-censored',
 'pooled odds ratio',
 'non-comparative study',
 'medical records review',
 'CONSORT',
 'number of controls per case',
 'quasi-randomised',
 'risk of bias',
 'publication bias',
 'syndromic surveillance',
 'truncated',
 'longitudinal',
 'matching criteria',
 'double-blind',
 "Cohen's d",
 'registry data',
 'Adjusted Odds Ratio',
 'questionnaire development',
 'Kaplan-Meier',
 'heterogeneity',
 'recruitment',
 'randomization method',
 'censoring',
 'meta-analysis',
 'non-randomised',
 'β',
 'electronic medical records',
 'eligibility',
 'cross-sectional survey',
 'PRISMA',
 'prevalence',
 'inclusion criteria',
 'control arm',
 'protocol',
 'pooled risk ratio',
 'non-response bias',
 'baseline',
 'retrospective chart review',
 'survival analysis',
 'logistic regression',
 'blind',
 'exposure status',
 'randomized',
 'associated with',
 'lognormal',
 'systematic review',
 'RCT',
 'randomised',
 'survey instrument',
 'interrater reliability',
 'randomisation',
 'pooled RR',
 'hazard ratio',
 'AOR',
 'potential confounders',
 'treatment effect',
 'randomized clinical trial',
 'data collection instrument',
 'pooled AOR',
 'association',
 'power',
 "cohen's kappa",
 'pseudo-randomized',
 'treatment arm',
 'search string',
 'quasi-randomized',
 'cohort',
 'risk factors',
 'difference between means',
 'registry',
 'inter-rater reliability',
 'Odds Ratio',
 'placebo',
 'databases searched',
 'risk factor analysis',
 'difference in means',
 'random sample',
 'etiology',
 'i2']

In [109]:
for a in generic_keywords:
    if a not in [x for v in study_designs.values() for x in v]:
        study_designs[a] = [a]

In [110]:
len([x for v in study_designs.values() for x in v])

127

In [111]:
# def tag_study_design(study_designs):
#     df['study_design'] = [set() for _ in range(len(df))]
#     for tag in study_designs.keys():
#         for synonym in study_designs[tag]:
#             df[df.abstract.str.contains(synonym, case=False, na=False)].study_design.apply(lambda x: x.add(tag))

In [112]:
def tag_study_design(study_designs):
    df['study_abstract'] = [set() for _ in range(len(df))]
    df['study_methods'] = [set() for _ in range(len(df))]
    df['study_results'] = [set() for _ in range(len(df))]

    for tag in study_designs.keys():
        for synonym in study_designs[tag]:
            df[df.abstract.str.contains(synonym, case=False, na=False) | df.title.str.contains(synonym, case=False, na=False)].study_abstract.apply(lambda x: x.add(tag))
            df[df.methods.str.contains(synonym, case=False, na=False)].study_methods.apply(lambda x: x.add(tag))
            df[df.results.str.contains(synonym, case=False, na=False)].study_results.apply(lambda x: x.add(tag))
    
    df['study_design'] = df.apply(lambda x: list(x.study_abstract.union(x.study_methods).union(x.study_results)), axis=1)
    df.study_abstract = df.study_abstract.apply(lambda x: list(x))
    df.study_methods = df.study_abstract.apply(lambda x: list(x))
    df.study_results = df.study_results.apply(lambda x: list(x))

In [113]:
tag_study_design(study_designs)

In [114]:
df[df.study_design.str.len() != 0].tail(20).study_design

48366                           [association, protocol]
48371                       [estimation, heterogeneity]
48375                              [exclusion criteria]
48378                          [prevalence, estimation]
48381                                        [protocol]
48382                         [estimation, association]
48386                                             [AOR]
48387    [random sample, power, association, frequency]
48388                                           [power]
48391                 [time series analysis, incidence]
48393                                    [surveillance]
48399                     [estimation, power, protocol]
48402                 [truncated, gamma, heterogeneity]
48404                   [power, p-value, heterogeneity]
48405                                   [heterogeneity]
48407                                     [association]
48412                         [time series analysis, β]
48416                                      [prev

In [115]:
len(df.study_abstract[df.study_abstract.str.len() != 0])

17023

In [116]:
len(df.study_methods[df.study_methods.str.len() != 0])

17023

In [117]:
len(df.study_results[df.study_results.str.len() != 0])

10879

In [118]:
len(df.study_design[df.study_design.str.len() != 0])

24603

In [119]:
len(df.study_design[(df.study_design.str.len() != 0) & df.is_covid19])

2651

In [120]:
df.drop(columns=['cord_uid', 'pmcid', 'pubmed_id', 'full_text_file', 'license', 'text_language',
                 'study_abstract', 'study_methods', 'study_results'], inplace=True)

# Export as .csv

In [121]:
df.head()

Unnamed: 0,paper_id,body_text,methods,results,source,title,doi,abstract,publish_time,authors,journal,arxiv_id,url,publish_year,is_covid19,study_design
0,3cdc48bb9e40afd30a59463b7872761a726998c8,NDV (Roakin strain) was obtained from Dr. D. J...,NDV (Roakin strain) was obtained from Dr. D. J...,Adult house flies harbored Newcastle Disease v...,PMC,Experimental Evaluation of Musca domestica (Di...,10.1093/jmedent/44.4.666,"House flies, Musca domestica L. (Diptera: Musc...",2007-07-01,"Watson, D. Wes; Niño, Elina L.; Rochon, Katery...",J Med Entomol,,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7...,2007,False,[]
1,d99acb4e99be7852aa61a688c9fbd38d44b5a252,Live attenuated viruses have been developed an...,RSV A2 strain was obtained from ATCC (Manassas...,The reverse genetics system for measles Edmons...,PMC,Evaluation of Measles Vaccine Virus as a Vecto...,10.2174/1874357901206010012,Live attenuated recombinant measles vaccine vi...,2012-02-16,"Mok, Hoyin; Cheng, Xing; Xu, Qi; Zengel, James...",Open Virol J,,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3...,2012,False,"[truncated, gamma, protocol]"
2,748d4c57fe1acc8d9d97cf574f7dea5296f9386c,Ebola virus (EBOV) and other members of the fa...,U2OS human osteosarcoma cells were cultured in...,For evaluating EBOV GP triggering under biosaf...,PMC,Direct Visualization of Ebola Virus Fusion Tri...,10.1128/mbio.01857-15,Ebola virus (EBOV) makes extensive and intrica...,2016-02-09,"Spence, Jennifer S.; Krause, Tyler B.; Mittler...",mBio,,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4...,2016,False,"[truncated, heterogeneity]"
3,b891efc6e1419713b05ff7d89b26d260478c28df,To the Editor:\nChina has the world's second l...,,,PMC,Tuberculosis prevention in healthcare workers ...,10.1183/23120541.00015-2015,BSL3 and respiratory isolation wards protect h...,2015-08-21,"Deng, Yunfeng; Li, Yan; Wang, Fengtian; Gao, D...",ERJ Open Res,,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5...,2015,False,[]
4,353852971069ad5794445e5c1ab6077ce23da75d,Coronavirus disease 2019 (COVID-19) has spread...,,,,,,,,,,,,-1,True,[]


In [122]:
df.shape

(47110, 16)

In [123]:
df.to_csv('cord19_df.csv', index=False)