In [17]:
#Source https://www.kaggle.com/code/maksimeren/covid-19-literature-clustering
import os
import networkx as nx
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt
from argparse import ArgumentParser
import logging
import nltk
import random
import pandas as pd
import json

root_path = './covid_dataset/'

json_path = f'{root_path}pdf_json/'
metadata_path = f'{root_path}/metadata.csv'
num_documents = 10000

In [18]:
meta_df = pd.read_csv(metadata_path, dtype={
    'pubmed_id': str,
    'Microsoft Academic Paper ID': str, 
    'doi': str
})
meta_df.head()

  meta_df = pd.read_csv(metadata_path, dtype={


Unnamed: 0,cord_uid,sha,source_x,title,doi,pmcid,pubmed_id,license,abstract,publish_time,authors,journal,mag_id,who_covidence_id,arxiv_id,pdf_json_files,pmc_json_files,url,s2_id
0,ug7v899j,d1aafb70c066a2068b02786f8929fd9c900897fb,PMC,Clinical features of culture-proven Mycoplasma...,10.1186/1471-2334-1-6,PMC35282,11472636,no-cc,OBJECTIVE: This retrospective chart review des...,2001-07-04,"Madani, Tariq A; Al-Ghamdi, Aisha A",BMC Infect Dis,,,,document_parses/pdf_json/d1aafb70c066a2068b027...,document_parses/pmc_json/PMC35282.xml.json,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3...,
1,02tnwd4m,6b0567729c2143a66d737eb0a2f63f2dce2e5a7d,PMC,Nitric oxide: a pro-inflammatory mediator in l...,10.1186/rr14,PMC59543,11667967,no-cc,Inflammatory diseases of the respiratory tract...,2000-08-15,"Vliet, Albert van der; Eiserich, Jason P; Cros...",Respir Res,,,,document_parses/pdf_json/6b0567729c2143a66d737...,document_parses/pmc_json/PMC59543.xml.json,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5...,
2,ejv2xln0,06ced00a5fc04215949aa72528f2eeaae1d58927,PMC,Surfactant protein-D and pulmonary host defense,10.1186/rr19,PMC59549,11667972,no-cc,Surfactant protein-D (SP-D) participates in th...,2000-08-25,"Crouch, Erika C",Respir Res,,,,document_parses/pdf_json/06ced00a5fc04215949aa...,document_parses/pmc_json/PMC59549.xml.json,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5...,
3,2b73a28n,348055649b6b8cf2b9a376498df9bf41f7123605,PMC,Role of endothelin-1 in lung disease,10.1186/rr44,PMC59574,11686871,no-cc,Endothelin-1 (ET-1) is a 21 amino acid peptide...,2001-02-22,"Fagan, Karen A; McMurtry, Ivan F; Rodman, David M",Respir Res,,,,document_parses/pdf_json/348055649b6b8cf2b9a37...,document_parses/pmc_json/PMC59574.xml.json,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5...,
4,9785vg6d,5f48792a5fa08bed9f56016f4981ae2ca6031b32,PMC,Gene expression in epithelial cells in respons...,10.1186/rr61,PMC59580,11686888,no-cc,Respiratory syncytial virus (RSV) and pneumoni...,2001-05-11,"Domachowske, Joseph B; Bonville, Cynthia A; Ro...",Respir Res,,,,document_parses/pdf_json/5f48792a5fa08bed9f560...,document_parses/pmc_json/PMC59580.xml.json,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5...,


In [19]:
all_json = ! ls $json_path
all_json = [json_path + s for s in all_json]

In [20]:
all_json = all_json[:num_documents]
print("Number of documents: ", len(all_json))

Number of documents:  2278


In [21]:
class FileReader:
    def __init__(self, file_path):
        with open(file_path) as file:
            content = json.load(file)
            self.paper_id = content['paper_id']
            self.abstract = []
            self.body_text = []
            # Abstract
            for entry in content['abstract']:
                self.abstract.append(entry['text'])
            # Body text
            for entry in content['body_text']:
                self.body_text.append(entry['text'])
            self.abstract = '\n'.join(self.abstract)
            self.body_text = '\n'.join(self.body_text)
    def __repr__(self):
        return f'{self.paper_id}: {self.abstract[:200]}... {self.body_text[:200]}...'
first_row = FileReader(all_json[0])
print(first_row)

0015023cc06b5362d332b3baf348d11567ca2fbb: word count: 194 22 Text word count: 5168 23 24 25 author/funder. All rights reserved. No reuse allowed without permission. Abstract 27 The positive stranded RNA genomes of picornaviruses comprise a si... VP3, and VP0 (which is further processed to VP2 and VP4 during virus assembly) (6). The P2 64 and P3 regions encode the non-structural proteins 2B and 2C and 3A, 3B (1-3) (VPg), 3C pro and 4 structura...


In [29]:
from tqdm import tqdm
dict_ = {'paper_id': [], 'doi':[], 'abstract': [], 'body_text': [], 'authors': [], 'title': [], 'journal': [], 'abstract_summary': []}
for idx, entry in tqdm(enumerate(all_json), total = len(all_json)):
    
    try:
        content = FileReader(entry)
    except Exception as e:
        continue  # invalid paper format, skip
    
    # get metadata information
    meta_data = meta_df.loc[meta_df['sha'] == content.paper_id]
    # no metadata, skip this paper
    if len(meta_data) == 0:
        continue
    if len(content.body_text) == 0:
        continue
    dict_['abstract'].append(content.abstract)
    dict_['paper_id'].append(content.paper_id)
    dict_['body_text'].append(content.body_text)
    
    # also create a column for the summary of abstract to be used in a plot
    if len(content.abstract) == 0: 
        # no abstract provided
        dict_['abstract_summary'].append("Not provided.")
    else:
        # abstract is short enough
        summary = content.abstract
        dict_['abstract_summary'].append(summary)
        
    # get metadata information
    meta_data = meta_df.loc[meta_df['sha'] == content.paper_id]
    
    try:
        # if more than one author
        authors = meta_data['authors'].values[0].split(';')
        if len(authors) > 2:
            # more than 2 authors, may be problem when plotting, so take first 2 append with ...
            dict_['authors'].append(authors)
        else:
            # authors will fit in plot
            dict_['authors'].append(". ".join(authors))
    except Exception as e:
        # if only one author - or Null valie
        dict_['authors'].append(meta_data['authors'].values[0])
    
    # add the title information, add breaks when needed
    try:
        title = meta_data['title'].values[0]
        dict_['title'].append(title)
    # if title was not provided
    except Exception as e:
        dict_['title'].append(meta_data['title'].values[0])
    
    # add the journal information
    dict_['journal'].append(meta_data['journal'].values[0])
    
    # add doi
    dict_['doi'].append(meta_data['doi'].values[0])
    
df_covid = pd.DataFrame(dict_, columns=['paper_id', 'doi', 'abstract', 'body_text', 'authors', 'title', 'journal', 'abstract_summary'])
df_covid.head()

100%|███████████████████████████████████████| 2278/2278 [00:53<00:00, 42.81it/s]


Unnamed: 0,paper_id,doi,abstract,body_text,authors,title,journal,abstract_summary
0,004f0f8bb66cf446678dc13cf2701feec4f36d76,10.1101/2020.02.11.20022111,,The 2019-nCoV epidemic has spread across China...,"[Zhou, Hanchu, Yang, Jianan, Tang, Kaichen, ...",Healthcare-resource-adjusted vulnerabilities t...,,Not provided.
1,005d189d5bd7ac01aee65e934fd3d5186a3f7b27,10.1101/2020.04.10.20059337,The rapid outbreak of the new Coronavirus pand...,The outbreak of infectious diseases has always...,"[Pirouz, Behzad, Golmohammadi, Amirsina, Sae...",Relationship between Average Daily Temperature...,,The rapid outbreak of the new Coronavirus pand...
2,0109d1273b2d59a099ab66cdad6939d5e7fcb2e8,10.1101/2020.04.09.20059626,,"To date, the coronavirus disease 2019 (COVID-1...","[Zuo, Yu, Yalavarthi, Srilakshmi, Shi, Hui, ...",Neutrophil extracellular traps (NETs) as marke...,medRxiv : the preprint server for health sciences,Not provided.
3,0185f63fd6ecdf04829a155f4d7f62a5b532f06d,10.1101/2020.03.26.20044289,Background: With the outbreak of coronavirus d...,A novel coronavirus caused pneumonia cases in ...,"[Zhou, Xiang, Ma, Xudong, Hong, Na, Su, Lon...",Forecasting the Worldwide Spread of COVID-19 b...,,Background: With the outbreak of coronavirus d...
4,01d162d7fae6aaba8e6e60e563ef4c2fca7b0e18,10.1101/2020.02.24.20025437,Faced with the current large-scale public heal...,The sudden outbreak of the new coronavirus (SA...,"[Ji, Xiaoyang, Zhang, Chunming, Zhai, Yubo, ...","TWIRLS, an automated topic-wise inference meth...",,Faced with the current large-scale public heal...


In [23]:
df_covid.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 507 entries, 0 to 506
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   paper_id          507 non-null    object
 1   doi               507 non-null    object
 2   abstract          507 non-null    object
 3   body_text         507 non-null    object
 4   authors           507 non-null    object
 5   title             507 non-null    object
 6   journal           13 non-null     object
 7   abstract_summary  507 non-null    object
dtypes: object(8)
memory usage: 31.8+ KB


# Data Pre-processing

In [30]:
df = df_covid
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 507 entries, 0 to 506
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   paper_id          507 non-null    object
 1   doi               507 non-null    object
 2   abstract          507 non-null    object
 3   body_text         507 non-null    object
 4   authors           507 non-null    object
 5   title             507 non-null    object
 6   journal           13 non-null     object
 7   abstract_summary  507 non-null    object
dtypes: object(8)
memory usage: 31.8+ KB


In [31]:
from tqdm import tqdm
from langdetect import detect
from langdetect import DetectorFactory

# set seed
DetectorFactory.seed = 0

# hold label - language
languages = []

# go through each text
for ii in tqdm(range(0,len(df))):
    # split by space into list, take the first x intex, join with space
    text = df.iloc[ii]['body_text'].split(" ")
    
    lang = "en"
    try:
        if len(text) > 50:
            lang = detect(" ".join(text[:50]))
        elif len(text) > 0:
            lang = detect(" ".join(text[:len(text)]))
    # ught... beginning of the document was not in a good format
    except Exception as e:
        all_words = set(text)
        try:
            lang = detect(" ".join(all_words))
        # what!! :( let's see if we can find any text in abstract...
        except Exception as e:
            
            try:
                # let's try to label it through the abstract then
                lang = detect(df.iloc[ii]['abstract_summary'])
            except Exception as e:
                lang = "unknown"
                pass
    
    # get the language    
    languages.append(lang)

100%|████████████████████████████████████████| 507/507 [00:01<00:00, 480.56it/s]


In [32]:
from pprint import pprint

languages_dict = {}
for lang in set(languages):
    languages_dict[lang] = languages.count(lang)
    
print("Total: {}\n".format(len(languages)))
pprint(languages_dict)
df['language'] = languages
df = df[df['language'] == 'en'] 
df.info()

Total: 507

{'cy': 2, 'en': 505}
<class 'pandas.core.frame.DataFrame'>
Int64Index: 505 entries, 0 to 506
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   paper_id          505 non-null    object
 1   doi               505 non-null    object
 2   abstract          505 non-null    object
 3   body_text         505 non-null    object
 4   authors           505 non-null    object
 5   title             505 non-null    object
 6   journal           13 non-null     object
 7   abstract_summary  505 non-null    object
 8   language          505 non-null    object
dtypes: object(9)
memory usage: 39.5+ KB


In [34]:
#NLP 
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
import en_core_sci_lg
import string

punctuations = string.punctuation
stopwords = list(STOP_WORDS)
stopwords[:10]

['ourselves',
 'anyway',
 'on',
 'fifty',
 '‘s',
 'could',
 'below',
 'yourself',
 'wherein',
 'few']

In [35]:
custom_stop_words = [
    'doi', 'preprint', 'copyright', 'peer', 'reviewed', 'org', 'https', 'et', 'al', 'author', 'figure', 
    'rights', 'reserved', 'permission', 'used', 'using', 'biorxiv', 'medrxiv', 'license', 'fig', 'fig.', 
    'al.', 'Elsevier', 'PMC', 'CZI'
]

for w in custom_stop_words:
    if w not in stopwords:
        stopwords.append(w)
        
print(stopwords)

['ourselves', 'anyway', 'on', 'fifty', '‘s', 'could', 'below', 'yourself', 'wherein', 'few', 'behind', "'s", '‘ve', 'they', 'neither', 'hereafter', 'throughout', 'that', 'his', 'three', '’re', 'via', 'nine', 'thereby', 'cannot', 'you', 'yours', 'their', 'into', 'to', 'among', '‘ll', 'should', 'thence', 'often', 'and', 'either', 'otherwise', 'doing', 'make', 'name', 'seemed', 'ten', 'namely', 'almost', 'more', 'must', 'them', 'he', 'only', 'forty', 'the', 'thus', 'him', 'where', 'one', 'across', 'whole', 'else', 'itself', 'though', 'amongst', 'whereafter', 'up', 'due', 'most', 'seems', 'there', 'hence', 'move', '’ll', 'out', '’d', 'ours', 'this', 'therefore', 'call', 'go', 'latterly', 'over', 'during', 'put', 'meanwhile', 'becoming', 'least', 'already', 'regarding', 'fifteen', 'n‘t', 'full', 'because', 'then', 'whereas', 'see', 'although', 'anywhere', 'how', 'hereupon', 'nothing', 'done', 'take', 'twenty', 'is', 'somehow', 'latter', 'after', 'everyone', 'ever', 'would', 'formerly', 'unl

In [36]:
# Parser
parser = en_core_sci_lg.load(disable=["tagger", "ner"])
parser.max_length = 7000000

def spacy_tokenizer(sentence):
    mytokens = parser(sentence)
    mytokens = [ word.lemma_.lower().strip() if word.lemma_ != "-PRON-" else word.lower_ for word in mytokens ]
    mytokens = [ word for word in mytokens if word not in stopwords and word not in punctuations ]
    mytokens = " ".join([i for i in mytokens])
    return mytokens

In [37]:
tqdm.pandas()
df["processed_text"] = df["body_text"].progress_apply(spacy_tokenizer)

100%|█████████████████████████████████████████| 505/505 [03:16<00:00,  2.57it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["processed_text"] = df["body_text"].progress_apply(spacy_tokenizer)


In [38]:
display(df)

Unnamed: 0,paper_id,doi,abstract,body_text,authors,title,journal,abstract_summary,language,processed_text
0,004f0f8bb66cf446678dc13cf2701feec4f36d76,10.1101/2020.02.11.20022111,,The 2019-nCoV epidemic has spread across China...,"[Zhou, Hanchu, Yang, Jianan, Tang, Kaichen, ...",Healthcare-resource-adjusted vulnerabilities t...,,Not provided.,en,2019-ncov epidemic spread china 24 countries 1...
1,005d189d5bd7ac01aee65e934fd3d5186a3f7b27,10.1101/2020.04.10.20059337,The rapid outbreak of the new Coronavirus pand...,The outbreak of infectious diseases has always...,"[Pirouz, Behzad, Golmohammadi, Amirsina, Sae...",Relationship between Average Daily Temperature...,,The rapid outbreak of the new Coronavirus pand...,en,outbreak infectious diseases important health ...
2,0109d1273b2d59a099ab66cdad6939d5e7fcb2e8,10.1101/2020.04.09.20059626,,"To date, the coronavirus disease 2019 (COVID-1...","[Zuo, Yu, Yalavarthi, Srilakshmi, Shi, Hui, ...",Neutrophil extracellular traps (NETs) as marke...,medRxiv : the preprint server for health sciences,Not provided.,en,date coronavirus disease 2019 covid-19 pandemi...
3,0185f63fd6ecdf04829a155f4d7f62a5b532f06d,10.1101/2020.03.26.20044289,Background: With the outbreak of coronavirus d...,A novel coronavirus caused pneumonia cases in ...,"[Zhou, Xiang, Ma, Xudong, Hong, Na, Su, Lon...",Forecasting the Worldwide Spread of COVID-19 b...,,Background: With the outbreak of coronavirus d...,en,novel coronavirus caused pneumonia cases wuhan...
4,01d162d7fae6aaba8e6e60e563ef4c2fca7b0e18,10.1101/2020.02.24.20025437,Faced with the current large-scale public heal...,The sudden outbreak of the new coronavirus (SA...,"[Ji, Xiaoyang, Zhang, Chunming, Zhai, Yubo, ...","TWIRLS, an automated topic-wise inference meth...",,Faced with the current large-scale public heal...,en,sudden outbreak new coronavirus sars-cov-2 end...
...,...,...,...,...,...,...,...,...,...,...
502,fdf20c752531c78cd56750268f1d62c61eb9e2bd,10.1101/2020.04.09.20059550,Social distancing has been adopted as a non-ph...,"In December 2019, a novel coronavirus named SA...","[Liu, Pai, Beeler, Payton, Chakrabarty, Raja...",Diminishing Marginal Benefit of Social Distanc...,,Social distancing has been adopted as a non-ph...,en,december 2019 novel coronavirus named sars-cov...
503,fe685aa676e739bd52ba2585a7e5b27c55e2d0d6,10.1101/2020.02.17.20023721,,CC-BY-NC-ND 4.0 International license It is ma...,"[Xiao, Fei, Tang, Meiwen, Zheng, Xiaobin, L...",Evidence for gastrointestinal infection of SAR...,,Not provided.,en,cc-by-nc-nd 4.0 international available author...
504,ff067164497bcfbd9145be223dcd2b05f159dd63,10.1101/2020.03.02.20030189,The novel coronavirus SARS-CoV-2 is a newly em...,Since early December of 2019 and up to Februar...,"[Zhao, Juanjuan, Yuan, Quan, Wang, Haiyan, ...",Antibody responses to SARS-CoV-2 in patients o...,,The novel coronavirus SARS-CoV-2 is a newly em...,en,early december 2019 february 24 2020 79 000 ca...
505,ffbd7555a337706238c211197b221795e4e35146,10.1101/2020.02.24.20027375,,CC-BY-NC-ND 4.0 International license It is ma...,"[Tuite, Ashleigh R., Bogoch, Isaac, Sherbo, ...",Estimation of COVID-2019 burden and potential ...,,Not provided.,en,cc-by-nc-nd 4.0 international available author...


In [39]:
df = df[["processed_text"]]
display(df)

Unnamed: 0,processed_text
0,2019-ncov epidemic spread china 24 countries 1...
1,outbreak infectious diseases important health ...
2,date coronavirus disease 2019 covid-19 pandemi...
3,novel coronavirus caused pneumonia cases wuhan...
4,sudden outbreak new coronavirus sars-cov-2 end...
...,...
502,december 2019 novel coronavirus named sars-cov...
503,cc-by-nc-nd 4.0 international available author...
504,early december 2019 february 24 2020 79 000 ca...
505,cc-by-nc-nd 4.0 international available author...


In [40]:
df.to_csv(root_path + "train.csv") 