# BERT-fy CORD-19 data

Original code from https://www.kaggle.com/theamrzaki/covid-19-bert-researchpapers-semantic-search#Data-Processing

In [1]:
import glob
import json
import pandas as pd
from tqdm import tqdm

root_path = './../data/'
all_json = glob.glob(f'{root_path}/**/*.json', recursive=True)

len(all_json)

517904

In [2]:
import os

# get only rows with attached files

metadata_path = f'{root_path}/metadata.csv'
stripped_metadata_path = f'{root_path}/stripped_metadata.csv'

if not os.path.exists(stripped_metadata_path):
    meta_df = pd.read_csv(metadata_path, dtype={
        'pubmed_id': str,
        'Microsoft Academic Paper ID': str, 
        'doi': str
    })

    stripped_meta_df = meta_df.dropna(subset=['pmc_json_files'])

    stripped_meta_df.to_csv(stripped_metadata_path)

    stripped_meta_df.head()

    del stripped_meta_df

In [3]:
import subprocess
import os

small_metadata_path = f'{root_path}/small_metadata.csv'
if not os.path.exists(small_metadata_path):
    small_metadata_file = open(f'{root_path}/small_metadata.csv', 'w')
    # get header from metadata.csv
    print(subprocess.run(['head', '-n 1', f'{root_path}/stripped_metadata.csv'], stdout=small_metadata_file))
    #!head -n 1 "{root_path}/stripped_metadata.csv" > "{root_path}/small_metadata.csv"
    # get random sample from metadata.csv
    print(subprocess.run(['shuf', '-n 12500', f'{root_path}/stripped_metadata.csv'], stdout=small_metadata_file))
    #!shuf -n 12500 "{root_path}/stripped_metadata.csv" >> "{root_path}/small_metadata.csv"
    small_metadata_file.close()

In [4]:
small_metadata_path = f'{root_path}/small_metadata.csv'

meta_df = pd.read_csv(small_metadata_path, dtype={
    'pubmed_id': str,
    'Microsoft Academic Paper ID': str, 
    'doi': str
})

meta_df.head()

Unnamed: 0.1,Unnamed: 0,cord_uid,sha,source_x,title,doi,pmcid,pubmed_id,license,abstract,publish_time,authors,journal,mag_id,who_covidence_id,arxiv_id,pdf_json_files,pmc_json_files,url,s2_id
0,677299,oqvz3956,46361636bb49fa36f2246008c87ee0d5997050b9,Medline; PMC,Development of a Simple In Vitro Assay To Iden...,10.1128/aac.01508-20,PMC7927875,33122171,no-cc,Nucleotide analogs targeting viral RNA polymer...,2020-12-16,"Lu, Gaofei; Zhang, Xi; Zheng, Weinan; Sun, Jia...",Antimicrob Agents Chemother,,,,document_parses/pdf_json/46361636bb49fa36f2246...,document_parses/pmc_json/PMC7927875.xml.json,https://www.ncbi.nlm.nih.gov/pubmed/33122171/;...,226207105.0
1,793863,t9w0au4s,d2b14c5bbbcc55efa85bf81bae9b8b66a09b61f8,Medline; PMC,Patient experience of telemedicine for headach...,10.1111/head.14110,PMC8206943,34021595,no-cc,OBJECTIVE: We sought to investigate the patien...,2021-05-21,"Chiang, Chia‐Chun; Halker Singh, Rashmi; Lalva...",Headache,,,,document_parses/pdf_json/d2b14c5bbbcc55efa85bf...,document_parses/pmc_json/PMC8206943.xml.json,https://www.ncbi.nlm.nih.gov/pubmed/34021595/;...,235093394.0
2,779998,cv2mss6p,9d6c9fb4554a450479c8977dfd2ee33aa9f85117,Medline; PMC,Targeting CA-125 Transcription by Development ...,10.3390/cancers13174265,PMC8428227,34503075,cc-by,SIMPLE SUMMARY: Ovarian cancer is the fifth mo...,2021-08-24,"Yue, Er; Yang, Guangchao; Yao, Yuanfei; Wang, ...",Cancers (Basel),,,,document_parses/pdf_json/9d6c9fb4554a450479c89...,document_parses/pmc_json/PMC8428227.xml.json,https://www.ncbi.nlm.nih.gov/pubmed/34503075/;...,237467975.0
3,797924,fjchfgzt,66f20258cf3d0564e191715b3e1c86cb2e92f172,Medline; PMC,Treating patients across European Union border...,10.1097/eja.0000000000001423,PMC7969157,33350712,cc-by-nc-nd,BACKGROUND: In light of the coronavirus diseas...,2020-12-18,"Adam, Elisabeth H.; Flinspach, Armin N.; Janko...",Eur J Anaesthesiol,,,,document_parses/pdf_json/66f20258cf3d0564e1917...,document_parses/pmc_json/PMC7969157.xml.json,https://doi.org/10.1097/eja.0000000000001423; ...,229351518.0
4,771079,xa3iy187,3b32a4f9e3c54272bdf3527635b7b47642ccfae0,Elsevier; Medline; PMC; WHO,"Nurses as political knowledge brokers, opportu...",10.1016/j.ijnurstu.2020.103690,PMC7297165,32673811,no-cc,,2020-06-16,"Santillan-Garcia, Azucena; Zaforteza-Lallemand...",Int J Nurs Stud,,,,document_parses/pdf_json/3b32a4f9e3c54272bdf35...,document_parses/pmc_json/PMC7297165.xml.json,https://doi.org/10.1016/j.ijnurstu.2020.103690...,219700339.0


In [9]:
import math

class Article:
    def __init__(self, pmcid):

        self.paper_id = ''
        self.abstract = []
        self.body_text = []

        if not isinstance(pmcid, str) and math.isnan(pmcid):
            return

        with open(f"{root_path}/document_parses/pmc_json/{pmcid}.xml.json") as file:
            content = json.load(file)
            content_metadata = meta_df.loc[meta_df['pmcid'] == pmcid]

            self.paper_id = content['paper_id']
            self.abstract = []
            self.body_text = []
            self.metadata = {}

            if not content_metadata is None:
                self.metadata = content_metadata

            if 'abstract' in content_metadata:
                # Abstract
                # self.abstract.append(content_metadata['abstract'][0])
                for entry in content_metadata['abstract']:
                    self.abstract.append(str(entry))
                # print(self.abstract)
            # Body text
            if 'body_text' in content:
                for entry in content['body_text']:
                    self.body_text.append(entry['text'])

            self.abstract = '\n'.join(self.abstract)
            self.body_text = '\n'.join(self.body_text)

    def __repr__(self):
        return f'{self.paper_id}: {self.abstract[:200]}... {self.body_text[:200]}...'

first_row = Article(meta_df['pmcid'][0])
print(first_row)
# meta_df.iloc[0]

PMC7927875: Nucleotide analogs targeting viral RNA polymerase have been proved to be an effective strategy for antiviral treatment and are promising antiviral drugs to combat the current severe acute respiratory ... The ongoing global pandemic of coronavirus disease 2019 (COVID-19) is caused by severe acute respiratory syndrome coronavirus 2 (SARS-CoV-2) (1–3), which is a single-stranded, positive-sense RNA virus...


In [6]:
def get_breaks(content, length):
    data = ""
    words = content.split(' ')
    total_chars = 0

    # add break every length characters
    for i in range(len(words)):
        total_chars += len(words[i])
        if total_chars > length:
            data = data + "<br>" + words[i]
            total_chars = 0
        else:
            data = data + " " + words[i]
    return data

In [10]:
dict_ = {'paper_id': [], 'abstract': [], 'body_text': [], 'authors': [], 'title': [], 'journal': [], 'abstract_summary': []}
for idx, entry in enumerate(meta_df['pmcid']):
    if idx % (len(meta_df) // 10) == 0:
        print(f'Processing index: {idx} of {len(meta_df)}')
    content = Article(entry)
    
    #print(content)

    # get metadata information
    meta_data = content.metadata
    #meta_df.loc[meta_df['pmcid'] == content.paper_id]
    # no metadata, skip this paper
    if len(meta_data) == 0:
        continue

    #print(meta_data)
    
    dict_['paper_id'].append(content.paper_id)
    dict_['abstract'].append(content.abstract)
    dict_['body_text'].append(content.body_text)
    
    # also create a column for the summary of abstract to be used in a plot
    if len(content.abstract) == 0: 
        # no abstract provided
        dict_['abstract_summary'].append("Not provided.")
    elif len(content.abstract.split(' ')) > 100:
        # abstract provided is too long for plot, take first 300 words append with ...
        info = content.abstract.split(' ')[:100]
        summary = get_breaks(' '.join(info), 40)
        dict_['abstract_summary'].append(summary + "...")
    else:
        # abstract is short enough
        summary = get_breaks(content.abstract, 40)
        dict_['abstract_summary'].append(summary)
        
    # get metadata information
    # meta_data = meta_df.loc[meta_df['pmcid'] == content.paper_id]
    
    try:
        # if more than one author
        authors = meta_data['authors'].values[0].split(';')
        if len(authors) > 2:
            # more than 2 authors, may be problem when plotting, so take first 2 append with ...
            dict_['authors'].append(". ".join(authors[:2]) + "...")
        else:
            # authors will fit in plot
            dict_['authors'].append(". ".join(authors))
    except Exception as e:
        # if only one author - or Null valie
        dict_['authors'].append(meta_data['authors'].values[0])
    
    # add the title information, add breaks when needed
    try:
        title = get_breaks(meta_data['title'].values[0], 40)
        dict_['title'].append(title)
    # if title was not provided
    except Exception as e:
        dict_['title'].append(meta_data['title'].values[0])
    
    # add the journal information
    dict_['journal'].append(meta_data['journal'].values[0])
    
df_covid = pd.DataFrame(dict_, columns=['paper_id', 'abstract', 'body_text', 'authors', 'title', 'journal', 'abstract_summary'])
df_covid.describe()

Processing index: 0 of 12500
Processing index: 1250 of 12500
Processing index: 2500 of 12500
Processing index: 3750 of 12500
Processing index: 5000 of 12500
Processing index: 6250 of 12500
Processing index: 7500 of 12500
Processing index: 8750 of 12500
Processing index: 10000 of 12500
Processing index: 11250 of 12500


Unnamed: 0,paper_id,abstract,body_text,authors,title,journal,abstract_summary
0,PMC7927875,Nucleotide analogs targeting viral RNA polymer...,The ongoing global pandemic of coronavirus dis...,"Lu, Gaofei. Zhang, Xi...",Development of a Simple In Vitro Assay To<br>...,Antimicrob Agents Chemother,Nucleotide analogs targeting viral RNA<br>pol...
1,PMC8206943,OBJECTIVE: We sought to investigate the patien...,The coronavirus disease 2019 (COVID‐19) pandem...,"Chiang, Chia‐Chun. Halker Singh, Rashmi...",Patient experience of telemedicine for<br>hea...,Headache,OBJECTIVE: We sought to investigate the<br>pa...
2,PMC8428227,SIMPLE SUMMARY: Ovarian cancer is the fifth mo...,"Ovarian cancer, the umbrella term for ovarian ...","Yue, Er. Yang, Guangchao...",Targeting CA-125 Transcription by<br>Developm...,Cancers (Basel),SIMPLE SUMMARY: Ovarian cancer is the fifth<b...
3,PMC7969157,BACKGROUND: In light of the coronavirus diseas...,The victims of the unprecedented coronavirus d...,"Adam, Elisabeth H.. Flinspach, Armin N....",Treating patients across European Union<br>bo...,Eur J Anaesthesiol,BACKGROUND: In light of the coronavirus<br>di...
4,PMC7297165,,The authors declare that they have no known co...,"Santillan-Garcia, Azucena. Zaforteza-Lalleman...","Nurses as political knowledge brokers,<br>opp...",Int J Nurs Stud,


In [None]:
import re

df_covid['body_text'] = df_covid['body_text'].apply(lambda x: re.sub('[^a-zA-z0-9\s]','',x))
df_covid['abstract'] = df_covid['abstract'].apply(lambda x: re.sub('[^a-zA-z0-9\s]','',x))

df_covid.head(4)

In [None]:
def lower_case(input_str):
    input_str = input_str.lower()
    return input_str

df_covid['body_text'] = df_covid['body_text'].apply(lambda x: lower_case(x))
df_covid['abstract'] = df_covid['abstract'].apply(lambda x: lower_case(x))

df_covid.head()