# BERT-fy CORD-19 data

Original code from https://www.kaggle.com/theamrzaki/covid-19-bert-researchpapers-semantic-search#Data-Processing

In [1]:
import glob
import json
import pandas as pd
from tqdm import tqdm

dataset_size = 'small'
all_data_path = './../data/'
data_path = f'./../data/{dataset_size}/'


In [2]:
all_json = glob.glob(f'{all_data_path}/**/*.json', recursive=True)

len(all_json)


517912

In [3]:
import os

# get only rows with attached files

metadata_path = f'{all_data_path}/metadata.csv'
stripped_metadata_path = f'{all_data_path}/stripped_metadata.csv'

if not os.path.exists(stripped_metadata_path):
    meta_df = pd.read_csv(metadata_path, dtype={
        'pubmed_id': str,
        'Microsoft Academic Paper ID': str,
        'doi': str
    })

    stripped_meta_df = meta_df.dropna(subset=['pmc_json_files'])

    stripped_meta_df.to_csv(stripped_metadata_path)

    stripped_meta_df.head()

    del stripped_meta_df


In [4]:
import subprocess
import os

# create shuffled subset of metadata rows

small_metadata_path = f'{all_data_path}/small/metadata.csv'
if not os.path.exists(small_metadata_path):
    small_metadata_file = open(f'{all_data_path}/small/metadata.csv', 'w')
    # get header from metadata.csv
    print(subprocess.run(
        ['head', '-n 1', f'{all_data_path}/stripped_metadata.csv'], stdout=small_metadata_file))
    #!head -n 1 "{all_data_path}/stripped_metadata.csv" > "{all_data_path}/metadata.csv"
    # get random sample from metadata.csv
    print(subprocess.run(
        ['shuf', '-n 12500', f'{all_data_path}/stripped_metadata.csv'], stdout=small_metadata_file))
    #!shuf -n 12500 "{all_data_path}/stripped_metadata.csv" >> "{root_path}/metadata.csv"
    small_metadata_file.close()


In [5]:
metadata_path = f'{data_path}/metadata.csv'

meta_df = pd.read_csv(metadata_path, dtype={
    'pubmed_id': str,
    'Microsoft Academic Paper ID': str,
    'doi': str
})

meta_df.head()


Unnamed: 0.1,Unnamed: 0,cord_uid,sha,source_x,title,doi,pmcid,pubmed_id,license,abstract,publish_time,authors,journal,mag_id,who_covidence_id,arxiv_id,pdf_json_files,pmc_json_files,url,s2_id
0,627783,sep45sga,04d0cf9898016ae1295e1bca17a6b3a5fde89802,Elsevier; Medline; PMC,The absence of coronavirus in expressed prosta...,10.1016/j.reprotox.2020.06.006,PMC7286226,32534021,no-cc,Due to the cellular entry of the novel coronav...,2020-06-10,"Zhang, Shiqi; Wang, Xiaobo; Zhang, Hong; Xu, A...",Reprod Toxicol,,,,document_parses/pdf_json/04d0cf9898016ae1295e1...,document_parses/pmc_json/PMC7286226.xml.json,https://api.elsevier.com/content/article/pii/S...,219560235.0
1,807648,doj39520,bc35bf4d3debb1af7ff8dc285d06f822fab24a7e,Medline; PMC,SARS-CoV-2 receptor ACE2 expression in the hum...,10.1093/eurheartj/ehaa410,PMC7239191,32383758,no-cc,,2020-05-14,"Thum, Thomas",Eur Heart J,,,,document_parses/pdf_json/bc35bf4d3debb1af7ff8d...,document_parses/pmc_json/PMC7239191.xml.json,https://www.ncbi.nlm.nih.gov/pubmed/32383758/;...,218557404.0
2,611676,lsfkpd5l,8ff5c024beea4259b7b1b0e143bbd6a1734ccfef; 9039...,Medline; PMC,PARP9-DTX3L ubiquitin ligase targets host hist...,10.1038/ni.3279,PMC4653074,26479788,no-cc,Enhancing the response to interferon could off...,2015-12-01,"Zhang, Yong; Mao, Dailing; Roswit, William T; ...",Nat Immunol,,,,document_parses/pdf_json/8ff5c024beea4259b7b1b...,document_parses/pmc_json/PMC4653074.xml.json,https://www.ncbi.nlm.nih.gov/pubmed/26479788/;...,19031101.0
3,643594,hjriid4h,8a61e2af9bdd4ba0b83becf314017f3f3a8b9adf,Medline; PMC,Towards defining core principles of public hea...,10.1186/s12889-020-09307-y,PMC7527265,32998729,cc-by,"BACKGROUND: European Member States, the Europe...",2020-10-01,"Belfroid, Evelien; Roβkamp, Dorothee; Fraser, ...",BMC Public Health,,,,document_parses/pdf_json/8a61e2af9bdd4ba0b83be...,document_parses/pmc_json/PMC7527265.xml.json,https://doi.org/10.1186/s12889-020-09307-y; ht...,222071121.0
4,745390,pczdapvd,7448479341487a112b8e4f87c117622a861eeb0b; ef74...,Medline; PMC,The UPTAKE study: a cross-sectional survey exa...,10.1136/bmjopen-2021-048856,PMC8210694,34130964,cc-by-nc,OBJECTIVE: A key challenge towards a successfu...,2021-06-15,"Sethi, Sonika; Kumar, Aditi; Mandal, Anandadee...",BMJ Open,,,,document_parses/pdf_json/7448479341487a112b8e4...,document_parses/pmc_json/PMC8210694.xml.json,https://doi.org/10.1136/bmjopen-2021-048856; h...,235450913.0


In [6]:
import math


class Article:
    def __init__(self, pmcid):

        self.paper_id = ''
        self.abstract = []
        self.body_text = []


        if not isinstance(pmcid, str) and math.isnan(pmcid):
            return

        with open(f"{all_data_path}/document_parses/pmc_json/{pmcid}.xml.json") as file:
            content = json.load(file)
            content_metadata = meta_df.loc[meta_df['pmcid'] == pmcid]

            self.paper_id = content['paper_id']
            self.abstract = []
            self.body_text = []
            self.metadata = {}

            if not content_metadata is None:
                self.metadata = content_metadata

            if 'abstract' in content_metadata:
                # Abstract
                # self.abstract.append(content_metadata['abstract'][0])
                for entry in content_metadata['abstract']:
                    self.abstract.append(str(entry))
                # print(self.abstract)
            # Body text
            if 'body_text' in content:
                for entry in content['body_text']:
                    self.body_text.append(entry['text'])

            self.abstract = '<br>'.join(self.abstract)
            self.body_text = '<br>'.join(self.body_text)

    def __repr__(self):
        return f'{self.paper_id}: {self.abstract[:200]}... {self.body_text[:200]}...'


first_row = Article(meta_df['pmcid'][0])
print(first_row.body_text)
# meta_df.iloc[0]


Currently, the novel coronavirus disease (COVID-19) has outbroken in China and exponentially spread along the world since the first case was diagnosed on December 2019 in Wuhan City of China [1]. The pathogenic novel coronavirus (SARS-CoV-2), isolated from the nasal and pharyngeal secretion, was highly homologous with the coronavirus caused Severe Acute Respiratory Syndrome (SARS) [2]. Owing to the lack of immunity this global pandemic increases sharply within a few months, which not only burns a large amount of health expenditure but seriously threatens lives.<br>The amputating of COVID-19 transmission route is one of the effective measures to prevent the continuing spread. This disease is believed to transmit by inhalation or contact with infected droplets, which is confirmed by the detection of coronavirus in nasopharyngeal swabs [3] and saliva [4] of COVID-19 patients. Apart from the respiratory tract, the SARS-CoV-2 could be also detected in urine and gastrointestinal tract (posit

In [7]:
from dask import dataframe as dd

global partial_df
global dict_

dict_ = {'paper_id': [], 'abstract': [], 'body_text': []}
partial_df = pd.DataFrame(dict_, columns=[
    'paper_id', 'abstract', 'body_text'])
partial_df = dd.from_pandas(partial_df, npartitions=10)

partial_df.compute().to_csv(f'{data_path}/df_covid.csv', index=False)


In [8]:

def populateDict(content):
    # no metadata, skip this paper
    if len(content.metadata) == 0:
        return

    # print(meta_data)

    dict_['paper_id'].append(content.paper_id)
    dict_['abstract'].append(content.abstract.replace('\n', '<br>'))
    dict_['body_text'].append(content.body_text.replace('\n', '<br>'))


def saveProgress():
    global partial_df
    global dict_

    partial_df = pd.DataFrame(dict_, columns=[
        'paper_id', 'abstract', 'body_text'])
    partial_df = dd.from_pandas(partial_df, npartitions=1)

    print('saving current progress')
    partial_df.compute().to_csv(
        f'{data_path}/df_covid.csv', mode='a', header=False, index=False)

    print('reseting partial df')
    del partial_df

    print('reseting partial dict')
    del dict_
    dict_ = {'paper_id': [], 'abstract': [], 'body_text': []}


for idx, entry in enumerate(meta_df['pmcid']):
    if idx % 1000 == 0:
        print(f'Processing index: {idx} of {len(meta_df)}')
        saveProgress()

    populateDict(Article(entry))

saveProgress()


Processing index: 0 of 12500
saving current progress
reseting partial df
reseting partial dict
Processing index: 1000 of 12500
saving current progress
reseting partial df
reseting partial dict
Processing index: 2000 of 12500
saving current progress
reseting partial df
reseting partial dict
Processing index: 3000 of 12500
saving current progress
reseting partial df
reseting partial dict
Processing index: 4000 of 12500
saving current progress
reseting partial df
reseting partial dict
Processing index: 5000 of 12500
saving current progress
reseting partial df
reseting partial dict
Processing index: 6000 of 12500
saving current progress
reseting partial df
reseting partial dict
Processing index: 7000 of 12500
saving current progress
reseting partial df
reseting partial dict
Processing index: 8000 of 12500
saving current progress
reseting partial df
reseting partial dict
Processing index: 9000 of 12500
saving current progress
reseting partial df
reseting partial dict
Processing index: 10000

In [9]:
from dask import dataframe as dd
import re


def lower_case(input_str):
    input_str = input_str.lower()
    return input_str


df_covid = dd.read_csv(f'{data_path}/df_covid.csv')

df_covid['body_text'] = df_covid['body_text'].astype(str)
df_covid['abstract'] = df_covid['abstract'].astype(str)

df_covid['abstract'] = df_covid['abstract'].apply(
    lambda x: re.sub('[^a-zA-z0-9\s]', '', x), meta=('abstract', 'str'))
df_covid['body_text'] = df_covid['body_text'].apply(
    lambda x: re.sub('[^a-zA-z0-9\s]', '', x), meta=('body_text', 'str'))

df_covid['abstract'] = df_covid['abstract'].apply(
    lambda x: lower_case(x), meta=('abstract', 'str'))
df_covid['body_text'] = df_covid['body_text'].apply(
    lambda x: lower_case(x), meta=('body_text', 'str'))

df_covid.head()


Unnamed: 0,paper_id,abstract,body_text
0,PMC7286226,due to the cellular entry of the novel coronav...,currently the novel coronavirus disease covid1...
1,PMC7239191,,brthis editorial refers to cell typespecific e...
2,PMC4653074,enhancing the response to interferon could off...,the interferon signaling pathway is considered...
3,PMC7527265,background european member states the european...,crossborder outbreaks demonstrate that in an i...
4,PMC8210694,objective a key challenge towards a successful...,covid19 is an infectious disease that is cause...


In [10]:
df_covid.to_csv(f'{data_path}/df_covid_preprocessed.csv', single_file=True, compute=True, index=False)

df_covid.head()


Unnamed: 0,paper_id,abstract,body_text
0,PMC7286226,due to the cellular entry of the novel coronav...,currently the novel coronavirus disease covid1...
1,PMC7239191,,brthis editorial refers to cell typespecific e...
2,PMC4653074,enhancing the response to interferon could off...,the interferon signaling pathway is considered...
3,PMC7527265,background european member states the european...,crossborder outbreaks demonstrate that in an i...
4,PMC8210694,objective a key challenge towards a successful...,covid19 is an infectious disease that is cause...


In [26]:
import dask.dataframe as dd

df_covid = dd.read_csv(f'{data_path}/df_covid_preprocessed.csv').set_index('paper_id')

# df_covid = df_covid.drop(
#     ["Unnamed: 0", "authors", "journal"], axis=1)


df_covid['body_text'] = df_covid['body_text'].astype(str)
df_covid['abstract'] = df_covid['abstract'].astype(str)

df_covid.head()


Unnamed: 0_level_0,abstract,body_text
paper_id,Unnamed: 1_level_1,Unnamed: 2_level_1
PMC1054884,viral recombination can dramatically impact ev...,as increasing numbers of fulllength viral sequ...
PMC1110908,a wide range of rna viruses use programmed 1 r...,severe acute respiratory syndrome sars first a...
PMC1149493,just as proteins form distinct structural moti...,hepatitis delta virus hdv is a satellite virus...
PMC1160249,genstyle is a workspace designed for the char...,a great number of dna sequences are now availa...
PMC1324949,background tuberculosis which is caused by myc...,onethird of the worlds population is estimated...


In [27]:
import numpy as np

df_covid['body_text'] = df_covid['body_text'].apply(
    lambda x: x.split('<br>'), meta=('abstract', 'str'))

abstract_df = df_covid.drop(['body_text'], axis=1).replace('nan', np.nan).dropna(subset=['abstract'])
df_sentences = abstract_df.rename(columns={'abstract': 'paragraph'})

body_text_df = df_covid.drop(['abstract'], axis=1).rename(columns={'body_text': 'paragraph'})
df_sentences = df_sentences.append(body_text_df.explode('paragraph'))

# df_sentences = df_sentences.replace('NaN', np.nan).dropna(subset=['paragraph'])

df_sentences.head()


Unnamed: 0_level_0,paragraph
paper_id,Unnamed: 1_level_1
PMC1054884,viral recombination can dramatically impact ev...
PMC1110908,a wide range of rna viruses use programmed 1 r...
PMC1149493,just as proteins form distinct structural moti...
PMC1160249,genstyle is a workspace designed for the char...
PMC1324949,background tuberculosis which is caused by myc...


In [28]:
df_sentences.to_csv(f'{data_path}/covid_sentences.csv', single_file=True, compute=True)

df_sentences.head()


Unnamed: 0_level_0,paragraph
paper_id,Unnamed: 1_level_1
PMC1054884,viral recombination can dramatically impact ev...
PMC1110908,a wide range of rna viruses use programmed 1 r...
PMC1149493,just as proteins form distinct structural moti...
PMC1160249,genstyle is a workspace designed for the char...
PMC1324949,background tuberculosis which is caused by myc...


In [None]:
# text_dict = text.to_dict()
# len_text = len(text_dict["paper_id"])


In [None]:
# paper_id_list = []
# body_text_list = []

# title_list = []
# abstract_list = []
# abstract_summary_list = []
# for i in tqdm(range(0, len_text)):
#     paper_id = text_dict["paper_id"][i]
#     body_text = text_dict["body_text"][i].split("<br>")
#     title = text_dict["title"][i]
#     abstract = text_dict["abstract"][i]
#     abstract_summary = text_dict["abstract_summary"][i]
#     for b in body_text:
#         paper_id_list.append(paper_id)
#         body_text_list.append(b)
#         title_list.append(title)
#         abstract_list.append(abstract)
#         abstract_summary_list.append(abstract_summary)


In [None]:
# df_sentences = pd.DataFrame({"paper_id": paper_id_list}, index=body_text_list)
# df_sentences.to_csv(f'{root_path}/covid_sentences_body.csv')
# df_sentences.head()


In [None]:
# from dask import dataframe as dd

# df_sentences = pd.DataFrame({"paper_id": paper_id_list, "title": title_list,
#                             "abstract": abstract_list, "abstract_summary": abstract_summary_list}, index=body_text_list)
# df_sentences = dd.from_pandas(df_sentences)
# df_sentences.to_csv(f'{root_path}/covid_sentences.csv')
# df_sentences.head()


In [46]:
from dask import dataframe as dd

df_sentences = dd.read_csv(f'{data_path}/covid_sentences.csv', blocksize=32e6).set_index('paper_id')#.rename(columns={'Unnamed: 0': 'index'})

df_sentences.head()


Unnamed: 0_level_0,paragraph
paper_id,Unnamed: 1_level_1
PMC1054884,viral recombination can dramatically impact ev...
PMC1054884,as increasing numbers of fulllength viral sequ...
PMC1110908,a wide range of rna viruses use programmed 1 r...
PMC1110908,severe acute respiratory syndrome sars first a...
PMC1149493,just as proteins form distinct structural moti...


In [None]:
# df_sentences = df_sentences.set_index("Unnamed: 0")

# df_sentences.head()


In [None]:
# df_covid.to_csv(f'{root_path}/covid_sentences.csv', single_file=True, compute=True)


In [None]:
# df_sentences = df_sentences["paper_id"].to_dict()
# df_sentences_list = list(df_sentences.keys())
# len(df_sentences_list)


In [None]:
# list(df_sentences.keys())[:5]


In [None]:
# df_sentences_list = [str(d) for d in tqdm(df_sentences_list)]


In [30]:
# https://github.com/UKPLab/sentence-transformers/blob/master/examples/application_semantic_search.py
"""
This is a simple application for sentence embeddings: semantic search
We have a corpus with various sentences. Then, for a given query sentence,
we want to find the most similar sentence in this corpus.
This script outputs for various queries the top 5 most similar sentences in the corpus.
"""

import torch
from sentence_transformers import SentenceTransformer

# embedder = SentenceTransformer('multi-qa-MiniLM-L6-cos-v1')
embedder = SentenceTransformer(f'{all_data_path}/models/pretrained/')


In [None]:
# embedder.save(f'{root_path}/models/pretrained/', 'multi-qa-MiniLM-L6-cos-v1')

In [51]:
import os

# Corpus with example sentences
corpus = df_sentences['paragraph']

# corpus.compute()[1]

if not os.path.exists(f'{data_path}/corpus_embeddings.npy'):
    corpus_embeddings = embedder.encode(
        corpus.compute(), device='cuda', show_progress_bar=True)

    torch.save(corpus_embeddings, f'{data_path}/corpus_embeddings.npy')



Batches: 100%|██████████| 720/720 [03:28<00:00,  3.46it/s]


In [None]:
# import pandas as pd

# df = pd.read_csv(f'{root_path}/covid_sentences.csv', index_col=0)
# df.head()

In [62]:
# NOT SCALABLE
df_sentences = df_sentences.compute()

# df_sentences.iloc[5]

paragraph    hepatitis delta virus hdv is a satellite virus...
Name: PMC1149493, dtype: object

In [None]:
test = corpus.loc[corpus['ind'] == 0].compute()

test

In [56]:
import scipy.spatial
import torch

corpus_embeddings = torch.load(f'{data_path}/corpus_embeddings.npy')

# Query sentences:
queries = ['What has been published about medical care?',
           'Knowledge of the frequency, manifestations, and course of extrapulmonary manifestations of COVID-19, including, but not limited to, possible cardiomyopathy and cardiac arrest',
           'Use of AI in real-time health care delivery to evaluate interventions, risk factors, and outcomes in a way that could not be done manually',
           'Resources to support skilled nursing facilities and long term care facilities.',
           'Mobilization of surge medical staff to address shortages in overwhelmed communities .',
           'Age-adjusted mortality data for Acute Respiratory Distress Syndrome (ARDS) with/without other organ failure – particularly for viral etiologies .']
query_embeddings = embedder.encode(queries, device='cuda', show_progress_bar=True)



Batches: 100%|██████████| 1/1 [00:00<00:00, 46.42it/s]


In [67]:
def get_corpus_row(idx):
    row = df_sentences.iloc[idx]

    return row

# Find the closest 5 sentences of the corpus for each query sentence based on cosine similarity
closest_n = 5
print("\nTop 5 most similar sentences in corpus:")
for query, query_embedding in zip(queries, query_embeddings):
    distances = scipy.spatial.distance.cdist(
        [query_embedding], corpus_embeddings, "cosine")[0]

    results = zip(range(len(distances)), distances)
    results = sorted(results, key=lambda x: x[1])

    print("\n\n=========================================================")
    print("==========================Query==============================")
    print("===", query, "=====")
    print("=========================================================")

    for idx, distance in results[0:closest_n]:
        print("Score:   ", "(Score: %.4f)" % (1-distance), "\n")
        print("Article Index:   ", idx, "\n")
        print("Paragraph:   ", get_corpus_row(idx)['paragraph'][:150], "\n")
        # row_dict = df_sentences.loc[df_sentences.index ==
        #                             corpus.loc[idx]].to_dict()
        # print("paper_id:  ", row_dict["paper_id"][corpus[idx]], "\n")
        # print("Title:  ", row_dict["title"][corpus[idx]], "\n")
        # print("Abstract:  ", row_dict["abstract"][corpus[idx]], "\n")
        # print("Abstract_Summary:  ",
        #       row_dict["abstract_summary"][corpus[idx]], "\n")
        print("-------------------------------------------")



Top 5 most similar sentences in corpus:


=== What has been published about medical care? =====
Score:    (Score: 0.6148) 

Article Index:    9078 

Paragraph:    the mass availability and use of mobile health mhealth technology provide a significant potential for such technologies to be integrated into clinical 

-------------------------------------------
Score:    (Score: 0.5991) 

Article Index:    15993 

Paragraph:    in india assaults on medical workforce such as doctors have been happening for a long time the covid19 epidemic brought the violence against doctors o 

-------------------------------------------
Score:    (Score: 0.5728) 

Article Index:    16396 

Paragraph:    patient experience of nursing care is correlated with safety clinical effectiveness care quality treatment outcomes including mortality and overall se 

-------------------------------------------
Score:    (Score: 0.5612) 

Article Index:    7913 

Paragraph:    dear editorbrfirstly we would like to than

In [None]:
import torch

torch.cuda.is_available()

torch.cuda.current_device()

In [None]:
"""
TODO: make a custom file (seekable) containing a list of embedings and it's corresponding paper id
if possible: make it a data structure where finding the knn is < n^2, possibilities are quad-trees 
if possible: make it cuda optimized
"""