# BERT-fy CORD-19 data

Original code from https://www.kaggle.com/theamrzaki/covid-19-bert-researchpapers-semantic-search#Data-Processing

In [1]:
import glob
import json
import pandas as pd
from tqdm import tqdm

dataset_size = 'full'
all_data_path = './../data/'
data_path = f'./../data/{dataset_size}/'


In [None]:
all_json = glob.glob(f'{all_data_path}/**/*.json', recursive=True)

len(all_json)


In [None]:
import os

# get only rows with attached files

metadata_path = f'{all_data_path}/metadata.csv'
stripped_metadata_path = f'{all_data_path}/stripped_metadata.csv'

if not os.path.exists(stripped_metadata_path):
    meta_df = pd.read_csv(metadata_path, dtype={
        'pubmed_id': str,
        'Microsoft Academic Paper ID': str,
        'doi': str
    })

    stripped_meta_df = meta_df.dropna(subset=['pmc_json_files'])

    stripped_meta_df.to_csv(stripped_metadata_path)

    stripped_meta_df.head()

    del stripped_meta_df


In [None]:
import subprocess
import os

# create shuffled subset of metadata rows

small_metadata_path = f'{all_data_path}/small/metadata.csv'
if not os.path.exists(small_metadata_path):
    small_metadata_file = open(f'{all_data_path}/small/metadata.csv', 'w')
    # get header from metadata.csv
    print(subprocess.run(
        ['head', '-n 1', f'{all_data_path}/stripped_metadata.csv'], stdout=small_metadata_file))
    #!head -n 1 "{all_data_path}/stripped_metadata.csv" > "{all_data_path}/metadata.csv"
    # get random sample from metadata.csv
    print(subprocess.run(
        ['shuf', '-n 12500', f'{all_data_path}/stripped_metadata.csv'], stdout=small_metadata_file))
    #!shuf -n 12500 "{all_data_path}/stripped_metadata.csv" >> "{root_path}/metadata.csv"
    small_metadata_file.close()


In [None]:
import subprocess
import os

# create shuffled subset of metadata rows

full_metadata_path = f'{all_data_path}/full/metadata.csv'
if not os.path.exists(full_metadata_path):
    full_metadata_file = open(f'{all_data_path}/full/metadata.csv', 'w')
    print(subprocess.run(
        ['cat', f'{all_data_path}/stripped_metadata.csv'], stdout=full_metadata_file))
    full_metadata_file.close()

In [None]:
metadata_path = f'{data_path}/metadata.csv'

meta_df = pd.read_csv(metadata_path, dtype={
    'pubmed_id': str,
    'Microsoft Academic Paper ID': str,
    'doi': str
})

meta_df.head()


In [None]:
import math


class Article:
    def __init__(self, pmcid):

        self.paper_id = ''
        self.abstract = []
        self.body_text = []


        if not isinstance(pmcid, str) and math.isnan(pmcid):
            return

        with open(f"{all_data_path}/document_parses/pmc_json/{pmcid}.xml.json") as file:
            content = json.load(file)
            content_metadata = meta_df.loc[meta_df['pmcid'] == pmcid]

            self.paper_id = content['paper_id']
            self.abstract = []
            self.body_text = []
            self.metadata = {}

            if not content_metadata is None:
                self.metadata = content_metadata

            if 'abstract' in content_metadata:
                # Abstract
                # self.abstract.append(content_metadata['abstract'][0])
                for entry in content_metadata['abstract']:
                    self.abstract.append(str(entry))
                # print(self.abstract)
            # Body text
            if 'body_text' in content:
                for entry in content['body_text']:
                    self.body_text.append(entry['text'])

            self.abstract = '<br>'.join(self.abstract)
            self.body_text = '<br>'.join(self.body_text)

    def __repr__(self):
        return f'{self.paper_id}: {self.abstract[:200]}... {self.body_text[:200]}...'


first_row = Article(meta_df['pmcid'][0])
print(first_row.body_text)
# meta_df.iloc[0]


In [None]:
from dask import dataframe as dd

global partial_df
global dict_

dict_ = {'paper_id': [], 'abstract': [], 'body_text': []}
partial_df = pd.DataFrame(dict_, columns=[
    'paper_id', 'abstract', 'body_text'])
partial_df = dd.from_pandas(partial_df, npartitions=10)

partial_df.compute().to_csv(f'{data_path}/df_covid.csv', index=False)


In [None]:

def populateDict(content):
    # no metadata, skip this paper
    if len(content.metadata) == 0:
        return

    # print(meta_data)

    dict_['paper_id'].append(content.paper_id)
    dict_['abstract'].append(content.abstract.replace('\n', '<br>'))
    dict_['body_text'].append(content.body_text.replace('\n', '<br>'))


def saveProgress():
    global partial_df
    global dict_

    partial_df = pd.DataFrame(dict_, columns=[
        'paper_id', 'abstract', 'body_text'])
    partial_df = dd.from_pandas(partial_df, npartitions=1)

    print('saving current progress')
    partial_df.compute().to_csv(
        f'{data_path}/df_covid.csv', mode='a', header=False, index=False)

    print('reseting partial df')
    del partial_df

    print('reseting partial dict')
    del dict_
    dict_ = {'paper_id': [], 'abstract': [], 'body_text': []}


for idx, entry in enumerate(meta_df['pmcid']):
    if idx % 1000 == 0:
        print(f'Processing index: {idx} of {len(meta_df)}')
        saveProgress()

    populateDict(Article(entry))

saveProgress()


In [None]:
from dask import dataframe as dd
import re


def lower_case(input_str):
    input_str = input_str.lower()
    return input_str


df_covid = dd.read_csv(f'{data_path}/df_covid.csv')

df_covid['body_text'] = df_covid['body_text'].astype(str)
df_covid['abstract'] = df_covid['abstract'].astype(str)

df_covid['abstract'] = df_covid['abstract'].apply(
    lambda x: re.sub('[^a-zA-z0-9\s]', '', x), meta=('abstract', 'str'))
df_covid['body_text'] = df_covid['body_text'].apply(
    lambda x: re.sub('[^a-zA-z0-9\s]', '', x), meta=('body_text', 'str'))

df_covid['abstract'] = df_covid['abstract'].apply(
    lambda x: lower_case(x), meta=('abstract', 'str'))
df_covid['body_text'] = df_covid['body_text'].apply(
    lambda x: lower_case(x), meta=('body_text', 'str'))

df_covid.head()


In [None]:
df_covid.to_csv(f'{data_path}/df_covid_preprocessed.csv', single_file=True, compute=True, index=False)

df_covid.head()


In [2]:
import dask.dataframe as dd

df_covid = dd.read_csv(f'{data_path}/df_covid_preprocessed.csv')#.set_index('paper_id')

# df_covid = df_covid.drop(
#     ["Unnamed: 0", "authors", "journal"], axis=1)


df_covid['body_text'] = df_covid['body_text'].astype(str)
df_covid['abstract'] = df_covid['abstract'].astype(str)

df_covid.head()


Unnamed: 0,paper_id,abstract,body_text
0,PMC35282,objective this retrospective chart review desc...,mycoplasma pneumoniae is a common cause of upp...
1,PMC59543,inflammatory diseases of the respiratory tract...,since its discovery as a biological messenger ...
2,PMC59549,surfactant proteind spd participates in the in...,surfactant proteind spd is a member of the col...
3,PMC59574,endothelin1 et1 is a 21 amino acid peptide wit...,et1 et2 and et3 are members of a peptide famil...
4,PMC59580,respiratory syncytial virus rsv and pneumonia ...,rsv and pvm are viruses of the family paramyxo...


In [3]:
import numpy as np

df_covid['body_text'] = df_covid['body_text'].apply(
    lambda x: x.split('<br>'), meta=('abstract', 'str'))

abstract_df = df_covid.drop(['body_text'], axis=1).replace('nan', np.nan).dropna(subset=['abstract'])
df_sentences = abstract_df.rename(columns={'abstract': 'paragraph'})

body_text_df = df_covid.drop(['abstract'], axis=1).rename(columns={'body_text': 'paragraph'})
df_sentences = df_sentences.append(body_text_df.explode('paragraph'))

# df_sentences = df_sentences.replace('NaN', np.nan).dropna(subset=['paragraph'])

df_sentences.head()


Unnamed: 0,paper_id,paragraph
0,PMC35282,objective this retrospective chart review desc...
1,PMC59543,inflammatory diseases of the respiratory tract...
2,PMC59549,surfactant proteind spd participates in the in...
3,PMC59574,endothelin1 et1 is a 21 amino acid peptide wit...
4,PMC59580,respiratory syncytial virus rsv and pneumonia ...


In [4]:
df_sentences.to_csv(f'{data_path}/covid_sentences.csv', single_file=True, compute=True, index=False)

df_sentences.head()


In [None]:
# text_dict = text.to_dict()
# len_text = len(text_dict["paper_id"])


In [None]:
# paper_id_list = []
# body_text_list = []

# title_list = []
# abstract_list = []
# abstract_summary_list = []
# for i in tqdm(range(0, len_text)):
#     paper_id = text_dict["paper_id"][i]
#     body_text = text_dict["body_text"][i].split("<br>")
#     title = text_dict["title"][i]
#     abstract = text_dict["abstract"][i]
#     abstract_summary = text_dict["abstract_summary"][i]
#     for b in body_text:
#         paper_id_list.append(paper_id)
#         body_text_list.append(b)
#         title_list.append(title)
#         abstract_list.append(abstract)
#         abstract_summary_list.append(abstract_summary)


In [None]:
# df_sentences = pd.DataFrame({"paper_id": paper_id_list}, index=body_text_list)
# df_sentences.to_csv(f'{root_path}/covid_sentences_body.csv')
# df_sentences.head()


In [None]:
# from dask import dataframe as dd

# df_sentences = pd.DataFrame({"paper_id": paper_id_list, "title": title_list,
#                             "abstract": abstract_list, "abstract_summary": abstract_summary_list}, index=body_text_list)
# df_sentences = dd.from_pandas(df_sentences)
# df_sentences.to_csv(f'{root_path}/covid_sentences.csv')
# df_sentences.head()


In [2]:
from dask import dataframe as dd

df_sentences = dd.read_csv(f'{data_path}/covid_sentences.csv', blocksize=32e6)#.set_index('paper_id')#.rename(columns={'Unnamed: 0': 'index'})

df_sentences.head()


Unnamed: 0,paper_id,paragraph
0,PMC35282,objective this retrospective chart review desc...
1,PMC59543,inflammatory diseases of the respiratory tract...
2,PMC59549,surfactant proteind spd participates in the in...
3,PMC59574,endothelin1 et1 is a 21 amino acid peptide wit...
4,PMC59580,respiratory syncytial virus rsv and pneumonia ...


In [None]:
# df_sentences = df_sentences.set_index("Unnamed: 0")

# df_sentences.head()


In [None]:
# df_covid.to_csv(f'{root_path}/covid_sentences.csv', single_file=True, compute=True)


In [None]:
# df_sentences = df_sentences["paper_id"].to_dict()
# df_sentences_list = list(df_sentences.keys())
# len(df_sentences_list)


In [None]:
# list(df_sentences.keys())[:5]


In [None]:
# df_sentences_list = [str(d) for d in tqdm(df_sentences_list)]


In [3]:
# https://github.com/UKPLab/sentence-transformers/blob/master/examples/application_semantic_search.py
"""
This is a simple application for sentence embeddings: semantic search
We have a corpus with various sentences. Then, for a given query sentence,
we want to find the most similar sentence in this corpus.
This script outputs for various queries the top 5 most similar sentences in the corpus.
"""

import torch
from sentence_transformers import SentenceTransformer

# embedder = SentenceTransformer('multi-qa-MiniLM-L6-cos-v1')
embedder = SentenceTransformer(f'{all_data_path}/models/pretrained/')


In [None]:
# embedder.save(f'{root_path}/models/pretrained/', 'multi-qa-MiniLM-L6-cos-v1')

In [6]:
from collections.abc import Sequence


class SentenceList(Sequence):
    def __init__(self, csv_path):
        # Read in the file once and build a list of line offsets
        self.f_covid_sentences = open(csv_path)
        self.header_line = ""
        self.line_offset = []

        self.init_offsets()

        super().__init__()

    def init_offsets(self):
        self.header_line = self.f_covid_sentences.readline()

        offset = len(self.header_line)
        for line in self.f_covid_sentences:
            self.line_offset.append(offset)
            offset += len(line)

        self.f_covid_sentences.seek(0)
        
        # print(self.line_offset)

    def __getitem__(self, i):
        self.f_covid_sentences.seek(self.line_offset[i])
        line = self.f_covid_sentences.readline()
        # print(line)

        return ' '.join(line.split(',')[1:])

    def __len__(self):
        return len(self.line_offset)

# Let's test it:
sentence_list = SentenceList(f'{data_path}/covid_sentences.csv')


In [7]:
try:
    print(sentence_list[0])
except Exception:
    pass

objective this retrospective chart review describes the epidemiology and clinical features of 40 patients with cultureproven mycoplasma pneumoniae infections at king abdulaziz university hospital jeddah saudi arabia methods patients with positive m pneumoniae cultures from respiratory specimens from january 1997 through december 1998 were identified through the microbiology records charts of patients were reviewed results 40 patients were identified 33 825 of whom required admission most infections 925 were communityacquired the infection affected all age groups but was most common in infants 325 and preschool children 225 it occurred yearround but was most common in the fall 35 and spring 30 more than threequarters of patients 775 had comorbidities twentyfour isolates 60 were associated with pneumonia 14 35 with upper respiratory tract infections and 2 5 with bronchiolitis cough 825 fever 75 and malaise 588 were the most common symptoms and crepitations 60 and wheezes 40 were the most

In [7]:
import os

# Corpus with example sentences
# corpus = df_sentences['paragraph']

# corpus.compute()[1]

if not os.path.exists(f'{data_path}/corpus_embeddings.npy'):
    corpus_embeddings = embedder.encode(
        sentence_list, device='cuda', show_progress_bar=True)

    torch.save(corpus_embeddings, f'{data_path}/corpus_embeddings.npy')


Batches: 100%|██████████| 4306/4306 [13:52<00:00,  5.17it/s]


In [None]:
# import pandas as pd

# df = pd.read_csv(f'{root_path}/covid_sentences.csv', index_col=0)
# df.head()

In [None]:

# NOT SCALABLE
# df_sentences = df_sentences.compute()

# df_sentences.iloc[5]

In [4]:
import scipy.spatial
import torch

corpus_embeddings = torch.load(f'{data_path}/corpus_embeddings.npy')

# Query sentences:
queries = ['What has been published about medical care?',
           'Knowledge of the frequency, manifestations, and course of extrapulmonary manifestations of COVID-19, including, but not limited to, possible cardiomyopathy and cardiac arrest',
           'Use of AI in real-time health care delivery to evaluate interventions, risk factors, and outcomes in a way that could not be done manually',
           'Resources to support skilled nursing facilities and long term care facilities.',
           'Mobilization of surge medical staff to address shortages in overwhelmed communities .',
           'Age-adjusted mortality data for Acute Respiratory Distress Syndrome (ARDS) with/without other organ failure – particularly for viral etiologies .']
query_embeddings = embedder.encode(queries, device='cuda', show_progress_bar=True)



Batches: 100%|██████████| 1/1 [00:00<00:00,  6.73it/s]


In [10]:
def get_corpus_row(idx):
    row = sentence_list[idx]

    return row.split(',')

# Find the closest 5 sentences of the corpus for each query sentence based on cosine similarity
closest_n = 5
print("\nTop 5 most similar sentences in corpus:")
for query, query_embedding in zip(queries, query_embeddings):
    distances = scipy.spatial.distance.cdist(
        [query_embedding], corpus_embeddings, "cosine")[0]

    results = zip(range(len(distances)), distances)
    results = sorted(results, key=lambda x: x[1])

    print("\n\n=========================================================")
    print("==========================Query==============================")
    print("===", query, "=====")
    print("=========================================================")

    for idx, distance in results[0:closest_n]:
        print("Score:   ", "(Score: %.4f)" % (1-distance), "\n")
        print("Article Index:   ", idx, "\n")
        print("Paragraph:   ", sentence_list[idx][:150], "\n")
        # row_dict = df_sentences.loc[df_sentences.index ==
        #                             corpus.loc[idx]].to_dict()
        # print("paper_id:  ", row_dict["paper_id"][corpus[idx]], "\n")
        # print("Title:  ", row_dict["title"][corpus[idx]], "\n")
        # print("Abstract:  ", row_dict["abstract"][corpus[idx]], "\n")
        # print("Abstract_Summary:  ",
        #       row_dict["abstract_summary"][corpus[idx]], "\n")
        print("-------------------------------------------")



Top 5 most similar sentences in corpus:


=== What has been published about medical care? =====
Score:    (Score: 0.6168) 

Article Index:    74263 



IndexError: list index out of range

In [None]:
import torch

torch.cuda.is_available()

torch.cuda.current_device()

In [None]:
"""
TODO: make a custom file (seekable) containing a list of embedings and it's corresponding paper id
if possible: make it a data structure where finding the knn is < n^2, possibilities are quad-trees 
if possible: make it cuda optimized
"""