# BERT-fy CORD-19 data

Original code from https://www.kaggle.com/theamrzaki/covid-19-bert-researchpapers-semantic-search#Data-Processing

In [1]:
import glob
import json
import pandas as pd
from tqdm import tqdm

# Base directory for every input and output file used by this notebook.
# Later cells build paths as f'{root_path}/...', so the trailing slash here
# produces a doubled separator in those paths.
root_path = './../data/'


In [None]:
# Recursively collect the path of every .json file found below root_path.
all_json = glob.glob(f'{root_path}/**/*.json', recursive=True)

# Number of JSON files found.
len(all_json)


In [None]:
import os

# Keep only the metadata rows that have a parsed PMC JSON file attached and
# cache them in stripped_metadata.csv; skipped when that file already exists.

metadata_path = f'{root_path}/metadata.csv'
stripped_metadata_path = f'{root_path}/stripped_metadata.csv'

if not os.path.exists(stripped_metadata_path):
    # Read the identifier columns as text rather than letting pandas infer
    # a numeric dtype for them.
    id_columns_as_text = {
        'pubmed_id': str,
        'Microsoft Academic Paper ID': str,
        'doi': str,
    }
    meta_df = pd.read_csv(metadata_path, dtype=id_columns_as_text)

    # Rows without a 'pmc_json_files' value have no document to load.
    meta_df.dropna(subset=['pmc_json_files']).to_csv(stripped_metadata_path)


In [None]:
import subprocess
import os

# Create a shuffled subset of the stripped metadata: the CSV header line
# followed by up to 12500 randomly chosen data lines.
# NOTE(review): sampling is line-based, so a quoted CSV field containing a
# newline would be split across lines - confirm the metadata has none.

small_metadata_path = f'{root_path}/small/small_metadata.csv'
if not os.path.exists(small_metadata_path):
    stripped_source = f'{root_path}/stripped_metadata.csv'

    # The 'small' directory may not exist yet.
    os.makedirs(os.path.dirname(small_metadata_path), exist_ok=True)

    # 'with' guarantees the output file is closed even when a command fails.
    # Both commands write through the same file descriptor, so the header is
    # followed directly by the sampled rows.
    with open(small_metadata_path, 'w') as small_metadata_file:
        # Header: first line of stripped_metadata.csv.
        print(subprocess.run(
            ['head', '-n', '1', stripped_source],
            stdout=small_metadata_file, check=True))

        # Sample from the data lines only ('tail -n +2' skips the header),
        # otherwise the header could be drawn again as a data row.
        data_rows = subprocess.Popen(
            ['tail', '-n', '+2', stripped_source], stdout=subprocess.PIPE)
        try:
            print(subprocess.run(
                ['shuf', '-n', '12500'],
                stdin=data_rows.stdout, stdout=small_metadata_file,
                check=True))
        finally:
            # Release the pipe and reap the producer in every case.
            data_rows.stdout.close()
            data_rows.wait()


In [None]:
# Load the stripped metadata that the rest of the notebook works from,
# keeping the identifier columns as text.
metadata_path = f'{root_path}/stripped_metadata.csv'

meta_df = pd.read_csv(
    metadata_path,
    dtype=dict.fromkeys(
        ['pubmed_id', 'Microsoft Academic Paper ID', 'doi'], str),
)

meta_df.head()


In [None]:
import math


class Article:
    """One paper: its parsed PMC JSON combined with its metadata row(s).

    Reads the module-level ``root_path`` (data directory) and ``meta_df``
    (metadata table with ``pmcid`` and ``abstract`` columns).

    Attributes:
        paper_id: ``paper_id`` field of the JSON file, ``''`` if no pmcid.
        abstract: non-missing ``abstract`` values of the matching metadata
            rows joined by newlines, ``''`` if there are none.
        body_text: ``text`` of every ``body_text`` entry of the JSON file
            joined by newlines, ``''`` if there are none.
        metadata: rows of ``meta_df`` whose ``pmcid`` equals the given id;
            an empty dict when no pmcid was given. ``len(metadata) == 0``
            therefore always means "nothing to export".
    """

    def __init__(self, pmcid):
        """Load the article identified by ``pmcid``.

        Args:
            pmcid: PMC identifier used both as the JSON file stem and as the
                lookup key in ``meta_df``. ``None`` or NaN (a missing
                metadata cell) yields an empty article without touching
                the disk.

        Raises:
            OSError: if the JSON file for ``pmcid`` cannot be opened.
            KeyError: if the JSON document has no ``paper_id`` field.
        """
        self.paper_id = ''
        self.abstract = ''
        self.body_text = ''
        # Defined up front so every instance has the attribute, including
        # the ones that return early below.
        self.metadata = {}

        if pmcid is None or (
                not isinstance(pmcid, str) and math.isnan(pmcid)):
            return

        json_path = f"{root_path}/document_parses/pmc_json/{pmcid}.xml.json"
        # JSON interchange text is UTF-8; do not depend on the locale default.
        with open(json_path, encoding='utf-8') as file:
            content = json.load(file)

        content_metadata = meta_df.loc[meta_df['pmcid'] == pmcid]

        self.paper_id = content['paper_id']
        self.metadata = content_metadata

        # `in` on a DataFrame tests for a column of that name.
        if 'abstract' in content_metadata:
            # Skip missing abstracts instead of storing the text 'nan'.
            self.abstract = '\n'.join(
                str(entry) for entry in content_metadata['abstract']
                if not self._is_missing(entry))

        if 'body_text' in content:
            self.body_text = '\n'.join(
                entry['text'] for entry in content['body_text'])

    @staticmethod
    def _is_missing(value):
        """Return True when ``value`` is a float NaN (an empty CSV cell)."""
        return isinstance(value, float) and math.isnan(value)

    def __repr__(self):
        return f'{self.paper_id}: {self.abstract[:200]}... {self.body_text[:200]}...'


# Smoke test: build an Article from the 'pmcid' value stored under index
# label 0 of meta_df and print its joined body text.
first_row = Article(meta_df['pmcid'][0])
print(first_row.body_text)
# meta_df.iloc[0]


In [None]:
def get_breaks(content, length):
    """Insert a ``<br>`` marker roughly every ``length`` characters.

    ``content`` is split on single spaces. Word lengths are accumulated
    (spaces are not counted); as soon as the running total exceeds
    ``length`` the current word is preceded by `` <br> `` and the counter
    restarts at zero.

    Args:
        content: text to annotate.
        length: number of word characters allowed between two markers.

    Returns:
        The annotated text. Every word is emitted with a leading separator,
        so the result always starts with a space (or with `` <br> ``).
    """
    pieces = []
    total_chars = 0

    for word in content.split(' '):
        total_chars += len(word)
        if total_chars > length:
            pieces.append(" <br> " + word)
            total_chars = 0
        else:
            pieces.append(" " + word)

    # One join instead of repeated string concatenation inside the loop.
    return "".join(pieces)


In [None]:
from dask import dataframe as dd

# Accumulator for parsed articles: one list per output column. populateDict
# appends to it and saveProgress flushes it to disk.
# (No `global` statements here: at module level they have no effect.)
dict_ = {'paper_id': [], 'abstract': [], 'body_text': []}

# Empty three-column frame, wrapped as a dask DataFrame.
partial_df = dd.from_pandas(
    pd.DataFrame(dict_, columns=['paper_id', 'abstract', 'body_text']),
    npartitions=10,
)

def populateDict(content):
    """Append one article to the module-level ``dict_`` accumulator.

    Args:
        content: object exposing ``paper_id``, ``abstract``, ``body_text``
            and, when it could be matched to metadata, ``metadata``.

    Articles whose ``metadata`` is absent or empty are skipped.
    """
    # Tolerate objects that never received a `metadata` attribute (for
    # example an article built from a missing pmcid) instead of raising.
    meta_data = getattr(content, 'metadata', None)

    # No metadata, skip this paper. len() is used on purpose: the truth
    # value of a DataFrame is ambiguous and raises.
    if meta_data is None or len(meta_data) == 0:
        return

    for column in ('paper_id', 'abstract', 'body_text'):
        dict_[column].append(getattr(content, column))


In [None]:
# Write the current partial_df to df_covid.csv with the default to_csv
# settings (file overwritten, header row included). saveProgress() later
# appends to this same file with header=False.
partial_df.compute().to_csv(f'{root_path}/df_covid.csv')


def saveProgress():
    """Append the buffered articles to df_covid.csv and reset the buffer.

    Uses the module-level ``dict_`` accumulator; rows are appended without
    a header line. Afterwards ``partial_df`` is unbound and ``dict_`` is a
    fresh, empty accumulator.
    """
    global partial_df
    global dict_

    column_names = ['paper_id', 'abstract', 'body_text']
    partial_df = dd.from_pandas(
        pd.DataFrame(dict_, columns=column_names), npartitions=32)

    print('saving current progress')
    output_path = f'{root_path}/df_covid.csv'
    partial_df.compute().to_csv(output_path, mode='a', header=False)

    print('reseting partial df')
    del partial_df

    print('reseting partial dict')
    # Rebinding the name drops the old buffer.
    dict_ = {'paper_id': [], 'abstract': [], 'body_text': []}


# Parse every article listed in meta_df, flushing the accumulated rows to
# disk about one hundred times over the run to bound memory use.
total_rows = len(meta_df)
# max(1, ...) keeps the modulo valid for tables with fewer than 100 rows,
# where total_rows // 100 would be 0 and raise ZeroDivisionError.
progress_step = max(1, total_rows // 100)

for idx, entry in enumerate(meta_df['pmcid']):
    if idx % progress_step == 0:
        print(f'Processing index: {idx} of {total_rows}')
        saveProgress()

    content = Article(entry)
    populateDict(content)

# Flush whatever was collected after the last checkpoint.
saveProgress()


In [None]:
from dask import dataframe as dd
import re


def lower_case(input_str):
    """Return ``input_str`` converted to lower case."""
    return input_str.lower()


# Everything except ASCII letters, digits and whitespace is removed.
# The range is A-Z on purpose: 'A-z' would also keep the characters that
# sit between 'Z' and 'a' in ASCII ([ \ ] ^ _ `). A raw string keeps '\s'
# from being read as a string escape.
_NON_ALPHANUMERIC = re.compile(r'[^a-zA-Z0-9\s]')

df_covid = dd.read_csv(f'{root_path}/df_covid.csv', sample=1000000)

# Same cleaning for both text columns: force text, strip unwanted
# characters, then lower-case.
for column in ('abstract', 'body_text'):
    df_covid[column] = df_covid[column].astype(str)
    df_covid[column] = df_covid[column].apply(
        lambda x: _NON_ALPHANUMERIC.sub('', x), meta=(column, 'str'))
    df_covid[column] = df_covid[column].apply(
        lower_case, meta=(column, 'str'))


In [None]:
# Write df_covid to a single CSV file; compute=True runs the write
# immediately.
df_covid.to_csv(f'{root_path}/df_covid_preprocessed.csv', single_file=True, compute=True)

# Preview the first row.
df_covid.head(1)


In [None]:
import dask.dataframe as dd

df_covid = dd.read_csv(f'{root_path}/df_covid_preprocessed.csv')

# Drop bookkeeping columns, but only the ones actually present. The frame
# written by the earlier cells carries paper_id / abstract / body_text plus
# index columns, so dropping a fixed list that also names 'authors' and
# 'journal' raises KeyError.
unwanted_columns = ["authors", "journal", "Unnamed: 0", "Unnamed: 0.1"]
df_covid = df_covid.drop(
    [name for name in unwanted_columns if name in df_covid.columns], axis=1)

# Empty cells are not read back as strings; force text for the string
# operations that follow.
for column in ('body_text', 'abstract'):
    df_covid[column] = df_covid[column].astype(str)

df_covid.head(5)


In [None]:
# Split every document into its newline-separated paragraphs, then give
# each paragraph its own row (the other columns are repeated).
# `meta` describes the result of the lambda: a column named body_text that
# holds Python lists, i.e. object dtype.
df_covid['body_text'] = df_covid['body_text'].apply(
    lambda x: x.split('\n'), meta=('body_text', 'object'))

df_covid = df_covid.explode('body_text')

df_covid.head(5)


In [None]:
# Write the exploded frame (one body_text piece per row) to a single CSV
# file; compute=True runs the write immediately.
df_covid.to_csv(f'{root_path}/covid_sentences.csv', single_file=True, compute=True)

# Preview the first rows.
df_covid.head(5)


In [None]:
# text_dict = text.to_dict()
# len_text = len(text_dict["paper_id"])


In [None]:
# paper_id_list = []
# body_text_list = []

# title_list = []
# abstract_list = []
# abstract_summary_list = []
# for i in tqdm(range(0, len_text)):
#     paper_id = text_dict["paper_id"][i]
#     body_text = text_dict["body_text"][i].split("<br>")
#     title = text_dict["title"][i]
#     abstract = text_dict["abstract"][i]
#     abstract_summary = text_dict["abstract_summary"][i]
#     for b in body_text:
#         paper_id_list.append(paper_id)
#         body_text_list.append(b)
#         title_list.append(title)
#         abstract_list.append(abstract)
#         abstract_summary_list.append(abstract_summary)


In [None]:
# df_sentences = pd.DataFrame({"paper_id": paper_id_list}, index=body_text_list)
# df_sentences.to_csv(f'{root_path}/covid_sentences_body.csv')
# df_sentences.head()


In [None]:
# from dask import dataframe as dd

# df_sentences = pd.DataFrame({"paper_id": paper_id_list, "title": title_list,
#                             "abstract": abstract_list, "abstract_summary": abstract_summary_list}, index=body_text_list)
# df_sentences = dd.from_pandas(df_sentences)
# df_sentences.to_csv(f'{root_path}/covid_sentences.csv')
# df_sentences.head()


In [None]:
from dask import dataframe as dd

# Open covid_sentences.csv as a dask DataFrame. blocksize is given in bytes
# (32e6, about 32 MB per chunk, per the dask read_csv documentation).
df_sentences = dd.read_csv(f'{root_path}/covid_sentences.csv', blocksize=32e6)

# Preview the first rows.
df_sentences.head()


In [None]:
# df_sentences = df_sentences.set_index("Unnamed: 0")

# df_sentences.head()


In [None]:
# df_covid.to_csv(f'{root_path}/covid_sentences.csv', single_file=True, compute=True)


In [None]:
# df_sentences = df_sentences["paper_id"].to_dict()
# df_sentences_list = list(df_sentences.keys())
# len(df_sentences_list)


In [None]:
# list(df_sentences.keys())[:5]


In [None]:
# df_sentences_list = [str(d) for d in tqdm(df_sentences_list)]


In [2]:
# https://github.com/UKPLab/sentence-transformers/blob/master/examples/application_semantic_search.py
"""
This is a simple application for sentence embeddings: semantic search
We have a corpus with various sentences. Then, for a given query sentence,
we want to find the most similar sentence in this corpus.
This script outputs for various queries the top 5 most similar sentences in the corpus.
"""

import pickle as pkl
import torch
from sentence_transformers import SentenceTransformer

# embedder = SentenceTransformer('multi-qa-MiniLM-L6-cos-v1')
# Load the sentence-embedding model from a local directory under root_path
# (presumably a saved copy of the model named in the commented-out line
# above - confirm).
embedder = SentenceTransformer(f'{root_path}/models/pretrained/')


In [None]:
# embedder.save(f'{root_path}/models/pretrained/', 'multi-qa-MiniLM-L6-cos-v1')

In [None]:
import os

# Corpus with example sentences
corpus = df_sentences['body_text']

embeddings_path = f'{root_path}/pickles/corpus_embeddings.npy'
if not os.path.exists(embeddings_path):
    # encode() works on an in-memory sequence of strings, so materialise the
    # lazy dask column first. astype(str) guards against non-string cells
    # (e.g. empty CSV fields) while keeping one entry per row, so embedding
    # positions still line up with corpus rows.
    corpus_sentences = corpus.compute().astype(str).tolist()

    # Use the GPU when one is visible, otherwise fall back to the CPU
    # instead of failing inside encode().
    encode_device = 'cuda' if torch.cuda.is_available() else 'cpu'
    corpus_embeddings = embedder.encode(
        corpus_sentences, device=encode_device, show_progress_bar=True)

    # NOTE: torch.save pickles the object; despite the .npy suffix the file
    # has to be read back with torch.load, not numpy.load.
    os.makedirs(os.path.dirname(embeddings_path), exist_ok=True)
    torch.save(corpus_embeddings, embeddings_path)

In [None]:
# import pandas as pd

# df = pd.read_csv(f'{root_path}/covid_sentences.csv', index_col=0)
# df.head()

In [None]:
# Narrow df_sentences to the 'ind' and 'body_text' columns.
# NOTE(review): no visible cell creates an 'ind' column before
# covid_sentences.csv is written - confirm it exists in the file being read.
corpus = df_sentences[['ind', 'body_text']]

# Preview the first rows.
corpus.head()


In [None]:
# Materialise the rows of corpus whose 'ind' value equals 0.
test = corpus.loc[corpus['ind'] == 0].compute()

test

In [None]:
import scipy.spatial
import torch

# Load the precomputed corpus embeddings.
# NOTE(review): this reads small_corpus_embeddings.npy, whereas the encoding
# cell writes corpus_embeddings.npy - confirm which file is intended.
corpus_embeddings = torch.load(f'{root_path}/pickles/small_corpus_embeddings.npy')

# Query sentences:
queries = ['What has been published about medical care?',
           'Knowledge of the frequency, manifestations, and course of extrapulmonary manifestations of COVID-19, including, but not limited to, possible cardiomyopathy and cardiac arrest',
           'Use of AI in real-time health care delivery to evaluate interventions, risk factors, and outcomes in a way that could not be done manually',
           'Resources to support skilled nursing facilities and long term care facilities.',
           'Mobilization of surge medical staff to address shortages in overwhelmed communities .',
           'Age-adjusted mortality data for Acute Respiratory Distress Syndrome (ARDS) with/without other organ failure – particularly for viral etiologies .']

# Use the GPU when one is visible, otherwise fall back to the CPU instead
# of failing inside encode().
query_device = 'cuda' if torch.cuda.is_available() else 'cpu'
query_embeddings = embedder.encode(
    queries, device=query_device, show_progress_bar=True)



In [None]:
def get_corpus_row(idx):
    """Placeholder: not implemented yet, always returns ``None``."""
    return None

import heapq

# Find the closest sentences of the corpus for each query sentence based on
# cosine similarity.
closest_n = 5
# The banner follows closest_n so it cannot drift from the number of
# results actually printed.
print(f"\nTop {closest_n} most similar sentences in corpus:")
for query, query_embedding in zip(queries, query_embeddings):
    # Cosine distance from this query to every corpus embedding.
    distances = scipy.spatial.distance.cdist(
        [query_embedding], corpus_embeddings, "cosine")[0]

    # Only the closest_n best matches are needed. nsmallest returns the same
    # (index, distance) pairs, in the same order, as sorting everything and
    # slicing, without ordering the whole corpus for each query.
    results = heapq.nsmallest(
        closest_n, enumerate(distances), key=lambda pair: pair[1])

    print("\n\n=========================================================")
    print("==========================Query==============================")
    print("===", query, "=====")
    print("=========================================================")

    for idx, distance in results:
        # Cosine similarity = 1 - cosine distance.
        print("Score:   ", "(Score: %.4f)" % (1-distance), "\n")
        print("Article Index:   ", idx, "\n")
        # TODO: print the matching paragraph once get_corpus_row() is
        # implemented.
        print("-------------------------------------------")


In [None]:
import torch

# Ask PyTorch whether a CUDA device is usable.
torch.cuda.is_available()

# Index of the currently selected CUDA device.
torch.cuda.current_device()