In [1]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' 

import warnings
warnings.filterwarnings('ignore')

import re
import json
import random
import sent2vec

import numpy as np
import pandas as pd
import networkx as nx
import tqdm.notebook as tqdm

from rank_bm25 import BM25Okapi

from collections import defaultdict

from transformers import pipeline
from transformers import BertTokenizer
from transformers import BertForQuestionAnswering
from transformers import T5Tokenizer
from transformers import T5ForConditionalGeneration

from nltk import word_tokenize
from spacy.lang.en.stop_words import STOP_WORDS

from sentence_splitter import SentenceSplitter 
from sentence_splitter import split_text_into_sentences

In [2]:
biosentvec = sent2vec.Sent2vecModel()
biosentvec.load_model('bio_sent_vec.file')

In [3]:
t5_tokenizer = T5Tokenizer.from_pretrained('t5-large')
t5 = T5ForConditionalGeneration.from_pretrained('t5-large')

In [4]:
bertqa = pipeline('question-answering', model = 'ptnv-s/biobert_squad2_cased-finetuned-squad')

In [5]:
directory = 'dataset/cord19/pdf_json'
article_names = os.listdir(directory)

random.seed(1811)
article_names = random.sample(article_names, 1000)

print(len(article_names))

1000


In [6]:
stopwords = set(STOP_WORDS)

custom_stop_words = [
    'doi', 'preprint', 'copyright', 'peer', 'reviewed', 'org', 'https', 'et', 'al', 'author', 'figure', 
    'rights', 'reserved', 'permission', 'used', 'using', 'biorxiv', 'medrxiv', 'license', 'fig', 'fig.', 
    'al.', 'Elsevier', 'PMC', 'CZI','table'
]

stopwords |= set(custom_stop_words)

stopwords = set([word.lower() for word in stopwords])

splitter = SentenceSplitter(language = 'en')

def clean(text):
    text = text.split()
    text = ' '.join(text)
    text = text.lower()
    text = text.strip()
    return text

def remove_stopwords(text):
    filtered_text = []
    for word in word_tokenize(text):
        if word not in stopwords:
            filtered_text.append(word)
    filtered_text = ' '.join(filtered_text)
    return filtered_text

def further_preprocess(sent):
    sent = clean(sent)
    sent = re.sub('[^a-z ]+', '', sent)
    sent = clean(sent)
    sent = remove_stopwords(sent)
    sent = clean(sent)
    return sent

In [7]:
sents = []

for article_name in tqdm.tqdm(article_names):
    
    text = ''
    
    file = json.load(open(directory + '/' + article_name))

    for abstract_content in file['abstract']:
        text += ' ' + abstract_content['text']

    for body_content in file['body_text']:
        text += ' ' + body_content['text']

    for ref_content in file['ref_entries'].values():
        text += ' ' + ref_content['text']
    
    sentences = splitter.split(text = text)
    
    for sent in sentences:
        sent = clean(sent)
        if len(sent) <= 200:
            sents.append(sent)

print(len(sents))

  0%|          | 0/1000 [00:00<?, ?it/s]

140152


In [214]:
query = 'common symptoms of covid 19'
# query = 'How to mitigate COVID transmission?'
# query = 'various assistance programmes for covid 19'


query_copy = query
query = clean(query)
query = further_preprocess(query)

In [215]:
tokenized_sents = []

for i in range(len(sents)):
    sent = sents[i]
    sent = further_preprocess(sent)
    tokenized_sents.append(sent.split())

bm25 = BM25Okapi(tokenized_sents)
bm_scores = bm25.get_scores(query.split())
bm_scores = np.array(bm_scores)
bm_scores /= np.sum(bm_scores ** 2) ** 0.5

In [216]:
def cosine(a, b):
    return np.sum(a * b) / ((np.sum(a ** 2) ** 0.5) * (np.sum(b ** 2) ** 0.5))

In [217]:
query_emb = biosentvec.embed_sentences([query])[0]

bioemb = biosentvec.embed_sentences(sents)
bioemb_scores = []

for i in range(len(sents)):
    if np.sum(bioemb[i]):
        bioemb_scores.append(cosine(query_emb, bioemb[i]))
    else:
        bioemb_scores.append(0)
bioemb_scores = np.array(bioemb_scores)
bioemb_scores /= np.sum(bioemb_scores ** 2) ** 0.5

In [218]:
tmp = []
for i in range(len(sents)):
    tmp.append([bm_scores[i] + bioemb_scores[i], i])
tmp.sort(reverse = True)

ranklist = []
for [score, index] in tmp:
    ranklist.append(index)

rel_sents = []
for i in ranklist[: 500]:
    rel_sents.append(sents[i])

In [219]:
tokenized_sents = []

for i in range(len(rel_sents)):
    sent = rel_sents[i]
    tokenized_sents.append(sent.split())

bm25 = BM25Okapi(tokenized_sents)
bm_scores = bm25.get_scores(query.split())
bm_scores = np.array(bm_scores)
bm_scores /= np.sum(bm_scores ** 2) ** 0.5

In [220]:
bioemb = biosentvec.embed_sentences(rel_sents)
bioemb_scores = []

for i in range(len(rel_sents)):
    if np.sum(bioemb[i]):
        bioemb_scores.append(cosine(query_emb, bioemb[i]))
    else:
        bioemb_scores.append(0)
bioemb_scores = np.array(bioemb_scores)
bioemb_scores /= np.sum(bioemb_scores ** 2) ** 0.5

In [221]:
adjList = defaultdict(set)
for i in tqdm.tqdm(range(len(rel_sents))):
    for j in range(i + 1, len(rel_sents)):
        if np.sum(bioemb[i]) and np.sum(bioemb[j]) and cosine(bioemb[i], bioemb[j]) >= 0.1:
            if cosine(bioemb[i], query_emb) >= 0.1 and cosine(bioemb[j], query_emb) >= 0.1:
                adjList[i].add(j)
                adjList[j].add(i)

G = nx.Graph()
G.add_nodes_from(list(range(len(rel_sents))))

for i in adjList.keys():
    for j in adjList[i]:
        G.add_edge(i, j)

pagerank = nx.pagerank(G)
pr_scores = [0] * len(rel_sents)
for key, val in pagerank.items():
    pr_scores[key] = val
pr_scores = np.array(pr_scores)
pr_scores /= np.sum(pr_scores ** 2) ** 0.5

  0%|          | 0/500 [00:00<?, ?it/s]

In [222]:
tmp = []
for i in range(len(rel_sents)):
    tmp.append([bm_scores[i] + bioemb_scores[i] + pr_scores[i], i])
tmp.sort(reverse = True)

ranklist = []
for [score, index] in tmp:
    ranklist.append(index)

final_sents = []
for i in ranklist[: 10]:
    final_sents.append(rel_sents[i])

for index, sent in enumerate(final_sents):
    print(str(index + 1) + ')', sent)

1) some of the family members experienced long covid symptoms and challenging recoveries.
2) if the lus is normal despite 3-5 days of symptoms then covid is very unlikely (but not this article is protected by copyright.
3) sanders et al. reported a young man with end stage renal disease on hemodialysis catheter presenting with a febrile illness and symptoms of covid 19 along with new-onset systolic murmur.
4) i know i'm going to need increased medical support, definitely because of the long covid symptoms.
5) the most common symptoms are fever, cough, and difficulty breathing.
6) the most common symptoms of covid-19 are fever, cough, and tiredness.
7) most common symptoms were fever (42.1%, n=24) and cough (52,6%, n=30).
8) one of the most common symptoms associated with covid-19 is fever.
9) the most common symptoms of covid-19 are fever, dry cough, and malaise/fatigue.
10) covid screening for that visitor depends on resource availability and symptoms, and the management of covid-posi

In [223]:
t5_text = 'summarize: ' + ' '.join(final_sents)
t5_text = t5_tokenizer.encode(t5_text, return_tensors = 'pt')
summary_ids = t5.generate(t5_text, num_beams = 3, min_length = 10, max_length = 60)
summary = t5_tokenizer.decode(summary_ids[0], skip_special_tokens = True)

print (summary)

the most common symptoms of covid-19 are fever, cough, and tiredness. the most common symptoms of covid-19 are fever, dry cough, and malaise/fatigue. covid screening for that visitor depends on resource availability and symptoms.


In [224]:
output = bertqa(question = query_copy, context = summary)
print(output['answer'])

fever, cough, and tiredness
