In [3]:
#!pip install PyPDF2
#!pip install textract
#!pip install wordcloud
#!pip install nltk
#!pip install gensim
#!pip install spacy
#!pip install pyLDAvis
#!nltk.download('punkt') # one time execution
#nltk.download('stopwords') # one time execution

In [1]:
import os
import re

# DS tools
import numpy as np
import pandas as pd
import spacy

# nltk tools
import nltk
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords

from sklearn.metrics.pairwise import cosine_similarity
import networkx as nx


In [2]:
#biorxiv ,parse JSON and to csv
biorxiv_clean = pd.read_csv("data/biorxiv_clean.csv")
biorxiv_clean["text"][0]

'Introduction\n\nCOVID-19, initially observed/detected in Hubei province of China during December 2019, has since spread to all but a handful countries, causing (as of the time of writing) an estimated 855,000 infections and 42,000 deaths ( [8] , March 31st). COVID-19 has a basic reproductive number, R 0 , currently estimated in the region of 2.5 -3 [5] . Social distance and general quarantine measures can reduce R 0 temporarily, but not permanently. For R 0 = 3, left unchecked COVID-19 can be expected to infect more than 90% of our community, with 30% of the population infected at the epidemic peak. Even with significant quarantine measures in place the population will not reach "herd immunity" to this virus until 2/3 of the population has gained resistance-either through vaccination, or infection and subsequent recovery.In order to place these numbers in a concrete context, a recent survey in New Zealand indicated that the country had a total of 520 ventilator machines [7] . Given th

In [3]:
# Filter papers containing all words in list
def filter_papers_word_list(word_list):
    papers_id_list = []
    for idx, paper in biorxiv_clean.iterrows():
        if all(x in paper.text for x in word_list):
            papers_id_list.append(paper.paper_id)

    return papers_id_list

In [4]:
pd.set_option("display.max_colwidth", 100000) # Extend the display width to prevent split functions to not cover full text
biorxiv_environment = filter_papers_word_list(["coronavirus","tissue","immune","disease","gene","drug",'sars','cov'])
print("Papers containing coronavirus: ", len(biorxiv_environment))

Papers containing coronavirus:  1


In [5]:
def extract_conclusion(df, papers_id_list):
    data = df.loc[df['paper_id'].isin(papers_id_list)]
    conclusion = []
    for idx, paper in data.iterrows():
        paper_text = paper.text
        if "\nConclusion\n" in paper.text:
            conclusion.append(paper_text.split('\nConclusion\n')[1])
        else:
            conclusion.append("No Conclusion section")
    data['conclusion'] = conclusion
        
    return data

pd.reset_option('^display.', silent=True)

In [6]:
environ_trans_conclusion = extract_conclusion(biorxiv_clean, biorxiv_environment)
environ_trans_conclusion["text"]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.


99    INTRODUCTION Purpose and Research Questions\n\...
Name: text, dtype: object

In [40]:
sentences = []
for s in environ_trans_conclusion["text"]:
    sentences.append(sent_tokenize(s))

sentences = [y for x in sentences for y in x] # flatten list
print(sentences[0])

INTRODUCTION Purpose and Research Questions

The Infection Prevention & Control of the World Health Organization (WHO) Health Emergency Programme presented a query on preventing and managing COVID-19 in older adults aged 60 years and above living in long-term care facilities including privately paid for and publicly paid for settings with a 5-business day timeline.


In [28]:
!wget http://nlp.stanford.edu/data/glove.6B.zip
!unzip glove*.zip

/bin/sh: wget: command not found
unzip:  cannot find or open glove*.zip, glove*.zip.zip or glove*.zip.ZIP.

No zipfiles found.


In [41]:
# Extract word vectors
word_embeddings = {}
f = open('data/glove.6B/glove.6B.100d.txt', encoding='utf-8')
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    word_embeddings[word] = coefs
f.close()

In [42]:
# remove punctuations, numbers and special characters
clean_sentences = pd.Series(sentences).str.replace("[^a-zA-Z]", " ")

# make alphabets lowercase
clean_sentences = [s.lower() for s in clean_sentences]

#stopwords retrieval
stop_words = stopwords.words('english')

In [43]:
def remove_stopwords(sen):
    sen_new = " ".join([i for i in sen if i not in stop_words])
    return sen_new

In [44]:
# remove stopwords from the sentences
clean_sentences = [remove_stopwords(r.split()) for r in clean_sentences]
print(len(clean_sentences))

3603


In [45]:
sentence_vectors = []
for i in clean_sentences:
    if len(i) != 0:
        v = sum([word_embeddings.get(w, np.zeros((100,))) for w in i.split()])/(len(i.split())+0.001)
    else:
        v = np.zeros((100,))
    sentence_vectors.append(v)

In [46]:
# similarity matrix
sim_mat = np.zeros([len(sentences), len(sentences)])

for i in range(len(sentences)):
    for j in range(len(sentences)):
        if i != j:
            sim_mat[i][j] = cosine_similarity(sentence_vectors[i].reshape(1,100), sentence_vectors[j].reshape(1,100))[0,0]


KeyboardInterrupt: 

In [None]:
nx_graph = nx.from_numpy_array(sim_mat)
scores = nx.pagerank(nx_graph)

ranked_sentences = sorted(((scores[i],s) for i,s in enumerate(sentences)), reverse=True)

In [None]:

# Extract top 10 sentences as the summary
for i in range(5):
    print(ranked_sentences[i][1])