***This notebook is intended for the development of a future feature on the website related to a centralized research repository with search.***

## Import Packages

In [1]:
import pandas as pd
from os import listdir
from os.path import isfile, join
import json
from pymongo import MongoClient
from tqdm import tqdm_notebook as tqdm
from nltk.tokenize import word_tokenize
import gensim
from pprint import pprint
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import spacy
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

## Connect to MongoDB

In [30]:
client = MongoClient('localhost',27017)
db = client['covid']
studies = db.studies 
doc_path = '/Users/<user>/Library/Mobile Documents/com~apple~CloudDocs/CORD-19-research-challenge'
metadata = pd.read_csv(doc_path + '/metadata.csv')

In [113]:
# display metadata
metadata.head()

Unnamed: 0,cord_uid,sha,source_x,title,doi,pmcid,pubmed_id,license,abstract,publish_time,authors,journal,Microsoft Academic Paper ID,WHO #Covidence,has_pdf_parse,has_pmc_xml_parse,full_text_file,url
0,xqhn0vbp,1e1286db212100993d03cc22374b624f7caee956,PMC,Airborne rhinovirus detection and effect of ul...,10.1186/1471-2458-3-5,PMC140314,12525263.0,no-cc,"BACKGROUND: Rhinovirus, the most common cause ...",2003-01-13,"Myatt, Theodore A; Johnston, Sebastian L; Rudn...",BMC Public Health,,,True,True,custom_license,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC1...
1,gi6uaa83,8ae137c8da1607b3a8e4c946c07ca8bda67f88ac,PMC,Discovering human history from stomach bacteria,10.1186/gb-2003-4-5-213,PMC156578,12734001.0,no-cc,Recent analyses of human pathogens have reveal...,2003-04-28,"Disotell, Todd R",Genome Biol,,,True,True,custom_license,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC1...
2,le0ogx1s,,PMC,A new recruit for the army of the men of death,10.1186/gb-2003-4-7-113,PMC193621,12844350.0,no-cc,"The army of the men of death, in John Bunyan's...",2003-06-27,"Petsko, Gregory A",Genome Biol,,,False,True,custom_license,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC1...
3,fy4w7xz8,0104f6ceccf92ae8567a0102f89cbb976969a774,PMC,Association of HLA class I with severe acute r...,10.1186/1471-2350-4-9,PMC212558,12969506.0,no-cc,BACKGROUND: The human leukocyte antigen (HLA) ...,2003-09-12,"Lin, Marie; Tseng, Hsiang-Kuang; Trejaut, Jean...",BMC Med Genet,,,True,True,custom_license,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2...
4,0qaoam29,5b68a553a7cbbea13472721cd1ad617d42b40c26,PMC,A double epidemic model for the SARS propagation,10.1186/1471-2334-3-19,PMC222908,12964944.0,no-cc,BACKGROUND: An epidemic of a Severe Acute Resp...,2003-09-10,"Ng, Tuen Wai; Turinici, Gabriel; Danchin, Antoine",BMC Infect Dis,,,True,True,custom_license,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2...


In [115]:
metadata.loc[metadata['sha'] == 'ff766484e50701bbb36ed45d3a4464d817a471ff']['abstract'].iloc[0]

Unnamed: 0,cord_uid,sha,source_x,title,doi,pmcid,pubmed_id,license,abstract,publish_time,authors,journal,Microsoft Academic Paper ID,WHO #Covidence,has_pdf_parse,has_pmc_xml_parse,full_text_file,url
40896,nl37p9vp,ff766484e50701bbb36ed45d3a4464d817a471ff,Elsevier,Middle East Respiratory Syndrome Interpreted: ...,10.1016/j.amjmed.2016.04.030,PMC7124269,27215907.0,els-covid,,2016-09-30,"Lau, Susanna K.P.; Chan, Jasper F.W.; Hung, Iv...",The American Journal of Medicine,,,True,False,custom_license,https://doi.org/10.1016/j.amjmed.2016.04.030


In [70]:
# define paths for different sources
comm_pmc_json = doc_path + '/comm_use_subset/comm_use_subset/pmc_json/'
comm_pdf_json = doc_path + '/comm_use_subset/comm_use_subset/pdf_json/'
custom_pmc_json = doc_path + '/custom_license/custom_license/pmc_json/'
custom_pdf_json = doc_path + '/custom_license/custom_license/pdf_json/'

In [71]:
# list all json files available for each folder
comm_pmc_json_files = [f for f in listdir(comm_pmc_json) if isfile(join(comm_pmc_json, f))]
comm_pdf_json_files = [f for f in listdir(comm_pdf_json) if isfile(join(comm_pdf_json, f))]
custom_pmc_json_files = [f for f in listdir(custom_pmc_json) if isfile(join(custom_pmc_json, f))]
custom_pdf_json_files = [f for f in listdir(custom_pdf_json) if isfile(join(custom_pdf_json, f))]

In [8]:
len(comm_pdf_json_files)

9524

In [72]:
len(comm_pmc_json_files)

9148

In [10]:
len(custom_pdf_json_files)

26505

In [11]:
len(custom_pmc_json_files)

7797

In [None]:
def find_abstract(research_id):
    ''' Helper function to aggregate abstract and metadata '''
    if research_id is not None:
        if 'PMC' in research_id:
            try:
                abstract = metadata.loc[metadata['pmcid'] == research_id]['abstract'].iloc[0]
            except:
                abstract = None
        else:
            try: 
                abstract = metadata.loc[metadata['sha'] == research_id]['abstract'].iloc[0]
            except:
                abstract = None
    else:
        abstract = None
    return abstract

In [82]:
def load_files(files_list, files_path):
    ''' Helpfer function to load mongodb files '''
    for orig_file in tqdm(files_list):
        with open((files_path + orig_file), 'r') as file:
            try:
                file = json.load(file)
            except:
                print('Warning! File not read:', orig_file)
            res = {}
            try:
                res['paper_id'] = file['paper_id'] 
            except:
                res['paper_id'] = None

            try:
                res['abstract'] = file['abstract'][0]['text']
            except:
                res['abstract'] = None

            try: 
                res['title'] = file['metadata']['title']
            except:
                res['title'] = None

            try:
                res['authors'] = file['metadata']['authors']
            except:
                res['authors'] = None

            try:
                res['other_metadata'] = file['metadata']
            except:
                res['other_metadata'] = None

            try:
                res['other_abstract'] = file['abstract']
            except:
                res['other_abstract'] = None

            full_text = ''
            try:
                for i in file['body_text']:
                    full_text += i['text']

            except:
                full_text = None

            try:
                res['body'] = full_text
            except:
                res['body'] = None
            try:
                studies.insert_one(res, bypass_document_validation=True)
            except:
                print('Warning! File not inserted:', orig_file)

In [83]:
load_files(custom_pdf_json_files, custom_pdf_json)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  


HBox(children=(FloatProgress(value=0.0, max=26505.0), HTML(value='')))




In [84]:
load_files(custom_pmc_json_files, custom_pmc_json)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  


HBox(children=(FloatProgress(value=0.0, max=7797.0), HTML(value='')))




In [97]:
# load full data from mongodb
full_data = pd.DataFrame(list(studies.find(
    {}, {'_id': 0, 'authors': 0, 'other_metadata': 0, 'other_abstract': 0})))
full_data.drop('abstract', axis=1).to_csv('./all_research.csv', index=False)

In [2]:
# do some data cleaning
full_data = pd.read_csv('./all_research.csv')
full_data = full_data.drop_duplicates(['title', 'new_abstract'], keep='first')
full_data = full_data.dropna()

In [3]:
# get the first 5000 articles for development
subset = full_data.head(5000)
# convert to lower case
subset['lower_body'] = subset['body'].str.lower()
# tokenize body of research articles
subset['lower_body_tokens'] = subset['lower_body'].apply(lambda x: word_tokenize(str(x)))
# remove non alphanumeric characters
subset['lower_body_tokens'] = subset['lower_body_tokens'].apply(lambda x: [w for w in x if w.isalpha()])

In [None]:
stop_words = stopwords.words('english')
def remove_stopwords(word_tokens):
    ''' helped function to remove stopwords '''
    return [w for w in word_tokens if not w in stop_words] 

In [None]:
subset['lower_body_tokens'] = subset['lower_body_tokens'].apply(lambda x: remove_stopwords(x))

In [None]:
all_body = list(subset['lower_body_tokens'])
model = gensim.models.Word2Vec(all_body, size=100, window=5, min_count=1)
word_vectors = model.wv

In [25]:
model.most_similar(positive='coronavirus')

  """Entry point for launching an IPython kernel.


[('cov', 0.8944141268730164),
 ('norovirus', 0.8365249633789062),
 ('picornavirus', 0.8242976665496826),
 ('adenovirus', 0.8122658729553223),
 ('paramyxovirus', 0.8113682866096497),
 ('coxsackievirus', 0.8097898960113525),
 ('enterovirus', 0.809447169303894),
 ('flavivirus', 0.8092314004898071),
 ('herpesvirus', 0.8069195747375488),
 ('retroviruses', 0.8040987849235535)]

In [26]:
model.most_similar(positive='transmission')

  """Entry point for launching an IPython kernel.


[('spread', 0.8131103515625),
 ('contact', 0.7746533155441284),
 ('transmissibility', 0.7723363637924194),
 ('spillover', 0.7054035663604736),
 ('epidemic', 0.7000199556350708),
 ('transmissions', 0.6984718441963196),
 ('airborne', 0.6930473446846008),
 ('emergence', 0.68519526720047),
 ('epidemics', 0.6673940420150757),
 ('spreads', 0.6639245748519897)]

In [95]:
# define a search string
search_string = 'What do we know about the transmission of the virus?'

In [96]:
# get distances of each paper against the search string
distances = [(i[1]['paper_id'], 
              model.wmdistance(search_string, 
                               i[1]['body'])) for i in full_data.iterrows()]

  This is separate from the ipykernel package so we can avoid doing imports until


In [97]:
# get the first 5 elements
sorted(distances,key=lambda x: x[1])[:5]

[('3a8471a27ef5de09200e4d659467e203c900ae48', 2.3013047687998696),
 ('PMC7123587', 2.3067632244057754),
 ('b24beffd1af8f836c7931d9729b74078feb7e5ba', 2.384921039897632),
 ('PMC7121202', 2.4490523670813626),
 ('PMC7123263', 2.5113062362673215)]

In [104]:
# sample and see research title
full_data.loc[full_data['paper_id'] == 'PMC7121202']['title'].iloc[0]

'A Locally Transmitted Case of SARS-CoV-2 Infection in Taiwan'

In [105]:
# randomly sample and see research text
full_data.loc[full_data['paper_id'] == 'PMC7121202']['new_abstract'].iloc[0]

'On January 25, 2020, a 52-year-old woman with a history of type 2 diabetes presented with fever to an emergency department in central Taiwan. She was admitted to the hospital because of suspicion of pneumonia associated with SARS-CoV-2 infection. She had lived in Wuhan from October 21, 2019, to January 20, 2020. She returned to Taiwan from Wuhan on January 20 on an airplane. On the same day, a throat swab was obtained from another passenger on that flight; that passenger was confirmed to have the first known imported case of SARS-CoV-2 infection in Taiwan when the swab was found to be positive for the virus on January 21.'

In [6]:
# perform LDA
count_vect = CountVectorizer(max_df=0.8, min_df=2, stop_words='english')
doc_term_matrix = count_vect.fit_transform(subset['body'].values.astype('U'))

In [8]:
LDA = LatentDirichletAllocation(n_components=5, random_state=42)
LDA.partial_fit(doc_term_matrix)

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
                          evaluate_every=-1, learning_decay=0.7,
                          learning_method='batch', learning_offset=10.0,
                          max_doc_update_iter=100, max_iter=10,
                          mean_change_tol=0.001, n_components=5, n_jobs=None,
                          perp_tol=0.1, random_state=42, topic_word_prior=None,
                          total_samples=1000000.0, verbose=0)

In [9]:
doc_term_matrix

<5000x80491 sparse matrix of type '<class 'numpy.int64'>'
	with 4602534 stored elements in Compressed Sparse Row format>

In [18]:
LDA

array([ 6661, 16776,  6161, ..., 31054, 11235,  2840])

In [10]:
first_topic =LDA.components_[0]

In [11]:
for i in first_topic.argsort()[-20:]:
    print(count_vect.get_feature_names()[i])

epidemic
human
countries
control
research
infectious
surveillance
diseases
case
outbreak
influenza
information
population
public
risk
transmission
model
cases
disease
health


In [12]:
count_vect.get_feature_names()[i]

'health'

In [13]:
for i,topic in enumerate(LDA.components_):
    print(f'Top 10 words for topic #{i}:')
    print([count_vect.get_feature_names()[i] for i in topic.argsort()[-10:]])
    print('\n')

Top 10 words for topic #0:
['influenza', 'information', 'population', 'public', 'risk', 'transmission', 'model', 'cases', 'disease', 'health']


Top 10 words for topic #1:
['infections', 'positive', 'viral', 'children', 'virus', 'samples', 'clinical', 'influenza', 'respiratory', 'patients']


Top 10 words for topic #2:
['genome', 'virus', 'viral', 'viruses', 'gene', 'sequence', 'sequences', 'genes', 'et', 'al']


Top 10 words for topic #3:
['activity', 'rna', 'figure', 'proteins', 'virus', 'viral', 'expression', 'protein', 'cell', 'cells']


Top 10 words for topic #4:
['antibodies', 'viral', 'protein', 'antibody', 'ml', 'infected', 'cell', 'mice', 'virus', 'cells']




In [14]:
topic_values = LDA.transform(doc_term_matrix)
topic_values.shape

(5000, 5)

In [15]:
subset['Topic'] = topic_values.argmax(axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [16]:
subset['Topic'].unique()

array([4, 3, 2, 1, 0])

In [17]:
subset.head()

Unnamed: 0,paper_id,title,body,new_abstract,Topic
0,5e0c586f047ff909c8ed3fe171c8975a90608d08,Neutralizing antibodies against porcine epidem...,"Porcine epidemic diarrhea virus (PEDV), which ...",BACKGROUND: Porcine epidemic diarrhea virus (P...,4
1,1579fbff7af9b156c6f49fee0526e48f852ea460,A Recombinant Newcastle Disease Virus (NDV) Ex...,"Generation of rNDVs expressing S1, S2 or S pro...",Infectious bronchitis virus (IBV) causes a hig...,4
3,38aa050ad79d8a1d7022c33535255ce9d47914e5,Potent Inhibition of Junín Virus Infection by ...,Arenaviruses are enveloped RNA viruses with bi...,The new world arenavirus Junín virus (JUNV) is...,3
4,61722c462b054f36461375e96e502cbf22648c04,2 convergent Research center for emerging Viru...,"In this study, the anti-dengue activity of nic...",Dengue fever is one of the most important mosq...,3
5,7107f088cbed45d8a06a026276ccf4d602d50f10,Microglia Play a Major Role in Direct Viral-In...,Microglia are specialized macrophages of the C...,Microglia are the resident macrophage-like pop...,4
