# Parse PubMed data in order to get plain text of relevant papers

Through the URLs below, PubMed data can be obtained interactively. The idea here is to select papers based on keywords (and perhaps dates, ...) in a first query that results in PMC IDs. After that, the full text of those publications can be obtained one-by-one.

These texts can be analyzed by scispacy language models that include POS tagging for biological/scientific text.

In [3]:
import os
import pandas as pd
import scispacy
import spacy

from utils import perform_query, extract_clean, retrieve_paper


## Obtain and process data
Steps:
1. Define a query to search papers
2. Call API with that query to obtain all PMC IDs for that query
3. Read the full content of those papers one by one
4. Pre-process the full text
5. Run language model on clean text of body of paper

In [9]:
# Collect the clean text of every paper matching the query, using an on-disk
# cache (one text file per ID) so papers are not fetched again on later runs.
query = 'covid-19'
IDs = perform_query(query)

savetext = True  # Set True if clean text of paper needs to be saved to a text file
cache_dir = 'papers'  # Directory that holds the cached clean-text files

if savetext:
    # Without this, the first write below raises FileNotFoundError when the
    # cache directory has not been created yet.
    os.makedirs(cache_dir, exist_ok=True)

all_papers = []
for ID in IDs:
    fname = f'{cache_dir}/cleantext_{ID}.txt'

    if os.path.exists(fname):
        # Cached copy available: reuse it instead of retrieving the paper.
        # Explicit UTF-8 keeps non-ASCII characters intact regardless of the
        # platform's default encoding.
        # NOTE(review): cache files written earlier under a non-UTF-8 default
        # encoding may need to be regenerated.
        with open(fname, 'r', encoding='utf-8') as f:
            cleantext = f.read()
    else:
        content = retrieve_paper(ID)
        cleantext = extract_clean(content)
        if savetext:
            with open(fname, 'w', encoding='utf-8') as f:
                f.write(cleantext)

    all_papers.append(cleantext)
    

In [6]:
# Name of the pipeline package to load; it must be installed in the environment.
model_name = "en_ner_bionlp13cg_md"
nlp = spacy.load(model_name)

In [7]:
# Run the pipeline over every paper. nlp.pipe processes the texts as a batched
# stream and yields the resulting Doc objects in input order, so docs[i] still
# corresponds to all_papers[i].
docs = list(nlp.pipe(all_papers))

In [8]:
# Number of processed documents (one per entry in all_papers).
len(docs)

25

In [10]:
# List every entity found in the third document as fixed-width columns:
# entity text, start character offset, end character offset, entity label.
third_doc = docs[2]
for ent in third_doc.ents:
    print(f"{ent.text:20s}, {ent.start_char:6}, {ent.end_char:6}, {ent.label_:20s}")

Wu et al. [ ]       ,     49,     62, ORGANISM            
patients            ,    108,    116, ORGANISM            
COVID               ,    122,    127, GENE_OR_GENE_PRODUCT
pulmonary capillary ,    221,    240, TISSUE              
albumin             ,    252,    259, GENE_OR_GENE_PRODUCT
capillary           ,    266,    275, TISSUE              
COVID               ,    293,    298, GENE_OR_GENE_PRODUCT
alveolar epithelial endothelial,    333,    364, CELL                
lung                ,    384,    388, ORGAN               
edema               ,    403,    408, IMMATERIAL_ANATOMICAL_ENTITY
COVID 19            ,    562,    570, GENE_OR_GENE_PRODUCT
hepatic             ,    589,    596, ORGAN               
albumin             ,    610,    617, GENE_OR_GENE_PRODUCT
albumin             ,    697,    704, GENE_OR_GENE_PRODUCT
plasma albumin      ,    763,    777, ORGANISM_SUBSTANCE  
liver albumin       ,    865,    878, CANCER              
hepatic             ,    888,    895,

In [12]:
# Show the per-token annotations for tokens 100-109 of the third document.
for tok in docs[2][100:110]:
    annotations = (
        tok.text, tok.lemma_, tok.pos_, tok.tag_, tok.dep_,
        tok.shape_, tok.is_alpha, tok.is_stop,
    )
    # print() joins the fields with single spaces, one token per line.
    print(*annotations)

COVID-19 covid-19 NOUN NN nmod XXXX-dd False False
has have VERB VBZ aux xxx True True
not not PART RB neg xxx True True
spared spare VERB VBN ROOT xxxx True False
children child NOUN NNS dobj xxxx True False
. . PUNCT . punct . False False
Since since SCONJ IN case Xxxxx True True
March March PROPN NNP nmod Xxxxx True False
2020 2020 NUM CD nummod dddd False False
, , PUNCT , punct , False False


True