In [1]:
from typing import List, Union
from paperscraper.pubmed import get_query_from_keywords_and_date
from paperscraper.pubmed import get_pubmed_papers


def search_pubmed_papers(
    keywords: List[Union[str, List[str]]]
    = [['SARS-CoV-2', 'COVID-19', 'coronavirus', 'SARS-CoV', 'MERS-CoV',
        'SARS'],
        ['antibody', 'antibodies', 'nanobody', 'immunoglobulin', 'MAb',
         'nanobodies'],
        ['neutralizing', 'neutralize', 'neutralization', 'bind', 'binding',
         'inhibit', 'targeting'],
        ['heavy chain', 'complementarity determining region', 'gene',
         'epitope', 'receptor-binding domain', 'rbd', 'spike protein', 'VHH']],
    fields: List = ["title", "authors", "date", "abstract", "journal", "doi"],
    start_date: str = "None",
    end_date: str = "None",
    *args,
    **kwargs
):
    """
    Turn keyword groups into a PubMed query and return the matching papers.

    With the default settings, just import this function and call
    search_pubmed_papers(). Nothing is written to disk here.

    Returns:
        The result of get_pubmed_papers for the generated query: a list of
        dictionaries, each containing the paper's
         ["title", "authors", "date", "abstract", "journal", "doi"]
        (or whichever ``fields`` were requested).

    Args:
        keywords (List[Union[str, List[str]]]): List of keywords to request
            pubmed API. The outer list level will be considered as AND
            separated keys, the inner level as OR separated.
        fields (List, optional): List of strings with fields to keep in output.
            Defaults to ['title', 'authors', 'date', 'abstract',
            'journal', 'doi'].
            NOTE: If 'emails' is passed, an attempt is made to extract author
            mail addresses.
        start_date (str): Start date for the search. Needs to be in format:
            YYYY/MM/DD, e.g. '2020/07/20'. Defaults to 'None', i.e. no specific
            dates are used.
        end_date (str): End date for the search. Same notation as start_date.
        *args, **kwargs: Forwarded unchanged to get_pubmed_papers.
    """
    # The date window becomes part of the query string itself.
    pubmed_query = get_query_from_keywords_and_date(
        keywords, start_date=start_date, end_date=end_date
    )
    return get_pubmed_papers(pubmed_query, fields, *args, **kwargs)


def dump_papers(papers, filepath: str) -> None:
    """
    Write every entry of ``papers`` to ``filepath``, one entry per line.

    Each entry is converted with ``str()`` and followed by a newline. For a
    dict this is its Python representation (single-quoted keys), which is not
    strict JSON even when the file is named ``.jsonl``; plain strings are
    written as they are.

    The file is always written as UTF-8 so that non-ASCII characters in
    titles, author names or abstracts do not depend on the platform's default
    encoding.

    Args:
        papers (list[dict]): List of papers (any iterable of objects works).
        filepath (str): Path to dump the papers. An existing file is
            overwritten.
    """
    with open(filepath, "w", encoding="utf-8") as f:
        f.writelines(str(paper) + "\n" for paper in papers)


def pubmed_papers_and_pt(
    keywords: List[Union[str, List[str]]]
    = [['SARS-CoV-2', 'COVID-19', 'coronavirus', 'SARS-CoV', 'MERS-CoV',
        'SARS'],
        ['antibody', 'antibodies', 'nanobody', 'immunoglobulin',
         'nanobodies'],
        ['neutralizing', 'neutralize', 'neutralization', 'bind', 'binding',
         'inhibit', 'targeting', 'neutralising', 'neutralise', 'neutralisation'],
        ['heavy chain', 'complementarity determining region', 'gene',
         'epitope', 'receptor', 'rbd', 'spike protein', 'VHH', 'domain']],
    fields: List = ["title", "authors", "date", "abstract", "journal", "doi"],
    start_date: str = "None",
    end_date: str = "None",
    txt: bool = False,
    jsonl: bool = False,
    *args,
    **kwargs
):
    """
    Search for papers and preprints on PubMed, optionally writing result files.

    Two searches are run through search_pubmed_papers: one with ``keywords``
    as given and one with the extra term 'AND preprint[pt]' appended to the
    keyword list. The two result lists are concatenated; the returned list is
    not de-duplicated.

    Returns:
        A list of dictionaries, each containing the paper's
         ["title", "authors", "date", "abstract", "journal", "doi"]
        AND/OR files containing relevant information. The "title" and "doi"
        entries are always read here, so they must be among ``fields``.

    Args:
        keywords (List[Union[str, List[str]]],optional): List of keywords to
            request pubmed API. The outer list level will be considered as AND
            separated keys, the inner level as OR separated.
        fields (List, optional): List of strings with fields to keep in output.
            Defaults to ['title', 'authors', 'date', 'abstract',
            'journal', 'doi'].
            NOTE: If 'emails' is passed, an attempt is made to extract author
            mail addresses.
        start_date (str): Start date for the search. Needs to be in format:
            YYYY/MM/DD, e.g. '2020/07/20'. Defaults to 'None', i.e. no specific
            dates are used.
        end_date (str): End date for the search. Same notation as start_date.
        txt (bool): Only when exactly ``True``, write all titles (with '['
            and ']' removed) to 'pubmed_results.txt' and one link per unique
            DOI to 'dois.txt'. Both paths are relative to the working
            directory.
        jsonl (bool): Only when exactly ``True``, write ``str()`` of every
            result, one per line, to 'pubmed_results.jsonl'.
        *args, **kwargs: Forwarded unchanged to search_pubmed_papers.
    """
    papers = search_pubmed_papers(keywords, fields, start_date, end_date,
                                  *args, **kwargs)
    # NOTE(review): the preprint filter is added as one more AND-level keyword
    # string; confirm the generated query is what PubMed expects.
    papers_pt = search_pubmed_papers(keywords+['AND preprint[pt]'],
                                     fields, start_date, end_date, *args,
                                     **kwargs)
    output = papers+papers_pt

    list_of_titles = [paper["title"] for paper in output]
    # Only the first line of a DOI field is kept. dict.fromkeys drops repeats
    # in linear time while preserving first-seen order.
    list_of_doi = list(dict.fromkeys(
        paper["doi"].split("\n")[0]
        for paper in output
        if paper["doi"] is not None
    ))

    # Files are written as UTF-8 so non-ASCII titles do not depend on the
    # platform's default encoding.
    if txt is True:
        with open('pubmed_results.txt', "w", encoding="utf-8") as f:
            for title in list_of_titles:
                cleaned = title.replace('[', '').replace(']', '')
                f.write(str(cleaned) + "\n")
        with open('dois.txt', "w", encoding="utf-8") as f:
            for doi in list_of_doi:
                f.write('https://www.ncbi.nlm.nih.gov/pmc/articles/doi/'+str(doi) + "\n")
    if jsonl is True:
        with open('pubmed_results.jsonl', "w", encoding="utf-8") as f:
            for paper in output:
                f.write(str(paper) + "\n")
    return output


if __name__ == '__main__':
    # Keyword groups that are able to retrieve the most papers in covabdab.
    # NOTE(review): these four lists are not passed to the call below, which
    # therefore runs with the function's own default keywords - confirm
    # whether [covid, antibody, interaction, extra] was meant to be used.
    covid = [
        'SARS-CoV-2', 'COVID-19', 'coronavirus', 'SARS-CoV', 'MERS-CoV', 'SARS',
    ]
    antibody = [
        'antibody', 'antibodies', 'nanobody', 'MAb', 'immunoglobulin',
        'nanobodies',
    ]
    interaction = [
        'neutralizing', 'neutralize', 'neutralization', 'bind', 'binding',
        'inhibit', 'targeting',
    ]
    extra = [
        'heavy chain', 'complementarity determining region', 'gene', 'epitope',
        'receptor-binding domain', 'rbd', 'spike protein', 'VHH',
    ]
    # Run the search and write both the text files and the .jsonl dump.
    papers_and_preprints = pubmed_papers_and_pt(txt=True, jsonl=True)


INFO:paperscraper.load_dumps:Loaded biorxiv dump with 190584 entries
INFO:paperscraper.load_dumps:Loaded chemrxiv dump with 10442 entries
INFO:paperscraper.load_dumps:Loaded medrxiv dump with 33177 entries


ChunkedEncodingError: ("Connection broken: InvalidChunkLength(got length b'', 0 bytes read)", InvalidChunkLength(got length b'', 0 bytes read))

In [31]:
# Keyword groups: outer list is AND-combined, inner lists are OR-combined.
# NOTE(review): `keywords` is defined here but the search below uses its own
# inline groups.
keywords = [['SARS-CoV-2', 'COVID-19', 'coronavirus', 'SARS-CoV', 'MERS-CoV',
        'SARS'],
        ['antibody', 'antibodies', 'nanobody', 'immunoglobulin',
         'nanobodies'],
        ['neutralizing', 'neutralize', 'neutralization', 'bind', 'binding',
         'inhibit', 'targeting', 'neutralising', 'neutralise', 'neutralisation'],
        ['heavy chain', 'complementarity determining region', 'gene',
         'epitope', 'receptor', 'rbd', 'spike protein', 'VHH', 'domain']]
#papers = get_and_dump_pubmed_papers([covid,antibody,neut,structure])

# `get_and_dump_pubmed_papers` is neither defined nor imported in this
# notebook, so the call raised NameError on a fresh run. search_pubmed_papers
# takes the same keyword list and returns the list of papers.
# `covid` is the keyword list assigned in the `__main__` section above.
papers = search_pubmed_papers([covid,
                               ['neutralizing-antibody', 'monoclonal+antibody'],
                               ['receptor+binding+domain', 'rbd'],
                               ['spike+protein', 'complementarity+determining+region', 'epitope']])
dump_papers(papers, 'results.jsonl')

list_of_titles = [paper["title"] for paper in papers]
# Keep the first line of each DOI field, dropping repeats while preserving
# first-seen order.
list_of_doi = list(dict.fromkeys(
    paper["doi"].split("\n")[0]
    for paper in papers
    if paper["doi"] is not None
))

with open('dois.txt', "w") as f:
    for doi in list_of_doi:
        f.write('https://doi.org/'+str(doi) + "\n")

# Square brackets are stripped from titles before they are written.
with open('search_results.txt', "w") as f:
    for t in list_of_titles:
        t = t.replace('[', '')
        t = t.replace(']', '')
        f.write(str(t) + "\n")






In [9]:
# Collect the "abstract" entry of every paper (missing abstracts included)
# and write them out, one per line.
list_of_abs = [paper["abstract"] for paper in papers]
dump_papers(list_of_abs, 'abs.txt')

In [34]:
with open('titles.txt') as src:
    titles = src.read().splitlines()

# Keep only the part of each line before the first " - S" and, of what
# remains, before the first " | ".
title = [raw.split(" - S")[0].split(" | ")[0] for raw in titles]

with open("titles_edited.txt", "w") as dst:
    dst.writelines(str(name) + "\n" for name in title)

In [22]:
# Cleaned reference titles, one per line.
with open('titles_edited.txt') as covab_file:
    titles_covab = covab_file.read().splitlines()

# Titles written out by the PubMed search, one per line.
with open('search_results.txt') as search_file:
    titles_search = search_file.read().splitlines()
def get_freq(data: str = "") -> dict:
    """
    Count how often each space-separated token occurs in ``data``.

    Tokens come from ``data.split(" ")``: only a single space separates
    tokens, so consecutive spaces yield empty-string tokens, and tabs,
    newlines and punctuation stay attached to their word. Counting is
    case-sensitive.

    Args:
        data (str): Text to analyse. The parameter used to default to the
            ``str`` type itself (``data = str``), a mistyped annotation; it
            now defaults to an empty string.

    Returns:
        dict: Token -> number of occurrences, in order of first appearance.
    """
    # Local import keeps this cell runnable on its own.
    from collections import Counter

    # Counter tallies in a single pass and keeps first-seen order.
    return dict(Counter(data.split(" ")))
    

{'b.1.617': 10, 'point-of-care': 10, 'efficiency': 10, 'dried': 10, 'higher': 10, 'inhibitors': 10, 'reduced': 10, 'insights': 10, 'emergence': 10, 'particle': 10, 'people': 10, 'living': 10, '6': 10, 'covid-19.sars-cov-2': 10, 'population': 10, 'sarbecovirus': 10, 'glycosylated': 10, 'golden': 10, 'p.1': 10, 'glycosylation': 10, 'confers': 10, 'antibody-mediated': 10, 'vaccine-elicited': 10, 'african': 10, 'year': 10, 'ncov-19': 10, 'over': 10, 'available': 10, 'future': 10, 'validation': 10, 'mechanism': 10, 'variable': 10, 'structures': 10, 'characteristics': 10, 'antigenicity': 10, 'transmission': 10, 'inflammatory': 10, 'vector-based': 10, 'immunosorbent': 10, 'diagnosis': 10, 'homologous': 10, 'drug': 10, 'full-length': 10, 'covid': 10, 'treat': 10, 'helper': 10, 'variation': 10, 'control': 10, 'common': 10, 'detecting': 10, 'bat': 10, 'coli': 10, 'coronavirus:': 10, 'enteric': 10, 'samples': 11, 'combination': 11, 'vaccine:': 11, 'four': 11, 'kidney': 11, 'automated': 11, 'effic

{'potently': 5, 'efficacy': 5, 'sars-cov': 5, 'variants': 5, 'highly': 5, 'recognition': 5, 'patients': 5, 'specific': 5, 'development': 5, 'mice': 6, 'severe': 6, 'therapeutic': 6, 'responses': 6, 'multiple': 6, 'nucleocapsid': 6, 'acute': 7, 'domain': 8, 'block': 8, 'mers-cov': 8, 'epitope': 8, 'infection': 8, 'nanobody': 8, 'covid-19': 9, 'nanobodies': 9, 'neutralize': 10, 'basis': 10, 'syndrome': 10, 'receptor': 10, 'respiratory': 11, 'convalescent': 11, 'protein': 13, 'targeting': 13, 'coronavirus': 16, 'binding': 20, 'neutralization': 21, 'potent': 27, 'monoclonal': 28, 'spike': 34, 'human': 44, 'antibody': 46, 'neutralizing': 58, 'antibodies': 58, 'sars-cov-2': 95}


In [23]:
# Lower-cased abstracts of the papers whose title matches a reference title
# (case-insensitive), each preceded by a single space. Every (title, paper)
# match contributes once, in reference-title order.
abs_cov = ''.join(
    ' ' + paper['abstract'].lower()
    for wanted in titles_covab
    for paper in papers
    if wanted.lower() == paper['title'].lower()
    and paper['abstract'] is not None
)

# Same concatenation over every paper that has an abstract.
# NOTE: the name `abs` shadows the builtin; it is kept because a later cell
# passes `abs` to extract_phrases.
abs = ''.join(
    ' ' + paper['abstract'].lower()
    for paper in papers
    if paper['abstract'] is not None
)

113705


In [38]:
import nltk
# Delete full stops, parentheses and commas in one pass.
abs_cov = abs_cov.translate(str.maketrans('', '', '.(),'))
# NOTE(review): full stops are removed before sentence tokenisation, which
# likely leaves very few sentence boundaries to detect - confirm this count
# is what was intended.
print(len(nltk.sent_tokenize(abs_cov)))


In [24]:
import nltk
import string
import re
from nltk.util import ngrams
from collections import Counter



def extract_phrases(text, phrase_counter, length):
    """
    Tally every ``length``-token phrase of ``text`` into ``phrase_counter``.

    The text is split into sentences with nltk.sent_tokenize and each
    sentence into tokens with nltk.word_tokenize, so a phrase never spans
    two sentences.

    Args:
        text: Text to scan.
        phrase_counter: Mapping updated in place; must accept ``+= 1`` on
            keys it has not seen yet (e.g. collections.Counter). Keys are
            the n-gram tuples produced by nltk.util.ngrams.
        length: Number of tokens per phrase.
    """
    tokenised_sentences = (
        nltk.word_tokenize(sentence) for sentence in nltk.sent_tokenize(text)
    )
    for tokens in tokenised_sentences:
        for gram in ngrams(tokens, length):
            phrase_counter[gram] += 1

# Count all 5-token phrases in the `abs` text and show the 100 most frequent.
phrase_counter = Counter()
extract_phrases(abs, phrase_counter, length=5)

most_common_phrases = phrase_counter.most_common(n=100)
print(most_common_phrases)

1


In [50]:
from typing import List, Union
from paperscraper.pubmed import get_query_from_keywords_and_date
from paperscraper.pubmed import get_pubmed_papers


def search_pubmed_papers(
    keywords: List[Union[str, List[str]]]
    = [['SARS-CoV-2', 'COVID-19', 'coronavirus', 'SARS-CoV', 'MERS-CoV',
        'SARS'],
        ['antibody', 'antibodies', 'nanobody', 'immunoglobulin', 'MAb',
         'nanobodies'],
        ['neutralizing', 'neutralize', 'neutralization', 'bind', 'binding',
         'inhibit', 'targeting'],
        ['heavy chain', 'complementarity determining region', 'gene',
         'epitope', 'receptor-binding domain', 'rbd', 'spike protein', 'VHH']],
    fields: List = ["title", "authors", "date", "abstract", "journal", "doi"],
    start_date: str = "None",
    end_date: str = "None",
    *args,
    **kwargs
):
    """
    Turn keyword groups into a PubMed query and return the matching papers.

    With the default settings, just import this function and call
    search_pubmed_papers(). The keyword list is printed before the query is
    built; nothing is written to disk here.

    Returns:
        The result of get_pubmed_papers for the generated query: a list of
        dictionaries, each containing the paper's
         ["title", "authors", "date", "abstract", "journal", "doi"]
        (or whichever ``fields`` were requested).

    Args:
        keywords (List[Union[str, List[str]]]): List of keywords to request
            pubmed API. The outer list level will be considered as AND
            separated keys, the inner level as OR separated.
        fields (List, optional): List of strings with fields to keep in output.
            Defaults to ['title', 'authors', 'date', 'abstract',
            'journal', 'doi'].
            NOTE: If 'emails' is passed, an attempt is made to extract author
            mail addresses.
        start_date (str): Start date for the search. Needs to be in format:
            YYYY/MM/DD, e.g. '2020/07/20'. Defaults to 'None', i.e. no specific
            dates are used.
        end_date (str): End date for the search. Same notation as start_date.
        *args, **kwargs: Forwarded unchanged to get_pubmed_papers.
    """
    # Echo the keyword groups so the notebook output shows what was searched.
    print(keywords)
    # The date window becomes part of the query string itself.
    pubmed_query = get_query_from_keywords_and_date(
        keywords, start_date=start_date, end_date=end_date
    )
    return get_pubmed_papers(pubmed_query, fields, *args, **kwargs)


def dump_papers(papers, filepath: str) -> None:
    """
    Write every entry of ``papers`` to ``filepath``, one entry per line.

    Each entry is converted with ``str()`` and followed by a newline. For a
    dict this is its Python representation (single-quoted keys), which is not
    strict JSON even when the file is named ``.jsonl``; plain strings are
    written as they are.

    The file is always written as UTF-8 so that non-ASCII characters in
    titles, author names or abstracts do not depend on the platform's default
    encoding.

    Args:
        papers (list[dict]): List of papers (any iterable of objects works).
        filepath (str): Path to dump the papers. An existing file is
            overwritten.
    """
    with open(filepath, "w", encoding="utf-8") as f:
        f.writelines(str(paper) + "\n" for paper in papers)


def pubmed_papers_and_pt(
    keywords: List[Union[str, List[str]]]
    = [['SARS-CoV-2', 'COVID-19', 'coronavirus', 'SARS-CoV', 'MERS-CoV',
        'SARS'],
        ['antibody', 'antibodies', 'nanobody', 'immunoglobulin', 'MAb',
         'nanobodies'],
        ['neutralizing', 'neutralize', 'neutralization', 'bind', 'binding',
         'inhibit', 'targeting'],
        ['heavy chain', 'complementarity determining region', 'gene',
         'epitope', 'receptor-binding domain', 'rbd', 'spike protein', 'VHH']],
    fields: List = ["title", "authors", "date", "abstract", "journal", "doi"],
    start_date: str = "None",
    end_date: str = "None",
    txt: bool = False,
    jsonl: bool = False,
    *args,
    **kwargs
):
    """
    Search for papers and preprints on PubMed, optionally writing result files.

    Two searches are run through search_pubmed_papers: one with ``keywords``
    as given and one with the extra term 'AND preprint[pt]' appended to the
    keyword list. The two result lists are concatenated; the returned list is
    not de-duplicated.

    Returns:
        A list of dictionaries, each containing the paper's
         ["title", "authors", "date", "abstract", "journal", "doi"]
        AND/OR files containing relevant information. The "title" and "doi"
        entries are always read here, so they must be among ``fields``.

    Args:
        keywords (List[Union[str, List[str]]],optional): List of keywords to
            request pubmed API. The outer list level will be considered as AND
            separated keys, the inner level as OR separated.
        fields (List, optional): List of strings with fields to keep in output.
            Defaults to ['title', 'authors', 'date', 'abstract',
            'journal', 'doi'].
            NOTE: If 'emails' is passed, an attempt is made to extract author
            mail addresses.
        start_date (str): Start date for the search. Needs to be in format:
            YYYY/MM/DD, e.g. '2020/07/20'. Defaults to 'None', i.e. no specific
            dates are used.
        end_date (str): End date for the search. Same notation as start_date.
        txt (bool): Only when exactly ``True``, write all titles (with '['
            and ']' removed) to 'pubmed_titles.txt' and one 'https://doi.org/'
            link per unique DOI to 'pubmed_dois.txt'. Both paths are relative
            to the working directory.
        jsonl (bool): Only when exactly ``True``, write ``str()`` of every
            result, one per line, to 'pubmed_results.jsonl'.
        *args, **kwargs: Forwarded unchanged to search_pubmed_papers.
    """
    papers = search_pubmed_papers(keywords, fields, start_date, end_date,
                                  *args, **kwargs)
    # NOTE(review): the preprint filter is added as one more AND-level keyword
    # string; confirm the generated query is what PubMed expects.
    papers_pt = search_pubmed_papers(keywords+['AND preprint[pt]'],
                                     fields, start_date, end_date, *args,
                                     **kwargs)
    output = papers+papers_pt

    list_of_titles = [paper["title"] for paper in output]
    # Only the first line of a DOI field is kept. dict.fromkeys drops repeats
    # in linear time while preserving first-seen order.
    list_of_doi = list(dict.fromkeys(
        paper["doi"].split("\n")[0]
        for paper in output
        if paper["doi"] is not None
    ))

    # Files are written as UTF-8 so non-ASCII titles do not depend on the
    # platform's default encoding.
    if txt is True:
        with open('pubmed_titles.txt', "w", encoding="utf-8") as f:
            for title in list_of_titles:
                cleaned = title.replace('[', '').replace(']', '')
                f.write(str(cleaned) + "\n")
        with open('pubmed_dois.txt', "w", encoding="utf-8") as f:
            for doi in list_of_doi:
                f.write('https://doi.org/'+str(doi) + "\n")
    if jsonl is True:
        with open('pubmed_results.jsonl', "w", encoding="utf-8") as f:
            for paper in output:
                f.write(str(paper)+ "\n")
    return output

# Run the combined paper + preprint search with the default keywords; txt=True
# and jsonl=True also write the title/DOI text files and the .jsonl dump.
pps = pubmed_papers_and_pt(txt=True, jsonl=True)

[['SARS-CoV-2', 'COVID-19', 'coronavirus', 'SARS-CoV', 'MERS-CoV', 'SARS'], ['antibody', 'antibodies', 'nanobody', 'immunoglobulin', 'MAb', 'nanobodies'], ['neutralizing', 'neutralize', 'neutralization', 'bind', 'binding', 'inhibit', 'targeting'], ['heavy chain', 'complementarity determining region', 'gene', 'epitope', 'receptor-binding domain', 'rbd', 'spike protein', 'VHH']]
[['SARS-CoV-2', 'COVID-19', 'coronavirus', 'SARS-CoV', 'MERS-CoV', 'SARS'], ['antibody', 'antibodies', 'nanobody', 'immunoglobulin', 'MAb', 'nanobodies'], ['neutralizing', 'neutralize', 'neutralization', 'bind', 'binding', 'inhibit', 'targeting'], ['heavy chain', 'complementarity determining region', 'gene', 'epitope', 'receptor-binding domain', 'rbd', 'spike protein', 'VHH'], 'AND preprint[pt]']


In [51]:
# Number of entries returned by the combined search (the list is the plain
# concatenation of both queries' results).
print(len(pps))

4364


In [None]:
# `get_and_dump_pubmed_papers` is neither defined nor imported in this
# notebook, so running this cell raised NameError. search_pubmed_papers takes
# the same keyword list and returns the list of papers.
# `covid` is the keyword list assigned in the `__main__` section of the first
# cell.
papers = search_pubmed_papers([covid,
                               ['neutralizing-antibody', 'monoclonal+antibody'],
                               ['receptor+binding+domain', 'rbd'],
                               ['spike+protein', 'complementarity+determining+region', 'epitope']])