In [1]:
import get_doi_from_pubmed as gd
import pandas as pd



In [2]:
# Keyword groups, one term per line so additions show up as one-line diffs.
# NOTE(review): presumably these are literature-search terms; their consumer is
# not visible in this part of the notebook -- confirm before renaming.

# Virus / disease names.
covid = [
    'SARS-CoV-2',
    'COVID-19',
    'coronavirus',
    'SARS-CoV',
    'MERS-CoV',
    'SARS',
]

# Antibody-like molecule names.
antibody = [
    'antibody',
    'antibodies',
    'nanobody',
    'MAb',
    'immunoglobulin',
    'nanobodies',
]

# Verbs / nouns describing an interaction.
interaction = [
    'neutralizing',
    'neutralize',
    'neutralization',
    'bind',
    'binding',
    'inhibit',
    'targeting',
]

# Additional structural / sequence-related terms.
extra = [
    'heavy chain',
    'complementarity determining region',
    'gene',
    'epitope',
    'receptor-binding domain',
    'rbd',
    'spike protein',
    'VHH',
]

In [11]:
# Call the project helper with only the `csv` flag enabled (`txt` and `jsonl`
# are switched off) and keep whatever it returns.
# NOTE(review): presumably this collects PubMed papers and preprints and the
# flags select the output format(s) -- confirm against get_doi_from_pubmed.
papers_and_preprints = gd.pubmed_papers_and_pt(txt=False, jsonl=False, csv=True)

In [4]:
class PDBChecker:
    """Check candidate PDB IDs against the PDB entry list and read entry authors.

    Attributes:
        pdbl: The Biopython ``PDBList`` client used for every PDB request.
        existing_pdbs: Dict mapping each known PDB ID to ``True``; used as an
            O(1) membership table.
    """

    def __init__(self):
        """
        First we store all the existing pdb IDs as a dictionary for O(1) lookup.
        There are 184,929 IDs as of 2021-12-8 and the retrieval using biopython takes about 7 seconds.
        For some reason, calling PDBList() creates an empty folder in the directory called "obsolete",
        but this goes away by setting the `obsolete_pdb` parameter to some random string, which I made "None".
        """
        # Imported lazily so the class can be defined without Biopython loaded.
        from Bio.PDB.PDBList import PDBList

        self.pdbl = PDBList(verbose=False, obsolete_pdb="None")
        # Kept as a dict (not a set) so existing `.get(...)` / `[...]` users keep working.
        self.existing_pdbs = dict.fromkeys(self.pdbl.get_all_entries(), True)  # takes 7 secs

    def get_actual(self, possible_pdbs: list, verbose=True) -> list:
        """
        Takes a list of possible PDB IDs as input.
        Returns a list of the actual PDB IDs, i.e. the ones from the input list that exist on the PDB database.
        Input order (and any duplicates) are preserved; matching is exact.

        If `verbose` is true, a one-line summary of the counts is printed.

        Warning: Please remember that html gobble can include actual PDB IDs by chance. So just because a possible
        PDB ID from the paper url html turns out to be an actual PDB ID (is actually on the database), does not
        mean it was meant to be written in the text of the paper.
        """
        actual_pdbs = [pdb_id for pdb_id in possible_pdbs if self.existing_pdbs.get(pdb_id, False)]
        if verbose:
            # Candidates scraped come first, confirmed IDs second (the counts
            # were previously printed the wrong way round).
            print("Out of the", len(possible_pdbs), 'possible PDB IDs scraped', len(actual_pdbs), 'are actual PDB IDs.')
        return actual_pdbs

    def get_top_authors(self, pdb_id: str, top_num=3) -> list:
        """
        Takes a PDB ID, fetches its PDB-format file into a temporary directory and
        returns the first `top_num` author name tokens found on its AUTHOR lines.

        A token is a run of word characters/apostrophes longer than one character,
        so single-letter initials are dropped and e.g. "J.SMITH" yields "SMITH".

        Any error raised while fetching or opening the file propagates to the
        caller; the temporary directory is removed either way.
        """
        import re
        import tempfile

        # The context managers guarantee the file handle is closed and the
        # temporary directory removed even if reading fails.
        with tempfile.TemporaryDirectory() as temp_dir:
            pdb_file = self.pdbl.retrieve_pdb_file(pdb_id, file_format="pdb", pdir=temp_dir)
            with open(pdb_file) as handle:
                lines = handle.read().splitlines()

        # `split()[:1]` is an empty list for blank lines, so they are skipped
        # instead of raising IndexError.
        author_txt = ' '.join(line for line in lines if line.split()[:1] == ["AUTHOR"])

        # NOTE(review): a two-digit continuation number on an AUTHOR line would
        # pass the length filter below -- confirm whether that matters for callers
        # that request many authors.
        authors = [
            word
            for word in re.findall(r"[\w']+", author_txt)
            if len(word) > 1 and word != "AUTHOR"
        ]
        return authors[:top_num]
        