In [28]:
import os
import tempfile

import numpy as np
from Bio.PDB.PDBList import PDBList

import get_poss_pdbs as gp

In [2]:
%%time 

# Store all the PDB IDs as a list.
# This had a length of 184,929 IDs as of 2021-12-08.
# Note: this was originally estimated at about 3 seconds; the recorded run took ~6.5 s wall time.
# Without an explicit obsolete_pdb argument, an empty folder called "obsolete" was created in
# the path; that went away once the obsolete_pdb argument was set to an arbitrary string
# (here the literal string "None", not the None object).
pdbl = PDBList(verbose=False, obsolete_pdb="None")
all_pdb = pdbl.get_all_entries()

CPU times: user 161 ms, sys: 129 ms, total: 290 ms
Wall time: 6.48 s


In [3]:
%%time 

# Re-key the ID list as a dict whose values are all True, so later membership
# checks are hash lookups rather than scans of the list.
all_pdb = dict.fromkeys(all_pdb, True)

CPU times: user 34.5 ms, sys: 5.42 ms, total: 39.9 ms
Wall time: 39.5 ms


In [29]:
# bioRxiv full-text page used as the test input for the PDB-ID extraction.
test_url = "https://www.biorxiv.org/content/10.1101/2020.08.09.242867v1.full"

In [5]:
%%time 

# Collect the candidate ("possible") PDB IDs for the test page using the local
# get_poss_pdbs helper module (its implementation is not shown in this notebook).
# This can take about 2 seconds.
poss = gp.get_poss_pdbs(test_url)

CPU times: user 302 ms, sys: 12.3 ms, total: 315 ms
Wall time: 1.89 s


In [6]:
# Keep only those candidates that appear as keys in the all_pdb lookup,
# i.e. the strings that are real PDB IDs.
actual = [candidate for candidate in poss if all_pdb.get(candidate, False)]

In [7]:
# Preview the first five confirmed IDs.
actual[:5]

['1COL', '2BAN', '2BAT', '2BBY', '2BIN']

In [8]:
# Report how many of the candidates turned out to be real PDB IDs.
print(f"{len(actual)} out of {len(poss)} were actual PDB IDs")

14 out of 91 were actual PDB IDs


In [10]:
# Take the first confirmed ID.
# NOTE(review): the download cell further down hard-codes "6VXX" and does not
# read pdb_id — confirm whether this variable is still needed.
pdb_id = actual[0]

In [12]:
# Show the full list of confirmed IDs.
actual

['1COL',
 '2BAN',
 '2BAT',
 '2BBY',
 '2BIN',
 '2BIS',
 '2BLI',
 '2BMA',
 '2BOF',
 '2COL',
 '2F08',
 '2F10',
 '3BN9',
 '6VXX']

I went through each of these 'actual' PDB IDs and found that all of them except the last two were HTML gobbledygook. We may need to refine our Beautiful Soup parsing to avoid things like links and div ids, which is where this gobbledygook came from. The last two were easy to find because they were the only ones that had "PDB" immediately before them (e.g. "PDB 6VXX").

I did this by going to: 
view-source:https://www.biorxiv.org/content/10.1101/2020.08.09.242867v1.full
on Google Chrome.

In [8]:
# Inspect the PDBList client object created earlier.
pdbl

<Bio.PDB.PDBList.PDBList at 0x10ada85e0>

In [5]:
# Scratch directory; its path is passed as pdir to the structure download.
temp_dir = tempfile.TemporaryDirectory()

In [6]:
# Remove the temporary directory and everything in it.
# NOTE(review): in notebook order this runs before the download cell that passes
# temp_dir.name as pdir — confirm whether this cell should instead run last.
temp_dir.cleanup()

In [7]:
# Download entry 6VXX in PDB format into the temporary directory. The return
# value is used by later cells as the path of the downloaded file.
output = pdbl.retrieve_pdb_file("6VXX", file_format="pdb", pdir=temp_dir.name)
# Alternative invocations kept for reference (mmCif or PDB format into ./pdbs):
# output = pdbl.retrieve_pdb_file("6VXX", file_format="mmCif", pdir="./pdbs")
# output2 = pdbl.retrieve_pdb_file("6VXX", file_format="pdb", pdir="./pdbs")

In [8]:
# Collect the AUTHOR records from the downloaded PDB file.
# The context manager closes the file handle as soon as the text has been read
# (the bare open(...).read() left it for the garbage collector).
with open(output) as pdb_file:
    pdb_lines = pdb_file.read().splitlines()

# A record is kept when its first whitespace-separated field is "AUTHOR".
# Slicing with [:1] yields [] for blank lines, so they are skipped instead of
# raising IndexError the way line.split()[0] would.
author_lines = [line for line in pdb_lines if line.split()[:1] == ["AUTHOR"]]

In [9]:
# Show the collected AUTHOR records.
author_lines

['AUTHOR    A.C.WALLS,Y.J.PARK,M.A.TORTORICI,A.WALL,SEATTLE STRUCTURAL GENOMICS   ',
 'AUTHOR   2 CENTER FOR INFECTIOUS DISEASE (SSGCID),A.T.MCGUIRE,D.VEESLER         ']

In [11]:
# Merge the AUTHOR records into one space-separated string, then display it.
author_txt = ' '.join(author_lines)
author_txt 

'AUTHOR    A.C.WALLS,Y.J.PARK,M.A.TORTORICI,A.WALL,SEATTLE STRUCTURAL GENOMICS    AUTHOR   2 CENTER FOR INFECTIOUS DISEASE (SSGCID),A.T.MCGUIRE,D.VEESLER         '

In [13]:
# Regular-expression support for the author-name extraction that follows.
# re.split() requires a pattern and a string; called with no arguments it raises
# TypeError and halts a top-to-bottom run, so only the import is kept here.
import re

In [26]:
# Tokenise the AUTHOR text into runs of word characters/apostrophes, discard
# single-character tokens (initials, the continuation digit) and the "AUTHOR"
# record tag itself, then keep the first three remaining tokens.
name_tokens = [
    token
    for token in re.findall(r"[\w']+", author_txt)
    if len(token) > 1 and token != "AUTHOR"
]
top_authors = name_tokens[:3]

In [27]:
# Show the three extracted author tokens.
top_authors

['WALLS', 'PARK', 'TORTORICI']

In [32]:
# Fetch the text of the test page via the local helper module
# (gp.get_txt's implementation is not shown in this notebook).
html = gp.get_txt(test_url)

In [38]:
# Check that every extracted author token occurs in the page text, ignoring case.
# The page text is lower-cased once up front instead of once per author, and the
# generator expression lets all() stop at the first author that is missing.
html_lower = html.lower()
all(author.lower() in html_lower for author in top_authors)


True

In [32]:
# Sanity-check the path returned by retrieve_pdb_file.
# The download cell requests file_format="pdb", for which Biopython names the
# file "pdb<id>.ent"; "<id>.cif" is the name used for file_format="mmCif", so a
# comparison against ".cif" does not match that cell on a fresh run.
# os.path.join is used so the path separator is not hard-coded as "/".
string = os.path.join(temp_dir.name, "pdb6vxx.ent")
output == string

True

In [35]:
# open(output).read()

In [19]:
# Directory that tempfile uses by default for temporary files.
tempfile.gettempdir()

'/var/folders/rd/64lt4yfn7y3ch2mm0wl6g3vw0000gn/T'

In [20]:
# Filename prefix that tempfile uses when naming temporary files and directories.
tempfile.gettempprefix()

'tmp'