In [9]:
# how can I extract info from PDB?
# https://data.rcsb.org/redoc/index.html
import requests
import pandas as pd
from bs4 import BeautifulSoup
from tqdm import tqdm

In [10]:
def parse_after(string: str, after="&nbsp"):
	"""Return the part of ``string`` that follows the first occurrence of ``after``.

	Raises:
		ValueError: ``after`` does not occur in ``string``.
	"""
	marker_start = string.index(after)
	tail_start = marker_start + len(after)
	return string[tail_start:]

def get_pdb(code: str):
	"""Scrape title, classification and organism for one entry from its RCSB structure page.

	Args:
		code: PDB entry ID, e.g. "1RH7".

	Returns:
		dict with the keys "title", "classification" and "organism".

	Raises:
		requests.HTTPError: the page request came back with an error status.
		AttributeError: one of the expected elements is missing from the page.
		ValueError: a header item does not contain the "&nbsp" separator.
	"""
	# A timeout keeps one stalled connection from blocking the notebook forever.
	req = requests.get(f"https://www.rcsb.org/structure/{code}", timeout=30)
	req.raise_for_status()
	# Name the parser explicitly; otherwise BeautifulSoup picks whichever parser
	# happens to be installed, so results can differ between environments.
	parsed_html = BeautifulSoup(req.text, "html.parser")
	body = parsed_html.body
	title = body.find('span', attrs={'id':'structureTitle'}).text
	# The header items carry a label before the "&nbsp" separator; keep only the value.
	classification = parse_after(body.find('li', attrs={'id':'header_classification'}).text)
	organism = parse_after(body.find('li', attrs={'id':'header_organism'}).text)
	return dict(
		title=title, classification=classification, organism=organism
	)
get_pdb("1RH7")

{'title': 'Crystal Structure of Resistin-like beta',
 'classification': 'HORMONE/GROWTH FACTOR',
 'organism': 'Mus musculus'}

In [11]:
# Load the parquet table kept with the website backend's data (the file name
# suggests 2-D embeddings -- TODO confirm); the visible cells below only read
# its "name" column.
df = pd.read_parquet("../website/backend/data/embed-sub-venome-2D.parquet")

In [12]:
# Keep every row except those whose name contains one of the tags "Gh", "Lb"
# or "Lh" together with "comp"; .copy() detaches the result from `df`.
pdb = df[
    df["name"].apply(
        lambda name: not (any(tag in name for tag in ("Gh", "Lb", "Lh")) and "comp" in name)
    )
].copy()

In [13]:
# Characters 3..6 of each name form the entry code; collect each distinct code
# once, in order of first appearance, as a plain Python list.
codes = (
    pdb["name"]
    .map(lambda name: name[3:7])
    .unique()
    .tolist()
)

In [14]:
# Display the extracted entry codes as the cell output.
codes

['1a5k',
 '1avz',
 '1ayb',
 '1c7m',
 '1dbd',
 '1eod',
 '1eoe',
 '1eof',
 '1f66',
 '1fao',
 '1fzv',
 '1hdf',
 '1j0f',
 '1j6q',
 '1jbi',
 '1jnj',
 '1jos',
 '1jxg',
 '1ka8',
 '1klv',
 '1ksr',
 '1kx8',
 '1mwq',
 '1n8u',
 '1no5',
 '1nrv',
 '1odd',
 '1odv',
 '1psr',
 '1q8d',
 '1r21',
 '1ri0',
 '1rja',
 '1rn4',
 '1ru0',
 '1sru',
 '1t1d',
 '1u2n',
 '1u36',
 '1ub9',
 '1uh6',
 '1v4r',
 '1ver',
 '1w41',
 '1w42',
 '1wgr',
 '1whd',
 '1whv',
 '1wjk',
 '1wju',
 '1wq8',
 '1x0n',
 '1x3a',
 '1ylx',
 '2a7o',
 '2brf',
 '2ce1',
 '2ce6',
 '2cgy',
 '2cow',
 '2cql',
 '2cz4',
 '2d9o',
 '2dbf',
 '2dn8',
 '2do4',
 '2dzm',
 '2e2z',
 '2e5n',
 '2ebt',
 '2edj',
 '2edp',
 '2eeh',
 '2efi',
 '2ep8',
 '2es9',
 '2g35',
 '2gaq',
 '2hlq',
 '2hpl',
 '2j4m',
 '2jli',
 '2jmp',
 '2ka4',
 '2kdg',
 '2kjk',
 '2kkc',
 '2kpq',
 '2kre',
 '2kxy',
 '2kz6',
 '2l7p',
 '2lc1',
 '2lc7',
 '2lfb',
 '2lgv',
 '2llz',
 '2lv7',
 '2lz0',
 '2mqi',
 '2mzt',
 '2n8i',
 '2nml',
 '2p1b',
 '2pbc',
 '2r2o',
 '2rkg',
 '2rrm',
 '2ubp',
 '2vb5',
 '2vwa',
 

In [15]:
def flatten(arr):
  """Return a flat list of the items in ``arr``, descending into nested lists.

  Only ``list`` instances are expanded; any other item (tuples included) is
  kept as a single element. Items keep their original left-to-right order.
  """
  def _walk(items):
    # Depth-first traversal that yields leaves in the order they are met.
    for element in items:
      if isinstance(element, list):
        yield from _walk(element)
      else:
        yield element

  return list(_walk(arr))

def graphql_pdb(code: str, timeout: float = 30.0):
  """Fetch title, keywords and source organisms for one entry via the RCSB GraphQL API.

  Args:
    code: Entry ID sent as the query's ``$id`` variable.
    timeout: Seconds to wait for the HTTP response before giving up.

  Returns:
    A tuple ``(title, pdbx_keywords, organisms)``. ``title`` and
    ``pdbx_keywords`` are the values returned by the API, or ``None`` when the
    field is absent. ``organisms`` is the list of scientific names across all
    polymer entities, de-duplicated in first-seen order. When the API has no
    entry for ``code`` the result is ``(None, None, None)``.

  Raises:
    requests.HTTPError: the API answered with an error status.
  """
  query = """
  query structure ($id: String!) {
    entry(entry_id:$id){
        struct {
            title
        }
        struct_keywords {
            pdbx_keywords
        }
        polymer_entities {
            rcsb_entity_source_organism {
              scientific_name
            }
          }
    }
  }
  """
  # This function is called once per code in a very long loop; a timeout keeps
  # a single stalled request from hanging the whole run.
  response = requests.post(
    "https://data.rcsb.org/graphql",
    json={"query": query, "variables": {"id": code}},
    timeout=timeout,
  )
  response.raise_for_status()
  root = response.json()["data"]["entry"]
  if root is None:
    return None, None, None

  # `struct` comes back as {"title": ...}; unwrap it so callers receive the
  # title itself rather than the wrapper dict.
  title = (root.get("struct") or {}).get("title")
  pdbx_keywords = (root.get("struct_keywords") or {}).get("pdbx_keywords")

  organisms = []
  for entity in root.get("polymer_entities") or []:
    # An entity may have no source organism recorded (null in the response).
    for organism in entity.get("rcsb_entity_source_organism") or []:
      organisms.append(organism["scientific_name"])

  # dict.fromkeys drops duplicates while keeping first-seen order, whereas the
  # order of list(set(...)) can differ from one run to the next.
  return title, pdbx_keywords, list(dict.fromkeys(organisms))

In [17]:
# Query the API once per code. A plain loop appending to the module-level list
# (rather than a comprehension) keeps the results gathered so far available if
# the run is interrupted.
infos = []
for c in tqdm(codes):
	infos.append(graphql_pdb(c))

# Summarise instead of printing every tuple, which floods the cell output on
# long runs; the last expression shows a short preview.
print(f"fetched {len(infos)} of {len(codes)} entries")
infos[:5]

 44%|████▎     | 46174/105757 [2:06:11<2:42:50,  6.10it/s] 


KeyboardInterrupt: 

In [None]:
# `infos` is filled in the same order as `codes`, so if the fetch loop stopped
# early only the first len(infos) codes have results. Slicing keeps every
# column the same length; pd.DataFrame raises ValueError on unequal lengths.
fetched_codes = codes[:len(infos)]
titles = [i[0] for i in infos]
keywords = [i[1] for i in infos]
organisms = [i[2] for i in infos]
pd.DataFrame({"code": fetched_codes, "title": titles, "keywords": keywords, "organisms": organisms }).to_parquet("pdb-info.parquet", index=False)