In [1]:
from typing import List, Tuple, Optional, Dict, NamedTuple, Union, Callable
import itertools
import os
import string
from pathlib import Path

import numpy as np
import torch
from scipy.spatial.distance import squareform, pdist, cdist
from sklearn.cluster import KMeans
from k_means_constrained import KMeansConstrained
import matplotlib.pyplot as plt
import matplotlib as mpl
from Bio import SeqIO
from Bio.PDB import *
import biotite.structure as bs
from biotite.structure.io.pdbx import PDBxFile, get_structure
from biotite.database import rcsb
from tqdm import tqdm
import pandas as pd
import seaborn as sns
import glob
import subprocess

import esm
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler

In [2]:
# Module-level Biopython helpers shared by the functions below.
# PERMISSIVE=True: keep parsing when the structure has construction problems.
# QUIET=True: do not emit parser warnings while building the structure.
parser = PDBParser(
    PERMISSIVE=True,
    QUIET=True,
)
# Client used to download entries from the PDB.
pdbl = PDBList()

def get_pdb_residues(PDB_ID):
    """Return the positions of the structurally observed residues of one chain.

    The entry is downloaded into the current directory, parsed, and the
    downloaded file is deleted again. Residue numbers of the requested chain
    are collected from two sources: the header's list of missing residues
    (marked unobserved) and the standard (non-hetero) residues found in the
    first model (marked observed). The numbers are then ordered and each one
    is assigned a consecutive 0-based position.

    Args:
        PDB_ID: identifier whose first four characters are the PDB code and
            whose last character is the chain letter, e.g. ``"4qhh_a"``.
            Both parts are upper-cased before use.

    Returns:
        list[int]: 0-based positions (within the ordered union of missing and
        observed residue numbers) of the residues that are observed.

    Raises:
        IndexError: if the first model contains no chain with the requested
            chain letter, or the structure contains no model.

    Notes:
        Residues are keyed by residue number only, so residues that differ
        only by insertion code share a single position.
    """
    pdb = PDB_ID.upper()[:4]
    chain_id = PDB_ID.upper()[-1]

    # Download and parse the entry; the temporary file is removed even when
    # parsing fails so no stray "pdbXXXX.ent" is left in the working directory.
    ent_path = "pdb" + pdb.lower() + ".ent"
    pdbl.retrieve_pdb_file(pdb, pdir='.', file_format='pdb')
    try:
        data = parser.get_structure(pdb, ent_path)
    finally:
        if os.path.exists(ent_path):
            os.remove(ent_path)

    # Residue number -> True if observed, False if only listed as missing.
    all_residues = {}
    for res in data.header.get("missing_residues", []):
        if res["chain"] == chain_id:
            all_residues[res["ssseq"]] = False

    model = list(data.get_models())[0]
    chain = next((c for c in model.get_chains() if c.id == chain_id), None)
    if chain is None:
        raise IndexError(f"chain {chain_id!r} not found in PDB entry {pdb!r}")

    # Read the het flag and residue number from the residue id tuple
    # (hetfield, resseq, icode) instead of slicing the repr string, which
    # only works for 3-letter residue names and short residue numbers.
    for res in chain.get_residues():
        hetfield, resseq, _icode = res.id
        if hetfield == ' ':
            all_residues[resseq] = True

    # Walk the residue numbers in ascending order. Unlike a fixed
    # range(10000) scan this also counts negative (and very large) residue
    # numbers, which would otherwise shift every returned position.
    return [
        index
        for index, resseq in enumerate(sorted(all_residues))
        if all_residues[resseq]
    ]

def download_pdb(directory, pdb_id):
    """Download a PDB entry and save only the requested chain.

    The entry is fetched into ``directory``, parsed, and the downloaded
    ``pdbXXXX.ent`` file is deleted. The parsed structure is then written to
    ``<directory>/<pdb_id>.pdb`` keeping only chains whose upper-cased id
    equals the requested chain letter.

    Args:
        directory: folder to download into and write the result to. A
            trailing path separator is not required.
        pdb_id: identifier whose first four characters are the PDB code and
            whose last character is the chain letter, e.g. ``"4qhh_a"``. It
            is used verbatim as the output file stem.

    Returns:
        None. The result is the file written to disk.

    Notes:
        No check is made that the requested chain exists in the entry; if it
        does not, the output file contains no chain.
    """
    pdb = pdb_id.lower()[:4]
    chain_id = pdb_id.upper()[-1]

    # Download and parse; always clean up the temporary entry file, even if
    # parsing raises.
    ent_path = os.path.join(directory, "pdb" + pdb + ".ent")
    pdbl.retrieve_pdb_file(pdb, pdir=directory, file_format='pdb')
    try:
        structure = parser.get_structure(pdb, ent_path)
    finally:
        if os.path.exists(ent_path):
            os.remove(ent_path)

    # Defined once, outside any loop, so it exists even when the structure
    # yields no models or chains.
    class ChainSelect(Select):
        """Selector that keeps only the requested chain when saving."""

        def accept_chain(self, chain):
            return chain.get_id().upper() == chain_id

    io = PDBIO()
    io.set_structure(structure)
    io.save(os.path.join(directory, pdb_id + ".pdb"), ChainSelect())

In [3]:
# Reference structure to fetch, written as "<4-character PDB code>_<chain letter>",
# and the folder the extracted chain is saved to.
#
# Alternative reference (swap in by replacing the two assignments below):
#   PDB_ID = "2qke_e"
#   DIRECTORY = "../data/benchmark_data/af_cluster_monomeric/KaiB/reference_1/"
PDB_ID = "4qhh_a"
DIRECTORY = "../data/benchmark_data/af_cluster_oligomeric/Selecase/reference_2/"

# Download the entry and write DIRECTORY + PDB_ID + ".pdb".
download_pdb(directory=DIRECTORY, pdb_id=PDB_ID)

Downloading PDB structure '4qhh'...


In [21]:
# Re-open the saved PDB file (presumably the one written by download_pdb),
# list the chains it contains, and rewrite it with only the requested chain.
structure = parser.get_structure(PDB_ID, DIRECTORY + PDB_ID.lower() + ".pdb")

# The chain letter is the last character of the identifier ("4qhh_a" -> "A").
# It is defined here explicitly: no other notebook-level cell sets `chain_id`
# (it is only a local variable inside the helper functions), so on a fresh
# kernel the selector below would otherwise fail with a NameError.
chain_id = PDB_ID.upper()[-1]


class ChainSelect(Select):
    """Selector that keeps only the chain matching `chain_id` when saving."""

    def accept_chain(self, chain):
        return chain.get_id().upper() == chain_id


# Show every model and chain found in the file.
for model in structure:
    print(model)
    for chain in model:
        print(chain, chain.get_id(), type(chain.get_id()))

# The input file was fully parsed above, so overwriting it here is safe.
io = PDBIO()
io.set_structure(structure)
io.save(DIRECTORY + PDB_ID + ".pdb", ChainSelect())

<Model id=0>
<Chain id=A> A <class 'str'>
<Chain id=B> B <class 'str'>
<Chain id=C> C <class 'str'>
<Chain id=D> D <class 'str'>
<Chain id=E> E <class 'str'>
<Chain id=F> F <class 'str'>
