In [1]:
import prody as pd
from pathlib import Path
import numpy as np
from tqdm.notebook import tqdm

In [17]:
def get_interface_residues(pdb_file_1, pdb_file_2, threshold=6.0):
    """
    Get the interface residues between two (sets of) chains of a protein.

    An interface residue is a residue that has at least one protein heavy atom
    within ``threshold`` of any protein heavy atom of the other chain set.
    Residues are labelled ``"<chain id>_<residue number>"``.

    NOTE: the label does not include the insertion code, so residues that differ
    only by insertion code share a single label.

    :param pdb_file_1: first pdb file
    :param pdb_file_2: second pdb file
    :param threshold: distance threshold for interface residues
    :return: interface residues of the first chain set, interface residues of the second chain set
    :raises ValueError: if no protein heavy atoms can be read from one of the files
    """
    structure_1 = _load_protein_heavy_atoms(pdb_file_1)
    structure_2 = _load_protein_heavy_atoms(pdb_file_2)
    return (
        _residues_near(structure_1, structure_2, threshold),
        _residues_near(structure_2, structure_1, threshold),
    )


def _load_protein_heavy_atoms(pdb_file):
    """Parse ``pdb_file`` and return its non-hydrogen protein atoms, or raise ValueError if there are none."""
    structure = pd.parsePDB(str(pdb_file))
    # ProDy may hand back None instead of raising: parsePDB when no atoms could
    # be parsed, select() when nothing matches the selection string.
    heavy_atoms = None if structure is None else structure.select("protein and not hydrogen")
    if heavy_atoms is None:
        raise ValueError(f"No protein heavy atoms could be read from {pdb_file}")
    return heavy_atoms


def _residues_near(query_atoms, partner_atoms, threshold):
    """Return the labels of ``query_atoms`` residues having an atom within ``threshold`` of any partner atom."""
    # The tree holds only the partner's atoms, so any hit is a cross-structure contact.
    partner_tree = pd.KDTree(partner_atoms.getCoords())
    interface_labels = set()
    atom_records = zip(query_atoms.getCoords(), query_atoms.getChids(), query_atoms.getResnums())
    for atom_coords, chain_id, residue_number in atom_records:
        label = f"{chain_id}_{residue_number}"
        if label in interface_labels:
            # One contacting atom is enough; skip the rest of this residue.
            continue
        partner_tree.search(center=atom_coords, radius=threshold)
        hits = partner_tree.getIndices()
        # getIndices() can be None when the search found nothing.
        if hits is not None and len(hits) > 0:
            interface_labels.add(label)
    return interface_labels

In [19]:
# Write one tab-separated line per complex listed in data/full_list.txt:
#   <stem of file 1> <stem of file 2> <interface residues of 1> <interface residues of 2>
# Each list entry is expected to look like "<pdb_id>_<chains_1>_<chains_2>".
# PDB files that are not found under data/raw are collected here so skipped
# pairs can be inspected after the run.
missing_pdb_files = set()
# Open the input list first so a missing list does not truncate an existing output file.
with open("data/full_list.txt") as list_f, open("data/interface_labels.txt", "w") as f:
    for line in tqdm(list_f):
        entry = line.strip()
        if not entry:
            # Tolerate blank lines (e.g. a trailing newline at the end of the list).
            continue
        pdb_id, chains_1, chains_2 = entry.split("_")
        pdb_file_1 = Path("data/raw") / f"{pdb_id}_{chains_1}.pdb"
        pdb_file_2 = Path("data/raw") / f"{pdb_id}_{chains_2}.pdb"
        absent = [pdb_file for pdb_file in (pdb_file_1, pdb_file_2) if not pdb_file.exists()]
        if absent:
            missing_pdb_files.update(absent)
            continue
        interface_residues_1, interface_residues_2 = get_interface_residues(pdb_file_1, pdb_file_2)
        # Sets have no stable order for strings across kernel runs; sort so the
        # output file is reproducible.
        fields = [
            pdb_file_1.stem,
            pdb_file_2.stem,
            ",".join(sorted(interface_residues_1)),
            ",".join(sorted(interface_residues_2)),
        ]
        f.write("\t".join(fields) + "\n")

0it [00:00, ?it/s]