In [2]:
import Bio.PDB as PDB
from Bio import pairwise2
from Bio.PDB import Chain, Residue, Atom, Model
import numpy as np
import sys

In [20]:
def remove_non_protein_chains(model: PDB.Model):
    """
    Detaches every chain for which chain_to_seq yields an empty sequence.
    The passed model is modified in place and returned afterwards.
    :param model: model whose chains are filtered
    :return: the same model object without the detached chains
    """
    # collecting the ids first, so the model is not altered while it is traversed
    empty_chain_ids = [
        chain_id
        for chain_id, chain in model.child_dict.items()
        if chain_to_seq(chain)[0] == ''
    ]
    for chain_id in empty_chain_ids:
        model.detach_child(chain_id)
    return model

In [21]:
# loading the first model of the homology model structure
parser = PDB.PDBParser()
# the structure id is only a label; it now names the file that is actually read
pdb = next(parser.get_structure("homology_model_6pp4", "resources/homology_model_6pp4.pdb").get_models())
# number of chains before and after dropping chains without amino acids
# (remove_non_protein_chains modifies pdb in place)
print(len(pdb))
print(len(remove_non_protein_chains(pdb)))
#print(chain_to_seq(pdb.child_list[2]))

3
2


In [7]:
def chain_to_seq(chain: "PDB.Chain.Chain") -> "tuple[str, list[PDB.Residue.Residue]]":
    """
    Translates a chain into its one-letter amino acid sequence.
    Residues whose name is not one of the 20 three-letter codes below are skipped.
    :param chain: chain whose residues are translated
    :return: tuple of the sequence string and the list of residues that
             contributed to it; both have the same length and order
    """
    three_to_one = {
        'CYS': 'C', 'ASP': 'D', 'SER': 'S', 'GLN': 'Q', 'LYS': 'K',
        'ILE': 'I', 'PRO': 'P', 'THR': 'T', 'PHE': 'F', 'ASN': 'N',
        'GLY': 'G', 'HIS': 'H', 'LEU': 'L', 'ARG': 'R', 'TRP': 'W',
        'ALA': 'A', 'VAL': 'V', 'GLU': 'E', 'TYR': 'Y', 'MET': 'M'}
    # keeping only residues with a known three-letter code
    residues = [residue for residue in chain.get_residues()
                if residue.resname in three_to_one]
    # joining once instead of growing the string residue by residue
    seq = ''.join(three_to_one[residue.resname] for residue in residues)
    return seq, residues

In [82]:
# loading the first model of each of the two structures that are compared
parser = PDB.PDBParser()
pdb1 = next(parser.get_structure("1m9k", "resources/1m9k.pdb").get_models())
# the second structure gets its own id instead of reusing "1m9k"
pdb2 = next(parser.get_structure("1m9m", "resources/1m9m.pdb").get_models())
# sanity check: coordinates of the first atom of the first model
print(next(pdb1.get_atoms()).get_coord())

[ 26.927 -10.252  36.891]




In [92]:
# align and extract atoms
def match_and_extract_atoms(model1: "PDB.Model.Model", model2: "PDB.Model.Model") -> "tuple[list[PDB.Atom.Atom], list[PDB.Atom.Atom]]":
    """
    Pairs up the atoms of two models for a later comparison.
    The chains of both models are paired by their order. For each pair the
    amino acid sequences are globally aligned and, for every aligned position
    with the same amino acid, all atoms whose id occurs in both residues are
    collected.
    :param model1: first model
    :param model2: second model
    :return: two lists of equal length; the atoms at the same index correspond
             to each other (first list from model1, second list from model2)
    :raises ValueError: if the models have a different number of chains
    """
    chains1 = list(model1.get_chains())
    chains2 = list(model2.get_chains())

    # checking, whether both models have the same number of chains;
    # raising instead of calling exit(), which would terminate the whole session
    if len(chains1) != len(chains2):
        raise ValueError("The models have a different number of chains.")

    atoms1 = []
    atoms2 = []

    # iterating over the chains:
    for chain1, chain2 in zip(chains1, chains2):

        # extracting amino acid sequence and corresponding residues
        seq1, residues1 = chain_to_seq(chain1)
        seq2, residues2 = chain_to_seq(chain2)

        # aligning sequences:
        alignments = pairwise2.align.globalxx(seq1, seq2)
        if not alignments:
            # nothing to compare for this pair of chains,
            # e.g. when a chain contains no amino acids
            continue
        aligned1 = alignments[0][0]
        aligned2 = alignments[0][1]

        # collecting atoms that match between both sequences:
        # for the same amino acid all atoms present in both residues are collected,
        # gaps and mismatches contribute no atoms
        n = 0 # residue index in chain1
        m = 0 # residue index in chain2
        for symbol1, symbol2 in zip(aligned1, aligned2):
            if symbol1 == symbol2:          # match case

                # making dictionary from atoms of first residue
                atoms_by_id = {atom.id: atom for atom in residues1[n]}
                # adding atom only if present in both residues
                for atom in residues2[m]:
                    if atom.id in atoms_by_id:
                        atoms1.append(atoms_by_id[atom.id])
                        atoms2.append(atom)
                n += 1
                m += 1
            elif symbol1 == "-":            # gap in seq1
                m += 1
            elif symbol2 == "-":            # gap in seq2
                n += 1
            else:                           # mismatch
                # NOTE(review): an earlier comment announced that only CA is
                # collected for mismatches, but no atoms were ever collected
                # here; that behaviour is kept. Confirm what is intended.
                n += 1
                m += 1

    return atoms1, atoms2

def dist(point1: np.ndarray, point2: np.ndarray) -> float:
    """
    Calculates the Euclidean distance between two points, e.g. in
    three-dimensional space. Any number of coordinates is supported as long
    as both points have the same shape.
    :param point1: numpy array with coordinates of first point
    :param point2: numpy array with coordinates of second point
    :return: float with distance between points
    :raises ValueError: if the points do not have the same shape
    """
    # computing in double precision, independent of the dtype of the input
    coords1 = np.asarray(point1, dtype=float)
    coords2 = np.asarray(point2, dtype=float)
    # refusing mismatching points instead of silently ignoring coordinates
    if coords1.shape != coords2.shape:
        raise ValueError("The points have a different number of coordinates.")
    difference = coords1 - coords2
    return float(np.sqrt(np.sum(difference * difference)))

def atom_distance(atom1: PDB.Atom, atom2: PDB.Atom) -> float:
    """
    Calculates the distance between the positions of two atoms.
    :param atom1: first atom
    :param atom2: second atom
    :return: float with distance between the coordinates of both atoms
    """
    first_position, second_position = (
        np.array(atom.get_coord()) for atom in (atom1, atom2)
    )
    return dist(first_position, second_position)

def rmsd(atoms1: "list[PDB.Atom.Atom]", atoms2: "list[PDB.Atom.Atom]") -> float:
    """
    Calculates the root-mean-square deviation between two lists of atoms.
    The atoms are paired by index and compared at their current coordinates;
    no superposition is performed here.
    :param atoms1: first list of atoms
    :param atoms2: second list of atoms, same length as atoms1
    :return: float with the RMSD over all atom pairs
    :raises ValueError: if the lists differ in length or are empty
    """
    if len(atoms1) != len(atoms2):
        raise ValueError("Both atom lists must have the same length.")
    if len(atoms1) == 0:
        raise ValueError("The RMSD of empty atom lists is undefined.")

    # stacking the coordinates into (number of atoms, dimensions) arrays
    coords1 = np.array([atom.get_coord() for atom in atoms1], dtype=float)
    coords2 = np.array([atom.get_coord() for atom in atoms2], dtype=float)

    # calculating squared pairwise distances
    squared_distances = np.sum((coords1 - coords2) ** 2, axis=1)

    # calculating rmsd
    return float(np.sqrt(np.mean(squared_distances)))

def main():
    """
    Superimposes the matching atoms of pdb2 onto those of pdb1 and prints the
    number of compared atoms together with the RMSD after superposition.
    """
    # extracting matching atoms
    fixed_atoms, moving_atoms = match_and_extract_atoms(pdb1, pdb2)

    # fitting the atoms of the second model onto the first and moving them
    aligner = PDB.Superimposer()
    aligner.set_atoms(fixed=fixed_atoms, moving=moving_atoms)
    aligner.apply(moving_atoms)

    print("number of compared atoms:", len(fixed_atoms))

    deviation = rmsd(fixed_atoms, moving_atoms)
    print("RMSD:", deviation)

main()

number of compared atoms: 6335
RMSD: 0.6681580615845315


In [15]:
# listing the three-letter name of every residue of pdb1, chain by chain
for chain in pdb1:
    for residue in chain:
        print(residue.resname)

LYS
PHE
PRO
ARG
VAL
LYS
ASN
TRP
GLU
VAL
GLY
SER
ILE
THR
TYR
ASP
THR
LEU
SER
ALA
GLN
ALA
GLN
GLN
ASP
GLY
PRO
CYS
THR
PRO
ARG
ARG
CYS
LEU
GLY
SER
LEU
VAL
PHE
PRO
GLU
GLN
LEU
LEU
SER
GLN
ALA
ARG
ASP
PHE
ILE
ASN
GLN
TYR
TYR
SER
SER
ILE
LYS
ARG
SER
GLY
SER
GLN
ALA
HIS
GLU
GLN
ARG
LEU
GLN
GLU
VAL
GLU
ALA
GLU
VAL
ALA
ALA
THR
GLY
THR
TYR
GLN
LEU
ARG
GLU
SER
GLU
LEU
VAL
PHE
GLY
ALA
LYS
GLN
ALA
TRP
ARG
ASN
ALA
PRO
ARG
CYS
VAL
GLY
ARG
ILE
GLN
TRP
GLY
LYS
LEU
GLN
VAL
PHE
ASP
ALA
ARG
ASP
CYS
ARG
SER
ALA
GLN
GLU
MET
PHE
THR
TYR
ILE
CYS
ASN
HIS
ILE
LYS
TYR
ALA
THR
ASN
ARG
GLY
ASN
LEU
ARG
SER
ALA
ILE
THR
VAL
PHE
PRO
GLN
ARG
CYS
PRO
GLY
ARG
GLY
ASP
PHE
ARG
ILE
TRP
ASN
SER
GLN
LEU
VAL
ARG
TYR
ALA
GLY
TYR
ARG
GLN
GLN
ASP
GLY
SER
VAL
ARG
GLY
ASP
PRO
ALA
ASN
VAL
GLU
ILE
THR
GLU
LEU
CYS
ILE
GLN
HIS
GLY
TRP
THR
PRO
GLY
ASN
GLY
ARG
PHE
ASP
VAL
LEU
PRO
LEU
LEU
LEU
GLN
ALA
PRO
ASP
GLU
PRO
PRO
GLU
LEU
PHE
LEU
LEU
PRO
PRO
GLU
LEU
VAL
LEU
GLU
VAL
PRO
LEU
GLU
HIS
PRO
THR
LEU
GLU
TRP
PHE
ALA
ALA
LEU
GLY
LEU
ARG
TRP
