In [8]:
import os
import json
from glob import glob
from io import StringIO

import numpy as np
import pandas as pd
from Bio import SeqIO
from Bio.SVDSuperimposer import SVDSuperimposer
from Bio.PDB.PDBParser import PDBParser
from Bio.PDB import Dice

# Identifier of the modelling template; it names the per-sequence alignment
# file ("<sequence>-<template>.ali") and the template record inside it.
TEMPLATE_ID = "4lxv"

# Root folder of the structure data, and the sub-folder whose entries are
# listed one per modelled sequence.
PDB_DIR = "Structures"
MODELLING_DIR = os.path.join(PDB_DIR, "Modelling")

In [2]:
# Single Bio.PDB parser instance shared by every get_structure() call in this notebook.
PDB_PARSER = PDBParser()

In [3]:
# Read the JSON file of tip names; the loading loop below only processes
# modelling directories whose name appears in this collection.
tipNamesPath = os.path.join(PDB_DIR, "tipNames.json")
with open(tipNamesPath) as handle:
    tipNames = json.load(handle)

In [4]:
# Only the first MAX_SEQ_DIRS entries of the modelling folder are examined
# (quick-look subset). The cap counts directory entries, not loaded structures.
MAX_SEQ_DIRS = 5


def aligned_span(aligned_seq):
    """Return the 1-based (first, last) columns of the non-gap core of an aligned sequence.

    Leading and trailing '-' characters are treated as alignment padding;
    gaps inside the core are kept.

    Parameters
    ----------
    aligned_seq : str or Bio.Seq.Seq
        One row of an alignment, gaps written as '-'.

    Returns
    -------
    tuple of (int, int) or None
        Inclusive 1-based first and last non-gap columns, or None when the
        row is empty or consists only of gaps.
    """
    aligned_seq = str(aligned_seq)
    if not aligned_seq.strip("-"):
        return None
    first = len(aligned_seq) - len(aligned_seq.lstrip("-")) + 1
    last = len(aligned_seq.rstrip("-"))
    return first, last


structures = []

# Sorting makes the examined subset reproducible; os.listdir order is arbitrary.
for seqDir in sorted(os.listdir(MODELLING_DIR))[:MAX_SEQ_DIRS]:
    seqName = os.path.basename(seqDir)
    if seqName not in tipNames:
        continue

    modelDir = os.path.join(MODELLING_DIR, seqDir)
    pdbFiles = sorted(glob(os.path.join(modelDir, "*.pdb")))
    alignFile = os.path.join(modelDir, seqName + "-" + TEMPLATE_ID + ".ali")

    # Without both inputs there is nothing valid to trim; skip explicitly rather
    # than crash on None or reuse trim bounds left over from a previous directory.
    if not pdbFiles or not os.path.isfile(alignFile):
        print("Skipping {}: model PDB or template alignment not found".format(seqName))
        continue

    # Parse only the model that is kept (last in sorted order) instead of
    # parsing every *.pdb and discarding all but one.
    structure = PDB_PARSER.get_structure(seqName, pdbFiles[-1])

    # SeqIO.index keeps the file open until closed, so release it explicitly.
    records = SeqIO.index(alignFile, "pir")
    try:
        span = aligned_span(records[TEMPLATE_ID].seq)
    finally:
        records.close()

    if span is None:
        print("Skipping {}: template row of the alignment has no residues".format(seqName))
        continue
    start, end = span

    # NOTE(review): alignment columns are used directly as residue numbers of the
    # model; this assumes the model is numbered by alignment column - confirm.
    trimmedPdb = StringIO()
    Dice.extract(structure, chain_id=' ', start=start, end=end, filename=trimmedPdb)

    # Re-parse the trimmed PDB text so the stored structure holds only the kept span.
    structures.append(PDB_PARSER.get_structure(seqName, StringIO(trimmedPdb.getvalue())))

In [5]:
def ca_coordinates(structure):
    """Return an array with the coordinates of the "CA" atom of every residue in `structure`."""
    return np.array([residue["CA"].coord for residue in structure.get_residues()])


# Superimpose each structure (all but the last) with the structures from its own
# position onwards, skipping pairs that compare equal, and print one RMS per pair.
for position, reference in enumerate(structures[:-1]):
    for subject in structures[position:]:
        if reference == subject:
            continue
        referenceCoords = ca_coordinates(reference)
        subjectCoords = ca_coordinates(subject)
        superimposer = SVDSuperimposer()
        # First argument is the fixed reference set, second is the set fitted onto it.
        superimposer.set(referenceCoords, subjectCoords)
        superimposer.run()
        print(superimposer.get_rms())

0.8762926050593773
0.8156529694277238
1.0134323246851187
0.8996337659852263
0.24829029760135102
1.1266882997342702
0.22518736516195628
1.0722275789023228
0.2875284114863194
1.1273683486194392


In [12]:
# Scratch example: a small frame with row labels A-B and column labels A-C.
pd.DataFrame({"A": [1, 2], "B": [2, 3], "C": [3, 4]}, index=["A", "B"])

Unnamed: 0,A,B,C
A,1,2,3
B,2,3,4


In [10]:
# Show the id of every loaded structure, in load order.
[structure.id for structure in structures]

['AB530462', 'AB530463', 'AB530466', 'AB530469', 'AB530488']