In [17]:
from cmath import sqrt
from pathlib import Path

import numpy as np
import parmed
from tqdm.notebook import tqdm
import shutil
import subprocess
import pandas as pd

In [6]:
# define the working directory
wdir = Path('pdb')
all_files = wdir.glob('*.pdb')
# create a dict to identify the Ai model
files_dataset = {}

for f in all_files:
    if len(f.stem) == 9:
        if not files_dataset.get(f):
            files_dataset[f] = {}
    else:
        basename = f.stem[:6]
        model = f.stem[7:]
        native = f.parent.joinpath(f'{basename}.pdb')
        if not files_dataset.get(native):
            files_dataset[native] = {} if len(f.stem) == 6 else {model: f}
        elif len(f.stem) != 6:
            files_dataset[native][model] = f

files_dataset

{PosixPath('pdb/7d8b_B.pdb'): {'IF_model': PosixPath('pdb/7d8b_B_IF_model.pdb'),
  'ESMem_model': PosixPath('pdb/7d8b_B_ESMem_model.pdb'),
  'AF2m_model': PosixPath('pdb/7d8b_B_AF2m_model.pdb'),
  'ESM_model': PosixPath('pdb/7d8b_B_ESM_model.pdb'),
  'AF2r10_model': PosixPath('pdb/7d8b_B_AF2r10_model.pdb'),
  'AF2mem_model': PosixPath('pdb/7d8b_B_AF2mem_model.pdb'),
  'NNem_model': PosixPath('pdb/7d8b_B_NNem_model.pdb'),
  'IFem_model': PosixPath('pdb/7d8b_B_IFem_model.pdb'),
  'YSem_model': PosixPath('pdb/7d8b_B_YSem_model.pdb'),
  'OF_model': PosixPath('pdb/7d8b_B_OF_model.pdb'),
  'AF2_model': PosixPath('pdb/7d8b_B_AF2_model.pdb'),
  'OFem_model': PosixPath('pdb/7d8b_B_OFem_model.pdb'),
  'NN_model': PosixPath('pdb/7d8b_B_NN_model.pdb'),
  'OFr20em_model': PosixPath('pdb/7d8b_B_OFr20em_model.pdb'),
  'OFr20_model': PosixPath('pdb/7d8b_B_OFr20_model.pdb'),
  'AF2em_model': PosixPath('pdb/7d8b_B_AF2em_model.pdb'),
  'YS_model': PosixPath('pdb/7d8b_B_YS_model.pdb'),
  'AF2r10em_model':

In [19]:
def get_tmscore_output(out):
    results = {}
    for l in out:
        if not l or l.startswith(' '):
            continue
        if l.startswith('Number of residues in common'):
            results['Number of residues in common'] = int(l.split()[-1])
        elif l.startswith('RMSD of  the common residues'):
            results['global rmsd'] = float(l.split()[-1])
        elif l.startswith('TM-score'):
            results['TM-score'] = float(l.split()[2])
        elif l.startswith('MaxSub-score'):
            results['MaxSub-score'] = float(l.split()[1])
        elif l.startswith('GDT-TS-score'):
            results['GDT-TS-score'] = float(l.split()[1])
            results['GDT-TS-score %(d<1)'] = float(l.split()[2].split('=')[-1])
            results['GDT-TS-score %(d<2)'] = float(l.split()[3].split('=')[-1])
            results['GDT-TS-score %(d<4)'] = float(l.split()[4].split('=')[-1])
            results['GDT-TS-score %(d<8)'] = float(l.split()[5].split('=')[-1])
        elif l.startswith('GDT-HA-score'):
            results['GDT-HA-score'] = float(l.split()[1])
            results['GDT-HA-score %(d<0.5)'] = float(l.split()[2].split('=')[-1])
            results['GDT-HA-score %(d<1)'] = float(l.split()[3].split('=')[-1])
            results['GDT-HA-score %(d<2)'] = float(l.split()[4].split('=')[-1])
            results['GDT-HA-score %(d<4)'] = float(l.split()[5].split('=')[-1])
        elif l.startswith('Superposition in the TM-score'):
            results['TM-score Superposition Length(d<5.0)'] = int(l.split()[-1])
    return results

def get_dist(coor1, coor2):
    return sqrt((coor2[0] - coor1[0]) ** 2 + (coor2[1] - coor1[1]) ** 2 + (coor2[2] - coor1[2]) ** 2)

def get_rmsd(dist_list):
    diff = np.array(dist_list)
    N = len(diff)
    return round(np.sqrt((diff * diff).sum() / N), 1)

residues_code = {'HIS': 'H', 'HIE': 'H', 'HID': 'H', 'CYX': 'C', 'CYM': 'C', 'CYS': 'C', 'LYN': 'K','ASH': 'D',
                 'GLH': 'E', 'LYS': 'K', 'ARG': 'R', 'HIP': 'H', 'GLU': 'E', 'ASP': 'D', 'PHE': 'F', 'TRP': 'W',
                 'VAL': 'V', 'ILE': 'I', 'LEU': 'L', 'MET': 'M', 'PRO': 'P','ALA': 'A', 'GLY': 'G', 'TYR': 'Y',
                 'SER': 'S', 'THR': 'T', 'GLN': 'Q', 'ASN': 'N'}
bb_atoms = ['CA', 'C', 'N', 'O']

In [9]:
tmscore_exe = '/home/mario/WORK/Nb_modelling/programs/TMscore'

In [13]:
model_data = {'model': [], 'Number of residues in common': [], 'global rmsd': [], 'TM-score': [], 'MaxSub-score': [],
              'GDT-TS-score': [], 'GDT-TS-score %(d<1)': [], 'GDT-TS-score %(d<2)': [], 'GDT-TS-score %(d<4)': [],
              'GDT-TS-score %(d<8)': [], 'GDT-HA-score': [], 'GDT-HA-score %(d<0.5)': [], 'GDT-HA-score %(d<1)': [],
              'GDT-HA-score %(d<2)': [], 'GDT-HA-score %(d<4)': [], 'TM-score Superposition Length(d<5.0)': []}
aligned_results_folder = Path('aligned_pdb')
index = []
seqs = {}
for p, ms in tqdm(files_dataset.items()):
    # Sequence
    seqs[p.stem[:6]] = open(f'fasta/{p.stem[:6]}.fasta').readlines()[1].strip('\n')
    o = aligned_results_folder.joinpath(p.stem)
    print(o)
    o.mkdir(exist_ok=True)
    shutil.copyfile(p, o.joinpath(p.name))

    # TM-score
    for m, f in ms.items():
        mf = o.joinpath(f.stem.split('_')[2])
        mf.mkdir(exist_ok=True)
        with subprocess.Popen([tmscore_exe, f, p,
                               '-o', mf.joinpath(f'{f.stem[:-6]}_sup').as_posix()],
                              stdout=subprocess.PIPE) as process:
            output = process.communicate()[0].decode("utf-8").split('\n')
            with mf.joinpath(f"{f.stem.split('_')[2]}.log").open('w') as of:
                for ol in output:
                    of.write(ol + '\n')
            index.append(f.stem[:6])
            model_data['model'].append(f.stem.split('_')[2])
            for x, v in get_tmscore_output(output).items():
                model_data[x].append(v)
            print(p.stem, f.stem)
            print('\n'.join(x for x in output[-6:] if x))

models_df = pd.DataFrame(data=model_data, index=index)

  0%|          | 0/75 [00:00<?, ?it/s]

aligned_pdb/7d8b_B
7d8b_B 7d8b_B_IF_model
EVQLVESGGGLVQPGGSLRLSSAISGFSISSTSIDWVRQAPGKGLEWVARISPSSGSTSYADSVKGRFTISADTSKNTVYLQMNSLRAEDTAVYYTGRVAKALNSRSPSFVVNTYSSIGFDYRGQGTLVTVSS
:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::                   :::::::::::::: 
EVQLVESGGGLVQPGGSLRLSSAISGFSISSTSIDWVRQAPGKGLEWVARISPSSGSTSYADSVKGRFTISADTSKNTVYLQMNSLRAEDTAVYYTGRVAKALNSRSPSFVVNTYSSIGFDYRGQGTLVTVS-
1234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123
7d8b_B 7d8b_B_ESMem_model
EVQLVESGGGLVQPGGSLRLSSAISGFSISSTSIDWVRQAPGKGLEWVARISPSSGSTSYADSVKGRFTISADTSKNTVYLQMNSLRAEDTAVYYTGRVAKALNSRSPSFVVNTYSSIGFDYRGQGTLVTVSS
:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::        ::     :::::::::::::: 
EVQLVESGGGLVQPGGSLRLSSAISGFSISSTSIDWVRQAPGKGLEWVARISPSSGSTSYADSVKGRFTISADTSKNTVYLQMNSLRAEDTAVYYTGRVAKALNSRSPSFVVNTYSSIGFDYRGQGTL

In [14]:
aho_align = {
    '7n9c_D': {'Fw1': 'QVQLVESGGGLVQAGGSLRLSC', 'CDR1': 'AA-SGRTFSS-YS', 'Fw2': 'MGWFRQAQGKEREFVA',
               'CDR2': 'TINGN---GRDT', 'Fw3': 'YYTNSVKGRFTISRDDATNTVYLQMNSLKPEDTAIYYCAA',
               'CDR3': 'DKDVYYGY-------TSFPNEYEY', 'Fw4': 'WGQGTQVTVSS'},
    '7d8b_B': {'Fw1': 'EVQLVESGGGLVQPGGSLRLSS', 'CDR1': 'AI-SGFSISS-TS', 'Fw2': 'IDWVRQAPGKGLEWVA',
               'CDR2': 'RISPS---SGST', 'Fw3': 'SYADSVKGRFTISADTSKNTVYLQMNSLRAEDTAVYYTGR',
               'CDR3': 'VAKALNSRSPSFVVNTYSSIGFDY', 'Fw4': 'RGQGTLVTVSS'},
    '7l6v_D': {'Fw1': 'QVQLVESGGGSVQPGGSLRLSC', 'CDR1': 'AA-IGSVFTM-YT', 'Fw2': 'TAWYRQTPGNLRELVA',
               'CDR2': 'SITD----EHRT', 'Fw3': 'NYAASAEGRFTISRDNAKHTVDLQMTNLKPEDTAVYYCKL',
               'CDR3': 'EHDL---------------GYYDY', 'Fw4': 'WGQGTQVTVSS'},
    '7p5v_G': {'Fw1': 'QVQLVESGGGLVQAGGSLRLSC', 'CDR1': 'AA-SGFPVGR-HF', 'Fw2': 'MYWYRQAPGKEREWVA',
               'CDR2': 'AIYSY---GEYT', 'Fw3': 'EYADSVKGRFTISRDNAKNTVYLQMNSLKPEDTAVYYCYV',
               'CDR3': 'YVG------------------NEY', 'Fw4': 'WGQGTQVTVSA'},
    '6zwk_A': {'Fw1': 'EVQLVESGGGLVQAGDSLRLSC', 'CDR1': 'AA-SGLTFSR-YA', 'Fw2': 'MGWFRQAPGNEREFVA',
               'CDR2': 'VITAS---GRTT', 'Fw3': 'LYADSVKGRFTISRDNAKNTVALQMQSLKPEDTAVYYCAA',
               'CDR3': 'DYGTSRYT--------RRQSEYEY', 'Fw4': 'WGQGTQVTVSS'},
    '7ocy_C': {'Fw1': 'QLQLVESGGGLVQAGDTLRLSC', 'CDR1': 'EA-S-RSFN---R', 'Fw2': 'MGWYRQAPGKQRDMVA',
               'CDR2': 'HIFS----DGRT', 'Fw3': 'RYADSVQGRFTISRDNAKNTVYLQMNNLKPEDTAVYYCNG',
               'CDR3': 'FFI------------------QDF', 'Fw4': 'WGQGTPVTVSA'},
    '7cz0_E': {'Fw1': 'QVQLVESGGGLVQAGGSLRLSC', 'CDR1': 'AA-SGFPVDT-QW', 'Fw2': 'MHWYRQAPGKEREWVA',
               'CDR2': 'AISST---GRST', 'Fw3': 'FYADSVKGRFTISRDNAKNTVYLQMNSLKPEDTAVYYCTV',
               'CDR3': 'YVG------------------NRY', 'Fw4': 'RGQGTQVTVSA'},
    '7oan_F': {'Fw1': 'QVQLVESGGGSVQAGGSLTLSC', 'CDR1': 'VA-SGVTLGR-HA', 'Fw2': 'IGWFRQAPGKERERVS',
               'CDR2': 'CIRTF---DGIT', 'Fw3': 'SYVESTKGRFTISSNNAMNTVYLQMNSLKPEDTAVYFCAL',
               'CDR3': 'GVTAAC------------SDNPYF', 'Fw4': 'WGQGTQVTVSS'},
    '7m1h_F': {'Fw1': 'QLQLVESGGGLVQPGGSLRLSC', 'CDR1': 'AA-SGNIFSI-YY', 'Fw2': 'MGWYRQAPGKQREMVA',
               'CDR2': 'IINS----NGIT', 'Fw3': 'NYGDFVKGRFTISRDNAENSAYLQMNNLTPEDTAVYYCNA',
               'CDR3': 'GKLRRTT----------GWGLDDY', 'Fw4': 'WGQGTQVTVSS'},
    '7kd0_C': {'Fw1': 'QVQLVESGGGLVQPGGSLRLSC', 'CDR1': 'VA-SGFTFSS-TP', 'Fw2': 'MNWFRQAPGKEREFVA',
               'CDR2': 'GVGSR---NDIA', 'Fw3': 'YYADSVKGRFTVSRDDAKNTVYLQMNSLKPEDTGVYYCKR',
               'CDR3': 'PAGR---------------IEDEL', 'Fw4': 'WGQGTQVTVSS'},
    '7tpr_D': {'Fw1': 'AVQLVDSGGGSVQAGGSLRLSC', 'CDR1': 'AA-SGYTYSI-CT', 'Fw2': 'MGWYRQAPGEGLEWVS',
               'CDR2': 'GINAD---GSNT', 'Fw3': 'HYTDSVKGRFTISRDNAKKTLYLQMNSLKPEDTAIYYCAA',
               'CDR3': 'HGTYDKYAP-----CGGFAGTYTY', 'Fw4': 'WGQGTQVTVSS'},
    '7z1b_E': {'Fw1': 'QVQLVESGGGLMQAGGSLRLSC', 'CDR1': 'AV-SGRTFST-AA', 'Fw2': 'MGWFRQAPGKEREFVA',
               'CDR2': 'AIRWS---GGSA', 'Fw3': 'YYADSVKGRFTISRDKAKNTVYLQMNSLKYEDTAVYYCAG',
               'CDR3': 'FSATRSLLS------DYATWPYDY', 'Fw4': 'WGQGTQVTVSS'},
    '7nft_C': {'Fw1': 'QVQLQESGGGLVQPGGSLRLSC', 'CDR1': 'AA-SGRTFSM-YA', 'Fw2': 'MAWFRQAPGKQREFVG',
               'CDR2': 'TISRS---GDYA', 'Fw3': 'LHADAVKGRFTISRDNAKNTVYLQMNSLKLEDTAVYYCAA',
               'CDR3': 'GAYHSK-----------DKTLYDY', 'Fw4': 'WGQGTQVTVSS'},
    '7rby_B': {'Fw1': 'DVQLQESGGGLVQPGGSLRLSC', 'CDR1': 'AA-SGLTLDY-YA', 'Fw2': 'IGWFRQAPGKEREGVS',
               'CDR2': 'CISSS---DGST', 'Fw3': 'YYADSVKGRFTTSRDNAKNTVYLQMNSLKPEDTAVYYCAA',
               'CDR3': 'VPSTYYSGTY---YYTCHPGGMDY', 'Fw4': 'WGKGTQVTVSS'},
    '7d4b_B': {'Fw1': 'QVQLVESGGGVVQPGRSLRLSC', 'CDR1': 'AA-SGSTFSI-VA', 'Fw2': 'MGWYRQAPGKQRELVA',
               'CDR2': 'SIITG---DGDT', 'Fw3': 'NYADSVKGRFTISRDNSKNTMYLQMNSLKPEDTAVYYCYA',
               'CDR3': 'RTGYGSSW--------LMGHEYDY', 'Fw4': 'WGQGTQVTVSS'},
    '7e6u_B': {'Fw1': 'QVQLQESGGGSVQAGGSLRLSC', 'CDR1': 'AA-SGFPIST-YD', 'Fw2': 'MGWFRQAPGKEREGVV',
               'CDR2': 'GITD----SFSI', 'Fw3': 'KYEDSVKGRFTISRDNAKNALYLQMNSLKPEDTGMYYCAA',
               'CDR3': 'GDARWSLL--------LRAEQYNY', 'Fw4': 'WGQGTQVTVSS'},
    '7p5w_G': {'Fw1': 'QVQLVESGGGLVQAGGSLRLSC', 'CDR1': 'AA-SGFPVYQ-AW', 'Fw2': 'MWWYRQAPGKEREWVA',
               'CDR2': 'AIESE---GQYT', 'Fw3': 'WYADSVKGRFTISRDNAKNTVYLQMNSLKPEDTAVYYCNV',
               'CDR3': 'KDTGHT------------TNQYDY', 'Fw4': 'WGQGTQVTVSA'},
    '7vfa_D': {'Fw1': 'QVQLQESGGGSVAAGGSLRLSC', 'CDR1': 'AV-SGVTASS-VY', 'Fw2': 'MAWFRQAPGKEREGLA',
               'CDR2': 'GINT----VGYT', 'Fw3': 'TYADSVKGRFTISKDNSENTLYLQMNSLKPEDIALYYCAA',
               'CDR3': 'TYLLRFAS--------LSATNFPY', 'Fw4': 'WGQGTQVTVSS'},
    '7s7r_B': {'Fw1': 'QVQLQESGGGLVQAGGSLRLSC', 'CDR1': 'AA-SGRTFSS-YG', 'Fw2': 'MGWFRQAPGTEREFVA',
               'CDR2': 'AISWS---GDST', 'Fw3': 'YYADSVKGRFTISIDKAKNTVYLQMNSLKPEDTAVYYCAA',
               'CDR3': 'DHALVV------------GGTYNY', 'Fw4': 'WGQGTQVTVSS'},
    '7fau_B': {'Fw1': 'QVQLQESGGGSVQAGGSLRLSC', 'CDR1': 'AA-SGYTVSV-GC', 'Fw2': 'MAWFRQAPGKEREGVA',
               'CDR2': 'GIDA----SGIT', 'Fw3': 'KYSDSVKGRFTISKDNAKNALDLQMNGLKPEDTAMYHCAA',
               'CDR3': 'GLVRGSCTD-----VLDHPSYLGV', 'Fw4': 'WGQGTQVTVSS'},
    '7nfr_B': {'Fw1': 'QVQLQESGGGLVQPGGSLRLSC', 'CDR1': 'AA-SGFTFSN-YY', 'Fw2': 'MSWVRQAPGEGREWVS',
               'CDR2': 'SINRD---GSNT', 'Fw3': 'YYADSVKGRFTIARDNVKNTLYLLMDSLKSDDTAVYYCTK',
               'CDR3': 'GSNW----------------PYDN', 'Fw4': 'WGQGTQVTVSS'},
    '7lzp_C': {'Fw1': 'QLQLVETGGGLVQPGGSLRLAC', 'CDR1': 'VA-SESVFEM-YT', 'Fw2': 'VAWYRQAPGKQRELVA',
               'CDR2': 'GITD----EGRT', 'Fw3': 'NYADFVKGRFTISRDNSKKTVHLQMDNLNPEDTAVYYCKL',
               'CDR3': 'EHDL---------------GYYDY', 'Fw4': 'WGQGTQVTVSS'},
    '7r63_A': {'Fw1': 'QVQLQESGGGLVQTGGSLRLSC', 'CDR1': 'KA-SGRAFAR-YD', 'Fw2': 'LAWSRQAPGKQREFVA',
               'CDR2': 'SIGVT---RNPP', 'Fw3': 'YYSGSVKGRFTVSRDNAKETVYLQMNDLKPEDSAVYYCAA',
               'CDR3': 'KDASVTV---------ATIEDYPY', 'Fw4': 'WGRGTQVTVSS'},
    '7l6v_B': {'Fw1': 'QVQLAESGGGLVQPGGSLRLSC', 'CDR1': 'AA-SGFTFNR-YV', 'Fw2': 'IRWYRQAPGKERELVA',
               'CDR2': 'GISRS---GDSG', 'Fw3': 'RYVDSVKGRFTISRDNDKNMAYLQMSSLKPDDTAVYYCSA',
               'CDR3': 'LNLE----------------DMEY', 'Fw4': 'WGQGTQVTVSS'},
    '7php_N': {'Fw1': 'QRQLVESGGGLVQPGGSLRLSC', 'CDR1': 'AA-SGIIFKI-ND', 'Fw2': 'MGWFRQAPGKEREGVA',
               'CDR2': 'GITS----GGRT', 'Fw3': 'NYADSVKGRFIISRDNVKNTVYLQMNSLEPEDTAVYYCKS',
               'CDR3': 'DGLISYA----------ASQLSTY', 'Fw4': 'WGKGTPVTVSS'},
    '7anq_B': {'Fw1': 'QVKLEESGGGLVQAGGSLRLSC', 'CDR1': 'SP-SDRTFSA-YA', 'Fw2': 'MGWFRQVPGREREFVA',
               'CDR2': 'TIRDS---DASI', 'Fw3': 'YYTDSVKGRFTISRDNAKNTVYLQMNSLIPDDTAVYYCAA',
               'CDR3': 'RQYYSGRVY------STFREEYDY', 'Fw4': 'WGQGTQVTVSS'},
    '7m1h_E': {'Fw1': 'QVQLAETGGGLVQPGGSLRLSC', 'CDR1': 'TA-STTISDF-YS', 'Fw2': 'MGWFRQTPGNQRELVA',
               'CDR2': 'IVRR----GGDT', 'Fw3': 'KSGDSVKGRFTISRDNTRSTVYLQMDNLKPEDTAVYYCYA',
               'CDR3': 'NLQKSS-----------DELGPYY', 'Fw4': 'WGQGTQVTVSS'},
    '7z1c_E': {'Fw1': 'QVQLVESGGGLMQAGGSLRLSC', 'CDR1': 'AV-SGRTFST-AA', 'Fw2': 'MGWFRQAPGKEREFVA',
               'CDR2': 'AIRWS---GGSA', 'Fw3': 'YYADSVKGRFTISRDKAKNTVYLQMNSLKYEDTAVYYCAS',
               'CDR3': 'YQATRSLLS------DYATWPYDY', 'Fw4': 'WGQGTQVTVSS'},
    '7olz_B': {'Fw1': 'QVQLVESGGGLVQAGGSLRLSC', 'CDR1': 'AA-SGRTFSN-DA', 'Fw2': 'LGWFRQAPRKEREFVA',
               'CDR2': 'AINW----NSGT', 'Fw3': 'YYADSVKGRFTISRDNAKNTVYLQMNSLKPEDTAVYSCAA',
               'CDR3': 'ASDYGLP---------REDFLYDY', 'Fw4': 'WGQGTQVTVSS'},
    '7d6y_B': {'Fw1': 'EVQLVESGGGLVQPGGSLRLSS', 'CDR1': 'AI-SGFSISS-TS', 'Fw2': 'IDWVRQAPGKGLEWVA',
               'CDR2': 'RISPS---SGST', 'Fw3': 'SYADSVKGRFTISADTSKNTVYLQMNSLRAEDTAVYYTGR',
               'CDR3': 'VAKDLNSSSPSFVVNTYSSFGFDY', 'Fw4': 'RGQGTLVTVSS'},
    '7r4i_D': {'Fw1': 'QVQLVESGGGSVQAGGSLKLSC', 'CDR1': 'AA-SGYASWARKC', 'Fw2': 'IGWFRQAPGQEREGVA',
               'CDR2': 'AIFDF---DGST', 'Fw3': 'YYSDSVKGRFTISGDNAKNTVSLQMNSLLPKDTAVYYCTV',
               'CDR3': 'AFGTCDN---------WYRGRGDY', 'Fw4': 'WGQGTQVTVSS'},
    '7fat_B': {'Fw1': 'QVQLQESGGGSVQAGGSLRLSC', 'CDR1': 'AA-SGYTFSS-YC', 'Fw2': 'LGWFRQAPGKEREGVA',
               'CDR2': 'AIDS----DGST', 'Fw3': 'SYADSVKGRFTISRDNAKNTLYLQMNSLKPEDTAMYYCAA',
               'CDR3': 'EGGPSLSYCTGGYGFLLSGLMYNS', 'Fw4': 'WGQGTQVTVSS'},
    '7sl9_B': {'Fw1': 'QVQLQESGGGLVQAGGSLRLSC', 'CDR1': 'AA-SGNISTR-AG', 'Fw2': 'MGWYRQAPGKEREFVA',
               'CDR2': 'SINW----GAIT', 'Fw3': 'NYADSVKGRFTISRDNAKNTVYLQMNSLKPEDTAVYYCAV',
               'CDR3': 'EYKYGPQ----------RSDTYYY', 'Fw4': 'WGQGTQVTVSS'},
    '7nqa_C': {'Fw1': 'QVQLVESGGGLAKPGGSLRLSC', 'CDR1': 'VA-TGTFRSM-ED', 'Fw2': 'VGWYRQAPGKDRELVA',
               'CDR2': 'EITT----LGKV', 'Fw3': 'TYADSVKGRFTISRDDAKNAVYLQMSDLKSEDTAVYYCNI',
               'CDR3': 'EADQTKG---------IGYVVYPY', 'Fw4': 'WGQGTRVTVSS'},
    '7te8_A': {'Fw1': 'EVQLQASGGGFVQPGGSLRLSC', 'CDR1': 'AA-SGTTYGQ-TN', 'Fw2': 'MGWFRQAPGKEREFVS',
               'CDR2': 'AISGL--QGRDL', 'Fw3': 'YYADSVKGRFTISRDNSKNTVYLQMNSLRAEDTATYYCAF',
               'CDR3': 'HDFL---------------RMWEY', 'Fw4': 'WGQGTQVTVSS'},
    '7o06_A': {'Fw1': 'QVQLQESGGGLVQPGGSLRLSC', 'CDR1': 'SA-SGSSFSI-NT', 'Fw2': 'MGWYRQALGKQRELVA',
               'CDR2': 'NINS----GGST', 'Fw3': 'NYIDSVKGRFTISRDNAKNMVYLQMNSLKPEDTAVYFCNA',
               'CDR3': 'ARPLRPE----------GGRWLNY', 'Fw4': 'WGQGTQVTVSS'},
    '7p6k_G': {'Fw1': 'QVQLVESGGGLVQAGGSLRLSC', 'CDR1': 'AA-SGFPVAQ-EI', 'Fw2': 'MTWYRQAPGKEREWVA',
               'CDR2': 'AISSI---GDTT', 'Fw3': 'AYADSVKGRFTISRDNAKNTVYLQMNSLKPEDTAVYYCAV',
               'CDR3': 'NVG------------------FTY', 'Fw4': 'KGQGTQVTVSA'},
    '7omn_A': {'Fw1': 'EVQLVESGGGLVQPGRSLRLSC', 'CDR1': 'AA-SGFDIAY-YS', 'Fw2': 'IGWVRRAPGKGEELVA',
               'CDR2': 'RIYPS---SSST', 'Fw3': 'SYADSVKGRFTISADTSKNTAYLQMNSLRAEDTAVYYCAR',
               'CDR3': 'WHYDYADW--------PGGYGMDY', 'Fw4': 'WGQGTLVTVSS'},
    '7te8_C': {'Fw1': 'EVQLQASGGGFVQPGGSLRLSC', 'CDR1': 'AA-SGSTSRQ-YD', 'Fw2': 'MGWFRQAPGKEREFVS',
               'CDR2': 'AISSN--QDQPP', 'Fw3': 'YYADSVKGRFTISRDNSKNTVYLQMNSLRAEDTATYYCAF',
               'CDR3': 'KQHH---------------ANGAY', 'Fw4': 'WGQGTQVTVSS'},
    '7p60_G': {'Fw1': 'QVQLVESGGGSVQAGGSLRLSC', 'CDR1': 'AA-SGYIYQI-EY', 'Fw2': 'LGWFRQAPGKEREGVA',
               'CDR2': 'ALATW---NGQT', 'Fw3': 'YYADSVKGRFTVSLDNAKNTVYLQMNSLKPEDTALYYCAA',
               'CDR3': 'AYEGDTSP--------LYYEEYGY', 'Fw4': 'WGQGTQVTVSA'},
    '7ar0_B': {'Fw1': 'QVQLQESGGGLVQAGGSLRLSC', 'CDR1': 'AA-SERTFSS-LG', 'Fw2': 'MGWFRQGPGKEREFAA',
               'CDR2': 'AISWS---GVST', 'Fw3': 'YYADSVKGRFTISRDNDKNTVYLQMNSLKPDDTAVYYCAA',
               'CDR3': 'TSSWNDMA-------LKSAGWYEY', 'Fw4': 'WGQGTQVTVSS'},
    '7l6v_C': {'Fw1': 'QVQLVETGGGLVQAGGSLRLSC', 'CDR1': 'TA-SGADFSF-YA', 'Fw2': 'MGWYRQTPGNSRELVA',
               'CDR2': 'VMNL----NGVI', 'Fw3': 'SYGDSARGRFDISRDGTKNIVFLQMNSLKPEDTGVYYCNG',
               'CDR3': 'MRLYTRG----------SVRHPES', 'Fw4': 'WGQGIQVTVSS'},
    '7qbg_E': {'Fw1': 'QRQLVESGGGLVQPGGSLRLSC', 'CDR1': 'AA-SGFTPGI-YD', 'Fw2': 'IGWFRQAPGKEREGVS',
               'CDR2': 'CISSR---GSST', 'Fw3': 'NYADSVKGRFIISRDNVKNTVYLQMNSLEPEDTAVYYCAA',
               'CDR3': 'IYQPSNGC--------VLRPEYSY', 'Fw4': 'WGKGTPVTVSS'},
    '7qbe_E': {'Fw1': 'QLQLVESGGGLVQAGGSLRLSC', 'CDR1': 'TA-SGRTG----T', 'Fw2': 'MGWFRQGPGKEREFVA',
               'CDR2': 'SHKWV---AGST', 'Fw3': 'YYADSVKGRFTISRDNAKNTLYLQMNSLKSEDTAVYYCAA',
               'CDR3': 'SSQIFYGA-------TTSIKDFNS', 'Fw4': 'WGKGTRVTVSS'},
    '7t5f_B': {'Fw1': 'QVQLVESGGGLVQPGGSLRLSC', 'CDR1': 'AA-SGFPFHA-YY', 'Fw2': 'MSWVRQAPGKGLEWVS',
               'CDR2': 'HIGNG---GIIT', 'Fw3': 'RYADSVKGRFTISRDNAKNTLYLQMTNLKPEDTALYYCTL',
               'CDR3': 'GTRD---------------DLGPE', 'Fw4': 'RGQGTQVTVSS'},
    '7lzp_B': {'Fw1': 'QVQLVETGGGLVQAGDSLTLSC', 'CDR1': 'AA-TGRTLDY-YA', 'Fw2': 'LGWFRQVPGNKREFVA',
               'CDR2': 'AINWL---GGST', 'Fw3': 'YYADSVRGRFTLSRDNSKSTLYLNMNNLIPDDTAVYYCAA',
               'CDR3': 'DFSIAYSGTY---PPAYAEYDYDY', 'Fw4': 'WGQGTQVTVSS'},
    '7na9_D': {'Fw1': 'QVQLVESGGGLVQTGGSLRLSC', 'CDR1': 'AA-SGRTFRR-NT', 'Fw2': 'MGWFRQAPGKVREFVA',
               'CDR2': 'AISWS---GDRT', 'Fw3': 'YCADSVKGRFTISRDNAKNTVDLLMNSLKPEDTAIYYCAA',
               'CDR3': 'DGTASVFNSY----ASADRNKYNY', 'Fw4': 'WGQGTQVTVSS'},
    '7s2r_B': {'Fw1': 'QVQLQESGGGSVQAGGSLRLSC', 'CDR1': 'AA-SGYTYRD-YY', 'Fw2': 'MGWFRQAPGREREGVA',
               'CDR2': 'SIYTRGSREGST', 'Fw3': 'RYSSSVEGRFTITLDTAKNTLYLQMNSLKPEDTAMYYCAA',
               'CDR3': 'DDRTWLPRVQ---LGGPRENEYNY', 'Fw4': 'WGQGTQVTVSS'},
    '7b2p_D': {'Fw1': 'EVQLVESGGGLVQPGGSLRLSC', 'CDR1': 'AA-SGFTFSS-YH', 'Fw2': 'MSWVRQAPGKGLEWIS',
               'CDR2': 'VINDS---GDLT', 'Fw3': 'RYADSVKGRFTISRDNAKNTLYLQMNSLQPEDTAVYSCLK',
               'CDR3': 'SSDFYS------------YSNADS', 'Fw4': 'RGQGTQVTVSS'},
    '7r74_B': {'Fw1': 'AVQLVDSGGGLVQAGGSLRLSC', 'CDR1': 'VV-SGSIFSI-NA', 'Fw2': 'MGWYRQAPGKQRDLVA',
               'CDR2': 'RISG----DSST', 'Fw3': 'YYIDSVKGRFTISRDNAANTVYLQMNSLKPEDTAVYYCAA',
               'CDR3': 'RRLPI--------------GDYTD', 'Fw4': 'WGQGTQVTVSS'},
    '7p5y_G': {'Fw1': 'QVQLVESGGGLVQAGGSLRLSC', 'CDR1': 'AA-SGFPVMN-AG', 'Fw2': 'MYWYRQAPGKEREWVA',
               'CDR2': 'AIESE---GTST', 'Fw3': 'YYADSVKGRFTISRDNAKNTVYLQMNSLKPEDTAVYYCNV',
               'CDR3': 'KDVGDN------------HFPYDY', 'Fw4': 'WGQGTQVTVSA'},
    '7qiv_C': {'Fw1': 'QVQLVETGGGLVQAGGSLRLSC', 'CDR1': 'AA-SGSIFSI-NA', 'Fw2': 'MGWFRQAPGKEREFVA',
               'CDR2': 'TINRS---GGRT', 'Fw3': 'YYADSVKGRFTISRDNGKNMVYLQMHSLKPEDTAIYYCAA',
               'CDR3': 'GTGWSPQ----------TDNEYNY', 'Fw4': 'WGQGTQVTVSS'},
    '7r98_D': {'Fw1': 'EVQLQASGGGLVQAGDSLRLSC', 'CDR1': 'VAVSGRTIST-FA', 'Fw2': 'MGWFRQAPGKEREFVA',
               'CDR2': 'TINWS---GSSA', 'Fw3': 'RYADPVEGRFTISRDDAKNTVYLEMSSLKPGDSAVYYCAS',
               'CDR3': 'GRYLGGIT-------SYSQGDFAP', 'Fw4': 'WGQGTQVTVSS'},
    '7r20_B': {'Fw1': 'EVQLQESGGGLVQAGGSLRLSC', 'CDR1': 'AA-SGSIFSG-NA', 'Fw2': 'MGWYRQAPGKQREVVA',
               'CDR2': 'VISA----GNSS', 'Fw3': 'NYVDSVKGRFTISRDNAKNTVYLQMNSLKPEDTAVYYCNV',
               'CDR3': 'VKRGP--------------QWGME', 'Fw4': 'WGKGTLVTVSS'},
    '7czd_A': {'Fw1': 'EVQLQESGGGLVQPGGSLRLSC', 'CDR1': 'AA-SGFTFSS-YW', 'Fw2': 'MYWLRQAPGKGLEWVS',
               'CDR2': 'SINSD---SSST', 'Fw3': 'YYRDSVKGRFTISRDNAKNTLYLQMNSLKSEDTAVYYCAK',
               'CDR3': 'DPG------------------GYA', 'Fw4': 'KGQGTQVTVSS'},
    '7tpr_F': {'Fw1': 'QVQLVESGGGSVQPGGSLRLSC', 'CDR1': 'VV-SGYTSSS-RY', 'Fw2': 'MGWFRQVPGKGLEWVS',
               'CDR2': 'GIKRD---GTNT', 'Fw3': 'YYADSVKGRFTISQDNAKNTVYLQMNSLKPEDTAMYYCAA',
               'CDR3': 'GSWYNQ-----------WGYSMDY', 'Fw4': 'WGKGTQVTVSS'},
    '7e53_B': {'Fw1': 'QVQLQESGGGSVQAGGSLRLSC', 'CDR1': 'AA-SGPTYSS-YF', 'Fw2': 'MAWFRQAPGMEREGVA',
               'CDR2': 'ASSYD---GSTT', 'Fw3': 'LYADSVKGRFTISQGNAKNTKFLLLNNLEPEDTAIYYCAL',
               'CDR3': 'RRRGWSNTS-----GWKQPGWYDY', 'Fw4': 'WGQGTQVTVSS'},
    '7f5g_B': {'Fw1': 'QVQLQESGGGLVQPGGSLRLSC', 'CDR1': 'AA-SGSFFEF-GT', 'Fw2': 'VGWFRQAPGKQRELVS',
               'CDR2': 'RITG----NDHR', 'Fw3': 'YYADSVKGRFTISRDNDETTVYLQMDSLKPEDTAIYHCNI',
               'CDR3': 'LEGQ---------------RWSNY', 'Fw4': 'WGQGTQVTVSA'},
    '7zfb_M': {'Fw1': 'QVQLVESGGGLVQPGGSLRLSC', 'CDR1': 'AA-SGFTNDF-YS', 'Fw2': 'IAWFRQAPGKEREGVS',
               'CDR2': 'WLSVS---DNTP', 'Fw3': 'TYVDSVKDRFTISRHNANNTVYLQMNMLKPEDTAIYYCAA',
               'CDR3': 'GRFAGRD---------TWPSSYDY', 'Fw4': 'WGQGTQVTVSS'},
    '7ndf_C': {'Fw1': 'QMQLVESGGGLVQAGGSLRLSC', 'CDR1': 'AV-SGSIFSI-IT', 'Fw2': 'LAWYRQAPGKPRENVA',
               'CDR2': 'TITR----GSRT', 'Fw3': 'SYADSVKGRFTISKDNAKSTVYLQMNKLKPEDTADYYCNA',
               'CDR3': 'EGP------------------AGY', 'Fw4': 'WGQGTPVTVS'},
    '7t5f_C': {'Fw1': 'QVQLVESGGGLVQSGGSLRLSC', 'CDR1': 'AA-SGSIDSL-YH', 'Fw2': 'MGWYRQAPGKERELVA',
               'CDR2': 'RVQD----GGST', 'Fw3': 'AYKDSVKGRFTISRDFSRSTMYLQMNSLKPEDTAIYYCAA',
               'CDR3': 'KSTI---------------STPLS', 'Fw4': 'WGQGTQVTVSS'},
    '7ngh_D': {'Fw1': 'QVQLVESGGGLVQAGGSLRLSC', 'CDR1': 'AA-SGFPVDS-QF', 'Fw2': 'MHWYRQAPGKEREWVA',
               'CDR2': 'AIESY---GDET', 'Fw3': 'YYADSVKGRFTISRDNAKNTVYLQMNSLKPEDTAVYYCRV',
               'CDR3': 'LVG------------------WGY', 'Fw4': 'YGQGTQVTVSA'},
    '7rnn_C': {'Fw1': 'QVQLVESGGGLVQPRGSLRLSC', 'CDR1': 'AA-SGFTFSR-AA', 'Fw2': 'MSWYRQAPGKEREMVS',
               'CDR2': 'TIGSF---GVST', 'Fw3': 'NYSDSVKGRFTISRDNAKNTVYLHMNSLKPEDTAVYYCNA',
               'CDR3': 'RYR-----------------SSYP', 'Fw4': 'WGQGTQVTVSS'},
    '7z1a_E': {'Fw1': 'QVQLVESGGGLMQAGGSLRLSC', 'CDR1': 'AV-SGRTFST-AA', 'Fw2': 'MGWFRQAPGKEREFVA',
               'CDR2': 'AIRWS---GGSA', 'Fw3': 'YYADSVKGRFTISRDKAKNTVYLQMNSLKYEDTAVYYCAQ',
               'CDR3': 'TRVTRSLLS------DYATWPYDY', 'Fw4': 'WGQGTQVTVSS'},
    '7apj_B': {'Fw1': 'QVQLVESGGGLVQAGGSLRLSC', 'CDR1': 'AA-SGIDVRI-KT', 'Fw2': 'MAWYRQAPGKQRELLA',
               'CDR2': 'SVLV----SGST', 'Fw3': 'NYADPVKGRFTISRDNAKNTVYLQMNKLIPDDTAVYYCNT',
               'CDR3': 'YGRL----------------RRDV', 'Fw4': 'WGPGTQVTVSS'},
    '7lzp_G': {'Fw1': 'QVQLVETGGALVQPGQSLTLSC', 'CDR1': 'TT-SENVFGI-YG', 'Fw2': 'MAWLRQAPGRQRELVA',
               'CDR2': 'SITS----RGTA', 'Fw3': 'HYHDSVKGRFTISRESGKTTAYLQTTSVNPEDTAIYYCNS',
               'CDR3': 'G---------------------PY', 'Fw4': 'WGQGTQVTVSS'},
    '7nfq_C': {'Fw1': 'QVQLQESGGGLVQPGGSLRLSC', 'CDR1': 'AA-SGFTFKM-YA', 'Fw2': 'MSWVRQAPGKGLEWVS',
               'CDR2': 'SINSA---GGST', 'Fw3': 'SYVDSVKGRFTISRDNAKNTLYLQMNSLKPDDTAVYYCVQ',
               'CDR3': 'GRNW----------------PYDY', 'Fw4': 'RGQGTQVTVSS'},
    '7ooi_A': {'Fw1': 'EVQLVESGGGLVQPGRSLRLSC', 'CDR1': 'AA-SGFDIDY-YS', 'Fw2': 'IGWVRRAPGKGEELVA',
               'CDR2': 'RIYPS---SSST', 'Fw3': 'SYADSVKGRFTISADTSKNTAYLQMNSLRAEDTAVYYCAR',
               'CDR3': 'WHWAYSAW--------PGWYGMDY', 'Fw4': 'WGQGTLVTVSS'},
    '7nqk_B': {'Fw1': 'QVQLVESGGGLVQPGGSLRLLC', 'CDR1': 'VA-SGRPFND-YD', 'Fw2': 'MGWFRQAPGKEREFVA',
               'CDR2': 'SISWS---GRVT', 'Fw3': 'DYSDSMKGRCTVSRDNAKGTMFLQMSNLVPRDTAVYYCAA',
               'CDR3': 'ARRRWTFKA------TNTEEFYET', 'Fw4': 'WGQGTQVTVSS'},
    '7djx_A': {'Fw1': 'QVQLVESGGGLVQAGGSLRLSC', 'CDR1': 'AA-SGRTFSS-YA', 'Fw2': 'MGWFRQAPGKERECVA',
               'CDR2': 'AMDWS---TSAT', 'Fw3': 'YYADSVKGRFTISRDNAKNTVYLQMNSLKPEDTAVYYCAA',
               'CDR3': 'DLDYSDYG--------PFPGDMDY', 'Fw4': 'WGKGTQVTVSS'},
    '7pqg_B': {'Fw1': 'QVQLVESGGGLVQAGGSLRLSC', 'CDR1': 'AV-SGRTTAN-YN', 'Fw2': 'MGWFRQAPGKEREFVA',
               'CDR2': 'GIKWS---SGST', 'Fw3': 'YVADSAKGRFTISRDNAKNSVYLQMDSLKPEDTALYYCAA',
               'CDR3': 'NYYGVSWF-------LISPSSYDY', 'Fw4': 'WGQGTQVTVSS'},
    '7djy_A': {'Fw1': 'QVQLVESGGGLVQAGGSLRLSC', 'CDR1': 'AA-SGATFIT-YG', 'Fw2': 'MTWFRQAPGKEREFVA',
               'CDR2': 'AVTGN---GAGT', 'Fw3': 'TYLPSVKGRFTISRDNAKNTVYLQMSSLKPEDTAVYYCGG',
               'CDR3': 'RRWVPAT----------AVDQVAY', 'Fw4': 'WGQGTQVTVSS'},
    '7vfb_B': {'Fw1': 'QVQLQESGGGSVQAGGSLRLSC', 'CDR1': 'AV-SGYTYSS-KC', 'Fw2': 'LGWFRQAPGKEREGIA',
               'CDR2': 'TIYTG---GGST', 'Fw3': 'YYVDSVKGRFTISQDNAKNTVALQMNSLKPEDTAMYYCAA',
               'CDR3': 'SGAIAGIRL-----CLPGHTFYTY', 'Fw4': 'WGQGTQVTVSS'},
    '7r4r_D': {'Fw1': 'QVQLVESGGGSVQAGGSLRLSC', 'CDR1': 'AA-SGYTYST-CR', 'Fw2': 'KGWYRQAPGKERELVA',
               'CDR2': 'SITA----DGAT', 'Fw3': 'YYLDSVKGRLTISQDNAKNTVYLQMNSLKPEDTAVYYCAA',
               'CDR3': 'SVKDF-------------TCTFNS', 'Fw4': 'WGQGTQVTVSS'},
    '7r4q_D': {'Fw1': 'QVQLVESGGGSVQAGGSLRLSC', 'CDR1': 'AA-SGYTINT-DA', 'Fw2': 'VAWFRQAPGKGDERVA',
               'CDR2': 'VIYTG---SGNT', 'Fw3': 'NYADSVKGRFTISQDNAKNTVYLQMNSLKPEDTALYYCAS',
               'CDR3': 'GYYGAS------------GYDFNN', 'Fw4': 'WGQGTQVTVSS'}
}

In [20]:
frag_pos = {'CDR1': [23, 36], 'CDR2': [52, 63], 'CDR3': [102, 127]}
models_df['CDR3 length'] = [len(aho_align[x]['CDR3'].replace('-', '')) for x in models_df.index]
models_df.to_csv('models_metrics_origin.csv')
models_df

Unnamed: 0,model,Number of residues in common,global rmsd,TM-score,MaxSub-score,GDT-TS-score,GDT-TS-score %(d<1),GDT-TS-score %(d<2),GDT-TS-score %(d<4),GDT-TS-score %(d<8),GDT-HA-score,GDT-HA-score %(d<0.5),GDT-HA-score %(d<1),GDT-HA-score %(d<2),GDT-HA-score %(d<4),TM-score Superposition Length(d<5.0),CDR3 length
7d8b_B,IF,132,6.074,0.8480,0.8231,0.8409,0.7500,0.8409,0.8712,0.9015,0.7557,0.5606,0.7500,0.8409,0.8712,113,24
7d8b_B,ESMem,132,3.830,0.8837,0.8538,0.8731,0.7576,0.8864,0.9091,0.9394,0.7689,0.5227,0.7576,0.8864,0.9091,119,24
7d8b_B,AF2m,132,3.054,0.8869,0.8523,0.8920,0.8030,0.8788,0.8939,0.9924,0.8049,0.6439,0.8030,0.8788,0.8939,118,24
7d8b_B,ESM,132,3.664,0.8940,0.8695,0.8883,0.8030,0.8939,0.9091,0.9470,0.8087,0.6288,0.8030,0.8939,0.9091,119,24
7d8b_B,AF2r10,132,3.734,0.8878,0.8608,0.8788,0.7879,0.8712,0.9091,0.9470,0.7936,0.6061,0.7879,0.8712,0.9091,120,24
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7z1b_E,DAem,127,2.629,0.8805,0.8339,0.8740,0.7402,0.8346,0.9213,1.0000,0.7323,0.4331,0.7402,0.8346,0.9213,117,18
7z1b_E,AF2r10,127,1.560,0.9251,0.8880,0.9154,0.7953,0.8898,0.9764,1.0000,0.8209,0.6220,0.7953,0.8898,0.9764,123,18
7z1b_E,OFr4,127,1.766,0.9220,0.8813,0.9252,0.8504,0.8898,0.9606,1.0000,0.8425,0.6693,0.8504,0.8898,0.9606,119,18
7z1b_E,AF2,127,1.455,0.9304,0.8930,0.9232,0.8031,0.8976,0.9921,1.0000,0.8327,0.6378,0.8031,0.8976,0.9921,125,18


In [18]:
import itertools
seq_length = 138

conf_pos = {}
atoms_dist = {}
for pdb in tqdm(aho_align):
    len_seq_res = len(''.join(aho_align[pdb].values()).replace('-', '').strip())
    pdb_ref = parmed.load_file(aligned_results_folder.joinpath(f'{pdb}/{pdb}.pdb').as_posix())
    atoms_dist[pdb] = {}
    conf_pos[pdb] = {}
    for m in models_df['model'].unique():
        if not aligned_results_folder.joinpath(f'{pdb}/{m}/{pdb}_{m}_sup.pdb').exists():
            continue
        pdbm = parmed.load_file(aligned_results_folder.joinpath(f'{pdb}/{m}/{pdb}_{m}_sup.pdb').as_posix())
        atoms_dist[pdb][m] = {}
        if m in ['AF2', 'AF2r10', 'AF2m', 'OF', 'OFr4', 'OFr20', 'IF', 'YS', 'ESM']:
            conf_pos[pdb][m] = {}
        for r in pdbm.residues[:-1]:
            for r1 in pdb_ref.residues:
                if r.number == r1.number and r.name == r1.name:
                    atoms_dist[pdb][m][r] = {}
                    if m in ['AF2', 'AF2r10', 'AF2m', 'OF', 'OFr4', 'OFr20', 'IF', 'YS', 'ESM']:
                      conf_pos[pdb][m][r] = r.atoms[0].bfactor * 100 if m == 'ESM' else r.atoms[0].bfactor
                    for at, at1 in itertools.product(r1.atoms, r.atoms):
                        if 'H' in at.name:
                            continue
                        if at.name == at1.name:
                            d = [round(at.xx - at1.xx, 3), round(at.xy - at1.xy, 3), round(at.xz - at1.xz, 3)]
                            atoms_dist[pdb][m][r][at] = d
                    break


75


  0%|          | 0/75 [00:00<?, ?it/s]

7n9c_D
7d8b_B
7l6v_D
7p5v_G
6zwk_A
7ocy_C
7cz0_E
7oan_F
7m1h_F
7kd0_C
7tpr_D
7z1b_E
7nft_C
7rby_B
7d4b_B
7e6u_B
7p5w_G
7vfa_D
7s7r_B
7fau_B
7nfr_B
7lzp_C
7r63_A
7l6v_B
7php_N
7anq_B
7m1h_E
7z1c_E
7olz_B
7d6y_B
7r4i_D
7fat_B
7sl9_B
7nqa_C
7te8_A
7o06_A
7p6k_G
7omn_A
7te8_C
7p60_G
7ar0_B
7l6v_C
7qbg_E
7qbe_E
7t5f_B
7lzp_B
7na9_D
7s2r_B
7b2p_D
7r74_B
7p5y_G
7qiv_C
7r98_D
7r20_B
7czd_A
7tpr_F
7e53_B
7f5g_B
7zfb_M
7ndf_C
7t5f_C
7ngh_D
7rnn_C
7z1a_E
7apj_B
7lzp_G
7nfq_C
7ooi_A
7nqk_B
7djx_A
7pqg_B
7djy_A
7vfb_B
7r4r_D
7r4q_D


In [21]:
def get_confidence():
    confidence = {}
    for pdb, v in tqdm(conf_pos.items()):
        curr_seq = ''.join(aho_align[pdb].values())
        for m, v1 in v.items():
            confidence[(pdb, m)] = [None] * seq_length
            c = 0
            for i, (r, v2) in enumerate(v1.items(), start=list(v1.keys())[0].number -1):
                if i+c >= len(curr_seq):
                    continue
                while curr_seq[i + c] == '-':
                    c += 1
                # if pdb == '7d8b_B' and m == 'of':
                #     print(curr_seq)
                #     print(pdb, c, i,curr_seq[i], r.number, curr_seq[i + c], residues_code[r.name],
                #           curr_seq[i + c] == residues_code[r.name], v2)
                if curr_seq[i + c] == residues_code[r.name]:
                    confidence[(pdb, m)][i + c] = v2
                else:
                    print(pdb, m, r.number, curr_seq[i + c], residues_code[r.name], v2)
    return pd.DataFrame(confidence, index=range(1, 139)).T.rename_axis(['PDB', 'model']).reset_index()
conf_pos_df = get_confidence()
conf_pos_df.to_csv('confidence.csv')
conf_pos_df

  0%|          | 0/75 [00:00<?, ?it/s]

Unnamed: 0,PDB,model,1,2,3,4,5,6,7,8,...,129,130,131,132,133,134,135,136,137,138
0,7n9c_D,IF,2.02,0.54,0.25,0.14,0.14,0.13,0.16,0.22,...,0.25,0.26,0.22,0.21,0.21,0.18,0.19,0.24,0.46,
1,7n9c_D,ESM,76.00,82.00,88.00,90.00,93.00,95.00,95.00,95.00,...,90.00,92.00,93.00,95.00,95.00,96.00,95.00,95.00,92.00,
2,7n9c_D,AF2r10,80.80,88.17,94.86,97.31,97.96,98.44,98.22,97.45,...,96.65,96.70,97.93,98.50,98.24,98.63,98.45,98.27,96.13,
3,7n9c_D,OF,76.94,83.96,91.30,93.86,95.41,95.58,96.15,95.06,...,94.21,95.60,96.02,97.07,97.62,97.57,97.53,97.42,95.90,
4,7n9c_D,AF2,78.50,86.64,93.97,96.61,97.69,98.37,98.10,96.99,...,95.99,96.14,97.76,98.45,98.18,98.57,98.37,98.12,95.63,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
636,7r4q_D,OF,83.75,88.99,94.28,96.47,97.32,97.67,97.69,97.25,...,95.64,96.05,96.82,98.15,98.24,98.40,98.32,98.29,97.34,
637,7r4q_D,AF2,80.19,85.64,92.85,94.88,96.86,98.05,97.89,96.09,...,94.98,95.21,97.15,98.28,97.87,98.34,98.15,97.92,96.21,
638,7r4q_D,OFr20,83.28,89.16,94.85,96.71,97.41,97.66,97.64,97.18,...,96.17,96.49,96.92,98.13,98.19,98.38,98.29,98.26,97.24,
639,7r4q_D,YS,78.86,83.45,89.30,90.40,91.30,90.30,91.55,90.90,...,86.23,88.90,89.26,90.33,92.10,92.24,93.80,92.72,92.10,


In [22]:
def get_rmsd_pos():
    dist_pos = {}
    for pdb, v in tqdm(atoms_dist.items()):
        curr_seq = ''.join(aho_align[pdb].values())
        for m, v1 in v.items():
            dist_pos[(pdb, m, 'ca')] = [None] * seq_length
            dist_pos[(pdb, m, 'bb')] = [None] * seq_length
            dist_pos[(pdb, m, 'all')] = [None] * seq_length
            c = 0
            for i, (res, v2) in enumerate(v1.items(), start=list(v1.keys())[0].number -1):
                if i+c >= len(curr_seq):
                    continue
                while curr_seq[i + c] == '-':
                    c += 1
                # if pdb == '7sl9_B' and m == 'of':
                #     print(res.chain, res.name, res.number, curr_seq[i + c], i+c,
                #           get_rmsd([v2[x] for x in v2 if x.name == 'CA']),
                #           )
                #     if  i+c == 49:
                #         print(get_rmsd(list(v2.values())), list(v2.keys()))
                dist_pos[(pdb, m, 'ca')][i+c] = get_rmsd([v2[x] for x in v2 if x.name == 'CA'])
                dist_pos[(pdb, m, 'bb')][i+c] = get_rmsd([v2[x] for x in v2 if x.name in bb_atoms])
                dist_pos[(pdb, m, 'all')][i+c] = get_rmsd(list(v2.values()))
    return pd.DataFrame(dist_pos, index=range(1, 139)).T.rename_axis(['PDB', 'model', 'rmsd atoms']).reset_index()
f = get_rmsd_pos()
f.to_csv('rmsd_by_pos.csv')
f

  0%|          | 0/75 [00:00<?, ?it/s]

Unnamed: 0,PDB,model,rmsd atoms,1,2,3,4,5,6,7,...,129,130,131,132,133,134,135,136,137,138
0,7n9c_D,IF,ca,3.1,2.6,2.3,0.6,0.3,0.3,0.4,...,0.5,1.0,1.2,1.9,1.5,1.3,1.3,0.8,1.9,
1,7n9c_D,IF,bb,2.9,2.6,2.2,0.8,0.3,0.3,0.6,...,0.6,1.0,1.4,1.8,1.3,1.4,1.0,1.4,2.1,
2,7n9c_D,IF,all,4.2,2.6,2.2,1.6,0.4,0.6,1.1,...,0.6,2.8,1.4,2.0,4.1,1.7,1.7,1.3,2.1,
3,7n9c_D,ESMem,ca,5.9,3.9,2.8,0.9,0.6,0.8,0.5,...,0.6,0.9,1.0,1.6,1.3,1.2,1.5,1.1,2.5,
4,7n9c_D,ESMem,bb,5.4,3.9,2.7,1.0,0.6,0.7,0.7,...,0.7,0.9,1.1,1.5,1.1,1.4,1.2,1.6,2.8,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4741,7r4q_D,OFr4,bb,3.3,1.7,0.7,0.5,0.6,0.5,0.4,...,1.7,1.4,0.6,0.3,0.3,0.2,0.4,0.4,1.1,
4742,7r4q_D,OFr4,all,7.4,2.0,2.1,1.1,0.5,1.1,1.1,...,1.7,1.5,0.6,1.3,0.4,1.3,0.5,0.4,1.2,
4743,7r4q_D,DA,ca,2.1,1.3,0.7,0.6,0.7,0.9,0.6,...,2.1,1.5,0.5,0.4,0.2,0.2,0.4,0.5,0.9,
4744,7r4q_D,DA,bb,2.4,1.3,0.7,0.6,0.8,0.9,0.6,...,1.7,1.4,0.6,0.4,0.3,0.2,0.3,0.6,1.0,


In [24]:
def get_rmsd_segm():
    seq_rmsd = {}
    seq_d = {}
    for pdb, v in tqdm(atoms_dist.items()):
        curr_seq = ''.join(aho_align[pdb].values())
        curr_seq_index = {
            'CDR1': [''.join(aho_align[pdb].values()).index(aho_align[pdb]['CDR1']), ''.join(aho_align[pdb].values()).index(aho_align[pdb]['CDR1']) + len(aho_align[pdb]['CDR1'])],
            'CDR2': [''.join(aho_align[pdb].values()).index(aho_align[pdb]['CDR2']), ''.join(aho_align[pdb].values()).index(aho_align[pdb]['CDR2']) + len(aho_align[pdb]['CDR2'])],
            'CDR3': [''.join(aho_align[pdb].values()).index(aho_align[pdb]['CDR3']), ''.join(aho_align[pdb].values()).index(aho_align[pdb]['CDR3']) + len(aho_align[pdb]['CDR3'])]
            }

        for m, v1 in v.items():
            for s in ['CDR1', 'CDR2', 'CDR3', 'Fw', 'Global']:
                for a in ['ca', 'bb', 'all']:
                    seq_d[(pdb, m, s, a)] = []

            c = 0
            for i, (res, v2) in enumerate(v1.items(), start=list(v1.keys())[0].number -1):
                if i+c >= len(curr_seq):
                    continue
                while curr_seq[i + c] == '-':
                    c += 1
                if i + c in range(*curr_seq_index['CDR1']):
                    seq_d[(pdb, m, 'CDR1', 'ca')].extend([v2[x] for x in v2 if x.name == 'CA'])
                    seq_d[(pdb, m, 'CDR1', 'bb')].extend([v2[x] for x in v2 if x.name in bb_atoms])
                    seq_d[(pdb, m, 'CDR1', 'all')].extend(list(v2.values()))
                elif i + c in range(*curr_seq_index['CDR2']):
                    seq_d[(pdb, m, 'CDR2', 'ca')].extend([v2[x] for x in v2 if x.name == 'CA'])
                    seq_d[(pdb, m, 'CDR2', 'bb')].extend([v2[x] for x in v2 if x.name in bb_atoms])
                    seq_d[(pdb, m, 'CDR2', 'all')].extend(list(v2.values()))
                elif i + c in range(*curr_seq_index['CDR3']):
                    seq_d[(pdb, m, 'CDR3', 'ca')].extend([v2[x] for x in v2 if x.name == 'CA'])
                    seq_d[(pdb, m, 'CDR3', 'bb')].extend([v2[x] for x in v2 if x.name in bb_atoms])
                    seq_d[(pdb, m, 'CDR3', 'all')].extend(list(v2.values()))
                else:
                    seq_d[(pdb, m, 'Fw', 'ca')].extend([v2[x] for x in v2 if x.name == 'CA'])
                    seq_d[(pdb, m, 'Fw', 'bb')].extend([v2[x] for x in v2 if x.name in bb_atoms])
                    seq_d[(pdb, m, 'Fw', 'all')].extend(list(v2.values()))
                seq_d[(pdb, m, 'Global', 'ca')].extend([v2[x] for x in v2 if x.name == 'CA'])
                seq_d[(pdb, m, 'Global', 'bb')].extend([v2[x] for x in v2 if x.name in bb_atoms])
                seq_d[(pdb, m, 'Global', 'all')].extend(list(v2.values()))

            for s in ['Fw', 'CDR1', 'CDR2', 'CDR3', 'Global']:
                for a in ['ca', 'bb', 'all']:
                    seq_rmsd[(pdb, m, s, a)] = get_rmsd(seq_d[(pdb, m, s, a)])
        # print(curr_seq_index)
    # return pd.DataFrame(seq_rmsd, index=range(1, 139)).T.rename_axis(['PDB', 'model', 'rmsd atoms']).reset_index()
    return pd.DataFrame(seq_rmsd, index={'RMSD': [0]}).T.rename_axis(['PDB', 'model', 'Fragment', 'rmsd atoms'])\
        .reset_index()
f = get_rmsd_segm()
f.to_csv('rmsd_by_region.csv')
f

  0%|          | 0/75 [00:00<?, ?it/s]

Unnamed: 0,PDB,model,Fragment,rmsd atoms,RMSD
0,7n9c_D,IF,Fw,ca,1.3
1,7n9c_D,IF,Fw,bb,1.3
2,7n9c_D,IF,Fw,all,2.0
3,7n9c_D,IF,CDR1,ca,2.0
4,7n9c_D,IF,CDR1,bb,2.1
...,...,...,...,...,...
23725,7r4q_D,DA,CDR3,bb,6.8
23726,7r4q_D,DA,CDR3,all,7.1
23727,7r4q_D,DA,Global,ca,2.5
23728,7r4q_D,DA,Global,bb,2.5
