In [1]:
import os
from pathlib import Path

import matplotlib as mpl
import matplotlib.pyplot as plt
import mdtraj as md
import numpy as np
import pandas as pd
import seaborn as sns

import functions as funcs

In [2]:
# PDB file (inside the comparison directory) that serves as the reference
# structure for each protein label.
prot_pdbs = {
    'Chignolin': '5awl.pdb',
    'BBA': '1fme.pdb',
    'Trp-cage': '2jof.pdb',
    'Villin': '2f4k.pdb',
    'WW-domain': '2f21.pdb',
    'BBL': '2wxc.pdb',
    'Homeodomain': '2p6j.pdb',
    'Protein-B': '1prb.pdb',
}
# Protein label -> directory name, paired positionally from the two lists in `funcs`.
prot_dict = {label: folder for label, folder in zip(funcs.PROTEIN_LABELS, funcs.PROTEIN_DIRS)}

# Get WW-domain structure

In [3]:
# 2f21         MADEEKLPPGWEKRMSADGRVYYFNHITNASQWERPSGQGEPARVRCSHLLVKHSQSRRPSSW
# system       G---SKLPPGWEKRMSRDGRVYYFN?ITGTTQFERPSG-------------------------

In [4]:
# Gapped alignment of the 2f21 sequence (ref) against the simulated system (tar).
# '-' marks a gap; '?' is treated below as matching any residue.
ref = list('MADEEKLPPGWEKRMSADGRVYYFNHITNASQWERPSGQGEPARVRCSHLLVKHSQSRRPSSW')
tar = list("G---SKLPPGWEKRMSRDGRVYYFN?ITGTTQFERPSG-------------------------")
assert len(ref) == len(tar), "aligned sequences must have the same number of columns"
# Characters that count as a residue (i.e. advance the residue counter).
# assumes funcs.LETTERS holds lowercase letters, hence the .lower() below -- verify
letters = funcs.LETTERS + ['?']

# Walk the alignment column by column while tracking the 1-based residue
# number in each sequence; keep the residue numbers of the columns that agree.
ref_ix = []
tar_ix = []
ref_count = 1
tar_count = 1
for ref_res, tar_res in zip(ref, tar):

    if (ref_res == tar_res) or (ref_res == '?') or (tar_res == '?'):
        ref_ix.append(ref_count)
        tar_ix.append(tar_count)

    # Gap characters are not in `letters`, so they do not advance the counters.
    if ref_res.lower() in letters: ref_count += 1
    if tar_res.lower() in letters: tar_count += 1

# Reference structure: one row per C-alpha atom, renumbered 1..N so that the
# residue numbers line up with the counters used in the alignment walk above.
pdb = md.load('compare_structures/2f21.pdb')
pdb = pdb.top.to_dataframe()[0]
# .copy() makes the filtered frame independent, so the renumbering below is a
# plain column assignment rather than a write onto a slice of the full table.
pdb = pdb.loc[pdb.name=='CA', :].copy()
pdb['resSeq'] = np.arange(pdb.shape[0])+1
# The rows of to_dataframe() follow the topology's atom order, so the row label
# is the 0-based atom index. `serial - 1` only equals it when the PDB serial
# numbers are contiguous and start at 1, which is not guaranteed.
wwd_ref_atom_ix = pdb.index[pdb.resSeq.isin(ref_ix)].to_numpy()

# Simulated system: C-alpha atoms selected by the residue numbers stored in the file.
# NOTE(review): this relies on ww_domain.pdb numbering its residues from 1 -- confirm.
wwd = md.load('compare_structures/ww_domain.pdb')
wwd = wwd.top.to_dataframe()[0]
wwd = wwd.loc[wwd.name=='CA', :]
wwd_tar_atom_ix = wwd.index[wwd.resSeq.isin(tar_ix)].to_numpy()

# Both index arrays are used pairwise for superposition/RMSD, so they must pair up one-to-one.
if len(wwd_ref_atom_ix) != len(wwd_tar_atom_ix):
    raise ValueError(
        f"WW-domain atom selections differ in length: "
        f"{len(wwd_ref_atom_ix)} reference vs {len(wwd_tar_atom_ix)} target C-alpha atoms"
    )

# Comparison to crystal structure

In [5]:
# Directory holding the reference PDB files and the per-protein sampled-state trajectories.
directory = 'compare_structures'
# Root that is searched for each protein's topology PDB. The default is the
# original machine-specific location; set MD_TRAJ_DIR to point elsewhere.
traj_dir = Path(os.environ.get('MD_TRAJ_DIR', '/Volumes/REA/MD/12FF/strided/'))

# Scale factor applied to RMSD values (nanometres -> Angstrom, going by the name).
nm_to_ang = 10

def sample_states(protein, select_method, feature):
    """Load the sampled trajectories of every state of one model.

    Parameters
    ----------
    protein : str
        Protein label; used as the sub-directory of `directory` and as the key
        into `prot_dict` when searching for the topology.
    select_method : str
        Model-selection tag that prefixes the trajectory file names.
    feature : str
        Feature label embedded in the trajectory file names.

    Returns
    -------
    dict or None
        `{state_index: trajectory}` for states `0 .. n-1`, where `n` is the
        number of matching `*_state_*.xtc` files, or `None` when no file matches.

    Raises
    ------
    FileNotFoundError
        If state files exist but no topology PDB is found under `traj_dir`.

    Notes
    -----
    The number of states comes from a recursive search, but each file is then
    opened directly under `directory/protein` with a contiguous index, so the
    files are expected to sit in that folder and be numbered from 0.
    """
    pattern = f"{select_method}_model_{feature}_state_*.xtc"
    num_states = len(list(Path(directory).joinpath(protein).rglob(pattern)))
    if num_states == 0:
        return None

    # The topology is the same for every state, so find and load it once
    # instead of repeating the directory walk and the PDB parse per state.
    # `next` stops at the first match, i.e. the same element `list(...)[0]` gave.
    top_pattern = f"*{prot_dict[protein].upper()}*/*.pdb"
    top_path = next(traj_dir.rglob(top_pattern), None)
    if top_path is None:
        raise FileNotFoundError(
            f"no topology matching '{top_pattern}' found under {traj_dir}"
        )
    top = md.load(str(top_path))

    state_samples = {}
    for state in range(num_states):
        traj_path = Path(directory).joinpath(protein, f"{select_method}_model_{feature}_state_{state}.xtc")
        state_samples[state] = md.load(str(traj_path), top=top)

    return state_samples


def get_reference(protein, state_samples, ref):
    """Resolve the reference structure and the atom indices to compare on.

    When `ref` is None the protein's entry in `prot_pdbs` is loaded from
    `directory` and used as the reference. For 'WW-domain' with that loaded
    reference, the precomputed `wwd_ref_atom_ix` / `wwd_tar_atom_ix` arrays are
    returned; in every other case the atoms named CA are selected from the
    reference topology and from the topology of `state_samples[0]`.

    Returns
    -------
    tuple
        `(reference, reference_atom_indices, target_atom_indices)`.
    """
    use_xtal = ref is None
    reference = md.load(f"{directory}/{prot_pdbs[protein]}") if use_xtal else ref

    # Special case: precomputed atom index arrays are used instead of a CA selection.
    if use_xtal and protein == 'WW-domain':
        return reference, wwd_ref_atom_ix, wwd_tar_atom_ix

    ca_query = 'name == CA'
    reference_atoms = reference.top.select(ca_query)
    target_atoms = state_samples[0].top.select(ca_query)
    return reference, reference_atoms, target_atoms


def align_structures(protein, state_samples, ref=None):
    """Superpose every state's trajectory onto frame 0 of the reference.

    The reference and the two atom-index sets come from `get_reference`.
    `superpose` is called on each trajectory in `state_samples`, and the same
    dict is handed back.
    """
    reference, reference_atoms, sample_atoms = get_reference(protein, state_samples, ref)

    for trajectory in state_samples.values():
        trajectory.superpose(
            reference,
            frame=0,
            atom_indices=sample_atoms,
            ref_atom_indices=reference_atoms,
        )
    return state_samples
        
def rmsd_samples(protein, state_samples, ref=None):
    """Compute, per state, the RMSD of each sample to frame 0 of the reference.

    The reference and atom-index sets come from `get_reference`; each
    `md.rmsd` result is multiplied by `nm_to_ang`.

    Returns
    -------
    dict
        `{state_index: scaled RMSD values}` with the same keys as `state_samples`.
    """
    reference, reference_atoms, sample_atoms = get_reference(protein, state_samples, ref)
    return {
        state: md.rmsd(
            trajectory,
            reference,
            frame=0,
            atom_indices=sample_atoms,
            ref_atom_indices=reference_atoms,
        ) * nm_to_ang
        for state, trajectory in state_samples.items()
    }
        

In [15]:
# Display names for the model-selection tags used in the trajectory file names.
method_dict = {'m1': 'Fixed k', 'm2': 'TS Gap', 'm3': 'Fixed k (worst)'}
features = ['dihed.', 'dist.', 'logit(dist.)']

# Make sure the output folder exists before any figure is written.
Path(f"{funcs.FIG_DIR}/model_comparisons/folded_state").mkdir(parents=True, exist_ok=True)

for protein in ['BBA', 'BBL', 'Chignolin', 'Trp-cage', 'Villin', 'WW-domain', 'Homeodomain']:
    all_dfs = []
    print(protein)
    for feature in features:
        for select_method in ['m1', 'm2', 'm3']:

            samples = sample_states(protein, select_method, feature)
            if samples is None:
                # No sampled states on disk for this (feature, method) pair.
                continue

            samples = align_structures(protein, samples)
            rmsd = rmsd_samples(protein, samples)
            # Build the long-form table one state at a time so that states with
            # different numbers of frames do not break DataFrame construction.
            df = pd.concat(
                [pd.DataFrame({'State': state, 'RMSD': values}) for state, values in rmsd.items()],
                ignore_index=True,
            )
            df['Protein'] = protein
            df['Feature'] = feature
            df['Method'] = method_dict[select_method]
            all_dfs.append(df)

    if not all_dfs:
        # pd.concat raises on an empty list; there is nothing to plot for this protein.
        print(f"  no sampled states found for {protein}; skipping figure")
        continue

    # ignore_index gives unique row labels instead of a repeated 0..n-1 per chunk.
    df = pd.concat(all_dfs, ignore_index=True)
    with sns.plotting_context('paper', font_scale=1.25):
        g = sns.displot(data=df, col='Feature', col_order=features,
                        row='State', hue='Method', x='RMSD', kind='kde', lw=2)

        # Every panel starts at 0 and shares the upper limit of the first panel.
        xlim = g.axes[0,0].get_xlim()
        for i, ax in enumerate(g.axes.flatten()):
            ax.set_xlim(0, xlim[1])
            ax.xaxis.set_major_locator(mpl.ticker.MultipleLocator(1))
            ax.grid()
            # Panel label "(a)", "(b)", ... in the top-left corner.
            ax.annotate(text=f"({funcs.LETTERS[i]})", xy=(0.01, 0.99), xycoords='axes fraction', ha='left', va='top')

        # Save and close this grid's own figure rather than whichever figure is current.
        fig = g.axes[0,0].figure
        fig.savefig(f"{funcs.FIG_DIR}/model_comparisons/folded_state/{protein}.pdf", bbox_inches='tight')
        plt.close(fig)

BBA




BBL




Chignolin
Trp-cage




Villin
WW-domain
Homeodomain


