In [1]:
from Bio import AlignIO
from Bio.Align import MultipleSeqAlignment
from Bio.Seq import Seq
from Bio.PDB import PDBParser
from collections import Counter
from itertools import product
import os
import pandas as pd
import numpy as np
from Bio.PDB import PDBParser
import matplotlib.pyplot as plt

In [3]:
def max_freq_diff(i,j, stabilize, reference):
    """Find the residue pair most depleted in ``stabilize`` relative to ``reference``.

    For columns ``i`` and ``j`` the joint frequency of every observed
    (residue_i, residue_j) pair is computed in both alignments, each
    normalised by the number of sequences in that alignment. Every symbol
    found in the columns (gaps included) counts towards the denominator, but
    only pairs made of the 20 standard amino acids are candidates.

    Parameters
    ----------
    i, j : int
        0-based column indices used to index each sequence.
    stabilize, reference : iterable of indexable sequences
        Alignments to compare; each element must support ``seq[i]``.

    Returns
    -------
    tuple
        ``(pair, score)`` where ``score`` is the reference frequency minus the
        stabilize frequency of ``pair`` and is the largest such value. On ties
        the first pair in ``product`` order over 'ACDEFGHIKLMNPQRSTVWY' wins.
    """

    def pair_frequencies(alignment):
        # Joint frequency of the symbols observed at columns (i, j).
        observed = [(record[i], record[j]) for record in alignment]
        n_records = len(observed)
        return {pair: count / n_records for pair, count in Counter(observed).items()}

    stab_freqs = pair_frequencies(stabilize)
    ref_freqs = pair_frequencies(reference)

    residues = 'ACDEFGHIKLMNPQRSTVWY'
    # Frequencies lie in [0, 1], so -100 is below any reachable score.
    best_pair, best_score = None, -100
    for candidate in product(residues, repeat=2):
        # Sign is flipped on purpose: a positive score means the pair is
        # rarer in `stabilize` than in `reference`.
        score = -(stab_freqs.get(candidate, 0) - ref_freqs.get(candidate, 0))
        if score > best_score:
            best_pair, best_score = candidate, score

    return best_pair, best_score

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter

def calculate_frequencies(i, msa):
    """Return the frequency of each standard amino acid in column ``i`` of ``msa``.

    Parameters
    ----------
    i : int
        Column index used to index every sequence.
    msa : iterable of indexable sequences
        Alignment; each element must support ``seq[i]``.

    Returns
    -------
    dict
        Maps each of the 20 letters in 'ACDEFGHIKLMNPQRSTVWY' to its count
        divided by the total number of symbols seen in the column. Other
        symbols (e.g. gaps) are counted in the total but get no entry, so the
        values need not sum to 1. An empty alignment yields 0 for every letter.
    """
    alphabet = "ACDEFGHIKLMNPQRSTVWY"  # Standard 20 amino acids
    column_counts = Counter(record[i] for record in msa)
    n_observed = sum(column_counts.values())

    frequencies = {}
    for residue in alphabet:
        if n_observed > 0:
            frequencies[residue] = column_counts[residue] / n_observed
        else:
            # Nothing observed: avoid dividing by zero.
            frequencies[residue] = 0
    return frequencies

def plot_amino_acid_frequencies(freq_dict):
    """Draw a bar chart of per-residue frequencies, hydrophobic residues first.

    Parameters
    ----------
    freq_dict : mapping
        Residue letter -> frequency. Letters missing from the mapping are
        plotted as 0; keys outside the 20 standard letters are ignored.

    The bars are ordered with the hydrophobic set (A, F, I, L, M, V, W, Y)
    first and the remaining residues after, each group alphabetical. The
    figure is shown with ``plt.show()``; nothing is returned. The title text
    is fixed and does not include an actual position number.
    """
    hydrophobic = {'A', 'V', 'I', 'L', 'M', 'F', 'Y', 'W'}  # Common hydrophobic amino acids
    alphabet = "ACDEFGHIKLMNPQRSTVWY"  # Standard 20 amino acids

    # Hydrophobic block first, then everything else; alphabetical within each.
    hydrophobic_block = sorted(aa for aa in alphabet if aa in hydrophobic)
    other_block = sorted(aa for aa in alphabet if aa not in hydrophobic)
    bar_labels = hydrophobic_block + other_block

    # Residues absent from the input are drawn with zero height.
    bar_heights = [freq_dict.get(aa, 0) for aa in bar_labels]

    sns.set_style("whitegrid")
    plt.figure(figsize=(10, 5))
    sns.barplot(x=bar_labels, y=bar_heights, palette="muted")
    plt.xlabel("Amino Acid")
    plt.ylabel("Frequency")
    plt.title("Amino Acid Frequencies at Position i")
    plt.show()

# Example usage: a toy "alignment" of 24 one-residue sequences, so column 0 is
# the only position available.
example_sequence = ['A', 'V', 'I', 'L', 'M', 'F', 'Y', 'W', 'G', 'P', 'S', 'T', 'C', 'N', 'Q', 'D', 'E', 'H', 'K', 'R', 'A', 'A', 'L', 'V']
# calculate_frequencies takes the column index first, then the alignment;
# calling it with the alignment alone raises TypeError.
freqs = calculate_frequencies(0, example_sequence)
plot_amino_acid_frequencies(freqs)

In [4]:
def calculate_contacts_dengue(chain1, chain2=None, threshold=6.0):
    """Find residue pairs whose C-alpha atoms are closer than ``threshold``.

    Parameters
    ----------
    chain1 : iterable of residues
        Residues must expose ``id`` (residue number at ``id[1]``),
        ``has_id("CA")`` and ``["CA"]``, as Bio.PDB residues do.
    chain2 : iterable of residues, optional
        When given, contacts are searched between ``chain1`` and ``chain2``.
        When omitted, contacts are searched within ``chain1``.
    threshold : float
        Strict upper bound on the CA-CA distance (same units as the atom
        coordinates; presumably angstroms for PDB input — confirm).

    Returns
    -------
    list of tuple
        Unique ``(a, b)`` pairs in discovery order. ``a`` and ``b`` are NOT
        raw residue numbers: they are 0-based ranks of the residue numbers in
        the sorted set of all CA-bearing residue numbers from the chain(s).

    Notes
    -----
    Only pairs where the ``chain1`` residue number is strictly smaller than
    the partner's are reported. In two-chain mode this drops same-numbered
    pairs and the mirrored half of the interface (presumably relying on
    homodimer symmetry — confirm with the caller's intent). Residues that
    share a number (e.g. differing only by insertion code) collapse onto the
    same rank.
    """
    # Compare against None explicitly: Bio.PDB chains define __len__, so an
    # empty chain would be falsy and silently fall back to chain1.
    inter_chain = chain2 is not None
    partner_residues = chain2 if inter_chain else chain1

    # Rank every residue number that carries a CA atom, across both chains.
    ca_numbers = {res.id[1] for res in chain1 if res.has_id("CA")}
    if inter_chain:
        ca_numbers.update(res.id[1] for res in chain2 if res.has_id("CA"))
    residue_map = {res_id: rank for rank, res_id in enumerate(sorted(ca_numbers))}

    contacts = []
    seen = set()  # constant-time duplicate check; `contacts` keeps the order
    for res1 in chain1:
        if not res1.has_id("CA"):
            continue
        atom1 = res1["CA"]

        for res2 in partner_residues:
            # Keep only ordered pairs with a usable CA on the partner. The
            # former `chain2 is None` clause here skipped every pair in
            # single-chain mode, so that mode always returned [].
            if res1.id[1] >= res2.id[1] or not res2.has_id("CA"):
                continue
            # Subtracting two Bio.PDB atoms yields their distance.
            distance = atom1 - res2["CA"]
            if distance < threshold:
                pair = (residue_map[res1.id[1]], residue_map[res2.id[1]])
                if pair not in seen:
                    seen.add(pair)
                    contacts.append(pair)

    return contacts

def dimer_pdb(pdb_file):
    """Return CA-CA contacts between the first two chains of a PDB file.

    Parameters
    ----------
    pdb_file : str or file-like
        Passed to ``PDBParser.get_structure``.

    Returns
    -------
    list of tuple
        Whatever ``calculate_contacts_dengue`` returns for the first and
        second chain of model 0, using its default distance threshold.

    Raises
    ------
    ValueError
        If model 0 holds fewer than two chains.
    """
    structure = PDBParser(QUIET=True).get_structure("dimer", pdb_file)

    # Only the first model is inspected; any chains beyond the second are ignored.
    first_model = structure[0]
    chains = list(first_model.get_chains())
    if len(chains) < 2:
        raise ValueError("The PDB file must contain at least two chains.")

    first_chain, second_chain = chains[0], chains[1]
    return calculate_contacts_dengue(first_chain, second_chain)

In [5]:
def main_dengue(prot):
    """Rank dimer-interface residue pairs by pair-frequency difference.

    Reads the full alignment and one cluster alignment for ``prot``, takes the
    inter-chain contact pairs of the dimer structure, and for every contact
    pair calls ``max_freq_diff`` with the cluster alignment as ``stabilize``
    and the full alignment as ``reference``.

    Parameters
    ----------
    prot : str
        Name of the protein sub-directory under the output folders.

    Returns
    -------
    tuple
        ``(df_sorted, pdb_inter, L)``: a DataFrame with columns ``pair_id``,
        ``query``, ``max_diff_pair`` and ``max_diff`` sorted by ``max_diff``
        descending; the contact pairs from ``dimer_pdb``; and the length of
        the first sequence of the full alignment.

    Raises
    ------
    FileNotFoundError
        If ``output_files/<prot>`` holds no entry without "full" in its name.
    """
    msa_full = AlignIO.read('../output_files/'+prot+'/sequence_files/full_MSA.fasta', "fasta")
    query = msa_full[0].seq  # the first record is used as the query sequence
    L = len(query)

    # NOTE(review): score files are listed under 'output_files/' (relative to
    # the working directory) while alignments are read from '../output_files/'.
    # Confirm both locations are intended.
    # os.listdir returns entries in arbitrary order, so sort before taking the
    # first match to make the choice reproducible across runs and machines.
    files = sorted(os.listdir('output_files/'+prot))
    cluster_score_files = [f for f in files if "full" not in f]
    if not cluster_score_files:
        raise FileNotFoundError(
            "No cluster score file (name without 'full') found in output_files/" + prot
        )
    clusters_dca = cluster_score_files[0]

    # The cluster alignment shares its base name with the score file.
    name = clusters_dca.replace('PLMDCA_apc_fn_scores_','').replace('.txt','')
    msa_cluster = AlignIO.read('../output_files/'+prot+'/clusters/'+name+'.fasta', 'fasta')

    # NOTE(review): the structure path is fixed to Dengue2_Envelope and does
    # not follow `prot`.
    pdb_file = "../output_files/Dengue2_Envelope/dimeric.pdb"
    pdb_inter = dimer_pdb(pdb_file)

    # NOTE(review): pdb_inter holds the rank indices produced by
    # calculate_contacts_dengue; the i-1 / j-1 shift below treats them as
    # 1-based alignment columns — TODO confirm against the PDB numbering.
    rows = []
    for i, j in pdb_inter:
        max_diff_pair, max_diff = max_freq_diff(i-1, j-1, msa_cluster, msa_full)
        rows.append([(i, j), (query[i-1], query[j-1]), max_diff_pair, max_diff])

    df_sorted = pd.DataFrame(
        rows, columns=['pair_id','query','max_diff_pair', 'max_diff']
    ).sort_values(by='max_diff', ascending=False)

    return df_sorted, pdb_inter, L

In [5]:
# Protein sub-directory name that main_dengue appends to its output_files paths.
prot = "Dengue2_Envelope"

# Returns the ranked pair table, the inter-chain contact pairs and the
# length of the first sequence in the full alignment.
df_sorted, pdb_inter, L = main_dengue(prot)
#dca = [(i, j) for i, j, k in dca_full]

In [23]:
# Contact pairs ordered by max_diff, largest first (sorted inside main_dengue).
df_sorted

Unnamed: 0,pair_id,query,max_diff_pair,max_diff
1,"(107, 314)","(L, E)","(L, E)",0.273039
2,"(108, 315)","(F, T)","(F, S)",0.188235
5,"(256, 257)","(Q, E)","(Q, T)",0.168627
4,"(255, 257)","(S, E)","(D, T)",0.164706
3,"(254, 257)","(G, E)","(G, T)",0.152941
6,"(257, 258)","(E, G)","(T, G)",0.145098
7,"(258, 261)","(G, H)","(G, L)",0.141176
0,"(106, 312)","(G, I)","(G, V)",0.109804
