In [2]:
import evoVAE.utils.seq_tools as st
import evoVAE.utils.statistics as stats
import pandas as pd


# Remove the pandas row-display limit (None = no limit) so DataFrames shown
# in this notebook are rendered in full rather than truncated.
pd.set_option("display.max_rows", None)

In [3]:
# Read the small test alignment from disk and display what the reader returned.
# The recorded output below shows one row per sequence with `id`, `sequence`
# and `encoding` columns.
test_aln = st.read_aln_file("../data/pair_test.aln")
test_aln


Reading the alignment: ../data/pair_test.aln
Checking for bad characters: ['B', 'J', 'X', 'Z']
Performing one hot encoding
Number of seqs: 8


Unnamed: 0,id,sequence,encoding
0,seq_1,LTRAALYEDC,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
1,seq_2,LTRATLYEDC,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
2,seq_3,LTRCTLPEDC,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
3,seq_4,LRRATLPDDC,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
4,seq_5,LRRATLPDDA,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
5,seq_6,LVRATKPWDA,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
6,seq_7,LVRATLPWDA,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
7,seq_8,LVRATLPWDA,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."


In [4]:
# Position frequency matrix of the test alignment, then inspect its first column.
# NOTE(review): presumably rows index residues and columns index alignment
# positions (the recorded output has 21 entries that sum to the 8 sequences) - confirm.
pfm = stats.calc_position_freq_matrix(test_aln)
pfm[:, 0]

Sequence weight numpy array created with shape (num_seqs, columns):  (8, 10)


array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 8.,
       0., 0., 0., 0.])

In [5]:
# Position probability matrix of the test alignment, then inspect its first column.
# NOTE(review): presumably the frequency matrix normalised per alignment
# position (the recorded output for column 0 sums to 1) - confirm.
ppm = stats.calc_position_prob_matrix(test_aln)
ppm[:, 0]

Sequence weight numpy array created with shape (num_seqs, columns):  (8, 10)


array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.,
       0., 0., 0., 0.])

In [6]:
# Shannon entropy of the test alignment as computed by stats.calc_shannon_entropy.
# The result is stored in H but not displayed in this cell.
H = stats.calc_shannon_entropy(test_aln)


Sequence weight numpy array created with shape (num_seqs, columns):  (8, 10)


In [193]:


from evoVAE.utils.seq_tools import AA_TO_IDX, GAPPY_ALPHABET_LEN
import numpy as np

def pair_wise_covariances(msa, alphabet_len=None):
    """Compute residue-pair covariances for every pair of MSA columns.

    For each column pair ``(i, j)`` with ``i < j`` and each ordered residue
    pair ``(a, b)`` the score is::

        f_ij(a, b) - f_i(a) * f_j(b)

    where ``f_ij(a, b)`` is the fraction of sequences that have residue ``a``
    in column ``i`` and residue ``b`` in column ``j``, and ``f_i`` / ``f_j``
    are the single-column residue fractions. Every sequence counts equally.

    Parameters
    ----------
    msa : array-like, shape (num_seqs, num_columns)
        Alignment whose entries are compared against the residue indices
        ``0 .. alphabet_len - 1``. Entries outside that range match no residue
        and therefore contribute to no frequency.
    alphabet_len : int, optional
        Number of residue indices to consider. Defaults to
        ``GAPPY_ALPHABET_LEN``.

    Returns
    -------
    numpy.ndarray
        1-D float array of length
        ``num_columns * (num_columns - 1) // 2 * alphabet_len ** 2``.
        It is the upper triangle of the column-by-column matrix stored
        linearly: column pairs are enumerated row-major
        ((0, 1), (0, 2), ..., (1, 2), ...), and each pair owns a contiguous
        block of ``alphabet_len ** 2`` scores laid out as
        ``a * alphabet_len + b``. To find one score directly, with
        ``n = num_columns``::

            col_pair = n*(n-1)//2 - (n-i)*(n-i-1)//2 + j - i - 1
            index    = col_pair * alphabet_len**2 + a * alphabet_len + b

    Raises
    ------
    ValueError
        If ``msa`` is not two-dimensional or contains no sequences (the
        frequencies would be undefined).
    """
    if alphabet_len is None:
        alphabet_len = GAPPY_ALPHABET_LEN

    msa = np.asarray(msa)
    if msa.ndim != 2:
        raise ValueError(
            "msa must be 2-dimensional (num_seqs, num_columns), got shape %s"
            % (msa.shape,)
        )

    num_seqs, num_columns = msa.shape
    if num_seqs == 0:
        raise ValueError("msa must contain at least one sequence")

    # the number of unique ways we can compare columns in the MSA
    column_combinations = num_columns * (num_columns - 1) // 2
    # number of different (ordered) residue combinations we can have
    aa_combinations = alphabet_len ** 2

    covariances = np.zeros(column_combinations * aa_combinations)

    # one_hot[s, c, r] is True when sequence s has residue r in column c.
    # Stored as bool to keep the footprint small; cast per column when used.
    one_hot = msa[:, :, None] == np.arange(alphabet_len)
    # site_freqs[c, r]: fraction of sequences with residue r in column c.
    site_freqs = one_hot.sum(axis=0) / num_seqs

    # keep track of which column combination we're up to
    col_combination_count = 0
    for i in range(num_columns - 1):
        # (alphabet, seqs) indicator matrix for column i, reused for every j
        col_i = one_hot[:, i, :].astype(float).T
        for j in range(i + 1, num_columns):
            col_j = one_hot[:, j, :].astype(float)

            # joint[a, b]: fraction of sequences with a at column i AND b at column j
            joint = (col_i @ col_j) / num_seqs

            # (joint occurrence of a & b) - (occurrence of a at i * occurrence of b at j)
            block = joint - np.outer(site_freqs[i], site_freqs[j])

            # row-major ravel places (a, b) at offset a * alphabet_len + b
            start = col_combination_count * aa_combinations
            covariances[start:start + aa_combinations] = block.ravel()

            # keep track of how many column combinations we've seen
            col_combination_count += 1

    return covariances

# Convert the alignment into its array form; the helper returns three values
# and only the first is used here.
msa, _, _ = st.convert_msa_numpy_array(test_aln)
# NOTE(review): despite its name, `pairs` holds the flat covariance vector
# returned by pair_wise_covariances, not a list of pairs.
pairs = pair_wise_covariances(msa)


#pairs.shape

Sequence weight numpy array created with shape (num_seqs, columns):  (8, 10)


In [195]:
# Look up one covariance score in the flat vector produced by
# pair_wise_covariances, given a column pair and a residue pair.
col_1_idx = 0
col_2_idx = 1
aa_combinations = st.GAPPY_ALPHABET_LEN ** 2
num_cols = msa.shape[1]

# Only the upper triangle is stored, so the first column must come before the second.
assert 0 <= col_1_idx < col_2_idx < num_cols, "expected 0 <= col_1_idx < col_2_idx < num_cols"

a_idx = st.AA_TO_IDX['L']
b_idx = st.AA_TO_IDX['T']

# Rank of (col_1_idx, col_2_idx) in the row-major enumeration of column pairs:
# (all pairs) - (pairs whose first column is >= col_1_idx) + (offset within that row).
# Integer floor division is exact here because k * (k - 1) is always even.
remaining_cols = num_cols - col_1_idx
col_combination_count = (
    num_cols * (num_cols - 1) // 2
    - remaining_cols * (remaining_cols - 1) // 2
    + col_2_idx - col_1_idx - 1
)
# Offset into that column pair's block of residue combinations.
covar_index = int(col_combination_count * aa_combinations + a_idx * st.GAPPY_ALPHABET_LEN + b_idx)
print(covar_index)
print(pairs[covar_index])

10
343
0.0


In [139]:
# Number of ordered residue pairs: the alphabet length squared. This is the
# size of the per-column-pair block used when indexing the covariance vector.
GAPPY_ALPHABET_LEN ** 2

441