## Imports

In [None]:
import numpy as np
import os
import pandas as pd
import matplotlib.pyplot as plt

In [29]:
# create dataframe of ranked pairs from ESM contact matrix
def df_from_esm_contact_matrix(contact_matrix):
    """Build a ranked table of residue pairs from an ESM contact matrix.

    Only the strict upper triangle (i < j) is read, so each unordered pair
    appears exactly once and the main diagonal is excluded.

    Parameters
    ----------
    contact_matrix : 2-D array-like
        Symmetric matrix whose entry ``[i, j]`` is the score of pair (i, j).

    Returns
    -------
    pandas.DataFrame
        Columns ``i``, ``j`` and ``prob``, sorted by ``prob`` in descending
        order. The index is NOT reset: it holds each pair's position in
        row-major upper-triangle order. When the matrix has no off-diagonal
        entries the frame is empty but still has the three columns.

    Raises
    ------
    AssertionError
        If any upper-triangle entry differs from its mirrored entry.
    """
    matrix = np.asarray(contact_matrix)
    n_rows, n_cols = matrix.shape

    # Indices of the strict upper triangle, in the same row-major order the
    # nested i/j loops would visit them.
    rows, cols = np.triu_indices(n_rows, k=1, m=n_cols)
    upper = matrix[rows, cols]

    # SANITY check: the matrix should be symmetric (exact equality, so NaNs
    # also fail the check).
    assert np.array_equal(upper, matrix[cols, rows]), "contact matrix is not symmetric"

    # Building from named columns keeps 'prob' present even when there are no
    # pairs, so the sort below cannot fail with a KeyError on tiny matrices.
    pairs = pd.DataFrame({"i": rows, "j": cols, "prob": upper})
    return pairs.sort_values(by='prob', ascending=False)

## Load data

### Load ESM data

In [None]:
esm_results_path = os.path.join('..', 'results/cadherin/esmfold/results.zip')
esm_results = np.load(esm_results_path)
job_name = 'cadherin'

# number of sequences in the Cadherin MSA
num_seqs = 55

# One ranked-pair DataFrame per MSA sequence, in sequence order.
esm_dfs = []
for i in range(num_seqs):
    # index into the NpzFile object with the per-sequence file name
    contact_matrix = esm_results[f'{i}_{job_name}']
    df = df_from_esm_contact_matrix(contact_matrix)
    # Sort and renumber THIS sequence's frame so the index is the pair's rank.
    # (Previously a second loop re-sorted the leftover `df` from the last
    # iteration, overwriting every entry with the final sequence's table.)
    esm_dfs.append(df.sort_values(by="prob", ascending=False).reset_index(drop=True))

### Load True Contacts

In [None]:
# Read the real-contacts CSV for PF00028 and expose it as an integer DataFrame.
real_contacts_path = os.path.join('..', 'data', 'cadherin', 'PF00028_real_contacts.csv')
real_contacts = np.genfromtxt(fname=real_contacts_path, delimiter=',')
real_contacts_df = pd.DataFrame(data=real_contacts).astype(int)

### Load DCA data

In [32]:
def dca_df_from_results(results):
    """Turn a raw DCA results array into a ranked (i, j, prob) table.

    The first row and first column of ``results`` are discarded (the loading
    code treats them as the header row and an empty column). Of the remaining
    columns, the first two are cast to integer residue indices ``i`` and ``j``
    and the third is kept as the score ``prob``.

    Parameters
    ----------
    results : 2-D array-like
        Array as returned by ``np.genfromtxt`` on a DCA results CSV.

    Returns
    -------
    pandas.DataFrame
        Columns ``i``, ``j``, ``prob``, sorted by ``prob`` in descending order
        with a fresh 0..n-1 index (so the index is the pair's rank).
    """
    table = pd.DataFrame(results).iloc[1:, 1:]
    # Build a new frame rather than adding columns onto the slice, which
    # avoids pandas' chained-assignment warning.
    ranked = pd.DataFrame({
        "i": table.iloc[:, 0].astype(int),
        "j": table.iloc[:, 1].astype(int),
        "prob": table.iloc[:, 2],
    })
    return ranked.sort_values(by="prob", ascending=False).reset_index(drop=True)


# Load mfDCA (mean-field DCA) results
mfdca_results_path = os.path.join('..', 'data', 'cadherin', 'PF00028_mfdca.csv')
mfdca_results = np.genfromtxt(mfdca_results_path, delimiter=',')
mfdca_df = dca_df_from_results(mfdca_results)

# Load PLM DCA results TODO: CURRENTLY A STAND-IN (points at the mfDCA file)
plmdca_results_path = os.path.join('..', 'data', 'cadherin', 'PF00028_mfdca.csv')
# Read from the PLM path (not the mfDCA path) so that swapping in the real
# file only requires changing plmdca_results_path.
plmdca_results = np.genfromtxt(plmdca_results_path, delimiter=',')
plmdca_df = dca_df_from_results(plmdca_results)

## Process Data

In [None]:
# L is the number of top-ranked pairs kept for the precision@L calculations.
# TODO: replace 90 with whatever L (length of the true structure file protein)
# should actually be.
#
# NOTE: these indices are relative to the sequence, not MSA
# NOTE: only include unique pairs if i,j exists then j,i does not and i != j
L = 90
top_L_esm = esm_dfs[0].head(L)
top_L_mf = mfdca_df.head(L)
top_L_plm = plmdca_df.head(L)
real_contacts_df

In [47]:
# Display the first L rows of the ranked mfDCA table for inspection.
top_L_mf

Unnamed: 0,i,j,prob
0,45,48,0.299342
1,45,47,0.277358
2,112,113,0.243650
3,47,48,0.235820
4,46,47,0.216699
...,...,...,...
85,70,122,0.085898
86,4,54,0.085510
87,21,59,0.084957
88,14,117,0.084368
