In [43]:
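# Plan:
#   1. Read in the CSV with the DCA couplings.
#   2. Read in the MSA.
#   3. For each sequence in the MSA, map every coupling from alignment columns
#      to positions in the ungapped sequence and write one row with the fields:
#      index, seqid, MSA_i, MSA_j, seq_i, seq_j, score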

# Example:
#   aligned (MSA) row:  a--b-c--de
#   ungapped sequence:  abcde
# seq_i = MSA_i - num_gaps_before
# MSA_i = seq_i + num_gaps_before

In [79]:
from Bio import AlignIO

In [82]:
def _is_number(text):
    """Return True if `text` parses as a float (used to detect a CSV header row)."""
    try:
        float(text)
    except ValueError:
        return False
    return True


def _gaps_before(aligned_seq):
    """Map each alignment column to the number of gaps seen before it.

    Columns that are themselves a gap ('-') are marked with -1 so callers can
    skip them. For a residue column `c`, `c - result[c]` is its 0-indexed
    position in the ungapped sequence.
    """
    # NOTE(review): only '-' is treated as a gap; some alignment formats also
    # use '.' -- confirm this matches the alignments being read.
    offsets = []
    gaps_seen = 0
    for residue in aligned_seq:
        if residue == '-':
            gaps_seen += 1
            offsets.append(-1)
        else:
            offsets.append(gaps_seen)
    return offsets


def dca_to_msa(msa, dca_output_file, mapped_output_file, type = 'PLM'):
    """Map DCA couplings from alignment columns to per-sequence positions.

    For every record in `msa` and every retained coupling, one CSV row is
    written to `mapped_output_file`:

        index, seqid, MSA_i, MSA_j, seq_i, seq_j, score

    `MSA_i`/`MSA_j` are written 0-indexed for both input types, and
    `seq_i`/`seq_j` are 0-indexed positions in the ungapped sequence.
    Couplings that touch a gap column of a record are skipped for that record.

    Parameters
    ----------
    msa : indexable iterable of records
        Each record must expose `.seq` and `.id`; `len(msa[0])` is used as the
        alignment length L.
    dca_output_file : iterable of str
        Open text file (or any iterable of lines) holding the couplings.
        Lines starting with '#' and blank lines are ignored, as is a leading
        non-numeric header row. The first three fields of each row are read as
        (site_i, site_j, score).
    mapped_output_file : writable text file
        Destination for the mapped rows.
    type : {'PLM', 'MF'}
        'PLM': whitespace-delimited fields with 1-indexed sites.
        'MF' : comma-delimited fields whose sites are taken as 0-indexed.

    Raises
    ------
    ValueError
        If `type` is not 'PLM' or 'MF', or a site index is not an integer.
    IndexError
        If a retained row has fewer than three fields, or a site index falls
        outside the alignment.
    """
    if type not in ('PLM', 'MF'):
        raise ValueError("type must be 'PLM' or 'MF', got {!r}".format(type))

    delimiter = None if type == 'PLM' else ','   # None -> split on whitespace
    index_offset = 1 if type == 'PLM' else 0     # convert sites to 0-indexed

    # Read the coupling rows. Stripping first removes the trailing newline,
    # which would otherwise stay glued to the last comma-separated field and
    # leak into the output as an extra blank line.
    rows = []
    for line in dca_output_file:
        line = line.strip()
        if not line or line.startswith('#'):
            continue
        rows.append(line.split(delimiter))

    # A CSV written with column names starts with a non-numeric header row.
    if rows and not _is_number(rows[0][0]):
        rows = rows[1:]

    # Only keep up to the top L contacts, where L = length of the alignment.
    # The file is assumed to be ordered by decreasing score (not checked).
    alignment_length = len(msa[0])
    rows = rows[:alignment_length]

    # Parse once instead of once per record.
    couplings = [
        (int(fields[0]) - index_offset, int(fields[1]) - index_offset, fields[2])
        for fields in rows
    ]

    # Reject out-of-range sites up front; a negative index would otherwise
    # silently wrap around to the end of the alignment.
    for col_i, col_j, _score in couplings:
        if not (0 <= col_i < alignment_length and 0 <= col_j < alignment_length):
            raise IndexError(
                "coupling ({}, {}) is outside the alignment of length {}".format(
                    col_i, col_j, alignment_length))

    for record_index, record in enumerate(msa):
        gaps_before = _gaps_before(str(record.seq))

        for col_i, col_j, score in couplings:
            # Skip couplings where either column is a gap in this sequence.
            if gaps_before[col_i] == -1 or gaps_before[col_j] == -1:
                continue

            # seq_i = MSA_i - (number of gaps before MSA_i)
            seq_i = col_i - gaps_before[col_i]
            seq_j = col_j - gaps_before[col_j]

            mapped_output_file.write(
                "{},{},{},{},{},{},{}\n".format(
                    record_index, record.id, col_i, col_j, seq_i, seq_j, score))

        

## PLM-DCA

In [83]:
# Map the PLM-DCA couplings for the cadherin family onto each aligned sequence.
cadherin_msa_path = "../results/cadherin/PF00028_10000_msa_trimmed.faa"
cadherin_csv_path = "../results/cadherin/plmdca_cadherin_output.csv"
cadherin_mapped_csv_path = "../results/cadherin/PF00028_plmdca_mapped.csv"

cadherin_msa = AlignIO.read(cadherin_msa_path, 'fasta')

# Use context managers so both files are closed (and the mapped CSV is fully
# flushed to disk) as soon as the mapping finishes, even if it raises.
with open(cadherin_csv_path, 'r') as cadherin_csv_file, \
        open(cadherin_mapped_csv_path, 'w') as cadherin_mapped_csv_file:
    dca_to_msa(cadherin_msa, cadherin_csv_file, cadherin_mapped_csv_file)

## MF-DCA

In [None]:
# Map the MF-DCA couplings for the cadherin seed alignment onto each sequence.
cadherin_msa_path = "../data/cadherin/PF00028.alignment.seed"
cadherin_csv_path = '../data/cadherin/PF00028_mfdca.csv'
# NOTE(review): "PF0028" below is missing a zero compared with "PF00028" used
# everywhere else -- probably a typo; left unchanged in case other code reads it.
cadherin_mapped_csv_path = '../data/cadherin/PF0028_mfdca_mapped.csv'

cadherin_msa = AlignIO.read(cadherin_msa_path, 'stockholm')

# Use context managers so both files are closed (and the mapped CSV is fully
# flushed to disk) as soon as the mapping finishes, even if it raises.
with open(cadherin_csv_path, 'r') as cadherin_csv_file, \
        open(cadherin_mapped_csv_path, 'w') as cadherin_mapped_csv_file:
    dca_to_msa(cadherin_msa, cadherin_csv_file, cadherin_mapped_csv_file, type = 'MF')

In [None]:
import pandas as pd

# Load the coupling table and force both site columns to integer dtype.
site_dtypes = {'first_site': int, 'second_site': int}
df = pd.read_csv(cadherin_csv_path).astype(site_dtypes)

In [None]:
# Display the DataFrame for visual inspection.
df

In [25]:
# Write the frame to cadherin_csv_path without the index column.
# NOTE: when df was read from this same path, this overwrites the input file in place.
df.to_csv(cadherin_csv_path, index=False)