In [1]:
# read in the csv of DCA couplings
# read in the MSA
# for each seq in the MSA, write one csv row per coupling with the columns:
# index, seqid, MSA_i, MSA_j, seq_i, seq_j, score

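# example: the aligned (gapped) row below maps to the ungapped sequence under it
#a--b-c--de
#abcde
# seq_i = MSA_i - num_gaps_before   (e.g. 'b': MSA_i = 3, 2 gaps before -> seq_i = 1, 0-based)
# MSA_i = seq_i + num_gaps_before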

In [2]:
from Bio import AlignIO

In [3]:
# record: MSA record
# idx: idx in MSA
# couplings: list([i, j, score])
# mapping_file: open, writable file object that the mapped csv rows are written to
def dca_to_msa_single_seq(record, idx, couplings, mapping_file):
    """Translate alignment-column couplings into ungapped positions for one sequence.

    One CSV line ``idx,record.id,MSA_i,MSA_j,seq_i,seq_j,score`` is written to
    ``mapping_file`` for every coupling whose two columns both hold a residue
    (anything other than ``'-'``) in this record. Couplings touching a gap
    column are skipped.

    Args:
        record: object exposing ``.seq`` (converted with ``str``) and ``.id``.
        idx: value written as the first CSV field.
        couplings: iterable of indexable items; elements 0 and 1 must be
            convertible to ``int`` (alignment columns), element 2 is the score
            and is written verbatim.
        mapping_file: object with a ``write(str)`` method.

    Raises:
        Exception: if a computed ungapped index is negative (reachable when a
            negative column index is supplied, since Python wraps the lookup).

    Column indices are used directly as 0-based positions in the aligned string.
    NOTE(review): confirm the DCA output is 0-based; 1-based indices would shift
    every mapped position by one.
    """
    gap_marker = -1

    # gaps_before[k] is the number of gaps preceding column k, or gap_marker
    # when column k itself is a gap in this record.
    gaps_seen = 0
    gaps_before = []
    for symbol in str(record.seq):
        if symbol != '-':
            gaps_before.append(gaps_seen)
        else:
            gaps_seen += 1
            gaps_before.append(gap_marker)

    for pair in couplings:
        MSA_i, MSA_j = int(pair[0]), int(pair[1])
        score = pair[2]

        # Look up column i first and bail out before touching column j, so a
        # gap at i never triggers a lookup of j.
        offset_i = gaps_before[MSA_i]
        if offset_i == gap_marker:
            continue
        offset_j = gaps_before[MSA_j]
        if offset_j == gap_marker:
            continue

        seq_i = MSA_i - offset_i
        seq_j = MSA_j - offset_j

        if min(seq_i, seq_j) < 0:
            raise Exception("something is very wrong: MSA_i: {}, MSA_j: {}, numgaps_i: {}, numgaps_j: {}".format(MSA_i, MSA_j, offset_i, offset_j))

        fields = (idx, record.id, MSA_i, MSA_j, seq_i, seq_j, score)
        mapping_file.write("{},{},{},{},{},{},{}\n".format(*fields))

In [4]:
# msa: MSA
# dca_output_file: file with dca scores
# mapped_output_file: file with mapped indices
# length: number of sequences to compute mapping on
# top_L: boolean; if True, keep only the first L couplings, where L = len(msa[0])
def dca_to_msa(msa, dca_output_file, mapped_output_file, length, top_L=False):
    """Map DCA couplings onto the first ``length`` sequences of an alignment.

    Args:
        msa: indexable collection of alignment records; ``msa[0]`` must support
            ``len`` when ``top_L`` is set.
        dca_output_file: iterable of text lines holding the couplings. Fields may
            be separated by commas and/or whitespace.
        mapped_output_file: writable file object handed to
            ``dca_to_msa_single_seq`` for every processed sequence.
        length: number of leading records of ``msa`` to process.
        top_L: when true, keep only the first ``len(msa[0])`` parsed couplings.
            # assumes the input is already ordered by score -- TODO confirm

    Prints the number of couplings that will be mapped.
    """
    # Only lines whose very first character is a digit are data rows; this is
    # how the header line (and anything else non-numeric) is skipped.
    rows = [
        raw_line.replace(',', ' ').split()
        for raw_line in dca_output_file
        if raw_line[0].isdigit()
    ]

    if top_L:
        alignment_width = len(msa[0])
        rows = rows[:alignment_width]

    print(len(rows))

    for seq_index in range(length):
        dca_to_msa_single_seq(msa[seq_index], seq_index, rows, mapped_output_file)

## PLM-DCA

In [None]:
# Map the top-L PLM-DCA couplings onto every sequence of the cadherin (PF00028) MSA.
cadherin_msa_path = "../results/cadherin/PF00028_10000_msa_trimmed.faa"
cadherin_csv_path = "../results/cadherin/plmdca_cadherin_output.csv"
cadherin_all_mapped_csv_path = "../results/cadherin/PF00028_all_plmdca_mapped.csv"

cadherin_msa = AlignIO.read(cadherin_msa_path, 'fasta')

# Open both files in a `with` block so they are closed (and the output CSV is
# fully flushed to disk) as soon as the mapping finishes, even if it raises.
# Without this the handles stay open for the lifetime of the kernel.
with open(cadherin_csv_path, 'r') as cadherin_csv_file, \
        open(cadherin_all_mapped_csv_path, 'w') as cadherin_all_mapped_csv_file:
    dca_to_msa(cadherin_msa, cadherin_csv_file, cadherin_all_mapped_csv_file, len(cadherin_msa), top_L=True)

377


In [9]:
# Map every PLM-DCA coupling onto the first sequence only (length=1, no top-L cut).
# NOTE(review): the paths point at the RRM (PF00076) results, but the variables
# keep the `cadherin_` prefix and rebind the names used by the cadherin cell;
# the names are left unchanged here in case other cells read them.
# NOTE(review): the two input paths are absolute while the output path is
# relative -- presumably they resolve to the same `results/rrm` directory; confirm
# and consider making them relative for portability.
cadherin_msa_path = "/nfshomes/vla/cmsc702-protein-lm/results/rrm/PF00076_10000_msa_trimmed.faa"
cadherin_csv_path = "/nfshomes/vla/cmsc702-protein-lm/results/rrm/plmdca_rrm_output.csv"
cadherin_single_mapped_csv_path = "../results/rrm/rrm_single_plmdca_mapped.csv"

cadherin_msa = AlignIO.read(cadherin_msa_path, 'fasta')

# Open both files in a `with` block so they are closed (and the output CSV is
# fully flushed to disk) as soon as the mapping finishes, even if it raises.
with open(cadherin_csv_path, 'r') as cadherin_csv_file, \
        open(cadherin_single_mapped_csv_path, 'w') as cadherin_single_mapped_csv_file:
    dca_to_msa(cadherin_msa, cadherin_csv_file, cadherin_single_mapped_csv_file, 1)

197136
