In [203]:
from Bio import AlignIO
from Bio.Seq import Seq
from Bio.PDB import PDBParser
from utils.pdb import seq_from_structure
import requests
import re

In [None]:
# Input locations: the alignment file (FASTA) and the PDB entry that will be
# looked up inside it.
pdb_id = "3k6i"
msa_path = "../data/cadherin/PF00028_alignment_seed.fa"
pdb_filename = "../data/cadherin/CAD13_CHICK.3k6i.pdb.ent"

# Parse the alignment first, then the structure, so a missing or unreadable
# file fails in this cell rather than further down the notebook.
msa = AlignIO.read(msa_path, 'fasta')
structure_parser = PDBParser()
pdb_struct = structure_parser.get_structure(pdb_id, pdb_filename)

# Display both parsed objects as the cell output.
msa, pdb_struct

In [None]:
# Ask the PDBe mappings endpoint which UniProt entry this PDB id maps to.
# An explicit timeout keeps the kernel from hanging on a stalled connection,
# and raise_for_status() reports HTTP failures here instead of as a confusing
# JSON-decoding or KeyError a line later.
pdb_mapping_response = requests.get(
    'https://www.ebi.ac.uk/pdbe/api/mappings/uniprot/' + pdb_id,
    timeout=30,
)
pdb_mapping_response.raise_for_status()
pdb_uniprot_info = pdb_mapping_response.json()[pdb_id]['UniProt']

if not pdb_uniprot_info:
    raise ValueError("no UniProt mapping returned for PDB id {!r}".format(pdb_id))

# Use the first entry of the mapping (keys are presumably UniProt accessions;
# verify if a PDB entry with several mapped proteins is ever used here).
key_id = next(iter(pdb_uniprot_info))

pdb_name = pdb_uniprot_info[key_id]['name']
pdb_name

In [206]:
def find_seq_in_MSA(msa, target_name, start=-1, end=-1):
    """Find the MSA record that corresponds to ``target_name``.

    Record names are expected to look like ``<name>/<start>-<end>``, where the
    numbers are the residue range of the aligned sequence.

    Args:
        msa: iterable of records, each exposing a ``name`` attribute.
        target_name: value to match against the part of the name before '/'.
        start: first residue of the query range, or -1 to match on name only.
        end: last residue of the query range, or -1 to match on name only.

    Returns:
        The first record whose name matches and, when both ``start`` and
        ``end`` are given, whose range contains ``[start, end]``. Returns the
        empty string ``''`` when nothing matches.
    """
    match_on_name_only = start == -1 or end == -1

    for record in msa:
        name_parts = record.name.split('/')

        # Check the name first so that unrelated records with an unusual or
        # missing range suffix can never abort the search.
        if name_parts[0] != target_name:
            continue
        if match_on_name_only:
            return record

        # A range was requested: a record without a parsable "start-end"
        # suffix cannot be shown to cover it, so it is skipped.
        if len(name_parts) < 2:
            continue
        bounds = name_parts[1].split('-')
        try:
            rec_start = int(bounds[0])
            rec_end = int(bounds[1])
        except (IndexError, ValueError):
            continue

        if start >= rec_start and end <= rec_end:
            return record

    return ''

In [None]:
# reference uniprot range that we will crop MSA based on
uniprot_start = 143
uniprot_end = 236

gapped_str_record = find_seq_in_MSA(msa, pdb_name, start=uniprot_start, end=uniprot_end)

# find_seq_in_MSA reports "no match" with '' instead of a record; stop here
# with a readable message rather than an AttributeError on `.seq` below.
if not hasattr(gapped_str_record, 'seq'):
    raise ValueError(
        "no MSA record named {!r} covers residues {}-{}".format(
            pdb_name, uniprot_start, uniprot_end
        )
    )
gapped_str = str(gapped_str_record.seq)

# The record name carries the aligned residue range as "<name>/<start>-<end>";
# split it once and reuse the pieces for both bounds.
record_name_fields = re.split('/|-', gapped_str_record.name)
uniprot_ref_start = int(record_name_fields[1])
uniprot_ref_end = int(record_name_fields[2])

uniprot_ref_start, uniprot_ref_end

In [208]:
def num_gaps_before(str, i):
    """Count the '.' characters in ``str`` at positions 0 through ``i``, inclusive.

    NOTE(review): only '.' is counted as a gap; an alignment that writes gaps
    as '-' would not be handled by this helper.
    """
    prefix = str[:i + 1]
    return sum(1 for symbol in prefix if symbol == '.')

def find_matching_ungapped_idx(str, input_idx):
    """Return the column in the gapped row ``str`` holding residue ``input_idx``.

    ``input_idx`` is a 0-based position in the sequence with all '.' gap
    characters removed. The row is scanned once, counting non-gap characters,
    instead of re-counting the gaps of the whole prefix for every column.

    Args:
        str: gapped sequence (name kept for compatibility; it shadows the
            builtin inside this function only).
        input_idx: 0-based index into the ungapped sequence.

    Returns:
        The 0-based column of that residue, or -1 when ``input_idx`` is
        negative or the row has fewer than ``input_idx + 1`` residues.
    """
    # A negative index never names a residue. Without this guard, -1 would
    # have to be special-cased against rows that start with a gap.
    if input_idx < 0:
        return -1

    residues_seen = 0
    for column, symbol in enumerate(str):
        if symbol == '.':
            continue
        if residues_seen == input_idx:
            return column
        residues_seen += 1
    return -1

def get_msa_range(str, u_start, u_end, u_ref_start):
    """Translate a UniProt residue range into slice bounds on a gapped MSA row.

    Args:
        str: input sequence with gaps.
        u_start: UniProt start index of the range.
        u_end: UniProt end index of the range.
        u_ref_start: UniProt index of the first residue in ``str``.

    Returns:
        ``(first_col, stop_col)``: ``first_col`` is the column of ``u_start``
        and ``stop_col`` is one past the column of ``u_end``, so
        ``str[first_col:stop_col]`` spans both residues inclusively.

    Raises:
        ValueError: if the range starts before ``u_ref_start`` or either end
            cannot be located in ``str``. Without this check a failed lookup
            (-1) would be turned into bogus slice bounds.
    """
    # Offsets of the requested residues within the ungapped sequence.
    residue_offset_start = u_start - u_ref_start
    residue_offset_end = u_end - u_ref_start
    if residue_offset_start < 0:
        raise ValueError(
            "range start {} lies before the reference start {}".format(u_start, u_ref_start)
        )

    str_start = find_matching_ungapped_idx(str, residue_offset_start)
    str_end = find_matching_ungapped_idx(str, residue_offset_end)
    if str_start == -1 or str_end == -1:
        raise ValueError(
            "residue range {}-{} is not covered by the aligned sequence "
            "(reference start {})".format(u_start, u_end, u_ref_start)
        )

    # +1 converts the inclusive last column into an exclusive slice bound.
    return str_start, str_end + 1

In [None]:
# Convert the UniProt residue range into MSA column slice bounds; `i` and `j`
# are reused by the trimming cell below.
i, j = get_msa_range(gapped_str, uniprot_start, uniprot_end, uniprot_ref_start)

# Show the full aligned row, the coordinates in play, and the cropped result.
report_lines = (
    gapped_str,
    "start: {}, end: {}".format(uniprot_start, uniprot_end),
    "reference start: {}".format(uniprot_ref_start),
    "reference end: {}".format(uniprot_ref_end),
    "Trimmed seq: {}".format(gapped_str[i:j]),
)
for report_line in report_lines:
    print(report_line)

In [None]:
# Crop every row of the alignment to columns [i, j).
# Each record's sequence is replaced in place, so `msa` itself holds the
# trimmed alignment afterwards and can be written to file.
# NOTE: this cell is not idempotent. Running it again slices the already
# trimmed sequences a second time; reload `msa` before re-running.
column_window = slice(i, j)
for record in msa:
    full_row = str(record.seq)
    record.seq = Seq(full_row[column_window])

print(msa)