In [1]:
from Bio import AlignIO
from Bio.Seq import Seq
from Bio.PDB import PDBParser

import sys
sys.path.append('..')
from utils.pdb import seq_from_structure
import requests
import re

In [3]:
# path to FULL msa stockholm file
# NOTE(review): this is an absolute, user-specific path; it has to be edited
# before the notebook can run on another machine.
msa_path = '/nfshomes/vla/cmsc702-protein-lm/PF00076_full'
# Read the file as a single Stockholm-format alignment.
msa = AlignIO.read(msa_path, 'stockholm')

# Display the alignment's repr.
msa

<<class 'Bio.Align.MultipleSeqAlignment'> instance (560356 records of length 776) at 7f141c1a9f30>

In [6]:
# path to pdb file, and the PDB id used to label the parsed structure
# (pdb_id is also used further down to query the PDBe mapping API)
pdb_path = '../data/rrm/pdb1d8z.ent'
pdb_id = "1d8z"

# Parse the PDB file into a Bio.PDB structure object.
parser = PDBParser()
structure = parser.get_structure(pdb_id, pdb_path)
structure

<Structure id=1d8z>

In [7]:
# Sequence of the parsed structure, as returned by the project helper
# utils.pdb.seq_from_structure.
# NOTE(review): which chains/models the helper includes is not visible here.
pdb_sequence = seq_from_structure(structure)
pdb_sequence

'MDSKTNLIVNYLPQNMTQDEFKSLFGSIGDIESCKLVRDKITGQSLGYGFVNYSDPNDADKAINTLNGLKLQTKTIKVSYARPSSASIR'

In [8]:
# Length of the sequence extracted from the structure.
len(pdb_sequence)

89

In [9]:
# Ask the PDBe mappings API which UniProt entries correspond to this PDB id.
# The timeout keeps the cell from hanging indefinitely if the service is
# unreachable, and raise_for_status() surfaces HTTP errors here instead of
# letting them show up later as a confusing JSON/KeyError.
response = requests.get(
    'https://www.ebi.ac.uk/pdbe/api/mappings/uniprot/' + pdb_id,
    timeout=30,
)
response.raise_for_status()
pdb_uniprot_info = response.json()[pdb_id]['UniProt']

if not pdb_uniprot_info:
    raise LookupError(f"PDBe returned no UniProt mapping for PDB id {pdb_id!r}")

# Use the first entry listed in the mapping.
key_id = next(iter(pdb_uniprot_info))

pdb_name = pdb_uniprot_info[key_id]['name']
pdb_name  # Uniprot entry name

'ELAV3_MOUSE'

In [10]:
def find_seq_in_MSA(msa, target_name, start=-1, end=-1):
    """Find the sequence corresponding to the PDB entry in the MSA.

    MSA record ids are expected to look like ``NAME/START-END``, where
    ``START-END`` is the UniProt residue range of the aligned sequence.

    Parameters
    ----------
    msa : iterable of records
        Each record must expose an ``id`` string attribute.
    target_name : str
        Name to match against the part of the id before the first ``/``.
    start, end : int, optional
        Query residue range. When both are given (neither is -1), a record is
        only returned if its own range contains ``[start, end]``. When either
        is -1 the range is ignored and the first name match is returned.

    Returns
    -------
    The first matching record, or ``''`` if no record matches.
    """
    check_range = start != -1 and end != -1

    for record in msa:
        id_parts = record.id.split('/')

        # Check the name first so the range is only parsed for candidate records.
        if id_parts[0] != target_name:
            continue
        if not check_range:
            return record

        # Ids without a well-formed "/START-END" suffix cannot satisfy a range
        # query; skip them instead of crashing the whole search.
        range_parts = id_parts[1].split('-') if len(id_parts) > 1 else []
        if len(range_parts) < 2:
            continue
        try:
            rec_start = int(range_parts[0])
            rec_end = int(range_parts[1])
        except ValueError:
            continue

        if start >= rec_start and end <= rec_end:
            return record

    # Kept as an empty string (not None) for backward compatibility.
    return ''

In [11]:
# reference uniprot range that we will crop MSA based on
# (-1 for both means: take the first record whose name matches)
uniprot_start = -1
uniprot_end = -1

gapped_str_record = find_seq_in_MSA(msa, pdb_name, start=uniprot_start, end=uniprot_end)

# find_seq_in_MSA signals "not found" with a non-record value; fail here with a
# clear message instead of an AttributeError on `.seq` below.
if not hasattr(gapped_str_record, 'seq'):
    raise LookupError(
        f"No MSA record found for {pdb_name!r} "
        f"(range {uniprot_start}-{uniprot_end})"
    )

gapped_str = str(gapped_str_record.seq)
gapped_str

'--------------------------------------------------LIVN-Y--L--------P--------Q-------N--------M------T---------Q---------D----------E----------F-----------K----------S---------L---------F-------------G-----------S-----I--------------G------------D-----------I------E------------S-------------C--------------K-----------------L-----------------V--------------R-----------------D-----------------kI--------------T--------------G---------------Q--------------S--------------L------------G------------Y---------------G----------------F------------V-----------N----------Y---------S-----------D--------P----------N---------D---------A--------D-----------K---------A-----------I--------N------T--------L----N---G--L--KL-QTKTIK---------------------------------------------------------'

In [12]:
# Show the full record, including its id (name/range) and dbxrefs.
gapped_str_record

SeqRecord(seq=Seq('--------------------------------------------------LIVN...---'), id='ELAV3_MOUSE/41-111', name='ELAV3_MOUSE', description='ELAV3_MOUSE/41-111', dbxrefs=['PDB; 1FNX H; 127-195;', 'PDB; 1D8Z A; 6-76;'])

In [13]:
ungapped_seq = gapped_str.replace('-', '')

# raw sequence has same amount of residues as the uniprot range in the record id
# (ids look like 'NAME/START-END', with both ends inclusive)
id_start, id_end = (int(pos) for pos in gapped_str_record.id.split('/')[1].split('-'))
assert len(ungapped_seq) == id_end - id_start + 1, (
    f"{gapped_str_record.id}: expected {id_end - id_start + 1} residues, "
    f"got {len(ungapped_seq)}"
)
ungapped_seq

'LIVNYLPQNMTQDEFKSLFGSIGDIESCKLVRDkITGQSLGYGFVNYSDPNDADKAINTLNGLKLQTKTIK'

In [14]:
# Slice of the PDB-derived sequence from index 6 through 76 (the +1 makes the
# end inclusive); 6-76 is the range listed for 1D8Z chain A in the record's dbxrefs.
# NOTE(review): this uses those numbers directly as 0-based string indices —
# confirm that this holds for other structures.
pdb_sequence[6:76 + 1]

'LIVNYLPQNMTQDEFKSLFGSIGDIESCKLVRDKITGQSLGYGFVNYSDPNDADKAINTLNGLKLQTKTIK'

In [15]:
# The MSA row with gaps removed (upper-cased) must equal the same PDB slice.
assert ungapped_seq.upper() == pdb_sequence[6:76 + 1]

In [16]:
# Length of the full PDB-derived sequence.
len(pdb_sequence)

89

In [17]:
# Number of leading '-' characters in the reference row, i.e. the index of its
# first non-gap column.
start = len(gapped_str) - len(gapped_str.lstrip('-'))
start # inclusive msa col to start at

50

In [18]:
# Dropping the trailing '-' run leaves a string whose length is the index just
# past the reference row's last non-gap column.
end = len(gapped_str.rstrip('-'))
end # exclusive msa col to end at

719

In [None]:
# updates msa with trimmed seq: every record is cut to the [start, end) column
# window of the reference row.
# once this is done, msa can be written to file
#
# The records are modified in place, so re-running this cell would trim the
# already-trimmed alignment a second time. Only trim while the alignment still
# has its original width; a re-run on an already-trimmed alignment is a no-op.
full_width = len(gapped_str)
trimmed_width = end - start
current_width = msa.get_alignment_length()

if current_width == full_width:
    for record in msa:
        record.seq = Seq(str(record.seq)[start:end])
elif current_width != trimmed_width:
    raise ValueError(
        f"Unexpected alignment width {current_width}: expected {full_width} "
        f"(untrimmed) or {trimmed_width} (already trimmed)"
    )

print(msa)