In [1]:
from Bio import AlignIO
from Bio.Seq import Seq
from Bio.PDB import PDBParser
from Bio import SeqIO
import random

import sys
sys.path.append('..')
from utils.pdb import seq_from_structure
import requests
import re

In [2]:
# path to FULL msa stockholm file
# NOTE(review): absolute, machine-specific path; the same value is assigned again in the
# "Parameters" cell further down, so the two must be kept in sync.
msa_path = '/nfshomes/vla/cmsc702-protein-lm/PF00076_full'
# Reads the whole alignment into memory as a single MultipleSeqAlignment.
msa = AlignIO.read(msa_path, 'stockholm')

msa

<<class 'Bio.Align.MultipleSeqAlignment'> instance (560356 records of length 776) at 7fc007e08d60>

In [3]:
# PDB identifier of the reference structure. It is appended to the PDBe URL and used
# as the key into the JSON response in a later cell.
pdb_id = "1d8z"

In [4]:
# path to pdb file
# NOTE(review): absolute, machine-specific path.
pdb_path = '/nfshomes/vla/cmsc702-protein-lm/data/rrm/pdb1d8z.ent'


# Parse the PDB file into a Bio.PDB Structure labelled with pdb_id.
parser = PDBParser()
structure = parser.get_structure(pdb_id, pdb_path)
structure

<Structure id=1d8z>

In [5]:
# Residue sequence of the parsed structure, obtained from the project helper in utils.pdb.
# NOTE(review): how the helper treats multiple chains/models is not visible here - confirm.
pdb_sequence = seq_from_structure(structure)
pdb_sequence

'MDSKTNLIVNYLPQNMTQDEFKSLFGSIGDIESCKLVRDKITGQSLGYGFVNYSDPNDADKAINTLNGLKLQTKTIKVSYARPSSASIR'

In [6]:
# Number of residues in the structure's sequence.
len(pdb_sequence)

89

In [7]:
# Look up which UniProt entry this PDB structure maps to via the PDBe mappings API.
response = requests.get(
    'https://www.ebi.ac.uk/pdbe/api/mappings/uniprot/' + pdb_id,
    timeout=30,  # fail instead of hanging the kernel if the service is unreachable
)
# Surface HTTP errors here rather than as a confusing KeyError/JSON error below.
response.raise_for_status()
pdb_uniprot_info = response.json()[pdb_id]['UniProt']

# Take the first mapping listed (presumably keyed by UniProt accession - verify if a
# structure maps to several entries).
key_id = next(iter(pdb_uniprot_info))

pdb_name = pdb_uniprot_info[key_id]['name']
pdb_name  # Uniprot entry name

'ELAV3_MOUSE'

In [8]:
def find_seq_in_MSA(msa, target_name, start=-1, end=-1):
    """Find the MSA rows that belong to a given UniProt entry.

    Record ids are expected to look like ``NAME/START-END``, where ``START-END``
    is the inclusive UniProt residue range covered by that aligned row.

    Args:
        msa: iterable of records exposing an ``id`` string attribute.
        target_name: entry name compared with the part of the id before ``/``.
        start: optional query residue; when >= 0, only rows whose range begins
            at or before it are kept. The default (-1) disables this check.
        end: optional query residue; when >= 0, only rows whose range ends at
            or after it are kept. The default (-1) disables this check.

    Returns:
        List of matching records, in MSA order (empty if nothing matches).
    """
    check_start = start is not None and start >= 0
    check_end = end is not None and end >= 0

    records = []
    for record in msa:
        rec_name, _, rec_range = record.id.partition('/')
        if rec_name != target_name:
            continue

        if check_start or check_end:
            # Only parse the residue range when a filter was actually requested.
            try:
                rec_start, rec_end = (int(bound) for bound in rec_range.split('-'))
            except ValueError:
                # No usable "START-END" suffix, so containment cannot be confirmed.
                continue
            if check_start and rec_start > start:
                continue
            if check_end and rec_end < end:
                continue

        records.append(record)

    return records

In [9]:
# All MSA rows belonging to the UniProt entry of our structure (no range filter passed).
# NOTE: the name `records` is reassigned later in the notebook to the list of selected MSA rows.
records = find_seq_in_MSA(msa, pdb_name)
records

[SeqRecord(seq=Seq('--------------------------------------------------LIVN...---'), id='ELAV3_MOUSE/41-111', name='ELAV3_MOUSE', description='ELAV3_MOUSE/41-111', dbxrefs=['PDB; 1FNX H; 127-195;', 'PDB; 1D8Z A; 6-76;']),
 SeqRecord(seq=Seq('--------------------------------------------------LYVS...---'), id='ELAV3_MOUSE/127-195', name='ELAV3_MOUSE', description='ELAV3_MOUSE/127-195', dbxrefs=['PDB; 1D9A A; 4-72;', 'PDB; 1FNX H; 127-195;']),
 SeqRecord(seq=Seq('--------------------------------------------------IFVY...---'), id='ELAV3_MOUSE/286-356', name='ELAV3_MOUSE', description='ELAV3_MOUSE/286-356', dbxrefs=[])]

In [10]:
# Use the first match as the reference row; in the output above it is the row whose
# dbxrefs mention 1D8Z. NOTE(review): chosen by position, not by checking the dbxrefs.
record = records[0]
record

SeqRecord(seq=Seq('--------------------------------------------------LIVN...---'), id='ELAV3_MOUSE/41-111', name='ELAV3_MOUSE', description='ELAV3_MOUSE/41-111', dbxrefs=['PDB; 1FNX H; 127-195;', 'PDB; 1D8Z A; 6-76;'])

In [11]:
# Select the cross-reference for OUR structure by PDB id instead of by list position:
# the order of dbxrefs differs between rows, so a fixed index can silently pick the
# range of a different structure.
pdb_xref_prefix = f'PDB; {pdb_id.upper()} '
matching_xrefs = [xref for xref in record.dbxrefs if xref.upper().startswith(pdb_xref_prefix)]
if not matching_xrefs:
    raise ValueError(f'{record.id} has no PDB cross-reference for {pdb_id}')

# e.g. 'PDB; 1D8Z A; 6-76;' -> [' 6', '76'] (the padding is harmless: int() ignores it)
seq_range = matching_xrefs[0].split(';')[2].split('-')
seq_range


[' 6', '76']

In [12]:
# The dbxref range is inclusive, so add 1 to the end to get an exclusive Python slice bound.
# NOTE(review): the numbers are used directly as offsets into pdb_sequence; that holds for
# this structure (see the equality assertion a few cells below) but is not guaranteed in general.
seq_start, seq_end = int(seq_range[0]), int(seq_range[1]) + 1
seq_start,seq_end

(6, 77)

In [13]:
gapped_str = str(record.seq)
# Remove every '-' gap character, leaving only the residues of the aligned row.
ungapped_seq = ''.join(gapped_str.split('-'))
# A length check against the UniProt range was sketched here but is disabled; the
# variables it referred to are not defined in the visible cells.
ungapped_seq

'LIVNYLPQNMTQDEFKSLFGSIGDIESCKLVRDkITGQSLGYGFVNYSDPNDADKAINTLNGLKLQTKTIK'

In [14]:
# Part of the structure's sequence that the MSA row is expected to cover.
pdb_sequence[seq_start:seq_end]

'LIVNYLPQNMTQDEFKSLFGSIGDIESCKLVRDKITGQSLGYGFVNYSDPNDADKAINTLNGLKLQTKTIK'

In [15]:
# Sanity check that the MSA row and the structure slice are the same residues.
# .upper() is needed because the MSA row contains lowercase letters (see the 'k' above).
assert ungapped_seq.upper() == pdb_sequence[seq_start:seq_end]

In [16]:
# Same length check as earlier in the notebook (repeated cell).
len(pdb_sequence)

89

In [17]:
# The count of leading gap columns is also the index of the first aligned residue.
without_leading_gaps = gapped_str.lstrip('-')
msa_start = len(gapped_str) - len(without_leading_gaps)
msa_start # inclusive msa col to start at

50

In [18]:
# Length of the row once trailing gaps are stripped, i.e. one past the last aligned residue.
msa_end = len(gapped_str.rstrip('-'))
msa_end # exclusive msa col to end at

719

In [20]:
# Metadata tying the reference structure to the MSA; written to meta.json in the next cell.
meta = {
    'pfam_id': 'PF00076',
    'pdb_id': pdb_id,                      # reuse the notebook-level id so the two cannot drift apart
    'pdb_name': pdb_name,
    'seq_range': (seq_start, seq_end),     # range of pdb sequence that MSA entry refers to [start,end) python indexing
    'msa_range': (msa_start, msa_end),     # range of msa cols to crop msa to, to include the MSA entry of interest [start,end) python indexing
    'uniprot_range': (35, 123),            # Uniprot residue range covered by the entire pdb structure [start,end] (for record purposes; entered by hand)
}

meta


{'pfam_id': 'PF00076',
 'pdb_id': '1d8z',
 'pdb_name': 'ELAV3_MOUSE',
 'seq_range': (6, 77),
 'msa_range': (50, 719),
 'uniprot_range': (35, 123)}

In [21]:
import json
import os

output_dir = '../data/rrm'

# Make sure the target directory exists so the cell also works on a fresh checkout.
os.makedirs(output_dir, exist_ok=True)

# Note: JSON has no tuple type, so the (start, end) ranges are stored as lists.
with open(os.path.join(output_dir, 'meta.json'), 'w') as f:
    json.dump(meta, f)

In [31]:
# Parameters
msa_path = '/nfshomes/vla/cmsc702-protein-lm/PF00076_full' # path to FULL msa stockholm file (same value as at the top of the notebook)
N = 10000 # maximum number of sequences written to the output file
L = msa_start # left msa column (inclusive); None disables cropping
R = msa_end # right msa column (exclusive); None disables cropping
output_path = f'/nfshomes/vla/cmsc702-protein-lm/data/rrm/PF00076_{N}_msa.faa' # output path for new fasta file
threshold = 0 # minimal fraction of NON-gap characters a row must have to be kept [0-1]; 0 keeps everything

In [32]:
# Build the list of distinct (optionally cropped) MSA rows, with the reference row first.
if threshold is None:
    threshold = 0


def gap_ratio(seq):
    """Return the fraction of NON-gap characters in `seq` (0.0 for an empty sequence).

    The name is historical: a higher value means FEWER gaps.
    """
    text = str(seq)
    if not text:
        return 0.0
    return len(text.replace('-', '')) / len(text)


# Crop to columns [L, R) only when both bounds are given; otherwise keep full rows.
flag = L is not None and R is not None

# Reference row, cropped the same way as every other row.
target = record[L:R] if flag else record[:]

# Start with the reference row as a whole string in the "seen" set so it is not
# collected a second time while scanning the file. (Passing the sequence itself to
# set() would store its individual characters instead.)
seen = {str(target.seq)}
records = []

try:
    for rec in SeqIO.parse(msa_path, 'stockholm'):
        candidate = rec[L:R] if flag else rec
        key = str(candidate.seq)
        if key in seen:
            continue
        if gap_ratio(key) >= threshold:
            seen.add(key)
            records.append(candidate)
except ValueError as err:
    # Stop here rather than continuing with a partial or stale list of rows.
    raise ValueError(f'could not parse {msa_path} as a Stockholm alignment') from err

# NOTE: the shuffle is unseeded, so the selection differs between runs; call
# random.seed(...) beforehand if a reproducible subset is needed.
random.shuffle(records)
records = [target] + records


In [33]:
# The first entry should be the cropped reference row that was prepended above.
records[0]

SeqRecord(seq=Seq('LIVN-Y--L--------P--------Q-------N--------M------T---...TIK'), id='ELAV3_MOUSE/41-111', name='ELAV3_MOUSE', description='ELAV3_MOUSE/41-111', dbxrefs=[])

In [34]:
# Write the reference row plus the first N-1 shuffled rows as FASTA; the displayed
# value is the number of records written.
SeqIO.write(records[0:N], output_path, "fasta")

10000

In [110]:
# NOTE(review): this cell reads results for a different family (PF00028 / cadherin) and its
# execution count (110) is out of sequence with the cells above - it looks like a leftover
# from another session. The import would normally live in the first cell.
import pandas as pd
df = pd.read_csv('/nfshomes/vla/cmsc702-protein-lm/results/cadherin/mfdca/PF00028_8000_sequences_2o72_uniprot_mfdca.csv')
# Show site pairs ordered from the strongest to the weakest mf_dca score (not stored back into df).
df.sort_values(by = 'mf_dca', ascending=False)

Unnamed: 0,first_site,second_site,mf_dca
116769,182,199,4.928587e-01
240793,498,509,4.535699e-01
261724,618,620,3.881888e-01
261953,620,624,3.840173e-01
262174,622,624,3.584000e-01
...,...,...,...
259239,598,605,3.393099e-07
259241,598,607,3.393099e-07
259243,598,609,3.393099e-07
259277,598,643,3.393099e-07
