In [2]:
from Bio import AlignIO
from Bio.Seq import Seq
from Bio.PDB import PDBParser
from Bio import SeqIO
import random

import sys
sys.path.append('..')
from utils.pdb import seq_from_structure
import requests
import re

In [3]:
# Path to the FULL Pfam alignment (PF00043) in Stockholm format
msa_path = '/nfshomes/vla/cmsc702-protein-lm/PF00043_full'
# Read the whole file as one alignment object
msa = AlignIO.read(msa_path, 'stockholm')

msa

<<class 'Bio.Align.MultipleSeqAlignment'> instance (41514 records of length 582) at 7effa2a49150>

In [4]:
# Keep (name, dbxrefs) for every alignment record that carries at least one
# database cross-reference.
refs = [(entry.name, entry.dbxrefs) for entry in msa if entry.dbxrefs]

len(refs)

83

In [5]:
# Show the collected (record name, dbxrefs) pairs
refs

[('A0A1S4FIB3_AEDAE',
  ['PDB; 7EBT C; 121-197;',
   'PDB; 7EBU D; 121-197;',
   'PDB; 7EBT A; 121-197;',
   'PDB; 7EBW C; 121-197;',
   'PDB; 7EBT D; 121-197;',
   'PDB; 7EBU A; 121-197;',
   'PDB; 7EBW A; 121-197;',
   'PDB; 7EBW B; 121-197;',
   'PDB; 7EBW D; 121-197;',
   'PDB; 7EBV B; 121-197;',
   'PDB; 7EBV D; 121-197;',
   'PDB; 7EBU C; 121-197;',
   'PDB; 7EBV A; 121-197;',
   'PDB; 7EBT B; 121-197;',
   'PDB; 7EBV C; 121-197;',
   'PDB; 7EBU B; 121-197;']),
 ('B9RWR5_RICCO', ['PDB; 4J2F A; 91-196;']),
 ('GSTM7_MOUSE', ['PDB; 2DC5 B; 111-199;', 'PDB; 2DC5 A; 111-199;']),
 ('A5E437_LODEL',
  ['PDB; 4IVF D; 142-212;',
   'PDB; 4IVF B; 142-212;',
   'PDB; 4IVF G; 142-212;',
   'PDB; 4IVF H; 142-212;',
   'PDB; 4IVF E; 142-212;',
   'PDB; 4IVF A; 142-212;',
   'PDB; 4IVF C; 142-212;',
   'PDB; 4IVF F; 142-212;']),
 ('A1ZB72_DROME', ['PDB; 4PNG B; 107-196;', 'PDB; 4PNG A; 107-196;']),
 ('YFCG_ECOLI',
  ['PDB; 5HFK A; 127-197;', 'PDB; 5HFK B; 127-197;', 'PDB; 3GX0 A; 127-197;']),
 (

In [6]:
# PDB id of the structure of interest; also used as the lookup key in later cells
pdb_id = "1b4p"

In [7]:
# Path to the local PDB-format file for the structure
pdb_path = '/nfshomes/vla/cmsc702-protein-lm/data/pf00043/pdb1b4p.ent'


# Parse the file into a Biopython Structure labelled with pdb_id
parser = PDBParser()
structure = parser.get_structure(pdb_id, pdb_path)
structure

<Structure id=1b4p>

In [8]:
# Residue sequence of the structure via the project helper
# (the output below shows it is a plain one-letter-code string)
pdb_sequence = seq_from_structure(structure)
pdb_sequence

'PMILGYWNVRGLTHPIRLLLEYTDSSYEEKRYAMGDAPDYDRSQWLNEKFKLGLDFPNLPYLIDGSRKITQSNAIMRYLARKHHLCGETEEERIRVDVLENQAMDTRLQLAMVCYSPDFERKKPEYLEGLPEKMKLYSEFLGKQPWFAGNKITYVDFLVYDVLDQHRIFEPKCLDAFPNLKDFVARFEGLKKISDYMKSGRFLSKPIFAKMAFWNPK'

In [9]:
# Number of residues in the sequence extracted from the structure
len(pdb_sequence)

217

In [10]:
# Ask the PDBe mappings API which UniProt entries this PDB id maps to.
response = requests.get(
    'https://www.ebi.ac.uk/pdbe/api/mappings/uniprot/' + pdb_id,
    timeout=30,  # do not hang the notebook if the service is unreachable
)
# Surface HTTP errors here rather than as a confusing KeyError / JSON error below.
response.raise_for_status()
pdb_uniprot_info = response.json()[pdb_id]['UniProt']

# Several entries may be listed; the first key is used.
# NOTE(review): keys are presumably UniProt accessions -- confirm against the API docs.
key_id = list(pdb_uniprot_info)[0]

pdb_name = pdb_uniprot_info[key_id]['name']
pdb_name  # Uniprot entry name

'GSTM2_RAT'

In [11]:
# find sequence corresponding to PDB in the MSA
def find_seq_in_MSA(msa, target_name, start=-1, end=-1):
    """Return the MSA records whose name matches ``target_name``.

    Record ids are expected to look like ``NAME/START-END``, where START-END
    is the residue range of the sequence that is aligned. When ``start``
    and/or ``end`` is given (non-negative), a record is only returned if its
    own range contains the requested bound(s).

    Args:
        msa: iterable of records exposing an ``id`` attribute.
        target_name: name compared with the part of the id before ``/``.
        start: first residue that must be covered; negative (default) = no check.
        end: last residue that must be covered; negative (default) = no check.

    Returns:
        List of matching records, in the order they appear in ``msa``.
    """
    records = []
    for record in msa:
        rec_name, _, rec_range = record.id.partition('/')
        if rec_name != target_name:
            continue

        if start >= 0 or end >= 0:
            # A range filter was requested: the record's own range must cover it.
            range_start, _, range_end = rec_range.partition('-')
            try:
                rec_start = int(range_start)
                rec_end = int(range_end)
            except ValueError:
                # The id carries no usable START-END, so coverage cannot be confirmed.
                continue
            if start >= 0 and rec_start > start:
                continue
            if end >= 0 and rec_end < end:
                continue

        records.append(record)

    return records

In [12]:
# All alignment records whose name equals the UniProt entry name of the structure
records = find_seq_in_MSA(msa, pdb_name)
records

[SeqRecord(seq=Seq('------------------------------------------------------...---'), id='GSTM2_RAT/104-192', name='GSTM2_RAT', description='GSTM2_RAT/104-192', dbxrefs=['PDB; 1B4P A; 103-191;'])]

In [13]:
# Use the first matching record as the reference entry for the rest of the notebook
selected_record = records[0]
selected_record

SeqRecord(seq=Seq('------------------------------------------------------...---'), id='GSTM2_RAT/104-192', name='GSTM2_RAT', description='GSTM2_RAT/104-192', dbxrefs=['PDB; 1B4P A; 103-191;'])

In [15]:
# A dbxref looks like 'PDB; 1B4P A; 103-191;': the third ';'-separated field
# holds the residue range, which is then split on '-'.
first_xref = selected_record.dbxrefs[0]
range_field = first_xref.split(';')[2]
seq_range = range_field.split('-')
seq_range


[' 103', '191']

In [16]:
# Turn the dbxref range into a Python [start, end) slice of pdb_sequence by
# lowering the first bound by one.
# FIXME: for some entries this mapping is shifted down by one -- cause unknown.
range_first, range_last = seq_range[0], seq_range[1]
seq_start = int(range_first) - 1
seq_end = int(range_last)
(seq_start, seq_end)

(102, 191)

In [22]:
# Aligned row of the reference record, gap characters included
gapped_str = str(selected_record.seq)
# Same row with every '-' removed
ungapped_seq = ''.join(gapped_str.split('-'))
# TODO: re-enable this length check once uniprot_ref_start / uniprot_ref_end
# are defined (they are not set anywhere in the cells above):
# assert len(ungapped_seq) == uniprot_ref_end - uniprot_ref_start + 1

In [26]:
# Upper-cased ungapped entry, for comparison with the PDB-derived sequence
ungapped_seq.upper()  

'AMDTRLQLAMVCYSPDFERKKPEYLEGLPEKMKLYSEFLGKQPWFAGNKITYVDFLVYDVLDQHRIFEPKCLDAFPNLKDFVARFEGLK'

In [27]:
# Slice of the PDB-derived sequence selected by the dbxref range
pdb_sequence[seq_start:seq_end]

'AMDTRLQLAMVCYSPDFERKKPEYLEGLPEKMKLYSEFLGKQPWFAGNKITYVDFLVYDVLDQHRIFEPKCLDAFPNLKDFVARFEGLK'

In [28]:
# Sanity check: the selected slice of the PDB sequence must equal the
# upper-cased, ungapped MSA entry. Do not continue if this fails.
assert pdb_sequence[seq_start:seq_end] == ungapped_seq.upper()

In [29]:
# The count of leading '-' characters is the index of the first non-gap column
without_leading_gaps = gapped_str.lstrip('-')
msa_start = len(gapped_str) - len(without_leading_gaps)
msa_start # inclusive msa col to start at

115

In [30]:
# Length left after dropping trailing '-' is one past the last non-gap column
msa_end = len(gapped_str.rstrip('-'))
msa_end # exclusive msa col to end at

532

In [31]:
# Metadata describing how the PDB structure maps onto the alignment
meta = {
    'pfam_id': 'PF00043',
    'pdb_id': pdb_id,
    'pdb_name': pdb_name,
    # range of pdb sequence that the MSA entry refers to, [start,end) python indexing
    'seq_range': (seq_start, seq_end),
    # range of msa cols to crop the msa to so it includes the MSA entry of interest,
    # [start,end) python indexing
    'msa_range': (msa_start, msa_end),
    # Uniprot residue range covered by the entire pdb structure, [start,end]
    # (for record purposes), find on https://www.ebi.ac.uk/interpro/structure/PDB/
    'uniprot_range': (2, 218),
}

meta


{'pfam_id': 'PF00043',
 'pdb_id': '1b4p',
 'pdb_name': 'GSTM2_RAT',
 'seq_range': (102, 191),
 'msa_range': (115, 532),
 'uniprot_range': (2, 218)}

In [32]:
import json
import os

output_dir = '../data/pf00043'

# Persist the metadata next to the data files; tuples are stored as JSON arrays.
meta_path = os.path.join(output_dir, f'meta_{pdb_name}.json')
with open(meta_path, 'w') as meta_file:
    json.dump(meta, meta_file)

In [33]:
# Parameters for subsampling the alignment
# msa_path = '/nfshomes/vla/cmsc702-protein-lm/PF00011_full' # optional override: path to a different FULL msa stockholm file
N = 10000 # number of sequences to write (the reference entry counts as one)
L = msa_start # left msa column of the crop (inclusive)
R = msa_end # right msa column of the crop (exclusive)
output_path = f'/nfshomes/vla/cmsc702-protein-lm/data/pf00043/PF00043_{N}_msa.faa' # output path for new fasta file
threshold = 0 # minimum fraction of non-gap characters a cropped sequence needs to be kept [0-1]

In [34]:
random.seed(42)  # fixed seed so the shuffled subsample is reproducible

if threshold is None:
    threshold = 0


def gap_ratio(seq):
    """Return the fraction of non-gap characters in ``seq`` (0 if it is empty)."""
    seq = str(seq)
    if not seq:
        return 0
    return len(seq.replace('-', '')) / len(seq)


# Crop the reference entry once; it is always written first in the output.
selected_cropped = selected_record[L:R]

# Cropped sequences already kept, stored as plain strings. The set must hold
# the whole reference sequence: set(seq) would build a set of its single
# characters, and the reference row would then be kept a second time.
seen = {str(selected_cropped.seq)}
records = []

# Parsing errors are deliberately not swallowed: continuing after a failure
# would write stale or partial records to output_path.
for rec in SeqIO.parse(msa_path, 'stockholm'):
    cropped = rec[L:R]
    cropped_seq = str(cropped.seq)
    if cropped_seq in seen:
        continue  # duplicate of the reference or of an earlier record
    if gap_ratio(cropped_seq) >= threshold:
        seen.add(cropped_seq)
        records.append(cropped)

random.shuffle(records)
records = [selected_cropped] + records

SeqIO.write(records[0:N], output_path, "fasta")



10000

In [35]:
# First output record: the cropped reference entry placed at the front of the list
records[0]

SeqRecord(seq=Seq('a-MDTRLQLAMVC-Y-S--PDF--------------------------------...-lk'), id='GSTM2_RAT/104-192', name='GSTM2_RAT', description='GSTM2_RAT/104-192', dbxrefs=[])

In [36]:
# Last character of the cropped reference row
records[0].seq[-1]

'k'

In [37]:
# Writes the same first-N records to output_path again, overwriting the file written earlier
SeqIO.write(records[0:N], output_path, "fasta")

10000