In [2]:
from Bio import AlignIO
from Bio.Seq import Seq
from Bio.PDB import PDBParser
from Bio import SeqIO
import random

import sys
sys.path.append('..')
from utils.pdb import seq_from_structure
import requests
import re

In [3]:
# Path to the FULL Pfam MSA in Stockholm format.
# NOTE(review): absolute, user-specific path - adjust when running elsewhere.
msa_path = '/nfshomes/vla/cmsc702-protein-lm/PF00028_full'
# Read the whole file as one alignment object.
msa = AlignIO.read(msa_path, 'stockholm')

# Display the alignment summary (record count and alignment length).
msa

<<class 'Bio.Align.MultipleSeqAlignment'> instance (513304 records of length 733) at 7fd504825d80>

In [4]:
# Gather the cross-reference list of every alignment record that carries one;
# records with an empty dbxrefs list are skipped.
refs = [rec.dbxrefs for rec in msa if rec.dbxrefs]

len(refs)

211

In [6]:
# Inspect the collected cross-references.
# NOTE(review): this prints every entry; `refs[:5]` would keep the output short.
refs

[['PDB; 6VFU C; 333-423;', 'PDB; 6VFU B; 334-423;', 'PDB; 6VFU A; 333-423;'],
 ['PDB; 6VFV A; 591-682;'],
 ['PDB; 6PGW A; 549-639;'],
 ['PDB; 5ERP B; 341-433;', 'PDB; 5ERP A; 341-433;'],
 ['PDB; 3K5S A; 112-216;', 'PDB; 3K5S B; 112-216;'],
 ['PDB; 3K6D A; 5-98;'],
 ['PDB; 5EQX A; 223-326;'],
 ['PDB; 5VT8 C; 2596-2689;',
  'PDB; 7SUU A; 2710-2791;',
  'PDB; 5VT8 D; 2596-2688;',
  'PDB; 7SUU B; 2710-2792;',
  'PDB; 5VT8 B; 2596-2689;',
  'PDB; 5VT8 A; 2596-2689;'],
 ['PDB; 6CG7 A; 111-206;', 'PDB; 6CG7 B; 111-206;'],
 ['PDB; 5VVM A; 2384-2473;', 'PDB; 5VVM B; 2384-2473;'],
 ['PDB; 5EQX A; 223-326;'],
 ['PDB; 5EQX A; 223-326;'],
 ['PDB; 6BXU A; 702-789;',
  'PDB; 6BXU B; 702-789;',
  'PDB; 6BWN A; 702-789;',
  'PDB; 5TPK A; 803-896;',
  'PDB; 5W1D A; 702-789;'],
 ['PDB; 8EGW B; 363-466;', 'PDB; 8EGX B; 363-466;'],
 ['PDB; 5DZY A; 323-413;',
  'PDB; 5DZY E; 323-413;',
  'PDB; 5DZY C; 323-413;',
  'PDB; 5DZY B; 323-413;',
  'PDB; 5DZY D; 323-413;',
  'PDB; 5DZY F; 323-413;'],
 ['PDB; 2A4E A

In [7]:
# PDB accession of the structure to map onto the MSA. It is also used verbatim
# as the lookup key into the PDBe mapping response in a later cell.
pdb_id = "2o72"

In [8]:
# Path to the local PDB-format file for the structure named by pdb_id.
# NOTE(review): absolute, user-specific path - adjust when running elsewhere.
pdb_path = '/nfshomes/vla/cmsc702-protein-lm/data/cadherin/pdb2o72.ent'


# Parse the file into a Bio.PDB structure; pdb_id is only used as its label.
parser = PDBParser()
structure = parser.get_structure(pdb_id, pdb_path)
structure

<Structure id=2o72>

In [9]:
# Residue sequence of the structure as a one-letter-code string, produced by
# the project helper utils.pdb.seq_from_structure.
# NOTE(review): which chains/models the helper includes is not visible here - confirm.
pdb_sequence = seq_from_structure(structure)
pdb_sequence

'DWVIPPISSPENEKGPFPKNLVQIKSNKDKEGKVFYSITGQGADTPPVGVFIIERETGWLKVTEPLDRERIATYTLFSHAVSSNGNAVEDPMEILITVTDQNDNKPEFTQEVFKGSVMEGALPGTSVMEVTATDADDDVNTYNAAIAYTILSQDPELPDKNMFTINRNTGVISVVTTGLDRESFPTYTLVVQAADLQGEGLSTTATAVITVTD'

In [10]:
# Number of residues extracted from the structure.
len(pdb_sequence)

213

In [11]:
# Ask the PDBe mappings API which UniProt entries this PDB id maps to.
# A timeout keeps the notebook from hanging forever on a stalled connection,
# and raise_for_status() turns an HTTP error into an explicit exception instead
# of a confusing KeyError / JSON error further down.
response = requests.get(
    'https://www.ebi.ac.uk/pdbe/api/mappings/uniprot/' + pdb_id,
    timeout=30,
)
response.raise_for_status()
pdb_uniprot_info = response.json()[pdb_id]['UniProt']

# The mapping is keyed by UniProt accession; the first listed accession is used.
# NOTE(review): a structure mapped to several UniProt entries would need an
# explicit choice here.
key_id = next(iter(pdb_uniprot_info))

pdb_name = pdb_uniprot_info[key_id]['name']
pdb_name  # Uniprot entry name

'CADH1_HUMAN'

In [12]:
def find_seq_in_MSA(msa, target_name, start=-1, end=-1):
    """Find the MSA records that belong to a given entry name.

    Record ids are expected to look like ``NAME/START-END``, where START-END is
    the residue range of the entry that the record aligns. A record is returned
    when its NAME equals ``target_name`` and, if ``start`` and/or ``end`` are
    given, when each given position lies inside the record's START-END range
    (so passing both keeps only records that contain the whole query range).

    Args:
        msa: Iterable of records exposing an ``id`` attribute.
        target_name: Entry name to match against the part of the id before '/'.
        start: Query start position; a negative value (the default) or None
            disables the check.
        end: Query end position; a negative value (the default) or None
            disables the check.

    Returns:
        list: The matching records, in the order they appear in ``msa``.
    """
    # Only non-negative positions act as constraints; -1 means "no constraint".
    bounds = [pos for pos in (start, end) if pos is not None and pos >= 0]

    records = []
    for record in msa:
        rec_name, _, rec_range = record.id.partition('/')
        if rec_name != target_name:
            continue

        if bounds:
            try:
                rec_start, rec_end = (int(part) for part in rec_range.split('-'))
            except ValueError:
                # The id carries no usable START-END range, so coverage of the
                # query positions cannot be verified; leave the record out.
                continue
            if not all(rec_start <= pos <= rec_end for pos in bounds):
                continue

        records.append(record)

    return records

In [13]:
# All alignment rows whose id names the UniProt entry of our structure; the
# entry can contribute several rows, one per aligned residue range.
records = find_seq_in_MSA(msa, pdb_name)
records

[SeqRecord(seq=Seq('------------------------------------------------------...---'), id='CADH1_HUMAN/491-585', name='CADH1_HUMAN', description='CADH1_HUMAN/491-585', dbxrefs=['PDB; 7STZ C; 444-532;', 'PDB; 7STZ D; 337-430;']),
 SeqRecord(seq=Seq('------------------------------------------------------...---'), id='CADH1_HUMAN/380-478', name='CADH1_HUMAN', description='CADH1_HUMAN/380-478', dbxrefs=['PDB; 7STZ D; 337-430;', 'PDB; 7STZ C; 444-532;']),
 SeqRecord(seq=Seq('------------------------------------------------------...---'), id='CADH1_HUMAN/267-366', name='CADH1_HUMAN', description='CADH1_HUMAN/267-366', dbxrefs=['PDB; 8H62 B; 113-212;', 'PDB; 2O72 A; 113-212;', 'PDB; 7STZ D; 337-430;', 'PDB; 4ZTE A; 113-212;', 'PDB; 4ZTE B; 113-212;', 'PDB; 7STZ C; 444-532;', 'PDB; 6CXY C; 113-212;', 'PDB; 4ZT1 A; 113-212;', 'PDB; 4ZT1 B; 113-212;', 'PDB; 6VEL C; 214-313;']),
 SeqRecord(seq=Seq('------------------------------------------------------...---'), id='CADH1_HUMAN/598-686', name='CADH1_

In [14]:
# Pick the row whose cross-references include our PDB entry (2O72 in the
# output above).
# NOTE(review): the index is hard-coded from reading that output; it must be
# re-checked whenever the MSA file or pdb_id changes.
selected_record = records[2]
selected_record

SeqRecord(seq=Seq('------------------------------------------------------...---'), id='CADH1_HUMAN/267-366', name='CADH1_HUMAN', description='CADH1_HUMAN/267-366', dbxrefs=['PDB; 8H62 B; 113-212;', 'PDB; 2O72 A; 113-212;', 'PDB; 7STZ D; 337-430;', 'PDB; 4ZTE A; 113-212;', 'PDB; 4ZTE B; 113-212;', 'PDB; 7STZ C; 444-532;', 'PDB; 6CXY C; 113-212;', 'PDB; 4ZT1 A; 113-212;', 'PDB; 4ZT1 B; 113-212;', 'PDB; 6VEL C; 214-313;'])

In [18]:
# Locate the cross-reference for our PDB entry by id rather than by position:
# the order of dbxrefs is not stable (the same references show up in different
# orders on different records), so a fixed index can pick the wrong structure.
# Each reference looks like 'PDB; 2O72 A; 113-212;'. If several chains of the
# entry are listed, the first one found is used.
pdb_ref = next(
    (
        ref for ref in selected_record.dbxrefs
        if len(ref.split(';')) > 2
        and ref.split(';')[1].split()[:1] == [pdb_id.upper()]
    ),
    None,
)
if pdb_ref is None:
    raise ValueError(f'{selected_record.id} has no PDB cross-reference for {pdb_id}')

# [start, end] of the referenced residue range, still as strings; the leading
# space on the first item is harmless because int() ignores it later.
seq_range = pdb_ref.split(';')[2].split('-')
seq_range


[' 113', '212']

In [23]:
# Convert the 1-based, inclusive range from the cross-reference into a
# 0-based, half-open [start, end) slice for indexing pdb_sequence.
# NOTE(review): the original author observed that this mapping is SOMETIMES
# shifted by one and did not know why - presumably when the structure's residue
# numbering does not line up with positions in pdb_sequence; TODO confirm.
# The assert a few cells below catches a wrong offset.
seq_start = int(seq_range[0]) - 1
seq_end = int(seq_range[1])
seq_start,seq_end

(112, 212)

In [24]:
# Aligned row of the selected record as a plain string, gap columns included.
gapped_str = str(selected_record.seq)
# Drop the '-' gap characters to recover the residues of the record itself.
ungapped_seq = gapped_str.replace('-', '')
# The ungapped sequence should have as many residues as the UniProt range in
# the record id (e.g. 267-366 -> 100 residues).
# NOTE(review): the check below is disabled; uniprot_ref_start/uniprot_ref_end
# are not defined in the visible cells.
# assert len(ungapped_seq) == uniprot_ref_end - uniprot_ref_start + 1
ungapped_seq

'fKGSVMEGALPGTSVMEVTATDADddvNTYNAAIAYTILSQDPelpdKNMFTINRNTGVISVVTTgLDRESFPTYTLVVQAADLQGEGLSTTATAVITVT'

In [25]:
# Part of the structure's sequence that the cross-reference points at.
pdb_sequence[seq_start:seq_end]

'FKGSVMEGALPGTSVMEVTATDADDDVNTYNAAIAYTILSQDPELPDKNMFTINRNTGVISVVTTGLDRESFPTYTLVVQAADLQGEGLSTTATAVITVT'

In [26]:
# MSA residues upper-cased so they can be compared with the structure's sequence.
ungapped_seq.upper()  

'FKGSVMEGALPGTSVMEVTATDADDDVNTYNAAIAYTILSQDPELPDKNMFTINRNTGVISVVTTGLDRESFPTYTLVVQAADLQGEGLSTTATAVITVT'

In [28]:
# Sanity check: the MSA row (gaps removed, upper-cased) must equal the slice of
# the structure's sequence selected by seq_start/seq_end. A failure here means
# the offsets are wrong (see the shift noted where they are computed).
assert ungapped_seq.upper() == pdb_sequence[seq_start:seq_end]

In [30]:
# Number of leading '-' characters in the row, i.e. the index of the first
# alignment column where the selected record has a residue.
msa_start = len(gapped_str) - len(gapped_str.lstrip('-'))
msa_start # inclusive msa col to start at

59

In [31]:
# Everything after the last non-gap character is trailing '-' padding, so the
# length of the right-stripped row is one past the last occupied column.
msa_end = len(gapped_str.rstrip('-'))
msa_end # exclusive msa col to end at

665

In [32]:
# Metadata describing how the PDB structure lines up with the Pfam alignment.
# 'seq_range' and 'msa_range' are [start, end) in Python indexing.
meta = {
    'pfam_id': 'PF00028',
    'pdb_id': pdb_id,
    'pdb_name': pdb_name,
    # range of pdb sequence that the MSA entry refers to
    'seq_range': (seq_start, seq_end),
    # range of msa cols to crop the msa to so it includes the MSA entry of interest
    'msa_range': (msa_start, msa_end),
    # Uniprot residue range covered by the entire pdb structure, [start, end];
    # kept for record purposes, looked up on
    # https://www.ebi.ac.uk/interpro/structure/PDB/
    'uniprot_range': (155, 367),
}

meta


{'pfam_id': 'PF00028',
 'pdb_id': '2o72',
 'pdb_name': 'CADH1_HUMAN',
 'seq_range': (112, 212),
 'msa_range': (59, 665),
 'uniprot_range': (155, 367)}

In [34]:
import json
import os

output_dir = '../data/cadherin'

# Write the metadata dict as JSON to meta_<pdb_name>.json inside output_dir.
meta_json_path = os.path.join(output_dir, f'meta_{pdb_name}.json')
with open(meta_json_path, 'w') as f:
    json.dump(meta, f)

In [37]:
# Parameters for building the cropped, subsampled alignment.
msa_path = '/nfshomes/vla/cmsc702-protein-lm/PF00028_full' # path to FULL msa stockholm file (same file as loaded at the top)
N = 10000 # number of sequences to write, counting the query record placed first
L = msa_start # left msa column, inclusive
R = msa_end # right msa column, exclusive
output_path = f'/nfshomes/vla/cmsc702-protein-lm/data/cadherin/PF00028_{N}_msa.faa' # output path for new fasta file
threshold = 0 # minimum fraction of NON-gap characters a cropped row needs to be kept [0-1]; the helper that computes it is called gap_ratio, but higher means fewer gaps

In [38]:
# Build the cropped, de-duplicated, randomly subsampled alignment and write it
# as FASTA with the query record first.
#
# Changes from the previous version of this cell:
# * `seen` is seeded with the query's cropped sequence itself. set(<Seq>) made
#   a set of single characters, so the query was never recognised as already
#   seen and could be written a second time further down the file.
# * Sequences are tracked as plain strings, which hash and compare by content.
# * Each record is cropped once instead of up to four times.
# * Errors are no longer swallowed by a bare `except`: a parse failure used to
#   print a misleading message and then shuffle/write whatever `records` held
#   from an earlier cell.
# NOTE: since the duplicate query row is no longer in the pool, the shuffled
# selection differs from files produced by the old cell even with the same seed.
random.seed(42)

# Treat an unset threshold as "accept everything".
if threshold is None:
    threshold = 0


def gap_ratio(seq):
    """Return the fraction of NON-gap characters in ``seq``.

    Despite the name, 1.0 means no gaps at all and 0.0 means only gaps. An
    empty sequence yields 0.0 instead of dividing by zero.
    """
    seq = str(seq)
    if not seq:
        return 0.0
    return len(seq.replace('-', '')) / len(seq)


query_record = selected_record[L:R]
seen = {str(query_record.seq)}
records = []

for rec in SeqIO.parse(msa_path, 'stockholm'):
    cropped = rec[L:R]
    key = str(cropped.seq)
    if key in seen:
        continue
    if gap_ratio(key) >= threshold:
        seen.add(key)
        records.append(cropped)

# Random subsample of the unique rows; the query always goes first so it
# survives the cut to N sequences.
random.shuffle(records)
records = [query_record] + records

SeqIO.write(records[0:N], output_path, "fasta")



10000

In [39]:
# Confirm that the query record (selected_record cropped to columns [L, R))
# sits first in the list that was written out.
records[0]

SeqRecord(seq=Seq('f-KGSVM-E---G----A---L-------P----G-----T-------S-----...TVT'), id='CADH1_HUMAN/267-366', name='CADH1_HUMAN', description='CADH1_HUMAN/267-366', dbxrefs=[])

In [40]:
# NOTE(review): this repeats the write already done at the end of the sampling
# cell (same records, same output_path), so it only rewrites an identical file.
SeqIO.write(records[0:N], output_path, "fasta")

10000