# Process FASTA files
- create theoretically considered peptides considere by search engines
- dump results

> Based on notebook received by Annelaura Bach and created by

In [1]:
import pandas as pd
from tqdm.notebook import tqdm

## Core Functionality

In [2]:
def cleave_to_tryptic(seq="", num_missed_cleavages=1, reversed=False):
#    Takes a sequence and returns an array of peptides cleaved C-term to R  and K
#    number of missed cleavages can be 0 or 1
#    works with single letter sequences
#    :param seq:
#    num_missed_cleavges,
#    reversed: boolean flag. If true, append the reversed peptide spequences as well
#    :return: list of strings
    
    if seq == "":
        return
    add_rxk = True  # flag for adding bonus peptides at patterns like KXR, which trypsin doesn't cleave efficiently

    seq.replace(' ', '')  # sequence must not contain whitespaces todo: all whitespaces
    seq = seq.upper()
    seq = seq.replace('K', 'K ')  # introduce a white space so we can split there
    seq = seq.replace('R', 'R ')
    pep_seqs = seq.split()

    last_pep = ""
    temp_peps = []
    if num_missed_cleavages == 0 and add_rxk:
        for pep in pep_seqs:
            if 0 < len(last_pep) <= 2:
                temp_peps.append(last_pep+pep)
            last_pep = pep
        pep_seqs += temp_peps
        
    sec_last_pep = ""
    if num_missed_cleavages == 1:  # add joined peptides
        for pep in pep_seqs:
            if last_pep != "":
                temp_peps.append(last_pep + pep)
            if add_rxk and sec_last_pep != "" and len(sec_last_pep) <= 2:
                temp_peps.append(sec_last_pep + last_pep + pep)
            sec_last_pep = last_pep  # sec_last_pep, last_pep = last_pep, pep ?
            last_pep = pep
        pep_seqs += temp_peps
    # # if num_missed_cleavages == 2: todo: implement two missed cleavages

    if reversed:
        reversed_seqs = []
        for pep in pep_seqs:
            if str(pep).endswith('K'):  # change this to change the MW of the peptide
                pep = pep[0:-1] + 'r'
            elif str(pep).endswith('R'):
                pep = pep[0:-1] + 'k'
            rev_pep = pep[::-1]
            reversed_seqs.append(rev_pep)

        pep_seqs += reversed_seqs

    return pep_seqs

In [3]:
def read_fasta(fp):
    """Read a fasta file."""
    name, seq = None, []
    for line in fp:
        line = line.rstrip()
        if line.startswith(">"):
            if name: yield (name, ''.join(seq))
            name, seq = line, []
        else:
            seq.append(line)
    if name: yield (name, ''.join(seq))

In [4]:
def iterFlatten(root):
    """Flatten a nested structure."""
    if isinstance(root, (list, tuple)):
        for element in root:
            for e in iterFlatten(element):
                yield e
    else:
        yield root

## Process Fasta Files

First define input Folder and the file location of the created peptides:

In [5]:
DATA_DIR  = 'data/fasta'
OUTPUT_FILE = 'data/tryptic_peptides.json'

In [6]:
from src.file_utils import search_files
fasta_files = search_files(path='.', query='.fasta')
print("\n".join(fasta_files))

data\fasta\UP000005640_9606.fasta
data\fasta\UP000005640_9606_additional.fasta


Set input FASTA, Output .txt name, lower legth cutoff, missed cleavages and if to report reverse

Tryptic digest of Fastas to Peptides >6 in list for matching with measured peptides  

In [7]:
CUTOFF_LEN_PEP = 6
MAX_MISSED_CLEAVAGES = 1
DECOY_REVERSE = False

print(
    "Set hyperparameters for peptide creation:\n"
    f"Minimal peptide length: {MAX_MISSED_CLEAVAGES+1}\n"
    f"Considered maximum of missed cleavage sites: {MAX_MISSED_CLEAVAGES}\n"
    f"Decoy reverse?: {DECOY_REVERSE}")

Set hyperparameters for peptide creation:
Minimal peptide length: 2
Considered maximum of missed cleavage sites: 1
Decoy reverse?: False


From the [Fasta Meta information](https://ebi14.uniprot.org/help/fasta-headers) the Identifier is extracted.

In [8]:
peptides = {}

for _fasta in tqdm(fasta_files):
    
    with open(_fasta) as fp:
        for i, (metainfo, seq) in tqdm(enumerate(read_fasta(fp))):
            name = metainfo.split("|")[1]
            _all_peptides = cleave_to_tryptic(seq, num_missed_cleavages=MAX_MISSED_CLEAVAGES, reversed=DECOY_REVERSE)
            # ToDo: Check for key: what happens if key exists?
            peptides[name] = [_pep for _pep in _all_peptides if len(_pep)> CUTOFF_LEN_PEP]            

HBox(children=(FloatProgress(value=0.0, max=2.0), HTML(value='')))

HBox(children=(FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0), HTML(value=''…




HBox(children=(FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0), HTML(value=''…





In [9]:
%%time
from random import sample
sample(list(peptides), 10)

Wall time: 7.04 ms


['P04196',
 'D6RB42',
 'D6R9U1',
 'F5H6T9',
 'C9JH19',
 'Q86WG5-3',
 'H3BVD2',
 'D6RJ89',
 'P78364',
 'O43423']

`>tr` or `>sp`

In [10]:
%%time
import json

with open(OUTPUT_FILE, 'w') as f:
    json.dump(peptides, f)

Wall time: 11.1 s
