In [1]:
import os
from util import *
from search import search, _find_gapped_pairs
from agnostic_search import search as agnostic_search, _find_gapped_pairs as agnostic_find_gapped_pairs
from prototype import *
from random import shuffle, randint

# Shared parameters for every search / validation cell below.
alphabet = AMINO_MASS_MONO
tolerance = AVERAGE_MASS_DIFFERENCE * 2
# Extremes of the alphabet; handed to the gap-agnostic pair finder as its bounds.
amin, amax = min(alphabet), max(alphabet)

In [None]:
# Load the sequences, shuffle them in place, and keep the first 100.
seqs = load_fasta_records_as_str("uniprot_sprot.fasta")
shuffle(seqs)
seqs = seqs[:100]

# Run digest_trypsin on every sequence, flatten the result into one list,
# and drop any peptide containing 'X'.
peps = [
    pep
    for pep in collapse_second_order_list(map(digest_trypsin, seqs))
    if 'X' not in pep
]

# One spectrum per peptide, passed through np.unique; specs[i] belongs to peps[i].
specs = [np.unique(generate_spectrum_and_list_mz(pep)) for pep in peps]

## Timing and Sanity Checks

In [None]:
from time import perf_counter, time

def duration(fn, specs, *args):
    """Return the total seconds spent calling ``fn(spec, *args)`` on every spectrum.

    Parameters
    ----------
    fn : callable
        Function being timed; called once per spectrum, return value discarded.
    specs : iterable
        Spectra to pass to ``fn``, one per call, as the first positional argument.
    *args
        Extra positional arguments forwarded unchanged after the spectrum.

    Returns
    -------
    float
        Elapsed seconds for the whole loop.
    """
    # perf_counter is monotonic and high-resolution, so the difference cannot be
    # skewed by system clock adjustments the way time() differences can.
    start = perf_counter()
    for spec in specs:
        fn(spec, *args)
    return perf_counter() - start

In [None]:
# Total elapsed seconds for old_search_overlap(spec, tolerance, alphabet) over every spectrum in specs.
duration(old_search_overlap,specs,tolerance,alphabet)

In [None]:
# Total elapsed seconds for old_search_overlap_alt(spec, tolerance, alphabet) over every spectrum in specs.
duration(old_search_overlap_alt,specs,tolerance,alphabet)

In [None]:
# Total elapsed seconds for search(spec, "overlap", alphabet, tolerance) over every spectrum in specs.
# Note the argument order here (alphabet, then tolerance) is the reverse of the old_search_* calls.
duration(search,specs,"overlap",alphabet,tolerance)

In [None]:
# Total elapsed seconds for search(spec, "overlap_alt", alphabet, tolerance) over every spectrum in specs.
# Note the argument order here (alphabet, then tolerance) is the reverse of the old_search_* calls.
duration(search,specs,"overlap_alt",alphabet,tolerance)

## Manual Validation

#### Compare Inferred Pivots to True Pivots
For validation on large datasets, run `python3 prototype.py test path/to/fasta`.

In [None]:
# Pick one spectrum at random, together with the peptide at the same index.
i = randint(0, len(specs) - 1)
spec, pep = specs[i], peps[i]

# Pivot inferred from the spectrum alone.
p = locate_pivot_point(spec, tolerance)
print("pivot", p)

# Reference pivot: mean of b[0:2] and y[-3:-1] taken from the peptide's ion series.
b, y = get_b_ion_series(pep), get_y_ion_series(pep)
true_pivot = np.mean([*b[:2], *y[-3:-1]])
print("true pivot", true_pivot)
print("error", abs(true_pivot - p))

# Measured mirror symmetry about the inferred pivot, next to (n - 1) / n for n peaks.
print(
    "symmetry, expected symmetry",
    measure_mirror_symmetry(spec, p),
    (len(spec) - 1) / len(spec),
)

#### Compare gap-driven and gap-agnostic pair discovery

In [None]:
# Pick one spectrum at random and run both pair-discovery routines on it.
i = randint(0, len(specs) - 1)
spec = specs[i]

# Gap-driven: one _find_gapped_pairs call per alphabet entry, flattened into one collection.
pairs_g = collapse_second_order_list(
    [_find_gapped_pairs(spec, tg, tolerance) for tg in alphabet]
)
# Gap-agnostic: one call, given the min and max of the alphabet instead of each entry.
pairs_a = agnostic_find_gapped_pairs(spec, amin, amax, tolerance)
print("gap-driven pairs", len(pairs_g))
print("gap-agnostic pairs", len(pairs_a))

# Count the gap-driven pairs that also appear among the gap-agnostic pairs.
# A generator keeps the loop variable local, so the notebook-level pivot `p`
# set by the pivot-validation cell is not overwritten here.
inc = sum(1 for pair in pairs_g if pair in pairs_a)

# A spectrum can yield no gap-driven pairs at all; report that instead of
# dividing by zero.
if len(pairs_g) > 0:
    print("% gap-driven pairs present in gap-agnostic pairs", 100 * inc / len(pairs_g))
else:
    print("% gap-driven pairs present in gap-agnostic pairs", "n/a (no gap-driven pairs found)")

In [None]:
def get_gap(x):
    """Return the gap spanned by a pair, i.e. ``x[1] - x[0]``."""
    return x[1] - x[0]

# Order the gap-agnostic pairs by increasing gap, then display the gaps themselves.
candidate_pairs = sorted(pairs_a, key=get_gap)
[get_gap(pair) for pair in candidate_pairs]

In [None]:
# Run agnostic_search, passing "overlap", on `spec` -- i.e. whichever spectrum the most
# recently executed validation cell above assigned to that name.
agnostic_search(spec,"overlap",alphabet,tolerance)

## Miscellaneous

#### Create Swiss-Prot Subset

In [11]:
# Load the records from uniprot_sprot.fasta once; the subset cells below pass this object
# to create_sprot_subset.
sprot = load_fasta_records("uniprot_sprot.fasta")

def create_sprot_subset(sprot,count,filename_pattern):
    """Write a random subset of ``sprot`` to a new FASTA file.

    The output name is ``filename_pattern.format(n)`` for the smallest
    ``n >= 0`` that does not name an existing file, so earlier subsets are
    never overwritten. The chosen filename and the value returned by
    ``SeqIO.write`` are printed.

    Parameters
    ----------
    sprot : sequence
        Records to sample from. Left unmodified.
    count : int
        Number of records to keep (``shuffled[:count]``; fewer are written if
        ``sprot`` is shorter).
    filename_pattern : str
        ``str.format`` pattern with one positional placeholder for the
        counter, e.g. ``"sprot_1k_{}.fasta"``.

    Raises
    ------
    ValueError
        If the first candidate file already exists and ``filename_pattern``
        produces the same name for the next counter (no usable placeholder),
        which would otherwise loop forever.
    """
    # Shuffle a copy so the caller's record list keeps its original order.
    shuffled = list(sprot)
    shuffle(shuffled)
    sprot_subset = shuffled[:count]

    # Probe successive counter values until an unused filename is found.
    counter = 0
    filename = filename_pattern.format(counter)
    while os.path.isfile(filename):
        counter += 1
        candidate = filename_pattern.format(counter)
        if candidate == filename:
            # The pattern ignores the counter, so no free name can ever be reached.
            raise ValueError(
                "filename_pattern {!r} has no counter placeholder and the file "
                "already exists".format(filename_pattern)
            )
        filename = candidate
    print(filename)
    # NOTE(review): SeqIO is not imported explicitly in this notebook; presumably it
    # arrives through one of the star imports -- confirm.
    with open(filename,"w") as handle:
        print(SeqIO.write(sprot_subset, handle, "fasta"))

In [14]:
# Write a random 1,000-record subset to the first unused file name of the form sprot_1k_<n>.fasta.
create_sprot_subset(sprot,1000,"sprot_1k_{}.fasta")

sprot_1k_1.fasta
1000


In [15]:
# Write a random 10,000-record subset to the first unused file name of the form sprot_10k_<n>.fasta.
create_sprot_subset(sprot,10000,"sprot_10k_{}.fasta")

sprot_10k_1.fasta
10000


In [17]:
# Write a random 100,000-record subset to the first unused file name of the form sprot_100k_<n>.fasta.
create_sprot_subset(sprot,100000,"sprot_100k_{}.fasta")

sprot_100k_1.fasta
100000
