In [1]:
# load the sequence and probabilities into memory
sequence_filename = "chr22.maf.ancestors.42000000.complete.boreo.fa.txt"
probabilities_filename = "chr22.maf.ancestors.42000000.complete.boreo.conf.txt"

with open(sequence_filename) as f:
    sequence = f.readline()

with open(probabilities_filename) as f:
    probabilities = f.readline().split()

# make all probabilities float
n = len(probabilities)
for i in range(n):
    probabilities[i] = float(probabilities[i])

In [2]:
# set random seed
import random

random.seed(1)

In [3]:
# generate test sequence

In [4]:
def get_test_sequence(length,prob_insertion,prob_deletion):
    nucleotides = ['A', 'G', 'C', 'T']
    
    # get random starting position
    
    len_sequence = len(sequence)
    # len of probabilities should be the same
    
    start_index = random.randrange(len_sequence-length-1)
    
    query = list()
    
    for i in range(start_index, start_index+length):
    
        insertion_deletion_prob = random.random()
        
        if insertion_deletion_prob < prob_insertion:
            # if we are within probability of getting an insertion
            nucleotide = nucleotides[random.randrange(4)]

        elif insertion_deletion_prob < prob_insertion + prob_deletion:
            # if we are within probability of getting a deletion
            nucleotide = ""
        
        else:
            # if there is no indel
            
            if (probabilities[i] > random.random()):
                nucleotide = sequence[i]
            else:
                temp_nucleotides = ['A', 'G', 'C', 'T']
                temp_nucleotides.remove(sequence[i])
                nucleotide = temp_nucleotides[random.randrange(3)]

        # add to our list representing the sequence
        
        query.append(nucleotide)
    
    return ["".join(query),start_index]

In [5]:
# test_name = "test_sequence"
# new_filename = test_name + ".fa"

# length = 300
# prob_insertion = 0.01
# prob_deletion = 0.01

# test_sequence_list = get_test_sequence(length,prob_insertion,prob_deletion)

# with open(new_filename, 'w') as f:
#     f.write(">" + test_name + " " + str(test_sequence_list[1]) + " " + str(length) + "\n")
#     f.write(test_sequence_list[0])

In [6]:
import blast
import os
import time

In [14]:
def test_probabilistic_blast(test_name,iterations):
    
    pblast = blast.PBlast()
    
    exact_index_matches = 0
    contains_match = 0
    
    cur_dir = os.getcwd()
    os.mkdir(test_name)
    
    for i in range(iterations):
        
        new_filename = test_name +  str(i) + ".fa"
        new_path_to_file = os.path.join(cur_dir, test_name, new_filename)

        length = 400
        prob_insertion = 0.005
        prob_deletion = 0.005

        test_sequence_list = get_test_sequence(length,prob_insertion,prob_deletion)

        with open(new_path_to_file, 'w') as f:
            f.write(">" + test_name + " " + str(test_sequence_list[1]) + " " + str(length) + "\n")
            f.write(test_sequence_list[0])
        
        # call to probabilistic blast
        pblast.load_query_file(new_path_to_file)
        
        # compare with start and end index locations
        result = pblast.probabilistic_blast()
        
        if test_sequence_list[1] == result[-1][0] and result[-1][1] - result[-1][0] == length:
            exact_index_matches += 1
        elif test_sequence_list[1] <= result[-1][0] and result[-1][1] - result[-1][0] >= length:
            contains_match += 1
        
        # check top ten results as well
        topten = 0
        for j in range(-10,-1,-1):
            if test_sequence_list[1] <= result[j][0] and result[j][1] - result[j][0] >= length:
                topten += 1
                break
    
    pblast.exit()
    
    return [exact_index_matches,contains_match,topten,iterations]
        

In [15]:
results = test_probabilistic_blast("fifth_test",100)
print(results)

total_seeds:  308
stopped before ungapped extension:  0
stopped before gapped extension:  125
HSPs:  183
BEST SCORING ALIGNMENT: 
140891
141308

query: 	
HSP: 	GCTACCTATATTTTTACCTTTTAGGGAAGAGAGCGTAGAAAATG
gap: 	-AGATTGGGTCTTTGACCAACAGCAAACAGAGGAAATTGATTTTACACATTTTTCTAGTGTATCAATGAATGAGTCTTCTGGCAAGTAGGAGACTTGGATTTTGTTTACCTGTTA-CTGTTTATCACCCCTATTTTGAAAGAAAATTTCATTTTTTTTGTCTACAAATCTGAAAGGTAACTAATATACCTTTACTTAAGATAACCTTCCAGAAAGATAACCTTCCAACAATGCTCTGGTTGTTTGTCATTTGTTTTCCTTAAAGTTTGTTTTGGTGCTAGGA-TATTATAAGAAGTAGGAATAATAGATAATATTAATTGAGTGTCTCTGTGTGTCAGGCTCGGGGCTAAG------------A---TCT--T
db: 	
HSP: 	GCTAGCAATATTTTTACCTTTTAGGGAAGAGAAATGAGAAAATG
gap: 	GAGATTGGGTCTTTGACCAACAGCAAATAGAGGAAATTGATTTTCCATATATTTCTGGTGTTTCAATGAATGTGTCTCCTGGCAAGTAGGAGATTTTGATTTTTTTTACCTGTTATCTGTTTATCACCCCTATTTTGAAAGAAAATTTCATTATTTTTGTCTACAAATCTGAAAGGTAACTAATATACCTTTACTTAAGATAACCGTCCAAAAAGATAACCATCCAACAATGCTCTGGTTGTTTGTCATTTGTTTTCCTTAAAGTTTGTTTTGGTGCTAGGATTATTATAAGAAGTAGGAATAATAGCTAATATTTATTGAGTGTCACTATGTGCCAGGCTCGGGGCTAAGC

total_seeds:  326
stopped before ungapped extension:  3
stopped before gapped extension:  149
HSPs:  174
BEST SCORING ALIGNMENT: 
153933
154371

query: 	-----A-----T---ACG--CA--C-C-CACACGCAAGCCTGCGGGGGTCCTGGACGAGCGTCACAGCTGCCCCTTCGCTCCAATGCTTTCTAGTGTG-TCAG-
HSP: 	GACAGTAAATAACAGTCCCTGACCGCTGGCCGAGTGCCGGGCACTGATCTAAACTCTTCACACTCAGAGGTAAAAGAGATGAAAAATTAGTCAAAGCAGAATATTTTTGCCTAAAATACTCTAAAAGATATGTAATTAAAATGACATGAGAAAAAGTCCCAGGATCGCAGGGGCCGCCTGGCTAGGCACTCACGGGGCTGAGGGTGGAAGTGCAGCTACAAGGGGGCCTCTGAGACCCCTCCTCCGCCCTGCAGATTGAAAGCGAGCAGCAGGGAGCAGAACCAGATTCCAGTAGGGT
gap: 	-AGGCAGTCCCAGCAGG--A-----------------

db: 	GGGGGAGGGCGTCTCACACACACGCACACACACGCACGCCTGCGCGGGTCCTGGACGAGCCTCACAGCTGCCCCTTCGCTCCAGTGCTTTCTAGTGTGAT-AGT
HSP: 	GACAGTAAATAACAGTCCCTGACCGCTGGCCGTGTGCCGGGCACTGTTCTAAACTCTTTACACTCAGAGGTAAAAGAGATGAAAAATTAGTCAAAGCAGAATAGTTTTGCCTAAAATACTCTAAAAGATATGTAATTAAAATGACATGAGAAAAAGTCCCAGGATCGAGGGGGCTGCCTGGCTTGGCACTCACGGGGCTGAGGGTGGAAGTGCAGCAGCCGGGGGGCCTCTGAGACCCCTCCTCAGCCCTGCAGGTTGAATGCCGGCAGCAGGGCC

total_seeds:  351
stopped before ungapped extension:  4
stopped before gapped extension:  238
HSPs:  109
BEST SCORING ALIGNMENT: 
497636
498055

query: 	
HSP: 	GGCCACCTGGGCTCCCGTCATTCCCGAAAAGCTCCTCCTTGGATGTCTCTAGGTCCCACCCCCCACTCTCTGACCACAGCCCCGCCCTGCCCAAGCCTCCTCCCAGGGAAGGATAGTGAGCGCTGGACTCGGCAAGGGGGGCACCGAGCCCCTGGCCTCCACGGCTCCCTGCACGGGATCATGGTCAGGGCTCCGCTCCCACCAGCCTCCGCCACCAGACGGGAGCACCTAGAGGGCAGGGTCCGTGCCTAGTTTGTCTTTGGGTTCCTGAACCTACTGCTGTGCTTGGCACAACGGGATCGGCACCAAATGAGCTATGCCGAGGGGGCTCCTCCCTCACTAGGGTGTGTCC
gap: 	-CGTTCCTCACTCACTCTTGGCCCAGCTACGAGTGGCCT-----GA--CCT---GA---T-----G-
db: 	
HSP: 	GGCCACCTGGGCTCCCGTCATTCCCCAAATGCTCCTCCTTGGAGGTCTCAAGTTCCCACCACCCACTCTCTGACCACAGCCCCGCACAGCCCAAGCCTCCTCCCAGGGAAGGCGAGTGAGCCCCGGGCTCGGCTGGGAGGGATCCGAGCCCCTGGCCTCCACGGCTCCCTGCACCGGATCATGGTCGGAGCCCCGCTCCCACCAGCCTCCCCCACCGGATGGGAGCCCCTTGAGGGCAGGGACCGTGCCTGGTTTGTCTCTGGGTCCCTGAACCTACAGCTGTGCTTGGCACAAAGAGAGCGGCACCAAATGAGCTATGGTGAGGGGGCTCCTCGCTCACTTGGGTGTGTCC
gap: 	TGGGCCCACACTCACTCTTGGCCCAGCTACGAGCGGCCTGAC

total_seeds:  330
stopped before ungapped extension:  0
stopped before gapped extension:  123
HSPs:  207
BEST SCORING ALIGNMENT: 
10402
10819

query: 	----------------CA-A-ACCT-ACAATTCCTATGATACACAAGACATTTTGGTATTTGGGTTGTAATGAGGTGTGATCTACACTGAACTAGGGCTGATTTTCTCTCCTGGCAGTTAAGTAGTTTAACGGAGCACAAACGAAATACTTCCCTTACTAATCCTTACTGACACAACAGCATCCAATCCTCCCCCCGAATTGCTTTCGTACCATCTGGGCAAAAGCAATTAACTTAGAAACTGTTGGGAGAAGACCGCAATGTCAGGCCGAACTTGGTGAAGTCTGGAGTTCGGGCTCCGGGAGACCTGACTTC-
HSP: 	CCTCTACCTGGGGGCGACGGCTGGGCCCCTTACGCAGCTCTAGCTCTCGCTTCCAATCTACCCCAGGGACTGCAGAGAGAGTTTCCTTTTGGCACGGAGGGAA

db: 	CAATATCACCTGAAATCACAAACCTGACAATTCCTATGTTACACAAGACATTTTGGTATTTGGGTTGTAATGAGGTTTGATCTACACTGAACTAGGGCTGATTTTCTCTCCCGGCAGTTAAGTAGTTTAATGGAGCACAACCGAAATACTTCCCTTACTAATCCTTACTGACACAACAGCATCCAATCCTCCCCCAGAATTGCTTTCTTACCATCTGGGCAAAAGCAATTAACTTAGAAACTGTTGAGAGAAGACCGCATTGTCAGGCCGAACTTGGTGAATTCTGGAGTTCGGGCTCCGGTTGACCTGACTTCA
HSP: 	CCTCTACCTGGGGGCGATGGCTGGGCCCCTGGTGCAGCTCTAGCTCTCTCTTCCCACCTACCCCAGGGACTGCAGAGAGAGTTTCCTTTTGGCAC

total_seeds:  408
stopped before ungapped extension:  2
stopped before gapped extension:  182
HSPs:  224
BEST SCORING ALIGNMENT: 
274856
275274

query: 	-------------T--C-A-C--TTTATCTCTTTTTTTTAGAACGGA-CCTCAGTGCTCCTGGGAGATT-TGTC-
HSP: 	AGTGAATCTCCTGTCTAAACCTCCCACGTTTCTCGGAATCCAGCCCTGCCCCAGCTGCTTCGGTAATTTAGTCCTTTCCTCTCTCCTCTTCTAATTATTTTTCTTGCTCTGAATTTGTCCAAGGACTTTAAAATCAAGTTGCTTATGATGGGTCTGGGGTCCACAGGAGGGAATGCCCCTGAGAGGGCCAGGCCTTCTGTGGTCCTTCCAGAGTGGGCTCTGGAGCTCACAGCCACCACTCTGACCCTCTGCAGATGTCCTTCGGCCTTCTCCGAGTGTTCTCCATTGTGATCCCCTTTCTCTATGACGGGACGCTCATTAGCAAGAACTTTGCTGCTCTACTTGA

db: 	ACTTTGATGGCCATAACAATCACTTTATCTCTTTTTTTTAG-ACAGAGCCTCACTCCTCCTGGGAGGTTCT-TCA
HSP: 	AGCAATTCTCCTGCCTCAGCCTCCCAAGTAGCTGGGATTACAGGCCTGCGCCAGCTGGGCCAATTTTTTTGTATTTTCCCCTCTCCTCTTCTAATTATTTTTCTTGCTCTGAATTTTTCCAAGGACTTTAAAATCAAGTTGCTTATGATGGGTCTGGGGTCCACAGGAGGGAATGCCCCTGAGAGGGCCAGGCCTTCTGTGGTCCTGCCAGAGTGGGCTCTGGAGCTCACAGCCACCACTCTGACCCTCTGCAGATGTCCTTCGGCCTTCTCCGTGTGTTCTCCATTGTGATCCCCTTTCTCTATGTCGGGACGCTCATTAGCAAGAACT

total_seeds:  349
stopped before ungapped extension:  4
stopped before gapped extension:  262
HSPs:  83
BEST SCORING ALIGNMENT: 
192650
193066

query: 	-C-CA--CC--A-C-C-G--CGA-C-CGG--G--GGATCCCGCCCCGCTCCTAATTGTTCTGAG-CAGTTTCCTCATCTGCAAAATCAACGTGGTGGATGAGACCAGTGATTGTCAAA-TTTTTTTAGCCGCGGACCCTTTGTTCAACTGAAATCTTAAGTGGAGATGAAACACGTCAGATGAATAAAGACACTCGGCCCTGCTGGTGTGAGCGTGGGAGCGGGAGGGGGTTGGAGTCCCACCAGCTGGACTCCCTGCCCCCTCCGGCCCAGCACTCGCTGCTGGAGCGAGGAGGAGGACCGGCCCTGCCGGGAACCGGCCAAACCACTGAGGGCTGACCCAGGGCCTGTCTAGCCCTGACCGGTTTTCACC-
HSP: 	ACAGTGAATGTTTAATGAGCCGGGCACCCCGCCTTGGGGAGGGG

db: 	TCACAGGCGTGAGCTCTGCACCACCACGGAAGTCGGACCCCGCCCCGCTCTTAATTGTTCTGAGTCAGTTTCCTCATCTGCAAAATCAAAGTGGTGGATGAGACCAGTGATTTTCAAACTTTTTTTAGCCACGGACCCTTTGTTCAAATGAAATCTTAAGTGGAGATGTAACACGTCAGATGGATAAAGACACGGGGCTCTGCTGGAGTGAGCGTGGGAGGGGGAGGGGGCTGGAGTCCCACCAGCTGGGCTCCCTCCCCGCTCCTGCCCAGCACTCCCTGCTGGAGCGCGGAGGAGTTTGGGCCCTGCAGGGAACCTGCGAAACCACTGACGGCTGACCCAGGGCCTGTCAAGCCCTGACCGGTCTTCACCA
HSP: 	ACAGTGAATGTTTACTGAGCCGGGCACCAGGCCGTGG

total_seeds:  530
stopped before ungapped extension:  11
stopped before gapped extension:  284
HSPs:  235
BEST SCORING ALIGNMENT: 
460726
461125

query: 	
HSP: 	GGAGTTCGGAAGCTCAGAGCGCCATTTAGAAAGACAGCTTTGGCAAAGGTGTCCCAGACCGATGGTCGAGGCGGGGGAGGAAAGGCGGGAATGTCCAGGTCAGAGGGGAAAAGAAGGCTAGAGGGAGAGACGCTTGAAGGGCTGACTGGAGGGACGGGCAGGTAGGGAGGCCACAGGCCTGGGGTCTCCAGTTTGGGTGCTGAGTGGATGTGATGTCACAGGCCTGGGGTAGGATAAGGAATTGGGCCTGATGCTAAACTTGTGAGCCTGGCAAGGCCCCAGCAGGCCCCAAAACCCAGGCCTGGGCTTCGGAGACGCCGGGACCCAAATGTTGGCCCCAGGTTCAAAATTCTGAGCAGCGCTGGACTCCTCCTCCTGGAGCGGGTTCATTTCCTGGGGC

db: 	
HSP: 	GGAGTTCGGAAGCTCAGAGCTGCATTTAGAAAGACAGCTTTGGCAATGGTGTCCCAGACCGATGGTCGAGGCCGGGGAGGAGAGGCGGGAATGTCCAGGTCAGAGGGGAAAAGACGGAGCGAGGGAGAGACGCTTGAGGGGCAGACTGGATGGACGGGCAGGTAGGGAGGCCACAGGCCTGGGGTCTCCAGTTTGGGTGCTGGGTGGAAGTGATGTCACAGGCCTGGGGTAGGATAAGGAATTGGGCCTGATGCTAAACTTGTGAGCCTGGGAAGGTCCCAGCAGGCCCCAAAACCCAGGCCTGGGCTTGGGAGAAGCCAGGACCCAAATGTTGGCCCCAGGTTCAAAATTCTGAGCAGCGCTGGACTCCTCCTCCTGGAGGCAGTTCATTTCCTTGGGC
370.37000000000063
total

total_seeds:  364
stopped before ungapped extension:  1
stopped before gapped extension:  102
HSPs:  261
BEST SCORING ALIGNMENT: 
346476
346894

query: 	
HSP: 	GCGTTGCTTTCTGCACGAATCACACAAACCTTCCTGCACAAAGTTCAGGAGCCATGACGCCGTGCTTCTCTTGCCACACACCAGCTATGGGCCCTGGCCT
gap: 	-AGGCTCACTCACCCTGTGAAGAGATCCAAGTTCCCTCCCAGCACTCTAACTTCAGATTTCTACTGGTTTTCTAGCAGGGACTCAGATATTTGGTGAATTGAAATGAATACCTACTTTTAGATGCTATTCGATGTTAGCTCTTTT-ACAGATTAGCAATAACTAGTTTAACATCTCAGGGATGAATTATTTGGCCTAATATACAGGGACTTCTCAATATTCTTGTTTATGGTCCTTTAGCATTTTTAGATAATTTTTGAATTCAAAGAATAGTCTACTCCTTATAAGTTTTACTTT--AA--------AA--------
db: 	
HSP: 	GCGTTGCTTTCTGCACGAATCACACAAACCTTCCTGCACAAAGTTCAGGAGCCTTGACACAGTGCTTCTCTTGCCACACACCAGCTATGGGCCCTGGGCT
gap: 	TAGGCTCACTCACCCTGTGAAGTGGTCGAAGTTCCCTCCCAGCACTCTAACTTCAGATTTCTACTGGTTTTCTAGCAGGGACTCAGATATTTGTTGAATTGAAATGAATACCTACTTTTACATGCTATTTGATGTTAGCTCTTTTAACAGATTAGCAATAACTAGTTTAACATCTCAGGGATGAATTATTTGGCCTAATATACAGCGACTTGTCAATATTCTTGTTTATGGTCCTTTAGCATTTTGAGATAATTTTGGTATTCAAAGAATAGTCTACTCCTTATAAGTTTTACTT

total_seeds:  334
stopped before ungapped extension:  1
stopped before gapped extension:  112
HSPs:  221
BEST SCORING ALIGNMENT: 
330495
330912

query: 	-----AC-----T-T----CT---AACCTATAGAACTGTAAGACAA-TAGTAACAAAGTTTGTGGTAATGTGTTAAAGCACAGATAGAAAACTAACACGTTGGGATTTCCGAGAGTCTCATGAGAGAAGTGACTGCTCATTTTCCTTTTATTACTTTCTTGAAGGAAACAAATTTCTCCCTCATGGAAACACCTACAATTTCACTCACTTTTCATTAGGCAAATTGTGGCATCACGAGATCACTGAAGAAGGGACTCTGGGTCTGTCTTCAATG-
HSP: 	CCCTTTTTTCTGTTGAAGAAACAGAGGCTAAAGCAAAGTGACCTGCTGAAGATCACATAGCTAGGAAAAGAACTTAGAACTTCAACCTTTTTTCTTCTGAACCCCAAGTTCCACATTCTTTATACCATACCATGCTGCCTTCT

db: 	GTGAGACCCGTGTGTTGGACTTCTAACCTACAGAACTGTAAGACAACAAGTAACCAAGTTTGTGGTAATGTGTTACAGCAGTGATAGAAAACTAATACGTGGGGATTTCCGAGAATCTCATGAGAGAAGTGACTGCTCATTTTCCTTTTATTACGTTCTTCAATGTAACAAATTTCTCCCTCATGGAAACACCTACAATTTCACTCACTTTTCATTAGGCAAATTTTGGCATCACGAGATCACTGAAGAAGGGACTCTGGGTCTGTCTTCAACGC
HSP: 	CCTTTTTTCTAGTTGAAGAAACAGAGGCTAAAGCAAAGTGACCTGCTGAAGATCACATAGCTAGGAAGAGAACTTAGAACTTCAGCCTTTTTTCTTCTGAACCCCAAGTTCAACATTCTTTATACCATACCAT

total_seeds:  350
stopped before ungapped extension:  0
stopped before gapped extension:  123
HSPs:  227
BEST SCORING ALIGNMENT: 
61528
61944

query: 	
HSP: 	GGGGATGAATCCCAGAGGGGGTGAGATGAGTAAACCAACAGGGTAACAAAGGCATAGAGTAAGAGCAGTGTCAATCAGAGGTACCCATGCAGAGCTGGAAGTCTCCCAGGAGGTCC
gap: 	-TGGTAGGGGGGACCTGTAGCACCTTTCAGGCAGGAGGTGGTATTTGAGCTGTGCTATGAGGGGAGAGATTT-GTATTTGTGGAAGGACCAGGGTTCGCCGAAACATGTGGGTGGGAAAGTATAGGGATTACTGAGGGTACAGTAGCCAGTTAT-GCCAGCAGGTTTCGTGAAGGGGGGTGGTGCCTGAAGAAGCCGGAAAGCTGGTTGGGTCAGAATT-TGGAGAGAGATGGGTAGGGGGCAAGTTATTAATCCTT-GATGATTTGACTGACCTAGAGT-C-----C-C--G--T------
db: 	
HSP: 	GGGCATGAATCCCAGAGTGGGTGAGATGAGTAAACCAACAGGGTAACAAAGGCATAGAGTAAGAGCAGTGTCAATCAGAGGTACCCATGCAGAGCTGGAAGTCTCCCAGGAGGTCG
gap: 	GTGCTATGGGGGACCGGTAGCACCTTTCAGGCAGGAGGTGGTATTTGAGCTGTGCTATGAGGGGAGAGATTTAG-ATTTGTGGAAGGACCAGGGTTGGCAGAAACATGTGGGTGGGAAAGTATAGGGATTACTGAGGGTACAGTAGCCAGTT-TGGCCAGCAGGTTGCGTGAAGGGGAGTGGTGCCTGAAGAAGCTGGAAAGCTGGTTGGGTCAGAATTGTGGAGAGAGATGGGTAGGGGGCAGGTTATTAATCCTTGGATGATTTTACTGAGCTGGAGTG

total_seeds:  243
stopped before ungapped extension:  1
stopped before gapped extension:  84
HSPs:  158
BEST SCORING ALIGNMENT: 
520796
521195

query: 	
HSP: 	TTTGCCCGAGCGTGAATCCCAGGACGCCATAGAAAAACGGGCGAGTCCTTCCGGACACAGGCATCAAACGCGACGGCGCATGGCCAAGCGGCTGGACAGGGAGTAGCGCGGTGCAGGCGTCGGGAGGCAACCTTTGAACAGAGAGTGAGTGTTACGGAGCTTTTAGCGCCGATGCCCTCGGAACAGAGCCATCCGACTGGCTGTCATATCTTTTAACACCTCCGAATTACAAAGATGCACACAGAGGTGGCTTGCCTCGGGGGCCCTTTCCACAGCCTCCCCGCTTCACGTTCTCAAGCCTCTAGACCCTGGACCCGGGGCGCACGGCGGCCACGCGGGGACACAAACCAAGTGCAGAATGTCAAATGTTTGGTCCAAGAAACGGATAAGCTTCGCAAGT

db: 	
HSP: 	TTTGCCCGAGCGTGAATCCCAGGACGCCACAGAAAAACGGGCGAGTCCTTCAGGACACAGGCATCAAACGCGACGGCGCGTGGCCAAGTGGCTGGTGAGGGAGAAGCGCGGGGCAGGCGTCGGAAGGCAACCTTTGAACAGAGAGTGTTTGTTACGGAGCTTTTAGCGCCGATGCCCTCGGAACGGCGCCATCAGACTGGCTGTCACATCTTTCAACACCTCCGAATTACAAAGATGCACACAGATGTGGCTTGCCTCGGGGGCCGTTTCCACAGCCTGCCCGCGTCCCGTTCTCAGGCCTCCAGACCCCGGACCCAGGGCGCACGGCGGCCACGCGGGGACACAAACCAAGTGCAGAACGTCAAATGTTTGGTCCCAGAAACGTATAAGCTTCACAAGT
354.07999999999976
total_s

total_seeds:  399
stopped before ungapped extension:  1
stopped before gapped extension:  163
HSPs:  235
BEST SCORING ALIGNMENT: 
38003
38421

query: 	
HSP: 	AAAGTATCTAACAGGAAGAAAGAAATCAAGCAGACCGCAGGTT
gap: 	-CATCACCCCATCATACAAGGGGGGAACTTAATCTGTTCACTCCAGAGGACAGGGATGTTTAAAACAAATTCTTCCTCTTAATTCCTTTTGGGCAGTTCCCCAACCCGGTCCTTACCTGCACCGAGGGCATGCTAGCCAGCCCTGGCGCCGAGTTCAGAGGCTGG-GCTGAGGCTGGGGCATCAGGCTCCTTTGTGAGGTGGCCTCAGTGACAGGGCTGCTATGAGGTAGGGGCTACCCCACCCCAAACTGGCTGGGTCAGCCCTACCAGTCCTGCCCTCCAACTCCCTAACACCAACTGCCTGTCCTAAGCTTTGCCATGTTGCTGCCACTGCTGGGCGCCTGTGC--T-GT---G--A-T----GG----G-G
db: 	
HSP: 	AAAGTATCTAATAGGAAGAAAGAAATCAAGCAGACCGCAGGTT
gap: 	ACATCACCCCATCATACATGGGGGGAACTTAATCTCTTCACTCCAGAGGACAGGGATGTTTAAAACAAATTCTTCCTCTTAATTCCTTTTGGGCAGTTCCCCAACGCTGTCCTTACCTGCACCGAGGGCATGCTAGCCAGCCCTGGCGCCGGGTTCAGCGGCTGGAGCTGAGGCTGGGGCATCAGGCTCCTTTGTGAGGTGGCCTCAGTGACAGGGGTGCTATGAGGCAGGGGCTACCCCGCCCCAAACTGGCTGGGTCAGCCATACCAGTCCTGCCCTCCAACTCCCTAACACCACCTGCCTGACCTAAGCTTTGCCATGTTGCTGCCACTGCTGGGCGCCTGTGCCGTGGTG

total_seeds:  281
stopped before ungapped extension:  3
stopped before gapped extension:  180
HSPs:  98
BEST SCORING ALIGNMENT: 
178143
178558

query: 	------TT-----CAG--C-GC-C-GATAGAGAGAGCAGACGGAGGCCCCCGGCCCAGAGCCCGGAAGCGCTCCTGCTCCCCGACAGAGCCTGCGCCGAAACCTGACCCATGCACACGCGACCCC-ACCTGTTCTCGCAGGCTGTTCTCCCTTCCTGGAAGCCTTTCTAGCCCCTCACTAC-GAGAGCTCCTATACA-CGAGCCTTTATCCTTCATTTT-GAGCCAGACTGATCTGGTTGGAATTCCTGCTCTAATAGGGCCATTCAACCTTTCTAGGCCTCAG-
HSP: 	TTGCTCATCTGCAAGATGTGCTGCTTGCACCTCATGGGCAGGGCTGCAGGGAGAAGCCAAAGGGCCCCAGGCCGGAAAACGCCCGGGTACATGCAGCCTCGTGGGATCCCCCACGGCAGCCGCGCTGAGAAG

db: 	CCTGCTTTCTGGCCAGTTCAGCACAAACAGAGAGAGCAGCCGGAGGACCCCGGCCCAGAGCCCGGAAGCGCTCCTACCCCCCGACAGTGCCTGCGCCGAGACCTGACCCAGGCACACCCGACCCCTGCCTGTTCCCGCAGGCTGTTCTCTCTTCCTGGAAGCCTTTCCAGCCCCTCAC-GCTGAGAGCTCCTATGCTCCCAGCTTTTATCCTTCATTTTGGAGCCAGACTGACCTGGTTGGAATTCCTGCTCTACTAGGGCCATTCAACCTTTCTAGGCCTCAGT
HSP: 	TTTCTCATCTGCAAGATGGGCTGCTGGCACCTCTTTGGCAGGGCTGCAGGGAGAAGCCAATGGGCCCCTGGCCGGAAAACCCCCAGGTACATGCAGCCTCGTGGGAGCCCCCACCGCAGCCGGGC

In [16]:
print(results)

[0, 45, 0, 100]


In [9]:
# test 1: 0 10 100
# test 2: 0 4 100
# test 3: 0 45 100

# test 3 with epsilon 0 : 0 0 100
# test 3 with epsilon 10: 0 46 100

# higher epsilon is better

# test 3 with top ten: 0 45 0 100

In [10]:
def test_probabilistic_blast(test_name,pblast):
    
    exact_index_matches = 0
    contains_match = 0
    total_time = 0
    
    cur_dir = os.getcwd()
    test_dir = os.path.join(cur_dir, test_name) 
    
    files = []
    # r=root, d=directories, f = files
    for r, d, f in os.walk(test_dir):
        for file in f:
            if '.fa' in file:
                files.append(os.path.join(r, file))
    
    
    for file in files:
        
        #read first line of fasta file
        with open(file, 'r') as f:
            info = f.readline().split()
            start_index = int(info[1])
            length = int(info[2])
        
        # call to probabilistic blast
        pblast.load_query_file(file)
        
        # compare with start and end index locations
        start_time = time.process_time()
        result = pblast.probabilistic_blast()
        end_time = time.process_time()
        
        total_time += end_time - start_time
        
        if start_index == result[-1][0] and result[-1][1] - result[-1][0] == length:
            exact_index_matches += 1
        elif start_index <= result[-1][0] and result[-1][1] - result[-1][0] >= length:
            contains_match += 1
        
        # check top ten results as well
        topten = 0
        for j in range(-10,-1,-1):
            if test_sequence_list[1] <= result[j][0] and result[j][1] - result[j][0] >= length:
                topten += 1
                break
    
    iterations = len(files)
    return [exact_index_matches,contains_match,topten,iterations,total_time/iterations]

In [17]:
pblast = blast.PBlast()

pblast.set_blast_params(ungapped_stop = 25)

results = test_probabilistic_blast("third_test",pblast)

pblast.exit()

print(results)


FileExistsError: [Errno 17] File exists: 'third_test'