In [1]:
# load the sequence and probabilities into memory
sequence_filename = "chr22.maf.ancestors.42000000.complete.boreo.fa.txt"
probabilities_filename = "chr22.maf.ancestors.42000000.complete.boreo.conf.txt"

with open(sequence_filename) as f:
    sequence = f.readline()

with open(probabilities_filename) as f:
    probabilities = f.readline().split()

# make all probabilities float
n = len(probabilities)
for i in range(n):
    probabilities[i] = float(probabilities[i])

In [2]:
# set random seed
import random
import math
random.seed(1)

In [3]:
# generate test sequence

In [4]:
def get_test_sequence(length,prob_insertion,prob_deletion, prob_point):
    nucleotides = ['A', 'G', 'C', 'T']
    
    # get random starting position
    
    len_sequence = len(sequence)
    # len of probabilities should be the same
    
    start_index = random.randrange(len_sequence-length-1)
    
    query = list()
    
    for i in range(start_index, start_index+length):
        insertion = list()
        extra_nucleotide = ""
        
        if (probabilities[i] > random.random()):
                nucleotide = sequence[i]
        else:
            temp_nucleotides = ['A', 'G', 'C', 'T']
            temp_nucleotides.remove(sequence[i])
            nucleotide = temp_nucleotides[random.randrange(3)]
        
        insertion_deletion_prob = random.random()
        
        if insertion_deletion_prob < prob_insertion:
            # if we are within probability of getting an insertion 
            # length follows a geometric distribution
            insertion_length = int(math.log(random.random()/prob_insertion)/math.log(1-prob_insertion) + 1)
            for i in range(insertion_length):
                extra_nucleotide = nucleotides[random.randrange(4)] 
                insertion.append(extra_nucleotide)

        elif insertion_deletion_prob < prob_insertion + prob_deletion:
            # if we are within probability of getting a deletion
            nucleotide = ""
           
        elif insertion_deletion_prob < prob_insertion + prob_deletion + prob_point:
            nucleotide = nucleotides[random.randrange(4)]
  
        query.append(nucleotide + "".join(insertion))
    
    return ["".join(query),start_index]

In [5]:
get_test_sequence(100, 0.0005, 0.0005, 0.005)

['GCTAGCTAGAGTTTTACCTTTTAGGGAAGAGAACTTAGAAAATGGAGATTGGGTCTTTGACCAACAGCAAATAGAGGAAATTGATTTTCCATATATTTCT',
 140891]

In [6]:
# test_name = "test_sequence"
# new_filename = test_name + ".fa"

# length = 300
# prob_insertion = 0.01
# prob_deletion = 0.01

# test_sequence_list = get_test_sequence(length,prob_insertion,prob_deletion)

# with open(new_filename, 'w') as f:
#     f.write(">" + test_name + " " + str(test_sequence_list[1]) + " " + str(length) + "\n")
#     f.write(test_sequence_list[0])

In [7]:
import blast
import os
import time

In [8]:
def gen_test_probabilistic_blast(test_name: str,iterations: int):
    
    cur_dir = os.getcwd()
    os.mkdir(test_name)
    
    for i in range(iterations):
        
        new_filename = test_name +  str(i) + ".fa"
        new_path_to_file = os.path.join(cur_dir, test_name, new_filename)

        length = 400
        prob_insertion = 0.0005
        prob_deletion = 0.0005
        prob_point = 0.005

        test_sequence_list = get_test_sequence(length,prob_insertion,prob_deletion,prob_point)

        with open(new_path_to_file, 'w') as f:
            f.write(">" + test_name + " " + str(test_sequence_list[1]) + " " + str(length) + "\n")
            f.write(test_sequence_list[0])
        

In [28]:
gen_test_probabilistic_blast("testing_suite_2",100)
#print(results)

In [14]:
# test 1: 0 10 100
# test 2: 0 45 100
# test 3: 0 45 100

# test 3 with epsilon 0 : 0 0 100
# test 3 with epsilon 10: 0 46 100
# test 3 with epsilon 15: [0, 44, 0, 100]

# higher epsilon is better

# test 3 with top ten: 0 45 0 100
# test 4: 0 45 0 100
# test 5: 0 45 0 100

# test 3

# alpha 0.8 [0, 45, 0, 100, 44.18176678]
# alpha 0.9 [0, 46, 0, 100, 45.725060700000014]
# alpha 0.95 [0, 45, 0, 100, 43.92756490000005]

# beta 25 [0, 45, 0, 100, 108.04912073999995]

# delta 1 [0, 46, 0, 100, 51.59618474000002]
# delta 10 [0, 45, 0, 100, 43.11371054000002]

# final tests word size 5

# base params  [1, 11, 83, 1, 0.0, 0.0, 0.0, 100, 30.060828680000004]
# delta 20 [1, 11, 83, 1, 0.0, 0.0, 0.0, 100, 29.526636300000035]


In [15]:
def test_probabilistic_blast(test_name: str,pblast: blast.PBlast):
    
    exact_index_matches = 0
    contains_match = 0
    contains_start = 0
    contains_end = 0
    total_time = 0
    topten = 0
    topten_contains_start = 0
    topten_contains_end = 0
    
    
    cur_dir = os.getcwd()
    test_dir = os.path.join(cur_dir, test_name) 
    
    files = []
    # r=root, d=directories, f = files
    for r, d, f in os.walk(test_dir):
        for file in f:
            if '.fa' in file:
                files.append(os.path.join(r, file))
    
    
    for file in files:
        
        #read first line of fasta file
        with open(file, 'r') as f:
            info = f.readline().split()
            start_index = int(info[1])
            length = int(info[2])
        
        # call to probabilistic blast
        pblast.load_query_file(file)
        
        # compare with start and end index locations
        start_time = time.process_time()
        result = pblast.probabilistic_blast()
        end_time = time.process_time()
        
        total_time += end_time - start_time
        
        if start_index == result[-1][0] and result[-1][1] - result[-1][0] == length:
            exact_index_matches += 1
        elif start_index <= result[-1][0] and result[-1][1] - result[-1][0] >= length:
            contains_match += 1
        elif start_index < result[-1][1] and result[-1][1] < start_index + length:
            # contains part of the start of the query
            contains_start += 1
        elif result[-1][0] < start_index + length and start_index + length < result[-1][1]:
            # contains part of the end of the query
            contains_end += 1
        
        # check top ten results as well
        
        for j in range(-10,-1,-1):
            if start_index[1] <= result[j][0] and result[j][1] - result[j][0] >= length:
                topten += 1
            elif start_index < result[-1][1] and result[-1][1] < start_index + length:
                # contains part of the start of the query
                contains_start += 1
            elif result[-1][0] < start_index + length and start_index + length < result[-1][1]:
                # contains part of the end of the query
                contains_end += 1
    
    iterations = len(files)
    return [exact_index_matches,contains_match,contains_start,contains_end,topten/iterations,topten_contains_start/iterations,topten_contains_end/iterations,iterations,total_time/iterations]



In [23]:
tpblast = blast.PBlast()

# tpblast.set_blast_params(max_decrease_stop = 20, ungapped_stop = 50)

results = test_probabilistic_blast("test_testing_suite",tpblast)

tpblast.exit()

print(results)



total_seeds:  349
stopped before ungapped extension:  0
stopped before gapped extension:  138
HSPs:  211
BEST SCORING ALIGNMENT: 
132314
132713

query: 	
HSP: 	TAATAGTAAAAGCAAATCTGCCAAAAAAAACACTATGAATAACTTGGACTAGCTAATACTAAGTGCCAGGCACCGTGCTATAGATTCCATATGAATTATCTCTAACCACCCCCCTGAGTTAGCTATAATTGTTAGCCACATTTTAGAGTGGAGAAACTGCGTCTCAGATAGGTAAAAACAGGGTGACTCTTAATCCCTATGCCATTCTTTTTTCCTGAACAAATTAAAAGACAAATAACAAAGTAGGGGAAATATTTGCAAAAATATAAGAAAGAGCAAATATCCATAATATATGAAACTTTTTCAAATCAGTAAATAGATACAAATAGCCCAATAGAAAAATGGACAAAGGTCAAGAGCAGGCAGTTTACAAAGGAAGAAATACAAATGGACAATAAAA

db: 	
HSP: 	TAAGAGTCAAAGCAAAACTGTCAAAAAAAACACTATGAATAACATGTACTAGCTAATACTAAGTGCCAGGCACCGTGCTATAGATTCTACGTGAATTATCTCTCCACAGCCCCGTGAGTTAGCTATAATTGTTAGCCACATTTTATAGATGAGAAACTGCGGCTCAGAGAGGTAAAAACATGGTGACTCTTAATCACTATGCCATTCTTTTTTCCTGAACAAATTAAAAGAGAAATAACAAAGTAGGGGAAATATTTGCAAACATATAAGAAAGAGCAAATATCCATAATATATGAAAGTTTTTCAAATCAGTAAAAAGAGACAAATAGCCCAATAGAAAAATGGACAAAGGTCAAGAGCAGGCAGTTTACAAAGGAAGAAATACAAATGGCCAATAAAA
364.7199999999998
[0, 0, 

In [None]:
pblast = blast.PBlast()

pblast.set_blast_params()

results = test_probabilistic_blast("testing_suite_2",pblast)

pblast.exit()

print(results)


total_seeds:  433
stopped before ungapped extension:  4
stopped before gapped extension:  240
HSPs:  189
BEST SCORING ALIGNMENT: 
465876
466295

query: 	
HSP: 	GTCAGCCACGGCTGGATTCTCACCCTCGGCTTCTCTCCACCCCCCAGATAACAGCCCTCAACCAGTCTTCCCGGTGTGCCAGGCCTTTGCGCATGGCTGTGCTCTGCCTGGAATGGCGTTACTGCCCTTTTGAAATCGTAACCCTCCTTGAAAGACAGTTCAGAACTCCCCCACCTGTAGATCCCTCCCCAGCTTCCGGAGGCGGCACCTCGGGCCCCCAGCTGCTCCTGGGTTCCCCTCCCTGAATTGTCAGAATCCCCTTTCCCTCTTTGTACCTTGTGCCCCTGAGGGCAGGGGCTGTGTCTAGCTGCCTTTGCAGTGGCCCAGAGCCCACGGCACGCATATTGTGTGCCTGATAGTGAATGGCGCTGACGGGGCAACAACCCC
gap: 	-AGCCCA-A-------A---GG-G-T------
db: 	
HSP: 	GTCAGCCCCAGCTGGCTTCTCACCCTCTGCTTCTCTCCACCCCCCAGATAACAGCCCTCAACCAGTTTCCCCAGTGTGCCAGGCCTTTGCGCATGCCTGTTCTCTGCCTGGAATGCCTTTACTGCCTTTTTGAAATCCTAACCCTCCTTGAAGGACAGTTCAGAACTCCCCCACCTGTAGATCCCTCCCCAGCTTCCGGAGGCGGCACCTCGGGCCCCCAGCTGCGCCTGGGTCCCCCTCCCTGAATTGTCACAAGCCCCTTTCCCTCTTTGTACCTTGTGCCCCTGAGGGCAGGGGCTGTGTCTTGCTGCCTTTGCAGTGGCCCAGAGCCCACGGCACGCAGAAAGGGTGCCTGATAGTGAACGACGCTGGCGGGGCAAAAGACCC
gap: 	CAGCCCA

total_seeds:  375
stopped before ungapped extension:  3
stopped before gapped extension:  129
HSPs:  243
BEST SCORING ALIGNMENT: 
5902
6301

query: 	
HSP: 	AAAAAGCATAATAATAGTGTCTGCCGGCTAACATTTCCAAGATTAAAAGAGATGACAACACGAAATATGCCTAGCACAGTGCCTGGCACATAATACTTGCCCAATAAAGACTAACTCATTCTTCTTTTAGGCTTTAAAATTTGAAAGTCTTATGGCTGCTCTCATAGTATCTTGGTATTAAAACTACCCATTTAGGTGTCTTTCTCCCCCGTTAGACTACAATTGCCTCAAAGACAAGGACAGTTTCTTATGCATCTTTGTAACTCCCGCACCTAGCAAAGTACCTGGACCAGAGATGCTACATGCACATCTGTTGAGATGAGCTATTCTGTTTTGTTAGCTCTGTTGGTTGACACACAGTGCTATTGAGTCCAAAGTTCTTAACCCCAATCTTGGCTGC

db: 	
HSP: 	AAAAAGCATAGTAATAGTGTCTGCCTCATAACATTTCCAAGATTAAAAGAGATGACAACATGAAATATGCCTAGCACAGTGCCTGGCACATAATACATGCCCAATAAAGACTAACTCATTCTTCTTTTAGGCTTTATAATTTGAAAGTTTTATAGCTGCTCTCATAGTATCTTGGTATTGAAACTACCCATTTATGTGTCTTTCTCCCCCATTAGACTACAATTGCCTCAAAGACAAGGACAGTTTCTTATGCATCTTTGTAACTCCAGCACCTAGCAAAGTACCTGGACCAGAGATGCTACATGAACATCTGTTGAGATGAGCTATTCTGTTTAGTTAGCTCTGTTGGTTGACACACAGTGCTATTGAGGCCAAAGTTCTTAACCCCAATCTTGGCTGC
378.12
total_seeds:  397
stop

total_seeds:  328
stopped before ungapped extension:  3
stopped before gapped extension:  125
HSPs:  200
BEST SCORING ALIGNMENT: 
121107
121526

query: 	---T-----------------
HSP: 	AATTGTTACATCATTTTCACATACGGGTTGCTATATGTTGAAGTAGTTGAGGTGTAATAGCCATGTAATCCAACGTAATCCATCTGAATAGGCAAACTCTTGCTTATCGTTGTCCCTGTTGAACATTTGCGTCATTTGCAATTTTTCATTAACATAAGAACTGTTGCCCTGAACACTTATGAACAAGTCACTTTGGAGTTAGTTCGTTGGTGTGCGTTTTCCAAAGAGGATTCTTCAAGTAGAGCTGGGTTACTGCTCTTACACACTGCCAGCTGTAGCCCTTTCATTTTTAAAACTTTTCCAATGCTACAAATTACTGTGGTGATTGGGAATTGCTACTTGGAAAATATTTTTAATTTTTTAAAAAGGTAGTACATTCATATGGAACACAATAAAATA

db: 	AACTTTATTGATGTGCACACA
HSP: 	AATTGTTACAACATTTTCACATACGGGTTGCTATATGTTGAAATAGTTGAGGTGTAATAGCCATGTAATCCAACGTAATCCATCTGAATAGGCAAACGCTTGCATATCCATGTCCCTGTTGAACATTTGCATCATTTGCAGTTTTTCATTAACATAAGTACTGTTGCTCTGAACACTTATGAACAAGTCACTTTGGAGTTAGTTCATTGGTGTGTGTTTCCCAAAGAGGAGTCTTCAAGTAGAGCTGGGTTACAGCTCTTACACACTGCCAGCTGTAGCCCTTTCATTTTTAAAACTTTTCCAGTGCTACAAATTACTGTGGTGATTGGGAATTACAACTTGGAAAATATTTTTATTTTTTCAAAAAGGTAGTACATTCATATGA

total_seeds:  169
stopped before ungapped extension:  2
stopped before gapped extension:  110
HSPs:  57
BEST SCORING ALIGNMENT: 
238343
238762

query: 	G----------------G----
HSP: 	CCTCCCCCCACTCCCAGGAACAAGGTCCCGAGCTTTTGGGGAGTCGGAAGAGGGATGCGGATGTGGAGTGTGGATCGTTCTGGAAAGATCCTGAACTGTCGGGGGCCTCCTCTCGCCCTCGGAACTTCCCGAAGCCCCCTTTCCTCTTCGCCCCAACCAGGAAGATGACTGCTCTTCAGAAGGACAGACCGAAAAGATTGAGGTCCAGACGCACCTTTCTTCTAAGACATTTTGGGAAGCAGACAGGGGCCTAGAGCTACAGACTGGCCTGGACAGTGTTGTGATATCCCACCGCGTCGGAAACTGCAAAATGTGCTGGACATGGCGGGACGTGAACAGATAGCCTAGGAAGAATGATGCTCTTTAGGCCTCCCGAGGCCATGGGGTTTGCACCCGTC

db: 	GAGGGGGGGCCCTGGGCGCGTA
HSP: 	CCTCCCCCGACTCCCGGGAACAAGGTCCCGGGCTTTCGGGGAGGCGGAAGAGGCATGAGGAGGTGGAGTGTGGATCATTCTGGAAAGATCCCGAACTGTCGGGGGCCTCCCCTCGCCCTCGGAACTTCCCAAAGCCCCCTTTCTTCTTCGCCCCAACCAGGGAGATGACTGCTCTTTAAGGAGACAGATTGAAAAGATTGAGGACCAGGTGAAGCTTTATTCACCAACACTTTGGGAAGCTGATGTGGTCCAGGAGTTTGAGACATGTCTGGGCAGCATGGCGATATCCCACCTCTACAAAAAGTAGAAAATTAGCGGGGCATGGTGGGAAGTGAACAGATGGCCTAGGAAGAATGAAGCGCTCTCGGCCTCCCATGGCCTTGGG

total_seeds:  398
stopped before ungapped extension:  4
stopped before gapped extension:  204
HSPs:  190
BEST SCORING ALIGNMENT: 
458517
458916

query: 	
HSP: 	TCCCTGCACACTCAGCAAAGCGCCCCCTGCTCCCACCCTGGTCGACAGCCCGGTCTGGGGGACTCAGAAGGGGGCCAGGAACTCGGTGACTAGCTGGGTGCTCCCCATATCAGGGTCTTGCGCAAACAAGCTCTGCAAGGAAAGGCTGCACCAGTTCCACCTCACTGCTCCCCTTGGAGCCTTCCAGATATCCGCAGGCCTGGGGAGACTCCTTTCTAGGGCCCCAGCAGCTCCTGCAGCCCTGTCCCCATCTGTTCCGGGGTATGTTTCCCCAGGGTGGCCGTCAGAGGGCGAGGCCACCGCTGAACCACCCCTGTGGGTACAAGACCCCTGTGCAGGAGGCTTGTACTGGATGAATGAAGGAGTGAGTAAGTGAAGGAATGATGGAACAAACCAATGC

db: 	
HSP: 	TCCCTGCACACTCAGCAAAGCGCCCCCTGCTCCCACCCTGGTCGACAGCCCGGTCTGGGGGACTCAGGAGTGGGCCAGGAACTCGGTGACTAGCTGGGTGCCCCCCATATCAGGGTCTGGTGCAAACAAGGTCTGTAAGGAAAGGCTGCACCAGCTCCGCCTCACTGGTCCCCTTGGAGCCTTCCAGATATCCCCAGGCCTGGGGAGACTCCTTTCTAGGGTCCCAGCAGCTCATGCAGCCCTGTCCCCATCTGTTCCTGGGTCTGTTTCCCCAGGGAGGCTGTCAGAGGGCGAGGCCACCTCTGAACCACCCCTGTGGGTACAAGACCCCAGTGCAGGAGGCATGTTCTGGATGAATGAATGAGTGAGTAAGTGAATGAATGATGGAACAAACCAATGC
364.2199999999999
total_s