In [1]:
# Input files: the sequence and the whitespace-separated probabilities that
# are later indexed position-by-position alongside it.
sequence_filename = "chr22.maf.ancestors.42000000.complete.boreo.fa.txt"
probabilities_filename = "chr22.maf.ancestors.42000000.complete.boreo.conf.txt"

# Only the first line of each file is read. readline() keeps the trailing
# newline (if the file has one), so `sequence` may end with "\n".
with open(sequence_filename) as sequence_file:
    sequence = sequence_file.readline()

# Parse every token on the first line into a float.
with open(probabilities_filename) as probabilities_file:
    probabilities = [float(token) for token in probabilities_file.readline().split()]

# number of probability values loaded
n = len(probabilities)

In [2]:
# Seed Python's global `random` generator with a fixed value so that the
# randomly generated test sequences are the same on every fresh run.
import random

random.seed(1)

In [3]:
# generate test sequence

In [4]:
def get_test_sequence(length,prob_insertion,prob_deletion,
                      source_sequence=None,source_probabilities=None):
    """Sample a noisy query covering `length` positions of the reference.

    A random start position is drawn, then each of the `length` reference
    positions contributes zero or one character to the query:

    * with probability `prob_insertion` a uniformly random nucleotide is
      emitted instead of the reference base;
    * with probability `prob_deletion` nothing is emitted;
    * otherwise the reference base is emitted with the probability stored
      for that position, and a different nucleotide is emitted otherwise.

    Args:
        length: number of reference positions to walk over.
        prob_insertion: per-position probability of the "insertion" branch.
        prob_deletion: per-position probability of the deletion branch.
        source_sequence: reference string to sample from; defaults to the
            notebook-level `sequence`.
        source_probabilities: per-position probabilities for
            `source_sequence`; defaults to the notebook-level `probabilities`.

    Returns:
        A two-element list `[query_string, start_index]`. Because of
        deletions the query string can be shorter than `length`.

    Raises:
        ValueError: if `length` leaves no valid start position.
    """
    nucleotides = ['A', 'G', 'C', 'T']

    # Fall back to the data loaded at the top of the notebook.
    if source_sequence is None:
        source_sequence = sequence
    if source_probabilities is None:
        source_probabilities = probabilities

    # The start range is derived from the sequence only; the probabilities
    # are expected to cover the same positions.
    start_upper_bound = len(source_sequence) - length - 1
    if start_upper_bound <= 0:
        raise ValueError(
            "length %d is too large for a source sequence of %d characters"
            % (length, len(source_sequence))
        )

    # get random starting position
    start_index = random.randrange(start_upper_bound)

    query = []

    for i in range(start_index, start_index + length):

        indel_draw = random.random()

        if indel_draw < prob_insertion:
            # NOTE(review): this emits a random base *instead of* the
            # reference base, so it acts like a random substitution rather
            # than adding an extra base -- confirm this is intended.
            nucleotide = nucleotides[random.randrange(4)]

        elif indel_draw < prob_insertion + prob_deletion:
            # deletion: this reference position contributes nothing
            nucleotide = ""

        else:
            # no indel: keep the reference base with its stored probability
            reference_base = source_sequence[i]

            if source_probabilities[i] > random.random():
                nucleotide = reference_base
            else:
                # Substitute with any nucleotide other than the reference
                # base. Filtering (rather than list.remove) avoids a
                # ValueError when the reference holds a character outside
                # A/G/C/T, e.g. 'N', '-' or a lowercase base.
                alternatives = [
                    base for base in nucleotides
                    if base != reference_base.upper()
                ]
                nucleotide = alternatives[random.randrange(len(alternatives))]

        # add to our list representing the sequence
        query.append(nucleotide)

    return ["".join(query),start_index]

In [5]:
# test_name = "test_sequence"
# new_filename = test_name + ".fa"

# length = 300
# prob_insertion = 0.01
# prob_deletion = 0.01

# test_sequence_list = get_test_sequence(length,prob_insertion,prob_deletion)

# with open(new_filename, 'w') as f:
#     f.write(">" + test_name + " " + str(test_sequence_list[1]) + " " + str(length) + "\n")
#     f.write(test_sequence_list[0])

In [6]:
import blast
import os

In [7]:
def test_probabilistic_blast(test_name,iterations,length=300,
                             prob_insertion=0.01,prob_deletion=0.01):
    """Benchmark PBlast on randomly generated queries.

    For each iteration a query is sampled with `get_test_sequence`, written
    to `<cwd>/<test_name>/<test_name><i>.fa`, loaded into a single shared
    `blast.PBlast` instance and searched. The reported hit is then compared
    with the position the query was sampled from.

    Args:
        test_name: name of the output directory and prefix of each query file.
        iterations: number of queries to generate and search.
        length: number of reference positions each query spans.
        prob_insertion: per-position insertion probability for the generator.
        prob_deletion: per-position deletion probability for the generator.

    Returns:
        `[exact_index_matches, contains_match, iterations]`, where
        `exact_index_matches` counts hits that start at the true start and
        span exactly `length`, and `contains_match` counts the remaining hits
        whose interval fully contains the true interval.
    """
    pblast = blast.PBlast()

    exact_index_matches = 0
    contains_match = 0

    # makedirs(exist_ok=True) keeps the function re-runnable; plain mkdir
    # raised FileExistsError on the second run with the same test_name.
    output_dir = os.path.join(os.getcwd(), test_name)
    os.makedirs(output_dir, exist_ok=True)

    for i in range(iterations):

        new_filename = test_name +  str(i) + ".fa"
        new_path_to_file = os.path.join(output_dir, new_filename)

        query, true_start = get_test_sequence(length,prob_insertion,prob_deletion)

        # FASTA-style header records the true start and the requested length
        # (the query itself may be shorter because of deletions).
        with open(new_path_to_file, 'w') as f:
            f.write(">" + test_name + " " + str(true_start) + " " + str(length) + "\n")
            f.write(query)

        # call to probabilistic blast
        pblast.load_query_file(new_path_to_file)

        # NOTE(review): result[0] / result[1] are treated as the start / end
        # of the best hit in reference coordinates -- confirm against
        # blast.PBlast.probabilistic_blast.
        result = pblast.probabilistic_blast()
        found_start, found_end = result[0], result[1]

        if true_start == found_start and found_end - found_start == length:
            exact_index_matches += 1
        elif found_start <= true_start and found_end >= true_start + length:
            # The hit must begin at or before the true start and end at or
            # after the true end. The previous test (true_start < found_start)
            # counted hits that begin *after* the true start, which do not
            # contain the sampled region.
            contains_match += 1

    return [exact_index_matches,contains_match,iterations]
        

In [None]:
# Run 100 benchmark iterations; the generated query files are written under
# ./third_test/ and `results` is [exact_index_matches, contains_match, iterations].
results = test_probabilistic_blast("third_test",100)

total_seeds:  162
stopped before ungapped extension:  0
stopped before gapped extension:  75
HSPs:  87
BEST SCORING ALIGNMENT: 
140891
141206
query: 	HSP: 	GCTACCTATATTTTTACCTTTTAGGGAAGAGAGCGTAGAAAATGGAGATTGGGTCTTTGgap: 	-CCAACAGCAAATAGAGGAAATTGATTTTTCATATATTTCTGGTGTTTCAATGAATGAGTCTTCTGGCAAGTAGGAGACTTGGATTTTGTTTACCTGTTACCTGTGTATCACCCCTA-TTTG-AAGAAAATTTCATTGTAATTGCCTGCAAATCTGAAAGGTAAGTGATATACCTTTAC-TAAGATAACCCTCCACCAAGATAACCGTCCAACAATGC-CTGGTTGTTTGTCA-------T--TT---G--T-TTT
db: 	HSP: 	GCTAGCAATATTTTTACCTTTTAGGGAAGAGAAATGAGAAAATGGAGATTGGGTCTTTGgap: 	ACCAACAGCAAATAGAGGAAATTGATTTTCCATATATTTCTGGTGTTTCAATGAATGTGTCTCCTGGCAAGTAGGAGATTTTGATTTTTTTTACCTGTTATCTGTTTATCACCCCTATTTTGAAAGAAAATTTCATTATTTTTGTCTACAAATCTGAAAGGTAACTAATATACCTTTACTTAAGATAACCGTCCAAAAAGATAACCATCCAACAATGCTCTGGTTGTTTGTCATTTGTTTTCCTTAAAGTTTGTTT
245.99999999999986
[140891, 141206, 'query: \tHSP: \tGCTACCTATATTTTTACCTTTTAGGGAAGAGAGCGTAGAAAATGGAGATTGGGTCTTTGgap: \t-CCAACAGCAAATAGAGGAAATTGATTTTTCATATATTTCTGGTGTTTCAATGAATGAGTCTTCTGGCAA