In [1]:
from collections import defaultdict
import sqlite3
import numpy as np

In [43]:
# INPUTS
# File names are relative paths, so they are resolved against the notebook's
# current working directory when opened.
query_file = "random_query.fa"  # random query selected as a slice from seq: seq[100:400] len=300
probability_file = "chr22.maf.ancestors.42000000.complete.boreo.conf.txt"
fasta_file = "chr22.maf.ancestors.42000000.complete.boreo.fa.txt"

# PARAMS
w = 11  # word (seed) length: used to cut the query into words and by scoreSeed
MATCH_SCORE = 1  # returned by singleBaseCompare when the two bases are equal
MISMATCH_SCORE = -1  # returned by singleBaseCompare when the two bases differ
# NOTE(review): np.array((4, 4)) builds the 1-D array [4, 4], not a 4x4 matrix.
# If a 4x4 substitution matrix was intended, np.zeros((4, 4)) is probably what
# is wanted -- confirm before M is used. M is not referenced by the visible cells.
M = np.array((4, 4))
delta = 10  # drop-off threshold: ungapped extension stops once the score is this far below the best score
alpha = 5  # NOTE(review): not referenced by the visible cells; purpose unconfirmed
beta = 10  # NOTE(review): not referenced by the visible cells; purpose unconfirmed


In [37]:
# Load the per-base confidence values (first line of the file) as a list of floats.
# str.split() with no argument splits on any run of whitespace and ignores
# leading/trailing whitespace, so a trailing space or newline does not have to
# be removed by dropping the last token (which would throw away a real value
# whenever the line does not end with a separator).
with open(probability_file, 'r') as f:
    prob = [float(token) for token in f.readline().split()]

# Load the database sequence (first line of the file) as a string.
# The line terminator is stripped so that it is not treated as a base.
with open(fasta_file, 'r') as f:
    db = f.readline().strip()

# Load the query sequence (first line of the file) as a string.
# Stripping keeps the newline out of the last query word, where it would
# otherwise raise a KeyError in get_word_encoding.
with open(query_file, 'r') as f:
    query = f.readline().strip()

# Test import
print(db[:10])
print(prob[:10])
print(query[:10])

CAACTAACCA
[0.99, 1.0, 1.0, 0.99, 0.98, 1.0, 1.0, 0.99, 0.99, 1.0]
CATCAACCAC


In [29]:
# Connect to the SQLite database file 'blast.db' and create the cursor `c`;
# get_indexes_for_word runs its queries through this notebook-level cursor.
conn = sqlite3.connect('blast.db')
c = conn.cursor()

In [30]:
def get_table_name(word_size):
    """Return the name of the preprocessing table for words of the given size."""
    return "preprocess_wordsize_{!s}".format(word_size)

In [31]:
# Base-4 digit assigned to each nucleotide.
nucleotide_num = {
    'A': 0,
    'C': 1,
    'G': 2,
    'T': 3,
}

def get_word_encoding(word):
    """Encode a nucleotide word as a 1-based integer.

    The word is read as a base-4 number (digits from nucleotide_num) whose
    last character is the least significant digit, and 1 is added to the
    result. The empty word therefore encodes to 1.

    Raises KeyError if the word contains a character that is not a key of
    nucleotide_num.
    """
    # Walk the word from its last character so that position 0 has weight 4**0.
    digits = (nucleotide_num[base] for base in reversed(word))
    return sum(digit * 4 ** place for place, digit in enumerate(digits)) + 1

In [32]:
def get_indexes_for_word(word):
    """Query the sequence_index values stored for a word.

    The table is selected from the word's length (via get_table_name) and the
    rows are filtered by the word's integer encoding (via get_word_encoding),
    which is passed as a bound parameter.

    Returns whatever c.execute(...) returns for the notebook-level cursor `c`.
    NOTE(review): every call goes through the same cursor `c`, so results
    should presumably be consumed before the next call -- confirm with callers.
    """
    sql_template = "SELECT sequence_index FROM {} where word_encoding = ?"
    statement = sql_template.format(get_table_name(len(word)))
    return c.execute(statement, (get_word_encoding(word),))

In [44]:
def singleBaseCompare(base1, base2):
    """Score one aligned pair of bases.

    Returns MATCH_SCORE when the two bases compare equal and MISMATCH_SCORE
    otherwise.
    """
    return MATCH_SCORE if base1 == base2 else MISMATCH_SCORE

In [34]:
def scoreSeed(index):
    """Score the seed that starts at position `index` of the database.

    The score is the sum, over the w bases of the seed, of the confidence of
    that base (prob[i]) multiplied by MATCH_SCORE.

    Parameters
    ----------
    index : int
        Position in `prob` of the first base of the seed.

    Returns
    -------
    number
        Sum of prob[i] * MATCH_SCORE for i in range(index, index + w).

    Raises
    ------
    IndexError
        If index + w - 1 is past the end of `prob`.
    """
    # The seed covers exactly w positions, index .. index + w - 1, and every
    # position contributes to the score (the contributions are accumulated,
    # not overwritten).
    return sum(prob[i] * MATCH_SCORE for i in range(index, index + w))

In [42]:
# Get all possible words from query and store them in the list of strings words:
# one length-w window per start position, in order. range() is empty when the
# query is shorter than w, so words is then [].
words = [query[start:start + w] for start in range(len(query) - w + 1)]

words[:3]

['CATCAACCACA', 'ATCAACCACAG', 'TCAACCACAGA']

In [46]:
def ungappedExtensionRight(query_index, db_index, seed_score):
    """Extend a seed to the right without gaps.

    Starting from the end of the seed, the query and the database are advanced
    together one base at a time; each aligned pair adds
    singleBaseCompare(query base, db base) to the running score.

    Parameters
    ----------
    query_index : int
        Index in `query` of the last base of the seed.
    db_index : int
        Index in `db` of the last base of the seed.
    seed_score : number
        Score of the seed; the running score starts from this value.

    Returns
    -------
    tuple
        (query index, db index, score) of the best-scoring end point of the
        extension. If no extension scores higher than the seed, the input
        indices and seed_score are returned.
    """
    max_score = seed_score
    # Start from the seed's own end so that "no improvement" reports the seed
    # itself instead of position 0.
    maxscoring_qi = query_index
    maxscoring_dbi = db_index
    score = seed_score

    # Keep extending while the score is less than delta below the best score
    # achieved and both sequences still have a next base; the bound checks
    # prevent an IndexError at the end of the query or the database.
    while (max_score - score < delta
           and query_index + 1 < len(query)
           and db_index + 1 < len(db)):
        query_index += 1
        db_index += 1
        score += singleBaseCompare(query[query_index], db[db_index])
        if score > max_score:
            max_score = score
            maxscoring_qi = query_index
            maxscoring_dbi = db_index

    return (maxscoring_qi, maxscoring_dbi, max_score)

def ungappedExtensionLeft(query_index, db_index, seed_score):
    """Extend a seed to the left without gaps.

    Starting from the start of the seed, the query and the database are moved
    back together one base at a time; each aligned pair adds
    singleBaseCompare(query base, db base) to the running score.

    Parameters
    ----------
    query_index : int
        Index in `query` of the first base of the seed.
    db_index : int
        Index in `db` of the first base of the seed.
    seed_score : number
        Score of the seed; the running score starts from this value.

    Returns
    -------
    tuple
        (query index, db index, score) of the best-scoring start point of the
        extension. If no extension scores higher than the seed, the input
        indices and seed_score are returned.
    """
    max_score = seed_score
    # Start from the seed's own start so that "no improvement" reports the
    # seed itself instead of position 0.
    maxscoring_qi = query_index
    maxscoring_dbi = db_index
    score = seed_score

    # Keep extending while the score is less than delta below the best score
    # achieved and both sequences still have a previous base. Without the
    # bound checks the indices would go negative, which Python interprets as
    # counting from the end of the sequence, so unrelated bases would be scored.
    while (max_score - score < delta
           and query_index > 0
           and db_index > 0):
        query_index -= 1
        db_index -= 1
        score += singleBaseCompare(query[query_index], db[db_index])
        if score > max_score:
            max_score = score
            maxscoring_qi = query_index
            maxscoring_dbi = db_index

    return (maxscoring_qi, maxscoring_dbi, max_score)

In [53]:
# Putting it all together



In [None]:
# Close the SQLite connection created by sqlite3.connect('blast.db').
conn.close()