In [1]:
import sqlite3

In [2]:
# open the SQLite database file 'blast.db' and get a cursor for executing statements
conn = sqlite3.connect('blast.db')
c = conn.cursor()

In [3]:
def get_table_name(word_size):
    """Return the name of the preprocessing table for words of length `word_size`.

    The name is the fixed prefix "preprocess_wordsize_" followed by the
    string form of `word_size`.
    """
    prefix = "preprocess_wordsize_"
    suffix = str(word_size)
    return prefix + suffix

In [4]:
# base-4 digit assigned to each nucleotide
nucleotide_num = {'A':0,'C':1,'G':2,'T':3}

def get_word_encoding(word):
    """Return the integer encoding of a nucleotide word.

    The word is read as a base-4 number whose digits are looked up in
    `nucleotide_num`, the last character being the least significant digit.
    One is added to the result, so the smallest possible encoding is 1
    (this is also the value returned for an empty word).

    Raises:
        KeyError: if `word` contains a character that is not a key of
            `nucleotide_num`.
    """
    # walk the word from its last character so that position 0 is the 4**0 digit
    digits = (nucleotide_num[base] for base in reversed(word))
    total = sum(digit * 4 ** place for place, digit in enumerate(digits))
    return total + 1

In [5]:
# sanity check of the encoding: 'AGCTT' should give 160
get_word_encoding('AGCTT')

160

In [6]:
def all_combinations(word_size):
    """Return every nucleotide word of length `word_size`.

    Words are built from the letters A, C, G and T and are returned in
    lexicographic order ('AA', 'AC', 'AG', 'AT', 'CA', ...).

    Args:
        word_size: length of the words to generate (non-negative integer).

    Returns:
        A list of 4 ** word_size strings; [''] when `word_size` is 0.

    Raises:
        ValueError: if `word_size` is negative.
    """
    # local import: the notebook does not import itertools at the top level
    from itertools import product

    nucleotides = ('A','C','G','T')

    # product() yields the letter tuples in the same order as nested loops
    # over `nucleotides`, i.e. with the last position varying fastest
    return [''.join(letters) for letters in product(nucleotides, repeat=word_size)]

In [7]:
def create_word_table(word_size):
    """Create the preprocessing table for words of length `word_size`.

    The table is named by `get_table_name(word_size)` and has the columns
    `id` (integer primary key), `sequence_index` (unique, not null) and
    `word_encoding` (not null). If the table already exists the call leaves
    it untouched, so the cell can safely be re-run.

    Args:
        word_size: word length the table is meant for.
    """
    # new table name
    new_table_name = get_table_name(word_size)

    # Create new table for this word size.
    # A table name cannot be bound as a SQL parameter, so it is formatted in.
    create_table_s = (
        "CREATE TABLE IF NOT EXISTS {} ("
        "id INTEGER PRIMARY KEY,"
        "sequence_index int NOT NULL UNIQUE,"
        "word_encoding int NOT NULL)"
    ).format(new_table_name)
    c.execute(create_table_s)

    # Save (commit) the changes
    conn.commit()

In [9]:
# create the table for words of length 11
create_word_table(11)

In [138]:
# To drop the table for this word size (for example before recreating it),
# uncomment the two lines below.
word_size = 11
#new_table_name = get_table_name(word_size)
#c.execute("DROP TABLE {}".format(new_table_name))

<sqlite3.Cursor at 0x10b4e70a0>

In [113]:
# execute() returns the cursor itself, which can then be iterated over the result rows
c.execute("SELECT * FROM preprocess_wordsize_5")

<sqlite3.Cursor at 0x10b4e70a0>

In [114]:
# iterate over the cursor to print every row of the table
# (wrapping the cursor in list() collects the rows into a list instead)
for row in c.execute("SELECT * FROM preprocess_wordsize_5"):
    print(row)

In [115]:
# collect every row of the table into a list
l = c.execute("SELECT * FROM preprocess_wordsize_5").fetchall()

In [116]:
# NOTE(review): this table name does not follow the "preprocess_wordsize_" pattern
# produced by get_table_name() and is not created in this notebook - confirm that
# the table exists in blast.db
c.execute("SELECT * FROM indexed_words_wordsize_5")

<sqlite3.Cursor at 0x10b4e70a0>

In [10]:
def get_indexes_for_word(word):
    """Return a cursor over the sequence indexes stored for `word`.

    The table is chosen from the length of `word` and the rows are matched
    on the integer encoding of `word`.

    Args:
        word: nucleotide word to look up.

    Returns:
        A sqlite3 cursor yielding one-element tuples `(sequence_index,)`.

    Raises:
        KeyError: if `word` contains a character `get_word_encoding` cannot encode.
    """
    new_table_name = get_table_name(len(word))

    encoding = get_word_encoding(word)

    s = "SELECT sequence_index FROM {} where word_encoding = ?".format(new_table_name)

    # conn.execute() runs the query on a new cursor. Returning the shared
    # cursor `c` would let any later c.execute() call (for example an insert)
    # discard the rows the caller has not yet read.
    return conn.execute(s,(encoding,))


In [11]:
# the function returns a cursor, i.e. an iterator over all entries stored for the given word
get_indexes_for_word('AAAAA')

<sqlite3.Cursor at 0x10fc62f80>

In [12]:
def insert_index_for_word(word,index):
    """Store one occurrence of `word` at position `index` and commit it.

    The row goes into the table selected by the length of `word`; the word
    itself is stored as its integer encoding.

    Args:
        word: nucleotide word found in the sequence.
        index: position in the sequence where the word starts.
    """
    table = get_table_name(len(word))
    row = (index, get_word_encoding(word))

    # the table name is formatted in; the row values are bound as parameters
    statement = "INSERT INTO {} (sequence_index,word_encoding) VALUES (?,?)".format(table)
    c.execute(statement, row)

    # make the insert permanent right away
    conn.commit()

In [13]:
# iterate over the returned cursor to print every sequence index stored for the word
cursor = get_indexes_for_word('AAAAA')
for row in cursor:
    print(row)

(588,)
(1349,)
(1350,)
(1351,)
(1806,)
(1807,)
(1919,)
(2609,)
(2873,)
(3318,)
(3704,)
(3836,)
(5447,)
(5899,)
(5900,)
(5901,)
(5902,)
(6487,)
(6654,)
(6655,)
(6908,)
(6952,)
(6953,)
(6954,)
(6955,)
(7479,)
(7570,)
(7571,)
(7572,)
(7573,)
(7574,)
(7575,)
(7725,)
(8346,)
(8418,)
(8419,)
(8420,)
(8489,)
(8728,)
(9165,)
(9436,)
(9464,)
(9465,)
(9750,)
(9890,)
(9891,)
(10335,)
(12541,)
(12938,)
(13663,)
(13664,)
(13665,)
(13666,)
(13799,)
(13816,)
(14032,)
(14033,)
(14034,)
(14035,)
(14036,)
(14037,)
(14038,)
(14051,)
(14052,)
(14053,)
(14054,)
(14055,)
(14056,)
(14057,)
(14058,)
(14059,)
(14060,)
(14061,)
(14062,)
(14063,)
(14379,)
(14380,)
(15675,)
(15693,)
(16053,)
(16054,)
(16055,)
(16767,)
(16768,)
(16769,)
(17281,)
(17282,)
(17448,)
(18002,)
(18803,)
(19102,)
(20514,)
(20515,)
(21371,)
(21372,)
(21373,)
(21374,)
(21382,)
(21383,)
(21384,)
(21385,)
(22994,)
(22995,)
(23462,)
(23806,)
(23807,)
(24581,)
(24582,)
(24583,)
(25162,)
(25173,)
(26807,)
(27443,)
(27444,)
(27866,)
(28888,)
(30

(370017,)
(370095,)
(370802,)
(370900,)
(370901,)
(370909,)
(371048,)
(371049,)
(371050,)
(371051,)
(371052,)
(371053,)
(371054,)
(371055,)
(371056,)
(371057,)
(371058,)
(371059,)
(371060,)
(371061,)
(371403,)
(371416,)
(371417,)
(371463,)
(371547,)
(371548,)
(371662,)
(371896,)
(371908,)
(371959,)
(371960,)
(371961,)
(371962,)
(371968,)
(371974,)
(371975,)
(371976,)
(371982,)
(371983,)
(372017,)
(372018,)
(372051,)
(372052,)
(372135,)
(372458,)
(372459,)
(372489,)
(372747,)
(373000,)
(373001,)
(373042,)
(373419,)
(373628,)
(373656,)
(373678,)
(373695,)
(373762,)
(373763,)
(373764,)
(373765,)
(373766,)
(373767,)
(373768,)
(373769,)
(373770,)
(373771,)
(373772,)
(373773,)
(373774,)
(373775,)
(373776,)
(373777,)
(373778,)
(373779,)
(373785,)
(373983,)
(374203,)
(374225,)
(374445,)
(374745,)
(374897,)
(374898,)
(374912,)
(374913,)
(374914,)
(374915,)
(374916,)
(374917,)
(374923,)
(374924,)
(374925,)
(374926,)
(375680,)
(375840,)
(375949,)
(376637,)
(377305,)
(377591,)
(377631,)
(378119,)


In [14]:
# load the sequence and probabilities into memory
sequence_filename = "chr22.maf.ancestors.42000000.complete.boreo.fa.txt"
probabilities_filename = "chr22.maf.ancestors.42000000.complete.boreo.conf.txt"

# only the first line of each file is read
with open(sequence_filename) as f:
    # drop the line terminator so the string holds the sequence characters only
    sequence = f.readline().rstrip("\r\n")

with open(probabilities_filename) as f:
    # whitespace-separated values on one line, converted to float
    probabilities = [float(p) for p in f.readline().split()]

# number of probabilities read
# NOTE(review): the preprocessing loop uses n as the sequence length, which
# assumes one probability per nucleotide - confirm against the data files
n = len(probabilities)

In [None]:
# only preprocess for the most likely probabilistic sequence
word_size = 11
insert_s = "INSERT INTO {} (sequence_index,word_encoding) VALUES (?,?)".format(get_table_name(word_size))

# A sequence of length n contains n - word_size + 1 words of length word_size,
# so the last start index, n - word_size, must be included in the range.
# All rows are inserted in a single transaction: the connection context manager
# commits once at the end, or rolls back if an insert or an encoding fails,
# instead of committing after every row.
with conn:
    c.executemany(
        insert_s,
        ((i, get_word_encoding(sequence[i:i+word_size])) for i in range(n-word_size+1)),
    )

In [150]:
# iterate over the returned cursor to print every sequence index stored for the word
cursor = get_indexes_for_word('AAAAA')
for row in cursor:
    print(row)

(588,)
(1349,)
(1350,)
(1351,)
(1806,)
(1807,)
(1919,)
(2609,)
(2873,)
(3318,)
(3704,)
(3836,)
(5447,)
(5899,)
(5900,)
(5901,)
(5902,)
(6487,)
(6654,)
(6655,)
(6908,)
(6952,)
(6953,)
(6954,)
(6955,)
(7479,)
(7570,)
(7571,)
(7572,)
(7573,)
(7574,)
(7575,)
(7725,)
(8346,)
(8418,)
(8419,)
(8420,)
(8489,)
(8728,)
(9165,)
(9436,)
(9464,)
(9465,)
(9750,)
(9890,)
(9891,)
(10335,)
(12541,)
(12938,)
(13663,)
(13664,)
(13665,)
(13666,)
(13799,)
(13816,)
(14032,)
(14033,)
(14034,)
(14035,)
(14036,)
(14037,)
(14038,)
(14051,)
(14052,)
(14053,)
(14054,)
(14055,)
(14056,)
(14057,)
(14058,)
(14059,)
(14060,)
(14061,)
(14062,)
(14063,)
(14379,)
(14380,)
(15675,)
(15693,)
(16053,)
(16054,)
(16055,)
(16767,)
(16768,)
(16769,)
(17281,)
(17282,)
(17448,)
(18002,)
(18803,)
(19102,)
(20514,)
(20515,)
(21371,)
(21372,)
(21373,)
(21374,)
(21382,)
(21383,)
(21384,)
(21385,)
(22994,)
(22995,)
(23462,)
(23806,)
(23807,)
(24581,)
(24582,)
(24583,)
(25162,)
(25173,)
(26807,)
(27443,)
(27444,)
(27866,)
(28888,)
(30

(373785,)
(373983,)
(374203,)
(374225,)
(374445,)
(374745,)
(374897,)
(374898,)
(374912,)
(374913,)
(374914,)
(374915,)
(374916,)
(374917,)
(374923,)
(374924,)
(374925,)
(374926,)
(375680,)
(375840,)
(375949,)
(376637,)
(377305,)
(377591,)
(377631,)
(378119,)
(378120,)
(378121,)
(378122,)
(378123,)
(378124,)
(378176,)
(378280,)
(378316,)
(378442,)
(378499,)
(378500,)
(378619,)
(378620,)
(378621,)
(378622,)
(378623,)
(378624,)
(378625,)
(378702,)
(379991,)
(380102,)
(380103,)
(380155,)
(380156,)
(380157,)
(380164,)
(380165,)
(380166,)
(381191,)
(382087,)
(382088,)
(382089,)
(382090,)
(383622,)
(383623,)
(383645,)
(383811,)
(384692,)
(384693,)
(384864,)
(384865,)
(385557,)
(385558,)
(387313,)
(387314,)
(388170,)
(388171,)
(388454,)
(388864,)
(388921,)
(388922,)
(388923,)
(388924,)
(388925,)
(388926,)
(389251,)
(389328,)
(389329,)
(389631,)
(389632,)
(389633,)
(389634,)
(389769,)
(389775,)
(390322,)
(390323,)
(390324,)
(390325,)
(390403,)
(390461,)
(390559,)
(390907,)
(390908,)
(390909,)


In [152]:
# close the database connection once all work on it is done
conn.close()