In [1]:
import bisect
import sys

In [2]:
class Index(object):
    ''' Sorted k-mer index of a text, searched with binary search. '''

    def __init__(self, t, k):
        ''' Index every length-k substring of t together with its offset.

        t -- the text to index
        k -- the substring (k-mer) length; also stored as self.k
        '''
        self.k = k
        # One window per start offset: len(t) - k + 1 of them in total
        # (none at all when t is shorter than k).
        window_count = len(t) - k + 1
        # Keeping the (k-mer, offset) pairs sorted places identical k-mers
        # next to each other, which is what query() relies on.
        self.index = sorted((t[start:start + k], start)
                            for start in range(window_count))

    def query(self, p):
        ''' Return the offsets at which the first k characters of p occur.

        Only the leading k-mer of p is looked up; the remainder of p is
        not compared against the text here. The result is a list of
        offsets in ascending order, empty when the k-mer is not indexed.
        '''
        kmer = p[:self.k]
        entries = self.index
        # Stored offsets are never negative, so (kmer, -1) sorts just before
        # the first entry for this k-mer and bisect_left lands on it.
        pos = bisect.bisect_left(entries, (kmer, -1))
        hits = []
        # Walk forward through the run of entries sharing this k-mer.
        while pos < len(entries) and entries[pos][0] == kmer:
            hits.append(entries[pos][1])
            pos += 1
        return hits


In [6]:
# This function uses the index to find exact matches for a pattern.
def queryIndex(p, t, index):
    ''' Return every offset in t at which the pattern p occurs exactly.

    p     -- the pattern to search for
    t     -- the text that index was built from
    index -- an object exposing the k-mer length as index.k and a
             query(p) method returning candidate offsets for p's first k-mer

    The index only vouches for the first k characters of p, so each
    candidate offset is verified by comparing the rest of p against t.

    A non-empty pattern shorter than k has no complete k-mer to look up,
    so it is located by scanning t directly instead of silently reporting
    no matches. An empty pattern yields an empty list.
    '''
    k = index.k
    m = len(p)
    if m < k:
        if not p:
            return []
        # The index cannot answer this query; check every possible start.
        # This also covers the last k-1 positions of t, which have no
        # complete k-mer in the index.
        return [i for i in range(len(t) - m + 1) if t.startswith(p, i)]
    # The part of p beyond the indexed k-mer is the same for every hit.
    suffix = p[k:]
    return [i for i in index.query(p) if t[i + k:i + m] == suffix]

In [7]:
# Example inputs for the demo below: t is the text that gets indexed and
# searched, p is the pattern to locate inside it.
t = 'ACTTGGAGATCTTTGAGGCTAGGTATTCGGGATCGAAGCTCATTTCGGGGATCGATTACGATATGGTGGGTATTCGGGA'
p = 'GGTATTCGGGA'

In [8]:
# Build an index of all 4-mers of t, then print every offset in t at which
# the full pattern p matches.
index = Index(t, 4)
print(queryIndex(p, t, index))

[21, 68]
