<a href="https://colab.research.google.com/github/Wan-Shi-Tong-bi/5Ws/blob/main/colab/2.02_SubstringIndex.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Substring Index
We want to build a substring (k-mer) index over a text and query it with binary search, using Python's `bisect` module.

In [3]:
import bisect
import sys

In [4]:
'''Index Class'''
class Index(object):
    '''Sorted table of every length-k substring of a text, with its offset.'''

    def __init__(self, t, k):
        ''' Constructor: build the table of all substrings of size 'k'
        param t: text from which the substrings are taken
        param k: length of each indexed substring'''
        self.k = k
        # One (substring, offset) pair per start position; kept sorted so
        # that query() can locate a substring with bisect.
        self.index = sorted(
            (t[start:start + k], start) for start in range(len(t) - k + 1)
        )

    def query(self, p):
        ''' Look up the first k characters of p in the table
        param p: search pattern
        returns: list of all offsets whose indexed substring equals p[:k]'''
        prefix = p[:self.k]
        # Offsets are never negative, so (prefix, -1) sorts immediately
        # before the first entry carrying this prefix.
        first = bisect.bisect_left(self.index, (prefix, -1))
        hits = []
        # Entries with the same substring are adjacent: walk until it changes.
        for pos in range(first, len(self.index)):
            kmer, offset = self.index[pos]
            if kmer != prefix:
                break
            hits.append(offset)
        return hits

In [5]:
def queryIndex(p, t, index):
    """Return every offset at which pattern p occurs in text t.

    The index is asked for candidate offsets matching the first k
    characters of p (k = index.k); each candidate is then verified by
    comparing the remainder of p against t.

    param p: search pattern
    param t: text to search; the index is expected to have been built from it
    param index: object exposing an integer attribute `k` and a method
                 `query(p)` that returns candidate offsets
    returns: list of matching offsets in ascending order
    """
    k = index.k
    if 0 < len(p) < k:
        # The index only stores keys of length k, so a shorter pattern can
        # never equal one of them and the lookup would always come back
        # empty. Scan the text directly instead of silently missing hits.
        offsets = []
        start = t.find(p)
        while start != -1:
            offsets.append(start)
            # Advance by one so overlapping occurrences are reported too.
            start = t.find(p, start + 1)
        return offsets
    # Candidates already agree with p on the first k characters; only the
    # tail needs checking. A candidate too close to the end of t yields a
    # shorter slice and therefore fails the comparison.
    return [i for i in index.query(p) if p[k:] == t[i + k:i + len(p)]]

In [6]:
# Example text that gets indexed and searched in the following cells.
t = 'ACTTGGAGATCTTTGAGGCTAGGTATTCGGGATCGAAGCTCATTTCGGGGATCGATTACGATATGGTGGGTATTCGGGA'
# Example pattern to look up in t.
p = 'GGTATTCGGGA'

In [7]:
# Build the index over t from all substrings of length 4.
index = Index(t, 4)
# Print the offsets that queryIndex reports for p in t.
print(queryIndex(p, t, index))

[21, 68]


## Unit Tests

In [8]:
import unittest

In [9]:
class MyTest(unittest.TestCase):
    """Unit tests for queryIndex, run against an Index built with k = 4."""

    def setUp(self):
        """Create the fixture on the test instance.

        The text, pattern and index are stored as attributes so the test
        does not depend on variables left behind by earlier notebook cells.
        """
        self.t = 'ACTTGGAGATCTTTGAGGCTAGGTATTCGGGATCGAAGCTCATTTCGGGGATCGATTACGATATGGTGGGTATTCGGGA'
        self.p = 'GGTATTCGGGA'
        self.index = Index(self.t, 4)

    def testQueryIndex(self):
        """The pattern is reported at offsets 21 and 68 of the text."""
        self.assertEqual([21, 68], queryIndex(self.p, self.t, self.index))

In [10]:
# Collect the test methods of MyTest into a suite.
suite = unittest.TestLoader().loadTestsFromTestCase( MyTest )
# Run the suite with the runner's report written to stderr; the returned
# result object is the value of this cell's last expression.
unittest.TextTestRunner(verbosity=1,stream=sys.stderr).run( suite )

.
----------------------------------------------------------------------
Ran 1 test in 0.003s

OK


<unittest.runner.TextTestResult run=1 errors=0 failures=0>