In [4]:
!pip install nbimporter

Collecting nbimporter
  Downloading nbimporter-0.3.4-py3-none-any.whl (4.9 kB)
Installing collected packages: nbimporter
Successfully installed nbimporter-0.3.4
You should consider upgrading via the '/opt/conda/bin/python3 -m pip install --upgrade pip' command.


In [5]:
import nbimporter
from boyer_moore import *

In [6]:
def approximate_match(p, t, n):
    '''Find every alignment of pattern p in text t that has at most n mismatches.

    Only substitutions are counted (Hamming distance); insertions and deletions
    are not considered.

    Strategy (pigeonhole principle): split p into n+1 non-empty, contiguous
    segments that together cover the whole pattern. Any alignment with at most
    n mismatches must contain at least one segment that matches exactly, so it
    is enough to look up each segment exactly and then verify the rest of the
    pattern around every hit.

    Parameters:
        p: the pattern to look for.
        t: the text to search in.
        n: the maximum number of mismatching positions allowed.

    Returns:
        A list of 0-based offsets into t where p aligns with <= n mismatches,
        in ascending order and without duplicates. The list is empty when n is
        negative or when p is longer than t.

    NOTE(review): each segment is handed to BoyerMoore(..., alphabet='ACGT');
    when len(p) is close to n+1 the segments are only one character long, so
    confirm the imported Boyer-Moore implementation accepts such short patterns.
    '''
    pattern_len = len(p)
    # Largest offset at which the whole pattern still fits inside the text.
    last_alignment = len(t) - pattern_len

    # A negative mismatch budget can never be met, and a pattern longer than
    # the text cannot be aligned anywhere.
    if n < 0 or last_alignment < 0:
        return []

    # With no more characters than allowed mismatches, every in-bounds
    # alignment qualifies; answering directly avoids building empty segments.
    if pattern_len <= n:
        return list(range(last_alignment + 1))

    segment_count = n + 1
    hits = set()  # Confirmed alignment offsets; a set removes duplicates.

    for i in range(segment_count):
        # Floor-based boundaries give balanced segments that cover p exactly;
        # because pattern_len >= segment_count here, none of them is empty.
        start = i * pattern_len // segment_count
        end = (i + 1) * pattern_len // segment_count
        segment = p[start:end]

        # Pre-process this segment and find all of its exact occurrences.
        p_bm = BoyerMoore(segment, alphabet='ACGT')

        # Pattern positions that the exact segment hit has not already checked.
        outside = list(range(start)) + list(range(end, pattern_len))

        for m in boyer_moore(segment, p_bm, t):
            offset = m - start  # Where the full pattern would begin in t.
            # Skip alignments that fall off either end of the text, and ones
            # already confirmed through another segment.
            if offset < 0 or offset > last_alignment or offset in hits:
                continue

            mismatches = 0
            for j in outside:
                if p[j] != t[offset + j]:
                    mismatches += 1
                    if mismatches > n:
                        break  # Budget exceeded; no need to keep counting.

            if mismatches <= n:
                hits.add(offset)

    # Sort so the result does not depend on set iteration order.
    return sorted(hits)

In [7]:
# Worked example: report every offset where the pattern aligns to the text
# with at most two mismatching characters.
p, t = 'AACTTG', 'CACTTAATTTG'
print(approximate_match(p, t, 2))

[0, 5]
