In [1]:
def naive(text: str, pattern: str):
    """Yield every shift at which ``pattern`` occurs in ``text`` (brute force).

    Each candidate shift is checked by comparing the window of
    ``len(pattern)`` characters that starts there against the whole pattern.
    Nothing is yielded when the pattern is longer than the text.
    """
    width = len(pattern)
    last_shift = len(text) - width

    shift = 0
    while shift <= last_shift:
        window = text[shift:shift + width]
        if window == pattern:
            yield shift
        shift += 1

In [2]:
def transition_function(pattern, alphabet):
    """Build the transition table of the string-matching automaton for ``pattern``.

    Returns a list with ``len(pattern) + 1`` dictionaries (one per state).
    Entry ``table[q][a]`` is the length of the longest prefix of ``pattern``
    that is also a suffix of ``pattern[:q] + a``, for each ``a`` in ``alphabet``.
    """
    size = len(pattern)
    table = []
    for state in range(size + 1):
        row = {}
        for symbol in alphabet:
            candidate = pattern[:state] + symbol
            # Start from the longest prefix that could fit and shrink it until
            # it is a suffix of the candidate; the empty prefix always is.
            length = min(size, state + 1)
            while not candidate.endswith(pattern[:length]):
                length -= 1
            row[symbol] = length
        table.append(row)

    return table

def finite_automaton_matcher(text, pattern, alphabet, delta):
    """Yield every shift at which ``pattern`` occurs in ``text`` using a precomputed automaton.

    Parameters
    ----------
    text : str
        Text to scan.
    pattern : str
        Pattern being searched for; only its length is used here.
    alphabet : collection of str
        Symbols for which ``delta`` holds transitions.
    delta : sequence of dict
        Transition table indexed as ``delta[state][symbol]``, e.g. the result
        of ``transition_function(pattern, alphabet)``.

    Yields
    ------
    int
        Zero-based index in ``text`` at which a match starts, i.e. the same
        convention as ``naive`` and ``kmp_string_matching``.
    """
    m = len(pattern)
    if m == 0:
        # The empty pattern matches at every shift, exactly as in ``naive``.
        yield from range(len(text) + 1)
        return

    state = 0
    for i, symbol in enumerate(text):
        if symbol not in alphabet:
            # Symbols outside the alphabet have no transition; restart from
            # the initial state.
            state = 0
            continue
        state = delta[state][symbol]
        if state == m:
            # The match ends at index i, so it starts m - 1 characters earlier.
            yield i - m + 1

In [3]:
def prefix_function(pattern):
    """Compute the Knuth-Morris-Pratt prefix function of ``pattern``.

    Returns a list ``pi`` with exactly ``len(pattern)`` entries, where
    ``pi[q]`` is the length of the longest proper prefix of
    ``pattern[:q + 1]`` that is also a suffix of it. An empty pattern yields
    an empty list.
    """
    # Pre-sizing the table keeps len(pi) == len(pattern) even for "".
    pi = [0] * len(pattern)
    k = 0
    for q in range(1, len(pattern)):
        # Fall back through shorter borders until one can be extended.
        while k > 0 and pattern[k] != pattern[q]:
            k = pi[k - 1]
        if pattern[k] == pattern[q]:
            k += 1
        pi[q] = k
    return pi

def kmp_string_matching(text, pattern, pi):
    """Yield every shift at which ``pattern`` occurs in ``text`` (Knuth-Morris-Pratt).

    Parameters
    ----------
    text : str
        Text to scan.
    pattern : str
        Pattern to search for.
    pi : sequence of int
        Prefix function of ``pattern`` (see ``prefix_function``); it is not
        consulted when ``pattern`` is empty.

    Yields
    ------
    int
        Zero-based index in ``text`` at which a match starts. Overlapping
        matches are reported.
    """
    m = len(pattern)
    if m == 0:
        # Without this guard ``pattern[0]`` below raises IndexError; the empty
        # pattern matches at every shift, exactly as in ``naive``.
        yield from range(len(text) + 1)
        return

    q = 0  # number of pattern characters currently matched
    for i, symbol in enumerate(text):
        while q > 0 and pattern[q] != symbol:
            q = pi[q - 1]
        if pattern[q] == symbol:
            q += 1
        if q == m:
            yield i + 1 - m
            # Continue from the longest border so overlapping matches are found.
            q = pi[q - 1]

In [4]:
import time

def test(filename: str, P: str, alg = finite_automaton_matcher):
    """Count the matches of ``P`` in ``filename`` and print the total and the elapsed time.

    The file is read as UTF-8 and searched line by line, so a match that spans
    a line break is not counted. The measured time covers reading the lines
    and running ``alg`` on each of them.

    Parameters
    ----------
    filename : str
        Path of the text file to search.
    P : str
        Pattern to look for.
    alg : callable
        Matcher invoked as ``alg(line, P)`` that returns an iterable of
        matches. NOTE: the default ``finite_automaton_matcher`` also requires
        ``alphabet`` and ``delta``, so it has to be pre-bound (e.g. with
        ``functools.partial``) before it can be used here.
    """
    with open(filename, "r", encoding = "utf-8") as file:
        # perf_counter is monotonic and has the highest available resolution,
        # which matters for the sub-millisecond runs on small files.
        start_time = time.perf_counter()
        count = 0
        # Stream the file instead of materialising every line and match list.
        for line in file:
            count += sum(1 for _ in alg(line, P))
        running_time = time.perf_counter() - start_time

    print(f"Found {count} repetitions. Running time: {running_time} s.")

In [5]:
import functools

def test_suite(filename, pattern, print_pattern = True):
    """Run the naive, finite-automaton and KMP matchers on one file and print their results.

    A header naming the file and the pattern is printed first (the pattern is
    shown as '...' when ``print_pattern`` is false). The automaton's transition
    table and the KMP prefix function are computed before the timed runs, so
    preprocessing is not part of the reported running times.
    """
    rule = "-" * 50
    shown_pattern = pattern if print_pattern else '...'
    print(f"Searching through '{filename}' looking for '{shown_pattern}'", end = "\n" + rule + "\n")

    symbols = set(pattern)
    automaton = functools.partial(
        finite_automaton_matcher,
        alphabet = symbols,
        delta = transition_function(pattern, symbols),
    )
    kmp = functools.partial(kmp_string_matching, pi = prefix_function(pattern))

    contenders = (
        ("Naive", naive),
        ("Finite Automaton", automaton),
        ("Knuth-Morris-Pratt", kmp),
    )
    for label, matcher in contenders:
        print(label, end = " - ")
        test(filename, pattern, alg = matcher)
    print(rule)

In [6]:
# Compare the three matchers on simple.txt; print_pattern = False makes the
# header show '...' instead of the pattern.
filename = "simple.txt"
pattern  = "ala"

test_suite(filename, pattern, print_pattern = False)

Searching through 'simple.txt' looking for '...'
--------------------------------------------------
Naive - Found 1 repetitions. Running time: 0.00010323524475097656 s.
Finite Automaton - Found 1 repetitions. Running time: 9.274482727050781e-05 s.
Knuth-Morris-Pratt - Found 1 repetitions. Running time: 0.00019359588623046875 s.
--------------------------------------------------


In [7]:
# Same comparison on 1997_714.txt, searching for "art".
filename = "1997_714.txt"
pattern = "art"

test_suite(filename, pattern)

Searching through '1997_714.txt' looking for 'art'
--------------------------------------------------
Naive - Found 273 repetitions. Running time: 0.07299017906188965 s.
Finite Automaton - Found 273 repetitions. Running time: 0.0279695987701416 s.
Knuth-Morris-Pratt - Found 273 repetitions. Running time: 0.055002689361572266 s.
--------------------------------------------------


In [8]:
# NOTE: this cell is slow - in the recorded run below each algorithm took
# roughly 40-70 seconds on this file.
filename = "wikipedia-tail-kruszwil.txt"
pattern = 'kruszwil'

test_suite(filename, pattern)

Searching through 'wikipedia-tail-kruszwil.txt' looking for 'kruszwil'
--------------------------------------------------
Naive - Found 13 repetitions. Running time: 54.75539994239807 s.
Finite Automaton - Found 13 repetitions. Running time: 41.58528780937195 s.
Knuth-Morris-Pratt - Found 13 repetitions. Running time: 72.26392960548401 s.
--------------------------------------------------


In [9]:
def edge_case_test(n, frac, filename = "edge.txt"):
    """Create the adversarial input for the naive matcher and return its file name and pattern.

    The file ``filename`` is (over)written with ``n`` characters 'a' followed
    by a single 'b'. The pattern is ``int(frac * n)`` characters 'a' followed
    by 'b'.

    Parameters
    ----------
    n : int
        Number of 'a' characters written to the file.
    frac : float
        Fraction of ``n`` that determines how many 'a's the pattern starts with.
    filename : str
        Path of the file to create.

    Returns
    -------
    tuple of (str, str)
        ``(filename, pattern)``, ready to be passed to ``test_suite``.
    """
    # Write the text in one call, and as UTF-8 so it matches the encoding
    # ``test`` uses when reading the file back.
    with open(filename, "w", encoding = "utf-8") as file:
        file.write("a" * n + "b")

    pattern = int(frac * n) * "a" + "b"

    return filename, pattern

## Ex 6.

The chosen text consists of n 'a's followed by a single 'b' at the end (the bigger n is, the bigger the difference in running time). The pattern has the same shape, except that it contains only a fraction of the 'a's at the beginning.

The idea behind this is:
At each character the naive algorithm has to run through the whole pattern, only to see that it fails because of the 'b' at the end. The automaton, on the other hand, once it has consumed all the 'a's of the pattern, performs only one comparison per character: is the next character 'b'? KMP works similarly, except that it does not know as much about the pattern as the automaton does, so it has to perform more operations per character, but still fewer than the naive algorithm.

In [16]:
# Generate the adversarial input: a text of 100000 'a's followed by 'b', and a
# pattern of int(.12 * 100000) 'a's followed by 'b'.
filename, pattern = edge_case_test(100000, .12)

# The pattern is long, so the header shows '...' instead of printing it.
test_suite(filename, pattern, print_pattern = False)

Searching through 'edge.txt' looking for '...'
--------------------------------------------------
Naive - Found 1 repetitions. Running time: 0.08985710144042969 s.
Finite Automaton - Found 1 repetitions. Running time: 0.015736818313598633 s.
Knuth-Morris-Pratt - Found 1 repetitions. Running time: 0.045162200927734375 s.
--------------------------------------------------


## Ex 7.

We simply take an arbitrarily long pattern, and the difference between $O(|\Sigma| \cdot m^3)$ (automaton preprocessing) and $O(m)$ (KMP preprocessing) takes care of the rest ;)

In [11]:
def run(alg, arg):
    """Call ``alg(arg)`` once and return the elapsed wall-clock time in seconds.

    The result of ``alg`` is discarded. ``time.perf_counter`` is used because
    it is monotonic and has the highest available resolution, which matters
    when the measured call finishes in well under a millisecond.
    """
    start_time = time.perf_counter()
    alg(arg)
    return time.perf_counter() - start_time

# A 3000-character pattern over a two-letter alphabet, used to compare the
# preprocessing cost of KMP with that of the automaton.
pattern = 2000 * "a" + 1000 * "b"
alph = {letter for letter in pattern}
preftime = run(prefix_function, pattern)
# run() passes a single argument, so the alphabet is bound beforehand.
transtime  = run(functools.partial(transition_function, alphabet = alph), pattern)

print(f"KMP preprocessing: {preftime}s")
print(f"Automaton preprocessing: {transtime}s")

KMP preprocessing: 0.0009152889251708984s
Automaton preprocessing: 3.493818521499634s
