In [1]:
def naive(text: str, pattern: str): 
    n = len(text)
    m = len(pattern)
    
    for s in range(n - m + 1):
        if(text[s:s + m] == pattern):
            yield s

In [2]:
def transition_function(pattern, alphabet):
    m = len(pattern)
    result = []
    for q in range(m + 1):
        result.append({})
        for a in alphabet:
            k = min(m + 1, q + 2)
            while True:
                k = k - 1
                pqa = pattern[:q] + a
                pk = pattern[:k]
                if(pk == pqa[len(pqa) - k:]):
                    break
                    
            result[q][a] = k
    
    return result 

def finite_automaton_matcher(text, pattern, alphabet, delta):
    m = len(pattern)
    n = len(text)
    q, count = 0, 0
    for i in range(n):
        if(text[i] not in alphabet):
            q = 0
        else:
            q = delta[q][text[i]]
            if q == m:
                yield i
    
    return count

In [3]:
def prefix_function(pattern):
    pi = [0]
    k = 0
    for q in range(1, len(pattern)):
        while(k > 0 and pattern[k] != pattern[q]):
            k = pi[k-1]
        if(pattern[k] == pattern[q]):
            k = k + 1
        pi.append(k)
    return pi

def kmp_string_matching(text, pattern, pi):
    q = 0
    for i in range(0, len(text)):
        while(q > 0 and pattern[q] != text[i]):
            q = pi[q-1]
        if(pattern[q] == text[i]):
            q = q + 1
        if(q == len(pattern)):
            yield (i + 1 - q)
            q = pi[q - 1]

In [4]:
transition_function("abab", {"a", "b", "c"})

prefix_function("ababaca")

[0, 0, 1, 2, 3, 0, 1]

In [5]:
import time

def test(filename: str, P: str, alg = finite_automaton_matcher):
    with open(filename, "r", encoding = "utf-8") as file:
        start_time = time.time()
        count = 0
        for line in file.readlines():
            count = count + len(list(alg(line, P)))
        
        end_time = time.time()
        running_time = end_time - start_time
        
        print(f"Found {count} repetitions. Running time: {running_time} s.")

In [6]:
filename = "1997_714.txt"
pattern = "art"

In [7]:
import functools

alph = {letter for letter in pattern}
trans_func = transition_function(pattern, alph)
automaton = functools.partial(finite_automaton_matcher, alphabet = alph, delta = trans_func)

pref = prefix_function(pattern)
kmp = functools.partial(kmp_string_matching, pi = pref)

print("Naive", end = " - ")
test(filename, pattern, alg = naive)
print("Finite Automaton", end = " - ")
test(filename, pattern, alg = automaton)
print("Knuth-Morris-Pratt", end = " - ")
test(filename, pattern, alg = kmp)

Naive - Found 273 repetitions. Running time: 0.046922922134399414 s.
Finite Automaton - Found 273 repetitions. Running time: 0.025949716567993164 s.
Knuth-Morris-Pratt - Found 273 repetitions. Running time: 0.05413389205932617 s.


In [8]:
### Takes huge amount of time..

# filename = "tokens-with-entities.tsv"
# pattern = 'Kruszwil'

# test(filename, pattern, alg = naive)

In [30]:
def edge_case_test(n, frac, filename = "edge.txt"):
    with open(filename, "a") as file:
        for i in range(n):
            file.write("a")
        file.write("b")
    
    pattern = (int(frac * n)) * "a" + "b"               
    return filename, pattern

In [31]:
filename, pattern = edge_case_test(10000, .5)

In [33]:
import functools

print("Chosen text consists of n (the bigger n the bigger difference in runtime) 'a's and 'b' at the end. Pattern is the same except it has only fraction of 'a's at the beggining.")

alph = {letter for letter in pattern}
trans_func = transition_function(pattern, alph)
automaton = functools.partial(finite_automaton_matcher, alphabet = alph, delta = trans_func)

pref = prefix_function(pattern)
kmp = functools.partial(kmp_string_matching, pi = pref)

print("Naive", end = " - ")
test(filename, pattern, alg = naive)
print("Finite Automaton", end = " - ")
test(filename, pattern, alg = automaton)
print("Knuth-Morris-Pratt", end = " - ")
test(filename, pattern, alg = automaton)

Chosen text consists of n (the bigger n the bigger difference in runtime) 'a's and 'b' at the end. Pattern is the same except it has only fraction of 'a's at the beggining.
Naive - Found 1 repetitions. Running time: 0.002994060516357422 s.
Finite Automaton - Found 1 repetitions. Running time: 0.0009968280792236328 s.
Knuth-Morris-Pratt - Found 1 repetitions. Running time: 0.003007650375366211 s.


In [12]:
def run(alg, arg):
    start_time = time.time()
    alg(arg)
    end_time = time.time()
    running_time = end_time - start_time
    
    return running_time

pattern = 2000 * "a" + 1000 * "b"
alph = {letter for letter in pattern}
preftime = run(prefix_function, pattern)
transtime  = run(functools.partial(transition_function, alphabet = alph), pattern)

print("We simply take long pattern and O(Sigma * m^3) (vs O(m) in kmp) takes care of the rest ;)")

print(f"Prefix function: {preftime}")
print(f"Transition function {transtime}")

We simply take long pattern and O(Sigma * m^3) (vs O(m) in kmp) takes care of the rest ;)
Prefix function: 0.003987789154052734
Transition function 5.133466958999634
