# Wyszukiwanie wzorca w tekście

### Algorytm naiwny

In [66]:
def naive_search(text, pattern, preprocess = None):
    """Return every index at which ``pattern`` occurs in ``text``.

    Each candidate shift is tested by comparing the slice of ``text`` that
    starts there against ``pattern``.

    Args:
        text: Sequence to search in.
        pattern: Sequence to look for.
        preprocess: Unused; accepted so that every matcher can be called with
            the same three positional arguments.

    Returns:
        List of starting indices of all (possibly overlapping) occurrences,
        in increasing order.
    """
    width = len(pattern)
    last_shift = len(text) - width
    return [
        shift
        for shift in range(last_shift + 1)
        if text[shift:shift + width] == pattern
    ]

# Demo: overlapping occurrences of "aba" are all reported.
print(naive_search(text="abababaabaaababaababa", pattern="aba"))

[0, 2, 4, 7, 11, 13, 16, 18]


### Automat skończony

In [65]:
from collections import defaultdict
import re

def fa_string_matching(text, pattern, delta = None):
    """Find all occurrences of ``pattern`` in ``text`` with a finite automaton.

    Args:
        text: String to search in.
        pattern: String to look for.
        delta: Optional precomputed transition table: a list with one mapping
            per automaton state (character -> next state), where the last
            state is the accepting one. Built with ``transition_table`` when
            omitted.

    Returns:
        List of starting indices of all occurrences, in increasing order.
        An empty pattern matches at every shift ``0..len(text)``, the same
        result ``naive_search`` gives.
    """
    patt_len = len(pattern)
    if patt_len == 0:
        # Without this guard the table construction would fail on pattern[0].
        return list(range(len(text) + 1))
    if delta is None:
        delta = transition_table(pattern)

    accepting = len(delta) - 1
    q = 0
    correct_s = []
    for s, ch in enumerate(text):
        # .get() rather than delta[q][ch]: indexing a defaultdict would insert
        # a new key for every character absent from the pattern, mutating the
        # (possibly shared) table, and a plain dict would raise KeyError.
        q = delta[q].get(ch, 0)
        if q == accepting:
            correct_s.append(s - patt_len + 1)
    return correct_s

def transition_table(pattern):
    """Build the transition table of the string-matching automaton for ``pattern``.

    Runs in O(len(pattern) * len(alphabet)), where the alphabet is the set of
    distinct characters of ``pattern``.

    Args:
        pattern: String the automaton should recognise.

    Returns:
        List of ``len(pattern) + 1`` ``defaultdict`` rows, one per state.
        ``table[q][ch]`` is the state reached from state ``q`` on character
        ``ch``; characters that do not occur in ``pattern`` fall back to
        state 0 through the default. State ``len(pattern)`` is accepting.
        For an empty pattern the table holds only the start state.
    """
    patt_len = len(pattern)
    if patt_len == 0:
        # Only the start state exists, and it is also the accepting state.
        return [defaultdict(int)]

    chars = set(pattern)
    table = [defaultdict(int) for _ in range(patt_len + 1)]
    for ch in chars:
        table[0][ch] = 0
    table[0][pattern[0]] = 1

    # At the start of iteration `row`, `track` is the state the automaton is in
    # after reading pattern[1:row]; on a mismatch, state `row` behaves like it.
    track = 0
    # The loop includes row == patt_len so the accepting state also gets its
    # fallback transitions; otherwise overlapping matches would be lost.
    for row in range(1, patt_len + 1):
        # Copy by character (the actual keys of the rows), not by position.
        for ch in chars:
            table[row][ch] = table[track][ch]
        if row < patt_len:
            table[row][pattern[row]] = row + 1
            track = table[track][pattern[row]]
    return table

# Demo: locate every occurrence of "abcd" with the automaton-based matcher.
print(fa_string_matching(text="abcdbabaabcddabcdaababaababa", pattern="abcd"))

[0, 8, 13]


### Algorytm Knutha-Morrisa-Pratta

In [67]:
def kmp_string_matching(text, pattern, pi = None):
    """Find all occurrences of ``pattern`` in ``text`` with Knuth-Morris-Pratt.

    Args:
        text: String to search in.
        pattern: String to look for.
        pi: Optional precomputed prefix function of ``pattern`` (``pi[q]`` is
            the length of the longest proper prefix of ``pattern[:q + 1]``
            that is also its suffix). Computed with ``prefix_function`` when
            omitted.

    Returns:
        List of starting indices of all (possibly overlapping) occurrences,
        in increasing order. An empty pattern matches at every shift
        ``0..len(text)``, the same result ``naive_search`` gives.
    """
    patt_len = len(pattern)
    if patt_len == 0:
        # Without this guard pattern[q] below would raise IndexError.
        return list(range(len(text) + 1))
    if pi is None:
        pi = prefix_function(pattern)

    q = 0  # number of pattern characters currently matched
    correct_s = []
    for i, ch in enumerate(text):
        # On a mismatch fall back to the next shorter border and retry.
        while q > 0 and pattern[q] != ch:
            q = pi[q - 1]
        if pattern[q] == ch:
            q += 1
        if q == patt_len:
            correct_s.append(i - patt_len + 1)
            # Keep the longest border so overlapping matches are still found.
            q = pi[q - 1]
    return correct_s
            
def prefix_function(pattern):
    """Compute the KMP prefix function of ``pattern``.

    Args:
        pattern: String to analyse.

    Returns:
        List ``pi`` of length ``len(pattern)`` where ``pi[q]`` is the length
        of the longest proper prefix of ``pattern[:q + 1]`` that is also a
        suffix of it. An empty pattern yields an empty list.
    """
    if not pattern:
        # Keep len(pi) == len(pattern); unconditionally seeding with [0]
        # would return one entry for zero characters.
        return []

    pi = [0]
    k = 0  # length of the border carried over from the previous position
    for q in range(1, len(pattern)):
        # Shrink the border until it can be extended by pattern[q] (or is empty).
        while k > 0 and pattern[k] != pattern[q]:
            k = pi[k - 1]
        if pattern[k] == pattern[q]:
            k += 1
        pi.append(k)
    return pi

# Demo for this section: exercise the KMP matcher itself (the call previously
# went to fa_string_matching, so KMP was never run here).
print(kmp_string_matching("abcdbabaabcddabcdaababaababa", "abcd"))

[0, 8, 13]


# Funkcja mierząca czas działania algorytmu dla wybranego tekstu i wzorca

In [75]:
import time

def matching_time(algorithm, text, pattern):
    """Run ``algorithm`` on ``text`` and ``pattern`` and time the whole call.

    The measured interval includes pattern preprocessing: the prefix function
    for ``kmp_string_matching`` and the transition table for
    ``fa_string_matching``. Any other algorithm receives ``None`` as its
    third argument.

    Args:
        algorithm: Matcher callable taking ``(text, pattern, preprocess)``.
        text: String to search in.
        pattern: String to look for.

    Returns:
        Tuple ``(result, elapsed_seconds)`` with the matcher's return value
        and the elapsed time as a float.
    """
    # perf_counter() is the clock intended for measuring short intervals;
    # time.time() follows the wall clock and can jump if it is adjusted.
    start_time = time.perf_counter()
    preprocess = None
    if algorithm is kmp_string_matching:
        preprocess = prefix_function(pattern)
    elif algorithm is fa_string_matching:
        preprocess = transition_table(pattern)
    result = algorithm(text, pattern, preprocess)
    return (result, time.perf_counter() - start_time)

def matching_comparassion(text, pattern, showresult = False):
    """Time the naive, automaton and KMP matchers on the same input and print results.

    Args:
        text: String to search in.
        pattern: String to look for.
        showresult: When true, also print the number of matches and the list
            of match indices for each algorithm, followed by a blank line.

    Returns:
        None. All results are written to standard output.
    """
    contenders = (
        ("Naive algorithm", naive_search),
        ("FA", fa_string_matching),
        ("KMP", kmp_string_matching),
    )
    for label, algorithm in contenders:
        found, elapsed = matching_time(algorithm, text, pattern)
        print(f"{label}: {elapsed}s")
        if showresult:
            # The list of indices is not a duration, so no "s" suffix here.
            print(f"{label}: !-{len(found)} found-! {found}")
            # print() with parentheses: a bare `print` is a no-op expression.
            print()

In [82]:
# Highly repetitive input: every shift makes the naive matcher compare a
# full-length slice against the pattern.
ex_text = "a" * 1_000_000
ex_pattern = "a" * 500_000

matching_comparassion(text=ex_text, pattern=ex_pattern)

Naive algorithm: 17.531678199768066s
FA: 1.0810279846191406s
KMP: 0.34475040435791016s


In [76]:
# Read the whole file inside a context manager so the handle is closed
# as soon as the text is loaded.
with open("ustawa.txt", 'r', encoding="utf8") as ustawa_file:
    ustawa_text = ustawa_file.read()
ustawa_pattern = "art"

matching_comparassion(ustawa_text, ustawa_pattern, showresult = True)

Naive algorithm: 0.029604434967041016s
Naive algorithm: !-273 found-! [1156, 1505, 4692, 4734, 4879, 5082, 5148, 5949, 6039, 7266, 7511, 7781, 8044, 8299, 9104, 9959, 10022, 10224, 11122, 11207, 11618, 13194, 15284, 15358, 16092, 16261, 16406, 16547, 16616, 16840, 16856, 23637, 24061, 24152, 24586, 24683, 24780, 24931, 25530, 25689, 27001, 27288, 27479, 27542, 27592, 27857, 28373, 28558, 28766, 30964, 31021, 31096, 31362, 31811, 32609, 32968, 33053, 33268, 33595, 34651, 34737, 35511, 36155, 37143, 37543, 38451, 38595, 39056, 39210, 39436, 39568, 39980, 41152, 41829, 42028, 42198, 42371, 42504, 42718, 42896, 42941, 43447, 43555, 43787, 44590, 44653, 44953, 45010, 45293, 45401, 47319, 47422, 48785, 48820, 48906, 49052, 49259, 49316, 49488, 49559, 49915, 49979, 50102, 50160, 50702, 51050, 51179, 51966, 52071, 52272, 52552, 53008, 53032, 53211, 53788, 53931, 54078, 54137, 54770, 55075, 55279, 55465, 55807, 55991, 56827, 56911, 57164, 57549, 57800, 57932, 57989, 58280, 58378, 58874, 58966, 

In [78]:
# Same comparison as before, timings only.
matching_comparassion(text=ustawa_text, pattern=ustawa_pattern)

Naive algorithm: 0.034662485122680664s
FA: 0.030025243759155273s
KMP: 0.03327298164367676s


In [None]:
# Read the whole file inside a context manager so the handle is closed
# as soon as the text is loaded.
with open("wikipedia-tail-kruszwil.txt", 'r', encoding="utf8") as wiki_file:
    wiki_text = wiki_file.read()

In [81]:
# Number of occurrences of "kruszwil" reported by the automaton-based matcher.
print(len(fa_string_matching(text=wiki_text, pattern="kruszwil")))

13


In [79]:
# Time all three matchers on the Wikipedia excerpt.
matching_comparassion(text=wiki_text, pattern="kruszwil")

Naive algorithm: 36.314714193344116s
FA: 38.10478115081787s
KMP: 43.138630390167236s


In [85]:
def preprocess_time(pattern):
    """Print how long each preprocessing step takes for ``pattern``.

    Times ``transition_table`` (automaton matcher) and ``prefix_function``
    (KMP) separately; their return values are discarded.

    Args:
        pattern: String to preprocess.

    Returns:
        None. Timings, in seconds, are written to standard output.
    """
    # perf_counter() is the clock intended for measuring short intervals;
    # time.time() follows the wall clock and can jump if it is adjusted.
    start_time = time.perf_counter()
    transition_table(pattern)
    print(f"FA preprocessing time: {time.perf_counter() - start_time}")

    start_time = time.perf_counter()
    prefix_function(pattern)
    print(f"KMP preprocessing time: {time.perf_counter() - start_time}")

In [94]:
# Long pattern over an 11-character alphabet to make preprocessing cost visible.
preprocess_time(pattern="abcdefghijk" * 100_000)

FA preprocessing time: 2.7725579738616943
KMP preprocessing time: 0.2004375457763672
