# Trie and Suffix Tree lab

## 1. Przyjmij następujący zbiór danych wejściowych:

### a) bbb$

In [1]:
# Input (a): three "b" characters followed by the "$" end marker.
a = "bbb$"

### b) aabbabd

In [2]:
# Input (b): the final "d" occurs nowhere else in the string, so it acts as the end marker.
b = "aabbabd"

### c) ababcd

In [3]:
# Input (c): the final "d" occurs nowhere else in the string, so it acts as the end marker.
c = "ababcd"

### d) abcbccd

In [4]:
# Input (d): the final "d" occurs nowhere else in the string, so it acts as the end marker.
d = "abcbccd"

### e) załączony plik (1997_714_head.txt)

In [5]:
# Input (e): the whole contents of the attached file, terminated with "$".
# NOTE(review): no encoding is passed to open(), so the platform default is
# used — confirm the file's encoding and consider passing it explicitly.
with open("1997_714_head.txt") as source_file:
    e = source_file.read() + "$"

---
## 2. Upewnij się, że każdy łańcuch na końcu posiada unikalny znak (marker), a jeśli go nie ma, to dodaj ten znak.

Można zauważyć, że każdy łańcuch oprócz zawartości pliku `1997_714_head.txt` kończy się unikalnym znakiem (markerem): `$` w przypadku a) oraz `d` w przypadkach b)–d). Dlatego do tekstu wczytanego z tego pliku dopisałem na końcu marker `$`.

---
## 3. Zaimplementuj algorytm konstruujący strukturę trie, która przechowuje wszystkie sufiksy łańcucha danego na wejściu.

In [6]:
class TrieNode:
    """Single node of a suffix trie.

    Every edge to a child is labelled with exactly one character, which is
    the key under which the child is stored in ``children``.
    """

    def __init__(self, parent):
        self.parent = parent    # node one level up; None for the root
        self.children = dict()  # character -> child TrieNode


class Trie:
    """Uncompressed trie that stores every suffix of ``text``.

    The trie is built by inserting the suffixes one after another: for each
    suffix the longest prefix already present is located with ``find`` and
    the remaining characters are appended with ``graft``.
    """

    def __init__(self, text):
        self.root = TrieNode(None)
        for i in range(len(text)):
            suffix = text[i:]
            head, index = self.find(suffix)
            self.graft(head, suffix[index:])

    def find(self, text):
        """Walk down from the root along ``text`` as far as possible.

        Returns:
            A pair ``(node, idx)``: the deepest node reached and the number
            of leading characters of ``text`` that were matched.
        """
        current_node = self.root
        idx = 0
        while idx < len(text) and text[idx] in current_node.children:
            current_node = current_node.children[text[idx]]
            idx += 1
        return current_node, idx

    def graft(self, node, text):
        """Append ``text`` below ``node`` as a chain of new one-character nodes."""
        for c in text:
            new_node = TrieNode(node)
            node.children[c] = new_node
            node = new_node

    def pattern_search(self, pattern):
        """Return True when ``pattern`` occurs in the indexed text.

        A pattern occurs exactly when it can be read completely along a path
        from the root. The node where the path ends may be a leaf (the
        pattern is then a whole suffix), so no condition is placed on that
        node's children. The empty pattern always matches.
        """
        _, matched = self.find(pattern)
        return matched == len(pattern)

---
## 4. Zaimplementuj algorytm konstruujący drzewo sufiksów.

In [7]:
class SuffixTreeNode:
    """Node of a compressed suffix tree.

    The label of the edge leading into the node is the slice
    ``text[start:end + 1]`` of the indexed text (both bounds inclusive).
    """

    def __init__(self, start, end):
        self.start = start
        self.end = end
        self.children = dict()  # first character of a child's edge label -> child


class SuffixTree:
    """Compressed suffix tree built by inserting the suffixes one by one.

    No suffix links and no fast-find are used: every suffix is located from
    the root with ``find`` and its missing tail is attached with ``graft``.
    The text should end with a character that occurs nowhere else; without
    such a marker, suffixes that are prefixes of longer suffixes are kept
    only implicitly (inside an edge), which ``pattern_search`` still handles.
    """

    def __init__(self, text):
        # The root's start/end are never read as an edge label by this class.
        self.root = SuffixTreeNode(0, len(text) - 1)
        self.text = text
        # Every suffix is inserted, including the final one-character suffix;
        # otherwise the last character could not be found from the root.
        for i in range(len(text)):
            head, depth = self.find(text[i:])
            self.graft(head, depth, i)

    def find(self, text, depth=0, node=None):
        """Locate the longest prefix of ``text`` present below ``node``.

        When the match stops in the middle of an edge because of a differing
        character, that edge is split and the new inner node is returned.

        Args:
            text: the string to follow (a suffix of the indexed text).
            depth: number of characters already matched above ``node``.
            node: node to start from; the root when omitted.

        Returns:
            A pair ``(head, depth)``: the node below which the rest of
            ``text`` has to be attached and the total number of matched
            characters. When ``text`` is exhausted before any mismatch, no
            split is performed and the returned depth equals the initial
            ``depth`` plus ``len(text)``.
        """
        if node is None:
            node = self.root
        pos = 0  # characters of `text` consumed so far
        while pos < len(text):
            child = node.children.get(text[pos])
            if child is None:
                break
            edge_len = child.end - child.start + 1
            # The first character of the edge matched through the dict key.
            for offset in range(1, edge_len):
                if pos + offset == len(text):
                    # `text` ends inside this edge: it is already present.
                    return node, depth + pos + offset
                if self.text[child.start + offset] != text[pos + offset]:
                    # Split the edge just before the first differing character.
                    split_node = SuffixTreeNode(child.start, child.start + offset - 1)
                    child.start += offset
                    node.children[self.text[split_node.start]] = split_node
                    split_node.children[self.text[child.start]] = child
                    return split_node, depth + pos + offset
            pos += edge_len
            node = child
        return node, depth + pos

    def graft(self, head, depth, i):
        """Attach the unmatched tail of the suffix starting at index ``i``.

        The new leaf covers the text from position ``depth + i`` to the end.
        Nothing is attached when that position lies past the end of the
        text, i.e. when the whole suffix was already matched by ``find``.
        """
        start = depth + i
        if start >= len(self.text):
            return
        new_node = SuffixTreeNode(start, len(self.text) - 1)
        head.children[self.text[start]] = new_node

    def pattern_search(self, pattern, node=None):
        """Return True when ``pattern`` can be read downwards from ``node``.

        With the default ``node`` (the root) this answers whether ``pattern``
        occurs in the indexed text. The empty pattern always matches.
        """
        if node is None:
            node = self.root
        pos = 0  # characters of `pattern` matched so far
        while pos < len(pattern):
            child = node.children.get(pattern[pos])
            if child is None:
                return False
            edge_len = child.end - child.start + 1
            # Part of the pattern that falls on this edge; it may be shorter
            # than the edge when the pattern ends inside it.
            chunk = pattern[pos:pos + edge_len]
            if not self.text.startswith(chunk, child.start):
                return False
            pos += edge_len
            node = child
        return True

---
## 5. Upewnij się, że powstałe struktury danych są poprawne. Możesz np. sprawdzić, czy struktura zawiera jakiś ciąg znaków i porównać wyniki z algorytmem wyszukiwania wzorców.

### Test on a, b, c, d texts

In [8]:
texts = [a, b, c, d]
invalid_patterns = ["tghn", "rty", "xcqpl", "evc", "iop", "qwerty", "mnb", "tyu", "fql", "sdb", "s", "z"]

# Trie: none of the patterns above occurs in any of the texts, so every
# reported match counts as an error.
errors = 0
for trie in map(Trie, texts):
    errors += sum(1 for inv_pat in invalid_patterns if trie.pattern_search(inv_pat))

print(f"Number of errors in Trie search is {errors}.")

# Suffix Tree: the same check against the compressed structure.
errors = 0
for suffix_tree in map(SuffixTree, texts):
    errors += sum(1 for inv_pat in invalid_patterns if suffix_tree.pattern_search(inv_pat))

print(f"Number of errors in Suffix Tree search is {errors}.")

Number of errors in Trie search is 0.
Number of errors in Suffix Tree search is 0.


### Test on e text

In [9]:
# Patterns assumed to be absent from the document; each reported match is
# counted as an error.
invalid_patterns = ["fsadfasbdasfhbgsddg", "hjkghljkh", "3124,a", "[]659", "178vhja", "i9bjweoi"]

trie = Trie(e)
errors = sum(1 for inv_pat in invalid_patterns if trie.pattern_search(inv_pat))
print(f"Number of errors in Trie search is {errors}.")

suffix_tree = SuffixTree(e)
errors = sum(1 for inv_pat in invalid_patterns if suffix_tree.pattern_search(inv_pat))
print(f"Number of errors in Suffix Tree search is {errors}.")

Number of errors in Trie search is 0.
Number of errors in Suffix Tree search is 0.


### Single tests on each text

__Trie__

In [10]:
# One pattern per input that is expected to be found; the printed value is
# the result of the search on a freshly built trie.
trie_checks = (
    ("a", a, "bb"),
    ("b", b, "abb"),
    ("c", c, "aba"),
    ("d", d, "abc"),
    ("e", e, "dochodowego"),
)
for label, sample, needle in trie_checks:
    print(f"Test on Trie on {label} text. Is it correct? {Trie(sample).pattern_search(needle)}")

Test on Trie on a text. Is it correct? True
Test on Trie on b text. Is it correct? True
Test on Trie on c text. Is it correct? True
Test on Trie on d text. Is it correct? True
Test on Trie on e text. Is it correct? True


__Suffix Tree__

In [11]:
# One pattern per input that is expected to be found; the printed value is
# the result of the search on a freshly built suffix tree.
suffix_tree_checks = (
    ("a", a, "bb"),
    ("b", b, "abb"),
    ("c", c, "aba"),
    ("d", d, "abc"),
    ("e", e, "dochodowego"),
)
for label, sample, needle in suffix_tree_checks:
    print(f"Test on Suffix Tree on {label} text. Is it correct? {SuffixTree(sample).pattern_search(needle)}")

Test on Suffix Tree on a text. Is it correct? True
Test on Suffix Tree on b text. Is it correct? True
Test on Suffix Tree on c text. Is it correct? True
Test on Suffix Tree on d text. Is it correct? True
Test on Suffix Tree on e text. Is it correct? True


---
## 6. Porównaj szybkość działania algorytmów konstruujących struktury danych dla danych z p. 1 w następujących wariantach:
 - __Trie - czas budowy O(n^2), rozmiar drzewa O(n^2), n - długość tekstu (można wykorzystać fragment załączonego tekstu),__
 - __Drzewo sufiksów bez wykorzystania procedury fast_find oraz elementów "link" - czas budowy O(n^2), rozmiar drzewa O(n) (w trakcie tworzenia drzewa rozmiar ten nie może być większy).__

In [12]:
import pandas as pd

### Trie test

In [13]:
# Each timed statement builds a Trie for one input AND runs a single pattern
# search on it, so the figures cover construction plus one lookup.
# `-o` makes %timeit return its result object, which is stored for later use.
trie_a_time = %timeit -o Trie(a).pattern_search('bb')
trie_b_time = %timeit -o Trie(b).pattern_search('abb')
trie_c_time = %timeit -o Trie(c).pattern_search('aba')
trie_d_time = %timeit -o Trie(d).pattern_search('abc')
trie_e_time = %timeit -o Trie(e).pattern_search('dochodowego')

7.45 µs ± 260 ns per loop (mean ± std. dev. of 7 runs, 100,000 loops each)
17 µs ± 595 ns per loop (mean ± std. dev. of 7 runs, 100,000 loops each)
13.5 µs ± 369 ns per loop (mean ± std. dev. of 7 runs, 100,000 loops each)
17.1 µs ± 765 ns per loop (mean ± std. dev. of 7 runs, 100,000 loops each)
1.35 s ± 49.2 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [14]:
# Average run time of each Trie measurement, in input order a..e.
trie_time = [
    result.average
    for result in (trie_a_time, trie_b_time, trie_c_time, trie_d_time, trie_e_time)
]

### Suffix Tree test

In [15]:
# Each timed statement builds a SuffixTree for one input AND runs a single
# pattern search on it, so the figures cover construction plus one lookup.
# `-o` makes %timeit return its result object, which is stored for later use.
suffix_a_time = %timeit -o SuffixTree(a).pattern_search('bb')
suffix_b_time = %timeit -o SuffixTree(b).pattern_search('abb')
suffix_c_time = %timeit -o SuffixTree(c).pattern_search('aba')
suffix_d_time = %timeit -o SuffixTree(d).pattern_search('abc')
suffix_e_time = %timeit -o SuffixTree(e).pattern_search('dochodowego')

6.79 µs ± 167 ns per loop (mean ± std. dev. of 7 runs, 100,000 loops each)
12.1 µs ± 230 ns per loop (mean ± std. dev. of 7 runs, 100,000 loops each)
8.97 µs ± 343 ns per loop (mean ± std. dev. of 7 runs, 100,000 loops each)
9.89 µs ± 518 ns per loop (mean ± std. dev. of 7 runs, 100,000 loops each)
17.7 ms ± 635 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [16]:
# Average run time of each Suffix Tree measurement, in input order a..e.
suffix_time = [
    result.average
    for result in (suffix_a_time, suffix_b_time, suffix_c_time, suffix_d_time, suffix_e_time)
]

### Time comparison

In [17]:
def which_faster(row):
    """Return the label of the quicker measurement stored in ``row``.

    ``row`` is indexed by "Trie test" and "Suffix Tree test". The Trie label
    is returned when its value is lower or equal; otherwise the Suffix Tree
    label is returned.
    """
    trie_wins = row["Trie test"] <= row["Suffix Tree test"]
    return "Trie test" if trie_wins else "Suffix Tree test"

In [18]:
# One row per input (a..e): both timings, their difference (Trie minus
# Suffix Tree) and the label of the quicker structure.
data = {"Trie test": trie_time, "Suffix Tree test": suffix_time}
df = pd.DataFrame(data)
df = df.assign(Difference=df["Trie test"] - df["Suffix Tree test"])
df = df.assign(**{"Faster structure": df.apply(which_faster, axis=1)})

In [19]:
# Bare last expression: the notebook renders the comparison table.
df

Unnamed: 0,Trie test,Suffix Tree test,Difference,Faster structure
0,7e-06,7e-06,6.533942e-07,Suffix Tree test
1,1.7e-05,1.2e-05,4.884482e-06,Suffix Tree test
2,1.3e-05,9e-06,4.524646e-06,Suffix Tree test
3,1.7e-05,1e-05,7.235248e-06,Suffix Tree test
4,1.349214,0.017708,1.331506,Suffix Tree test
