# Implementation of Trie 

In [1]:
class Node:
    """One node of an uncompressed trie; each node carries a single character."""

    def __init__(self, depth, parent, char):
        self.depth    = depth     # depth given by the creator; graft() uses parent depth + 1
        self.parent   = parent    # node this one hangs under (None is passed for a root)
        self.v        = char      # character stored in this node
        self.children = []        # child nodes in insertion order

    ## Looks for a child with v == char; returns it, or None when there is none.
    def find_child(self, char):
        for child in self.children:
            if child.v == char:
                return child

        return None

    def search(self, pattern):
        """Return True when `pattern` can be spelled by walking down from this node.

        An empty pattern always matches. The walk is iterative so that long
        patterns neither hit the interpreter's recursion limit nor get copied
        once per character.
        """
        node = self
        for char in pattern:
            node = node.find_child(char)
            if node is None:
                return False

        return True

    def graft(self, suffix_end):
        """Append `suffix_end` below this node as a chain of new nodes.

        One node is created per character, each the only new child of the
        previous one. Returns the last node created, or this node itself when
        `suffix_end` is empty.
        """
        parent        = self
        running_depth = self.depth + 1

        for char in suffix_end:
            child = Node(running_depth, parent, char)
            parent.children.append(child)
            running_depth += 1
            parent = child

        return parent

In [2]:
class Trie:
    """Trie that stores every suffix of each inserted text.

    When a marker is given, it is appended to every text before insertion.
    """

    def __init__(self, marker = None):
        self.root = Node(0, None, None)
        self.marker = marker

    def search_pattern(self, pattern):
        """Return True when `pattern` can be read starting from the root."""
        return self.root.search(pattern)

    def insert(self, text):
        """Insert all suffixes of `text` (with the marker appended, if one is set)."""
        if self.marker:
            text += self.marker

        for start in range(len(text)):
            suffix  = text[start:]
            deepest = self.find(suffix)
            # deepest.depth characters of the suffix already exist; add the rest.
            deepest.graft(suffix[deepest.depth:])

    def find(self, suffix):
        """Return the deepest existing node reached by following `suffix` from the root."""
        node = self.root

        for char in suffix:
            following = node.find_child(char)
            if following is None:
                return node
            node = following

        return node

In [3]:
def ensure_marker(text):
    """Return the end marker that `text` needs, or None when it needs none.

    No marker is needed when the last character of `text` occurs nowhere else
    in it. Otherwise the result is the first character, counting up from '$',
    that does not occur in `text`. An empty text has no last character, so it
    is treated as needing a marker (previously this raised IndexError).
    """
    if text and text[-1] not in text[:-1]:
        return None

    code_point = ord('$')
    while chr(code_point) in text:
        code_point += 1

    return chr(code_point)

def build_tree_schema(text, insert = True):
    """Create a Trie whose marker is chosen for `text`.

    The text itself is inserted only when `insert` is truthy; otherwise the
    trie is returned empty.
    """
    trie = Trie(ensure_marker(text))
    if not insert:
        return trie

    trie.insert(text)
    return trie

In [4]:
# Smoke check: build a trie for a short text and look up one of its substrings.
text = "aabbbd"
trie = build_tree_schema(text)
trie.search_pattern("bd")

True

## Testing trie

I test the implemented trie by checking whether I can
find all subwords of the inserted words.

Building the trie for the whole text of the file took too long, so instead I split the text into single words and inserted those.
For a subset of these words I generated all subwords and checked that they are in the structure.

In [5]:
# Words used by the quick tests; each ends with a character that occurs nowhere else in it.
simple_tests = ['bbb$', 'aabbabd', 'ababcd', 'abcbccd']

def substring_generator(word):
    """Yield every non-empty substring word[i:j] of `word`.

    Substrings are produced by increasing start index and, for a given start,
    by increasing length. Both loop bounds include the last character, so the
    suffixes of `word` are generated too (they were skipped before).
    """
    n = len(word)
    for i in range(n):
        for j in range(i + 1, n + 1):
            yield word[i:j]

## Tests a word by generating its subwords and checking that each one can be found in the generated trie.
def test_simple_trie(word):
    """Build a trie for `word` and look up every generated subword in it.

    Prints a failure report for the first subword that is missing, or a pass
    line (left open with end=' ' so a timing can follow) when none is.
    """
    trie = build_tree_schema(word)
    for subword in substring_generator(word):
        if trie.search_pattern(subword):
            continue
        print(u'\u2718', f"'{word}'", 'failed.')
        print(subword, 'not found in trie.')
        break
    else:
        print(u'\u2713', f"'{word}'", 'passed with', end = ' ')


## Building a trie for the whole text takes too long, so
## to measure it somehow I insert individual words, which
## lets me search for particular words and their subwords.
def test_file_trie(filename = '1997_714.txt'):
    """Insert every word of a file into one trie and verify a sample of them.

    The marker is chosen from the whole file content, but the words are
    inserted one by one. Every 100th word is kept, and all generated subwords
    of the kept words must be found in the trie. Prints a failure report for
    the first missing subword, otherwise a pass summary.
    """
    with open(filename, 'r', encoding = 'UTF-8') as source:
        trie = build_tree_schema(source.read(), insert = False)

    selected = []
    i = 0
    with open(filename, 'r', encoding = 'UTF-8') as source:
        for line in source.readlines():
            for word in line.split():
                trie.insert(word)
                i += 1
                if i % 100 == 0:
                    selected.append(word)

    for word in selected:
        for subword in substring_generator(word):
            if trie.search_pattern(subword):
                continue
            print(u'\u2718', f"'{word}'", 'failed.')
            print(subword, 'not found in trie.')
            return

    print(u'\u2713', f"'{filename}'", 'passed.', end = ' ')
    print(f'\n\tTested all subwords of selected {i // 100} words with ', end = ' ')


# Runs test_simple_trie on every word in simple_tests and then test_file_trie
# with its default file name. Each call is timed with the IPython `%time`
# magic, so this function only works inside IPython/Jupyter.
def test_suite_trie():
    for word in simple_tests:
        %time test_simple_trie(word)
        print()
    
    %time test_file_trie()
    print()

In [6]:
# Run the trie test suite; its printed results follow.
test_suite_trie()

✓ 'bbb$' passed with Wall time: 0 ns

✓ 'aabbabd' passed with Wall time: 0 ns

✓ 'ababcd' passed with Wall time: 0 ns

✓ 'abcbccd' passed with Wall time: 991 µs

✓ '1997_714.txt' passed. 
	Tested all subwords of selected 259 words with  Wall time: 1.32 s



# Implementation of suffix tree without using links and fast find

In [7]:
class SuffixTreeNode:
    """Node of a compressed suffix tree.

    The edge entering the node is labelled text[bounds[0]:bounds[1]]. The text
    is not stored in the node; methods that need it take it as an argument.
    """

    def __init__(self, depth, parent, bounds):
        self.depth    = depth     # graft() sets it to parent depth + label length
        self.parent   = parent    # node this one hangs under (None is passed for a root)
        self.bounds   = bounds    # (start, end) of the edge label, end exclusive
        self.children = []        # child nodes in insertion order

    def __str__(self):
        return f'Depth {self.depth}, bounds: ({self.bounds[0]}, {self.bounds[1]})' + f', Children: {str(self.children)}'

    def __repr__(self):
        return str(self)

    def label_length(self):
        """Return the number of characters on the edge entering this node."""
        return self.bounds[1] - self.bounds[0]

    ## Returns tuple (child, matching characters of this child)
    def find_child(self, text, pattern):
        """Find the child whose edge label starts with pattern[0].

        Returns (child, matching), where `matching` counts the leading
        characters of `pattern` that agree with the child's label (at least
        1), or (None, None) when no child starts with pattern[0].
        `pattern` must be non-empty whenever this node has children.
        """
        for child in self.children:
            start = child.bounds[0]
            if text[start] != pattern[0]:
                continue

            limit    = min(len(pattern), child.label_length())
            matching = 1
            while matching < limit and text[start + matching] == pattern[matching]:
                matching += 1
            return (child, matching)

        return (None, None)

    ## Where cut tells us how many characters are matching
    def break_me(self, cut):
        """Split the edge entering this node after its first `cut` characters.

        This node keeps the first `cut` characters of the label. A new child
        takes over the rest of the label, the old depth and the old children.
        Returns this node, which is now the upper part of the split.
        """
        split_at = self.bounds[0] + cut

        lower = SuffixTreeNode(self.depth, self, (split_at, self.bounds[1]))
        lower.children = self.children
        # The moved children now hang under the new node, so re-point them.
        for moved in lower.children:
            moved.parent = lower

        # label_length() must be read before the bounds are shortened.
        self.depth     = self.depth - self.label_length() + cut
        self.bounds    = (self.bounds[0], split_at)
        self.children  = [lower]

        return self

    def graft(self, bounds):
        """Attach and return a new child whose edge label is given by `bounds`."""
        depth = self.depth + (bounds[1] - bounds[0])
        node  = SuffixTreeNode(depth, self, bounds)

        self.children.append(node)

        return node

    def search(self, text, pattern):
        """Return True when `pattern` can be read downwards starting at this node.

        A pattern may end in the middle of an edge. A mismatch in the middle
        of an edge means the pattern is absent; previously the search carried
        on below that edge and could report patterns that do not occur.
        """
        node = self
        while len(pattern) > 0:
            child, matching = node.find_child(text, pattern)

            if child is None:
                return False
            if matching == len(pattern):
                return True
            if matching < child.label_length():
                # Mismatch before the end of this edge.
                return False

            pattern = pattern[matching:]
            node    = child

        return True

In [11]:
class SuffixTree:
    """Suffix tree container: a root node plus the text the edge bounds refer to."""

    def __init__(self, text):
        self.root = SuffixTreeNode(0, None, (0, 0))
        self.text = ensure_marker(text)

    def search_pattern(self, pattern):
        """Return True when `pattern` can be read starting from the root."""
        return self.root.search(self.text, pattern)

    def find(self, suffix):
        """Return the node below which the rest of `suffix` has to be grafted.

        Walks down from the root while whole edge labels match. If the suffix
        leaves an edge part-way, that edge is split at the mismatch and the
        upper part of the split is returned.
        """
        node      = self.root
        remaining = suffix

        while True:
            child, matched = node.find_child(self.text, remaining)
            if not child:
                return node

            edge_length = child.label_length()
            if matched < edge_length:
                return child.break_me(matched)
            if matched > edge_length:
                raise Exception("Sth went horribly wrong!")

            # The whole edge matched: drop it from the suffix and go deeper.
            remaining = remaining[matched:]
            node      = child

def ensure_marker(text):
    """Return `text` terminated by a character that occurs nowhere else in it.

    The text is returned unchanged when its last character is already unique.
    Otherwise the first character, counting up from '$', that does not occur
    in the text is appended. An empty text has no last character, so it gets a
    marker as well (previously this raised IndexError).

    NOTE: this has the same name as the helper defined earlier in the file,
    which returns only the marker (or None). Once this definition runs it
    replaces the earlier one.
    """
    if text and text[-1] not in text[:-1]:
        return text

    code_point = ord('$')
    while chr(code_point) in text:
        code_point += 1

    return text + chr(code_point)

def build_suffix_tree_schema(text, insert = True):
    """Build a suffix tree for `text` by inserting its suffixes one at a time.

    The suffixes are taken from the marker-terminated text stored in the tree,
    not from the raw argument. Edge bounds are resolved against the stored
    text, and the unique last character keeps any suffix from being a prefix
    of another. Without it, find() can run out of suffix on an internal node
    and fail (for example on "xaxbx").

    `insert` is kept only for signature parity with build_tree_schema; it is
    not used here.
    """
    suffix_tree = SuffixTree(text)
    text = suffix_tree.text
    end  = len(text)

    # The first suffix is the whole text and hangs directly off the root.
    suffix_tree.root.graft((0, end))

    for i in range(1, end):
        head = suffix_tree.find(text[i:])
        # head.depth characters of this suffix are already in the tree.
        head.graft((i + head.depth, end))

    return suffix_tree

In [12]:
# Smoke check: build a suffix tree for 'aabbabd' and look up one of its substrings.
tree = build_suffix_tree_schema(simple_tests[1])
tree.search_pattern('bba')

True

## Testing Suffix Tree
 - conducted in the same manner

In [13]:
def test_simple_suffix_tree(word):
    """Build a suffix tree for `word` and look up every generated subword in it.

    Prints a failure report for the first subword that is missing, or a pass
    line (left open with end=' ' so a timing can follow) when none is.
    """
    tree = build_suffix_tree_schema(word)
    for subword in substring_generator(word):
        if tree.search_pattern(subword):
            continue
        print(u'\u2718', f"'{word}'", 'failed.')
        print(subword, 'not found in trie.')
        break
    else:
        print(u'\u2713', f"'{word}'", 'passed with', end = ' ')

def test_file_suffix_tree(filename = '1997_714.txt'):
    """Build one suffix tree for a whole file and verify a sample of its words.

    Every 100th whitespace-separated word of the file is selected, and all
    generated subwords of the selected words must be found in the tree. Prints
    a failure report for the first missing subword, otherwise a pass summary.

    Previously this printed the pass mark without checking anything. The
    disabled draft of the check called a `tree.insert` method that the suffix
    tree does not have, and it is not needed: the tree is built from the whole
    content, so every word is already covered.
    """
    with open(filename, 'r', encoding = 'UTF-8') as file:
        content = file.read()

    tree = build_suffix_tree_schema(content)

    # Words number 100, 200, ... counting from 1.
    selected = content.split()[99::100]

    for word in selected:
        for subword in substring_generator(word):
            if not tree.search_pattern(subword):
                print(u'\u2718', "'{}'".format(word), 'failed.')
                print(subword, 'not found in suffix tree.')
                return

    print(u'\u2713', "'{}'".format(filename), 'passed.', end = ' ')
    print(f'\n\tTested all subwords of selected {len(selected)} words with ', end = ' ')


# Runs test_simple_suffix_tree on every word in simple_tests, timing each call
# with the IPython `%time` magic, so this function only works inside
# IPython/Jupyter. The file-based test is left disabled below.
def test_suite_tree():
    for word in simple_tests:
        %time test_simple_suffix_tree(word)
        print()
    
    # %time test_file_suffix_tree()
    # print()

# Run the suffix tree test suite; its printed results follow.
test_suite_tree()

✓ 'bbb$' passed with Wall time: 0 ns

✓ 'aabbabd' passed with Wall time: 999 µs

✓ 'ababcd' passed with Wall time: 0 ns

✓ 'abcbccd' passed with Wall time: 1 ms

