## Algorytmy Tekstowe - laboratorium 6

In [55]:
import numpy as np

def counting_sort(values, key=lambda v: v):
    """Stable counting sort of *values* by a single-character key.

    Args:
        values: Iterable of items to sort.
        key: Callable mapping an item to a one-character string; items are
            ordered by the code point (``ord``) of that character.

    Returns:
        A new list holding the items in ascending key order. Items whose
        keys are equal keep their relative input order.
    """
    items = list(values)
    # Evaluate the key exactly once per item.
    codes = [ord(key(item)) for item in items]
    if not codes:
        return []

    # Size the histogram from the code points actually present instead of
    # assuming 7-bit ASCII, so any character can be used as a key.
    lowest = min(codes)
    offsets = [0] * (max(codes) - lowest + 1)
    for code in codes:
        offsets[code - lowest] += 1

    # Turn the counts into exclusive prefix sums: offsets[c] becomes the
    # output slot of the first item whose key is c.
    running = 0
    for slot, count in enumerate(offsets):
        offsets[slot] = running
        running += count

    final_list = [None] * len(items)
    for item, code in zip(items, codes):
        final_list[offsets[code - lowest]] = item
        offsets[code - lowest] += 1

    return final_list

In [50]:
def sort_rename(sequence, sorting=sorted):
    """Replace every element of *sequence* by the rank of its value.

    Args:
        sequence: Elements to rename.
        sorting: Callable with the signature ``sorting(items, key=...)``
            that returns the items ordered by key.

    Returns:
        A pair ``(names, first)``: ``names[i]`` is the rank (0-based, equal
        elements share a rank) of ``sequence[i]``, and ``first[rank]`` is the
        position of the first element with that rank in the sorted order.
    """
    tagged = [(item, pos) for pos, item in enumerate(sequence)]
    names = [None] * len(sequence)
    first = {}

    rank = 0
    previous = None
    for count, (item, pos) in enumerate(sorting(tagged, key=lambda pair: pair[0])):
        if count == 0:
            # The very first sorted element opens rank 0.
            first[0] = pos
        elif previous != item:
            # A new value starts here, so it gets the next rank.
            rank += 1
            first[rank] = pos
        names[pos] = rank
        previous = item

    return names, first

In [51]:
# Example: rank a list of pairs; the equal pairs at positions 0 and 5 receive
# the same name.
sort_rename([(1,2),(3,1),(2,2),(1,1),(2,3),(1,2)])

([1, 4, 2, 0, 3, 1], {0: 3, 1: 0, 2: 2, 3: 4, 4: 1})

In [52]:
def unique_char(text):
    """Return the first character, counting up from ``'!'``, not in *text*."""
    code = ord('!')
    while True:
        candidate = chr(code)
        if candidate not in text:
            return candidate
        code += 1

In [69]:
import math

def kmr(text):
    """Name the substrings of *text* whose length is a power of two.

    The text is first padded, with a character that does not occur in it,
    up to the next power of two. Single characters are then ranked with
    ``counting_sort``; names for length ``2k`` are obtained by ranking the
    pairs (name of the first half, name of the second half) of length ``k``.

    Args:
        text: Non-empty string to process.

    Returns:
        A pair ``(names, entries)`` of dicts keyed by substring length.
        ``names[k][j]`` is the name of the length-``k`` substring starting at
        position ``j`` of the padded text; ``entries[k]`` maps each name to
        one position where it occurs, as reported by ``sort_rename``.
    """
    length = len(text)
    factor = math.floor(math.log2(length))
    if 2**factor < length:
        # Not a power of two yet: pad with a character unseen in the text.
        text += unique_char(text) * (2**(factor + 1) - length)

    ranks, leaders = sort_rename(text, counting_sort)
    names = {1: ranks}
    entries = {1: leaders}

    for exponent in range(factor - 1):
        half = 2**exponent
        previous = names[half]
        # Pair each name with the name starting `half` positions later;
        # zip stops where the second half would run past the end.
        pairs = list(zip(previous, previous[half:]))
        names[2 * half], entries[2 * half] = sort_rename(pairs)

    return names, entries

In [70]:
# Example: the names assigned to the length-2 substrings of the (padded) input.
kmr("gh$essaghudndhg")[0][2]

[6, 7, 0, 4, 12, 11, 1, 6, 9, 13, 3, 10, 2, 8, 5]

In [75]:
def dbf_search(text, pattern):
    """Find all occurrences of *pattern* in *text* using ``kmr`` names.

    The names are computed for ``pattern + separator + text``, where the
    separator occurs in neither string. With ``t`` the largest power of two
    not exceeding ``len(pattern)``, a text position matches when the
    length-``t`` substrings covering the start and the end of the candidate
    carry the same names as the corresponding substrings of the pattern.

    Args:
        text: String to search in.
        pattern: Non-empty string to search for.

    Returns:
        List of 0-based start positions of the matches in *text*, ascending.

    Raises:
        ValueError: If *pattern* is empty.
    """
    patt_len = len(pattern)
    text_len = len(text)
    if patt_len == 0:
        raise ValueError("pattern must not be empty")
    if patt_len > text_len:
        # No match is possible, and kmr may not even produce names for the
        # required length when the combined string is this short.
        return []

    unique = unique_char(pattern + text)
    # Largest power of two <= patt_len, computed exactly on integers.
    t = 1 << (patt_len.bit_length() - 1)
    dbf = kmr(pattern + unique + text)[0][t]

    # Offset of the second length-t window; it is 0 when t == patt_len, in
    # which case both comparisons below coincide.
    shift = patt_len - t
    # Index of text[0] inside the combined string.
    text_start = patt_len + 1

    return [
        i - text_start
        for i in range(text_start, text_len + 2)
        if dbf[0] == dbf[i] and dbf[shift] == dbf[i + shift]
    ]

In [None]:
class SuffNode:
    """Node of a suffix tree whose incoming edge is labelled by a text slice.

    Attributes:
        start, stop: Inclusive bounds, within ``tree.text``, of the label on
            the edge leading into this node (the root uses ``0, -1``, i.e.
            an empty label).
        tree: Owning tree; provides ``text``, ``text_length`` and ``root``.
        link: Suffix link, or ``None`` while it has not been computed.
        depth: Length of the string spelled from the root to this node.
        children: Child nodes keyed by the first character of their label.
        parent: Parent node, or ``None`` if it has not been attached.
    """

    def __init__(self, tree, start, stop):
        self.start = start
        self.stop = stop
        self.tree = tree
        self.link = None
        self.depth = 0
        self.children = {}
        self.parent = None

    def length(self):
        """Return the number of characters in this node's edge label."""
        return self.stop - self.start + 1

    def label(self):
        """Return the edge label as a string."""
        return self.tree.text[self.start:self.stop + 1]

    def letter(self, i):
        """Return the *i*-th character (0-based) of the edge label."""
        return self.tree.text[self.start + i]

    def child(self, ch):
        """Return the child whose label starts with *ch*, or ``None``."""
        return self.children.get(ch)

    def add_link(self):
        """Compute and store this node's suffix link.

        The target is the node at depth ``self.depth - 1`` reached by
        starting from the parent's suffix link (or from the root when the
        parent is the root) and following the characters of this node's own
        label. If that position lies inside an edge, the edge is split with
        ``break_path`` and the new node becomes the target.
        """
        d = self.depth
        if self.parent is self.tree.root:
            v = self.parent
        else:
            if self.parent.link is None:
                self.parent.add_link()
            v = self.parent.link

        # Descend one whole edge at a time; the character that selects the
        # next edge is read from this node's own label.
        while v.depth < d - 1:
            v = v.child(self.tree.text[self.start + v.depth - self.parent.depth + 1])
        if v.depth > d - 1:
            # Overshot: the target lies inside the last edge, so split it.
            v = v.break_path(d - v.parent.depth - 1)
        self.link = v

    def break_path(self, depth):
        """Split the incoming edge after *depth* characters.

        A new node takes the first *depth* characters of the label and is
        inserted between this node and its parent; this node keeps the rest.

        Returns:
            The newly created intermediate node.
        """
        new_node = SuffNode(self.tree, self.start, self.start + depth - 1)
        self.start += depth

        # The new node replaces this one under the old parent ...
        self.parent.children[self.tree.text[new_node.start]] = new_node
        new_node.parent = self.parent

        # ... and this node hangs below it, keyed by its shortened label.
        new_node.children[self.letter(0)] = self
        self.parent = new_node

        new_node.depth = new_node.parent.depth + depth

        return new_node

    def graft(self, start):
        """Attach a child labelled ``text[start:]`` and return it.

        The new node's ``depth`` is left at its default; the caller is
        expected to set it.
        """
        new_node = SuffNode(self.tree, start, self.tree.text_length - 1)

        new_node.parent = self
        self.children[self.tree.text[start]] = new_node

        return new_node
                

class SuffTree:
    """Suffix tree over ``text`` built from ``SuffNode`` objects.

    The constructor only inserts the leaf for the whole text; call
    ``mc_creight`` to insert the remaining suffixes.
    """

    def __init__(self, text):
        """Create the root and a single leaf labelled with the whole text."""
        self.text = text
        self.text_length = len(text)
        # The root has an empty label (stop < start) and links to itself.
        self.root = SuffNode(self, 0, -1)
        self.root.link = self.root
        
        child = SuffNode(self, 0, self.text_length-1)
        child.parent = self.root
        child.depth = self.text_length
        self.root.children[text[0]] = child
            
    def mc_creight(self):
        """Insert the suffixes ``text[1:]``, ``text[2:]``, ... into the tree.

        For each suffix the walk starts at the current node, descends while
        the characters match (splitting an edge at the first mismatch inside
        it), grafts a new leaf there, and then moves to that node's suffix
        link, creating the link with ``add_link`` when it is missing.

        NOTE(review): ``self.text[i + depth]`` is read without a bounds
        check; presumably the text is expected to end with a character that
        occurs nowhere else in it -- confirm.
        """
        node = self.root
        self.root.link = self.root
        self.root.depth = 0
        for i in range(1, self.text_length):
            # Characters up to node.depth are already known to match.
            depth = node.depth
            while node.child(self.text[i + depth]):
                node = node.child(self.text[i + depth])
                # The first character of the edge matched via the child lookup.
                depth += 1
                node_depth = 1
                # Compare the rest of the edge label character by character.
                while node.start + node_depth <= node.stop and node.letter(node_depth) == self.text[i + depth]:
                    depth += 1
                    node_depth += 1
                # Mismatch inside the edge: split it at the mismatch position.
                if node.start + node_depth <= node.stop and node.letter(node_depth) != self.text[i + depth]:
                    node = node.break_path(node_depth)
                    break
            # The new leaf spells the unmatched remainder of suffix i.
            node.graft(i + depth).depth = self.text_length - i
            if node.link is None:
                node.add_link()
            node = node.link
    
    def find_subword(self, text):
        """Return True if *text* can be read along a path from the root."""
        u = self.root
        # Number of characters of u's edge label consumed so far.
        node_d = 0
        for c in text:
            if u.start + node_d <= u.stop:
                # Still inside the current edge label.
                if u.letter(node_d) == c:
                    node_d += 1
                else:
                    return False
            else:
                # Edge label exhausted: move on to the child starting with c.
                u = u.child(c)
                if u is None:
                    return False
                else:
                    node_d = 1
        return True

In [76]:
# Demo: print five characters of context starting at each reported match of
# "the" in the sample sentence.
wiki = "It predates and is significantly smaller than either the Grafton Centre or the Grand Arcade."
for start in dbf_search(wiki, "the"):
    print(wiki[start:start+5])

ther 
the G
the G
