## Laboratorium 5

In [56]:
import numpy as np
import pandas as pd
from PIL import Image
from time import perf_counter
from random import randint


Zadanie 1 - Zaimplementuj algorytm wyszukiwania wzorca 2-wymiarowego 

In [9]:
def get_diff_columns(pattern):
    columns = []
    indexes = []
    alphabet = set()
    for i in range(len(pattern[0])):
        col = []
        for j in range(len(pattern)):
            col.append(pattern[j][i])
            alphabet.add(pattern[j][i])

        if col in columns:
            idx = columns.index(col)
            indexes.append(idx)
        else:
            columns.append(col)
            indexes.append(len(columns) - 1)
    return columns, indexes, alphabet



In [10]:
def vertical_automaton(columns, letters):
    transition_table = [{}]
    word_sequences = [[]]
    vertical_states = [0] * len(columns)

    for i in range(len(columns[0])):
        for j in range(len(columns)):
            if columns[j][i] in transition_table[vertical_states[j]]:
                vertical_states[j] = transition_table[vertical_states[j]][columns[j][i]]
            else:
                transition_table[vertical_states[j]][columns[j][i]] = len(transition_table)
                word_sequences.append(word_sequences[vertical_states[j]] + [columns[j][i]])
                vertical_states[j] = len(transition_table)
                transition_table.append({})

    for i in range(len(transition_table)):
        for l in letters:
            if l not in transition_table[i]:
                suffix = (word_sequences[i] + [l])[1:]
                state = 0
                for s in suffix:
                    if s in transition_table[state]:
                        state = transition_table[state][s]
                    else:
                        state = 0

                transition_table[i][l] = state
    return transition_table, vertical_states


In [11]:
def horizontal_automaton(pattern, letters):
    result = []
    for state in range(len(pattern) + 1):
        result.append({})
        for l in letters:
            next_state = min(len(pattern), state + 1)
            while True:
                if pattern[:next_state] == (pattern[:state] + [l])[state - next_state + 1:state + 1]:
                    break
                next_state -= 1
            result[state][l] = next_state
    return result


In [12]:
def main_automaton(pattern):
    columns, indexes, letters = get_diff_columns(pattern)
    vertical_transition_table, vertical_states = vertical_automaton(columns, letters)

    new_pattern = [vertical_states[indexes[i]] for i in range(len(indexes))]
    horizontal_transition_table = horizontal_automaton(new_pattern, vertical_states)
    final_horizontal_state = len(horizontal_transition_table) - 1
    return vertical_transition_table, horizontal_transition_table, final_horizontal_state


In [13]:
def pattern_matching_2d(text, pattern, automaton=None):
    if automaton is None:
        vertical_transition_table, horizontal_transition_table, final_horizontal_state = main_automaton(pattern)
    else:
        vertical_transition_table, horizontal_transition_table, final_horizontal_state = automaton

    result = []
    vertical_states = []
    for i in range(len(text)):
        if len(text[i]) < len(vertical_states):
            vertical_states = vertical_states[:len(text[i])]
        elif len(vertical_states) < len(text[i]):
            vertical_states = vertical_states + [0] * (len(text[i]) - len(vertical_states))

        new_horizontal_state = 0
        for j in range(len(text[i])):
            if text[i][j] in vertical_transition_table[vertical_states[j]]:
                vertical_states[j] = vertical_transition_table[vertical_states[j]][text[i][j]]
            else:
                vertical_states[j] = 0
            if vertical_states[j] in horizontal_transition_table[new_horizontal_state]:
                new_horizontal_state = horizontal_transition_table[new_horizontal_state][vertical_states[j]]
                if new_horizontal_state == final_horizontal_state:
                    result.append((i - len(pattern) + 1, j - len(pattern[0]) + 1))
            else:
                new_horizontal_state = 0
    return result

Zadanie 2 - Znajdź w załączonym pliku "haystack.txt" wszyskie sytuacje, gdy taka sama litera występuje na tej samej pozycji w dwóch kolejnych linijkach. Zwróć uwagę, na nierówną długość linii w pliku. 

In [14]:
with open("haystack.txt") as f:
    text = f.readlines()



for i in range(ord("a"), ord("z") + 1):
    pattern = [chr(i), chr(i)]
    result = pattern_matching_2d(text, pattern)
    print("pattern:\n{}".format(pattern))
    print("found indexes:\n{}".format(result))
    print("------------------------------------")



pattern:
['a', 'a']
found indexes:
[(0, 82), (3, 30), (5, 60), (6, 63), (20, 6), (28, 69), (31, 50), (31, 73), (33, 66), (37, 4), (52, 12), (53, 12), (53, 48), (56, 11), (57, 36), (58, 36), (59, 24), (64, 2), (64, 14), (64, 22), (65, 35), (69, 35), (76, 21), (76, 74), (77, 42), (77, 61), (78, 59), (79, 37)]
------------------------------------
pattern:
['b', 'b']
found indexes:
[]
------------------------------------
pattern:
['c', 'c']
found indexes:
[(3, 54), (10, 45), (13, 10), (41, 0), (68, 0), (82, 41)]
------------------------------------
pattern:
['d', 'd']
found indexes:
[(37, 19)]
------------------------------------
pattern:
['e', 'e']
found indexes:
[(0, 63), (1, 8), (4, 77), (7, 65), (10, 1), (10, 64), (14, 2), (15, 43), (17, 6), (18, 27), (20, 10), (21, 61), (22, 53), (24, 3), (24, 65), (28, 67), (28, 73), (29, 38), (29, 43), (37, 48), (40, 11), (40, 26), (41, 57), (42, 36), (42, 48), (46, 52), (47, 50), (51, 31), (57, 54), (58, 50), (58, 54), (59, 73), (63, 66), (65, 69),

Zadanie 3 - Znajdź wszystkie wystąpienia "th" oraz "t h" w dwóch kolejnych liniach na tej samej pozycji. 

In [15]:
pattern = ["th", "th"]
result = pattern_matching_2d(text, pattern)
result

[]

In [16]:
pattern = ["t h", "t h"]
result = pattern_matching_2d(text, pattern)
result

[(37, 0)]

Zadanie 4 - Wybierz przynajmniej 4 litery (małe). Znajdź wszystkie wystąpienia tej litery w załączonym pliku "haystack.png" 

In [52]:
def change_image(file_name):
    image = Image.open(file_name)
    pixels = list(image.getdata())
    width, height = image.size
    text = []
    i = width
    for pixel in pixels:
        if i == width:
            i = 0
            text.append([])
        text[-1].append(pixel[0])
        i += 1
    return text

In [20]:
text = change_image("haystack.png")

In [54]:
a = change_image("a.png")
a

[[255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255],
 [255, 253, 177, 74, 25, 8, 45, 149, 255, 255, 255],
 [255, 203, 0, 0, 0, 0, 0, 0, 129, 255, 255],
 [255, 206, 87, 193, 236, 241, 186, 35, 10, 242, 255],
 [255, 255, 255, 255, 255, 255, 255, 175, 0, 188, 255],
 [255, 255, 159, 60, 16, 1, 0, 0, 0, 162, 255],
 [255, 138, 0, 0, 0, 0, 0, 0, 0, 155, 255],
 [255, 38, 23, 189, 238, 252, 255, 177, 0, 155, 255],
 [255, 38, 27, 201, 249, 238, 178, 31, 0, 155, 255],
 [255, 120, 0, 0, 0, 0, 0, 89, 0, 155, 255],
 [255, 249, 123, 31, 7, 41, 152, 199, 0, 155, 255],
 [255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255]]

In [39]:
a_matched = pattern_matching_2d(text, a)
print("Occure number: {}.".format(len(a_matched)))

Occure number: 356.


Zadanie 5 - Znajdź wszystkie wystąpienia słowa "p a t t e r n" w haystack.png. 

In [55]:
pattern = change_image("pattern.png")
pattern_matched = pattern_matching_2d(text, pattern)
print("number of occurrences: {}.".format(len(pattern_matched)))

number of occurrences: 5.


Zadanie 6 - Porównaj czas budowania automatu i czas wyszukiwania dla różnych rozmiarów wzorca 

In [57]:
with open("haystack.txt") as f:
    text = f.readlines()


def building_times(text_size):
    building_times = []
    for i in text_size:
        pattern = [[chr(randint(ord('a'), ord('z'))) for _ in range(i)] for _ in range(i)]
        start = perf_counter()
        main_automaton(pattern)
        end = perf_counter()
        building_times += [i, end - start]
    df = pd.DataFrame(data={"text size": building_times[::2],
                            "building time [s]": building_times[1::2]})
    return df



In [58]:
text_size = [i for i in range(10, 260, 20)]
df_1 = building_times(text_size)
df_1

Unnamed: 0,text size,building time [s]
0,10,0.003131
1,30,0.039665
2,50,0.146184
3,70,0.36155
4,90,0.872316
5,110,1.423849
6,130,2.38134
7,150,3.856218
8,170,5.824604
9,190,8.05597


In [59]:
def searching_times(text, text_size, path_size):
    pattern = [line[:path_size] for line in text[:path_size]]
    automaton = main_automaton(pattern)
    searching_times = []

    for i in text_size:
        text = [line[:i] for line in text[:i]]
        start = perf_counter()
        pattern_matching_2d(text, pattern, automaton)
        end = perf_counter()
        searching_times += [i, end - start]
    df = pd.DataFrame(data={"text size": searching_times[::2],
                            "searching time [s]": searching_times[1::2]})
    return df

In [60]:
text = change_image("haystack.png")
text_size = [i for i in range(500, 10001, 500)]
path_size = 25
df_2 = searching_times(text, text_size, path_size)
df_2

Unnamed: 0,text size,searching time [s]
0,500,0.054548
1,1000,0.046357
2,1500,0.04634
3,2000,0.045981
4,2500,0.046427
5,3000,0.044423
6,3500,0.044671
7,4000,0.048007
8,4500,0.046629
9,5000,0.047322


Zadanie 7 - Podziel plik na 2, 4 i 8 fragmentów (w poziomie) i porównaj czas przeszukiwania 

In [61]:
def divide_and_measure(text, path_size):
    pattern = [line[:path_size] for line in text[:path_size]]
    result = []

    for div in [2, 4, 8]:
        length = len(text) // div
        intervals = [text[i * length:(i + 1) * length] for i in range(div)]
        start = perf_counter()
        for i in intervals:
            pattern_matching_2d(i, pattern)
        end = perf_counter()
        result += [div, end - start]
    df = pd.DataFrame(data={"part": result[::2],
                            "time [s]": result[1::2]})
    return df

In [62]:
path_size = 25
df_3 = divide_and_measure(text, path_size)
df_3

Unnamed: 0,part,time [s]
0,2,0.39456
1,4,0.332493
2,8,0.334964
