## Algorytmy Tekstowe - lab 7
#### Wyszukiwanie wyrażeń regularnych

In [1]:
import numpy as np
import string
import random

In [124]:
def parse_regex_with_plus(regex):
    """Rewrite every ``X+`` in ``regex`` as the equivalent ``XX*``.

    ``X`` is the atom directly before the ``+``: a single character, a
    ``[...]`` character class, or a parenthesised group (groups may nest and
    may follow one another).  Everything else is copied through unchanged, so
    the result only uses the ``*`` / ``?`` quantifiers.

    Characters inside ``[...]`` are copied literally (a ``+`` there is not an
    operator), and a backslash inside a class protects the next character so
    that ``\\]`` does not close the class.

    Args:
        regex: Pattern string, possibly containing ``+`` quantifiers.

    Returns:
        The pattern with all ``+`` quantifiers expanded.

    Raises:
        ValueError: If a ``)`` has no matching ``(`` or a ``+`` has nothing
            in front of it to repeat.
    """
    out = []             # rewritten pattern, one character per element
    open_groups = []     # positions in ``out`` of the still-open '('
    atom_start = None    # position in ``out`` where the latest atom begins
    class_start = None   # position of the '[' while inside a class
    escaped = False      # previous class character was a backslash

    for ch in regex:
        if class_start is not None:
            out.append(ch)
            if escaped:
                escaped = False
            elif ch == '\\':
                escaped = True
            elif ch == ']':
                atom_start = class_start
                class_start = None
            continue

        if ch == '[':
            class_start = len(out)
            out.append(ch)
        elif ch == '(':
            open_groups.append(len(out))
            # Nothing inside the new group can be repeated yet.
            atom_start = None
            out.append(ch)
        elif ch == ')':
            if not open_groups:
                raise ValueError("unbalanced ')' in regex")
            # Pop as soon as the group closes so the stack never holds
            # stale entries, whatever character comes next.
            atom_start = open_groups.pop()
            out.append(ch)
        elif ch == '+':
            if atom_start is None:
                raise ValueError("'+' has nothing to repeat")
            copy_start = len(out)
            out.extend(out[atom_start:])
            out.append('*')
            # A directly following quantifier applies to the starred copy,
            # which keeps e.g. ``a++`` equivalent to ``a+``.
            atom_start = copy_start
        elif ch in '*?':
            # Quantifiers extend the current atom rather than starting one.
            out.append(ch)
        else:
            atom_start = len(out)
            out.append(ch)

    return ''.join(out)

class State:
    """One node of a nondeterministic finite automaton.

    States are hashed by their ``number``; equality is the default identity
    comparison, so two distinct objects with the same number stay distinct.
    """

    def __init__(self, number):
        # Identifier supplied by the caller; also used as the hash value.
        self.number = number
        # Becomes True once set_accepting() has been called.
        self.accepting = False
        # Symbol -> target State for transitions that consume a character.
        self.transitions = {}
        # Target States reachable without consuming any input.
        self.lambda_transitions = set()

    def is_accepting(self):
        """Return True if this state has been marked as accepting."""
        return self.accepting

    def set_accepting(self):
        """Mark this state as accepting."""
        self.accepting = True

    def __hash__(self):
        return self.number
    
def _parse_char_class(regex, start):
    """Parse the ``[...]`` class whose ``[`` sits at ``regex[start]``.

    Inside the class ``\\d`` stands for the ten digits, ``\\c`` for the ASCII
    letters, any other escaped character for itself, and every unescaped
    character for itself.

    Returns:
        A ``(chars, end)`` tuple: the set of accepted characters and the
        index of the closing ``]``.

    Raises:
        ValueError: If the class is never closed.
    """
    chars = set()
    i = start + 1
    while i < len(regex) and regex[i] != ']':
        if regex[i] == '\\' and i + 1 < len(regex):
            i += 1
            if regex[i] == 'd':
                chars.update(string.digits)
            elif regex[i] == 'c':
                chars.update(string.ascii_letters)
            else:
                chars.add(regex[i])
        else:
            chars.add(regex[i])
        i += 1
    if i >= len(regex):
        raise ValueError(f"unterminated character class starting at index {start}")
    return chars, i


def regex_to_nfa(regex):
    """Build an NFA with lambda (epsilon) transitions for ``regex``.

    Supported syntax: literal characters, ``.`` (any ASCII letter, digit or
    space), ``[...]`` character classes (with ``\\d`` and ``\\c`` escapes),
    groups ``( ... )`` and the quantifiers ``*`` and ``?`` applied to the
    preceding character, class or group.  ``+`` is not handled here; expand
    it beforehand.

    Args:
        regex: Pattern string.

    Returns:
        The initial ``State``.  The state created last is marked accepting.

    Raises:
        ValueError: For a ``)`` without a matching ``(``, a quantifier with
            nothing before it, or an unterminated character class.
    """
    l_regex = len(regex)

    initial_state = State(0)
    state_index = 1
    opening_states = []     # entry states of the groups that are still open

    prev_state = None
    current_state = initial_state

    i = 0
    while i < l_regex:
        # Every construct gets one fresh state to continue from.
        new_state = State(state_index)
        state_index += 1
        symbol = regex[i]

        if symbol == '(':
            current_state.lambda_transitions.add(new_state)
            opening_states.append(new_state)
            current_state = new_state
            i += 1

        elif symbol == ')':
            if not opening_states:
                raise ValueError(f"unbalanced ')' at index {i}")
            opening = opening_states.pop()
            # Always consume the ')' itself -- also when it is the last
            # character of the pattern -- so the loop is guaranteed to advance.
            i += 1
            quantifier = regex[i] if i < l_regex else ''

            current_state.lambda_transitions.add(new_state)
            if quantifier == '*':
                # Loop back to the group entry and allow skipping the group.
                new_state.lambda_transitions.add(opening)
                opening.lambda_transitions.add(new_state)
                i += 1
            elif quantifier == '?':
                # Allow skipping the group once.
                opening.lambda_transitions.add(new_state)
                i += 1
            # Any other character is left for the next iteration.

        elif symbol == '*':
            if prev_state is None:
                raise ValueError(f"'*' at index {i} has nothing to repeat")
            current_state.lambda_transitions.add(prev_state)
            prev_state.lambda_transitions.add(new_state)
            i += 1

        elif symbol == '?':
            if prev_state is None:
                raise ValueError(f"'?' at index {i} has nothing to repeat")
            prev_state.lambda_transitions.add(new_state)
            current_state.lambda_transitions.add(new_state)
            i += 1

        else:
            if symbol == '[':
                # Leaves ``i`` on the closing ']' which is skipped below.
                char_set, i = _parse_char_class(regex, i)
            elif symbol == '.':
                char_set = set(string.ascii_letters + string.digits + ' ')
            else:
                char_set = {symbol}

            current_state.transitions = {c: new_state for c in char_set}
            i += 1

        prev_state = current_state
        current_state = new_state

    current_state.set_accepting()
    return initial_state

def random_regex(state):
    """Generate a random word by walking the automaton from ``state``.

    At each non-accepting state one outgoing edge is picked at random: a
    lambda edge contributes nothing to the word, a labelled edge contributes
    its symbol.  The walk stops at the first accepting state reached.

    Args:
        state: State to start from.

    Returns:
        The concatenation of the symbols consumed along the walk.
    """
    pieces = []
    while not state.is_accepting():
        eps_count = len(state.lambda_transitions)

        take_lambda = False
        if eps_count:
            # Draw over all outgoing edges; the first ``eps_count`` slots
            # stand for the lambda edges.
            edge_total = eps_count + len(state.transitions)
            take_lambda = random.randint(0, edge_total - 1) < eps_count

        if take_lambda:
            state = random.choice(list(state.lambda_transitions))
            continue

        symbol = random.choice(list(state.transitions))
        pieces.append(symbol)
        state = state.transitions[symbol]

    return "".join(pieces)

def _lambda_closure(state):
    """Return ``state`` plus every state reachable from it via lambda edges."""
    closure = {state}
    pending = [state]
    while pending:
        node = pending.pop()
        for nxt in node.lambda_transitions:
            if nxt not in closure:
                closure.add(nxt)
                pending.append(nxt)
    return closure


def match_regex(regex, initial_nfa):
    """Check whether the automaton accepts the whole input string.

    Args:
        regex: The text to test (despite the name, this is the input word,
            not a pattern).
        initial_nfa: Start state of the automaton.

    Returns:
        True if, after consuming every character, an accepting state is
        reachable through lambda edges alone; False otherwise.
    """
    frontier = {initial_nfa}

    for letter in regex:
        if not frontier:
            # Every path died on an earlier character.
            return False

        reached = set()
        for origin in frontier:
            for node in _lambda_closure(origin):
                if letter in node.transitions:
                    reached.add(node.transitions[letter])
        frontier = reached

    return any(
        node.is_accepting()
        for origin in frontier
        for node in _lambda_closure(origin)
    )

In [120]:
# Show the '+' expansion on a pattern with a nested, starred group.
parse_regex_with_plus("pf(abf(155)*)+d+sa+")

'pf(abf(155)*)(abf(155)*)*dd*saa*'

In [121]:
# Expand the '+' quantifiers first, then build the automaton from the result.
init_state = regex_to_nfa(parse_regex_with_plus("pfabf((155)*k)+d+sa+"))

In [122]:
# Print 20 random words produced by walking the automaton from its start state.
for i in range(20):
    print(random_regex(init_state))

pfabf155kdsa
pfabf155155155k155kkddsa
pfabfkdsaa
pfabfk155kdsa
pfabf155k155kdddddsaaa
pfabf155kddsaa
pfabfkkdsaa
pfabf155k155kkddddddsaaa
pfabf155kk155kdsa
pfabfkddsa
pfabf155k155155kddddsa
pfabfkdsa
pfabf155kdsa
pfabfkkdsa
pfabf155kddsaa
pfabf155kkddsaa
pfabf155k155155k155kdsaa
pfabf155kddsaa
pfabfk155kddddddsaa
pfabf155155k155155155k155kddsaaa


In [125]:
# Round-trip check: each randomly generated word is fed back to the matcher
# running on the same automaton.
for i in range(20):
    print(match_regex(random_regex(init_state), init_state))

True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True


In [100]:
import itertools

def dfs(init_st, st_set=None):
    """Print every state reachable from ``init_st``, depth first.

    For each state the output lists its number, its labelled transitions,
    the numbers of its lambda targets and whether it is accepting.

    Args:
        init_st: State to start the traversal from.
        st_set: Set of states already printed.  Leave it out for a fresh
            traversal; it is passed along by the recursive calls.
    """
    # A fresh set per top-level call: a ``set()`` default would be created
    # once and shared, so a second call would skip everything already seen.
    if st_set is None:
        st_set = set()

    st_set.add(init_st)
    print(f"{init_st.number}:")
    for symbol, target in init_st.transitions.items():
        print(f"\t {symbol} -> {target.number}")
    print(f"\t eps -> {[lamb.number for lamb in init_st.lambda_transitions]}")
    print(f'\t {init_st.is_accepting()}')

    successors = itertools.chain(init_st.transitions.values(), init_st.lambda_transitions)
    for st in successors:
        if st not in st_set:
            dfs(st, st_set)

In [64]:
# Dump every state reachable from the start state of the automaton.
dfs(init_state)

0:
	 p -> 1
	 eps -> []
	 False
1:
	 f -> 2
	 eps -> []
	 False
2:
	 a -> 3
	 eps -> []
	 False
3:
	 b -> 4
	 eps -> []
	 False
4:
	 f -> 5
	 eps -> []
	 False
5:
	 eps -> [6]
	 False
6:
	 1 -> 7
	 eps -> [10]
	 False
7:
	 5 -> 8
	 eps -> []
	 False
8:
	 5 -> 9
	 eps -> []
	 False
9:
	 eps -> [10]
	 False
10:
	 d -> 11
	 eps -> [6]
	 False
11:
	 d -> 12
	 eps -> []
	 False
12:
	 eps -> [11, 13]
	 False
13:
	 s -> 14
	 eps -> []
	 False
14:
	 a -> 15
	 eps -> []
	 False
15:
	 a -> 16
	 eps -> []
	 False
16:
	 eps -> [17, 15]
	 False
17:
	 eps -> []
	 True


In [49]:
# Build an automaton for a pattern that uses '.' and '*'.
# NOTE(review): the recorded output below lists the pattern one character per
# line, which the current regex_to_nfa does not print -- presumably left over
# from an earlier revision with debug prints; re-run the cell to refresh it.
ini = regex_to_nfa("p.*pp.p*")

p
.
*
p
p
.
p
*


In [50]:
# Test a sample word against the automaton built from "p.*pp.p*".
# NOTE(review): the recorded output below contains debug prints that the
# current match_regex does not emit, so the recorded result was presumably
# produced by an earlier revision -- re-run before relying on it.
match_regex("pppppppgpp", ini)

p
---
0
p
---
1
p
---
2


False