#### Testing Regular Expressions

Using the notation from the course notes, write a regular expression for identifiers: an identifier is a sequence of letters `abcdefghijklmnopqrstuvwxyz` and digits `0123456789` starting with a letter. You may use abbreviations [1 point]:

`(a|b|c|d|e|f|g|h|i|j|k|l|m|n|o|p|q|r|s|t|u|v|w|x|y|z)(a*b*c*d*e*f*g*h*i*j*k*l*m*n*o*p*q*r*s*t*u*v*w*x*y*z*0*1*2*3*4*5*6*7*8*9*)*`

In [5]:
class set(frozenset):
    """Immutable, hashable set that prints as ``{e1, e2, ...}``.

    Deliberately shadows the builtin ``set`` so that ``set(...)`` calls in
    this file produce hashable values that can be elements of other sets.
    """

    def __repr__(self):
        # Elements are rendered with str() and joined in iteration order.
        body = ', '.join(map(str, self))
        return '{%s}' % body

class FiniteStateAutomaton:
    """Container for the five components of a finite state automaton.

    T  - symbols
    Q  - states
    R  - transitions, an iterable of (source, symbol, target) triples
    q0 - initial state
    F  - final states
    """

    def __init__(self, T, Q, R, q0, F):
        self.T = T
        self.Q = Q
        self.R = R
        self.q0 = q0
        self.F = F

    def __repr__(self):
        # Line 1: initial state; line 2: final states separated by blanks;
        # remaining lines: one "source symbol → target" per transition.
        initial = str(self.q0)
        finals = ' '.join(map(str, self.F))
        transitions = '\n'.join(
            ' '.join((str(q), a, '→', str(r))) for (q, a, r) in self.R
        )
        return '\n'.join((initial, finals, transitions))

In [6]:
class Choice:
    """Regular-expression node for the alternative ``e1|e2``."""

    def __init__(self, e1, e2):
        self.e1 = e1
        self.e2 = e2

    def __repr__(self):
        # Always parenthesised so nested expressions print unambiguously.
        return f'({self.e1!s}|{self.e2!s})'

class Conc:
    """Regular-expression node for the concatenation ``e1 e2``."""

    def __init__(self, e1, e2):
        self.e1 = e1
        self.e2 = e2

    def __repr__(self):
        # Operands are printed back to back inside one pair of parentheses.
        return f'({self.e1!s}{self.e2!s})'

class Star:
    """Regular-expression node for the repetition ``e*``."""

    def __init__(self, e):
        self.e = e

    def __repr__(self):
        # The operand is parenthesised and followed by the star.
        return f'({self.e!s})*'

In [7]:
def REToFSA(re) -> FiniteStateAutomaton:
    """Translate the regular expression `re` into a finite state automaton.

    `re` is either a string or a `Choice`, `Conc` or `Star` node whose
    operands are themselves regular expressions; operands are translated
    recursively. The empty string yields an automaton with a single state
    that is both initial and final; any other string is used as the label of
    a single transition.

    States are numbered with the global counter `QC`, which must already be
    defined when this function is called (`convertRegExToFSA` sets it to 0
    before calling this function).

    Raises:
        TypeError: if `re` is not a string, `Choice`, `Conc` or `Star`.
    """
    global QC

    if isinstance(re, str):
        if re == '':
            # T = {}, Q = {q}, R = {}, q0 = q, F = {q}
            q = QC
            QC += 1
            return FiniteStateAutomaton(set(), {q}, set(), q, {q})
        # T = {re}, Q = {q, r}, R = {(q, re, r)}, q0 = q, F = {r}
        q, r = QC, QC + 1
        QC += 2
        return FiniteStateAutomaton({re}, {q, r}, {(q, re, r)}, q, {r})

    if isinstance(re, Choice):
        A1, A2 = REToFSA(re.e1), REToFSA(re.e2)
        # Merge the two initial states: wherever A2.q0 is the source of a
        # transition or a final state, A1.q0 takes its place.
        R2 = {(A1.q0 if q == A2.q0 else q, a, r) for (q, a, r) in A2.R}
        F2 = {A1.q0 if q == A2.q0 else q for q in A2.F}
        return FiniteStateAutomaton(A1.T | A2.T, A1.Q | A2.Q, A1.R | R2, A1.q0, A1.F | F2)

    if isinstance(re, Conc):
        A1, A2 = REToFSA(re.e1), REToFSA(re.e2)
        # Transitions leaving A2's initial state are re-attached to every
        # final state of A1; all other transitions of A2 are kept as they are.
        bridged = {(f, a, r) for (q, a, r) in A2.R if q == A2.q0 for f in A1.F}
        kept = {(q, a, r) for (q, a, r) in A2.R if q != A2.q0}
        R = A1.R | bridged | kept
        # A2's final states other than its initial state stay final; if A2's
        # initial state is final, A1's final states are final as well.
        F = (A2.F - {A2.q0}) | (A1.F if A2.q0 in A2.F else set())
        return FiniteStateAutomaton(A1.T | A2.T, A1.Q | A2.Q, R, A1.q0, F)

    if isinstance(re, Star):
        A = REToFSA(re.e)
        # Every final state additionally gets a copy of the transitions that
        # leave the initial state, so the operand can be repeated.
        R = A.R | {(f, a, r) for (q, a, r) in A.R if q == A.q0 for f in A.F}
        # The initial state becomes final so that zero repetitions are accepted.
        return FiniteStateAutomaton(A.T, A.Q, R, A.q0, {A.q0} | A.F)

    raise TypeError('not a regular expression')

def convertRegExToFSA(re) -> FiniteStateAutomaton:
    """Reset the global state counter `QC` to 0, then translate `re` with `REToFSA`."""
    global QC
    QC = 0
    fsa = REToFSA(re)
    return fsa

In [8]:
def deterministicFSA(fsa: FiniteStateAutomaton, trace = False) -> FiniteStateAutomaton:
    """Build an automaton whose states are sets of states of `fsa`.

    Starting from the set containing only `fsa.q0`, each not yet visited
    state set is expanded: for every symbol of `fsa.T`, the targets of all
    matching transitions leaving a member of the set form the successor set.
    Empty successor sets are dropped, so the result has no transition for
    them. A state set is final if it contains a final state of `fsa`.

    Relies on this file's `set` (a `frozenset` subclass) so that state sets
    are hashable. With `trace` true, the state sets, transitions and visited
    sets are printed before the loop and after each expansion.
    """
    start = set({fsa.q0})
    states, moves, done = {start}, set(), set()

    if trace: print(states, moves, done)

    # Keep expanding until every discovered state set has been visited.
    while done != states:
        current = (states - done).pop()
        done |= {current}

        for symbol in fsa.T:
            # All states reachable from a member of `current` on `symbol`.
            targets = set(r for (q, u, r) in fsa.R if u == symbol and q in current)
            if targets == set():
                continue
            states |= {targets}
            moves |= {(current, symbol, targets)}
        if trace: print(states, moves, done)

    finals = {s for s in states if any(f in s for f in fsa.F)}
    return FiniteStateAutomaton(fsa.T, states, moves, start, finals)

In [9]:
def accepts(fsa: FiniteStateAutomaton, τ: str) -> bool:
    """Return True if running `fsa` on the characters of `τ` ends in a final state.

    The transitions in `fsa.R` are turned into a lookup table keyed by
    (state, symbol). If `fsa.R` holds several transitions for the same key,
    only the one seen last while building the table is kept. The run starts
    in `fsa.q0`; a character without a matching transition rejects at once.
    """
    step = {}
    for (source, symbol, target) in fsa.R:
        step[(source, symbol)] = target

    state = fsa.q0
    for ch in τ:
        key = (state, ch)
        if key not in step:
            return False
        state = step[key]

    return state in fsa.F

Test your answer by expressing it with Python constructors `Choice`, `Conc`, `Star` and calling it `I`. [2 points]

In [62]:
# `(a|b|c|d|e|f|g|h|i|j|k|l|m|n|o|p|q|r|s|t|u|v|w|x|y|z)(a*b*c*d*e*f*g*h*i*j*k*l*m*n*o*p*q*r*s*t*u*v*w*x*y*z*0*1*2*3*4*5*6*7*8*9*)*`

# Alphabets of the identifier regular expression. They are built from string
# constants so that no character can be dropped by accident (the hand-typed
# list was missing 'g').
letters = list('abcdefghijklmnopqrstuvwxyz')
digits = list('0123456789')
letters_digits = letters + digits

def starting_letter_choice(i):
    """Return the left-nested choice over ``letters[0]`` .. ``letters[i]``.

    For ``i == 1`` this is ``Choice(letters[0], letters[1])``; each further
    letter wraps the previous result as ``Choice(previous, letters[k])``.
    For ``i == 0`` the single letter ``letters[0]`` is returned.

    Built iteratively: the earlier recursive form never reached its base case
    for ``i < 1`` and ended in a RecursionError.

    Raises:
        ValueError: if `i` is negative.
        IndexError: if `i` is not a valid index into `letters`.
    """
    if i < 0:
        raise ValueError('i must be non-negative')
    expr = letters[0]
    for k in range(1, i + 1):
        expr = Choice(expr, letters[k])
    return expr

def letters_digits_conc(i):
    """Return the left-nested concatenation of ``Star(c)`` for ``letters_digits[0..i]``.

    For ``i == 1`` this is
    ``Conc(Star(letters_digits[0]), Star(letters_digits[1]))``; each further
    character wraps the previous result as
    ``Conc(previous, Star(letters_digits[k]))``. For ``i == 0`` the single
    ``Star(letters_digits[0])`` is returned.

    Built iteratively: the earlier recursive form never reached its base case
    for ``i < 1`` and ended in a RecursionError.

    Raises:
        ValueError: if `i` is negative.
        IndexError: if `i` is not a valid index into `letters_digits`.
    """
    if i < 0:
        raise ValueError('i must be non-negative')
    expr = Star(letters_digits[0])
    for k in range(1, i + 1):
        expr = Conc(expr, Star(letters_digits[k]))
    return expr

# First part: a choice over every entry of `letters`; second part: the
# concatenation of the starred entries of `letters_digits`.
starting_letter, letters_digits_combo = (
    starting_letter_choice(len(letters) - 1),
    letters_digits_conc(len(letters_digits) - 1),
)

# I = starting letter followed by any number of repetitions of the second part.
I = Conc(e1=starting_letter, e2=Star(e=letters_digits_combo))

In [63]:
# Turn the regular expression I into an automaton with set-valued states and
# check it against sample inputs.
A = deterministicFSA(convertRegExToFSA(I))

# Inputs that must be accepted.
assert all(accepts(A, word) for word in ('cloud7', 'if', 'b12'))

# Inputs that must be rejected.
assert not any(accepts(A, word) for word in ('007', '15b', 'B12', 'e-mail'))