#### Regular Expressions with Exponentiation

Consider the following declarations from the course notes and assignments:

In [13]:
class set(frozenset):
    """Hashable, immutable set that prints in brace notation.

    Shadows the built-in `set`: every `set(...)` call below creates one of
    these, whereas `{...}` literals and comprehensions stay built-in sets.
    """

    def __repr__(self):
        # Elements are shown with str(), separated by ', ', in iteration order.
        body = ', '.join(map(str, self))
        return '{' + body + '}'

class FiniteStateAutomaton:
    """Finite state automaton stored as the five components passed in.

    T is the vocabulary, Q the states, R the transitions as
    (source, symbol, target) triples, q0 the initial state and F the final
    states.
    """

    def __init__(self, T, Q, R, q0, F):
        self.T = T
        self.Q = Q
        self.R = R
        self.q0 = q0
        self.F = F

    def __repr__(self):
        # Same layout that parseFSA reads: initial state, final states,
        # then one "source symbol → target" line per transition.
        finals = ' '.join(map(str, self.F))
        rules = [str(q) + ' ' + a + ' → ' + str(r) for (q, a, r) in self.R]
        return '\n'.join([str(self.q0), finals, '\n'.join(rules)])

def parseFSA(fsa: str) -> FiniteStateAutomaton:
    """Build an automaton from its textual description.

    Blank lines are ignored. The first remaining line names the initial
    state, the second lists the final states separated by whitespace, and
    every further line is a transition of the form "source symbol → target".
    The vocabulary and the state set are collected from these lines.
    """
    lines = [line for line in fsa.split('\n') if line.strip() != '']
    # Strip the initial state: all other state names come from split(), so
    # an indented or padded first line would otherwise name a state that
    # never matches the source of any transition.
    q0 = lines[0].strip()
    F = set(lines[1].split())
    R = set()
    for line in lines[2:]:
        left, right = line.split('→')
        source, symbol = left.split()[0], left.split()[1]
        R |= {(source, symbol, right.split()[0])}
    T = {r[1] for r in R}
    Q = {q0} | F | {r[0] for r in R} | {r[2] for r in R}
    return FiniteStateAutomaton(T, Q, R, q0, F)

def minimizeFSA(fsa: FiniteStateAutomaton) -> FiniteStateAutomaton:
    """Return an automaton whose states are classes of indistinguishable states of fsa.

    The transitions are read into a dictionary keyed by (state, symbol), so
    fsa is treated as deterministic: of several transitions with the same
    key only one is kept. `dist` collects ordered pairs of states found to
    be distinguishable and is grown until no further pair can be added.
    """
    # transition function as a dictionary: (state, symbol) -> target
    δ = {(q, a): r for (q, a, r) in fsa.R}
    # initially distinguishable: exactly one state of the pair is final
    dist = {(q, r) for q in fsa.Q for r in fsa.Q if q != r and (q in fsa.F) != (r in fsa.F)}
    done = False
    while not done:
        done = True # stays True only if this pass adds no new pair
        for q in fsa.Q:
            for r in fsa.Q:
                # q and r become distinguishable if for some symbol u only one of them
                # has a transition, or both have one and the targets are distinguishable
                if q != r and (q, r) not in dist and any(((q, u) in δ) != ((r, u) in δ) or \
                    ((q, u) in δ) and ((δ[(q, u)], δ[(r, u)]) in dist) for u in fsa.T):
                    dist |= {(q, r)}; done = False # a new pair requires another pass
    # each new state is the set of old states not distinguishable from some q (q included)
    Qʹ = {set({q} | {r for r in fsa.Q if (q, r) not in dist}) for q in fsa.Q}
    # classes are connected on u if some member of one has a u-transition to a member of the other
    Rʹ = {(qʹ, u, rʹ) for qʹ in Qʹ for rʹ in Qʹ for u in fsa.T if any((q, u, r) in fsa.R for q in qʹ for r in rʹ)}
    # the new initial state is a class containing the old initial state
    qʹ0 = {qʹ for qʹ in Qʹ if fsa.q0 in qʹ}.pop()
    # a class is final if it contains at least one old final state
    Fʹ = {qʹ for qʹ in Qʹ if (qʹ & fsa.F) != set()}
    return FiniteStateAutomaton(fsa.T, Qʹ, Rʹ, qʹ0, Fʹ)

def totalFSA(A: FiniteStateAutomaton, t = -1) -> FiniteStateAutomaton:
    """Return A completed so that every state has a transition on every symbol.

    The vocabulary T of the result is the lowercase letters together with
    A's own vocabulary. Every (state, symbol) pair without a transition in
    A.R gets a transition to the trap state t; t is added to the states,
    with a loop on every symbol, only if some transition leads to it.
    """
    # Include A.T: regular expressions may use symbols outside a-z (digits,
    # uppercase letters, ...). Leaving them out of T hides their transitions
    # from every later step that iterates over T.
    T = set('abcdefghijklmnopqrstuvwxyz') | A.T
    defined = {(q, a) for (q, a, r) in A.R}  # pairs that already have a transition
    R = A.R | {(q, a, t) for q in A.Q for a in T if (q, a) not in defined}
    if any(r == t for (q, a, r) in R):  # transition to t exists
        Q = A.Q | {t}
        R = R | {(t, a, t) for a in T}
    else:
        Q = A.Q
    return FiniteStateAutomaton(T, Q, R, A.q0, A.F)

def renameFSA(fsa: FiniteStateAutomaton) -> FiniteStateAutomaton:
    """Return a copy of fsa whose states are renamed to 0, 1, 2, ...

    Numbers are handed out in the iteration order of fsa.Q; the vocabulary
    is passed on unchanged.
    """
    numbering, total = {}, 0
    for total, state in enumerate(fsa.Q, start=1):
        numbering[state] = total - 1
    states = {number for number in range(total)}
    moves = {(numbering[q], u, numbering[r]) for (q, u, r) in fsa.R}
    finals = {numbering[q] for q in fsa.F}
    return FiniteStateAutomaton(fsa.T, states, moves, numbering[fsa.q0], finals)

def equivalentFSA(a: FiniteStateAutomaton, aʹ: FiniteStateAutomaton, printMap = False) -> bool:
    """Return whether a and aʹ are equal up to renaming of states once totalized and minimized.

    Both automata are first passed through totalFSA and minimizeFSA. Starting
    from the pair of initial states, a one-to-one mapping m from states of a
    to states of aʹ is built by following transitions on the same symbol in
    both automata. The result is False as soon as the two disagree on
    whether a transition exists or the mapping cannot stay one-to-one;
    otherwise the mapped final states must coincide. With printMap the
    mapping is printed as it grows.
    """
    a = minimizeFSA(totalFSA(a))
    aʹ = minimizeFSA(totalFSA(aʹ))
    δ = {(q, u): r for (q, u, r) in a.R}
    δʹ = {(q, u): r for (q, u, r) in aʹ.R}
    # Walk the symbols of both automata: a symbol used only by aʹ must not
    # go unexamined just because a does not know it.
    T = a.T | aʹ.T
    m, v = {a.q0: aʹ.q0}, {a.q0}  # v holds mapped states still to be expanded
    while v:
        if printMap: print(m)
        q = v.pop()
        qʹ = m[q]
        for u in T:
            if ((q, u) in δ) != ((qʹ, u) in δʹ):
                return False
            if (q, u) not in δ:
                continue  # neither automaton has a transition on u here
            r, rʹ = δ[(q, u)], δʹ[(qʹ, u)]
            if r in m:
                if m[r] != rʹ:
                    return False
            elif rʹ in m.values():
                return False  # rʹ is already the image of another state
            else:
                v.add(r)
                m[r] = rʹ
    if printMap: print(m)
    # Compare final states among the visited states only: a final state that
    # was never reached is not a key of m and does not affect the language.
    reached = {rʹ for rʹ in m.values()}
    return {m[q] for q in a.F if q in m} == {f for f in aʹ.F if f in reached}

In [14]:
class Choice:
    """Syntax tree node for the alternative of e1 and e2, printed as (e1|e2)."""

    def __init__(self, e1, e2):
        self.e1 = e1
        self.e2 = e2

    def __repr__(self):
        return ''.join(['(', str(self.e1), '|', str(self.e2), ')'])

class Conc:
    """Syntax tree node for e1 followed by e2, printed as (e1e2)."""

    def __init__(self, e1, e2):
        self.e1 = e1
        self.e2 = e2

    def __repr__(self):
        return ''.join(['(', str(self.e1), str(self.e2), ')'])

class Star:
    """Syntax tree node for the repetition of e, printed as (e)*."""

    def __init__(self, e):
        self.e = e

    def __repr__(self):
        return ''.join(['(', str(self.e), ')*'])

Here is the concrete grammar for regular expressions:

    expression  →  term { '|' term }
    term  →  factor { factor }
    factor  →  atom [ '*' | '+' | '?' ]
    atom  →  plainchar | escapedchar | '(' expression ')'
    plainchar  →  ' ' | '!' | '"' | '#' | '$' | '%' | '&' | '\'' | ',' | '-' | '.' | '/' |
         '0' | '1' | '2' | '3' | '4' | '5' | '6' | '7' | '8' | '9' | ':' | ';' | '<' | '=' | '>' | 
         '@' | 'A' | 'B' | 'C' | 'D' | 'E' | 'F' | 'G' | 'H' | 'I' | 'J' | 'K' | 'L' | 'M' | 'N' | 'O' |
         'P' | 'Q' | 'R' | 'S' | 'T' | 'U' | 'V' | 'W' | 'X' | 'Y' | 'Z' | '[' | ']' | '^' | '_' |
         '`' | 'a' | 'b' | 'c' | 'd' | 'e' | 'f' | 'g' | 'h' | 'i' | 'j' | 'k' | 'l' | 'm' | 'n' | 'o' |
         'p' | 'q' | 'r' | 's' | 't' | 'u' | 'v' | 'w' | 'x' | 'y' | 'z' | '{' | '}' | '~'
    escapedchar  → '\' ( '(' | ')' | '*' | '+' | '?' | '\' | '|' )

The attribute grammar for constructing the abstract syntax tree of regular expressions is:

    expression(e)  →  term(e) { '|' term(f) « e := Choice(e, f) »  }
    term(e)  →  factor(e) { factor(f) « e := Conc(e, f) » }
    factor(e) → atom(e) [ '*' « e := Star(e) » | '+' « e := Conc(e, Star(e)) » | '?' « e := Choice(e, '') » ]
    atom(e)  →  plainchar(e) | escapedchar(e) | '(' expression(e) ')'
    plainchar(e)  →  ' ' « e := ' ' » | ... | '~' « e := '~' »
    escapedchar(e)  → '\\' ( '(' « e := '(' » | ')' | ... | '|' « e := '|' »)

The corresponding parser in Python is:

In [15]:
# Characters that stand for themselves in a regular expression.
PlainChars = (
    ' !"#$%&\',-./0123456789:;<=>@ABCDEFGHIJKLMNO'
    'PQRSTUVWXYZ[]^_`abcdefghijklmnopqrstuvwxyz{}~'
)
# Characters that may follow a backslash.
EscapedChars = '()*+?\\|'
# Symbols that can start a factor: a plain character, a backslash or '('.
FirstFactor = ''.join([PlainChars, '\\', '('])

def nxt():
    """Move to the next input symbol.

    Stores the character at src[pos] in the global sym and increments pos;
    once the input is used up, sym becomes chr(0) and pos stays unchanged.
    """
    global pos, sym
    if pos >= len(src):
        sym = chr(0)  # end of input symbol
        return
    sym = src[pos]
    pos += 1

def expression():
    """Parse  term { '|' term }  and return its tree, nesting Choice to the left."""
    tree = term()
    while sym == '|':
        nxt()
        tree = Choice(tree, term())
    return tree

def term():
    """Parse  factor { factor }  and return its tree, nesting Conc to the left."""
    tree = factor()
    while sym in FirstFactor:
        tree = Conc(tree, factor())
    return tree

def factor():
    """Parse  atom [ '*' | '+' | '?' ]  and return its tree.

    'e+' is expanded to Conc(e, Star(e)) and 'e?' to Choice(e, '').
    """
    node = atom()
    operator = sym
    if operator not in ('*', '+', '?'):
        return node
    nxt()
    if operator == '*':
        return Star(node)
    if operator == '+':
        return Conc(node, Star(node))
    return Choice(node, '')

def atom():
    """Parse  plainchar | escapedchar | '(' expression ')'  and return its tree.

    Raises Exception on a character that cannot start an atom, on an invalid
    character after a backslash and on a missing ')'.
    """
    if sym in PlainChars:
        char = sym
        nxt()
        return char
    if sym == '\\':
        nxt()
        if sym not in EscapedChars:
            raise Exception("invalid escaped character at " + str(pos))
        char = sym
        nxt()
        return char
    if sym == '(':
        nxt()
        inner = expression()
        if sym != ')':
            raise Exception("')' expected at " + str(pos))
        nxt()
        return inner
    raise Exception("invalid character at " + str(pos))

def parse(s: str):
    """Parse the regular expression s and return its abstract syntax tree.

    Raises Exception if input remains after a complete expression was read.
    """
    global src, pos
    src = s
    pos = 0
    nxt()
    tree = expression()
    if sym == chr(0):
        return tree
    raise Exception("unexpected character at " + str(pos))

Here is more code from the course notes:

In [16]:
def REToFSA(re):
    """Translate the syntax tree re into an automaton, recursively.

    New states are numbered with the global counter QC. Raises Exception if
    re is neither a string nor a Choice, Conc or Star node.
    """
    global QC
    if re == '':
        # empty string: a single state that is both initial and final
        only = QC
        QC += 1
        return FiniteStateAutomaton(set(), {only}, set(), only, {only})
    kind = type(re)
    if kind == str:
        # single symbol: one transition from a new initial to a new final state
        first, last = QC, QC + 1
        QC += 2
        return FiniteStateAutomaton({re}, {first, last}, {(first, re, last)}, first, {last})
    if kind == Choice:
        left = REToFSA(re.e1)
        right = REToFSA(re.e2)

        def merged(q):
            # the initial state of the right automaton is identified with that of the left
            return left.q0 if q == right.q0 else q

        moves = {(merged(q), a, r) for (q, a, r) in right.R}
        finals = {merged(q) for q in right.F}
        return FiniteStateAutomaton(left.T | right.T, left.Q | right.Q,
                                    left.R | moves, left.q0, left.F | finals)
    if kind == Conc:
        left = REToFSA(re.e1)
        right = REToFSA(re.e2)
        # transitions leaving the right initial state are re-attached to every left final state
        bridged = {(f, a, r) for (q, a, r) in right.R if q == right.q0 for f in left.F}
        others = {(q, a, r) for (q, a, r) in right.R if q != right.q0}
        finals = right.F - {right.q0}
        if right.q0 in right.F:
            # the right part accepts the empty string, so left final states stay final
            finals = finals | left.F
        return FiniteStateAutomaton(left.T | right.T, left.Q | right.Q,
                                    left.R | bridged | others, left.q0, finals)
    if kind == Star:
        inner = REToFSA(re.e)
        # from every final state the automaton can start over
        again = {(f, a, r) for (q, a, r) in inner.R if q == inner.q0 for f in inner.F}
        return FiniteStateAutomaton(inner.T, inner.Q, inner.R | again,
                                    inner.q0, {inner.q0} | inner.F)
    raise Exception('not a regular expression')

def convertRegExToFSA(re):
    """Reset the global state counter QC to 0 and translate re with REToFSA."""
    global QC
    QC = 0
    automaton = REToFSA(re)
    return automaton

In [17]:
def deterministicFSA(fsa: FiniteStateAutomaton) -> FiniteStateAutomaton:
    """Return an automaton whose states are sets of states of fsa.

    Starting from the set holding only fsa.q0, each symbol leads from a set
    of states to the set of all their targets on that symbol; empty target
    sets are dropped. A set is final if it contains a final state of fsa.
    """
    start = set({fsa.q0})
    states, moves, seen = {start}, set(), set()
    while seen != states:
        current = (states - seen).pop()
        seen = seen | {current}
        for symbol in fsa.T:
            targets = {r for (q, u, r) in fsa.R if u == symbol and q in current}
            if targets == set():
                continue
            subset = set(targets)  # hashable, so it can be a state itself
            states |= {subset}
            moves |= {(current, symbol, subset)}
    finals = {state for state in states if any(f in state for f in fsa.F)}
    return FiniteStateAutomaton(fsa.T, states, moves, start, finals)

def accepts(fsa: FiniteStateAutomaton, τ: str) -> bool:
    """Return whether fsa, read as a deterministic automaton, accepts τ.

    The input is rejected as soon as the current state has no transition on
    the next symbol; otherwise the state reached at the end must be final.
    """
    step = {(source, symbol): target for (source, symbol, target) in fsa.R}
    state = fsa.q0
    for symbol in τ:
        key = (state, symbol)
        if key not in step:
            return False
        state = step[key]
    return state in fsa.F

In [18]:
def equalRegEx(E1, E2):
    """Return the result of equivalentFSA on the automata built from E1 and E2.

    Each expression is parsed, converted to an automaton and made
    deterministic, E1 first.
    """
    automata = [deterministicFSA(convertRegExToFSA(parse(E))) for E in (E1, E2)]
    return equivalentFSA(automata[0], automata[1])

The task is to extend regular expressions with exponentiation with a single digit exponent `⁰`, `¹`, `²`, or `³`. For example:
- `(ab)²` is `ab` repeated twice, i.e. `abab`;
- `a³` is `a` repeated three times, i.e. `aaa`;
- `(a³)²` is `a³` repeated twice, i.e. `aaaaaa`;
- `(abc)¹` is just `abc`;
- `a⁰` is `ε`.

We let any exponent bind the same way as postfix operators `*`, `+`, and `?`, for example:

    ab² = a(b²)
    
In the same way as `a+*` is not allowed (it would have to be written as `(a+)*`), `a²³` is not allowed (it would have to be written as `(a²)³`). Extend the above grammar for regular expressions accordingly! The exponents should only appear as operators; they cannot be escaped, i.e. `\²` is not allowed. Use the cell below.

    expression  →  term { '|' term }
    term  →  factor { factor }
    factor  →  atom [ '*' | '+' | '?' ]
    atom  →  plainchar [ exponent ] | escapedchar | '(' expression ')' [ exponent ]
    plainchar  →  ' ' | '!' | '"' | '#' | '$' | '%' | '&' | '\'' | ',' | '-' | '.' | '/' |
         '0' | '1' | '2' | '3' | '4' | '5' | '6' | '7' | '8' | '9' | ':' | ';' | '<' | '=' | '>' | 
         '@' | 'A' | 'B' | 'C' | 'D' | 'E' | 'F' | 'G' | 'H' | 'I' | 'J' | 'K' | 'L' | 'M' | 'N' | 'O' |
         'P' | 'Q' | 'R' | 'S' | 'T' | 'U' | 'V' | 'W' | 'X' | 'Y' | 'Z' | '[' | ']' | '^' | '_' |
         '`' | 'a' | 'b' | 'c' | 'd' | 'e' | 'f' | 'g' | 'h' | 'i' | 'j' | 'k' | 'l' | 'm' | 'n' | 'o' |
         'p' | 'q' | 'r' | 's' | 't' | 'u' | 'v' | 'w' | 'x' | 'y' | 'z' | '{' | '}' | '~'
    escapedchar  → '\' ( '(' | ')' | '*' | '+' | '?' | '\' | '|' )
    exponent  →  '⁰' | '¹' | '²' | '³'

    expression  →  term { '|' term }
    term  →  factor { factor }
    factor  →  atom [ '*' | '+' | '?' | exponent ]
    atom  →  plainchar | escapedchar | '(' expression ')'
    plainchar  →  ' ' | '!' | '"' | '#' | '$' | '%' | '&' | '\'' | ',' | '-' | '.' | '/' |
         '0' | '1' | '2' | '3' | '4' | '5' | '6' | '7' | '8' | '9' | ':' | ';' | '<' | '=' | '>' | 
         '@' | 'A' | 'B' | 'C' | 'D' | 'E' | 'F' | 'G' | 'H' | 'I' | 'J' | 'K' | 'L' | 'M' | 'N' | 'O' |
         'P' | 'Q' | 'R' | 'S' | 'T' | 'U' | 'V' | 'W' | 'X' | 'Y' | 'Z' | '[' | ']' | '^' | '_' |
         '`' | 'a' | 'b' | 'c' | 'd' | 'e' | 'f' | 'g' | 'h' | 'i' | 'j' | 'k' | 'l' | 'm' | 'n' | 'o' |
         'p' | 'q' | 'r' | 's' | 't' | 'u' | 'v' | 'w' | 'x' | 'y' | 'z' | '{' | '}' | '~'
    escapedchar  → '\' ( '(' | ')' | '*' | '+' | '?' | '\' | '|' )
    exponent  →  '⁰' | '¹' | '²' | '³'

Extend the above attribute grammar to construct an abstract syntax tree of regular expressions with exponentiation! The type of the nodes should not be extended. Rather, exponents should be "expanded". For example, the abstract syntax tree of `ab²` is `Conc('a', Conc('b', 'b'))`. Use the cell below.

    expression(e)  →  term(e) { '|' term(f) « e := Choice(e, f) »  }
    term(e)  →  factor(e) { factor(f) « e := Conc(e, f) » }
    factor(e) → atom(e) [ '*' « e := Star(e) » | '+' « e := Conc(e, Star(e)) » | '?' « e := Choice(e, '') » ]
    atom(e)  →  plainchar(e) [ exponent(n) « e := conc_n(e, n) » ] | escapedchar(e) | '(' expression(e) ')' [ exponent(n) « e := conc_n(e, n) » ]
    plainchar(e)  →  ' ' « e := ' ' » | ... | '~' « e := '~' »
    escapedchar(e)  → '\\' ( '(' « e := '(' » | ')' | ... | '|' « e := '|' »)
    exponent(n)  →  '⁰' « n := 0 » | '¹' « n := 1 » | '²' « n := 2 » | '³' « n := 3 »

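 - The auxiliary function `conc_n(e, n)` builds the concatenation of `n` copies of `e` from nested `Conc` nodes: `conc_n(e, 0)` is the empty string `''` (ε), `conc_n(e, 1)` is `e`, and `conc_n(e, n)` is `Conc(e, conc_n(e, n - 1))` for larger `n`.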

    expression(e)  →  term(e) { '|' term(f) « e := Choice(e, f) »  }
    term(e)  →  factor(e) { factor(f) « e := Conc(e, f) » }
    factor(e) → atom(e) [ '*' « e := Star(e) » | '+' « e := Conc(e, Star(e)) » | '?' « e := Choice(e, '') » | exponent(n) « e := conc_n(e, n) » ]
    atom(e)  →  plainchar(e) | escapedchar(e) | '(' expression(e) ')'
    plainchar(e)  →  ' ' « e := ' ' » | ... | '~' « e := '~' »
    escapedchar(e)  → '\\' ( '(' « e := '(' » | ')' | ... | '|' « e := '|' »)
    exponent(n)  →  '⁰' « n := 0 » | '¹' « n := 1 » | '²' « n := 2 » | '³' « n := 3 »

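 - The auxiliary function `conc_n(e, n)` builds the concatenation of `n` copies of `e` from nested `Conc` nodes: `conc_n(e, 0)` is the empty string `''` (ε), `conc_n(e, 1)` is `e`, and `conc_n(e, n)` is `Conc(e, conc_n(e, n - 1))` for larger `n`.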

Extend the above Python parser accordingly! Use the cell below.

In [21]:
# Characters that stand for themselves in a regular expression.
PlainChars = (
    ' !"#$%&\',-./0123456789:;<=>@ABCDEFGHIJKLMNO'
    'PQRSTUVWXYZ[]^_`abcdefghijklmnopqrstuvwxyz{}~'
)
# Characters that may follow a backslash.
EscapedChars = '()*+?\\|'
# Symbols that can start a factor: a plain character, a backslash or '('.
FirstFactor = ''.join([PlainChars, '\\', '('])
# Superscript digits accepted as exponents, mapped to their numeric value.
Exponents = dict(zip('⁰¹²³', range(4)))

def nxt():
    """Move to the next input symbol.

    Stores the character at src[pos] in the global sym and increments pos;
    once the input is used up, sym becomes chr(0) and pos stays unchanged.
    """
    global pos, sym
    if pos >= len(src):
        sym = chr(0)  # end of input symbol
        return
    sym = src[pos]
    pos += 1

def expression():
    """Parse  term { '|' term }  and return its tree, nesting Choice to the left."""
    tree = term()
    while sym == '|':
        nxt()
        tree = Choice(tree, term())
    return tree

def term():
    """Parse  factor { factor }  and return its tree, nesting Conc to the left."""
    tree = factor()
    while sym in FirstFactor:
        tree = Conc(tree, factor())
    return tree

def factor():
    """Parse  atom [ '*' | '+' | '?' | exponent ]  and return its tree.

    'e+' is expanded to Conc(e, Star(e)), 'e?' to Choice(e, '') and an
    exponent to the result of conc_n. At most one postfix operator is read.
    """
    node = atom()
    operator = sym
    if operator == '*':
        nxt()
        return Star(node)
    if operator == '+':
        nxt()
        return Conc(node, Star(node))
    if operator == '?':
        nxt()
        return Choice(node, '')
    if operator in Exponents:
        node = conc_n(node, Exponents[operator])
        nxt()
    return node

def atom():
    """Parse  plainchar | escapedchar | '(' expression ')'  and return its tree.

    Raises Exception on a character that cannot start an atom, on an invalid
    character after a backslash and on a missing ')'.
    """
    if sym in PlainChars:
        char = sym
        nxt()
        return char
    if sym == '\\':
        nxt()
        if sym not in EscapedChars:
            raise Exception("invalid escaped character at " + str(pos))
        char = sym
        nxt()
        return char
    if sym == '(':
        nxt()
        inner = expression()
        if sym != ')':
            raise Exception("')' expected at " + str(pos))
        nxt()
        return inner
    raise Exception("invalid character at " + str(pos))

def conc_n(e, n):
    """Return the syntax tree for e repeated n times.

    The result is '' for n == 0, e itself for n == 1 and otherwise
    right-nested Conc nodes, e.g. Conc(e, Conc(e, e)) for n == 3.

    Raises ValueError for negative n, which the original recursion would
    never terminate on.
    """
    if n < 0:
        raise ValueError("exponent must not be negative: " + str(n))
    if n == 0:
        return ''
    tree = e
    for _ in range(n - 1):
        tree = Conc(e, tree)  # prepend one more copy of e
    return tree

def parse(s: str):
    """Parse the regular expression s and return its abstract syntax tree.

    Raises Exception if input remains after a complete expression was read.
    """
    global src, pos
    src = s
    pos = 0
    nxt()
    tree = expression()
    if sym == chr(0):
        return tree
    raise Exception("unexpected character at " + str(pos))

Here are some test cases. Note that the abstract syntax tree of `a³` is the same as that of `a(aa)`.

In [22]:
# Abstract syntax trees: exponents are expanded into nested Conc nodes.
assert str(parse('a⁰')) == ''
assert str(parse('a¹')) == 'a'
assert str(parse('ab¹')) == '(ab)'
assert str(parse('ab²')) == '(a(bb))'
assert str(parse('(ab)²')) == '((ab)(ab))'
assert str(parse('a³')) == '(a(aa))'
assert str(parse('(a³)²')) == '((a(aa))(a(aa)))'
assert str(parse('a²b')) == '((aa)b)'

# An exponent is a postfix operator like '*', '+' and '?': it cannot be
# stacked with another postfix operator, escaped, or used without an operand.
for bad in ('a²³', 'a²*', 'a*²', '\\²', '²'):
    try:
        parse(bad)
    except Exception:
        pass
    else:
        raise AssertionError('parse should reject ' + repr(bad))

# Language equivalence of expressions with and without exponents.
assert equalRegEx('a²', 'aa')
assert equalRegEx('(ab)²', 'abab')
assert equalRegEx('(a²)²', 'aaaa')
assert equalRegEx('ab⁰', 'a')
assert equalRegEx('(a³)²', 'aaaaaa')
assert equalRegEx('(a|b)²', 'aa|ab|ba|bb')
assert equalRegEx('(a*)²', 'a*')
assert not equalRegEx('a²', 'a³')