#### Regular Expressions with Counted Repetitions

Consider the constructors of the abstract syntax tree for regular expressions from Chapter 2:

In [1]:
class Choice:
    """AST node for the alternative `e1|e2` of two regular expressions."""

    def __init__(self, e1, e2):
        self.e1 = e1
        self.e2 = e2

    def __repr__(self):
        # Fully parenthesised so that the nesting of the tree stays visible.
        return f'({self.e1!s}|{self.e2!s})'

class Conc:
    """AST node for the concatenation `e1 e2` of two regular expressions."""

    def __init__(self, e1, e2):
        self.e1 = e1
        self.e2 = e2

    def __repr__(self):
        # Fully parenthesised so that the nesting of the tree stays visible.
        return f'({self.e1!s}{self.e2!s})'

class Star:
    """AST node for the repetition `e*` (zero or more times) of a regular expression."""

    def __init__(self, e):
        self.e = e

    def __repr__(self):
        # The operand is always parenthesised; the star follows the closing parenthesis.
        return f'({self.e!s})*'

Let us build a parser that constructs the abstract syntax tree of regular expressions. The attribute grammar for this is as follows, with `plainchar` and `escapedchar` containing all the characters as in Chapter 4:

    expression(e)  →  term(e) { '|' term(f) « e := Choice(e, f) »  }
    term(e)  →  factor(e) { factor(f) « e := Conc(e, f) » }
    factor(e) → atom(e) [ '*' « e := Star(e) » | '+' « e := Conc(e, Star(e)) » | '?' « e := Choice(e, '') » ]
    atom(e)  →  plainchar(e) | escapedchar(e) | '(' expression(e) ')'
    plainchar(e)  →  ' ' « e := ' ' » | ... | '~' « e := '~' »
    escapedchar(e)  → '\\' ( '(' « e := '(' » | ')' | ... | '|' « e := '|' »)

Extend the parser from Chapter 4 with attribute evaluation rules such that `parse` returns the abstract syntax tree. For convenience, here is the parser from Chapter 4 [4 points]:

In [2]:
# Characters that stand for themselves only when preceded by a backslash.
EscapedChars = '()*+?\\|'
# Every printable ASCII character (' ' .. '~') that is not reserved, in code-point order.
PlainChars = ''.join(c for c in map(chr, range(ord(' '), ord('~') + 1))
                     if c not in EscapedChars)
# Symbols that can start a factor: a plain character, a backslash, or '('.
FirstFactor = PlainChars + '\\('

def nxt():
    """Advance the scanner by one character.

    Loads `src[pos]` into the global `sym` and increments `pos`; once the input
    is exhausted, `sym` becomes chr(0) and `pos` is left unchanged.
    """
    global pos, sym
    if pos >= len(src):
        sym = chr(0)  # end of input symbol
        return
    sym = src[pos]
    pos += 1

def expression():
    """Recognise  expression → term { '|' term }."""
    while True:
        term()
        if sym != '|':
            return
        nxt()  # consume '|' and go on with the next alternative

def term():
    """Recognise  term → factor { factor }."""
    while True:
        factor()
        # Another factor follows exactly when the lookahead can start one.
        if sym not in FirstFactor:
            break

def factor():
    """Recognise  factor → atom [ '*' | '+' | '?' ]."""
    atom()
    if sym not in '*+?':
        return
    nxt()  # consume the postfix operator

def atom():
    """Recognise  atom → plainchar | escapedchar | '(' expression ')'.

    Raises Exception with the current position if the lookahead cannot start an
    atom, if a backslash is followed by a character that may not be escaped, or
    if a closing parenthesis is missing.
    """
    if sym in PlainChars:
        nxt()
        return
    if sym == '\\':
        nxt()
        if sym not in EscapedChars:
            raise Exception("invalid escaped character at " + str(pos))
        nxt()
        return
    if sym == '(':
        nxt()
        expression()
        if sym != ')':
            raise Exception("')' expected at " + str(pos))
        nxt()
        return
    raise Exception("invalid character at " + str(pos))

def parse(s: str):
    """Check that `s` is a well-formed regular expression.

    Resets the scanner to the start of `s`, recognises one expression and
    raises Exception if any input is left over afterwards.
    """
    global src, pos
    src = s
    pos = 0
    nxt()
    expression()
    if sym == chr(0):  # whole input consumed
        return
    raise Exception("unexpected character at " + str(pos))

# Negative examples, kept commented out: each one raises the exception noted beside it.
# (The backslash is doubled so that the literal is a valid Python string when uncommented.)
#parse("a\\$") # Exception: invalid escaped character at 3
#parse("a(b") # Exception: ')' expected at 3
#parse("a(" + chr(5) + ")") # invalid character at 3
#parse("a" + chr(5)) # unexpected character at 2
parse("(a*)*abcc")  # well-formed input: returns without raising

In [3]:
# Characters that stand for themselves, grouped by ASCII range.
PlainChars = (
    ' !"#$%&\',-./'                 # punctuation before the digits
    '0123456789'
    ':;<=>@'
    'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
    '[]^_`'
    'abcdefghijklmnopqrstuvwxyz'
    '{}~'
)
# Characters that stand for themselves only when preceded by a backslash.
EscapedChars = '()*+?\\|'
# Symbols that can start a factor: a plain character, a backslash, or '('.
FirstFactor = PlainChars + '\\('

# Scanner state shared by all parsing procedures:
src: str  # the input being parsed
pos: int  # index of the next unread character of src
sym: str  # the current lookahead symbol

def nxt():
    """Move the lookahead `sym` to the next character of `src`.

    At the end of the input `sym` is set to chr(0) and `pos` no longer advances.
    """
    global pos, sym
    exhausted = pos >= len(src)
    if exhausted:
        sym = chr(0)  # end-of-input symbol
    else:
        sym = src[pos]
        pos = pos + 1

def expression():
    """Parse  expression(e) → term(e) { '|' term(f) « e := Choice(e, f) » }  and return e."""
    tree = term()
    while True:
        if sym != '|':
            return tree
        nxt()
        # Alternatives associate to the left: ((a|b)|c).
        tree = Choice(tree, term())

def term():
    """Parse  term(e) → factor(e) { factor(f) « e := Conc(e, f) » }  and return e."""
    tree = factor()
    while sym in FirstFactor:
        right = factor()
        # Concatenation associates to the left: ((ab)c).
        tree = Conc(tree, right)
    return tree

def factor():
    """Parse a factor and return its syntax tree.

    factor(e) → atom(e) [ '*' « e := Star(e) » | '+' « e := Conc(e, Star(e)) » | '?' « e := Choice(e, '') » ]
    """
    operand = atom()
    if sym == '*':
        nxt()
        return Star(operand)
    if sym == '+':
        nxt()
        return Conc(operand, Star(operand))  # e+ is expanded to e e*
    if sym == '?':
        nxt()
        return Choice(operand, '')  # e? is expanded to e | ''
    return operand

def atom():
    """Parse  atom(e) → plainchar(e) | escapedchar(e) | '(' expression(e) ')'  and return e.

    A plain or escaped character is returned as a one-character string; a
    parenthesised expression is returned as the tree of the inner expression.
    Raises Exception with the current position on an invalid character, an
    invalid escape, or a missing ')'.
    """
    ch = sym
    if ch in PlainChars:
        nxt()
        return ch
    if ch == '\\':
        nxt()
        escaped = sym
        if escaped not in EscapedChars:
            raise Exception("invalid escaped character at " + str(pos))
        nxt()
        return escaped
    if sym == '(':
        nxt()
        inner = expression()
        if sym != ')':
            raise Exception("')' expected at " + str(pos))
        nxt()
        return inner
    raise Exception("invalid character at " + str(pos))

def parse(s: str):
    """Parse the regular expression `s` and return its abstract syntax tree.

    Raises Exception if `s` is malformed or if input remains after a complete
    expression has been read.
    """
    global src, pos
    src = s
    pos = 0
    nxt()
    tree = expression()
    if sym == chr(0):  # whole input consumed
        return tree
    raise Exception("unexpected character at " + str(pos))

Here are some test cases:

In [4]:
# Negative examples, kept commented out: each one raises the exception noted beside it.
# (The backslash is doubled so that the literal is a valid Python string when uncommented.)
#parse("a\\$") # Exception: invalid escaped character at 3
#parse("a(b") # Exception: ')' expected at 3
#parse("a(" + chr(5) + ")") # invalid character at 3
#parse("a" + chr(5)) # unexpected character at 2
# Positive examples: concatenation nests to the left and '*' binds tighter than
# concatenation, which binds tighter than '|'.
assert str(parse("(a*)*abcc")) == '((((((a)*)*a)b)c)c)'
assert str(parse("a|b*c")) == '(a|((b)*c))'

Let's have some fun and use this to check the equivalence of regular expressions. The following cells contain code from Chapters 2 and 4:

In [5]:
class set(frozenset):
    """Immutable, hashable set that prints with braces.

    Deliberately shadows the builtin name so that `set(...)` yields values that
    can themselves be members of sets (needed for sets of states).
    """

    def __repr__(self):
        members = map(str, self)
        return '{' + ', '.join(members) + '}'

class FiniteStateAutomaton:
    """Finite state automaton given by its five components.

    T is the vocabulary, Q the states, R the transitions as (state, symbol,
    state) triples, q0 the initial state and F the final states.
    """

    def __init__(self, T, Q, R, q0, F):
        self.T = T
        self.Q = Q
        self.R = R
        self.q0 = q0
        self.F = F

    def __repr__(self):
        # Line 1: initial state; line 2: final states; then one transition per line.
        initial = str(self.q0)
        finals = ' '.join(map(str, self.F))
        rules = '\n'.join(' '.join((str(q), a, '→', str(r))) for (q, a, r) in self.R)
        return '\n'.join((initial, finals, rules))

def REToFSA(re) -> FiniteStateAutomaton:
    """Translate the regular-expression tree `re` into a finite state automaton.

    `re` is the empty string, any other string (used as the label of a single
    transition), or a Choice, Conc or Star node. Fresh state numbers are drawn
    from the global counter QC, which must be initialised by the caller. No
    ε-transitions are produced: sub-automata are combined by re-rooting or
    copying the transitions that leave an initial state.

    Raises Exception('not a regular expression') for any other kind of `re`.
    """
    global QC
    # empty word: a single state that is both initial and final
    if re == '': q = QC; QC += 1; return FiniteStateAutomaton(set(), {q}, set(), q, {q})
    elif type(re) == str:
        # symbol: initial state q, final state r, one transition q → r labelled re
        q = QC; QC += 1; r = QC; QC += 1
        return FiniteStateAutomaton({re}, {q, r}, {(q, re, r)}, q, {r})
    elif type(re) == Choice:
        # both automata share A1's initial state
        A1, A2 = REToFSA(re.e1), REToFSA(re.e2)
        R2 = {(A1.q0 if q == A2.q0 else q, a, r) for (q, a, r) in A2.R} # A2.q0 renamed to A1.q0 in A2.R
        F2 = {A1.q0 if q == A2.q0 else q for q in A2.F} # A2.q0 renamed to A1.q0 in A2.F
        return FiniteStateAutomaton(A1.T | A2.T, A1.Q | A2.Q, A1.R | R2, A1.q0, A1.F | F2)
    elif type(re) == Conc:
        A1, A2 = REToFSA(re.e1), REToFSA(re.e2)
        # transitions leaving A2's initial state are copied to every final state of A1;
        # all other transitions of A2 are kept as they are
        R = A1.R | {(f, a, r) for (q, a, r) in A2.R if q == A2.q0 for f in A1.F} | \
            {(q, a, r) for (q, a, r) in A2.R if q != A2.q0}
        # A1's final states stay final only if A2 accepts the empty word
        F = (A2.F - {A2.q0}) | (A1.F if A2.q0 in A2.F else set())
        return FiniteStateAutomaton(A1.T | A2.T, A1.Q | A2.Q, R, A1.q0, F)
    elif type(re) == Star:
        A = REToFSA(re.e)
        # from every final state the automaton may start over: copy the transitions leaving A.q0
        R = A.R | {(f, a, r) for (q, a, r) in A.R if q == A.q0 for f in A.F}
        # the initial state becomes final so that the empty word is accepted
        return FiniteStateAutomaton(A.T, A.Q, R, A.q0, {A.q0} | A.F)
    else: raise Exception('not a regular expression')

def convertRegExToFSA(re) -> FiniteStateAutomaton:
    """Convert the regular-expression tree `re` to an automaton whose states are numbered from 0."""
    global QC
    QC = 0  # restart the state counter used by REToFSA
    automaton = REToFSA(re)
    return automaton

In [6]:
def deterministicFSA(fsa: FiniteStateAutomaton, trace = False) -> FiniteStateAutomaton:
    """Return a deterministic automaton for `fsa` built by the subset construction.

    Every state of the result is a set of states of `fsa`; only sets reachable
    from {fsa.q0} are created. With `trace` set, the states, transitions and
    visited states are printed initially and after each processed state.
    """
    qʹ0 = set({fsa.q0})
    Qʹ, Rʹ, visited = {qʹ0}, set(), set()
    if trace: print(Qʹ, Rʹ, visited)
    while visited != Qʹ:
        # pick some state set that has not been expanded yet
        qʹ = (Qʹ - visited).pop(); visited |= {qʹ}
        for t in fsa.T:
            # all states of fsa reachable on t from some member of qʹ
            rʹ = {r for (q, u, r) in fsa.R if u == t and q in qʹ}
            # an empty successor set adds neither a state nor a transition
            if rʹ != set(): Qʹ |= {set(rʹ)}; Rʹ |= {(qʹ, t, set(rʹ))}
        if trace: print(Qʹ, Rʹ, visited)
    # a state set is final if it contains at least one final state of fsa
    Fʹ = {qʹ for qʹ in Qʹ for f in fsa.F if f in qʹ}
    return FiniteStateAutomaton(fsa.T, Qʹ, Rʹ, qʹ0, Fʹ)

def accepts(fsa: FiniteStateAutomaton, τ: str) -> bool:
    """Run `fsa` on the input `τ` and report whether it ends in a final state.

    The transitions are read as a function from (state, symbol) to state; the
    input is rejected as soon as no transition is defined for the next symbol.
    """
    step = {(q, a): r for (q, a, r) in fsa.R}
    state = fsa.q0
    for symbol in τ:
        key = (state, symbol)
        if key not in step:
            return False
        state = step[key]
    return state in fsa.F

Now we add one boolean function, `equalRegEx`, that takes two strings, parses them as regular expressions, converts them to finite state machines, makes those deterministic, minimizes them, and finally compares them for equivalence.

In [7]:
def minimizeFSA(fsa: FiniteStateAutomaton) -> FiniteStateAutomaton:
    """Merge the indistinguishable states of `fsa` and return the resulting automaton.

    `dist` holds the ordered pairs of states known to be distinguishable. It
    starts with all pairs in which exactly one state is final and grows until a
    complete pass over all pairs adds nothing. Each state of the result is the
    set of states of `fsa` that could not be told apart.
    NOTE(review): the lookups through δ assume at most one transition per
    (state, symbol), i.e. a deterministic `fsa` — confirm with callers.
    """
    δ = {(q, a): r for (q, a, r) in fsa.R}
    dist = {(q, r) for q in fsa.Q for r in fsa.Q if q != r and (q in fsa.F) != (r in fsa.F)}
    done = False
    while not done:
        done = True #; print(dist)
        for q in fsa.Q:
            for r in fsa.Q:
                # q and r are distinguishable if for some symbol u only one of them has a
                # transition, or both have one and their successors are distinguishable
                if q != r and (q, r) not in dist and any(((q, u) in δ) != ((r, u) in δ) or \
                    ((q, u) in δ) and ((δ[(q, u)], δ[(r, u)]) in dist) for u in fsa.T):
                    dist |= {(q, r)}; done = False #; print('adding', q, r)
    # each new state is a state of fsa together with everything not distinguishable from it
    Qʹ = {set({q} | {r for r in fsa.Q if (q, r) not in dist}) for q in fsa.Q}
    # two new states are connected on u if some of their members are
    Rʹ = {(qʹ, u, rʹ) for qʹ in Qʹ for rʹ in Qʹ for u in fsa.T if any((q, u, r) in fsa.R for q in qʹ for r in rʹ)}
    qʹ0 = {qʹ for qʹ in Qʹ if fsa.q0 in qʹ}.pop()
    Fʹ = {qʹ for qʹ in Qʹ if (qʹ & fsa.F) != set()}
    return FiniteStateAutomaton(fsa.T, Qʹ, Rʹ, qʹ0, Fʹ)

def totalFSA(A: FiniteStateAutomaton, t = -1, T = None) -> FiniteStateAutomaton:
    """Return a total version of the deterministic automaton `A`.

    Every missing transition is directed to the trap state `t`, which is added
    to the states (with a loop on every symbol) only if it is actually needed.

    `T` is the vocabulary over which the result is total. The symbols of `A`
    itself are always included, so no transition of `A` falls outside the
    vocabulary of the result. If `T` is omitted, the lower-case letters
    'a' .. 'z' are used as before.
    """
    base = set('abcdefghijklmnopqrstuvwxyz') if T is None else set(T)
    T = base | set(A.T) # T is vocabulary, t is trap state
    defined = {(q, a) for (q, a, r) in A.R} # pairs that already have a transition
    R = A.R | {(q, a, t) for q in A.Q for a in T if (q, a) not in defined}
    if any(r == t for (q, a, r) in R): # transition to t exists
        Q = A.Q | {t}
        R = R | {(t, a, t) for a in T}
    else: Q = A.Q
    return FiniteStateAutomaton(T, Q, R, A.q0, A.F)

def equivalentFSA(a: FiniteStateAutomaton, aʹ: FiniteStateAutomaton, printMap = False) -> bool:
    """Decide whether the deterministic automata `a` and `aʹ` accept the same language.

    Both automata are minimized and made total over the union of their
    vocabularies; they are equivalent exactly if a one-to-one mapping `m` of the
    states of `a` to those of `aʹ` exists that maps the initial state to the
    initial state and is compatible with all transitions and with the final
    states. With `printMap` set, the mapping is printed.
    """
    a, aʹ = minimizeFSA(a), minimizeFSA(aʹ)
    # Both automata must be total over the same symbols; a symbol known to only
    # one of them leads the other one into its trap state.
    T = set(a.T) | set(aʹ.T)
    a, aʹ = totalFSA(a, T = T), totalFSA(aʹ, T = T)
    δ = {(q, u): r for (q, u, r) in a.R}
    δʹ = {(q, u): r for (q, u, r) in aʹ.R}
    m, v = {a.q0: aʹ.q0}, {a.q0} # m is the mapping so far, v the states still to visit
    while v:
        q = v.pop(); qʹ = m[q]
        for u in a.T:
            r, rʹ = δ[(q, u)], δʹ[(qʹ, u)]
            if r in m:
                if m[r] != rʹ: return False # r would have to be mapped to two states
            elif rʹ in m.values(): return False # two states would be mapped to rʹ
            else: v.add(r); m[r] = rʹ
    if printMap: print(m)
    return aʹ.F == {m[q] for q in a.F}

In [8]:
def equalRegEx(E1, E2):
    """Return whether the regular expressions given by the strings `E1` and `E2` are equivalent.

    Each string is parsed, converted to a finite state automaton and made
    deterministic; the two automata are then compared with equivalentFSA.
    """
    automata = [deterministicFSA(convertRegExToFSA(parse(E))) for E in (E1, E2)]
    return equivalentFSA(*automata)

You may use the test cases below to check your implementation of `parse`:

In [9]:
# Algebraic identities of '*', '+' and '?' that must hold if `parse` builds the right trees.
assert equalRegEx('a+', '(a+)+')
assert equalRegEx('(a+)*', '(a*)+')
assert equalRegEx('(a+)*', 'a*')
assert equalRegEx('aa*', 'a*a')
assert equalRegEx('a*', '(a+)?')
assert equalRegEx('a*', '(a?)+')
assert equalRegEx('a?', '(a?)?')
assert equalRegEx('(a*b*)*', '(a|b)*')
# Negative case: 'ba' is matched by (a|b)* but not by a*b*.
assert not equalRegEx('a*b*', '(a|b)*')

Let us extend regular expressions with counted repetitions, for example:
- `a{3}` is `a` repeated 3 times, i.e. `aaa`;
- `a{3,}` is `a` repeated at least 3 times, i.e. `aaaa*`
- `a{2,4}` is `a` repeated 2, 3, or 4 times, i.e. `aa | aaa | aaaa`.

JupyterLab supports counted repetitions: you can try this out by selecting `Find...` and then clicking on `Use regex`. 

We let `{...}` bind like the other postfix operators, `*`, `+`, and `?`. That is:

    E₁|E₂{d} = E₁|(E₂{d})
    E₁ E₂{d} = E₁ (E₂{d})

In the extended grammar, `{` and `}` are now escaped characters:

    expression  →  term { '|' term }
    term  →  factor { factor }
    factor → atom [ '*' | '+' | '?' | '{' integer [',' [integer]] '}' ]
    atom  →  plainchar | escapedchar | '(' expression ')'
    plainchar  →  ' ' | '!' | '"' | '#' | '$' | '%' | '&' | '\'' | ',' | '-' | '.' | '/' |
         '0' | '1' | '2' | '3' | '4' | '5' | '6' | '7' | '8' | '9' | ':' | ';' | '<' | '=' | '>' | 
         '@' | 'A' | 'B' | 'C' | 'D' | 'E' | 'F' | 'G' | 'H' | 'I' | 'J' | 'K' | 'L' | 'M' | 'N' | 'O' |
         'P' | 'Q' | 'R' | 'S' | 'T' | 'U' | 'V' | 'W' | 'X' | 'Y' | 'Z' | '[' | ']' | '^' | '_' |
         '`' | 'a' | 'b' | 'c' | 'd' | 'e' | 'f' | 'g' | 'h' | 'i' | 'j' | 'k' | 'l' | 'm' | 'n' | 'o' |
         'p' | 'q' | 'r' | 's' | 't' | 'u' | 'v' | 'w' | 'x' | 'y' | 'z' | '~'
    escapedchar  → '\\' ( '(' | ')' | '*' | '+' | '?' | '\\' | '|' | '{' | '}' )
    integer  →  digit {digit}
    digit  →  '0' | '1' | '2' | '3' | '4' | '5' | '6' | '7' | '8' | '9'

Extend the attribute grammar above to construct an abstract syntax tree of extended regular expressions. The type of the nodes does not need to be extended. Rather, counted repetitions are expanded while parsing, similarly to `+`. As a hint, for EBNF expression `[E]` attribute rules can be added by `(E «S» | «T»)`, meaning that attributes are calculated according to `S` if `E` is present and according to `T` otherwise. [2 points]

    expression(e)  →  term(e) { '|' term(f) « e := Choice(e, f) »  }
    term(e)  →  factor(e) { factor(f) « e := Conc(e, f) » }
    factor(e) → atom(e) [ '*' « e := Star(e) » | '+' « e := Conc(e, Star(e)) » | '?' « e := Choice(e, '') » | '{' integer(n) ( ',' ( integer(m) « e := repeat_nm(e, n, m) » | « e := Conc(repeat_n(e, n), Star(e)) » ) | « e := repeat_n(e, n) » ) '}' ]
    atom(e)  →  plainchar(e) | escapedchar(e) | '(' expression(e) ')'
    plainchar(e)  →  ' ' « e := ' ' » | ... | '~' « e := '~' »
    escapedchar(e)  → '\\' ( '(' « e := '(' » | ')' « e := ')' » | ... | '|' « e := '|' » | '{' « e := '{' » | '}' « e := '}' » )
    integer(n) → digit(n) { digit(d) « n := 10 × n + d » }
    digit(d) → '0' « d := 0 » | … | '9' « d := 9 »

 - The auxiliary function `repeat_n(e, n)` builds the concatenation of `n` copies of expression `e` using `Conc`; for `n = 0` it yields the empty string `''`.
 - The auxiliary function `repeat_nm(e, n, m)` builds the `Choice` of expression `e` repeated `n`, `n + 1`, …, `m` times.

Now extend the parser to construct the abstract syntax tree of extended regular expressions [4 points]

In [14]:
# In the extended grammar '{' and '}' delimit counted repetitions: they are no
# longer plain characters and have to be escaped to stand for themselves.
PlainChars = ' !"#$%&\',-./0123456789:;<=>@ABCDEFGHIJKLMNO' + \
                       'PQRSTUVWXYZ[]^_`abcdefghijklmnopqrstuvwxyz~'
EscapedChars = '()*+?\\|{}'
# Symbols that can start a factor: a plain character, a backslash, or '('.
FirstFactor = PlainChars + '\\('
Digits = '0123456789'

# Scanner state: input string, index of the next unread character, lookahead symbol.
src: str; pos: int; sym: str

def nxt():
    """Read the next input character into the lookahead `sym`.

    `pos` is advanced past the character read; at the end of `src` the
    lookahead becomes chr(0) and `pos` stays where it is.
    """
    global pos, sym
    if pos >= len(src):
        sym = chr(0)  # end-of-input symbol
        return
    sym = src[pos]
    pos += 1

def expression():
    """Parse  expression(e) → term(e) { '|' term(f) « e := Choice(e, f) » }  and return e."""
    tree = term()
    while True:
        if sym != '|':
            return tree
        nxt()
        # Alternatives associate to the left.
        tree = Choice(tree, term())

def term():
    """Parse  term(e) → factor(e) { factor(f) « e := Conc(e, f) » }  and return e."""
    tree = factor()
    while sym in FirstFactor:
        right = factor()
        # Concatenation associates to the left.
        tree = Conc(tree, right)
    return tree

def factor():
    """Parse an atom with an optional postfix operator and return its syntax tree.

    Besides '*', '+' and '?', the counted repetitions '{n}', '{n,}' and '{n,m}'
    are accepted; they are expanded into Conc, Star and Choice nodes while
    parsing. Raises Exception if the braces do not enclose a well-formed count
    or if the upper bound is smaller than the lower bound.
    """
    operand = atom()
    if sym == '*':
        nxt()
        return Star(operand)
    if sym == '+':
        nxt()
        return Conc(operand, Star(operand))
    if sym == '?':
        nxt()
        return Choice(operand, '')
    if sym != '{':
        return operand  # no postfix operator

    nxt()
    if sym not in Digits:
        raise Exception("invalid character at " + str(pos))
    low = int(integer())
    high = low  # '{n}' is treated like '{n,n}'
    if sym == ',':
        nxt()
        # -1 marks a missing upper bound, as in '{n,}'
        high = int(integer()) if sym in Digits else -1
    if sym != '}':
        raise Exception("'}' expected at " + str(pos))
    if low == high:
        nxt()
        return repeat_n(operand, low)
    if high == -1:
        nxt()
        return Conc(repeat_n(operand, low), Star(operand))
    if low < high:
        nxt()
        return repeat_nm(operand, low, high)
    raise Exception("quantifier range is out of order")

def atom():
    """Parse  atom(e) → plainchar(e) | escapedchar(e) | '(' expression(e) ')'  and return e.

    A plain or escaped character is returned as a one-character string; a
    parenthesised expression is returned as the tree of the inner expression.
    Raises Exception with the current position on an invalid character, an
    invalid escape, or a missing ')'.
    """
    ch = sym
    if ch in PlainChars:
        nxt()
        return ch
    if ch == '\\':
        nxt()
        escaped = sym
        if escaped not in EscapedChars:
            raise Exception("invalid escaped character at " + str(pos))
        nxt()
        return escaped
    if sym == '(':
        nxt()
        inner = expression()
        if sym != ')':
            raise Exception("')' expected at " + str(pos))
        nxt()
        return inner
    raise Exception("invalid character at " + str(pos))

def repeat_nm(e, n: int, m: int):
    """Return the choice of `e` repeated n, n + 1, ..., m times.

    The alternatives nest to the right, with the shortest repetition first.
    """
    shortest = repeat_n(e, n)
    if n == m:
        return shortest
    return Choice(shortest, repeat_nm(e, n + 1, m))

def integer():
    """Consume a run of digits from the input and return it as a string.

    The result is the empty string if the lookahead is not a digit.
    """
    digits = []
    while sym in Digits:
        digits.append(sym)
        nxt()
    return ''.join(digits)

def repeat_n(e, n: int):
    """Return the concatenation of `n` copies of `e`.

    Zero copies are the empty string ''; the concatenations nest to the left,
    with '' as the innermost operand.
    """
    return Conc(repeat_n(e, n - 1), e) if n != 0 else ''

def parse(s: str):
    """Parse the extended regular expression `s` and return its abstract syntax tree.

    Raises Exception if `s` is malformed or if input remains after a complete
    expression has been read.
    """
    global src, pos
    src = s
    pos = 0
    nxt()
    tree = expression()
    if sym == chr(0):  # whole input consumed
        return tree
    raise Exception("unexpected character at " + str(pos))

Here are some test cases:

In [15]:
# Shape of the trees built for counted repetitions: repeat_n nests to the left
# around the empty string, which does not show up in the printed form.
assert str(parse('a{0}')) == ''
assert str(parse('a{1}')) == '(a)'
assert str(parse('a{10}')) == '((((((((((a)a)a)a)a)a)a)a)a)a)'
assert str(parse('(ab){2}')) == '(((ab))(ab))'
assert str(parse('a{1,2}')) == '((a)|((a)a))'
assert str(parse('a{2,}')) == '(((a)a)(a)*)'

# Counted repetitions expressed with the classic operators.
assert equalRegEx('a{0,}', 'a*')
assert equalRegEx('a{1,}', 'a+')
assert equalRegEx('a{1,1}', 'a')
assert equalRegEx('a{0,1}', 'a?')
# NOTE(review): the next assertion repeats the previous one verbatim.
assert equalRegEx('a{0,1}', 'a?')
assert equalRegEx('a{0,2}', 'a?|aa')
assert equalRegEx('a{1,3}', 'a|aa|aaa')

# 'b', then at least three 'o', then 'h' — and nothing before or after.
GhostSpeak = deterministicFSA(convertRegExToFSA(parse('bo{3,}h')))
assert not accepts(GhostSpeak, 'booo')
assert not accepts(GhostSpeak, 'booh')
assert accepts(GhostSpeak, 'boooh')
assert accepts(GhostSpeak, 'booooooooh')
assert not accepts(GhostSpeak, 'bboooohhh')
assert not accepts(GhostSpeak, 'booh boooh')