In [34]:
# input from user
# regex = input("Enter your regex: ")

operators = {
    "+",
    "*",
    "(",
    ")",
    "|",
    ".",
    "[",
    "]",
    "&",
}

In [35]:
# a-z A-z 0-9
alpha_numerics = set(
    [chr(i) for i in range(48, 58)]
    + [chr(i) for i in range(65, 91)]
    + [chr(i) for i in range(97, 123)]
)

In [36]:
def is_valid_regex(in_regex):
    return (
        all([c in alpha_numerics or c in operators for c in in_regex])
        and in_regex.count("(") == in_regex.count(")")
        and in_regex.count("[") == in_regex.count("]")
    )

In [37]:
is_valid_regex("(regex][)")

True

In [49]:
# precedence from highest to lowest
# Closure (Kleene star) a*
# Concatenation ab
# Union a+b
# If the input symbol is a letter… append it directly to the output queue
# If the input symbol is an operator… if there exists an operator already on the top of the operator stack with higher

# or equal precedence than our current input symbol, remove the operator from the top of the operator stack and append it

# to the output queue.

# Do this until the current input symbol has a higher precedence than the symbol on the top of the operator stack,

# or the operator stack is empty.

# If the input symbol is an operator AND there is a left parenthesis on top of the stack… append the input symbol onto

# the stack on top of the left parenthesis.

# If the input symbol is an ( … append it to the operator stack

# If the input symbol is an ) … pop all operators from the operator stack and append them to the output queue

# until you find an ( . Then, you can remove both of those parentheses and continue with the algorithm.

operators_precedence = {
    "*": 6,
    "&": 5,
    "+": 4,
    "?": 3,
    ".": 2,
    "|": 1,
}


def validate_range(range_regex):
    valid_ranges = [range(48, 58), range(65, 91), range(97, 123)]
    index = 1

    while index < len(range_regex) - 1:
        current_char = range_regex[index]

        if current_char in alpha_numerics:
            index += 1
        elif current_char in operators_precedence.keys():
            return False
        elif current_char == "-":
            prev_char = range_regex[index - 1]
            next_char = range_regex[index + 1]
            if not (
                prev_char in alpha_numerics
                and next_char in alpha_numerics
                and ord(prev_char) < ord(next_char)
                and any(
                    [
                        set(range(ord(prev_char), ord(next_char))).issubset(ranges)
                        for ranges in valid_ranges
                    ]
                )
            ):
                return False
            index += 2
    return True


def regex_to_postfix(regex):

    regex = list(regex)
    stack = []
    output_queue = []
    index = 0

    while index < len(regex):
        char = regex[index]

        if char in alpha_numerics or len(char) > 1:
            output_queue.append(char)

        elif char in operators:
            if char == "[":
                # if no matching closing parenthesis
                if not "]" in regex[index:]:
                    return False

                else:
                    range_re = "".join(regex[index : regex.index("]") + 1])
                    if validate_range(range_re):
                        output_queue.append(range_re)
                        index = regex.index("]") + 1
                        continue
                    return False
                    # print("range_re", range_re)

            if stack:
                if (
                    stack[-1] in operators_precedence.keys()
                    and char in operators_precedence.keys()
                ):
                    while (
                        stack
                        and stack[-1] in operators_precedence.keys()
                        and operators_precedence[stack[-1]]
                        >= operators_precedence[char]
                    ):
                        output_queue.append(stack.pop())

                    stack.append(char)

                elif char in operators_precedence.keys() and stack[-1] == "(":
                    stack.append(char)

                elif char == "(":
                    stack.append(char)

                elif char == ")":
                    while stack and stack[-1] != "(":
                        output_queue.append(stack.pop())

                    else:
                        if stack[-1] == "(":
                            stack.pop()
                        else:
                            return False

            elif char != "[" and char != "]":
                stack.append(char)

        index += 1

    if "(" in stack or ")" in stack:
        return False

    while stack:
        top = stack[-1]
        if top == "(":
            return False
        output_queue.append(stack.pop())
    return output_queue


# lw char b3do char
# lw char brdo bracket
# lw * b3dha char aw ( [
# lw + b3dha char aw ( [ => a+(bc) => a+.(b.c),
# lw ] aw ) b3do char =>  a+([a-z]a) => a+.([a-z].a)


def add_concatenation(regex):
    regex = list(regex)

    i = 1
    while i < len(regex):
        current_char = regex[i - 1]
        next_char = regex[i]
        if current_char == "[":
            # skip to close
            i = regex[i:].index("]") + i + 1
            continue
        elif current_char in alpha_numerics:
            if next_char in alpha_numerics or next_char in ["[", "("]:
                regex.insert(i, "&")
                i += 2
            else:
                i += 1
        elif current_char in ["*", "+"] and next_char in ["[", "("]:
            regex.insert(i, "&")
            i += 2

        elif current_char in ["]", ")"] and next_char in alpha_numerics:
            regex.insert(i, "&")
            i += 2
        else:
            i += 1

    return regex


# regex = "(A+.B*)?.(C|D)"
# regex = "[xA-c09]"
# regex = "ab+A*"
# regex = "ab+[a-c1-0]*"
regex = "ab(ab|(a|b)a)"


# print("".join(regex_to_postfix(regex)))

# regex = "ab+A*"
# abA*+

print("infix: ", "".join(add_concatenation(regex)))
print("postfix: ", "".join(regex_to_postfix(add_concatenation(regex))))

infix:  a&b&(a&b|(a|b)&a)
postfix:  ab&ab&ab|a&|&


In [39]:
regex = "ab+A*"

# ab*A+
print("".join(regex_to_postfix(regex)))

abA*+


In [40]:
regex

'ab+A*'

In [41]:
"".join(regex_to_postfix(regex))

'abA*+'

In [42]:
# Algorithm
# ( -> push to stack
# Alphanumeric -> append to output queue
# operator
# {
#     "*" : 5,
#     "+" : 4,
#     "?" : 3,
#     "." : 2,
#     "|" : 1
# }
# [abc] -> a | b | c
# ) -> pop from stack until you find (
# ranges -

In [43]:
#   new_range_regex = ["("]
#                     closing_index = regex[index:].index("]") + index
#                     index_range = index + 1

#                     while index_range < closing_index:
#                         char_range = regex[index_range]
#                         if char_range in alpha_numerics:
#                             next_char = regex[index_range + 1]
#                             if next_char == "]":
#                                 # new_range_regex.append("|")
#                                 new_range_regex.append(char_range)
#                                 break

#                             elif next_char in alpha_numerics:
#                                 new_range_regex.append(char_range)
#                                 new_range_regex.append("|")
#                                 index_range += 1

#                             elif next_char == "-":
#                                 end_range = regex[index_range + 2]
#                                 index_range += 2

#                                 if not end_range in alpha_numerics or ord(
#                                     end_range
#                                 ) < ord(char_range):
#                                     return False

#                                 to_append = char_range

#                                 while to_append != end_range:

#                                     if to_append not in alpha_numerics:  # like if A-a
#                                         return False

#                                     new_range_regex.append(to_append)
#                                     new_range_regex.append("|")
#                                     to_append = chr(ord(to_append) + 1)

#                                 else:
#                                     new_range_regex.append(to_append)
#                                     index_range += 1
#                                     if regex[index_range] in alpha_numerics:
#                                         new_range_regex.append("|")

#                         elif (
#                             char_range == "-"
#                             or char_range in operators_precedence.keys()
#                         ):
#                             return False

#                         else:
#                             index_range += 1

#                     new_range_regex.append(")")
#                     print(
#                         "NEW REGEX",
#                         # regex[:index],
#                         new_range_regex,
#                         # regex[closing_index + 1 :],
#                         sep="\n",
#                     )
#                     regex = regex[:index] + new_range_regex + regex[closing_index + 1 :]
#                     continue

In [44]:
validate_range("[a-b]")

True

In [45]:
"""
test cases
+++ -> invalid
"[a-cA-c09]"

ab[a-c0-3]* -> bayza

"""

'\ntest cases\n+++ -> invalid\n"[a-cA-c09]"\n\nab[a-c0-3]* -> bayza\n\n'

In [None]:
class node:
    def __init__(
        self,
        start,
    ):
        self.value = value
        self.next = None

In [None]:
def postfix_to_nfa(regex):
    pass