In [2]:
# input from user
# regex = input("Enter your regex: ")

operators = {
    "+",
    "*",
    "(",
    ")",
    "|",
    ".",
    "[",
    "]",
    "?",
}

In [3]:
# a-z A-z 0-9
alpha_numerics = set(
    [chr(i) for i in range(48, 58)]
    + [chr(i) for i in range(65, 91)]
    + [chr(i) for i in range(97, 123)]
)

In [4]:
def is_valid_regex(in_regex):
    return (
        all([c in alpha_numerics or c in operators for c in in_regex])
        and in_regex.count("(") == in_regex.count(")")
        and in_regex.count("[") == in_regex.count("]")
    )

In [5]:
is_valid_regex("(regex][)")

True

In [75]:
# precedence from highest to lowest
# Closure (Kleene star) a*
# Concatenation ab
# Union a+b
# If the input symbol is a letter… append it directly to the output queue
# If the input symbol is an operator… if there exists an operator already on the top of the operator stack with higher

# or equal precedence than our current input symbol, remove the operator from the top of the operator stack and append it

# to the output queue.

# Do this until the current input symbol has a higher precedence than the symbol on the top of the operator stack,

# or the operator stack is empty.

# If the input symbol is an operator AND there is a left parenthesis on top of the stack… append the input symbol onto

# the stack on top of the left parenthesis.

# If the input symbol is an ( … append it to the operator stack

# If the input symbol is an ) … pop all operators from the operator stack and append them to the output queue

# until you find an ( . Then, you can remove both of those parentheses and continue with the algorithm.

operators_precedence = {
    "*": 5,
    "+": 4,
    "?": 3,
    ".": 2,
    "|": 1,
}


def validate_range(range_regex):
    valid_ranges = [range(48, 58), range(65, 91), range(97, 123)]
    index = 1

    while index < len(range_regex) - 1:
        current_char = range_regex[index]

        if current_char in alpha_numerics:
            index += 1
        elif current_char in operators_precedence.keys():
            return False
        elif current_char == "-":
            prev_char = range_regex[index - 1]
            next_char = range_regex[index + 1]
            if not (
                prev_char in alpha_numerics
                and next_char in alpha_numerics
                and ord(prev_char) < ord(next_char)
                and any(
                    [
                        set(range(ord(prev_char), ord(next_char))).issubset(ranges)
                        for ranges in valid_ranges
                    ]
                )
            ):
                return False
            index += 2
    return True


def regex_to_postfix(regex):

    regex = list(regex)
    stack = []
    output_queue = []
    index = 0

    while index < len(regex):
        char = regex[index]

        if char in alpha_numerics or len(char) > 1:
            output_queue.append(char)

        elif char in operators:
            if char == "[":
                # if no matching closing parenthesis
                if not "]" in regex[index:]:
                    return False

                else:
                    range_re = "".join(regex[index : regex.index("]") + 1])
                    if validate_range(range_re):
                        output_queue.append(range_re)
                        index = regex.index("]") + 1
                        continue
                    return False
                    # print("range_re", range_re)

            if stack:
                if (
                    stack[-1] in operators_precedence.keys()
                    and char in operators_precedence.keys()
                ):
                    while (
                        stack
                        and stack[-1] in operators_precedence.keys()
                        and operators_precedence[stack[-1]]
                        >= operators_precedence[char]
                    ):
                        output_queue.append(stack.pop())

                    stack.append(char)

                elif char in operators_precedence.keys() and stack[-1] == "(":
                    stack.append(char)

                elif char == "(":
                    stack.append(char)

                elif char == ")":
                    while stack and stack[-1] != "(":
                        output_queue.append(stack.pop())

                    else:
                        if stack[-1] == "(":
                            stack.pop()
                        else:
                            return False

            elif char != "[" and char != "]":
                stack.append(char)

        index += 1

    if "(" in stack or ")" in stack:
        return False

    while stack:
        top = stack[-1]
        if top == "(":
            return False
        output_queue.append(stack.pop())
    return output_queue


# regex = "(A+.B*)?.(C|D)"
# regex = "[xA-c09]"
regex = "ab+[a-c1-0]*"
# regex = "ab+A*"


# print("".join(regex_to_postfix(regex)))

# regex = "ab+A*"
# abA*+

print(regex_to_postfix(regex))

False


In [29]:
regex = "ab+A*"

# ab*A+
print("".join(regex_to_postfix(regex)))

abA*+


In [7]:
regex

'ab[a-c0-3]*+'

In [8]:
"".join(regex_to_postfix(regex))

['[', 'a', '-', 'c', '0', '-', '3']


'abac03*+'

In [9]:
# Algorithm
# ( -> push to stack
# Alphanumeric -> append to output queue
# operator
# {
#     "*" : 5,
#     "+" : 4,
#     "?" : 3,
#     "." : 2,
#     "|" : 1
# }
# [abc] -> a | b | c
# ) -> pop from stack until you find (
# ranges -

In [41]:
#   new_range_regex = ["("]
#                     closing_index = regex[index:].index("]") + index
#                     index_range = index + 1

#                     while index_range < closing_index:
#                         char_range = regex[index_range]
#                         if char_range in alpha_numerics:
#                             next_char = regex[index_range + 1]
#                             if next_char == "]":
#                                 # new_range_regex.append("|")
#                                 new_range_regex.append(char_range)
#                                 break

#                             elif next_char in alpha_numerics:
#                                 new_range_regex.append(char_range)
#                                 new_range_regex.append("|")
#                                 index_range += 1

#                             elif next_char == "-":
#                                 end_range = regex[index_range + 2]
#                                 index_range += 2

#                                 if not end_range in alpha_numerics or ord(
#                                     end_range
#                                 ) < ord(char_range):
#                                     return False

#                                 to_append = char_range

#                                 while to_append != end_range:

#                                     if to_append not in alpha_numerics:  # like if A-a
#                                         return False

#                                     new_range_regex.append(to_append)
#                                     new_range_regex.append("|")
#                                     to_append = chr(ord(to_append) + 1)

#                                 else:
#                                     new_range_regex.append(to_append)
#                                     index_range += 1
#                                     if regex[index_range] in alpha_numerics:
#                                         new_range_regex.append("|")

#                         elif (
#                             char_range == "-"
#                             or char_range in operators_precedence.keys()
#                         ):
#                             return False

#                         else:
#                             index_range += 1

#                     new_range_regex.append(")")
#                     print(
#                         "NEW REGEX",
#                         # regex[:index],
#                         new_range_regex,
#                         # regex[closing_index + 1 :],
#                         sep="\n",
#                     )
#                     regex = regex[:index] + new_range_regex + regex[closing_index + 1 :]
#                     continue

In [70]:
validate_range("[a-b]")

True

In [11]:
"""
test cases
+++ -> invalid
"[a-cA-c09]"

ab[a-c0-3]* -> bayza

"""

'\ntest cases\n+++ -> invalid\n"[a-cA-c09]"\n\nab[a-c0-3]* -> bayza\n\n'

In [12]:
for i in range(10):
    print(i)
    if i == 5:
        i = 7

0
1
2
3
4
5
6
7
8
9


In [13]:
import re


def validateRegex(regex):
    try:
        re.compile(regex)
    except re.error:
        print(f"Invalid regular expression: {regex}")
        return False
    return True


class POSTFIX:
    def __init__(self, regex):
        self.regex = regex
        self.postfix = self.shunt_yard(regex)

    def get_postfix(self):
        return self.postfix

    def shunt_yard(self, regex):
        # Map each operator to its precedence level.
        # The operators supported by this code are * (kleene star), + (one or more), ? (zero or one), . (concatenation), and | (alternation).
        operators = {"*": 5, "+": 4, "?": 3, ".": 2, "|": 1}
        # postfix will eventually store the postfix notation of the input regular expression,.
        # stack is used as an intermediate stack in the Shunting Yard algorithm.
        postfix, stack = "", ""
        # Check if the regular expression contains any character classes (denoted by square brackets).
        # If a character class is found, the function converts it to an alternation between the characters inside the class.
        # For example, the character class [abc] would be converted to the regular expression (a|b|c).
        # This is done using a while loop that iterates over the characters of the regular expression, finds the opening and closing brackets of the character class, and replaces the contents of the class with an alternation.
        for i in range(len(regex)):
            c = regex[i]
            if c == "[":
                j = i + 1
                while regex[j] != "]":
                    if regex[j].isalnum() and regex[j + 1].isalnum():
                        regex = regex[: j + 1] + "|" + regex[j + 1 :]
                    j += 1

        # Replace all remaining square brackets with parentheses.
        # This is done because parentheses are used to group sub-expressions in regular expressions
        regex = regex.replace("[", "(")
        regex = regex.replace("]", ")")

        print("postfix1: ", regex)

        # Replace any hyphen character (-) in the regular expression with an alternation between the characters on either side of the hyphen.
        # For example, the expression a-z would be converted to (a|b|c|...|y|z).
        # This is done using another for loop that iterates over the characters of the regular expression, finds the hyphen character, and replaces it with an alternation.
        hyphen_count = regex.count("-")
        for i in range(hyphen_count):
            for j in range(len(regex)):
                c = regex[j]
                if c == "-":
                    final = regex[j + 1]
                    first = regex[j - 1]
                    temp_list = ""
                    for k in range(int(ord(final) - ord(first))):
                        temp_list = temp_list + "|"
                        char = chr(ord(first) + k + 1)
                        temp_list = temp_list + char
                    regex = regex[0:j] + temp_list + regex[j + 2 :]
                    break
        print("postfix2: ", regex)
        # Insert a concatenation operator (.) between any two adjacent characters(characters that are not operators), unless the characters are already separated by an operator, or the second character is an opening parenthesis.
        dotIndices = []
        for i in range(len(regex) - 1):
            startOps = [")", "*", "+", "*"]
            endOps = ["*", "+", ".", "|", ")"]
            if regex[i] in startOps and regex[i + 1] not in endOps:
                dotIndices.append(i)
            elif regex[i].isalnum() and (regex[i + 1].isalnum() or regex[i + 1] == "("):
                dotIndices.append(i)

        for i in range(len(dotIndices)):
            regex = (
                regex[: dotIndices[i] + 1 + i] + "." + regex[dotIndices[i] + 1 + i :]
            )
        print("postfix3: ", regex)
        # Iterate over each character of the regular expression and performs the Shunting Yard algorithm.
        for i in range(len(regex)):
            c = regex[i]
            # If the character is an opening parenthesis, push it onto the stack.
            if c == "(":
                stack = stack + c
            # If the character is a closing parenthesis, pop operators off the stack and append them to the postfix string until an opening parenthesis is found. Then pop the opening parenthesis from the stack.
            elif c == ")":
                while stack[-1] != "(":
                    # places the character at the end of the stack in the postfix expression
                    postfix = postfix + stack[-1]
                    # [:-1] denotes up to or including the last character
                    stack = stack[:-1]
                stack = stack[:-1]  # removes the open bracket in the stack

            # If the character is an operator, pop operators off the stack and append them to the postfix string as long as they have higher or equal precedence to the current operator. Then push the current operator onto the stack.
            elif c in operators:
                while stack and operators.get(c, 0) <= operators.get(stack[-1], 0):
                    postfix, stack = postfix + stack[-1], stack[:-1]
                stack = stack + c

            # If the character is a operand (i.e. not an operator or parenthesis), append it to the postfix string.
            else:
                postfix = postfix + c
        # After iterating over all characters of the regular expression, the function pops any remaining operators off the stack and appends them to the postfix string.
        while stack:
            postfix, stack = postfix + stack[-1], stack[:-1]
        print("postfix5: ", regex)

        # Finally, the function returns the postfix notation of the input regular expression.
        return postfix

In [14]:
# regex = "ab[a-c0-3]*"
postfix = POSTFIX(regex)
print("----------------------------------------------------------------")
print("regex:", regex)
print("----------------------------------------------------------------")
print("postfix: ", postfix.get_postfix())
print("----------------------------------------------------------------")

postfix1:  ab(a-c|0-3)*+
postfix2:  ab(a|b|c|0|1|2|3)*+
postfix3:  a.b.(a|b|c|0|1|2|3)*+
postfix5:  a.b.(a|b|c|0|1|2|3)*+
----------------------------------------------------------------
regex: ab[a-c0-3]*+
----------------------------------------------------------------
postfix:  ab.ab|c|0|1|2|3|*+.
----------------------------------------------------------------
