In [2]:
def prepare(code_string):
    return code_string.replace("\n", " ")

In [3]:
with open("code_example.txt", encoding = 'utf-8', mode="r") as f:
    code = prepare(f.read())
print(repr(code))

'"Начало"  "Первое" 110 88 8123 23 "Второе" 999.99, 111.111, "Конец второго" "Третье" vara, varb, varc  "Сочетаемое" 12 34  123: vara = 1 + [2 * 3] || 4  varb = 1 + [2 * 3] || 4  '


In [1]:
None.__repr__()

'None'

In [22]:
import re

#########################################
# Tokens
# Words
T_BEGIN = "BEGIN"
T_FIRST = "FIRST"
T_SECOND = "SECOND"
T_EOSECOND = "END OF SECOND"
T_THIRD = "THIRD"
T_COMBINED = "COMBINED"
# Values
T_REAL = "REAL"
T_INT = "INTEGER"
T_ID = "ID"
# Operators
T_PLUS = "PLUS"
T_MINUS = "MINUS"
T_MUL = "MUL"
T_DIV = "DIV"
T_COMMA = "COMMA"
T_LBR = "L BRACKET"
T_RBR = "R BRACKET"
T_EQ = "EQUALS"
T_AND = "AND"
T_OR = "OR"
# Other
T_EOF = "EOF"
T_MARK = "MARK"
T_TEMP_EXPR = "EXPRESSION"
#########################################

class Pattern(object):
    def __init__(self, re, type, token) -> None:
        super().__init__()
        self.re = re
        self.type = type
        self.token = token

# TODO fix errors with marks, variables, integers and real values 
PATTERNS = [
#########################################
# Other
    Pattern(R"^\d+:", "mark", T_MARK),
#########################################
# Words
    Pattern(R'^"Начало"', "word", T_BEGIN),
    Pattern(R'^"Первое"', "word", T_FIRST),
    Pattern(R'^"Второе"', "word", T_SECOND),
    Pattern(R'^"Конец второго"', "word", T_EOSECOND),
    Pattern(R'^"Третье"', "word", T_THIRD),
    Pattern(R'^"Сочетаемое"', "word", T_COMBINED),
#########################################
# Operands
    Pattern(R"^\d+.\d+", "operand", T_REAL),
    Pattern(R"^\d+", "operand", T_INT),
    Pattern(R"^[a-zA-Zа-яА-ЯеЁ](\d+|[a-zA-Zа-яА-ЯеЁ]+)*", "id", T_ID),
#########################################
# Operators
    # {"re": R"^(\+|\-|\*|\/|\|\||&&|\[|\]|=|,)", "type": "operator"},
    Pattern(R'^\+', "operator", T_PLUS),
    Pattern(R'^\-', "operator", T_MINUS),
    Pattern(R'^\*', "operator", T_MUL),
    Pattern(R'^\/', "operator", T_DIV),
    Pattern(R'^\=', "operator", T_EQ),
    Pattern(R'^\,', "operator", T_COMMA),
    Pattern(R'^\[', "operator", T_LBR),
    Pattern(R'^\]', "operator", T_RBR),
    Pattern(R'^\|\|', "operator", T_OR),
    Pattern(R'^&&', "operator", T_AND),
]

SKIP_PATTERNS = [
    R"^\\n+",
    R"^ "
]

class Token(object):
    def __init__(self, type, token, value):
        self.type = type
        self.token = token
        self.value = value

class Tokenizer:
    def __init__(self, code_string):
        self._code_string = code_string
        self._pos = 0

    def next_token_exists(self):
        return self._pos < len(self._code_string)

    def tokenize(self):
        tokens = []
        while self.next_token_exists():
            token = self.next_token()
            if token is not None:
                tokens.append(token)

        return tokens

    def next_token(self):
        current_string = self._code_string[self._pos:]

        for pattern in PATTERNS:
            match = re.match(pattern.re, current_string)
            if match:
                value = match.group(0)
                self._pos += len(value)
                return Token(pattern.type, pattern.token, value)
        
        for skip in SKIP_PATTERNS:
            match = re.match(skip, current_string)
            if match:
                value = match.group(0)
                self._pos += len(value)
                return None

        unrecognized_token = re.match(R'^.+s*|(\n)', current_string).group(0)
        raise Exception(f"Unrecognized token: {unrecognized_token} on {self._pos}")
        

In [23]:
T = Tokenizer(code)
for t in T.tokenize():
    print(t.__dict__)

{'type': 'word', 'token': 'BEGIN', 'value': '"Начало"'}
{'type': 'word', 'token': 'FIRST', 'value': '"Первое"'}
{'type': 'operand', 'token': 'REAL', 'value': '110 88'}
{'type': 'operand', 'token': 'REAL', 'value': '8123 23'}
{'type': 'word', 'token': 'SECOND', 'value': '"Второе"'}
{'type': 'operand', 'token': 'REAL', 'value': '999.99'}
{'type': 'operator', 'token': 'COMMA', 'value': ','}
{'type': 'operand', 'token': 'REAL', 'value': '111.111'}
{'type': 'operator', 'token': 'COMMA', 'value': ','}
{'type': 'word', 'token': 'END OF SECOND', 'value': '"Конец второго"'}
{'type': 'word', 'token': 'THIRD', 'value': '"Третье"'}
{'type': 'id', 'token': 'ID', 'value': 'vara'}
{'type': 'operator', 'token': 'COMMA', 'value': ','}
{'type': 'id', 'token': 'ID', 'value': 'varb'}
{'type': 'operator', 'token': 'COMMA', 'value': ','}
{'type': 'id', 'token': 'ID', 'value': 'varc'}
{'type': 'word', 'token': 'COMBINED', 'value': '"Сочетаемое"'}
{'type': 'operand', 'token': 'REAL', 'value': '12 34'}
{'type'

In [46]:
re.match(R"", "   sdf")