In [None]:
# so we want to parse input/strings such as "5 + 6 + 10"
# eventually we would want to move to "5+6 * 10+3" and parse that correctly

In [None]:
# for addition we do not need anything fancy

In [1]:
eval("5 + 6 + 7")  # dangerous if you do not control the string: eval() runs it as Python code!
# eval will lex the string, parse it, and then actually do the work (here: the summing)

18

In [2]:
text = "5 + 6 + 7"
# Lexical analysis, step 1: spaces are not significant here, so drop them all.
clean = "".join(text.split(" "))
# Lexical analysis, step 2 (tokenization): "+" is the only operator, so splitting on it yields the operands.
tokens = clean.split("+")
# No syntax tree is needed: every token is separated by "+", so evaluating is just summing.
result = sum(map(int, tokens))
result

18

In [3]:
def addIntrepreter(text):
    """Add up the integers of a "+"-separated expression such as "5 + 6 + 7".

    Spaces are removed first, then the text is split on "+" and every piece is
    converted with int(); a piece that is not a valid integer makes int() raise
    ValueError.
    """
    total = 0
    # "+" is the only operator, so the operands can simply be accumulated in order.
    for operand in text.replace(" ", "").split("+"):
        total += int(operand)
    return total

In [4]:
# Irregular spacing on purpose: all spaces are removed before the string is split on "+".
addIntrepreter("  5+5+10000+5   + 7 + 10  ")

10032

In [None]:
# How about subtraction? Then we already need some sort of structure; we could use a stack-based structure to store the operations.
# "10 - 5 + 3 - 2 + 20"  should be 26

In [None]:
# we could start again by stripping whitespace and similar, as it is not significant here
# optimization would be to skip cleaning and clean whitespace as we go
# then we could save the tokens in some sort of data structure (here stacks would work nicely)
# or we could interpret as we go (so sort of like REPL)

In [5]:
def sub_add(text, verbose=True):
    """Evaluate a flat chain of integer additions/subtractions, left to right.

    One-pass, scannerless parsing: the characters are read once, digits are
    collected into the current number, and each time an operator (or the end
    of the input) is reached the number just read is folded into the running
    total using the *previous* operator.

    Args:
        text: Expression such as "10 - 5 + 3 - 2 + 20". Whitespace is ignored.
        verbose: When true (the default, which keeps the historical output of
            this function), print the accumulator before and after each fold
            triggered by an operator.

    Returns:
        The integer result; 0 for empty or whitespace-only input.

    Raises:
        ValueError: On a character that is not a digit, "+", "-" or
            whitespace, or when an operator has no number in front of it
            or after it (e.g. "5 + - 3", "- 5", "5 +").
    """
    def fold(total, pending_op, digits):
        # Combine the number that was just read with the running total.
        if not digits:
            where = f"after {pending_op!r}" if pending_op else "before the first operator"
            raise ValueError(f"Missing number {where}")
        value = int(digits)
        # pending_op is "" for the very first number; total is still 0 then,
        # so adding is the same as assigning.
        return total - value if pending_op == "-" else total + value

    acc = 0
    tok = ""  # digits of the number currently being read
    op = ""   # operator waiting for its right-hand number ("" until the first one)
    for ch in text:
        if ch.isspace():  # whitespace is not significant
            continue
        if ch.isdigit():
            tok += ch
            continue
        if ch not in "+-":
            # Silently skipping here would turn "5.5 + 1" into 55 + 1.
            raise ValueError(f"Unexpected character {ch!r} in expression")
        if verbose:
            print(f"BEFORE operation  {acc} {op} {tok}")
        acc = fold(acc, op, tok)
        tok = ""
        if verbose:
            print(f"AFTER operation {op} {acc}")
        op = ch
    # Fold the last number. This also covers input with no operator at all
    # (e.g. "42"); only completely empty input skips it and yields 0.
    if op or tok:
        acc = fold(acc, op, tok)
    return acc
        

In [6]:
# Evaluated strictly left to right: ((((10 - 5) + 3) - 2) + 20) = 26
sub_add("10 - 5 + 3 - 2 + 20")


BEFORE operation  0  10
AFTER operation  10
BEFORE operation  10 - 5
AFTER operation - 5
BEFORE operation  5 + 3
AFTER operation + 8
BEFORE operation  8 - 2
AFTER operation - 6


26

In [None]:
# for more complicated operations we will need to build a syntax tree we can't just have a simple accumulator design, 
# above is only sufficient when we have left to right order of operations

# one example is given in this course
# https://ruslanspivak.com/lsbasi-part1/

## Full Arithmetic Parser


In [1]:
import re
from collections import namedtuple

# Grammar (EBNF):
# expression = term { ("+" | "-") term } ;
# term       = factor { ("*" | "/") factor } ;
# factor     = INTEGER | "(" expression ")" ;
# INTEGER    = [0-9]+ ;

Token = namedtuple('Token', ['type', 'value'])

# (token type, regular expression) pairs. They are tried in this order at each
# position because they are joined into one alternation below.
TOKEN_SPEC = [
    ('INTEGER', r'\d+'),
    ('PLUS',    r'\+'),
    ('MINUS',   r'-'),
    ('MUL',     r'\*'),
    ('DIV',     r'/'),
    ('LPAREN',  r'\('),
    ('RPAREN',  r'\)'),
    ('WS',      r'\s+'),
]

# One named group per token type, so `lastgroup` tells which one matched.
master_pattern = re.compile(
    '|'.join(f'(?P<{name}>{pattern})' for name, pattern in TOKEN_SPEC)
)

def tokenize(text):
    """Generate tokens from the input text.

    Yields one Token(type, value) per lexeme, skipping whitespace, and
    finishes with a Token('EOF', '') sentinel.

    Raises:
        SyntaxError: When a character matches none of the patterns in
            TOKEN_SPEC. Because this is a generator, the error surfaces
            when the offending position is reached during iteration.
    """
    pos = 0
    end = len(text)
    while pos < end:
        # Anchor every match at `pos`: finditer() would silently jump over
        # characters no pattern matches, so "3 + 4 $" would be accepted.
        mo = master_pattern.match(text, pos)
        if mo is None:
            raise SyntaxError(f"Unexpected character {text[pos]!r} at position {pos}")
        pos = mo.end()
        kind = mo.lastgroup
        if kind == 'WS':
            continue
        yield Token(kind, mo.group())
    yield Token('EOF', '')

# AST nodes
# NOTE(review): `typedef` is never read in the visible code; looks like a leftover placeholder - confirm before removing.
typedef = None
class AST:
    """Common base class of all syntax-tree nodes; adds no behaviour of its own."""


class BinOp(AST):
    """Binary operation node: a left operand, an operator and a right operand."""

    def __init__(self, left, op, right):
        # `op` is the operator's token type: 'PLUS', 'MINUS', 'MUL', or 'DIV'
        self.left, self.op, self.right = left, op, right


class Num(AST):
    """Integer literal node."""

    def __init__(self, value):
        # Stored as an int; int() raises ValueError if `value` is not a valid integer.
        self.value = int(value)

# Parser with operator precedence
class Parser:
    """Recursive-descent parser that turns a token stream into an AST.

    Precedence comes from the grammar layering: an expression is built from
    terms (joined by + and -), a term from factors (joined by * and /), and a
    factor is an integer or a parenthesised expression. Operators of the same
    level associate to the left.
    """

    def __init__(self, tokens):
        # Accept any iterable of tokens and pre-load the first one as lookahead.
        self.tokens = iter(tokens)
        self.current_token = next(self.tokens)

    def eat(self, token_type):
        """Consume the current token if it has `token_type`, else raise SyntaxError."""
        if self.current_token.type != token_type:
            raise SyntaxError(f"Expected {token_type}, got {self.current_token.type}")
        self.current_token = next(self.tokens)

    def parse(self):
        """Parse one complete expression; anything left before EOF is an error."""
        tree = self.parse_expression()
        if self.current_token.type == 'EOF':
            return tree
        raise SyntaxError("Unexpected token after expression")

    def _parse_left_assoc(self, parse_operand, operator_types):
        # Shared loop for both precedence levels:
        #   operand { operator operand }   folded into a left-leaning BinOp chain.
        tree = parse_operand()
        while self.current_token.type in operator_types:
            operator = self.current_token.type
            self.eat(operator)
            tree = BinOp(tree, operator, parse_operand())
        return tree

    def parse_expression(self):
        """expression = term { (+|-) term }"""
        return self._parse_left_assoc(self.parse_term, ('PLUS', 'MINUS'))

    def parse_term(self):
        """term = factor { (*|/) factor }"""
        return self._parse_left_assoc(self.parse_factor, ('MUL', 'DIV'))

    def parse_factor(self):
        """factor = INTEGER | LPAREN expression RPAREN"""
        token = self.current_token
        if token.type == 'LPAREN':
            self.eat('LPAREN')
            inner = self.parse_expression()
            self.eat('RPAREN')
            return inner
        if token.type != 'INTEGER':
            raise SyntaxError(f"Unexpected token: {token.type}")
        self.eat('INTEGER')
        return Num(token.value)

# Evaluator
def evaluate(node):
    """Recursively compute the value of an AST.

    A Num yields its stored value. A BinOp evaluates its left and then its
    right child and combines them according to `op`; 'DIV' is true division,
    so it returns a float. Any other node, or a BinOp with an unrecognised
    `op`, raises ValueError.
    """
    if isinstance(node, Num):
        return node.value
    if not isinstance(node, BinOp):
        raise ValueError("Unknown node type")

    lhs = evaluate(node.left)
    rhs = evaluate(node.right)
    operator = node.op
    if operator == 'PLUS':
        return lhs + rhs
    if operator == 'MINUS':
        return lhs - rhs
    if operator == 'MUL':
        return lhs * rhs
    if operator == 'DIV':
        return lhs / rhs  # or integer division // if desired
    raise ValueError("Unknown node type")

# Interpreter function
def interpret(text):
    """Tokenize `text`, parse the tokens into an AST, and return the evaluated result."""
    return evaluate(Parser(tokenize(text)).parse())

# Examples
if __name__ == '__main__':
    # One result per line, in the order the examples are listed.
    print("\n".join(
        str(interpret(expression))
        for expression in (
            "3 + 4 * (2 - 5) / 5",  # 3 + (4*(2-5)/5) = 3 + (4*(-3)/5) = 3 - 12/5 = 0.6 (shown as 0.6000000000000001: float division)
            "(10 + 2) * 7",         # (10+2) * 7 = 84
        )
    ))


0.6000000000000001
84
