In [1]:
#default_exp parser

# Parser_evaluator
> Blah blah

In this notebook we'll develop a formula parser for Excel formulas. The goal is to take a string as input and produce an AST. The goal is to produce a parser that can handle cell references, functions, and the basic operators.

Here's the top-level Excel spec:

`formula=expression  ;
expression="(",  expression,  ")"  | constant  | prefix-operator,  expression  | expression,  infix-operator,  expression  | expression,  postfix-operator  | cell-reference  |function-call  | name  ;`

In [2]:
#export
class ParseError(ValueError): pass

In [3]:
#export
def parse(s):
    '''Parse s into an AST.'''
    assert isinstance(s, str), f'Argument {s} to parse is not a string'
    # formula such as a reference
    if s.startswith('='): return parse_formula(s[1:].strip())
    else: return parse_value(s)
    
def parse_value(s):
    # text
    # Todo(Rik): this doesn't work for formulas with tuples in them. Should be not greedy!
    if len(s) >= 2 and s.startswith('"') and s.endswith('"') and not '"' in s[1:-1]: return s[1:-1]
    # Single char
    elif not s in '0123456789' and len(s) == 1: return s
    # Bools
    elif s.lower() == 'true': return True
    elif s.lower() == 'false': return False
    # Integers
    try: return int(s)
    except ValueError: pass
    # Floats
    try: return float(s)
    except ValueError: pass
    raise ParseError(f'Unable to parse value {s}')

In [4]:
#export
import re
from py_proto import colname_to_num, num_to_colname
from collections import namedtuple

re_ref = re.compile('^(?P<cfix>\$)?(?P<col>[A-Z]+)(?P<rfix>\$)?(?P<row>[1-9][0-9]*)$')

# Todo(Rik): Maybe worried about negative rows/cols? Input vaildation?
class Ref:
    def __init__(self, row, column, fixed_row=False, fixed_column=False):
        self.row, self.column = row, column
        self.fixed_row, self.fixed_column = fixed_row, fixed_column
        
    def __eq__(self, other):
        return self.__dict__ == other.__dict__
    
    def __hash__(self):
        return self.to_string().__hash__()
    
    def __repr__(self):
        return f'Ref(row={self.row}, column={self.column}, fixed_row={self.fixed_row}, fixed_column={self.fixed_column})'

    @classmethod
    def from_string(cls, s):
        m = re_ref.match(s)
        if not m: raise ParseError(f'{s} is not a reference')
        row, fixed_row = int(m['row'])-1, bool(m['rfix'])
        column, fixed_column = colname_to_num(m['col']), bool(m['cfix'])
        assert row >= 0
        return cls(row, column, fixed_row, fixed_column)
    
    def to_string(self):
        col = f'{"$" if self.fixed_column else ""}{num_to_colname(self.column)}'
        row = f'{"$" if self.fixed_column else ""}{self.row+1}'
        return col+row

InfixOp = namedtuple('InfixOp', ['op', 'left', 'right'])
PrefixOp = namedtuple('PrefixOp', ['op', 'arg'])
PostfixOp = namedtuple('PostfixOp', ['op', 'arg'])
# We write operators in precedence order
operators = {
    # Todo(Rik): support for intersection op (which idiot ever thought that should be a space)
    ':': InfixOp, # ' ': InfixOp
    '-': PrefixOp, '%': PostfixOp, r'\^': InfixOp,
    r'\*': InfixOp, '/': InfixOp, r'\+': InfixOp,
    '-': InfixOp, r'\&': InfixOp, '=': InfixOp,
    '<': InfixOp, '>': InfixOp, '<=': InfixOp,
    '>=': InfixOp, '<>': InfixOp, ',': InfixOp, 
}

Function = namedtuple('Function', ['name', 'args'])

def get_parenthesized_indices(s):
    '''Get a set of indices in s of characters which occur on or between parentheses'''
    paren_indices = set()
    n_spans_open = 0
    for i, c in enumerate(s):
        if c == '(': n_spans_open +=1 
        
        if n_spans_open > 0: paren_indices.add(i)

        if c == ')': n_spans_open -= 1
        if n_spans_open < 0: raise ParseError(f'Unmatched ) in {s}')
        
    if n_spans_open > 0: raise ParseError(f'Unmatched ( in {s}')
    return paren_indices

assert get_parenthesized_indices('(2+3)*(4*(7-3))') == set(range(0, 5)) | set(range(6, 15))
assert get_parenthesized_indices('fooo (3, 4) bar ((5, (7)))') == set(range(5, 11)) | set(range(16, 26))


def parse_formula(s):
    '''Turn a formula (after some = sign) into an AST'''
    # Base case, somebody put a constant there
    try: return parse(s)
    except ParseError: pass
    # Parse ref
    try: return Ref.from_string(s)
    except ParseError: pass
        
    # Matching operators: we do this in __reverse__ precendence order. The intuition for this is that
    # during calculation, we roll up the parse tree from the bottom, since that's where the leaves with
    # values are. Since the strongest binding operations hould be executed first, it follows that we
    # want to push those operators down into the 
    # Operators are in precedence order, so start by identifying the *last* thing that should match.
    # We try to parse the sections identified by the parts indicated by the formula. If that fails, clearly
    # we must have misinterpreted the operation (like for -3.0, we'll first try to parse it as
    # InfixOp('-', '', '3.0')) but '' doesn't parse (text values we want to have quotes).
    indices_to_skip = get_parenthesized_indices(s)
    for op, typ in reversed(operators.items()):
        # Todo(Rik): special-casing for space operator. It's only valid between two references,
        # should just be stripped otherwise. Bit of a hassle.
        # Todo(Rik): did not think through tuples well enough: this is a case where a higher-priority
        # operator can follow a lower priority one, e.g. in '=IF(3<4, 10, 11)'. Maybe this is a precendence
        # error, and tuple should be somewhere else in the hierarchy?
        # Todo(Rik): similary, tuples might get empty arguments [(1,,1) should evaluate to (1, None, 1)]
        # but this is obviously nonsensical for the others.
        op_whitespace = r'\s*'+op+r'\s*'
        if re.search(op_whitespace, s) and typ == InfixOp:
            for m in re.finditer(op_whitespace, s):  # Sure hope nothing's left associative
                if m.start() in indices_to_skip: continue
                left, right = s[:m.start()], s[m.end():]
                try: return typ(op, parse_formula(left), parse_formula(right))
                except ParseError: pass
        elif re.search(op_whitespace, s) and typ == PostfixOp:
            # If parsing was correct, should have been the last one
            if s.index(op) in indices_to_skip: continue
            if s.index(op) != len(s)-1: raise ParseError(f'PostfixOp {op} not in last position in {s}')
            try: return typ(op, parse_formula(s[:-1]))
            except ParseError: pass
        elif re.search(op_whitespace, s) and typ == PrefixOp:
            # Everything following op should be parseable as one expression
            if s.index(op) in indices_to_skip: continue
            if s.index(op) != 0: raise ParseError(f'PrefixOp {op} not in first position in {s}')
            try: return typ(op, parse_formula(s[1:]))
            except ParseError: pass
          
    # There are no operators outside of parentheses to parse. that means that we must've arrived
    # at an enclosing expression. Either something like '(3+4)', or something like 'SUM(A1:A4)'.
    # We use `find` to distinguish between the cases:
    if s == '' or s[-1] != ')': raise ParseError(f'{s} does not appear to be a parseable formula.')
    i_open = s.find('(')
    if i_open == 0: return parse_formula(s[1:-1])
    elif i_open > 0:
        if s[i_open+1:-1] == '': return Function(name=s[:i_open], args=None)
        else: return Function(name=s[:i_open], args=parse_formula(s[i_open+1:-1]))
        
    raise ParseError(f'{s} does not appear to be a parseable formula.')

No js module found, not running main scripts.


In [5]:
import pytest

with pytest.raises(ParseError):
    Ref.from_string('foo')
with pytest.raises(ParseError):
    Ref.from_string('A0')
assert Ref.from_string('A4') == Ref(3, 0, fixed_row=False, fixed_column=False)
assert Ref.from_string('$A4') == Ref(3, 0, fixed_row=False, fixed_column=True)
assert Ref.from_string('A$4') == Ref(3, 0, fixed_row=True, fixed_column=False)
assert Ref.from_string('$A$4') == Ref(3, 0, fixed_row=True, fixed_column=True)
assert Ref(10, 10) == Ref(10, 10, fixed_row=False, fixed_column=False)
assert Ref.from_string('A4').to_string() == 'A4'

In [6]:
# Individual values
assert parse('-25') == -25
assert parse('10.3') == 10.3
assert parse('TRUE') == True
assert parse('FALSE') == False
assert parse('\"foo\"') == 'foo'
assert parse("1.3e-7") == 1.3e-7
assert parse("-1.3e6") == -1.3e6
assert parse('c') == 'c'  # We allow single characters
assert parse('1e7') == 1e7

In [7]:
# Easy formulas
assert parse('=3.0') == 3.0
assert parse('=B7') == Ref(6, 1, fixed_row=False, fixed_column=False)
assert parse('=$B$7') == Ref(6, 1, fixed_row=True, fixed_column=True)
assert parse('=3*4') == InfixOp(op=r'\*', left=3, right=4)
assert parse('=2+3*4') == InfixOp(op=r'\+', left=2, right=InfixOp(op=r'\*', left=3, right=4))
assert parse('=3^4') == InfixOp(op=r'\^', left=3, right=4)
with pytest.raises(ParseError):
    parse('=foo-')
    
# Don't forget spacing issues!
assert parse('= 3.0') == 3.0

In [8]:
assert parse('=(2+3)*4') == InfixOp(op=r'\*', left=InfixOp(op=r'\+', left=2, right=3), right=4)
assert parse('=2+3*4') == InfixOp(op=r'\+', left=2, right=InfixOp(op=r'\*', left=3, right=4))
assert parse('=SUM(A1:A4)') == Function(
    name='SUM',
    args=InfixOp(op=':', left=Ref(row=0, column=0), right=Ref(row=3, column=0))
)
assert parse('=PI()') == Function(name='PI', args=None)

# Evaluation
Evaluation is slightly difficult: we need to evaluate refs, but they might be circular. One way to go about this is to fix an evaluation order on the spreadsheet (let's say row 1 first, then row 2, etc.), and defer any cells that refer to not-yet-evaluated cells, making additional passes over those.
For now, let's skip over formula's with references in them, raising EvaluationError instead.

In [9]:
import operator as op

def parse_criterium(criterium):
    if not isinstance(criterium, str):
        return lambda val: val == criterium
    
    # Comparison op
    comparisons = {'<=': op.le, '<>': op.ne, '<': op.lt,
                   '>=': op.ge, '>': op.gt, '=': op.eq}
    for symbol, impl in comparisons.items():
        if criterium.startswith(symbol):
            try: rest = parse_value(criterium[len(symbol):])
            except ParseError: continue
            return lambda val: impl(val, rest)
        
    # String match
    regex = []
    escaping = False
    for i, c in enumerate(criterium):
        if c == '~' and not escaping:
            escaping = True
        elif c == '~' and escaping:
            escaping = False
            regex.append(c)
        elif c == '*' and not escaping:
            regex.append('.*')
        elif c == '*' and escaping:
            escaping = False
            regex.append('\*')
        elif c == '?' and not escaping:
            regex.append('.')
        elif c == '?' and escaping:
            regex.append('\?')
        else:
            regex.append(c)
    regex = ''.join(regex)

    if regex != criterium:
        return lambda val: re.fullmatch(regex, val) is not None
    
    # Need to parse the value
    to_compare = parse_value(criterium)  # Closure are lazily evaluated
    return lambda val: val == to_compare
    
assert parse_criterium('3')(3) == True
assert parse_criterium('3')(4) == False
assert parse_criterium('<=4')(4) == True
assert parse_criterium('<=4')(5) == False
assert parse_criterium('<>"foo"')('foo') == False
assert parse_criterium('*')('foobar') == True
assert parse_criterium('foo*bar')('foobar') == True
assert parse_criterium('foo*bar')('fooooobar') == True
assert parse_criterium('foo*bar')('fooar') == False
assert parse_criterium('foo?bar')('fooobar') == True
assert parse_criterium('foo?bar')('fooar') == False
assert parse_criterium('foo~*bar')('foo*bar') == True
assert parse_criterium('foo~*bar')('foooobar') == False
assert parse_criterium('foo~?bar')('foo?bar') == True
assert parse_criterium('foo~?bar')('foo~?bar') == False

In [10]:
def evaluate(tree, context=None):  # Todo(Rik): str might evaluate to ref.
    if isinstance(tree, (int, float, bool, str)):
        return tree
    elif isinstance(tree, InfixOp) and tree.op == ':':
        start, end = tree.left, tree.right
        assert isinstance(start, Ref) and isinstance(end, Ref)
        # Responsiblity of the caller to handle IndexError
        return Range(
            values=[[context[i][j] for j in range(start.column, end.column+1)]
                     for i in range(start.row, end.row+1)],
            start=start,
            end=end
        )
    elif isinstance(tree, InfixOp):
        return infix_eval_map[tree.op](evaluate(tree.left, context), evaluate(tree.right, context))
    elif isinstance(tree, PrefixOp):
        return prefix_eval_map[tree.op](evaluate(tree.arg, context))
    elif isinstance(tree, PostfixOp):
        return postfix_eval_map[tree.op](evaluate(tree.arg, context))
    elif isinstance(tree, Function):
#         if tree.args is None: return function_eval_map[tree.name]()
        args = evaluate(tree.args, context)
        if not isinstance(args, tuple): # Gymnastics to handle one-argument functions
            args = (args,)
        return function_eval_map[tree.name](args)
    elif isinstance(tree, Ref):  # Responsiblity of the caller to handle IndexError
        return context[tree.row][tree.column]

In [11]:
operators = {
    ':': InfixOp, ' ': InfixOp,
    '-': PrefixOp, '%': PostfixOp, r'\^': InfixOp,
    r'\*': InfixOp, '/': InfixOp, r'\+': InfixOp,
    '-': InfixOp, r'\&': InfixOp, '=': InfixOp,
    '<': InfixOp, '>': InfixOp, '<=': InfixOp,
    '>=': InfixOp, '<>': InfixOp, ',': InfixOp,
}

class Range:  # We don't use a named tuple here because isinstance(namedtuple, tuple) == True.
    def __init__(self, values=[], start=None, end=None):
        if start.column > end.column or start.row > end.row: raise ValueError
        self.values = values
        self.start = start
        self.end = end
    
    def __repr__(self):
        return f'Range(values={self.values}, start={self.start}, end={self.end})'


infix_eval_map = {
    ':': None,
    ',': lambda left, right: (left, right),  # Tuple constructor
    ' ': None,  # Range intersection
    r'\^': lambda left, right: left ** right,
    r'\*': lambda left, right: left * right,
    '/': lambda left, right: left / right,
    r'\+': lambda left, right: left + right,
    '-': lambda left, right: left - right,
    r'\&': None,  # Todo(Rik) look this up
    '=': lambda left, right: left == right,
    '<': lambda left, right: left < right,
    '>': lambda left, right: left > right,
    '<=': lambda left, right: left <= right,
    '>=': lambda left, right: left >= right,
    '<>': lambda left, right: left != right,
}

prefix_eval_map = {
    '-': lambda arg: -arg,
}

postfix_eval_map = {
    '%': lambda arg: arg/100,
}

def flatten_ranges(args):
    '''Takes a tuple args and expands all the ranges to fit inline.'''
    for arg in args:
        if isinstance(arg, Range):
            yield from (v for row in arg.values for row in col)
        elif isinstance(arg, tuple):  # Todo(Rik): do I want to flatten ranges in this case?
            yield from flatten_ranges(arg)
        else:
            yield arg
            
def flatten_nested_tuples(args, flatten_ranges=False):
    '''
    Takes some nested tuples in args and flattens them.
    >>> flatten_nested_tuples((3, 4, (5, (6, 7))))
    (3, 4, 5, 6, 7)
    '''
    for arg in args:
        if isinstance(arg, tuple):
            yield from flatten_nested_tuples(arg, flatten_ranges)
        elif isinstance(arg, Range) and flatten_ranges:
            yield from (v for row in arg.values for v in row)
        else:
            yield arg

def splat(func):
    '''
    Decorate a function to be called with a tuple, like lambda args: f(*args).
    This allows us to define a function on tuples, while still benefitting from
    the arity-checking of `f`.
    This is more readable when `f` is not a function from the stdlib. For instance,
    `f(x, y) = math.atan(y/x)`. Defining `g = splat(lambda x, y: math.atan(y/x))`
    is more readable than `g = lambda args: (lambda x, y: math.atan(y/x))(*args)`.
    '''
    def wrapped(args):
        return func(*args)
    return wrapped
    
def flat_ranges(func):
    '''
    Decorate a function so ranges in the argument are flattened.
    '''
    def wrapped(args):
        return func(tuple(flatten_ranges(args)))
    return wrapped

def flat_tuples(func, flatten_ranges=False):
    '''
    Decorate a function so tuples get flattened.
    '''
    def wrapped(args):
        return func(tuple(flatten_nested_tuples(args, flatten_ranges)))
    return wrapped

def constant(value):
    def wrapped(args):
        if args is not None: raise TypeError
        return value
    return wrapped
            
import math
from numbers import Number
from functools import reduce
            
def excel_averageif(index_range, criterium, average_range=None):
    '''
    While excel doesn't require index_range and average_range to have the same shape, we do!
    Todo(Rik): maybe there needs to be some AST-rewriting pass?
    '''
    criterium = parse_criterium(criterium)
    if average_range is None:
        to_count = [i for i_row in index_range.values for i in i_row if criterium(i)]
    else:
        to_count = [a for i_row, a_row in zip(index_range.values, average_range.values)
                      for i, a in zip(i_row, a_row) if criterium(i)]
    return sum(to_count) / len(to_count)

def excel_choose(index, *values):
    if 1 <= index <= len(values):
        return values[index-1]
    else:
        raise ValueError

def count_if(should_count):
    def counter(args):
        c = 0
        for arg in args:
            if isinstance(arg, Range):
                c += sum(should_count(v) for row in arg.values for v in row)
            else:
                c += should_count(arg)
        return c
    return counter
        
def excel_countif(r, criterium):
    criterium = parse_criterium(criterium)
    return count_if(criterium)([r])

def excel_find(s, t, start_pos=0):
    return t.index(s, start_pos)+1

def excel_fv(interest, n_periods, payment=None, present_value=None, timing=0):
    assert timing in (0, 1)
    if payment is None:
        assert present_value is not None, f'Payment was not provided, so present_value must be!'
        payment = 0
    elif present_value is None:
        present_value = 0 

    appreciation = present_value * (1+interest)**n_periods
    payments = payment * (1+interest*timing)*((1+interest)**n_periods-1)/interest
    return -(appreciation + payments)
    
def excel_hlookup(lookup_value, table_range, i, is_approximate=True):
    if i < 1: raise ValueError('row index < 1 passed to hlookup')
    try: j = table_range.values[0].index(lookup_value)
    except ValueError:
        if not is_approximate: return None
        try: j = [j for j, s in enumerate(table_range.values[0]) if not s.startswith(lookup_value)][-1]
        except IndexError: return None
    return table_range.values[i-1][j]  # Excel is 1-indexed

def excel_if(cond, yes, no=False):
    return yes if cond else no

# We do our own argument parsing because the first argument might be a tuple
def excel_index(args):
    refs_or_array = args[0] if isinstance(args[0], tuple) else (args[0],)
    remaining = flat_tuples(lambda x: x)(args[1])
    i, j, a = None, None, None
    i = None if len(remaining) <= 0 else remaining[0]-1
    j = None if len(remaining) <= 1 else remaining[1]-1
    a = 0 if len(remaining) <= 2 else remaining[2]-1
    if len(remaining) > 3: raise TypeError('Too many arguments to INDEX')
    if i is None and j is None:
        raise TypeError('One of row or column number must be present.')
    elif i is not None and j is None:
        if i < 0: raise TypeError('row number < 1 in INDEX not allowed')
        return refs_or_array[a].values[i]
    elif i is None and j is not None:
        if j < 0: raise TypeError('column number < 1 in INDEX not allowed')
        return [row[j] for row in refs_or_array[a].values]
    else:
        if i < 0 or j < 0: raise TypeError('column or row number < 1 in INDEX not allowed')
        return refs_or_array[a].values[i][j]

from scipy.optimize import newton
    
def excel_irr(args):
    if len(args) == 0 or len(args) > 2: raise TypeError('too little or too many arguments to IRR')
    values = args[0]  # Todo(Rik): array argsss
    guess = args[1] if len(args) > 1 else 0.1
    return newton(lambda r: sum(v / (1+r)**i for i, v in enumerate(values)), guess)
    
def p(f):
    '''Print args and kwargs'''
    def wrapped(*a, **kw):
        print(a, kw)
        return f(*a, **kw)
    return wrapped
        
from datetime import date, time
# Functions get their arguments passed to them as a tuple.
# The responsibility for turning this into proper arguments lies with the implementor.
function_eval_map = {
    'SUM': flat_tuples(sum, flatten_ranges=True),
    'ABS': splat(abs),
    'ACOS': splat(math.acos),
    'AND': lambda args: reduce(op.and_, args, True),
    'ASIN': splat(math.asin), # lambda args: math.asin(*args),
    'ATAN': splat(math.atan), # lambda args: math.atan(*args),
    'ATAN2': splat(lambda x, y: math.atan(y/x)),
    'AVERAGE': flat_tuples(lambda x: sum(x)/len(x), flatten_ranges=True),
    'AVERAGEIF': flat_tuples(splat(excel_averageif)),
    'CHOOSE': flat_tuples(splat(excel_choose)),
    'COLUMNS': splat(lambda r: r.end.column - r.start.column + 1),
    'COS': splat(math.cos),
    'COUNT': flat_tuples(count_if(lambda v: isinstance(v, Number) and not isinstance(v, bool))),
    'COUNTA': flat_tuples(count_if(lambda v: v is not None and not isinstance(v, str))),
    'COUNTBLANK': flat_tuples(count_if(lambda v: v is None)),
    'COUNTIF': flat_tuples(splat(lambda r, c: count_if(parse_criterium(c))([r]))),
    'DATE': flat_tuples(splat(lambda year, month, day: date(year, month, day))),
    'DAY': splat(lambda d: d.day),
    'EVEN': splat(lambda n: 2*math.floor(n/2) if n < 0 else 2*math.ceil(n/2)),
    'EXACT': flat_tuples(splat(lambda s, t: s == t)),
    'EXP': splat(math.exp),
    'FACT': splat(lambda x: math.factorial(int(x))),
    'FALSE': splat(constant(False)),
    'FIND': flat_tuples(splat(excel_find)),
    'FV': flat_tuples(splat(excel_fv)),
    'HLOOKUP': flat_tuples(splat(excel_hlookup)),
    #'HOUR': ?????
    'IF': flat_tuples(splat(excel_if)),
    'INDEX': excel_index,
    'INT': flat_tuples(splat(lambda x: math.floor(x))),
    'IRR': flat_tuples(splat(excel_irr))
}

In [12]:
def C(s, context=None):
    return evaluate(parse(s), context=context)

In [13]:
assert C('=INT(8.9)') == 8
assert C('=INT(-8.9)') == -9

In [14]:
rows = [
    ['Fruit', 'Price', 'Count'],
    ['Apples', 0.69, 40],
    ['Bananas', 0.34, 38],
    ['Lemons', 0.55, 15],
    ['Oranges', 0.25, 25],
    ['Pears', 0.59, 40],
    ['Almonds', 2.8, 10],
]

assert C('=INDEX(A2:C7, 2, 3)', rows) == 38
assert C('=INDEX((A2:C4,A6:C7),2,2,2)', rows) == 2.8
assert C('=INDEX((A2:C4,A6:C7),2,2,1)', rows) == 0.34

In [15]:
assert C('=IF(10>5,"Yes","No")') == 'Yes'
assert  C('=IF(10>5,"Yes")') == 'Yes'
# Empty args not yet supported
# assert C('=IF(10>5,"Yes",)') == 'Yes'
assert C('=IF(10<5,"Yes")') == False
# assert C('=IF(10<5,"Yes",)') == 0
# assert C('=IF(10>5,,"No")') == 0
# assert C('=IF(10>5,,)') == 0 
assert C('=IF(10>5,"Yes",20)') == 'Yes'
assert C('=IF(10<5,"Yes",20)') == 20

In [16]:
rows = [
    ['Axles', 'Bearings', 'Bolts'],
    [4, 6, 9],
    [5, 7, 10],
    [6, 8, 11],
]

assert C('=HLOOKUP("Axles",A1:C4,2,TRUE)', context=rows) == 4
assert C('=HLOOKUP("Bearings",A1:C4,3,FALSE)', context=rows) == 7
assert C('=HLOOKUP("B",A1:C4,3,TRUE)', context=rows) == 5
assert C('=HLOOKUP("Bolts",A1:C4,4)', context=rows) == 11
# Array expressions not yet supported
# assert C('=HLOOKUP(3,{1,2,3;"a","b","c";"d","e","f"},2,TRUE)', context=rows) == 'c'

In [17]:
assert C('=FV(0.06/12,10,-200,-500,1)') == pytest.approx(2581.40, abs=1e-2)
assert C('=FV(0.12/12,12,-1000)') == pytest.approx(12682.50, abs=1e-2)
# This test doesn't work due to empty entry in comma-separated list
# assert C('=FV(0.11/12,35,-2000,,1)') == pytest.approx(82846.25, abs=1e-2)
assert C('=FV(0.06/12,12,-100,-1000,1)') == pytest.approx(2301.40, abs=1e-2)

In [18]:
assert C('=FIND("de", "abcdef")') == 4

In [19]:
assert C('=FALSE()') == False

In [20]:
assert C('=FACT(5)') == 120
assert C('=FACT(3.5)') == 6
assert C('=FACT(0)') == 1

In [21]:
assert C('=EXP(0)') == 1
assert C('=EXP(-1)') == pytest.approx(0.367879441)
assert C('=EXP(1)') == pytest.approx(2.718281828)
assert C('=EXP(2)') == pytest.approx(7.389056099)

In [22]:
assert C('=EXACT("ABC", "ABC")') == True
assert C('=EXACT("ABC", "ABCD")') == False
assert C('=EXACT("Abc", "aBC")') == False
assert C('=EXACT("", "")') == True
with pytest.raises(TypeError):
    C('=EXACT("", "", "")')

In [23]:
assert C('=EVEN(1.5)') == 2
assert C('=EVEN(3)') == 4
assert C('=EVEN(2)') == 2
assert C('=EVEN(-1)') == -2

In [24]:
assert C('=DATE(2021, 7, 31)') == date(2021, 7, 31)
assert C('=DAY(DATE(2021, 7, 31))') == 31

In [25]:
rows = [[i+j for j in range(10)] for i in range(10)]

assert C('=COUNTIF(A1:J10, "=3")', rows) == 4
assert C('=COUNTIF(A1:J10, "<0")', rows) == 0

In [26]:
assert C('=COUNTBLANK(A1:D1)', context=[[1, None, 3, None]]) == 2

In [27]:
assert C('=COUNTA("foo", 3, 4, "bar")') == 2
assert C('=COUNTA(True, 3, 4, b)') == 3

In [28]:
rows = [[i+j for j in range(10)] for i in range(10)]

assert C('=COUNT("foo", 3, 4, "bar")') == 2
assert C('=COUNT(True, 3, 4, b)') == 2
assert C('=COUNT(A1:A4)', rows) == 4
assert C('=COUNT()') == 0

In [29]:
assert C('=COS(37)') == math.cos(37)
with pytest.raises(TypeError):
    C('=COS(37, 10)')

In [30]:
rows = [[i+j for j in range(10)] for i in range(10)]

assert C('=COLUMNS(A1:B4)', rows) == 2
assert C('=COLUMNS(A1:E7)', rows) == 5
assert C('=COLUMNS(A1:A2)', rows) == 1
with pytest.raises(ValueError):
    C('=COLUMNS(B7:A1)')

In [31]:
assert C('=CHOOSE(1, 3, 4, 5)') == 3
assert C('=CHOOSE(3, 3, 4, 5)') == 5
with pytest.raises(ValueError):
    C('=CHOOSE(4, 3, 4, 5)')

In [32]:
rows = [[i+j for j in range(10)] for i in range(10)]

assert C('=AVERAGEIF(A1:A4, 3)', rows) == 3
assert C('=AVERAGEIF(A1:A4, ">=2")', rows) == 2.5
assert C('=AVERAGEIF(A1:A4, ">=2", B1:B4)', rows) == 3.5

In [33]:
rows = [[i+j for j in range(10)] for i in range(10)]

assert C('=SUM(A1:A2)', [[1], [2]]) == 3
assert C('=SUM(1, 2, 3, 4, 5)') == 15
assert C('=SUM(A1:B1, 3)', [[1, 2]]) == 6
assert C('=SUM(A1:J10)', rows) == sum(sum(row) for row in rows)

In [34]:
assert C('=3*4') == 12
assert C('=(2+3)*4') == 20
assert C('=1e7 / 2') == 5_000_000
assert C('=SUM(3, 4)') == 7
assert C('=IF(3 < 4, "three is less than four", "huh?!")') == 'three is less than four'
assert evaluate(Ref(row=0, column=1), context=[[0, 1]]) == 1
assert C('=A1 + A$2', context=[[1, 2], [3, 4]]) == 4

# Translation
Evaluation is useful, but in the end the core seems to be translation.

Goal of this part given a table-like grid of cells, write an equivalent Python program that can be run on the input data (cells without any dependencies) to generate the output (cells in the last column or cells without any dependencies).

In [35]:
infix_translate_map = {
    '\\+': lambda x, y: f'{x} + {y}',
    '\\*': lambda x, y: f'{x} * {y}'

}

prefix_translate_map = {}
postfix_translate_map = {}

# We save the variables in a tuple because it's convenient to use ordering
# to compare formulas. If they have the same AST structure we can just check
# them for consistency one by one.
def translate(tree, variables=tuple()):  # Todo(Rik): str might translate to ref.
    if isinstance(tree, (int, float, bool, str)):
        return tree, variables
    elif isinstance(tree, InfixOp) and tree.op == ':':
        # Todo(Rik): handle ranges in formulas
        raise NotImplemented
    elif isinstance(tree, InfixOp):
        t_left, var_left = translate(tree.left, variables)
        t_right, var_right = translate(tree.right, variables)
        return infix_translate_map[tree.op](t_left, t_right), var_left + var_right
    elif isinstance(tree, PrefixOp):
        t_arg, var_arg = translate(tree.arg, variables)
        return prefix_translate_map[tree.op](t_arg), var_arg
    elif isinstance(tree, PostfixOp):
        t_arg, var_arg = translate(tree.arg, variables)
        return postfix_translate_map[tree.op](t_arg), var_arg
    elif isinstance(tree, Function):
#         if tree.args is None: return function_translate_map[tree.name]()
        t_args, var_args = translate(tree.args, variables)
        if not isinstance(args, tuple): # Gymnastics to handle one-argument functions
            t_args = (t_args,)
        return function_translate_map[tree.name](t_args), var_args
    elif isinstance(tree, Ref):  # Create and return variable name
        return tree.to_string(), variables + (tree,)

That covers translating a single cell. Next example is detecting when output cells are similar. In the below example, it should recognize that the first and second rows are the same function.

In [36]:
def equivalent(tree, other):
    if type(tree) != type(other): return False
    if isinstance(tree, (int, float, bool, str)):  # Todo(Rik): maybe str parses to Ref?
        return tree == other
    elif isinstance(tree, InfixOp) and tree.op == ':':
        # Todo(Rik): handle ranges in formulas
        raise NotImplemented
    elif isinstance(tree, InfixOp):
        return (tree.op == other.op
                and equivalent(tree.left, other.left)
                and equivalent(tree.right, other.right))
    elif isinstance(tree, PrefixOp) or isinstance(tree, PostfixOp):
        return (tree.op == other.op
                and equivalent(tree.arg, other.arg))
    elif isinstance(tree, Function):  # Todo(Rik): consider turning ops into function calls?
        return (tree.name == other.name
               and equivalent(tree.args, other.args))
    elif isinstance(tree, Ref):  # Create and return variable name
        return True

You can still bamboozle this algorithm by putting `=A1+B1` and then `=C14+D37`. However, since the ordering of variables in translate is deterministic (and we used a tuple!), we can then check for equivalence of variables.

In [37]:
assert equivalent(parse('=A1+B1'), parse('=A2+B2'))

In [38]:
def consistent(these_vars, other_vars):
    """Checks if the tuples of variables are "transposed" across rows, returning the delta if so."""
    # Todo(Rik): deal with fixed rows/columns
    # Yeahhh wouldn't I like an Option<i32> here...
    if len(these_vars) != len(other_vars): return False
    if any(v.column != w.column for v, w in zip(these_vars, other_vars)): return False
    if len(set(v.row - w.row for v, w in zip(these_vars, other_vars))) != 1: return False
    return these_vars[0].row - other_vars[0].row

In [39]:
tree, these_vars = translate(parse('=A1+B1'))
other, other_vars = translate(parse('=A2+B2'))
assert consistent(these_vars, other_vars) == -1

tree, these_vars = translate(parse('=A1+B1'))
other, other_vars = translate(parse('=A5+B5'))
assert consistent(these_vars, other_vars) == -4

tree, these_vars = translate(parse('=A3+B3'))
other, other_vars = translate(parse('=A1+B1'))
assert consistent(these_vars, other_vars) == 2

tree, these_vars = translate(parse('=A1+B1'))
other, other_vars = translate(parse('=A2+C3'))
assert not consistent(these_vars, other_vars)

tree, these_vars = translate(parse('=A1*B1'))
other, other_vars = translate(parse('=A2+B2'))
assert consistent(these_vars, other_vars)

In [40]:
def is_copied_across(column):
    head, *column = column
    head_ast = parse(head)
    head_code, head_vars = translate(head_ast)
    for i, item in enumerate(column):
        item_ast = parse(item)
        if not equivalent(head_ast, item_ast): return False
        _, item_vars = translate(item_ast)
        if not consistent(item_vars, head_vars) == i+1: return False
    return True

In [41]:
assert is_copied_across(['=A1+B1', '=A2+B2'])
assert not is_copied_across(['=A1+B1', '=A3+B3'])
assert not is_copied_across(['=A1+B1', '=A2*B2'])
assert is_copied_across(['=A1+B1', '=A2+B2', '=A3+B3'])
assert not is_copied_across(['=A1+B1', '=A2+C3'])

In [42]:
def write_function(name, code, variables):
    variables = ', '.join(v.to_string() for v in variables)
    return '\n'.join([
        f'def {name}({variables}):',
        f'  return {code}'
    ])

row = [3, 4, '=A1+B1']
code, these_vars = translate(parse(row[2]))
print(write_function('adder', code, these_vars))

def adder(A1, B1):
  return A1 + B1


In [47]:
def column_to_code(colname, sheet):
    colnum = colname_to_num(colname)

    function_name = f'calculate_{colname}'

    column_asts = [parse(item) for item in [row[colnum] for row in sheet]]
    column_codes, column_vars = zip(*[translate(ast) for ast in column_asts])

    head_ast, *tail_asts = column_asts
    head_code, *tail_codes = column_codes
    head_vars, *tail_vars = column_vars

    for i, (other_ast, other_vars) in enumerate(zip(tail_asts, tail_vars)):
        if not equivalent(head_ast, other_ast) or not consistent(other_vars, head_vars) == (i+1):
            raise NotImplementedError('Inconsistent column formula in row {i}')

    input_data = [[sheet[r.row][r.column] for r in variables] for variables in column_vars]
    variable_text = ', '.join(var.to_string() for var in head_vars)

    function_body = write_function(function_name, head_code, head_vars)
    exec(function_body)
    function = eval(function_name)
    function_result = [function(*row) for row in input_data]
    result = '\n'.join([
        function_body,
        '',
        f'{colname} = [{function_name}({variable_text}) for {variable_text} in {input_data}]',
        f'\nResult: {function_result}'
    ])
    return result

sheet = [
    [3, 4, 7, '=A1+B1*C1'],
    [4, 5, 8, '=A2+B2*C2']
]
print(column_to_code('D', sheet))

def calculate_D(A1, B1, C1):
  return A1 + B1 * C1

D = [calculate_D(A1, B1, C1) for A1, B1, C1 in [[3, 4, 7], [4, 5, 8]]]

Result: [31, 44]


Next, chaining functions. One cell = one line of code, I suppose. :)

In [91]:
from collections import deque

def is_value(cell):
    return not isinstance(cell, str) or not cell.startswith('=')


def cell_to_code(ref, sheet):
    code, variables = translate(parse(sheet[ref.row][ref.column]))
    code = deque([code])
    to_map = [r for r in variables if not is_value(sheet[r.row][r.column])]
    
    # Todo(Rik): circular reference protection. Requires maintaining a tree,
    # rather than just a flat list. I.e. track the chain of things that
    # led us to calculating this cell and see if it includes something
    # this cell refers to.
    done = {ref}
    
    variables = deque([r for r in variables if is_value(sheet[r.row][r.column])])
    while to_map:
        this_ref = to_map.pop()
        this_varname = this_ref.to_string()
        
        if this_ref in done or this_ref in variables: continue

        this_code, these_vars = translate(parse(sheet[this_ref.row][this_ref.column]))

        variables.extend([r for r in these_vars if is_value(sheet[r.row][r.column]) and r not in variables])
        to_map.extend([r for r in these_vars if not is_value(sheet[r.row][r.column]) and r not in to_map])

        code.appendleft(f'{this_varname} = {this_code}')
        done.add(this_ref)
    return code, variables


sheet = [
    ['3', '4', '=A1+D2', '0'],
    ['4', '5', '=A2+D3', '=A1+D1'],
]

code, variables = cell_to_code(Ref(row=0, column=2), sheet)
code, variables

(deque(['D2 = A1 + D1', 'A1 + D2']),
 deque([Ref(row=0, column=0, fixed_row=False, fixed_column=False),
        Ref(row=0, column=3, fixed_row=False, fixed_column=False)]))

In [92]:
def write_multiline_function(name, code, variables):
    variables = ', '.join(v.to_string() for v in variables)
    *lines, last = code
    return '\n'.join([
        f'def {name}({variables}):'
    ] + [f'    {line}' for line in lines] + [
        f'    return {last}'
    ])

print(write_multiline_function('foo', code, variables))

def foo(A1, D1):
    D2 = A1 + D1
    return A1 + D2


# Experiment with generating an AST rather than Python code directly

In [240]:
import ast

from collections import deque

infix_translate_map = {
    '\\+': ast.Add(),
    '\\*': ast.Mult(),
    '-': ast.Sub(),
}

prefix_translate_map = {}
postfix_translate_map = {}

# We save the variables in a tuple because it's convenient to use ordering
# to compare formulas. If they have the same AST structure we can just check
# them for consistency one by one.
def translate(tree, variables=tuple()):  # Todo(Rik): str might translate to ref.
    if isinstance(tree, (int, float, bool, str)):
        return tree, variables
    elif isinstance(tree, InfixOp) and tree.op == ':':
        # Todo(Rik): handle ranges in formulas
        raise NotImplemented
    elif isinstance(tree, InfixOp):
        t_left, var_left = translate(tree.left, variables)
        t_right, var_right = translate(tree.right, variables)
        return ast.BinOp(left=t_left, op=infix_translate_map[tree.op], right=t_right), var_left + var_right
    elif isinstance(tree, PrefixOp):
        t_arg, var_arg = translate(tree.arg, variables)
        return prefix_translate_map[tree.op](t_arg), var_arg
    elif isinstance(tree, PostfixOp):
        t_arg, var_arg = translate(tree.arg, variables)
        return postfix_translate_map[tree.op](t_arg), var_arg
    elif isinstance(tree, Function):
#         if tree.args is None: return function_translate_map[tree.name]()
        t_args, var_args = translate(tree.args, variables)
        if not isinstance(args, tuple): # Gymnastics to handle one-argument functions
            t_args = (t_args,)
        return function_translate_map[tree.name](t_args), var_args
    elif isinstance(tree, Ref):  # Create and return variable name
        return ast.Name(tree.to_string(), ctx=ast.Load()), variables + (tree,)

def is_value(cell):
    return not isinstance(cell, str) or not cell.startswith('=')


def cell_to_code(ref, sheet):
    code, variables = translate(parse(sheet[ref.row][ref.column]))
    code = deque([ast.Return(code)])
    to_map = [r for r in variables if not is_value(sheet[r.row][r.column])]
    
    # Todo(Rik): circular reference protection. Requires maintaining a tree,
    # rather than just a flat list. I.e. track the chain of things that
    # led us to calculating this cell and see if it includes something
    # this cell refers to.
    done = {ref}
    
    variables = deque([r for r in variables if is_value(sheet[r.row][r.column]) or r.row != ref.row])
    while to_map:
        this_ref = to_map.pop()
        this_varname = this_ref.to_string()
        
        if this_ref in done or this_ref in variables: continue

        this_code, these_vars = translate(parse(sheet[this_ref.row][this_ref.column]))

        variables.extend([r for r in these_vars if is_value(sheet[r.row][r.column]) and r not in variables])
        to_map.extend([r for r in these_vars if not is_value(sheet[r.row][r.column]) and r not in to_map])

        code.appendleft(ast.Assign([ast.Name(this_varname, ast.Store())], this_code))
        done.add(this_ref)
    return code, variables

sheet = [
    ['3', '4', '=A1+D2', '0'],
    ['4', '5', '=A2+D3', '=A1+D1'],
]

code, variables = cell_to_code(Ref(row=0, column=2), sheet)
code, variables

def to_function(code, variables, name):
    ast_args = ast.arguments(
        posonlyargs=[],
        args=[ast.arg(v.to_string()) for v in variables],
        kwonlyargs=[],
        kw_defaults=[],
        defaults=[],
    )
    ast_function = ast.FunctionDef(
        name=name,
        args=ast_args,
        body=list(code),
        decorator_list=[],
    )
    
    return ast.fix_missing_locations(ast_function)

code_obj = compile(ast.Module([to_function(code, variables, 'foo')], type_ignores=[]), '<ebb>', mode='exec')
exec(code_obj)
print(ast.unparse(top_level))

def foo(A1, D2):
    return A1 + D2


That works pretty alright, though no circular reference protection yet. It can handle arbitrary references as long as they terminate. However, in the case that we have twelve-hundred rows of the same shape, we don't want to unpack them all.

In [50]:
sheet = [
    ['3', '4', '=A1+B1'],
    ['4', '5', '=C1+A2+B2'],
]

code, variables = cell_to_code(Ref(row=1, column=2), sheet)
print(write_multiline_function('foo', code, variables))

def foo(A2, B2, A1, B1):
    C1 = A1 + B1
    return C1 + A2 + B2


In [51]:
sheet = [
    ['3', '4', '=A1+B1', '0'],
    ['4', '5', '=A2+B2', '=D1+C2'],
    ['7', '8', '=A3+B3', '=D2+C3'],
    ['9', '0', '=A4+B4', '=D3+C4'],
]

# Should turn into something like this:
def calculate_D(A2, B2, previous_D):
    C2 = A2 + B2
    return previous_D + C2

# Essentially this just means stopping it short?
# How to deal with counter-flow?
sheet = [
    ['3', '4', '=A1+D2', '0'],
    ['4', '5', '=A2+D3', '=A1+D1'],
]

# This can be untangled as follows
code, variables, = cell_to_code(Ref(row=0, column=2), sheet)
print(write_multiline_function('foo', code, variables))

# Let's not deal with this for now. I'm sure it's intellectually stimulating but
# either it's a rat's nest, in which case don't bother, *or* there is only counterflow,
# which is just normal flow but upside down and it's trivial.

def foo(A1, D1):
    D2 = A1 + D1
    return A1 + D2


Now what is important here is not that you can just stop short, but how calculate values in code.

In [52]:
def calculate_initial_D(A1, B1):
    return A1 + B1

def calculate_D(A2, B2, previous_D):
    C2 = A2 + B2
    return previous_D + C2

head, *input_data = [[3, 4], [4, 5]]
result = [calculate_initial_D(*head)]
for A, B in input_data:
    previous_D = result[-1]
    result.append(calculate_D(A, B, previous_D))

result

[7, 16]

So what about looking further back, something like?

In [53]:
sheet = [
    [3, 4, 0, '=A1+B1', '=C1*D1'],
    [4, 5, 3, '=A2+B2', '=C2*D2'],
    [7, 8, 2, '=A3+B3', '=E1*C3*D3'],
    [7, 8, 2, '=A4+B4', '=E2*C4*D4'],
]
output = []

A, B, C = [3, 4, 0]
D = A + B
E = C * D
output.append({'D': D, 'E': E})

A, B, C = [4, 5, 3]
D = A + B
E = C * D
output.append({'D': D, 'E': E})

    
# Second phase
input_phase_1 = [[7, 8, 2], [7, 8, 2]]
for i, (A, B, C) in enumerate(input_phase_1):
    D = A + B
    E = output[i]['D']*C*D
    output.append({'D': D, 'E': E})

output

[{'D': 7, 'E': 0}, {'D': 9, 'E': 27}, {'D': 15, 'E': 210}, {'D': 15, 'E': 270}]

In this case, we would expect some kind of "start-up" rows, and then an eventually stable set of cell formulas. Especially gnarly when there are two regimes.

In [54]:
def find_transitions(column):
    transitions = set()
    previous_ast, *asts = [parse(cell) for cell in column]
    for i, ast in enumerate(asts):
        if not equivalent(ast, previous_ast):
            previous_ast = ast
            transitions.add(i+1)
    return transitions

column = ['=A1+B1', '=A2+B2', '=A3*B3', '=A4*B4']
find_transitions(column) 

{2}

That routine will find cells that are copied across. Now we can extract the loop body (as separate from initialization).

In [55]:
sheet = [
    [3, 4, 0, '=A1+B1', '=C1*D1'],
    [4, 5, 3, '=A2+B2', '=C2*D2'],
    [7, 8, 2, '=A3+B3', '=E1*C3*D3'],
    [7, 8, 2, '=A4+B4', '=E2*C4*D4'],
]

value_cols = [(i, col) for i, col in enumerate(zip(*sheet)) if all(is_value(cell) for cell in col)]
calc_cols = [(i, col) for i, col in enumerate(zip(*sheet)) if not all(is_value(cell) for cell in col)]
value_cols, calc_cols

([(0, (3, 4, 7, 7)), (1, (4, 5, 8, 8)), (2, (0, 3, 2, 2))],
 [(3, ('=A1+B1', '=A2+B2', '=A3+B3', '=A4+B4')),
  (4, ('=C1*D1', '=C2*D2', '=E1*C3*D3', '=E2*C4*D4'))])

In [56]:
import operator as op

def join_sets(sets):
    return reduce(op.ior, sets, set())

def join_dicts(dicts):
    return reduce(lambda d, e: {**d, **e}, dicts, {})

join_sets([{1, 2, 3}, {4, 5, 6}]), join_dicts([{3: 4}, {4: 5}])

({1, 2, 3, 4, 5, 6}, {3: 4, 4: 5})

There's an implicit ordering here, or "compatibleness". Some kind of intersection operation where column 3 has no transition, but column 4 has one at index 2, so therefore we see a transition at index 2.

In case we had something like `[[0, 1], [2, 3]]`, `[[0], [1, 2, 3]]`, we would have gotten transitions at 1 and 2. Overlapping transforms are not interesting.

In [57]:
sheet

[[3, 4, 0, '=A1+B1', '=C1*D1'],
 [4, 5, 3, '=A2+B2', '=C2*D2'],
 [7, 8, 2, '=A3+B3', '=E1*C3*D3'],
 [7, 8, 2, '=A4+B4', '=E2*C4*D4']]

In [268]:
def cell_to_row_code(ref, sheet):
    code, variables = translate(parse(sheet[ref.row][ref.column]))
    code = deque([code])
    to_map = [r for r in variables if not is_value(sheet[r.row][r.column])]
    
    # Todo(Rik): circular reference protection. Requires maintaining a tree,
    # rather than just a flat list. I.e. track the chain of things that
    # led us to calculating this cell and see if it includes something
    # this cell refers to.
    done = {ref}
    
    variables = deque([r for r in variables if is_value(sheet[r.row][r.column]) or r.row != ref.row])
    while to_map:
        this_ref = to_map.pop()
        this_varname = this_ref.to_string()
        
        if this_ref in done or this_ref in variables: continue

        this_code, these_vars = translate(parse(sheet[this_ref.row][this_ref.column]))

        variables.extend([r for r in these_vars if is_value(sheet[r.row][r.column]) and r not in variables])
        to_map.extend([r for r in these_vars if not is_value(sheet[r.row][r.column]) and r not in to_map])

        code.appendleft(f'{this_varname} = {this_code}')
        done.add(this_ref)
    return code, variables

colnum = 4
sheet = [
    [3, 4, 0, '=A1+B1', '=C1*D1'],
    [4, 5, 3, '=A2+B2', '=C2*D2'],
    [7, 8, 2, '=A3+B3', '=E1*C3*D3'],
    [7, 8, 2, '=A4+B4', '=E2*C4*D4'],
]

value_cols = [(i, col) for i, col in enumerate(zip(*sheet)) if all(is_value(cell) for cell in col)]
calc_cols = [(i, col) for i, col in enumerate(zip(*sheet)) if not all(is_value(cell) for cell in col)]
transitions = list(join_sets([find_transitions(col) for _, col in calc_cols]))
code_and_variables = [cell_to_row_code(Ref(row=start, column=colnum), sheet) for start in [0]+transitions]
code_and_variables

[(deque(['D1 = <ast.BinOp object at 0x7fb76cb28e80>',
         <ast.BinOp at 0x7fb76cb28d60>]),
  deque([Ref(row=0, column=2, fixed_row=False, fixed_column=False),
         Ref(row=0, column=0, fixed_row=False, fixed_column=False),
         Ref(row=0, column=1, fixed_row=False, fixed_column=False)])),
 (deque(['D3 = <ast.BinOp object at 0x7fb76cb28850>',
         <ast.BinOp at 0x7fb76cb288e0>]),
  deque([Ref(row=0, column=4, fixed_row=False, fixed_column=False),
         Ref(row=2, column=2, fixed_row=False, fixed_column=False),
         Ref(row=2, column=0, fixed_row=False, fixed_column=False),
         Ref(row=2, column=1, fixed_row=False, fixed_column=False)]))]

In [59]:
calc_cols

[(3, ('=A1+B1', '=A2+B2', '=A3+B3', '=A4+B4')),
 (4, ('=C1*D1', '=C2*D2', '=E1*C3*D3', '=E2*C4*D4'))]

In [283]:
def sheet_to_code(sheet):
    value_cols = [(i, col) for i, col in enumerate(zip(*sheet)) if all(is_value(cell) for cell in col)]
    calc_cols = [(i, col) for i, col in enumerate(zip(*sheet)) if not all(is_value(cell) for cell in col)]
    transitions = list(join_sets([find_transitions(col) for _, col in calc_cols]))
    block_ranges = list(zip([0]+transitions, transitions+[len(sheet)]))
    blocks = [sheet[start:end] for start, end in block_ranges]

    result = [ast.Assign([ast.Name('output', ast.Store())], ast.Constant([]))]
    for start, end in block_ranges:
        to_calc = []
        for colnum, _ in calc_cols:
            colname = num_to_colname(colnum)
            function_name = f'calculate_{colname}{start}'
            code, variables = cell_to_code(Ref(row=start, column=colnum), sheet)

            to_calc.append((colname, function_name, variables))
            result.append(to_function(code, variables, function_name))

            
        loop_body = []
        # Assign non-local variables, if any
        loop_body.extend([
            ast.parse(f'{v.to_string()} = output[-{start-v.row}][{v.column}]')
            for v in variables if v.row != start
        ])

        # Calculate results which determined we needed to calculate
        for colname, function_name, variables in to_calc:
            loop_body.append(ast.Assign(
                [ast.Name(f'{colname}{start+1}', ast.Store())],
                ast.Call(
                    ast.Name(function_name, ast.Load()),
                    [ast.Name(v.to_string(), ast.Load()) for v in variables],
                    [],
                )
            ))
            
        # Write stuff pre-loop body
        result.append(
            ast.For(
                ast.Tuple([ast.Name(v.to_string(), ast.Store()) for v in variables if v.row == start], ast.Store()),
                ast.Constant([[row[v.column] for v in variables if v.row==start] for row in sheet[start:end]]),
                loop_body,
                []
            )
        )

        

        # Add to ouptut
        sorted_columns = ', '.join([f'{num_to_colname(i)}{start+1}' for i in range(len(sheet[start]))])
        result.append(ast.parse(f'output.append([{sorted_columns}])'))
    return result

sheet = [
    [3, 4, 0, '=A1+B1', '=C1*D1'],
    [4, 5, 3, '=A2+B2', '=C2*D2'],
    [7, 8, 2, '=A3+B3', '=E1*C3*D3'],
    [7, 8, 2, '=A4+B4', '=E2*C4*D4'],
]
full_ast = ast.fix_missing_locations(ast.Module(sheet_to_code(sheet), type_ignores=[]))
print(ast.unparse(full_ast))

output = []

def calculate_D0(A1, B1):
    return A1 + B1

def calculate_E0(C1, A1, B1):
    D1 = A1 + B1
    return C1 * D1
for (C1, A1, B1) in [[0, 3, 4], [3, 4, 5]]:
    D1 = calculate_D0(A1, B1)
    E1 = calculate_E0(C1, A1, B1)
output.append([A1, B1, C1, D1, E1])

def calculate_D2(A3, B3):
    return A3 + B3

def calculate_E2(E1, C3, A3, B3):
    D3 = A3 + B3
    return E1 * (C3 * D3)
for (C3, A3, B3) in [[2, 7, 8], [2, 7, 8]]:
    E1 = output[-2][4]
    D3 = calculate_D2(A3, B3)
    E3 = calculate_E2(E1, C3, A3, B3)
output.append([A3, B3, C3, D3, E3])


# Making a small demo
Leverage ipydatagrid (from Bloomberg) to show a small demo with a sheet on the left and code on the right. Next up probably for loops. :)

Also could think about a backwards connection, i.e. editing the python code and updating the Excel sheet as necessary. What to do when putting a new line though... Something to think about.

For now though, I think continuing the row-interpretation work and making it go in the demo might be good!

In [234]:
sheet = [
    [3, 4, 0, '=A1+B1', '=C1*D1'],
    [4, 5, 3, '=A2+B2', '=C2*D2'],
    [7, 8, 2, '=A3+B3', '=E1*C3*D3'],
    [7, 8, 2, '=A4+B4', '=E2*C4*D4'],
]

sheet = [[2, 3, 4, '=A1*B1+C1']]

In [235]:
from ipydatagrid import DataGrid
import ipywidgets as widgets
import pandas as pd

datagrid = DataGrid(pd.DataFrame(sheet), editable=True, base_column_size=128, base_row_size=30,
                    layout={'height': '200px', 'width': '600px'})
code_output = widgets.Output(layout={'border': '1px solid black'})

def update_df(cell):
    sheet[cell['row']][cell['column']] = cell['value']
    with code_output:
        code_output.clear_output()        
        update_code(sheet)
    
def update_code(sheet):
    code = sheet_to_code(sheet)
    print(code)
    exec(code)
    print(f'Value of output: {eval("output")}')
        
    
datagrid.on_cell_change(update_df)
with code_output:
    update_code(sheet)
    
widgets.Box(children=[datagrid, code_output])

Box(children=(DataGrid(auto_fit_params={'area': 'all', 'padding': 30, 'numCols': None}, base_column_size=128, …

In [None]:
x