In [1]:
import os
import xlcalculator


from series_extraction.excel_loader import ExcelLoader
from series_extraction.excel_validator import ExcelValidator
from series_extraction.excel_cleaner import ExcelCleaner
from series_extraction.table_finder import TableFinder
from series_extraction.series_extractor import SeriesExtractor
from series_extraction.excel_compatibility_checker import ExcelCompatibilityChecker
from series_extraction.series_iterator import SeriesIterator
from series_extraction.series_mapper import SeriesMapper

from ast_building.formula_parser import FormulaParser


In [2]:
# Workbook pair under analysis; the name is interpolated into both file names below.
project_name = 'test_excel_8'

# The data folder is resolved relative to the kernel's working directory: <cwd>/../data
current_directory = os.getcwd()
parent_directory = os.path.abspath(
    os.path.join(current_directory, os.pardir)
)
data_directory = os.path.join(parent_directory, 'data')

# The raw and reduced variants of the workbook live in separate sub-folders.
excel_raw_file_path = os.path.join(
    data_directory,
    "excel_files_raw",
    f"{project_name}_raw.xlsx",
)
excel_reduced_filepath = os.path.join(
    data_directory,
    "excel_files_reduced",
    f"{project_name}_reduced.xlsx",
)

In [3]:
# Load both variants of the workbook from the paths built in the previous cell.
# excel_raw is only used again by the compatibility check; excel_reduced feeds
# validation, cleaning and the compatibility check.
excel_raw= ExcelLoader.load_file(excel_raw_file_path)
excel_reduced = ExcelLoader.load_file(excel_reduced_filepath)

In [4]:
# Validate the reduced workbook; the result is checked in the next cell and
# aborts the run when it is falsy.
is_valid = ExcelValidator.validate_excel(excel_reduced)

In [5]:
# Abort before cleaning and table detection when validation failed.
if not is_valid:
    # ValueError is still an Exception subclass, so any broad handler keeps
    # working, but it identifies the failure as bad input data.
    raise ValueError("Excel file is not valid")

In [6]:
# Clean the reduced workbook; table detection in the next cell runs on the
# object returned here.
excel_reduced_clean = ExcelCleaner.clean_excel(excel_reduced)

In [7]:
# find_tables returns two values: the detected tables and an accompanying data
# object. Both are passed on to SeriesExtractor in the cells below.
extracted_tables, data = TableFinder.find_tables(excel_reduced_clean)

In [8]:
# Extract per-table details from the detected tables.
# NOTE(review): series_data is not referenced again in the visible cells.
series_data = SeriesExtractor.extract_table_details(extracted_tables, data)

In [9]:
# Check the raw workbook against the reduced one using the detected tables.
# NOTE(review): this passes excel_reduced, not excel_reduced_clean, although the
# tables were found on the cleaned workbook; confirm this is intended.
is_compatible = ExcelCompatibilityChecker.check_file(excel_raw, excel_reduced, extracted_tables)

In [10]:
# Abort before series extraction when the raw and reduced workbooks do not match.
if not is_compatible:
    # ValueError is still an Exception subclass, so any broad handler keeps
    # working, but it identifies the failure as bad input data.
    raise ValueError("Excel file is not compatible")

In [11]:
# Build the series collection consumed by SeriesIterator and SeriesMapper below.
series_dict = SeriesExtractor.extract_series(extracted_tables=extracted_tables, data=data)

In [12]:
# NOTE(review): if iterate_series returns a one-shot iterator/generator, the
# search loop in the next cell partially consumes it; re-run this cell before
# iterating again.
series_iterator = SeriesIterator.iterate_series(series_dict)

In [13]:
# Find the first series that carries at least one formula; `series` stays None
# when there is none.
series = None
for item in series_iterator:
    # Do not compare against the literal [None, None]: that only recognises
    # formula-free series of exactly two cells. any() works for every length.
    if any(formula is not None for formula in item.formulas):
        series = item
        break


In [14]:
series

Series(series_id=UUID('d66abed2-4223-42e0-b395-cca88c06dbfb'), worksheet=Worksheet(sheet_name='Sheet1', workbook_file_path=None, worksheet=None), series_header='Index_Match_Size', formulas=['=INDEX(B:B,MATCH(A2,A:A,0))', '=INDEX(B:B,MATCH(A3,A:A,0))'], values=['male', 'female'], header_location=<HeaderLocation.TOP: 'top'>, series_starting_cell=Cell(column=13, row=2, coordinate='M2', value=None, value_type=None), series_length=2, series_data_type=<SeriesDataType.STR: 'str'>)

In [15]:
# Build the cell-to-series lookup. As the displayed output further down shows,
# its shape is {Worksheet: {Cell: (index, Series)}}.
series_mapping = SeriesMapper.map_series(series_dict)

In [16]:
# Parse a sample formula into an AST; the next cell renders it as a graph.
# NOTE(review): formula_1 / formula_1_ast are re-bound in a later cell.
formula_1 = "=SUM(A2:A4)+AVERAGE(A7:A8)"
formula_1_ast = FormulaParser.parse_formula(formula_1)



In [17]:
# NOTE(review): this import sits mid-notebook; consider moving it to the first
# import cell so a fresh "Run All" surfaces missing dependencies early.
from formula_visualiser import FormulaVisualiser

visualizer = FormulaVisualiser()
# Build a graph object for the parsed formula.
ast_graph = visualizer.visualise(formula_1_ast)
# Render the graph using 'formula_1_ast' as the output name.
# presumably render() returns the path of the written file - TODO confirm
file_name = ast_graph.render('formula_1_ast')


In [32]:
series_mapping

{Worksheet(sheet_name='Sheet1', workbook_file_path=None, worksheet=None): {Cell(column=1, row=2, coordinate=None, value=None, value_type=None): (0,
   Series(series_id=UUID('d1e32880-defe-4808-b92b-02db1f8d070f'), worksheet=Worksheet(sheet_name='Sheet1', workbook_file_path=None, worksheet=None), series_header='age', formulas=[None, None], values=[56, 46], header_location=<HeaderLocation.TOP: 'top'>, series_starting_cell=Cell(column=1, row=2, coordinate='A2', value=None, value_type=None), series_length=2, series_data_type=<SeriesDataType.INT: 'int'>)),
  Cell(column=1, row=3, coordinate=None, value=None, value_type=None): (1,
   Series(series_id=UUID('d1e32880-defe-4808-b92b-02db1f8d070f'), worksheet=Worksheet(sheet_name='Sheet1', workbook_file_path=None, worksheet=None), series_header='age', formulas=[None, None], values=[56, 46], header_location=<HeaderLocation.TOP: 'top'>, series_starting_cell=Cell(column=1, row=2, coordinate='A2', value=None, value_type=None), series_length=2, serie

In [33]:
# NOTE(review): this import sits mid-notebook; consider moving it to the first
# import cell.
from objects import Cell, Worksheet

# Lookup keys for series_mapping. They are built with the same None-valued
# fields as the keys shown in the series_mapping output above.
worksheet = Worksheet(sheet_name='Sheet1', workbook_file_path=None, worksheet=None)
# Column 10, row 3: the second cell of the series that starts at J2 (see the
# output of the lookup below).
cell = Cell(column=10, row=3, coordinate=None, value=None, value_type=None)

def get_series_from_cell_and_sheet_name(series_mapping, worksheet, cell):
    """Return the mapping entry stored for ``cell`` on ``worksheet``.

    ``series_mapping`` is indexed first by worksheet and then by cell. No
    fallback is applied: if either key is missing, the error raised by the
    underlying lookup (``KeyError`` for plain dicts) propagates to the caller.
    """
    entries_for_sheet = series_mapping[worksheet]
    return entries_for_sheet[cell]

In [34]:
# Look up the entry for the worksheet/cell pair defined above; the displayed
# result below is an (index, Series) tuple.
series_from_cell_and_sheet_name = get_series_from_cell_and_sheet_name(series_mapping,worksheet, cell)

In [35]:
series_from_cell_and_sheet_name

(1,
 Series(series_id=UUID('7283d138-743e-48ed-8b9f-2948bc4a869d'), worksheet=Worksheet(sheet_name='Sheet1', workbook_file_path=None, worksheet=None), series_header='indebt', formulas=[None, None], values=[True, True], header_location=<HeaderLocation.TOP: 'top'>, series_starting_cell=Cell(column=10, row=2, coordinate='J2', value=None, value_type=None), series_length=2, series_data_type=<SeriesDataType.BOOL: 'bool'>))

In [36]:
def get_cells_between(cell_start: Cell, cell_end: Cell):
    """List every cell of the rectangle spanned by ``cell_start`` and ``cell_end``.

    Both corners are inclusive. Cells are produced row by row and, within a
    row, by ascending column. Each returned ``Cell`` carries only ``row`` and
    ``column``; the remaining fields are ``None``. If ``cell_end`` lies before
    ``cell_start`` on either axis, the result is an empty list.
    """
    row_span = range(cell_start.row, cell_end.row + 1)
    column_span = range(cell_start.column, cell_end.column + 1)
    return [
        Cell(row=row_index, column=column_index, coordinate=None, value=None, value_type=None)
        for row_index in row_span
        for column_index in column_span
    ]


In [37]:
# Corners of a one-column range: column 4, rows 2 to 3.
cell_start = Cell(column=4, row=2, coordinate=None, value=None, value_type=None)
cell_end = Cell(column=4, row=3, coordinate=None, value=None, value_type=None)

In [38]:
# Expand the two corners into the full list of cells (both ends inclusive).
cells = get_cells_between(cell_start, cell_end)

In [39]:
cells

[Cell(column=4, row=2, coordinate=None, value=None, value_type=None),
 Cell(column=4, row=3, coordinate=None, value=None, value_type=None)]

In [42]:
# Single-cell lookup for the first cell of the expanded range.
series_example = get_series_from_cell_and_sheet_name(series_mapping, worksheet, cells[0])

In [44]:
def get_series_from_range(series_mapping, sheet_name, cell_start, cell_end):
    """Collect the mapping entry of every cell in a rectangular range.

    The range is expanded with ``get_cells_between`` (both corners inclusive)
    and each cell is looked up on a ``Worksheet`` key built from ``sheet_name``
    with its other fields set to ``None``.

    Returns:
        list: One entry per cell, in the order ``get_cells_between`` yields them.
    """
    range_cells = get_cells_between(cell_start, cell_end)
    sheet_key = Worksheet(sheet_name=sheet_name, workbook_file_path=None, worksheet=None)

    entries = []
    for range_cell in range_cells:
        entry = get_series_from_cell_and_sheet_name(
            series_mapping=series_mapping,
            worksheet=sheet_key,
            cell=range_cell,
        )
        entries.append(entry)
    return entries

In [45]:
# Range lookup on 'Sheet1' for the corners defined above; the output below
# holds one (index, Series) entry per cell.
series_range = get_series_from_range(series_mapping, 'Sheet1', cell_start, cell_end)

In [46]:
series_range

[(0,
  Series(series_id=UUID('1856f865-1bfa-4485-a0c3-c3a9529a404e'), worksheet=Worksheet(sheet_name='Sheet1', workbook_file_path=None, worksheet=None), series_header='platform', formulas=[None, None], values=['Instagram', 'Facebook'], header_location=<HeaderLocation.TOP: 'top'>, series_starting_cell=Cell(column=4, row=2, coordinate='D2', value=None, value_type=None), series_length=2, series_data_type=<SeriesDataType.STR: 'str'>)),
 (1,
  Series(series_id=UUID('1856f865-1bfa-4485-a0c3-c3a9529a404e'), worksheet=Worksheet(sheet_name='Sheet1', workbook_file_path=None, worksheet=None), series_header='platform', formulas=[None, None], values=['Instagram', 'Facebook'], header_location=<HeaderLocation.TOP: 'top'>, series_starting_cell=Cell(column=4, row=2, coordinate='D2', value=None, value_type=None), series_length=2, series_data_type=<SeriesDataType.STR: 'str'>))]

In [99]:
import xlcalculator.ast_nodes

class SeriesImplementer:
    """Helpers for rewriting xlcalculator formula ASTs and turning them back into text."""

    @staticmethod
    def serialise_ast_to_formula(ast):
        """Render an AST node (and everything below it) as formula text.

        The result carries no leading ``=``. Every operator expression is
        wrapped in parentheses so the nesting of the tree is kept explicit.

        Args:
            ast: Root node of the (sub)tree to render.

        Returns:
            str: The formula text for ``ast``.
        """
        nodes = xlcalculator.ast_nodes

        # Ranges are handled before the generic operand fallback at the bottom.
        if isinstance(ast, nodes.RangeNode):
            # Drop any square brackets surrounding the reference text.
            return f"{ast.tvalue.strip('[]')}"

        if isinstance(ast, nodes.FunctionNode):
            # A function without arguments may carry no args list at all.
            rendered_args = ', '.join(
                SeriesImplementer.serialise_ast_to_formula(arg) for arg in (ast.args or [])
            )
            return f"{ast.tvalue}({rendered_args})"

        if isinstance(ast, nodes.OperatorNode):
            # Only join the operands that exist, so a unary operator does not
            # leave a dangling space inside the parentheses ("(- A1)" rather
            # than "( - A1)").
            parts = []
            if ast.left is not None:
                parts.append(SeriesImplementer.serialise_ast_to_formula(ast.left))
            parts.append(str(ast.tvalue))
            if ast.right is not None:
                parts.append(SeriesImplementer.serialise_ast_to_formula(ast.right))
            return f"({' '.join(parts)})"

        if isinstance(ast, nodes.OperandNode) and ast.tsubtype == 'text':
            # Excel escapes a double quote inside a text literal by doubling it.
            escaped_text = str(ast.tvalue).replace('"', '""')
            return f'"{escaped_text}"'

        return str(ast.tvalue)

    @staticmethod
    def replace_range_nodes(ast, replacement_range='A2:A4'):
        """Return a tree in which every range reference is replaced.

        Function and operator nodes are rebuilt around their original tokens,
        so the input tree itself is not modified. Nodes of any other kind are
        returned as-is and are therefore shared with the input tree.

        Args:
            ast: Root node of the (sub)tree to rewrite.
            replacement_range: Reference text placed in every range node.
                Defaults to ``'A2:A4'``, so existing single-argument calls
                keep their behaviour.

        Returns:
            The rewritten node.
        """
        nodes = xlcalculator.ast_nodes

        if isinstance(ast, nodes.RangeNode):
            replacement_token = xlcalculator.tokenizer.f_token(
                tvalue=replacement_range, ttype='operand', tsubtype='range'
            )
            return nodes.RangeNode(replacement_token)

        if isinstance(ast, nodes.FunctionNode):
            rebuilt_function = nodes.FunctionNode(ast.token)
            # Keep a missing args list missing instead of failing on iteration.
            rebuilt_function.args = None if ast.args is None else [
                SeriesImplementer.replace_range_nodes(arg, replacement_range)
                for arg in ast.args
            ]
            return rebuilt_function

        if isinstance(ast, nodes.OperatorNode):
            rebuilt_operator = nodes.OperatorNode(ast.token)
            rebuilt_operator.left = (
                SeriesImplementer.replace_range_nodes(ast.left, replacement_range)
                if ast.left is not None else None
            )
            rebuilt_operator.right = (
                SeriesImplementer.replace_range_nodes(ast.right, replacement_range)
                if ast.right is not None else None
            )
            return rebuilt_operator

        return ast


In [100]:
# Re-bind formula_1 / formula_1_ast (first defined in an earlier cell) to a
# formula with two SUM ranges, used as input for the range-replacement demo.
formula_1 = "=SUM(A5:A6)+SUM(A6:A7)"
formula_1_ast = FormulaParser.parse_formula(formula_1)

In [102]:
# Swap every range in the AST for the replacement range 'A2:A4' used by
# replace_range_nodes when called with a single argument.
formula_1_ast_new = SeriesImplementer.replace_range_nodes(formula_1_ast)

In [104]:
# Serialise the rewritten AST back to text; as the output below shows, the
# result has no leading '=' and the operator expression is parenthesised.
formula_1_new = SeriesImplementer.serialise_ast_to_formula(formula_1_ast_new)

In [105]:
formula_1_new

'(SUM(A2:A4) + SUM(A2:A4))'