In [1]:
import os
import xlcalculator


from series_extraction.excel_loader import ExcelLoader
from series_extraction.excel_validator import ExcelValidator
from series_extraction.excel_cleaner import ExcelCleaner
from series_extraction.table_finder import TableFinder
from series_extraction.series_extractor import SeriesExtractor
from series_extraction.excel_compatibility_checker import ExcelCompatibilityChecker
from series_extraction.series_iterator import SeriesIterator
from series_extraction.series_mapper import SeriesMapper

from ast_building.formula_parser import FormulaParser


In [2]:
current_directory = os.getcwd()

parent_directory = os.path.abspath(os.path.join(current_directory, os.pardir))
data_directory = os.path.join(parent_directory, 'data')

project_name = 'test_excel_8'

excel_raw_file_path = os.path.join(data_directory, "excel_files_raw", f"{project_name}_raw.xlsx")
excel_reduced_filepath = os.path.join(data_directory, "excel_files_reduced", f"{project_name}_reduced.xlsx")

In [3]:
excel_raw= ExcelLoader.load_file(excel_raw_file_path)
excel_reduced = ExcelLoader.load_file(excel_reduced_filepath)

In [4]:
is_valid = ExcelValidator.validate_excel(excel_reduced)

In [5]:
if not is_valid:
    raise Exception("Excel file is not valid")

In [6]:
excel_reduced_clean = ExcelCleaner.clean_excel(excel_reduced)

In [7]:
extracted_tables, data = TableFinder.find_tables(excel_reduced_clean)

In [8]:
series_data = SeriesExtractor.extract_table_details(extracted_tables, data)

In [9]:
is_compatible = ExcelCompatibilityChecker.check_file(excel_raw, excel_reduced, extracted_tables)

In [10]:
if not is_compatible:
    raise Exception("Excel file is not compatible")

In [11]:
series_dict = SeriesExtractor.extract_series(extracted_tables=extracted_tables, data=data)

In [12]:
series_iterator = SeriesIterator.iterate_series(series_dict)

In [13]:
series = None
for item in series_iterator:
    if item.formulas != [None, None]:
        series = item
        break


In [14]:
series

Series(series_id=UUID('8abf60de-4735-4190-b804-f0839c390cc4'), worksheet=Worksheet(sheet_name='Sheet1', workbook_file_path=None, worksheet=None), series_header='Index_Match_Size', formulas=['=INDEX(B:B,MATCH(A2,A:A,0))', '=INDEX(B:B,MATCH(A3,A:A,0))'], values=['male', 'female'], header_location=<HeaderLocation.TOP: 'top'>, series_starting_cell=Cell(column=13, row=2, coordinate='M2', value=None, value_type=None), series_length=2, series_data_type=<SeriesDataType.STR: 'str'>)

In [15]:
series_mapping = SeriesMapper.map_series(series_dict)

In [16]:
formula_1 = "=SUM(A2:A4)+AVERAGE(A7:A8)"
formula_1_ast = FormulaParser.parse_formula(formula_1)



In [17]:
from formula_visualiser import FormulaVisualiser

visualizer = FormulaVisualiser()
ast_graph = visualizer.visualise(formula_1_ast)
file_name = ast_graph.render('formula_1_ast')


In [18]:
series_mapping

{Worksheet(sheet_name='Sheet1', workbook_file_path=None, worksheet=None): {Cell(column=1, row=2, coordinate=None, value=None, value_type=None): (0,
   Series(series_id=UUID('183fe6ce-c3c3-46cc-ace6-db86d124d3bc'), worksheet=Worksheet(sheet_name='Sheet1', workbook_file_path=None, worksheet=None), series_header='age', formulas=[None, None], values=[56, 46], header_location=<HeaderLocation.TOP: 'top'>, series_starting_cell=Cell(column=1, row=2, coordinate='A2', value=None, value_type=None), series_length=2, series_data_type=<SeriesDataType.INT: 'int'>)),
  Cell(column=1, row=3, coordinate=None, value=None, value_type=None): (1,
   Series(series_id=UUID('183fe6ce-c3c3-46cc-ace6-db86d124d3bc'), worksheet=Worksheet(sheet_name='Sheet1', workbook_file_path=None, worksheet=None), series_header='age', formulas=[None, None], values=[56, 46], header_location=<HeaderLocation.TOP: 'top'>, series_starting_cell=Cell(column=1, row=2, coordinate='A2', value=None, value_type=None), series_length=2, serie

In [19]:
from objects import Cell, Worksheet

worksheet = Worksheet(sheet_name='Sheet1', workbook_file_path=None, worksheet=None)
cell = Cell(column=10, row=3, coordinate=None, value=None, value_type=None)

def get_series_from_cell_and_sheet_name(series_mapping, worksheet, cell):
    return series_mapping[worksheet][cell]

In [20]:
series_from_cell_and_sheet_name = get_series_from_cell_and_sheet_name(series_mapping,worksheet, cell)

In [21]:
series_from_cell_and_sheet_name

(1,
 Series(series_id=UUID('c94802a2-2f40-4f13-b778-62062a0a9f3f'), worksheet=Worksheet(sheet_name='Sheet1', workbook_file_path=None, worksheet=None), series_header='indebt', formulas=[None, None], values=[True, True], header_location=<HeaderLocation.TOP: 'top'>, series_starting_cell=Cell(column=10, row=2, coordinate='J2', value=None, value_type=None), series_length=2, series_data_type=<SeriesDataType.BOOL: 'bool'>))

In [22]:
def get_cells_between(cell_start: Cell, cell_end: Cell):
    """cell_start and cell_end as inputs. Get a list of all cells between these two cells."""
    cells = []
    for row in range(cell_start.row, cell_end.row + 1):
        for column in range(cell_start.column, cell_end.column + 1):
            cells.append(Cell(row=row, column=column, coordinate=None, value=None, value_type=None))
    return cells


In [23]:
cell_start = Cell(column=4, row=2, coordinate=None, value=None, value_type=None)
cell_end = Cell(column=4, row=3, coordinate=None, value=None, value_type=None)

In [24]:
cells = get_cells_between(cell_start, cell_end)

In [25]:
cells

[Cell(column=4, row=2, coordinate=None, value=None, value_type=None),
 Cell(column=4, row=3, coordinate=None, value=None, value_type=None)]

In [26]:
series_example = get_series_from_cell_and_sheet_name(series_mapping, worksheet, cells[0])

In [27]:
series_example[1]

Series(series_id=UUID('0a4945e3-5663-4f95-a6e5-9d3dc9b6c107'), worksheet=Worksheet(sheet_name='Sheet1', workbook_file_path=None, worksheet=None), series_header='platform', formulas=[None, None], values=['Instagram', 'Facebook'], header_location=<HeaderLocation.TOP: 'top'>, series_starting_cell=Cell(column=4, row=2, coordinate='D2', value=None, value_type=None), series_length=2, series_data_type=<SeriesDataType.STR: 'str'>)

In [28]:
def coordinate_from_string(cell_coordinate: str):
    """Convert Excel-style cell reference to numerical row and column indices."""
    column_str = ''.join(filter(str.isalpha, cell_coordinate))
    row_str = ''.join(filter(str.isdigit, cell_coordinate))

    # Convert column letters to number (A=1, B=2, ..., Z=26, AA=27, ...)
    column = 0
    for char in column_str:
        column = column * 26 + (ord(char.upper()) - ord('A') + 1)
    
    # Convert row string to number
    row = int(row_str)

    return (column, row)

def get_series_from_range(series_mapping: dict, sheet_name: str, cell_range: str):
    """cell_range is an Excel cell range as a string, eg. 'A1:B2'"""

    # Get the start and end cell coordinates from the cell range
    cell_start_coordinate, cell_end_coordinate = cell_range.split(':')

    # Convert cell coordinates to row and column
    cell_start_column, cell_start_row = coordinate_from_string(cell_start_coordinate)
    cell_end_column, cell_end_row = coordinate_from_string(cell_end_coordinate)

    cell_start = Cell(column=cell_start_column, row=cell_start_row, coordinate=None, value=None, value_type=None)
    cell_end = Cell(column=cell_end_column, row=cell_end_row, coordinate=None, value=None, value_type=None)

    cells_in_range = get_cells_between(cell_start, cell_end)
    worksheet = Worksheet(sheet_name=sheet_name, workbook_file_path=None, worksheet=None)
    series_range = [get_series_from_cell_and_sheet_name(series_mapping=series_mapping, worksheet=worksheet, cell=cell) for cell in cells_in_range]
    return series_range

In [29]:
series_range = get_series_from_range(series_mapping, 'Sheet1', cell_range="A2:A3")

In [30]:
series_range

[(0,
  Series(series_id=UUID('183fe6ce-c3c3-46cc-ace6-db86d124d3bc'), worksheet=Worksheet(sheet_name='Sheet1', workbook_file_path=None, worksheet=None), series_header='age', formulas=[None, None], values=[56, 46], header_location=<HeaderLocation.TOP: 'top'>, series_starting_cell=Cell(column=1, row=2, coordinate='A2', value=None, value_type=None), series_length=2, series_data_type=<SeriesDataType.INT: 'int'>)),
 (1,
  Series(series_id=UUID('183fe6ce-c3c3-46cc-ace6-db86d124d3bc'), worksheet=Worksheet(sheet_name='Sheet1', workbook_file_path=None, worksheet=None), series_header='age', formulas=[None, None], values=[56, 46], header_location=<HeaderLocation.TOP: 'top'>, series_starting_cell=Cell(column=1, row=2, coordinate='A2', value=None, value_type=None), series_length=2, series_data_type=<SeriesDataType.INT: 'int'>))]

In [31]:
from objects import Series, SeriesRange
import xlcalculator

class SeriesImplementer:

    def __init__(self, series_mapping):
        self.series_mapping = series_mapping

    @staticmethod
    def serialise_ast_to_formula(ast):
        if isinstance(ast, xlcalculator.ast_nodes.RangeNode): 
            value = ast.tvalue.strip('[]')
            return f"{value}"
        elif isinstance(ast, xlcalculator.ast_nodes.FunctionNode):
            args = ', '.join(SeriesImplementer.serialise_ast_to_formula(arg) for arg in ast.args)
            return f"{ast.tvalue}({args})"
        elif isinstance(ast, xlcalculator.ast_nodes.OperatorNode):
            left = SeriesImplementer.serialise_ast_to_formula(ast.left) if ast.left else ''
            right = SeriesImplementer.serialise_ast_to_formula(ast.right) if ast.right else ''
            return f"({left} {ast.tvalue} {right})".strip()
        elif isinstance(ast, xlcalculator.ast_nodes.OperandNode) and ast.tsubtype == 'text':
            return f'"{ast.tvalue}"'
        else:
            return str(ast.tvalue)
        
    @staticmethod
    def get_series_uuids_from_series_range(series_range: SeriesRange):
        series_ids = [str(series.series_id).replace("-","") for series in series_range.series]
        return f'{"+".join(series_ids)}_{series_range.start_index}_{series_range.end_index}'
        

    @staticmethod
    def replace_range_nodes(ast,sheet_name, series_mapping):
        if isinstance(ast, xlcalculator.ast_nodes.RangeNode):
            series_range = get_series_from_range(series_mapping=series_mapping, sheet_name=sheet_name, cell_range=ast.tvalue)
            series_uuids = SeriesImplementer.get_series_uuids_from_series_range(series_range)
            return xlcalculator.ast_nodes.RangeNode(xlcalculator.tokenizer.f_token(tvalue=series_uuids, ttype='operand', tsubtype='range'))
        elif isinstance(ast, xlcalculator.ast_nodes.FunctionNode):
            modified_args = [SeriesImplementer.replace_range_nodes(arg) for arg in ast.args]
            modified_function_node = xlcalculator.ast_nodes.FunctionNode(ast.token)
            modified_function_node.args = modified_args
            return modified_function_node
        elif isinstance(ast, xlcalculator.ast_nodes.OperatorNode):
            modified_left = SeriesImplementer.replace_range_nodes(ast.left) if ast.left else None
            modified_right = SeriesImplementer.replace_range_nodes(ast.right) if ast.right else None
            modified_operator_node = xlcalculator.ast_nodes.OperatorNode(ast.token)
            modified_operator_node.left = modified_left
            modified_operator_node.right = modified_right
            return modified_operator_node
        else:
            return ast


In [32]:
series_range = SeriesRange(series = [series_example[1]], start_index=0, end_index=1)

In [33]:
formula_1 = "=SUM(A5:A6)+SUM(A6:A7)"
formula_1_ast = FormulaParser.parse_formula(formula_1)

In [34]:
formula_1_ast_new = SeriesImplementer.replace_range_nodes(formula_1_ast)

TypeError: SeriesImplementer.replace_range_nodes() missing 2 required positional arguments: 'sheet_name' and 'series_mapping'

In [None]:
formula_1_new = SeriesImplementer.serialise_ast_to_formula(formula_1_ast_new)

In [None]:
formula_1_new

'(SUM(ecae942631aa44b48990c0b970dc3ea5_0_1) + SUM(ecae942631aa44b48990c0b970dc3ea5_0_1))'

In [None]:
formula_1_ast_new = FormulaParser.parse_formula(f'={formula_1_new}')

In [None]:
formula_visualiser = FormulaVisualiser()
ast_graph = formula_visualiser.visualise(formula_1_ast_new)
ast_graph.render('formula_1_ast_new')

'formula_1_ast_new.png'