In [1]:
import os

from series_extraction.excel_loader import ExcelLoader
from series_extraction.excel_cleaner import ExcelCleaner
from series_extraction.table_finder import TableFinder
from series_extraction.series_extractor import SeriesExtractor
from series_extraction.excel_compatibility_checker import ExcelCompatibilityChecker
from series_extraction.series_iterator import SeriesIterator
from series_extraction.series_mapper import SeriesMapper
from series_extraction.excel_validator import ExcelValidator

from ast_transformation.series_formula_generator import SeriesFormulaGenerator
from ast_transformation.formula_generator import SeriesIdLoader

from ast_building.formula_parser import FormulaParser
from ast_building.series_implementer import SeriesImplementer

from pipeline_building.series_dependencies_builder import SeriesDependenciesBuilder
from pipeline_building.dag_sorter import DAGSorter

from excel_builder import ExcelBuilder

from excel_checker import ExcelChecker


In [2]:
# Locate the `data` folder that sits next to the directory the notebook runs from.
current_directory = os.getcwd()
parent_directory = os.path.abspath(
    os.path.join(current_directory, os.pardir)
)
data_directory = os.path.join(parent_directory, 'data')

# Base name shared by every workbook of this project.
project_name = 'test_excel_2'

# One workbook per processing stage, each stored in its own sub-folder of `data`.
excel_raw_file_path = os.path.join(
    data_directory,
    "excel_files_raw",
    f"{project_name}_raw.xlsx",
)
excel_reduced_filepath = os.path.join(
    data_directory,
    "excel_files_reduced",
    f"{project_name}_reduced.xlsx",
)
excel_reduced_clean_filepath = os.path.join(
    data_directory,
    "excel_files_reduced_clean",
    f"{project_name}_reduced_clean.xlsx",
)
excel_reduced_clean_series_filepath = os.path.join(
    data_directory,
    "excel_files_reduced_clean_series",
    f"{project_name}_reduced_clean_series.xlsx",
)

In [3]:
# --- Load both workbooks and validate the reduced one -----------------------
excel_raw = ExcelLoader.load_file(excel_raw_file_path)
excel_reduced = ExcelLoader.load_file(excel_reduced_filepath)
is_valid = ExcelValidator.validate_excel(excel_reduced)
if not is_valid:
    raise Exception("Excel file is not valid")

# --- Clean the reduced workbook and write the cleaned copy to disk ----------
excel_reduced_clean = ExcelCleaner.clean_excel(excel_reduced)
ExcelBuilder.create_excel_from_workbook(
    excel_reduced_clean.workbook_with_formulas,
    excel_reduced_clean_filepath,
)

# --- Find tables and check them against the raw and reduced workbooks -------
extracted_tables, data = TableFinder.find_tables(excel_reduced_clean)
is_compatible = ExcelCompatibilityChecker.check_file(
    excel_raw, excel_reduced, extracted_tables
)
if not is_compatible:
    # Show what was extracted before aborting, to help diagnose the mismatch.
    print(extracted_tables)
    raise Exception("Excel file is not compatible")

# --- Extract the series and materialise them as a list ----------------------
series_dict = SeriesExtractor.extract_series(
    extracted_tables=extracted_tables,
    data=data,
)
series_mapping = SeriesMapper.map_series(series_dict)
series_iterator = SeriesIterator.iterate_series(series_dict)
series_list = list(series_iterator)

# A series whose `formulas` equals [None, None] is treated as value-only.
series_list_with_formulas = [
    candidate for candidate in series_list if candidate.formulas != [None, None]
]
series_list_with_values = [
    candidate for candidate in series_list if candidate.formulas == [None, None]
]

# `series_list_new` is handed to process_series_formulas below; this cell never
# appends to it directly (presumably the callee fills it in -- confirm).
series_list_new = []
# (series_id, AST of the first formula) pairs, consumed by later cells.
formula_1_ast_series_list = []

for series in series_list_with_formulas:
    formula_1, formula_2 = SeriesFormulaGenerator.adjust_formulas(series.formulas)
    if formula_1 is None or formula_2 is None:
        # NOTE(review): these series are not passed to process_series_formulas
        # and are not in series_list_with_values either -- confirm this is intended.
        continue

    series_implementer = SeriesImplementer(
        series_mapping,
        sheet_name=series.worksheet.sheet_name,
    )

    # First formula: parse, rewrite through the implementer, and record it.
    formula_1_ast = FormulaParser.parse_formula(formula_1)
    formula_1_ast_series = series_implementer.update_ast(formula_1_ast)
    formula_1_ast_series_list.append((series.series_id, formula_1_ast_series))

    # Second formula: same treatment, but it is not recorded in the list.
    formula_2_ast = FormulaParser.parse_formula(formula_2)
    formula_2_ast_series = series_implementer.update_ast(formula_2_ast)

    SeriesFormulaGenerator.process_series_formulas(
        series,
        formula_1_ast_series,
        formula_2_ast_series,
        series_mapping,
        series_dict,
        series_list_new,
    )

series_list_updated = series_list_new + series_list_with_values

# Write the series-based workbook, then compare it with the cleaned workbook.
# The comparison stays the last expression so its result is the cell output.
ExcelBuilder.create_excel_from_series(
    series_list_updated,
    excel_reduced_clean_series_filepath,
)
ExcelChecker.excels_are_equivalent(
    excel_reduced_clean_filepath,
    excel_reduced_clean_series_filepath,
)

True

In [4]:
# Build series dependencies from the (series_id, first-formula AST) pairs collected
# by the pipeline cell, then hand them to the DAG sorter.
# NOTE(review): presumably `sorted_dag` is a dependency-respecting ordering -- confirm against DAGSorter.
series_dependencies = SeriesDependenciesBuilder.build_dependencies(formula_1_ast_series_list)
sorted_dag = DAGSorter.sort_dag(series_dependencies)

In [5]:
import xlcalculator

def get_function_node(ast):
    """Return the first FunctionNode reachable from ``ast``.

    ``ast`` itself is returned when it is a FunctionNode. Any other node is
    delegated to ``process_node`` and that call's result (possibly ``None``)
    is returned unchanged.
    """
    if not isinstance(ast, xlcalculator.ast_nodes.FunctionNode):
        # Not a function itself: let the type dispatcher look further down.
        return process_node(ast)
    return ast

def process_node(node):
    """Dispatch on the node type while searching for a FunctionNode.

    * RangeNode     -> nothing to search, returns ``None``.
    * FunctionNode  -> result of ``get_function_node(node)``.
    * OperatorNode  -> result of ``handle_operator_node(node)``.
    * anything else -> ``None``.
    """
    node_types = xlcalculator.ast_nodes
    if isinstance(node, node_types.RangeNode):
        # Ranges are leaves in this search.
        return None
    if isinstance(node, node_types.FunctionNode):
        return get_function_node(node)
    if isinstance(node, node_types.OperatorNode):
        return handle_operator_node(node)
    return None

def handle_function_node(node):
    """Search the arguments of ``node`` for a FunctionNode.

    Each entry of ``node.args`` is passed to ``get_function_node`` in order;
    the first truthy result is returned. Returns ``None`` when there are no
    arguments or none of them yields a result.

    NOTE(review): no call to this helper is visible in this part of the notebook.
    """
    for argument in node.args:
        found = get_function_node(argument)
        if found:
            return found
    return None

def handle_operator_node(node):
    """Search the operands of an OperatorNode for a FunctionNode.

    The ``left`` operand is examined before the ``right`` one. A falsy operand
    is skipped; otherwise it is passed to ``get_function_node`` and the first
    truthy result is returned. Returns ``None`` when neither side yields one.
    """
    for side in ("left", "right"):
        # Attribute is read lazily so `right` is untouched when `left` succeeds.
        operand = getattr(node, side)
        if not operand:
            continue
        found = get_function_node(operand)
        if found:
            return found
    return None

# Search the AST stored in the second (series_id, AST) pair of the list.
# NOTE(review): the position is hard-coded; this raises IndexError when fewer than two pairs were collected.
function_node = get_function_node(formula_1_ast_series_list[1][1])


In [6]:
# Print the string form of the AST stored in the second (series_id, AST) pair.
print(formula_1_ast_series_list[1][1])

- (SUMIF((('parking|Vehicle Reg|1|1',), (None, None)), (('summary|Vehicle Reg|1|3',), (0, 0)), (('parking|TGB parking cost|1|3',), (None, None))))


In [7]:
# Display the `tvalue` attribute of `function_node`.
function_node.tvalue

'SUMIF'

In [38]:
import pandas as pd

def get_dataframe_from_series_id_string_list(series_id_string_list):
    """Collect the values of the requested series into one DataFrame.

    Each string is parsed with ``SeriesIdLoader.load_series_id_from_string``.
    The series registered under the parsed id's ``sheet_name`` in the
    notebook-level ``series_dict`` are scanned, and every series whose
    ``series_id`` equals the parsed id contributes one column named after its
    ``series_header``.

    Note: this function reads the global ``series_dict``, so the cell that
    creates it must have been executed first.

    Args:
        series_id_string_list: iterable of series id strings.

    Returns:
        The per-series single-column DataFrames concatenated side by side
        (``axis=1``). An empty DataFrame is returned when the input is empty
        or no series matched; previously ``pd.concat`` raised
        ``ValueError: No objects to concatenate`` in that case.

    Raises:
        Whatever ``series_dict[sheet_name]`` raises for an unknown sheet name.
    """
    df_list = []
    for series_id_string in series_id_string_list:
        series_id = SeriesIdLoader.load_series_id_from_string(series_id_string)
        sheet_name = series_id.sheet_name
        for series in series_dict[sheet_name]:
            if series.series_id == series_id:
                # Echo the matched series so the cell output shows what was used.
                print(series)
                df_list.append(pd.DataFrame(data=series.values, columns=[series.series_id.series_header]))

    if not df_list:
        # pd.concat refuses an empty sequence; give callers an empty frame instead.
        return pd.DataFrame()

    return pd.concat(df_list, axis=1)
        

In [39]:
# Two series ids in their string form; the parsed ids printed by this cell show
# the layout "sheet_name|series_header|header row|header column".
series_id_string_list = ('parking|Vehicle Reg|1|1','summary|Vehicle Reg|1|3')
df = get_dataframe_from_series_id_string_list(series_id_string_list)

Series(series_id=SeriesId(sheet_name='parking', series_header='Vehicle Reg', series_header_cell_row=1, series_header_cell_column=1), worksheet=Worksheet(sheet_name='parking', workbook_file_path=None, worksheet=None), series_header='Vehicle Reg', formulas=[None, None], values=['FG20UGP', 'WV69OLC'], header_location=<HeaderLocation.TOP: 'top'>, series_starting_cell=Cell(column=1, row=2, coordinate='A2', sheet_name=None, value=None, value_type=None), series_length=2, series_data_type=<SeriesDataType.STR: 'str'>)
Series(series_id=SeriesId(sheet_name='summary', series_header='Vehicle Reg', series_header_cell_row=1, series_header_cell_column=3), worksheet=Worksheet(sheet_name='summary', workbook_file_path=None, worksheet=None), series_header='Vehicle Reg', formulas=['=summary!A2', '=summary!A3'], values=['FG20VLN', 'FH19VRE'], header_location=<HeaderLocation.TOP: 'top'>, series_starting_cell=Cell(column=3, row=2, coordinate='C2', sheet_name=None, value=None, value_type=None), series_length=2

In [40]:
# Display the DataFrame built from the two series ids.
df

Unnamed: 0,Vehicle Reg,Vehicle Reg.1
0,FG20UGP,FG20VLN
1,WV69OLC,FH19VRE


In [11]:
# TODO: create a Python function that takes an AST and replaces RangeNodes with pandas DataFrames and FunctionNodes with Python functions.

In [12]:
# Build an xlcalculator FunctionNode by hand from a "SUMIF" function token.
# NOTE(review): this rebinds `function_node`, which an earlier cell assigned from get_function_node(...).

import xlcalculator


token = xlcalculator.tokenizer.f_token(tvalue="SUMIF", ttype="function", tsubtype="")

function_node = xlcalculator.ast_nodes.FunctionNode(token)

In [13]:
# Display the repr of `function_node`.
function_node

<FunctionNode tvalue: 'SUMIF', ttype: function, tsubtype: >

In [43]:
# Keep a handle on the AST stored in the second (series_id, AST) pair.
# NOTE(review): hard-coded position; also, the name `ast` matches the standard-library
# module `ast` and would hide it if that module were imported in this notebook.
ast = formula_1_ast_series_list[1][1]

In [45]:
# Print the string form of the selected AST.
print(ast)

- (SUMIF((('parking|Vehicle Reg|1|1',), (None, None)), (('summary|Vehicle Reg|1|3',), (0, 0)), (('parking|TGB parking cost|1|3',), (None, None))))
