In [1]:
import os

from series_extraction.excel_loader import ExcelLoader
from series_extraction.excel_cleaner import ExcelCleaner
from series_extraction.table_finder import TableFinder
from series_extraction.series_extractor import SeriesExtractor
from series_extraction.excel_compatibility_checker import ExcelCompatibilityChecker
from series_extraction.series_iterator import SeriesIterator
from series_extraction.series_mapper import SeriesMapper
from series_extraction.excel_validator import ExcelValidator

from ast_transformation.series_formula_generator import SeriesFormulaGenerator
from ast_transformation.formula_generator import SeriesIdLoader
from ast_transformation.formula_generator import FormulaGenerator

from ast_building.formula_parser import FormulaParser
from ast_building.series_implementer import SeriesImplementer

from pipeline_building.series_dependencies_builder import SeriesDependenciesBuilder
from pipeline_building.dag_sorter import DAGSorter

from excel_builder import ExcelBuilder

from excel_checker import ExcelChecker

import pandas as pd
import xlcalculator
import ast
import numpy as np


In [2]:
# Notebook configuration: every path is derived from the working directory's
# parent, which is expected to contain a `data` folder.
project_name = 'average'

current_directory = os.getcwd()
parent_directory = os.path.abspath(os.path.join(current_directory, os.pardir))
data_directory = os.path.join(parent_directory, 'data')

# One (folder, file name) pair per processing stage, in pipeline order.
(
    excel_raw_file_path,
    excel_reduced_filepath,
    excel_reduced_clean_filepath,
    excel_reduced_clean_series_filepath,
) = (
    os.path.join(data_directory, stage_folder, stage_filename)
    for stage_folder, stage_filename in (
        ("excel_files_raw", f"{project_name}_raw.xlsx"),
        ("excel_files_reduced", f"{project_name}_reduced.xlsx"),
        ("excel_files_reduced_clean", f"{project_name}_reduced_clean.xlsx"),
        ("excel_files_reduced_clean_series", f"{project_name}_reduced_clean_series.xlsx"),
    )
)

In [3]:
# --- Load both workbooks and validate the reduced one -------------------------
excel_raw = ExcelLoader.load_file(excel_raw_file_path)
excel_reduced = ExcelLoader.load_file(excel_reduced_filepath)
is_valid = ExcelValidator.validate_excel(excel_reduced)
if not is_valid:
    raise Exception("Excel file is not valid")

# --- Clean the reduced workbook and build an Excel file from it ---------------
excel_reduced_clean = ExcelCleaner.clean_excel(excel_reduced)
ExcelBuilder.create_excel_from_workbook(
    excel_reduced_clean.workbook_with_formulas,
    excel_reduced_clean_filepath,
)

# --- Find tables and check them against the raw / reduced workbooks -----------
extracted_tables, data = TableFinder.find_tables(excel_reduced_clean)
is_compatible = ExcelCompatibilityChecker.check_file(excel_raw, excel_reduced, extracted_tables)
if not is_compatible:
    # Show the offending tables before aborting.
    print(extracted_tables)
    raise Exception("Excel file is not compatible")

# --- Extract series and split them by whether they carry formulas -------------
series_dict = SeriesExtractor.extract_series(extracted_tables=extracted_tables, data=data)
series_mapping = SeriesMapper.map_series(series_dict)
series_iterator = SeriesIterator.iterate_series(series_dict)

series_list = list(series_iterator)

# A series whose `formulas` equals [None, None] is treated as a plain-value series.
series_list_with_formulas = [
    candidate for candidate in series_list if candidate.formulas != [None, None]
]
series_list_with_values = [
    candidate for candidate in series_list if candidate.formulas == [None, None]
]

# --- Turn each formula series into series-level ASTs --------------------------
series_list_new = []
formula_1_ast_series_list = []
ast_generator_dict = {}

for series in series_list_with_formulas:
    formula_1, formula_2 = SeriesFormulaGenerator.adjust_formulas(series.formulas)
    if formula_1 is None or formula_2 is None:
        # Only series with both adjusted formulas are processed.
        continue

    series_implementer = SeriesImplementer(
        series_mapping,
        sheet_name=series.worksheet.sheet_name,
    )

    # Parse both formulas and rewrite their ASTs via the series implementer.
    formula_1_ast = FormulaParser.parse_formula(formula_1)
    formula_1_ast_series = series_implementer.update_ast(formula_1_ast)
    formula_1_ast_series_list.append((series.series_id, formula_1_ast_series))

    formula_2_ast = FormulaParser.parse_formula(formula_2)
    formula_2_ast_series = series_implementer.update_ast(formula_2_ast)

    # `series_list_new` is passed in as the accumulator for the processed series.
    SeriesFormulaGenerator.process_series_formulas(
        series,
        formula_1_ast_series,
        formula_2_ast_series,
        series_mapping,
        series_dict,
        series_list_new,
    )

    # Keep one AST generator per series id for later use.
    sheet_name = series.worksheet.sheet_name
    series_list_within_sheet = series_dict.get(sheet_name)
    ast_generator = FormulaGenerator.get_ast_generator(
        formula_1_ast_series,
        formula_2_ast_series,
        series_list_within_sheet,
    )
    ast_generator_dict[series.series_id] = ast_generator

# --- Rebuild a workbook from the series and compare it with the cleaned one ---
series_list_updated = series_list_new + series_list_with_values

ExcelBuilder.create_excel_from_series(series_list_updated, excel_reduced_clean_series_filepath)
ExcelChecker.excels_are_equivalent(excel_reduced_clean_filepath, excel_reduced_clean_series_filepath)

True

In [4]:
# Show the series ids for which the pipeline cell stored an AST generator.
ast_generator_dict.keys()

dict_keys([SeriesId(sheet_name='Sheet1', series_header='average', series_header_cell_row=1, series_header_cell_column=3)])

In [5]:
# Build the dependencies from the (series_id, formula_1 AST) pairs collected in the pipeline cell,
# then let DAGSorter order the series ids.
# NOTE(review): presumably a topological order (inputs before dependents) — confirm in DAGSorter.
series_dependencies = SeriesDependenciesBuilder.build_dependencies(formula_1_ast_series_list)
sorted_dag = DAGSorter.sort_dag(series_dependencies)

In [6]:
# Display the sorted series ids.
sorted_dag

[SeriesId(sheet_name='Sheet1', series_header='col_1', series_header_cell_row=1, series_header_cell_column=1),
 SeriesId(sheet_name='Sheet1', series_header='col_2', series_header_cell_row=1, series_header_cell_column=2),
 SeriesId(sheet_name='Sheet1', series_header='average', series_header_cell_row=1, series_header_cell_column=3)]

In [7]:
# Visit the series in sorted-DAG order, considering only those that have an AST generator.
# NOTE(review): the looked-up generator is discarded, so this loop currently has no effect.
for series_id in sorted_dag:
    if series_id in ast_generator_dict:
        ast_generator_dict[series_id]

In [8]:
# Inspect the formula_1 AST left over from the last iteration of the pipeline loop
# (this name is a loop variable, so it only exists if that loop processed at least one series).
formula_1_ast_series

<FunctionNode tvalue: 'AVERAGE', ttype: function, tsubtype: >

In [9]:
import xlcalculator

def extract_series_id_string_list(ast):
    """Collect the ``tvalue`` of every range node reachable from an AST.

    The tree is walked depth-first: function arguments in order, operator
    operands left then right, list items in order. Objects that are none of
    these but expose a ``children`` attribute have that attribute walked too.

    Args:
        ast: Root of the tree — an xlcalculator AST node, or a list of nodes.
            (The parameter name shadows the ``ast`` module inside this
            function; it is kept for compatibility with existing callers.)

    Returns:
        list: The ``tvalue`` of each ``RangeNode`` in visit order. Duplicates
        are kept.
    """
    ast_nodes = xlcalculator.ast_nodes
    series_id_string_list = []

    def visit(node):
        if isinstance(node, ast_nodes.RangeNode):
            series_id_string_list.append(node.tvalue)
        elif isinstance(node, ast_nodes.FunctionNode):
            # A function node without arguments may carry ``args = None``.
            for arg in node.args or []:
                visit(arg)
        elif isinstance(node, ast_nodes.OperatorNode):
            # Operators may have a missing operand; skip it.
            if node.left:
                visit(node.left)
            if node.right:
                visit(node.right)
        elif isinstance(node, list):
            for item in node:
                visit(item)
        elif hasattr(node, 'children'):
            visit(node.children)

    visit(ast)
    return series_id_string_list

In [10]:
# Collect the range-node strings referenced by the formula_1 AST left over from the pipeline loop.
range_nodes_string_list = extract_series_id_string_list(formula_1_ast_series)

In [11]:
# Display the collected range-node strings.
range_nodes_string_list

["(('Sheet1|col_1|1|1', 'Sheet1|col_2|1|2'), (0, 0))"]

In [12]:
# Each entry is the string form of a (series id strings, index pair) tuple;
# parse it back into Python objects and print the series id strings.
for range_nodes_string in range_nodes_string_list:
    series_id_string_list, indexes = ast.literal_eval(range_nodes_string)
    print(series_id_string_list)


('Sheet1|col_1|1|1', 'Sheet1|col_2|1|2')


In [13]:
# Hard-coded copy of the tuple printed by the parsing loop above; it overwrites the value that loop assigned.
series_id_string_list = ('Sheet1|col_1|1|1', 'Sheet1|col_2|1|2')

In [14]:
class FormulaEvaluator:
    """Evaluate series-level formula strings against per-series DataFrames.

    At construction, one DataFrame is loaded per series id string and cached
    in ``self.df_dict`` under that string.

    NOTE: this class reads the notebook-level names ``series_dict`` and
    ``SeriesIdLoader``; both must be defined before it is instantiated.
    """

    def __init__(self, series_id_string_list):
        """Load and cache a DataFrame for every series id string.

        Args:
            series_id_string_list: Iterable of series id strings.

        Raises:
            KeyError: If a series id refers to a sheet missing from ``series_dict``.
            ValueError: If no series in the sheet matches a series id.
        """
        self.df_dict = {
            series_id_string: self.get_dataframe_from_series_id_string(series_id_string)
            for series_id_string in series_id_string_list
        }

    @staticmethod
    def get_dataframe_from_series_id_string(series_id_string):
        """Build a DataFrame holding the values of the series named by the string.

        Args:
            series_id_string: Series id string understood by
                ``SeriesIdLoader.load_series_id_from_string``.

        Returns:
            pandas.DataFrame: One column per matching series, named after the
            series header.

        Raises:
            KeyError: If the sheet is missing from ``series_dict``.
            ValueError: If no series in the sheet has this series id.
        """
        # Assume series_dict and SeriesIdLoader are predefined somewhere in the project
        series_id = SeriesIdLoader.load_series_id_from_string(series_id_string)
        frames = [
            pd.DataFrame(data=series.values, columns=[series.series_id.series_header])
            for series in series_dict[series_id.sheet_name]
            if series.series_id == series_id
        ]
        if not frames:
            # pd.concat([]) would raise a ValueError that does not name the series.
            raise ValueError(
                f"No series found for series id string {series_id_string!r}"
            )
        return pd.concat(frames, axis=1)

    def fetch_df(self, identifier, index_range):
        """Return the rows of a cached DataFrame for a positional index range.

        Args:
            identifier: Series id string used as key in ``self.df_dict``.
            index_range: ``(start, end)`` positional row indexes; ``end`` is inclusive.

        Returns:
            pandas.DataFrame: The selected rows.
        """
        start, end = index_range
        return self.df_dict[identifier].iloc[start:end + 1]

    def AVERAGE(self, args):
        """Average the numeric values of several series over one index range.

        Args:
            args: ``(identifiers, index_range)`` where ``identifiers`` is an
                iterable of series id strings and ``index_range`` is as in
                ``fetch_df``.

        Returns:
            The result of xlcalculator's ``AVERAGE`` over the collected numbers.
        """
        identifiers, index_range = args
        # Flatten the numeric cells of every selected slice, keeping identifier order.
        numbers = [
            value
            for identifier in identifiers
            for value in self.fetch_df(identifier, index_range)
            .select_dtypes(include=[np.number])
            .values.flatten()
        ]
        return xlcalculator.xlfunctions.statistics.AVERAGE(numbers)

    def eval_formula(self, formula):
        """Evaluate a formula string as a Python expression.

        Only the functions listed in the local environment (currently
        ``AVERAGE``) are available to the formula.

        Args:
            formula: Expression string, e.g. ``"AVERAGE(((ids...), (start, end)))"``.

        Returns:
            The value of the evaluated expression.
        """
        tree = ast.parse(formula, mode='eval')
        local_env = {
            'AVERAGE': self.AVERAGE
        }
        compiled = compile(tree, filename="<ast>", mode='eval')
        # SECURITY: removing __builtins__ is not a sandbox; only evaluate
        # formula strings produced by this pipeline, never untrusted input.
        result = eval(compiled, {'__builtins__': {}}, local_env)
        return result


In [15]:
# Smoke test: evaluate an AVERAGE over the first row (index range (0, 0)) of col_1 and col_2.
evaluator = FormulaEvaluator(series_id_string_list)
# The argument is the range-node string shown earlier, wrapped in an AVERAGE(...) call.
formula = "AVERAGE((('Sheet1|col_1|1|1', 'Sheet1|col_2|1|2'), (0, 0)))"
result = evaluator.eval_formula(formula)
assert result == 1.5  # Placeholder for actual result comparison
