In [1]:
import os

from series_extraction.excel_loader import ExcelLoader
from series_extraction.excel_cleaner import ExcelCleaner
from series_extraction.table_finder import TableFinder
from series_extraction.series_extractor import SeriesExtractor
from series_extraction.excel_compatibility_checker import ExcelCompatibilityChecker
from series_extraction.series_iterator import SeriesIterator
from series_extraction.series_mapper import SeriesMapper
from series_extraction.excel_validator import ExcelValidator

from ast_transformation.series_formula_generator import SeriesFormulaGenerator
from ast_transformation.formula_generator import FormulaGenerator
# from ast_transformation.formula_evaluator import FormulaEvaluator

from ast_building.formula_parser import FormulaParser
from ast_building.series_implementer import SeriesImplementer

from pipeline_building.series_dependencies_builder import SeriesDependenciesBuilder
from pipeline_building.dag_sorter import DAGSorter

from excel_builder import ExcelBuilder

from excel_checker import ExcelChecker


In [2]:
# The data folder lives next to (not inside) the notebook's working directory.
current_directory = os.getcwd()
parent_directory = os.path.abspath(os.path.join(current_directory, os.pardir))
data_directory = os.path.join(parent_directory, 'data')

project_name = 'rounddown'


def _excel_stage_path(stage):
    """Return the workbook path for one processing stage of the current project.

    Every stage uses the same layout:
    ``<data_directory>/excel_files_<stage>/<project_name>_<stage>.xlsx``.
    """
    folder_name = f"excel_files_{stage}"
    file_name = f"{project_name}_{stage}.xlsx"
    return os.path.join(data_directory, folder_name, file_name)


# One workbook per pipeline stage, from the raw input to the Python-evaluated result.
excel_raw_file_path = _excel_stage_path("raw")
excel_reduced_filepath = _excel_stage_path("reduced")
excel_reduced_clean_filepath = _excel_stage_path("reduced_clean")
excel_reduced_clean_series_filepath = _excel_stage_path("reduced_clean_series")
excel_reduced_clean_series_python_filepath = _excel_stage_path("reduced_clean_series_python")

In [3]:
# --- Load and validate -------------------------------------------------------
excel_raw = ExcelLoader.load_file(excel_raw_file_path)
excel_reduced = ExcelLoader.load_file(excel_reduced_filepath)

is_valid = ExcelValidator.validate_excel(excel_reduced)
if not is_valid:
    raise Exception("Excel file is not valid")

# --- Clean the reduced workbook and persist it -------------------------------
excel_reduced_clean = ExcelCleaner.clean_excel(excel_reduced)
ExcelBuilder.create_excel_from_workbook(
    excel_reduced_clean.workbook_with_formulas,
    excel_reduced_clean_filepath,
)

# --- Locate tables and check them against the raw/reduced workbooks ----------
extracted_tables, data = TableFinder.find_tables(excel_reduced_clean)
is_compatible = ExcelCompatibilityChecker.check_file(
    excel_raw, excel_reduced, extracted_tables
)
if not is_compatible:
    # Show the tables that failed the check before aborting.
    print(extracted_tables)
    raise Exception("Excel file is not compatible")

# --- Extract series and split them by whether they carry formulas ------------
series_dict = SeriesExtractor.extract_series(extracted_tables=extracted_tables, data=data)
series_mapping = SeriesMapper.map_series(series_dict)
series_iterator = SeriesIterator.iterate_series(series_dict)

series_list = list(series_iterator)

# A series whose two formula slots are both None holds plain values only.
series_list_with_formulas = [
    candidate for candidate in series_list if candidate.formulas != [None, None]
]
series_list_with_values = [
    candidate for candidate in series_list if candidate.formulas == [None, None]
]

# --- Rewrite every formula series in terms of series references --------------
series_list_new = []
formula_1_ast_series_list = []
ast_generator_dict = {}

for series in series_list_with_formulas:
    formula_1, formula_2 = SeriesFormulaGenerator.adjust_formulas(series.formulas)
    if formula_1 is None or formula_2 is None:
        # Both adjusted formulas are required; otherwise the series is skipped.
        continue

    series_implementer = SeriesImplementer(
        series_mapping, sheet_name=series.worksheet.sheet_name
    )

    # First formula: parse, map onto series, and remember it for the dependency graph.
    formula_1_ast_series = series_implementer.update_ast(
        FormulaParser.parse_formula(formula_1)
    )
    formula_1_ast_series_list.append((series.series_id, formula_1_ast_series))

    # Second formula: parse and map onto series.
    formula_2_ast_series = series_implementer.update_ast(
        FormulaParser.parse_formula(formula_2)
    )

    SeriesFormulaGenerator.process_series_formulas(
        series,
        formula_1_ast_series,
        formula_2_ast_series,
        series_mapping,
        series_dict,
        series_list_new,
    )

    # Keep one generator per series so its formulas can be evaluated in Python later.
    ast_generator_dict[series.series_id] = FormulaGenerator.get_ast_generator(
        formula_1_ast_series,
        formula_2_ast_series,
        series_dict.get(series.worksheet.sheet_name),
    )

# --- Rebuild the workbook from series and compare with the cleaned one -------
series_list_updated = series_list_new + series_list_with_values

ExcelBuilder.create_excel_from_series(series_list_updated, excel_reduced_clean_series_filepath)
ExcelChecker.excels_are_equivalent(excel_reduced_clean_filepath, excel_reduced_clean_series_filepath)

True

In [4]:
# Build the dependencies between series from the (series_id, first-formula AST)
# pairs collected in formula_1_ast_series_list, then pass them to DAGSorter.
# NOTE(review): sorted_dag is not read by any later cell visible in this notebook.
series_dependencies = SeriesDependenciesBuilder.build_dependencies(formula_1_ast_series_list)
sorted_dag = DAGSorter.sort_dag(series_dependencies)

In [5]:
def build_excel_with_python_formulas(
    series_list_with_values,
    ast_generator_dict,
    evaluator,
    excel_reduced_clean_series_python_filepath,
    index_start=1,
    index_end=2,
):
    """Evaluate every formula series in Python and write the result to a workbook.

    For each ``series_id`` in ``ast_generator_dict`` the matching series is
    looked up through ``evaluator``, its values are recomputed from the
    generator, and the series' ``values`` attribute is overwritten in place.
    The recomputed series are written first, followed by
    ``series_list_with_values``.

    Args:
        series_list_with_values: Series without formulas; written unchanged.
        ast_generator_dict: Mapping of series id to the generator object that is
            handed to ``evaluator.calculate_series_values``.
        evaluator: Object providing ``get_series_from_id(series_id)`` and
            ``calculate_series_values(ast_generator, index_start, index_end)``.
        excel_reduced_clean_series_python_filepath: Destination path passed to
            ``ExcelBuilder.create_excel_from_series``.
        index_start: First formula index forwarded to the evaluator. Defaults to
            1, the value that used to be hard-coded.
        index_end: Last formula index forwarded to the evaluator. Defaults to 2,
            the value that used to be hard-coded.

    Raises:
        LookupError: If ``evaluator`` cannot resolve one of the series ids.
    """
    series_list_new_python = []
    for series_id, ast_generator in ast_generator_dict.items():
        series = evaluator.get_series_from_id(series_id)
        if series is None:
            # Fail with the offending id instead of an AttributeError on None below.
            raise LookupError(f"No series found for series id {series_id!r}")
        # NOTE: this mutates the series object owned by the evaluator's series_dict.
        series.values = evaluator.calculate_series_values(
            ast_generator, index_start, index_end
        )
        series_list_new_python.append(series)

    series_list_updated_python = series_list_new_python + series_list_with_values
    ExcelBuilder.create_excel_from_series(series_list_updated_python, excel_reduced_clean_series_python_filepath)
    

In [6]:
import xlcalculator
import pandas as pd
import ast
import numpy as np

from ast_transformation.formula_generator import SeriesIdLoader


class FormulaEvaluator:
    """Evaluate generated series formulas in Python instead of in Excel.

    A formula is rendered to a Python expression string such as
    ``AVERAGE((('Sheet1|col_1|1|1', 'Sheet1|col_2|1|2'), (0, 0)))``, where each
    range operand is a ``(identifiers, (start, end))`` tuple.  On construction
    the evaluator builds one DataFrame per series identifier referenced by the
    formula; the Excel-function methods (``AVERAGE``, ``ROUND``, ...) then read
    their operands from those frames and delegate to ``xlcalculator``.
    """

    def __init__(self, formula_ast, series_dict):
        """Collect the DataFrames for every series referenced by ``formula_ast``.

        Args:
            formula_ast: AST whose range nodes name the series to load. ``None``
                (or any node without ranges) yields an empty ``df_dict``.
            series_dict: Mapping of sheet name to the series of that sheet.
        """
        self.series_dict = series_dict

        range_nodes_string_list = FormulaEvaluator.extract_series_id_string_list(
            formula_ast
        )
        series_id_string_list = FormulaEvaluator.get_series_id_string_list(
            range_nodes_string_list
        )
        self.df_dict = {}
        for series_id_string in series_id_string_list:
            if series_id_string in self.df_dict:
                # The same series may be referenced several times in one formula;
                # its frame only needs to be built once.
                continue
            self.df_dict[series_id_string] = self.get_dataframe_from_series_id_string(
                series_id_string, self.series_dict
            )

    @staticmethod
    def extract_series_id_string_list(ast):
        """Return the ``tvalue`` of every range node in ``ast``, in traversal order.

        Function nodes are searched through their ``args``, operator nodes
        through ``left`` then ``right``, lists element by element, and any other
        object through its ``children`` attribute when it has one.
        """
        series_id_string_list = []

        def walk(node):
            if isinstance(node, xlcalculator.ast_nodes.RangeNode):
                series_id_string_list.append(node.tvalue)
            elif isinstance(node, xlcalculator.ast_nodes.FunctionNode):
                for arg in node.args:
                    walk(arg)
            elif isinstance(node, xlcalculator.ast_nodes.OperatorNode):
                # Operands may be missing on one side, so skip empty ones.
                if node.left:
                    walk(node.left)
                if node.right:
                    walk(node.right)
            elif isinstance(node, list):
                for item in node:
                    walk(item)
            elif hasattr(node, "children"):
                walk(node.children)

        walk(ast)
        return series_id_string_list

    @staticmethod
    def get_dataframe_from_series_id_string(series_id_string, series_dict):
        """Build a DataFrame holding the values of the series named by the string.

        Each matching series becomes one column, labelled with its series
        header; several matches are concatenated side by side.

        Raises:
            ValueError: If no series in the sheet matches ``series_id_string``.
        """
        series_id = SeriesIdLoader.load_series_id_from_string(series_id_string)
        sheet_name = series_id.sheet_name
        df_list = [
            pd.DataFrame(data=series.values, columns=[series.series_id.series_header])
            for series in series_dict[sheet_name]
            if series.series_id == series_id
        ]
        if not df_list:
            # pd.concat would raise a generic ValueError here; name the culprit instead.
            raise ValueError(
                f"No series matching {series_id_string!r} found in sheet {sheet_name!r}"
            )
        return pd.concat(df_list, axis=1)

    def fetch_df(self, identifier, index_range):
        """Return rows ``start`` to ``end`` (both inclusive, positional) of a series frame."""
        df = self.df_dict[identifier]
        start, end = index_range
        return df.iloc[start : end + 1]

    def _first_number(self, value):
        """Resolve a ``(identifiers, index_range)`` operand to a single float.

        Only the first identifier is used, and of the fetched rows only the
        first cell is read.
        """
        identifiers, index_range = value
        series_df = self.fetch_df(identifiers[0], index_range)
        return float(series_df.iloc[0, 0])

    def IF(self, *args):
        """Excel ``IF``; requires exactly condition, value-if-true, value-if-false."""
        if len(args) != 3:
            raise ValueError("IF function expects exactly three arguments.")
        return xlcalculator.xlfunctions.logical.IF(*args)

    def AND(self, *args):
        """Excel ``AND`` over the already evaluated arguments."""
        return xlcalculator.xlfunctions.logical.AND(*args)

    def ROUND(self, *args):
        """Excel ``ROUND`` of the first cell of a range operand.

        ``args`` is ``(range_operand, decimal_points)``.
        """
        value, decimal_points = args
        return xlcalculator.xlfunctions.math.ROUND(
            self._first_number(value), decimal_points
        )

    def ROUNDDOWN(self, *args):
        """Excel ``ROUNDDOWN`` of the first cell of a range operand.

        ``args`` is ``(range_operand, decimal_points)``.
        """
        value, decimal_points = args
        return xlcalculator.xlfunctions.math.ROUNDDOWN(
            self._first_number(value), decimal_points
        )

    def AVERAGE(self, args):
        """Excel ``AVERAGE`` over the numeric cells of all series in a range operand."""
        identifiers, index_range = args
        frames = [self.fetch_df(identifier, index_range) for identifier in identifiers]
        # Non-numeric columns are dropped before the cells are flattened.
        numbers = [
            item
            for frame in frames
            for item in frame.select_dtypes(include=[np.number]).values.flatten()
        ]
        return xlcalculator.xlfunctions.statistics.AVERAGE(numbers)

    def evaluate_formula(self, formula):
        """Evaluate ``str(formula)`` as a Python expression and return the result.

        Only the Excel functions implemented on this class are available to the
        expression.  The formula text is printed before evaluation.
        """
        formula = str(formula)
        print("formula")
        print(formula)
        tree = ast.parse(formula, mode="eval")
        local_env = {
            "AVERAGE": self.AVERAGE,
            "IF": self.IF,
            "AND": self.AND,
            "ROUND": self.ROUND,
            "ROUNDDOWN": self.ROUNDDOWN,
        }
        compiled = compile(tree, filename="<ast>", mode="eval")
        # SECURITY: eval executes the formula text. Builtins are removed, but this
        # is not a sandbox; only feed it formulas generated by this pipeline.
        result = eval(compiled, {"__builtins__": {}}, local_env)
        return result

    @staticmethod
    def get_series_id_string_list(range_nodes_string_list):
        """Flatten range-node strings into the series identifiers they contain.

        Each item is the literal text of an ``(identifiers, index_range)`` tuple;
        only the identifiers (element 0) are kept, duplicates included.
        """
        series_id_string_list = []
        for item in range_nodes_string_list:
            identifiers = ast.literal_eval(item)[0]
            series_id_string_list.extend(identifiers)
        return series_id_string_list

    def get_series_from_id(self, series_id):
        """Return the series with ``series_id`` from its sheet, or ``None`` if absent."""
        for series in self.series_dict[series_id.sheet_name]:
            if series.series_id == series_id:
                return series
        return None

    def calculate_series_values(self, ast_generator, index_start, index_end):
        """Evaluate the generator's formulas for indices ``index_start``..``index_end``.

        Both bounds are inclusive.  A fresh evaluator is created per formula
        because the frames to load depend on the ranges that formula references.
        """
        values = []
        for index in range(index_start, index_end + 1):
            formula = ast_generator.get_nth_formula(n=index)
            formula_evaluator = FormulaEvaluator(formula, self.series_dict)
            values.append(formula_evaluator.evaluate_formula(formula))
        return values


In [7]:
# With formula_ast=None no range nodes are found, so this instance starts with an
# empty df_dict; the cells below use it for series lookup and as the entry point
# to calculate_series_values, which builds a per-formula evaluator itself.
evaluator = FormulaEvaluator(formula_ast=None, series_dict=series_dict)


In [8]:
# Recompute every formula series in Python and write the workbook to
# excel_reduced_clean_series_python_filepath. Each evaluated formula is printed.
build_excel_with_python_formulas(series_list_with_values, ast_generator_dict, evaluator, excel_reduced_clean_series_python_filepath)

formula
AVERAGE((('Sheet1|col_1|1|1', 'Sheet1|col_2|1|2'), (0, 0)))
formula
AVERAGE((('Sheet1|col_1|1|1', 'Sheet1|col_2|1|2'), (1, 1)))
formula
IF((AVERAGE((('Sheet1|col_1|1|1', 'Sheet1|col_2|1|2'), (0, 0)))) > (2), "Y", "N")
formula
IF((AVERAGE((('Sheet1|col_1|1|1', 'Sheet1|col_2|1|2'), (1, 1)))) > (2), "Y", "N")
formula
ROUND((('Sheet1|average|1|3',), (0, 0)), 0)
formula
ROUND((('Sheet1|average|1|3',), (1, 1)), 0)
formula
AND((AVERAGE((('Sheet1|col_1|1|1',), (0, 0)))) > (1), (AVERAGE((('Sheet1|col_2|1|2',), (0, 0)))) > (1))
formula
AND((AVERAGE((('Sheet1|col_1|1|1',), (1, 1)))) > (1), (AVERAGE((('Sheet1|col_2|1|2',), (1, 1)))) > (1))
formula
ROUNDDOWN((('Sheet1|average|1|3',), (0, 0)), 0)
formula
ROUNDDOWN((('Sheet1|average|1|3',), (1, 1)), 0)


In [9]:
# Compare the cleaned reduced workbook with the one rebuilt from Python-evaluated
# values; the result of the check is displayed as the cell output.
ExcelChecker.excels_are_equivalent(excel_reduced_clean_filepath, excel_reduced_clean_series_python_filepath)

True