In [1]:
import os

from series_extraction.excel_loader import ExcelLoader
from series_extraction.excel_cleaner import ExcelCleaner
from series_extraction.table_extractor import TableExtractor
from series_extraction.series_extractor import SeriesExtractor
from series_extraction.excel_compatibility_checker import ExcelCompatibilityChecker
from series_extraction.series_iterator import SeriesIterator
from series_extraction.series_mapper import SeriesMapper
from series_extraction.excel_validator import ExcelValidator

from ast_transformation.series_formula_generator_old import SeriesFormulaGenerator
from ast_transformation.formula_generator import FormulaGenerator

from ast_building.formula_parser import FormulaParser
from ast_building.series_implementer import SeriesImplementer

from pipeline_building.series_dependencies_builder import SeriesDependenciesBuilder
from pipeline_building.dag_sorter import DAGSorter

from excel_builder import ExcelBuilder

from excel_checker import ExcelChecker


In [2]:
# Root of the data tree; every pipeline stage has its own sub-folder below it.
# The location can be overridden with the EXCEL_2_PYTHON_DATA_DIR environment
# variable so the notebook is not tied to a single machine. When the variable
# is unset (or empty) the original checkout path is used unchanged.
data_directory = (
    os.environ.get("EXCEL_2_PYTHON_DATA_DIR")
    or "/Users/chrislittle/GitHub/speedsheet/excel-2-python/data"
)

# File-name stem shared by all stage workbooks of this run.
project_name = 'test_excel_1'

# One workbook path per stage:
#   <data_directory>/excel_files_<stage>/<project_name>_<stage>.xlsx
excel_raw_file_path = os.path.join(
    data_directory, "excel_files_raw", f"{project_name}_raw.xlsx"
)
excel_reduced_filepath = os.path.join(
    data_directory, "excel_files_reduced", f"{project_name}_reduced.xlsx"
)
excel_reduced_clean_filepath = os.path.join(
    data_directory, "excel_files_reduced_clean", f"{project_name}_reduced_clean.xlsx"
)
excel_reduced_clean_series_filepath = os.path.join(
    data_directory,
    "excel_files_reduced_clean_series",
    f"{project_name}_reduced_clean_series.xlsx",
)
excel_reduced_clean_series_python_filepath = os.path.join(
    data_directory,
    "excel_files_reduced_clean_series_python",
    f"{project_name}_reduced_clean_series_python.xlsx",
)

In [3]:
# --- 1. Load the raw and reduced workbooks, then validate the reduced one ----
excel_raw = ExcelLoader.load_file(excel_raw_file_path)
excel_reduced = ExcelLoader.load_file(excel_reduced_filepath)

is_valid = ExcelValidator.validate_excel(excel_reduced)
if not is_valid:
    raise Exception("Excel file is not valid")

# --- 2. Clean the reduced workbook -------------------------------------------
excel_reduced_clean = ExcelCleaner.clean_excel(excel_reduced)
# The cleaned workbook (formula variant) is handed to ExcelBuilder together
# with the reduced-clean target path.
ExcelBuilder.create_excel_from_workbook(
    excel_reduced_clean.workbook_with_formulas,
    excel_reduced_clean_filepath,
)

# --- 3. Extract tables and check them against the raw/reduced workbooks ------
extracted_tables, workbook_data = TableExtractor.extract_tables(excel_reduced_clean)
is_compatible = ExcelCompatibilityChecker.check_file(
    excel_raw, excel_reduced, extracted_tables
)
if not is_compatible:
    # Show what was extracted before aborting, to help diagnose the mismatch.
    print(extracted_tables)
    raise Exception("Excel file is not compatible")

# --- 4. Turn tables into series ----------------------------------------------
series_dict = SeriesExtractor.extract_series(
    extracted_tables=extracted_tables,
    workbook_data=workbook_data,
)
series_mapping = SeriesMapper.map_series(series_dict)
series_iterator = SeriesIterator.iterate_series(series_dict)

# Materialise the iterator once so it can be filtered twice below.
series_list = list(series_iterator)

# A series whose two sample formulas are both None carries plain values.
series_list_with_formulas = [
    item for item in series_list if item.formulas != [None, None]
]
series_list_with_values = [
    item for item in series_list if item.formulas == [None, None]
]

# --- 5. Build one generic formula AST per formula-backed series --------------
# NOTE(review): nothing in this cell ever appends to `series_list_new`, so
# `series_list_updated` ends up equal to `series_list_with_values` - confirm
# whether formula series were meant to be added here.
series_list_new = []
formula_1_ast_series_list = []
generic_formula_dictionary = {}

for series in series_list_with_formulas:
    formula_1, formula_2 = SeriesFormulaGenerator.adjust_formulas(
        series.formulas
    )
    # Both adjusted formulas are required to derive a generic formula.
    if formula_1 is None or formula_2 is None:
        continue

    series_implementer = SeriesImplementer(
        series_mapping, sheet_name=series.worksheet.sheet_name
    )

    # Parse the first formula and rewrite its AST via the series implementer.
    formula_1_ast = FormulaParser.parse_formula(formula_1)
    formula_1_ast_series = series_implementer.update_ast(formula_1_ast)
    formula_1_ast_series_list.append((series.series_id, formula_1_ast_series))

    # Same treatment for the second formula.
    formula_2_ast = FormulaParser.parse_formula(formula_2)
    formula_2_ast_series = series_implementer.update_ast(formula_2_ast)

    # Combine both series-level ASTs into the generic formula for this series.
    generic_formula_ast = FormulaGenerator.traverse_and_replace(
        formula_1_ast_series, formula_2_ast_series
    )
    generic_formula_dictionary[series.series_id] = generic_formula_ast

series_list_updated = series_list_new + series_list_with_values

<class 'str'>
(('Sheet1|col_1|13|7',), (0, 0), (1, 1))
<class 'str'>
(('Sheet1|col_2|13|8',), (0, 0), (1, 1))
<class 'str'>
(('Sheet1|col_1|13|7', 'Sheet1|col_2|13|8', 'Sheet1|col_3|13|9'), (0, 0), (1, 1))
<class 'str'>
(('Sheet1|horizontal_column_1|12|2',), (0, 0), (1, 1))
<class 'str'>
(('Sheet1|col_1|2|2',), (0, 0), (1, 1))
<class 'str'>
(('Sheet1|col_2|2|3',), (0, 0), (1, 1))
<class 'str'>
(('Sheet1|col_1|2|2', 'Sheet1|col_2|2|3', 'Sheet1|col_3|2|4'), (0, 0), (1, 1))
<class 'str'>
(('Sheet2|horizontal_col_1|2|1',), (0, 0), (1, 1))
<class 'str'>
(('Sheet2|horizontal_col_2|3|1',), (0, 0), (1, 1))


In [4]:
# Build the dependency mapping from the generic formula ASTs. Judging by the
# displayed result, each formula series' SeriesId maps to a list of the
# SeriesId values its formula refers to.
series_dependencies = SeriesDependenciesBuilder.build_dependencies(generic_formula_dictionary)

In [5]:
# Inspect the dependency mapping (SeriesId -> list of SeriesId).
series_dependencies

{SeriesId(sheet_name='Sheet1', series_header='col_3', series_header_cell_row=13, series_header_cell_column=9): [SeriesId(sheet_name='Sheet1', series_header='col_1', series_header_cell_row=13, series_header_cell_column=7),
  SeriesId(sheet_name='Sheet1', series_header='col_2', series_header_cell_row=13, series_header_cell_column=8)],
 SeriesId(sheet_name='Sheet1', series_header='col_4', series_header_cell_row=13, series_header_cell_column=10): [SeriesId(sheet_name='Sheet1', series_header='col_1', series_header_cell_row=13, series_header_cell_column=7),
  SeriesId(sheet_name='Sheet1', series_header='col_3', series_header_cell_row=13, series_header_cell_column=9),
  SeriesId(sheet_name='Sheet1', series_header='col_2', series_header_cell_row=13, series_header_cell_column=8)],
 SeriesId(sheet_name='Sheet1', series_header='horizontal_column_2', series_header_cell_row=13, series_header_cell_column=2): [SeriesId(sheet_name='Sheet1', series_header='horizontal_column_1', series_header_cell_row=1

In [6]:
# Inspect the extracted series, grouped by sheet name (full repr; can be long).
series_dict

{'Sheet1': [Series(series_id=SeriesId(sheet_name='Sheet1', series_header='col_1', series_header_cell_row=13, series_header_cell_column=7), worksheet=Worksheet(sheet_name='Sheet1', workbook_file_path=None, worksheet=None), series_header='col_1', formulas=[None, None], values=[1, 3], header_location=<HeaderLocation.TOP: 'top'>, series_starting_cell=Cell(column=7, row=14, coordinate='G14', sheet_name=None, value=None, value_type=None, formula=None), series_length=2, series_data_type=<SeriesDataType.INT: 'int'>),
  Series(series_id=SeriesId(sheet_name='Sheet1', series_header='col_2', series_header_cell_row=13, series_header_cell_column=8), worksheet=Worksheet(sheet_name='Sheet1', workbook_file_path=None, worksheet=None), series_header='col_2', formulas=[None, None], values=[2, 4], header_location=<HeaderLocation.TOP: 'top'>, series_starting_cell=Cell(column=8, row=14, coordinate='H14', sheet_name=None, value=None, value_type=None, formula=None), series_length=2, series_data_type=<SeriesDat

In [7]:
# Inspect the generic formula AST root node stored for each formula series.
generic_formula_dictionary

{SeriesId(sheet_name='Sheet1', series_header='col_3', series_header_cell_row=13, series_header_cell_column=9): <OperatorNode tvalue: '+', ttype: operator-infix, tsubtype: math>,
 SeriesId(sheet_name='Sheet1', series_header='col_4', series_header_cell_row=13, series_header_cell_column=10): <FunctionNode tvalue: 'SUM', ttype: function, tsubtype: >,
 SeriesId(sheet_name='Sheet1', series_header='horizontal_column_2', series_header_cell_row=13, series_header_cell_column=2): <OperatorNode tvalue: '*', ttype: operator-infix, tsubtype: math>,
 SeriesId(sheet_name='Sheet1', series_header='col_3', series_header_cell_row=2, series_header_cell_column=4): <OperatorNode tvalue: '+', ttype: operator-infix, tsubtype: math>,
 SeriesId(sheet_name='Sheet1', series_header='col_4', series_header_cell_row=2, series_header_cell_column=5): <FunctionNode tvalue: 'SUM', ttype: function, tsubtype: >,
 SeriesId(sheet_name='Sheet2', series_header='horizontal_col_3', series_header_cell_row=4, series_header_cell_col

In [8]:
import pickle

def pickle_object(obj, obj_name):
    """Serialise ``obj`` with pickle into the file ``<obj_name>.pkl``.

    Args:
        obj: Any picklable object.
        obj_name: Path stem of the output file; ``.pkl`` is appended. A bare
            name is resolved relative to the current working directory.

    Returns:
        None. An existing file of the same name is overwritten.
    """
    target_path = f"{obj_name}.pkl"
    with open(target_path, 'wb') as sink:
        pickle.dump(obj, sink)

In [9]:
# Persist the two pipeline results; the bare names produce
# "generic_formula_dictionary.pkl" and "series_dict.pkl" relative to the
# current working directory.
pickle_object(generic_formula_dictionary, "generic_formula_dictionary")
pickle_object(series_dict, "series_dict")

In [10]:
# Take the first generic formula AST in insertion order as a worked example
# (in the displayed dictionary this is the '+' formula of Sheet1 col_3).
formula_ast = list(generic_formula_dictionary.values())[0]

In [11]:
# Show the string form of the example AST.
str(formula_ast)

"((('Sheet1|col_1|13|7',), (0, 0), (1, 1))) + ((('Sheet1|col_2|13|8',), (0, 0), (1, 1)))"

In [12]:
from ast_transformation.formula_list_generator import FormulaListGenerator
from ast_transformation.formula_evaluator import FormulaEvaluator
import pandas as pd

# Hand-written sample values, keyed by
# "<sheet>|<series header>|<header row>|<header column>" strings.
# NOTE(review): this rebinds `series_dict`, which earlier in the notebook held
# the SeriesExtractor output (sheet name -> list of Series). Re-running an
# earlier cell that reads `series_dict` after this one will see this mapping
# instead - consider a distinct name once all later uses are known.
series_dict = {
    # Series whose headers sit on row 2.
    "Sheet1|col_1|2|2": [1, 2, 3, 4, 5],
    "Sheet1|col_2|2|3": [6, 7, 8, 9, 10],
    "Sheet1|col_3|2|4": [11, 12, 13, 14, 15],
    # Series whose headers sit on row 13.
    "Sheet1|col_1|13|7": [1, 2, 3, 4, 5],
    "Sheet1|col_2|13|8": [6, 7, 8, 9, 10],
    "Sheet1|col_3|13|9": [11, 12, 13, 14, 15],
    # Series with its header on row 12.
    "Sheet1|horizontal_column_1|12|2": [20, 21, 22, 23, 24],
}

# Expand the example AST against the sample values.
formula_list_generator = FormulaListGenerator(formula_ast, series_dict)

# Presumably 0 and 4 bound the element indices to generate formulas for -
# TODO confirm the argument meaning and whether the upper bound is inclusive.
formula_list = formula_list_generator.generate_formula_list(0, 4)

In [14]:
# Show the second generated formula; the displayed operands (2 and 7) match
# index 1 of the row-13 col_1 and col_2 sample lists.
str(formula_list[1])

'(ARRAY(ARRAYROW(2))) + (ARRAY(ARRAYROW(7)))'