In [1]:
import os

from series_extraction.excel_loader import ExcelLoader
from series_extraction.excel_cleaner import ExcelCleaner
from series_extraction.table_extractor import TableExtractor
from series_extraction.series_extractor import SeriesExtractor
from series_extraction.excel_compatibility_checker import ExcelCompatibilityChecker
from series_extraction.series_iterator import SeriesIterator
from series_extraction.series_mapper import SeriesMapper
from series_extraction.excel_validator import ExcelValidator

from ast_transformation.series_formula_generator_old import SeriesFormulaGenerator
from ast_transformation.formula_generator import FormulaGenerator

from ast_building.formula_parser import FormulaParser

from pipeline_building.series_dependencies_builder import SeriesDependenciesBuilder
from pipeline_building.dag_sorter import DAGSorter
from pipeline_building.pipeline_builder import PipelineBuilder

from excel_builder import ExcelBuilder

from excel_checker import ExcelChecker

from excel_data_extractor import ExcelDataExtractor


In [2]:
# Root folder holding every processing stage of the Excel files.
# Override it per machine by setting the EXCEL_2_PYTHON_DATA_DIR environment
# variable; when unset, the original author's local checkout path is used so
# existing behaviour is unchanged.
data_directory = os.environ.get(
    "EXCEL_2_PYTHON_DATA_DIR",
    "/Users/chrislittle/GitHub/speedsheet/excel-2-python/data",
)

project_name = 'test_excel_2'


def _stage_filepath(stage):
    """Return the workbook path for one processing stage.

    Every stage follows the same layout:
    ``<data_directory>/excel_files_<stage>/<project_name>_<stage>.xlsx``.

    Args:
        stage: Stage suffix such as ``"raw"`` or ``"reduced_clean"``.

    Returns:
        The joined path as a string.
    """
    return os.path.join(
        data_directory, f"excel_files_{stage}", f"{project_name}_{stage}.xlsx"
    )


# Workbooks read by the notebook.
excel_raw_file_path = _stage_filepath("raw")
excel_reduced_filepath = _stage_filepath("reduced")

# Workbooks written by the notebook.
excel_reduced_clean_filepath = _stage_filepath("reduced_clean")
excel_raw_clean_filepath = _stage_filepath("raw_clean")
excel_reduced_clean_series_filepath = _stage_filepath("reduced_clean_series")
excel_reduced_clean_series_python_filepath = _stage_filepath("reduced_clean_series_python")
excel_raw_clean_series_python_filepath = _stage_filepath("raw_clean_series_python")

In [3]:
# Load the raw and the reduced workbook from the paths configured in the
# previous cell.
excel_raw= ExcelLoader.load_file(excel_raw_file_path)
excel_reduced = ExcelLoader.load_file(excel_reduced_filepath)
# Only the reduced workbook is passed to the validator; stop immediately if
# it is rejected.
is_valid = ExcelValidator.validate_excel(excel_reduced)
if not is_valid:
    raise Exception("Excel file is not valid")

# Clean both workbooks, then hand each cleaned `workbook_with_formulas` to
# ExcelBuilder together with its "*_clean" target path (presumably this writes
# the cleaned workbook to that path - confirm against ExcelBuilder).
excel_reduced_clean = ExcelCleaner.clean_excel(excel_reduced)
excel_raw_clean = ExcelCleaner.clean_excel(excel_raw)
ExcelBuilder.create_excel_from_workbook(excel_reduced_clean.workbook_with_formulas, excel_reduced_clean_filepath)
ExcelBuilder.create_excel_from_workbook(excel_raw_clean.workbook_with_formulas,  excel_raw_clean_filepath)

# Tables come from the *cleaned* reduced workbook.
# NOTE(review): the compatibility check below receives the *uncleaned*
# `excel_raw` / `excel_reduced` objects alongside those tables - confirm that
# mixing cleaned tables with uncleaned workbooks is intended.
extracted_tables, workbook_data = TableExtractor.extract_tables(excel_reduced_clean)
is_compatible = ExcelCompatibilityChecker.check_file(excel_raw, excel_reduced, extracted_tables)
if not is_compatible:
    raise Exception("Excel file is not compatible")

# Build the per-series structures from the tables extracted above.
series_dict = SeriesExtractor.extract_series(
    extracted_tables=extracted_tables,
    workbook_data=workbook_data,
)
series_mapping = SeriesMapper.map_series(series_dict)
series_iterator = SeriesIterator.iterate_series(series_dict)

# Materialise the iterator once so the series can be traversed repeatedly.
series_list = list(series_iterator)

# Partition on the `formulas` attribute: a value equal to [None, None] puts the
# series in the "values" list, anything else in the "formulas" list.
series_list_with_formulas = [
    candidate
    for candidate in series_list
    if candidate.formulas != [None, None]
]
series_list_with_values = [
    candidate
    for candidate in series_list
    if candidate.formulas == [None, None]
]

# Maps each formula-bearing series' `series_id` to the generic formula
# produced by FormulaGenerator for that series.
# NOTE(review): the recorded output shows these values are dicts such as
# {0: (0, 1, 0, 1)}, and the later PipelineBuilder cell records a FormulaError
# for '={0: (0, 1, 0, 1)}' - it looks like the consumer expects a formula
# string rather than this dict; confirm which generator it should be fed from.
generic_formula_dictionary = {}

for series in series_list_with_formulas:
    formula_1, formula_2 = SeriesFormulaGenerator.adjust_formulas(series.formulas)

    # Nothing to generalise unless both adjusted formulas are present.
    if formula_1 is None or formula_2 is None:
        continue

    print(f"formula_1: {formula_1}")
    print(f"formula_2: {formula_2}")

    # Only element 0 of each parse result is forwarded to the generator (the
    # recorded display of `formula_1_ast` shows a (token list, AstBuilder) pair).
    formula_1_ast = FormulaParser.parse_formula(formula_1)
    formula_2_ast = FormulaParser.parse_formula(formula_2)
    generic_formula = FormulaGenerator.generate_generic_formula(
        formula_1_ast[0],
        formula_2_ast[0],
    )

    print(f"generic_formula: {generic_formula}")

    generic_formula_dictionary[series.series_id] = generic_formula

formula_1: =A2
formula_2: =A3
generic_formula: {0: (0, 1, 0, 1)}
formula_1: =-SUMIF(parking!A:A,C2,parking!C:C)
formula_2: =-SUMIF(parking!A:A,C3,parking!C:C)
generic_formula: {3: (0, 0, 0, 0), 5: (0, 1, 0, 1), 7: (0, 0, 0, 0)}
formula_1: =A2
formula_2: =A3
generic_formula: {0: (0, 1, 0, 1)}
formula_1: =H2-(H2/1.2)
formula_2: =H3-(H3/1.2)
generic_formula: {0: (0, 1, 0, 1), 3: (0, 1, 0, 1)}
formula_1: =-SUMIF(cleaning!A:A,F2,cleaning!F:F)
formula_2: =-SUMIF(cleaning!A:A,F3,cleaning!F:F)
generic_formula: {3: (0, 0, 0, 0), 5: (0, 1, 0, 1), 7: (0, 0, 0, 0)}
formula_1: =CONCATENATE(summary!A2," Parking Recharge")
formula_2: =CONCATENATE(summary!A3," Parking Recharge")
generic_formula: {2: (0, 1, 0, 1)}
formula_1: =-ROUND(summary!D2*1.2,2)
formula_2: =-ROUND(summary!D3*1.2,2)
generic_formula: {3: (0, 1, 0, 1)}
formula_1: =ROUND(D2/6,2)
formula_2: =ROUND(D3/6,2)
generic_formula: {2: (0, 1, 0, 1)}
formula_1: =D2>0
formula_2: =D3>0
generic_formula: {0: (0, 1, 0, 1)}
formula_1: =B2/2
formula_2: 

In [4]:
# Display the series_id -> generic formula mapping filled in by the loop in
# the previous cell.
generic_formula_dictionary

{SeriesId(sheet_name='summary', series_header='Vehicle Reg', series_header_cell_row=1, series_header_cell_column=3): {0: (0,
   1,
   0,
   1)},
 SeriesId(sheet_name='summary', series_header='TGB Cost (No VAT) ', series_header_cell_row=1, series_header_cell_column=4): {3: (0,
   0,
   0,
   0),
  5: (0, 1, 0, 1),
  7: (0, 0, 0, 0)},
 SeriesId(sheet_name='summary', series_header='Vehicle Reg', series_header_cell_row=1, series_header_cell_column=6): {0: (0,
   1,
   0,
   1)},
 SeriesId(sheet_name='summary', series_header='VAT', series_header_cell_row=1, series_header_cell_column=7): {0: (0,
   1,
   0,
   1),
  3: (0, 1, 0, 1)},
 SeriesId(sheet_name='summary', series_header='TGB Cost (Inc VAT) ', series_header_cell_row=1, series_header_cell_column=8): {3: (0,
   0,
   0,
   0),
  5: (0, 1, 0, 1),
  7: (0, 0, 0, 0)},
 SeriesId(sheet_name='xero', series_header='Description', series_header_cell_row=1, series_header_cell_column=2): {2: (0,
   1,
   0,
   1)},
 SeriesId(sheet_name='xero', se

In [5]:
# Debug peek: `formula_1_ast` is a leftover loop variable, so this shows the
# parse result of the last series that reached the parsing step of the
# generic-formula loop. It is undefined if that loop never parsed anything.
formula_1_ast

([E2 <Range>, * <Operator>, B2 <Range>],
 <formulas.builder.AstBuilder at 0x15891dd90>)

In [6]:
# Key every series' values by the string form of its series id. Note the keys
# are str(series_id), unlike generic_formula_dictionary, which is keyed by the
# series_id objects themselves.
series_values_dict = {
    str(entry.series_id): entry.values
    for entry in series_list
}

In [7]:
# series_values_dict_raw = ExcelDataExtractor.extract_series_data_from_excel(excel_raw.workbook_with_values, series_list)

In [8]:
# Build the series dependency structure from the generic formulas; the result
# is what DAGSorter.sort_dag consumes in the next cell.
series_dependencies = SeriesDependenciesBuilder.build_dependencies(generic_formula_dictionary)

In [9]:
# Order the dependency structure for pipeline construction
# (presumably a topological sort - confirm against DAGSorter).
sorted_dag = DAGSorter.sort_dag(series_dependencies)

In [10]:
# series_list_updated_raw = PipelineBuilder.create_series_list(sorted_dag, generic_formula_dictionary, series_dict, series_values_dict_raw, series_list_with_values)

In [11]:
# Rebuild the series for the reduced workbook from the sorted DAG, the generic
# formulas and the extracted values.
# NOTE(review): the recorded output of this cell is
# FormulaError('Not a valid formula', '={0: (0, 1, 0, 1)}'), i.e. an entry of
# generic_formula_dictionary appears to be used as formula text. Resolve that
# mismatch before relying on the cells below.
series_list_updated_reduced = PipelineBuilder.create_series_list(
    sorted_dag,
    generic_formula_dictionary,
    series_dict,
    series_values_dict,
    series_list_with_values,
)

FormulaError: ('Not a valid formula:\n%s', '={0: (0, 1, 0, 1)}')

In [None]:
# Inspect the series list returned by PipelineBuilder.create_series_list.
series_list_updated_reduced

[Series(series_id=SeriesId(sheet_name='VehicleData', series_header='vrm', series_header_cell_row=1, series_header_cell_column=1), worksheet=Worksheet(sheet_name='VehicleData', workbook_file_path=None, worksheet=None), series_header='vrm', formulas=[None, None], values=['YY70KTT', 'YY15UUW'], series_starting_cell=Cell(column=1, row=2, coordinate='A2', sheet_name=None, value=None, value_type=None, formula=None), series_length=2, series_data_type=<SeriesDataType.STR: 'str'>),
 Series(series_id=SeriesId(sheet_name='VehicleData', series_header='make', series_header_cell_row=1, series_header_cell_column=2), worksheet=Worksheet(sheet_name='VehicleData', workbook_file_path=None, worksheet=None), series_header='make', formulas=[None, None], values=['Kia', 'Vauxhall'], series_starting_cell=Cell(column=2, row=2, coordinate='B2', sheet_name=None, value=None, value_type=None, formula=None), series_length=2, series_data_type=<SeriesDataType.STR: 'str'>),
 Series(series_id=SeriesId(sheet_name='Vehicl

In [None]:
# Hand the rebuilt series to ExcelBuilder with the "reduced_clean_series_python"
# target path, requesting values only.
ExcelBuilder.create_excel_from_series(
    series_list_updated_reduced,
    excel_reduced_clean_series_python_filepath,
    values_only=True,
)

In [None]:
# ExcelBuilder.create_excel_from_series(series_list_updated_raw, excel_raw_clean_series_python_filepath, values_only=True)

In [None]:
# Compare the workbook regenerated from the series against the cleaned reduced
# workbook; the call's result is the cell output.
ExcelChecker.excels_are_equivalent(
    excel_reduced_clean_series_python_filepath,
    excel_reduced_clean_filepath,
)

True

In [None]:
# ExcelChecker.excels_are_equivalent(excel_raw_clean_series_python_filepath, excel_raw_clean_filepath)

True