In [1]:
import os

from series_extraction.excel_loader import ExcelLoader
from series_extraction.excel_validator import ExcelValidator
from series_extraction.excel_cleaner import ExcelCleaner
from series_extraction.table_finder import TableFinder
from series_extraction.series_extractor import SeriesExtractor
from series_extraction.excel_compatibility_checker import ExcelCompatibilityChecker
from series_extraction.series_iterator import SeriesIterator
from series_extraction.series_mapper import SeriesMapper

from ast_building.formula_parser import FormulaParser
from ast_building.series_implementer import SeriesImplementer

from ast_transformation.formula_generator import FormulaGenerator
from ast_transformation.formula_checker import FormulaChecker
from ast_transformation.cell_range_implementer import CellRangeImplementer

from excel_builder import ExcelBuilder

In [2]:
# Stem shared by the raw and the reduced workbook of the project being processed.
project_name = 'test_excel_8'

# The data folder is a sibling of the directory the notebook is started from.
current_directory = os.getcwd()
parent_directory = os.path.abspath(os.path.join(current_directory, os.pardir))
data_directory = os.path.join(parent_directory, 'data')

# Full paths of the two input workbooks.
excel_raw_file_path = os.path.join(
    data_directory,
    "excel_files_raw",
    project_name + "_raw.xlsx",
)
excel_reduced_filepath = os.path.join(
    data_directory,
    "excel_files_reduced",
    project_name + "_reduced.xlsx",
)

In [3]:
# Load both workbooks: the raw one and its reduced counterpart.
# The reduced workbook is the one validated, cleaned and searched for tables below;
# the raw workbook is only used by the compatibility check.
excel_raw = ExcelLoader.load_file(excel_raw_file_path)
excel_reduced = ExcelLoader.load_file(excel_reduced_filepath)

In [4]:
# Validate the reduced workbook; the result gates the pipeline in the next cell.
is_valid = ExcelValidator.validate_excel(excel_reduced)

In [5]:
# Abort before cleaning and table detection when validation failed.
# ValueError is raised instead of a bare Exception so the failure is attributable
# to the input workbook; it is still caught by any `except Exception` handler.
if not is_valid:
    raise ValueError("Excel file is not valid")

In [6]:
# Clean the reduced workbook; the cleaned result is what table detection receives.
excel_reduced_clean = ExcelCleaner.clean_excel(excel_reduced)

In [7]:
# Detect tables in the cleaned workbook. find_tables returns a pair: the detected
# tables and a companion `data` object that the series extraction calls below also take.
extracted_tables, data = TableFinder.find_tables(excel_reduced_clean)

In [8]:
# Display the detected tables (a mapping from worksheet to its list of tables).
extracted_tables

{Worksheet(sheet_name='Sheet1', workbook_file_path=None, worksheet=None): [Table(name='Sheet1_1', range=CellRange(start_cell=Cell(column=1, row=1, coordinate='A1', sheet_name=None, value=None, value_type=None), end_cell=Cell(column=20, row=3, coordinate='T3', sheet_name=None, value=None, value_type=None)), header_location=<HeaderLocation.TOP: 'top'>, header_values=['age', 'gender', 'time_spent', 'platform', 'interests', 'location', 'demographics', 'profession', 'income', 'indebt', 'isHomeOwner', 'Owns_Car', 'Index_Match_Size', 'If_Weight_Or_Size', 'Age Category', 'Average Time Spent for Gender', 'Interest Count', 'Platform Popularity', 'Gender code', 'Sumifs'])],
 Worksheet(sheet_name='Sheet2', workbook_file_path=None, worksheet=None): [Table(name='Sheet2_1', range=CellRange(start_cell=Cell(column=1, row=1, coordinate='A1', sheet_name=None, value=None, value_type=None), end_cell=Cell(column=10, row=3, coordinate='J3', sheet_name=None, value=None, value_type=None)), header_location=<Hea

In [9]:
# Extract table details from the detected tables and their data.
# NOTE(review): `series_data` is not referenced by any later cell visible here — confirm it is still needed.
series_data = SeriesExtractor.extract_table_details(extracted_tables, data)

In [10]:
# Check the raw workbook against the reduced one using the detected tables;
# the result gates the pipeline in the next cell.
is_compatible = ExcelCompatibilityChecker.check_file(excel_raw, excel_reduced, extracted_tables)

In [11]:
# Abort before series extraction when the compatibility check failed.
# ValueError is raised instead of a bare Exception so the failure is attributable
# to the input workbooks; it is still caught by any `except Exception` handler.
if not is_compatible:
    raise ValueError("Excel file is not compatible")

In [12]:
# Extract the series from the detected tables; the result is used for the
# series mapping, the series iterator and the formula rewriting further down.
series_dict = SeriesExtractor.extract_series(extracted_tables=extracted_tables, data=data)

In [13]:
# Display the extracted series (keyed by sheet name, each value a list of series).
series_dict

{'Sheet1': [Series(series_id=SeriesId(sheet_name='Sheet1', series_header='age', series_header_cell_row=1, series_header_cell_column=1), worksheet=Worksheet(sheet_name='Sheet1', workbook_file_path=None, worksheet=None), series_header='age', formulas=[None, None], values=[56, 46], header_location=<HeaderLocation.TOP: 'top'>, series_starting_cell=Cell(column=1, row=2, coordinate='A2', sheet_name=None, value=None, value_type=None), series_length=2, series_data_type=<SeriesDataType.INT: 'int'>),
  Series(series_id=SeriesId(sheet_name='Sheet1', series_header='gender', series_header_cell_row=1, series_header_cell_column=2), worksheet=Worksheet(sheet_name='Sheet1', workbook_file_path=None, worksheet=None), series_header='gender', formulas=[None, None], values=['male', 'female'], header_location=<HeaderLocation.TOP: 'top'>, series_starting_cell=Cell(column=2, row=2, coordinate='B2', sheet_name=None, value=None, value_type=None), series_length=2, series_data_type=<SeriesDataType.STR: 'str'>),
  

In [14]:
# Build the series mapping that SeriesImplementer receives in the formula loop below.
series_mapping = SeriesMapper.map_series(series_dict)

In [15]:
# Display the mapping: per worksheet, each cell maps to an (integer, series) pair.
# NOTE(review): the integer looks like the cell's offset within its series — confirm.
series_mapping

{Worksheet(sheet_name='Sheet1', workbook_file_path=None, worksheet=None): {Cell(column=1, row=2, coordinate=None, sheet_name=None, value=None, value_type=None): (0,
   Series(series_id=SeriesId(sheet_name='Sheet1', series_header='age', series_header_cell_row=1, series_header_cell_column=1), worksheet=Worksheet(sheet_name='Sheet1', workbook_file_path=None, worksheet=None), series_header='age', formulas=[None, None], values=[56, 46], header_location=<HeaderLocation.TOP: 'top'>, series_starting_cell=Cell(column=1, row=2, coordinate='A2', sheet_name=None, value=None, value_type=None), series_length=2, series_data_type=<SeriesDataType.INT: 'int'>)),
  Cell(column=1, row=3, coordinate=None, sheet_name=None, value=None, value_type=None): (1,
   Series(series_id=SeriesId(sheet_name='Sheet1', series_header='age', series_header_cell_row=1, series_header_cell_column=1), worksheet=Worksheet(sheet_name='Sheet1', workbook_file_path=None, worksheet=None), series_header='age', formulas=[None, None], v

In [16]:
# Create an iterator over the extracted series; it is consumed into a list in the next cell.
series_iterator = SeriesIterator.iterate_series(series_dict)

In [17]:
# Materialise the iterator once so the series can be partitioned twice below.
series_list = list(series_iterator)

# Series that carry at least one formula.
# The test inspects every entry instead of comparing against the literal
# [None, None], so it no longer depends on a series having exactly two cells.
series_list_with_formulas = [
    series
    for series in series_list
    if any(formula is not None for formula in series.formulas)
]

# Series whose cells are all plain values (no formula in any position).
series_list_with_values = [
    series
    for series in series_list
    if all(formula is None for formula in series.formulas)
]

In [18]:
# Display the series that contain formulas, before their formulas are rewritten.
series_list_with_formulas

[Series(series_id=SeriesId(sheet_name='Sheet1', series_header='Index_Match_Size', series_header_cell_row=1, series_header_cell_column=13), worksheet=Worksheet(sheet_name='Sheet1', workbook_file_path=None, worksheet=None), series_header='Index_Match_Size', formulas=['=INDEX(B:B,MATCH(A2,A:A,0))', '=INDEX(B:B,MATCH(A3,A:A,0))'], values=['male', 'female'], header_location=<HeaderLocation.TOP: 'top'>, series_starting_cell=Cell(column=13, row=2, coordinate='M2', sheet_name=None, value=None, value_type=None), series_length=2, series_data_type=<SeriesDataType.STR: 'str'>),
 Series(series_id=SeriesId(sheet_name='Sheet1', series_header='If_Weight_Or_Size', series_header_cell_row=1, series_header_cell_column=14), worksheet=Worksheet(sheet_name='Sheet1', workbook_file_path=None, worksheet=None), series_header='If_Weight_Or_Size', formulas=['=IF(OR(B2>0,C2>0),"Acceptable","Check")', '=IF(OR(B3>0,C3>0),"Acceptable","Check")'], values=['Acceptable', 'Acceptable'], header_location=<HeaderLocation.TOP

In [19]:
# Display the series that contain only values (no formulas).
series_list_with_values

[Series(series_id=SeriesId(sheet_name='Sheet1', series_header='age', series_header_cell_row=1, series_header_cell_column=1), worksheet=Worksheet(sheet_name='Sheet1', workbook_file_path=None, worksheet=None), series_header='age', formulas=[None, None], values=[56, 46], header_location=<HeaderLocation.TOP: 'top'>, series_starting_cell=Cell(column=1, row=2, coordinate='A2', sheet_name=None, value=None, value_type=None), series_length=2, series_data_type=<SeriesDataType.INT: 'int'>),
 Series(series_id=SeriesId(sheet_name='Sheet1', series_header='gender', series_header_cell_row=1, series_header_cell_column=2), worksheet=Worksheet(sheet_name='Sheet1', workbook_file_path=None, worksheet=None), series_header='gender', formulas=[None, None], values=['male', 'female'], header_location=<HeaderLocation.TOP: 'top'>, series_starting_cell=Cell(column=2, row=2, coordinate='B2', sheet_name=None, value=None, value_type=None), series_length=2, series_data_type=<SeriesDataType.STR: 'str'>),
 Series(series

In [20]:
# NOTE(review): this repeats the earlier display of `series_list_with_formulas`;
# no cell in between modifies the list, so the output is the same.
series_list_with_formulas

[Series(series_id=SeriesId(sheet_name='Sheet1', series_header='Index_Match_Size', series_header_cell_row=1, series_header_cell_column=13), worksheet=Worksheet(sheet_name='Sheet1', workbook_file_path=None, worksheet=None), series_header='Index_Match_Size', formulas=['=INDEX(B:B,MATCH(A2,A:A,0))', '=INDEX(B:B,MATCH(A3,A:A,0))'], values=['male', 'female'], header_location=<HeaderLocation.TOP: 'top'>, series_starting_cell=Cell(column=13, row=2, coordinate='M2', sheet_name=None, value=None, value_type=None), series_length=2, series_data_type=<SeriesDataType.STR: 'str'>),
 Series(series_id=SeriesId(sheet_name='Sheet1', series_header='If_Weight_Or_Size', series_header_cell_row=1, series_header_cell_column=14), worksheet=Worksheet(sheet_name='Sheet1', workbook_file_path=None, worksheet=None), series_header='If_Weight_Or_Size', formulas=['=IF(OR(B2>0,C2>0),"Acceptable","Check")', '=IF(OR(B3>0,C3>0),"Acceptable","Check")'], values=['Acceptable', 'Acceptable'], header_location=<HeaderLocation.TOP

In [21]:
# Rewrite the two sample formulas of every formula series and collect the
# updated series in `series_list_new`.
#
# NOTE: `series.formulas` is overwritten in place on the shared series objects,
# so re-running this cell without re-running the extraction cells feeds the
# already rewritten formulas back into the parser.
series_list_new = []

for series in series_list_with_formulas:
    sheet_name = series.worksheet.sheet_name

    # Parse each sample formula and let SeriesImplementer update its AST using
    # the series mapping of the sheet the series lives on.
    formula_1 = series.formulas[0]
    formula_1_ast = FormulaParser.parse_formula(formula_1)
    series_implementer = SeriesImplementer(series_mapping, sheet_name=sheet_name)
    formula_1_ast_series = series_implementer.update_ast(formula_1_ast)

    formula_2 = series.formulas[1]
    formula_2_ast = FormulaParser.parse_formula(formula_2)
    formula_2_ast_series = series_implementer.update_ast(formula_2_ast)

    # Use a loop-local name so the notebook-level `series_list` is not rebound.
    sheet_series = series_dict.get(sheet_name)

    ast_generator = FormulaGenerator.get_ast_generator(formula_1_ast_series, formula_2_ast_series, sheet_series)

    formulas_are_correct, formula_1_ast_new, formula_2_ast_new = FormulaChecker.check_formulas(ast_generator)

    # Fail before touching the series: an incorrect result must not be
    # post-processed, written back to `series.formulas`, or collected.
    if not formulas_are_correct:
        raise ValueError("Formulas are not correct")

    cell_range_implementer = CellRangeImplementer(series_dict)

    formula_1_ast_new_cell_ranges = cell_range_implementer.update_ast(formula_1_ast_new)
    formula_2_ast_new_cell_ranges = cell_range_implementer.update_ast(formula_2_ast_new)

    # The ASTs are rendered through their string form; the leading "=" turns
    # them back into formula text.
    series.formulas = [f"={formula_1_ast_new_cell_ranges}", f"={formula_2_ast_new_cell_ranges}"]
    series_list_new.append(series)

In [22]:
# Combine the rewritten formula series with the untouched value-only series.
# NOTE: this rebinds `series_list_new` to a longer list, so running the cell a
# second time appends the value-only series again.
series_list_new = [*series_list_new, *series_list_with_values]

In [23]:
# Display the combined list: rewritten formula series first, then the value-only series.
series_list_new

[Series(series_id=SeriesId(sheet_name='Sheet1', series_header='Index_Match_Size', series_header_cell_row=1, series_header_cell_column=13), worksheet=Worksheet(sheet_name='Sheet1', workbook_file_path=None, worksheet=None), series_header='Index_Match_Size', formulas=['=INDEX(Sheet1!B:B, MATCH(Sheet1!A2, Sheet1!A:A, 0))', '=INDEX(Sheet1!B:B, MATCH(Sheet1!A3, Sheet1!A:A, 0))'], values=['male', 'female'], header_location=<HeaderLocation.TOP: 'top'>, series_starting_cell=Cell(column=13, row=2, coordinate='M2', sheet_name=None, value=None, value_type=None), series_length=2, series_data_type=<SeriesDataType.STR: 'str'>),
 Series(series_id=SeriesId(sheet_name='Sheet1', series_header='If_Weight_Or_Size', series_header_cell_row=1, series_header_cell_column=14), worksheet=Worksheet(sheet_name='Sheet1', workbook_file_path=None, worksheet=None), series_header='If_Weight_Or_Size', formulas=['=IF(OR((Sheet1!B2) > (0), (Sheet1!C2) > (0)), "Acceptable", "Check")', '=IF(OR((Sheet1!B3) > (0), (Sheet1!C3) 

In [24]:
# Build the output workbook from the combined series list.
# The output path is relative, so it is resolved by ExcelBuilder / the current
# working directory rather than the data folder configured above.
output_file_path = "test.xlsx"

excel_builder = ExcelBuilder(series_list_new, output_file_path)
excel_builder.create_excel_from_series()