In [1]:
import json
import os
from pathlib import Path
import shutil
import subprocess

from output_parsing.APE_to_notebook import parse_ape_solutions, solution_to_notebook

In [2]:
# Path to the APE executable jar. It is relative on purpose: the driver loop
# launches java with cwd=USE_CASE_PATH, so it is resolved from that directory.
APE_PATH = Path('..', '..', 'APE', 'APE-2.0.3-executable.jar')

# Absolute path of the use-case folder; generated config/constraint files are
# written here and APE is executed from here.
USE_CASE_PATH = Path('.').resolve().joinpath('usecases', 'house_prices')

# File name (inside USE_CASE_PATH) of the generated APE run configuration.
config = 'config_run.json'

# Forwarded to the notebook-generation helpers and embedded in the names of the
# produced workflow notebooks.
input_step = 0

In [3]:
def _ape_input(label, data_class="IntColumn", relevance="IndependentVariable"):
    """Return one entry of the APE ``inputs`` list.

    Every entry carries the same three single-element lists, in this key
    order: ``DataClass``, ``StatisticalRelevance`` and ``APE_label``.
    """
    return {
        "DataClass": [data_class],
        "StatisticalRelevance": [relevance],
        "APE_label": [label],
    }


# Labels of the inputs declared as IntColumn / IndependentVariable, in the
# order in which they appear in the generated configuration.
_INT_INDEPENDENT_LABELS = (
    "OverallQual",
    "YearBuilt",
    "TotalBsmtSF",
    "1stFlrSF",
    "FullBath",
    "GrLivArea",
    "TotRmsAbvGrd",
    "GarageCars",
    "GarageArea",
)

# Base APE configuration. The driver loop derives a per-iteration variant from
# it (extra inputs and minimum solution length) and dumps that as JSON.
CONFIG = {
    "ontology_path": "../../ontology/ontology_v2_DIM_2.owl",
    "ontologyPrefixIRI": "http://www.co-ode.org/ontologies/ont.owl#",
    "toolsTaxonomyRoot": "ToolsTaxonomy",
    "dataDimensionsTaxonomyRoots": ["DataClass", "StatisticalRelevance"],
    "tool_annotations_path": "../../ontology/tool_annotations_v2_DIM_2.json",
    "constraints_path": "constraints_run.json",
    "solutions_dir_path": "./solutions/",
    "solution_length": {"min": 2, "max": 10},
    "solutions": "5",
    "timeout_sec": "1000",
    "number_of_execution_scripts": "0",
    "number_of_generated_graphs": "5",
    "tool_seq_repeat": "true",
    "debug_mode": "false",
    "use_workflow_input": "ONE",
    "use_all_generated_data": "NONE",
    "inputs": [
        *[_ape_input(column) for column in _INT_INDEPENDENT_LABELS],
        _ape_input("SalePrice", relevance="DependentVariable"),
        _ape_input("SaleCondition", data_class="StrColumn"),
        _ape_input(
            "housing_train",
            data_class="MixedDataFrame",
            relevance="NoRelevance",
        ),
    ],
    "outputs": [],
}

In [4]:
# Description of the concrete data source behind the 'housing_train' APE input;
# handed to solution_to_notebook as ``input_mapping`` when notebooks are built.
# The source path is made absolute from the current working directory.
INPUT_MAPPING = [
    dict(
        label='housing_train',
        source=os.path.abspath(
            os.path.join('usecases', 'house_prices', 'train.csv')
        ),
        type='csv',
        DataClass='MixedDataFrame',
        StatisticalRelevance='NoRelevance',
    ),
]

In [6]:
def _tool(name):
    """Parameter dict selecting a node of the tools taxonomy."""
    return {"ToolsTaxonomy": [name]}


def _relevance(value):
    """Parameter dict describing data by statistical relevance only."""
    return {"StatisticalRelevance": [value]}


def _typed(data_class, relevance):
    """Parameter dict describing data by data class and statistical relevance."""
    return {"DataClass": [data_class], "StatisticalRelevance": [relevance]}


def _labelled(label):
    """Parameter dict describing data by its APE label."""
    return {"APE_label": [label]}


def _use_m(tool):
    """Constraint with id ``use_m`` taking a single tool parameter."""
    return {"constraintid": "use_m", "parameters": [_tool(tool)]}


def _operation_input(tool, data):
    """Constraint with id ``operation_input``: a tool followed by a data description."""
    return {"constraintid": "operation_input", "parameters": [_tool(tool), data]}


def _connected_op(first, second):
    """Constraint with id ``connected_op`` taking two tool parameters, in order."""
    return {"constraintid": "connected_op", "parameters": [_tool(first), _tool(second)]}


def _int_constant(label):
    """Extra workflow input: an ``Int`` with ``BasicObjectRelevance`` and the given label."""
    return {
        "DataClass": ["Int"],
        "StatisticalRelevance": ["BasicObjectRelevance"],
        "APE_label": [label],
    }


# One tuple per iteration of the driver loop:
#   (constraints, minimum solution length, extra inputs appended to the config)
constraint_min_input = [
    # Set 1 Univariate Dependent Variable Distribution
    (
        [
            _use_m("describe"),
            _use_m("Distribution"),
            _operation_input("Distribution", _relevance("DependentVariable")),
            _use_m("skew"),
            _operation_input("skew", _relevance("DependentVariable")),
            _use_m("kurt"),
            _operation_input("kurt", _relevance("DependentVariable")),
        ],
        4,
        [],
    ),
    # Set 2 Multivariate Dependent Variable Distribution
    (
        [
            _use_m("scatterplot"),
            _operation_input("scatterplot", _relevance("DependentVariable")),
            _operation_input("scatterplot", _relevance("IndependentVariable")),
            _use_m("Distribution"),
            _operation_input("Distribution", _relevance("DependentVariable")),
            _operation_input("Distribution", _typed("StrColumn", "IndependentVariable")),
            _use_m("set_figure_size"),
            _operation_input("set_figure_size", _labelled("9")),
            _operation_input("set_figure_size", _labelled("16")),
            _connected_op("set_figure_size", "Distribution"),
            _use_m("rotate_x_labels"),
            _connected_op("Distribution", "rotate_x_labels"),
            _use_m("k_most_corr_indep_var_corr_matrix"),
            _operation_input(
                "k_most_corr_indep_var_corr_matrix",
                _relevance("DependentVariable"),
            ),
            _operation_input("k_most_corr_indep_var_corr_matrix", _labelled("10")),
            _use_m("heatmap"),
            _connected_op("k_most_corr_indep_var_corr_matrix", "heatmap"),
            _use_m("pairplot"),
            _operation_input("pairplot", _relevance("DependentVariable")),
            _operation_input("pairplot", _labelled("10")),
        ],
        7,
        # Integer constants referenced by label in the constraints above.
        [_int_constant("9"), _int_constant("10"), _int_constant("16")],
    ),
    # Set 3 Cleaning
    (
        [
            _use_m("na_count_percentage"),
            _use_m("dropna_col_i"),
            _use_m("filter_sd"),
            _use_m("drop_sd_i"),
        ],
        4,
        [],
    ),
    # Set 4 Statistical Assumptions
    (
        [
            _use_m("log"),
            _use_m("normality_plots"),
            _operation_input("normality_plots", _typed("Column", "DependentVariable")),
            _operation_input("normality_plots", _typed("Column", "IndependentVariable")),
            _connected_op("log", "normality_plots"),
        ],
        3,
        [],
    ),
]

In [7]:
# Driver: for every selected constraint set, write the APE config/constraints,
# run APE, archive its outputs under out/iteration_<n>/ and convert each found
# solution into a notebook.

# 1-based positions in constraint_min_input that are actually solved; every
# other set is only announced and then skipped.
iterations_to_run = (2,)

for iter_ix, cst_conf in enumerate(constraint_min_input, start=1):
    print('Iteration', iter_ix)
    if iter_ix not in iterations_to_run:
        continue

    constraint_set, min_len, inputs = cst_conf

    # create config and constraints
    with open(USE_CASE_PATH / 'constraints_run.json', 'w', encoding='utf-8') as file_:
        json.dump({"constraints": constraint_set}, file_, indent=4)

    # Derive the per-iteration config WITHOUT touching CONFIG. A shallow
    # CONFIG.copy() shares the nested 'inputs' list and 'solution_length' dict,
    # so extending/assigning through the copy would permanently alter CONFIG
    # (extra inputs piling up on every re-run or additional iteration).
    # Rebuilding just those two entries keeps CONFIG pristine and preserves
    # the key order of the dumped JSON.
    config_local = {
        **CONFIG,
        'inputs': CONFIG['inputs'] + inputs,
        'solution_length': {**CONFIG['solution_length'], 'min': min_len},
    }
    with open(USE_CASE_PATH / config, 'w', encoding='utf-8') as file_:
        json.dump(config_local, file_, indent=4)

    # run APE from the use-case folder; APE_PATH and the paths inside the
    # config are relative to that working directory. subprocess.run waits for
    # completion and cleans up the child if the wait is interrupted.
    proc = subprocess.run(
        ['java', '-Xmx8g', '-jar', str(APE_PATH), config],
        cwd=str(USE_CASE_PATH),
        check=False,  # the return code is inspected explicitly below
    )

    # check error code
    if proc.returncode != 0:
        print('APE failed', iter_ix)
        continue
    print('APE finished', iter_ix)

    # output; parents=True so a missing 'out' directory is created as well
    folder_path = USE_CASE_PATH / 'out' / f'iteration_{iter_ix}'
    folder_path.mkdir(parents=True, exist_ok=True)

    # copy input files into the output folder
    shutil.copy(
        str(USE_CASE_PATH / config),
        str(folder_path / config),
    )
    shutil.copy(
        str(USE_CASE_PATH / 'constraints_run.json'),
        str(folder_path / 'constraints_run.json'),
    )

    # copy solutions
    shutil.copy(
        str(USE_CASE_PATH / 'solutions' / 'solutions.txt'),
        str(folder_path / 'solutions.txt'),
    )

    shutil.copytree(
        str(USE_CASE_PATH / 'solutions' / 'Figures'),
        str(folder_path / 'Figures'),
        dirs_exist_ok=True,
    )

    # produce notebooks from the archived solutions file
    workflows_list = parse_ape_solutions(
        folder_path / 'solutions.txt',
        input_step=input_step,
    )

    # File names are numbered from 1, while solution_num is passed 0-based.
    for wk_ix, workflow in enumerate(workflows_list, start=1):
        notebook = solution_to_notebook(
            workflow,
            input_mapping=INPUT_MAPPING,
            solution_num=wk_ix-1,
            input_step=input_step,
        )
        with open(
            folder_path / f'workflow_{wk_ix}_start_{input_step}.ipynb',
            'w',
            encoding='utf-8',
        ) as out_:
            json.dump(notebook, out_, indent=4)
    print('Notebooks produced', iter_ix)

Iteration 1
Iteration 2

-------------------------------------------------------------
	Workflow discovery - length 7
-------------------------------------------------------------
Total problem setup time: 59.979 sec (11822479 clauses).
Found 5 solutions. Solving time: 8.463 sec.


APE found 5 solutions.
Total APE runtime: 		76.874 sec.
Total encoding time: 		59.979 sec.
Total SAT solving time: 	8.463 sec.


-------------------------------------------------------------
	Generating graphical representation
	of the first 5 workflows
-------------------------------------------------------------

Loading.....

Graphical files have been generated. Running time: 2.174 sec.
CWL annotations file not configured. No executable CWL files are generated.
APE finished 2
Notebooks produced 2
Iteration 3
Iteration 4
