In [1]:
import json
import os
from pathlib import Path
import shutil
import subprocess

from output_parsing.APE_to_notebook import parse_ape_solutions, solution_to_notebook

In [2]:
# Location of the APE executable jar, relative to the notebook's working directory.
APE_PATH = Path('..', '..', 'APE', 'APE-2.0.3-executable.jar')
# Absolute folder of the titanic use case; APE is launched with this as its cwd.
USE_CASE_PATH = Path('.').resolve().joinpath('usecases', 'titanic')
# File name (inside USE_CASE_PATH) of the per-iteration APE configuration.
config = 'config_run.json'
# Step offset handed to the notebook generator; the run loop reassigns it.
input_step = 0

In [3]:
# Base APE run configuration. The run loop derives a per-iteration variant from
# it (extra inputs, minimum solution length) and serialises that to JSON.
# Paths are given relative to the use-case folder in which APE is started.
CONFIG = {
    "ontology_path": "../../ontology/ontology_v2_DIM_2.owl",
    "ontologyPrefixIRI": "http://www.co-ode.org/ontologies/ont.owl#",
    "toolsTaxonomyRoot": "ToolsTaxonomy",
    "dataDimensionsTaxonomyRoots": ["DataClass", "StatisticalRelevance"],
    "tool_annotations_path": "../../ontology/tool_annotations_v2_DIM_2.json",
    "constraints_path": "constraints_run.json",
    "solutions_dir_path": "./solutions/",
    "solution_length": {"min": 2, "max": 10},
    "solutions": "5",
    "timeout_sec": "1000",
    "number_of_execution_scripts": "0",
    "number_of_generated_graphs": "5",
    "tool_seq_repeat": "true",
    "debug_mode": "false",
    "use_workflow_input": "ONE",
    "use_all_generated_data": "NONE",
    # One typed input per data item: (DataClass, StatisticalRelevance, APE_label).
    # Commented-out rows are columns deliberately left out of the run.
    "inputs": [
        {
            "DataClass": [data_class],
            "StatisticalRelevance": [relevance],
            "APE_label": [label],
        }
        for data_class, relevance, label in (
            # ("IntColumn", "IndependentVariable", "PassengerId"),
            ("IntColumn", "DependentVariable", "Survived"),
            ("IntColumn", "IndependentVariable", "Pclass"),
            ("StrColumn", "IndependentVariable", "Name"),
            ("StrColumn", "IndependentVariable", "Sex"),
            ("FloatColumn", "IndependentVariable", "Age"),
            ("IntColumn", "IndependentVariable", "SibSp"),
            ("IntColumn", "IndependentVariable", "Parch"),
            # ("StrColumn", "IndependentVariable", "Ticket"),
            ("FloatColumn", "IndependentVariable", "Fare"),
            # ("StrColumn", "IndependentVariable", "Cabin"),
            ("StrColumn", "IndependentVariable", "Embarked"),
            # The whole training table as a single data frame input.
            ("MixedDataFrame", "NoRelevance", "titanic_train"),
        )
    ],
    "outputs": [],
}

In [4]:
# Describes where the data behind the 'titanic_train' APE input lives on disk.
# The list is handed to solution_to_notebook as `input_mapping`.
INPUT_MAPPING = [
    dict(
        label='titanic_train',
        # Absolute path, resolved against the current working directory.
        source=os.path.abspath(os.path.join('usecases', 'titanic', 'train.csv')),
        type='csv',
        DataClass='MixedDataFrame',
        StatisticalRelevance='NoRelevance',
    ),
]

In [5]:
# One entry per APE run, each a tuple of
#   (constraints, minimum solution length, extra workflow inputs).
# The run loop writes `constraints` to constraints_run.json, uses the length as
# solution_length.min and appends the extra inputs to the configured inputs.
# Regex labels are raw strings so that `\.` is not parsed as a (invalid)
# Python escape sequence; the resulting string values are unchanged.
constraint_min_input = [
    # Set 1 Feature Engineering
    (
        [
            { # extract_i
                'constraintid': 'operation_input',
                'parameters': [
                    {'ToolsTaxonomy': ['extract_i']},
                    {'APE_label': ['Name']},
                ]
            }, {
                'constraintid': 'operation_input',
                'parameters': [
                    {'ToolsTaxonomy': ['extract_i']},
                    {'APE_label': [r' ([A-Za-z]+)\.']},
                ]
            }, { # replace_re_i
                'constraintid': 'operation_input',
                'parameters': [
                    {'ToolsTaxonomy': ['replace_re_i']},
                    {'APE_label': ['Name']},
                ]
            }, {
                'constraintid': 'operation_input',
                'parameters': [
                    {'ToolsTaxonomy': ['replace_re_i']},
                    {'APE_label': ['Lady|Countess|Capt|Col|Don|Dr|Major|Rev|Sir|Jonkheer|Dona']},
                ]
            }, {
                'constraintid': 'operation_input',
                'parameters': [
                    {'ToolsTaxonomy': ['replace_re_i']},
                    {'APE_label': ['Rare']},
                ]
            }, { # bin_nominal_i
                'constraintid': 'operation_input',
                'parameters': [
                    {'ToolsTaxonomy': ['bin_nominal_i']},
                    {'APE_label': ['Age']},
                ]
            }, {
                'constraintid': 'operation_input',
                'parameters': [
                    {'ToolsTaxonomy': ['bin_nominal_i']},
                    {'APE_label': ['4']},
                ]
            }, { # bin_nominal_q_i
                'constraintid': 'operation_input',
                'parameters': [
                    {'ToolsTaxonomy': ['bin_nominal_q_i']},
                    {'APE_label': ['Fare']},
                ]
            }, {
                'constraintid': 'operation_input',
                'parameters': [
                    {'ToolsTaxonomy': ['bin_nominal_q_i']},
                    {'APE_label': ['4']},
                ]
            }, { # impute_median_i
                'constraintid': 'operation_input',
                'parameters': [
                    {'ToolsTaxonomy': ['impute_median_i']},
                    {'APE_label': ['Age']},
                ]
            }, { # impute_mode_i
                'constraintid': 'operation_input',
                'parameters': [
                    {'ToolsTaxonomy': ['impute_mode_i']},
                    {'APE_label': ['Embarked']},
                ]
            }, { # one_hot_encode_i
                'constraintid': 'operation_input',
                'parameters': [
                    {'ToolsTaxonomy': ['one_hot_encode_i']},
                    {'APE_label': ['Name']},
                ]
            },
            # Disabled: one-hot encoding of the remaining string columns.
            # { # one_hot_encode_i
            #     'constraintid': 'operation_input',
            #     'parameters': [
            #         {'ToolsTaxonomy': ['one_hot_encode_i']},
            #         {'APE_label': ['Sex']},
            #     ]
            # }, { # one_hot_encode_i
            #     'constraintid': 'operation_input',
            #     'parameters': [
            #         {'ToolsTaxonomy': ['one_hot_encode_i']},
            #         {'APE_label': ['Embarked']},
            #     ]
            # },
            { # iter_2 fix order
                'constraintid': 'itn_m',
                'parameters': [
                    {'ToolsTaxonomy': ['Encoding']},
                    {'ToolsTaxonomy': ['EDAFeatureEngineering']},
                ]
            }, {
                'constraintid': 'depend_m',
                'parameters': [
                    {'ToolsTaxonomy': ['replace_re_i']},
                    {'ToolsTaxonomy': ['extract_i']},
                ]
            }
        ],
        # 9, # memout
        7,
        # Literal arguments (regex, replacement, bin count) exposed to APE as inputs.
        [
            {
                "DataClass": ["Str"],
                "StatisticalRelevance": ["BasicObjectRelevance"],
                "APE_label": [r" ([A-Za-z]+)\."]
            }, {
                "DataClass": ["Str"],
                "StatisticalRelevance": ["BasicObjectRelevance"],
                "APE_label": ["Lady|Countess|Capt|Col|Don|Dr|Major|Rev|Sir|Jonkheer|Dona"]
            }, {
                "DataClass": ["Str"],
                "StatisticalRelevance": ["BasicObjectRelevance"],
                "APE_label": ["Rare"]
            },
            # {
            #     "DataClass": ["Int"],
            #     "StatisticalRelevance": ["BasicObjectRelevance"],
            #     "APE_label": ["20"]
            # },
            {
                "DataClass": ["Int"],
                "StatisticalRelevance": ["BasicObjectRelevance"],
                "APE_label": ["4"]
            },
        ],
    ),
    # Set 2 Modeling
    (
        [
            { # col_split + train_test_split
                'constraintid': 'connected_op',
                'parameters': [
                    {'ToolsTaxonomy': ['column_split']},
                    {'ToolsTaxonomy': ['train_test_split']},
                ]
            }, { # fit, required since state is not tracked (no predict without fit)
                'constraintid': 'connected_op',
                'parameters': [
                    {'ToolsTaxonomy': ['train_test_split']},
                    {'ToolsTaxonomy': ['fit_estimator']}
                ]
            }, { # use a classifier -> init
                'constraintid': 'operation_input',
                'parameters': [
                    {'ToolsTaxonomy': ['fit_estimator']},
                    {'DataClass': ['Classifier']}
                ]
            }, { # predict
                'constraintid': 'connected_op',
                'parameters': [
                    {'ToolsTaxonomy': ['train_test_split']},
                    {'ToolsTaxonomy': ['predict']},
                ]
            }, { # classification_report, hopefully matches (X_test, y_test)
                'constraintid': 'operation_input',
                'parameters': [
                    {'ToolsTaxonomy': ['classification_report']},
                    {'StatisticalRelevance': ['Prediction']},
                ]
            }, {
                'constraintid': 'connected_op',
                'parameters': [
                    {'ToolsTaxonomy': ['train_test_split']},
                    {'ToolsTaxonomy': ['classification_report']},
                ]
            },
        ],
        6,
        [],
    ),
    # Set 3 Ensembling
    (
        [
            { # init_sklearn_voting_estimator -> ComplexEnsemble
                'constraintid': 'operation_input',
                'parameters': [
                    {'ToolsTaxonomy': ['init_sklearn_voting_estimator']},
                    {'APE_label': ['RandomForestClassifier,LinearSVClassifier,LogisticRegressionClassifier']},
                ]
            }, { # col_split + train_test_split
                'constraintid': 'connected_op',
                'parameters': [
                    {'ToolsTaxonomy': ['column_split']},
                    {'ToolsTaxonomy': ['train_test_split']},
                ]
            }, { # fit, required since state is not tracked (no predict without fit)
                'constraintid': 'connected_op',
                'parameters': [
                    {'ToolsTaxonomy': ['train_test_split']},
                    {'ToolsTaxonomy': ['fit_estimator']}
                ]
            }, { # use a classifier -> VotingClassifier
                'constraintid': 'operation_input',
                'parameters': [
                    {'ToolsTaxonomy': ['fit_estimator']},
                    {'DataClass': ['Classifier']}
                ]
            }, { # predict
                'constraintid': 'connected_op',
                'parameters': [
                    {'ToolsTaxonomy': ['train_test_split']},
                    {'ToolsTaxonomy': ['predict']},
                ]
            }, { # classification_report, hopefully matches (X_test, y_test)
                'constraintid': 'operation_input',
                'parameters': [
                    {'ToolsTaxonomy': ['classification_report']},
                    {'StatisticalRelevance': ['Prediction']},
                ]
            }, {
                'constraintid': 'connected_op',
                'parameters': [
                    {'ToolsTaxonomy': ['train_test_split']},
                    {'ToolsTaxonomy': ['classification_report']},
                ]
            },
        ],
        6,
        [
            {
                "DataClass": ["Str"],
                "StatisticalRelevance": ["BasicObjectRelevance"],
                "APE_label": ['RandomForestClassifier,LinearSVClassifier,LogisticRegressionClassifier']
            }
        ],
    )
]

- Embedding / scaling steps introduce data leakage because they are applied before the train/test split.
    - APE offers no direct way to reference a group of operations so that it can be applied again (i.e. as a pipeline step).

In [6]:
# Run APE once per constraint set and turn every solution into a notebook.
#
# For each (constraints, min length, extra inputs) entry the loop:
#   1. writes constraints_run.json and the per-iteration config into USE_CASE_PATH,
#   2. runs the APE jar with USE_CASE_PATH as working directory,
#   3. archives config, constraints, solutions.txt and Figures under
#      out/iteration_<n>,
#   4. converts each parsed workflow to workflow_<k>_start_<input_step>.ipynb.
# `input_step` is advanced by the step count of the last workflow of the
# previous successful iteration.
last_len = -1
for iter_ix, cst_conf in enumerate(constraint_min_input, start=1):
    if last_len >= 0:
        input_step += last_len
        print(f"input_step: {input_step}")
    else:
        # input_step = 0
        # NOTE(review): 7 equals the minimum length of constraint set 1, which
        # is skipped below; presumably those steps already exist. Confirm.
        input_step = 7

    print('Iteration', iter_ix)

    # The first constraint set is not (re-)run here.
    if iter_ix == 1:
        continue

    constraint_set, min_len, inputs = cst_conf

    # create config and constraints
    with open(USE_CASE_PATH / 'constraints_run.json', 'w', encoding='utf-8') as file_:
        json.dump({"constraints": constraint_set}, file_, indent=4)

    # Build a fresh config with fresh nested containers. A shallow
    # CONFIG.copy() followed by `+=` / item assignment would mutate the shared
    # CONFIG, so inputs would pile up across iterations and cell re-runs.
    config_local = {
        **CONFIG,
        'inputs': [*CONFIG['inputs'], *inputs],
        'solution_length': {**CONFIG['solution_length'], 'min': min_len},
    }
    with open(USE_CASE_PATH / config, 'w', encoding='utf-8') as file_:
        json.dump(config_local, file_, indent=4)

    # run APE (blocks until the JVM exits; its output goes to the cell output)
    proc = subprocess.run(
        ['java', '-Xmx8g', '-jar', str(APE_PATH), config],
        cwd=str(USE_CASE_PATH),
        check=False,
    )

    # check error code
    if proc.returncode != 0:
        print('APE failed', iter_ix)
        continue
    print('APE finished', iter_ix)

    # output
    folder_path = USE_CASE_PATH / 'out' / f'iteration_{iter_ix}'
    folder_path.mkdir(exist_ok=True, parents=True)

    # copy input files into the output folder
    for file_name in (config, 'constraints_run.json'):
        shutil.copy(
            str(USE_CASE_PATH / file_name),
            str(folder_path / file_name),
        )

    # copy solutions
    shutil.copy(
        str(USE_CASE_PATH / 'solutions' / 'solutions.txt'),
        str(folder_path / 'solutions.txt'),
    )

    shutil.copytree(
        str(USE_CASE_PATH / 'solutions' / 'Figures'),
        str(folder_path / 'Figures'),
        dirs_exist_ok=True,
    )

    # produce notebooks
    workflows_list = parse_ape_solutions(
        folder_path / 'solutions.txt',
        input_step=input_step,
    )

    # Without any workflow there is no step count to carry forward; indexing
    # workflows_list[-1] would raise IndexError.
    if not workflows_list:
        print('No workflows parsed', iter_ix)
        continue

    for wk_ix, workflow in enumerate(workflows_list, start=1):
        notebook = solution_to_notebook(
            workflow,
            input_mapping=INPUT_MAPPING,
            solution_num=wk_ix-1,
            input_step=input_step,
        )
        with open(
            folder_path / f'workflow_{wk_ix}_start_{input_step}.ipynb',
            'w',
            encoding='utf-8',
        ) as out_:
            json.dump(notebook, out_, indent=4)
    # The next iteration starts after the steps of the last workflow found here.
    last_len = len(workflows_list[-1]['steps'])
    print('Notebooks produced', iter_ix)

Iteration 1
Iteration 2

-------------------------------------------------------------
	Workflow discovery - length 6
-------------------------------------------------------------
Total problem setup time: 38.859 sec (6837143 clauses).
Found 5 solutions. Solving time: 11.44 sec.


APE found 5 solutions.
Total APE runtime: 		54.83 sec.
Total encoding time: 		38.859 sec.
Total SAT solving time: 	11.44 sec.


-------------------------------------------------------------
	Generating graphical representation
	of the first 5 workflows
-------------------------------------------------------------

Loading.....

Graphical files have been generated. Running time: 0.976 sec.
CWL annotations file not configured. No executable CWL files are generated.
APE finished 2
Notebooks produced 2
input_step: 13
Iteration 3

-------------------------------------------------------------
	Workflow discovery - length 6
-------------------------------------------------------------
Total problem setup time: 38.55