In [2]:
import json
import os
from pathlib import Path
import shutil
import subprocess

from output_parsing.APE_to_notebook import parse_ape_solutions, solution_to_notebook

In [3]:
# Relative path to the APE executable jar. It is handed to `java -jar` with
# USE_CASE_PATH as the working directory, so it is resolved against that folder.
APE_PATH = Path('..', '..', 'APE', 'APE-2.0.3-executable.jar')

# Absolute folder of the use case; generated config/constraint files and the
# APE outputs are read from and written to this folder.
USE_CASE_PATH = Path('.').resolve().joinpath('usecases', 'imbd')

# `config`: file name of the generated APE run configuration.
# `input_step`: forwarded to the solution parser / notebook generator
# (exact semantics live in output_parsing.APE_to_notebook — TODO confirm).
config, input_step = 'config_run.json', 0

In [4]:
def _ape_input(data_class, relevance, label):
    """Build one entry of the APE `inputs` list.

    Each entry carries a single value per data dimension (`DataClass`,
    `StatisticalRelevance`) plus the `APE_label` used to refer to it.
    """
    return {
        "DataClass": [data_class],
        "StatisticalRelevance": [relevance],
        "APE_label": [label],
    }


# Base APE run configuration. It is serialised to JSON for every run, so the
# key order below is the order in which the keys appear in the written file.
CONFIG = dict(
    ontology_path="../../ontology/ontology_v2_DIM_2.owl",
    ontologyPrefixIRI="http://www.co-ode.org/ontologies/ont.owl#",
    toolsTaxonomyRoot="ToolsTaxonomy",
    dataDimensionsTaxonomyRoots=["DataClass", "StatisticalRelevance"],
    tool_annotations_path="../../ontology/tool_annotations_v2_DIM_2.json",
    constraints_path="constraints_run.json",
    solutions_dir_path="./solutions/",
    solution_length=dict(min=2, max=10),
    # Numeric and boolean options are deliberately kept as strings.
    solutions="5",
    timeout_sec="1000",
    number_of_execution_scripts="0",
    number_of_generated_graphs="5",
    tool_seq_repeat="true",
    debug_mode="false",
    use_workflow_input="ONE",
    use_all_generated_data="NONE",
    inputs=[
        _ape_input("StrColumn", "DependentVariable", "sentiment"),
        _ape_input("StrColumn", "IndependentVariable", "review"),
        _ape_input("MixedDataFrame", "NoRelevance", "imbd_train"),
    ],
    outputs=[],
)

In [5]:
# Absolute path of the training CSV.
# ! The "fixed" version is used to speed up the demo: spelling correction takes too long.
_imbd_train_csv = os.path.abspath(
    os.path.join('usecases', 'imbd', 'imbd_train_fixed.csv')
)

# Maps the APE input label to the concrete file handed to the notebook generator.
INPUT_MAPPING = [
    dict(
        label='imbd_train',
        source=_imbd_train_csv,
        type='csv',
        DataClass='MixedDataFrame',
        StatisticalRelevance='NoRelevance',
    ),
]

In [8]:
def _tool(name):
    """Constraint parameter that refers to a tool by its taxonomy name."""
    return {'ToolsTaxonomy': [name]}


def _constraint(constraint_id, *parameters):
    """Assemble a single constraint dict from its id and ordered parameters."""
    return {'constraintid': constraint_id, 'parameters': list(parameters)}


def _tool_pair(constraint_id, first, second):
    """Constraint relating two tools (e.g. `next_m`, `connected_op`, `depend_m`)."""
    return _constraint(constraint_id, _tool(first), _tool(second))


def _needs_input(tool, dimension, value):
    """`operation_input` constraint: `tool` paired with a `{dimension: [value]}` parameter."""
    return _constraint('operation_input', _tool(tool), {dimension: [value]})


def _str_constant(label):
    """Extra workflow input of class `Str` identified by `label`."""
    return {
        "DataClass": ["Str"],
        "StatisticalRelevance": ["BasicObjectRelevance"],
        "APE_label": [label],
    }


# Set 1: Preprocessing (+ EDA)
_preprocessing_constraints = [
    # get_text_from_html_i
    _needs_input('get_text_from_html_i', 'APE_label', 'review'),
    # expand_abbr_i
    _tool_pair('next_m', 'get_text_from_html_i', 'expand_abbr_i'),
    _needs_input('expand_abbr_i', 'APE_label', 'abrev.json'),
    _needs_input('expand_abbr_i', 'APE_label', 'review'),
    # replace_re_i
    _tool_pair('next_m', 'expand_abbr_i', 'replace_re_i'),
    _needs_input('replace_re_i', 'APE_label', 'review'),
    _needs_input('replace_re_i', 'APE_label', '[^a-zA-Z]'),
    _needs_input('replace_re_i', 'APE_label', ' '),
    # lemmatize_i
    _tool_pair('next_m', 'replace_re_i', 'lemmatize_i'),
    _needs_input('lemmatize_i', 'APE_label', 'review'),
    # remove_stopwords_i
    _tool_pair('next_m', 'lemmatize_i', 'remove_stopwords_i'),
    _needs_input('remove_stopwords_i', 'APE_label', 'review'),
    # EDA: plot_wordcloud
    _constraint('last_m', _tool('plot_wordcloud')),
    _needs_input('plot_wordcloud', 'APE_label', 'review'),
]
_preprocessing_inputs = [
    _str_constant("[^a-zA-Z]"),
    _str_constant(" "),
    _str_constant("abrev.json"),
]

# Set 2: Embedding + Modeling + Evaluation
_modeling_constraints = [
    # column_split + train_test_split
    _tool_pair('connected_op', 'column_split', 'train_test_split'),
    # embed_text_word2vec
    _tool_pair('connected_op', 'train_test_split', 'embed_text_word2vec'),
    _tool_pair('depend_m', 'embed_text_word2vec', 'train_test_split'),
    _needs_input('embed_text_word2vec', 'APE_label', 'review'),
    # fit_estimator
    _tool_pair('connected_op', 'embed_text_word2vec', 'fit_estimator'),
    # -> init_sklearn_estimator
    _needs_input('fit_estimator', 'DataClass', 'Classifier'),
    # embed_text_word2vec + predict_estimator
    _needs_input('embed_text_word2vec', 'DataClass', 'Word2Vec'),
    _tool_pair('connected_op', 'embed_text_word2vec', 'predict'),
    _tool_pair('connected_op', 'fit_estimator', 'predict'),
    # classification_report
    _needs_input('classification_report', 'StatisticalRelevance', 'Prediction'),
    _tool_pair('connected_op', 'train_test_split', 'classification_report'),
]

# One tuple per APE run: (constraints, minimum solution length, extra inputs).
constraint_min_input = [
    (_preprocessing_constraints, 6, _preprocessing_inputs),
    (_modeling_constraints, 8, []),
]

I would like to replace the hard-coded word2vec tool with the more general TextEmbedding, but the relations between multiple TextEmbedding operations and the artifacts they produce are more complex. So I'll leave it as is for now.

In [9]:
# Run APE for the selected constraint set(s) and convert every solution found
# into a Jupyter notebook.
#
# Each entry of `constraint_min_input` is a tuple of
# (constraints, minimum solution length, extra workflow inputs).
#
# For every executed iteration the cell:
#   1. writes `constraints_run.json` and the run config into USE_CASE_PATH,
#   2. launches the APE jar with USE_CASE_PATH as working directory,
#   3. archives config, constraints, solutions.txt and Figures under
#      USE_CASE_PATH / 'out' / 'iteration_<n>',
#   4. parses solutions.txt and dumps one notebook per workflow.
last_len = -1
for iter_ix, cst_conf in enumerate(constraint_min_input, start=1):
    # NOTE: automatic chaining of iterations (input_step += last_len) is
    # currently disabled. Only the second constraint set is executed, with a
    # hard-coded starting step.
    if iter_ix != 2:
        continue
    input_step = 7

    constraint_set, min_len, inputs = cst_conf

    # create config and constraints
    with open(USE_CASE_PATH / 'constraints_run.json', 'w', encoding='utf-8') as file_:
        json.dump({"constraints": constraint_set}, file_, indent=4)

    # Build the per-run config from fresh containers. `CONFIG.copy()` is only
    # a shallow copy, so extending `inputs` or assigning `solution_length.min`
    # on it would silently modify CONFIG itself and accumulate inputs across
    # iterations and cell re-runs. Overriding existing keys keeps their
    # position, so the written JSON has the same key order as CONFIG.
    config_local = {
        **CONFIG,
        'inputs': [*CONFIG['inputs'], *inputs],
        'solution_length': {**CONFIG['solution_length'], 'min': min_len},
    }
    with open(USE_CASE_PATH / config, 'w', encoding='utf-8') as file_:
        json.dump(config_local, file_, indent=4)

    # run APE (blocks until the JVM exits; stdout/stderr are inherited)
    proc = subprocess.run(
        ['java', '-Xmx8g', '-jar', str(APE_PATH), config],
        cwd=str(USE_CASE_PATH),
        check=False,  # the return code is inspected explicitly below
    )

    # check error code
    if proc.returncode != 0:
        print('APE failed', iter_ix)
        continue
    print('APE finished', iter_ix)

    # output
    folder_path = USE_CASE_PATH / 'out' / f'iteration_{iter_ix}'
    folder_path.mkdir(exist_ok=True, parents=True)

    # copy input files into the output folder
    for file_name in (config, 'constraints_run.json'):
        shutil.copy(
            str(USE_CASE_PATH / file_name),
            str(folder_path / file_name),
        )

    # copy solutions
    shutil.copy(
        str(USE_CASE_PATH / 'solutions' / 'solutions.txt'),
        str(folder_path / 'solutions.txt'),
    )

    shutil.copytree(
        str(USE_CASE_PATH / 'solutions' / 'Figures'),
        str(folder_path / 'Figures'),
        dirs_exist_ok=True,
    )

    # produce notebooks
    workflows_list = parse_ape_solutions(
        folder_path / 'solutions.txt',
        input_step=input_step,
    )

    for wk_ix, workflow in enumerate(workflows_list, start=1):
        notebook = solution_to_notebook(
            workflow,
            input_mapping=INPUT_MAPPING,
            solution_num=wk_ix-1,  # solution_num is zero-based, file names are one-based
            input_step=input_step,
        )
        with open(
            folder_path / f'workflow_{wk_ix}_start_{input_step}.ipynb',
            'w',
            encoding='utf-8',
        ) as out_:
            json.dump(notebook, out_, indent=4)

    # Remember the length of the last workflow; keep the previous value when
    # APE returned no solutions instead of failing with an IndexError.
    if workflows_list:
        last_len = len(workflows_list[-1]['steps'])
    print('Notebooks produced', iter_ix)


-------------------------------------------------------------
	Workflow discovery - length 8
-------------------------------------------------------------
Total problem setup time: 38.432 sec (7290876 clauses).
Found 5 solutions. Solving time: 18.562 sec.


APE found 5 solutions.
Total APE runtime: 		61.295 sec.
Total encoding time: 		38.432 sec.
Total SAT solving time: 	18.562 sec.


-------------------------------------------------------------
	Generating graphical representation
	of the first 5 workflows
-------------------------------------------------------------

Loading.....

Graphical files have been generated. Running time: 0.823 sec.
CWL annotations file not configured. No executable CWL files are generated.
APE finished 2
Notebooks produced 2
