In [1]:
import pandas as pd
import numpy as np

import json

from typing import cast, Dict, Tuple, Set, List, NamedTuple

# config template

In [2]:
# Static part of the generated config file. The "inputs" and "outputs"
# sections are not defined here; they are merged in when the file is dumped.
# Counts and on/off switches are deliberately kept as strings.
CONFIG = dict(
    ontology_path="../../ontology/ontology_v2.owl",
    ontologyPrefixIRI="http://www.co-ode.org/ontologies/ont.owl#",
    toolsTaxonomyRoot="ToolsTaxonomy",
    # Currently disabled dimension roots: "DataState", "DataSetIndex".
    dataDimensionsTaxonomyRoots=["DataClass", "StatisticalRelevance"],
    tool_annotations_path="../../ontology/tool_annotations_v2.json",
    constraints_path="constraints.json",
    solutions_dir_path="./solutions/",
    solution_length=dict(min=1, max=10),
    solutions="5",
    number_of_execution_scripts="0",
    number_of_generated_graphs="5",
    tool_seq_repeat="true",
    debug_mode="false",
    use_workflow_input="ONE",
    use_all_generated_data="ONE",
)

# input data

In [3]:
def _table_data_class(subset: pd.DataFrame) -> str:
    """Return the table-level DataClass label for the columns in ``subset``."""
    # Compare the scalar type of every column dtype directly; this is the same
    # mechanism the per-column mapping in read_input_table relies on.
    scalar_types = [dtype.type for dtype in subset.dtypes]
    if all(t in (np.int64, np.float64, np.bool_) for t in scalar_types):
        # A subset without any columns also lands here and is labelled
        # 'IntDataFrame', because all() over an empty sequence is True.
        for wanted, label in (
            (np.int64, 'IntDataFrame'),
            (np.float64, 'FloatDataFrame'),
            (np.bool_, 'BoolDataFrame'),
        ):
            if all(t is wanted for t in scalar_types):
                return label
        return 'NumberDataFrame'
    if all(t in (np.object_, str) for t in scalar_types):
        return 'StrDataFrame'
    #! incomplete
    return 'MixedDataFrame'


def read_input_table(file_name: str, dependent_vars: List[str], alias: str) -> Tuple[List[Dict[str, str]], str, str]:
    """Read a CSV file and describe its columns and its two column groups.

    Args:
        file_name: path of the CSV file, passed to ``pd.read_csv``.
        dependent_vars: names of the columns to treat as dependent variables;
            every other column is treated as independent.
        alias: identifier stored as ``'TableID'`` in every column description.

    Returns:
        A 3-tuple ``(col_list, dep_type, indep_type)``: one description dict
        per column (in file order), followed by the table-level DataClass
        label of the dependent columns and of the independent columns.

    Raises:
        KeyError: if a column has a dtype without a column-level label, or if
            a name in ``dependent_vars`` is not a column of the table.
    """
    table: pd.DataFrame = pd.read_csv(file_name)
    table.columns = table.columns.astype(str)

    column_classes = {
        np.int64: 'IntColumn',
        np.float64: 'FloatColumn',
        np.object_: 'StrColumn',
        # Dedicated pandas string dtypes report ``str`` as their scalar type.
        str: 'StrColumn',
        np.bool_: 'BoolColumn',
    }
    col_list: List[Dict[str, str]] = [
        {
            'DataClass': column_classes[table[col].dtype.type],
            # 'DataState': 'NonModified',
            'StatisticalRelevance': 'DependentVariable' if col in dependent_vars else 'IndependentVariable',
            # 'DataSetIndex': 'InputData',
            'ColumnID': col,
            'TableID': alias,
            'OtherID': '-',
        }
        for col in table.columns
    ]

    dep_type = _table_data_class(table[dependent_vars])
    indep_type = _table_data_class(table.drop(columns=dependent_vars))
    return col_list, dep_type, indep_type


def input_data_to_config(input_data: Dict[str, List]):
    """Flatten the collected input tables into a list of config input entries.

    Every item of ``input_data['tables']`` is indexed as
    ``(columns, dependent_table_class, independent_table_class)``. For each
    table the result holds one entry per column, followed by one entry for
    the dependent part and one for the independent part of the table. All
    values are wrapped in single-element lists.
    """
    dimensions = ('DataClass', 'StatisticalRelevance')  # 'DataState' / 'DataSetIndex' are disabled
    config_inputs: List[Dict[str, List[str]]] = []

    for entry in input_data['tables']:
        columns = entry[0]

        # column-level entries
        for column in columns:
            record = {dim: [column[dim]] for dim in dimensions}
            # Only the column name is used as label; the TableID is left out.
            #! problem with APE_labels, see mail to Vedran
            record['APE_label'] = [column['ColumnID']]
            config_inputs.append(record)

        # table-level entries, labelled with the TableID of the first column
        for table_class, relevance in (
            (entry[1], 'DependentVariable'),
            (entry[2], 'IndependentVariable'),
        ):
            config_inputs.append({
                'DataClass': [table_class],
                'StatisticalRelevance': [relevance],
                'APE_label': [columns[0]['TableID']],
            })

    return config_inputs

In [4]:
# Registry of input sources; so far only CSV tables are collected.
input_data = dict(tables=[])

In [5]:
# Register the training table; 'SalePrice' is the only dependent variable.
# Note: re-running this cell registers the same table again.
input_data['tables'].append(
    read_input_table('train.csv', ['SalePrice'], 'housing_train')
)

In [6]:
# Flatten the registered tables into the list stored under "inputs" in config.json.
INPUT = input_data_to_config(input_data)

# output data

In [7]:
# Requested workflow output: a single figure without statistical relevance.
# The "DataState" ("NoState") and "DataSetIndex" ("InputData") dimensions are
# currently disabled and therefore not listed.
OUTPUT = [
    {
        "DataClass": ["Figure"],
        "StatisticalRelevance": ["NoRelevance"],
    },
]

# dump config

In [8]:
# Write the static settings together with the derived "inputs" and "outputs"
# sections to config.json (overwriting any existing file).
with open('config.json', mode='w', encoding='utf-8') as config_f:
    json.dump(
        {**CONFIG, "inputs": INPUT, "outputs": OUTPUT},
        fp=config_f,
        indent=4,
    )