In [1]:
import d3m
from d3m.container import Dataset
import d3m.index
import d3m.runtime
from d3m.metadata.base import Context, ArgumentType
from d3m.metadata.pipeline import Pipeline, PrimitiveStep
from d3m.metadata.problem import PerformanceMetric, parse_problem_description
import os
import time
import yaml

In [2]:
# List the YAML pipeline files under d3m_ta2_nyu/pipelines/ (relative path, so the kernel's working directory matters).
!ls -1 d3m_ta2_nyu/pipelines/*.yaml

d3m_ta2_nyu/pipelines/kfold_tabular_split.yaml
d3m_ta2_nyu/pipelines/scoring.yaml


In [3]:
# Read the scoring pipeline first, then the k-fold tabular split pipeline.
# Each YAML file is parsed with the safe loader and the resulting structure
# is turned into a Pipeline object once the file has been closed.
with open('d3m_ta2_nyu/pipelines/scoring.yaml') as scoring_file:
    scoring_structure = yaml.safe_load(scoring_file)
scoring_pipeline = Pipeline.from_json_structure(scoring_structure)

with open('d3m_ta2_nyu/pipelines/kfold_tabular_split.yaml') as split_file:
    split_structure = yaml.safe_load(split_file)
data_preparation_pipeline = Pipeline.from_json_structure(split_structure)

Digest for pipeline 'f596cd77-25f8-4d4c-a350-bb30ab1e58f6' does not match a computed one. Provided digest: 45f2dfeae00b8168eb29a3bb54e9cfb86d7277832ff6f1c0a5113cbecce9084a. Computed digest: 786116cf587189b6af7787be5d633a171b38ca5ebd5dba617564c353fa0e5e5d.
Digest for pipeline 'c8ed65df-aa68-4ee0-bbb5-c5f76a40bcf8' does not match a computed one. Provided digest: 72bf7c1d757a87cd2b6ec2609ac042cf09861015bbd3fef730c2156e1e647d67. Computed digest: df21a5626b602d90a6699939e889702f427f3a0c85a48655b18d9d6e37fff605.


In [4]:
# Load the 185_baseball dataset through its datasetDoc.json descriptor.
dataset_doc_uri = 'file:///d3m/data/seed_datasets_current/185_baseball/185_baseball_dataset/datasetDoc.json'
dataset = Dataset.load(dataset_doc_uri)

In [5]:
# Parse the problem description that accompanies the dataset and display it.
problem_doc_path = '/d3m/data/seed_datasets_current/185_baseball/185_baseball_problem/problemDoc.json'
problem = parse_problem_description(problem_doc_path)
problem

{'id': '185_baseball_problem',
 'version': '2.0',
 'name': 'baseball_problem',
 'schema': 'https://metadata.datadrivendiscovery.org/schemas/v0/problem.json',
 'problem': {'task_type': <TaskType.CLASSIFICATION: 1>,
  'task_subtype': <TaskSubtype.MULTICLASS: 3>,
  'performance_metrics': [{'metric': <PerformanceMetric.F1_MACRO: 6>,
    'params': {}}]},
 'outputs': {'predictions_file': 'predictions.csv'},
 'description': "**Author**: Jeffrey S. Simonoff  \n**Source**: [AnalCatData](http://www.stern.nyu.edu/~jsimonof/AnalCatData) - 2003  \n**Please cite**: Jeffrey S. Simonoff, Analyzing Categorical Data, Springer-Verlag, New York, 2003  \n \nDatabase of baseball players and play statistics, including 'Games_played', 'At_bats', 'Runs', 'Hits', 'Doubles', 'Triples', 'Home_runs', 'RBIs', 'Walks', 'Strikeouts', 'Batting_average', 'On_base_pct', 'Slugging_pct' and 'Fielding_ave' \n\nNotes:  \n* Quotes, Single-Quotes and Backslashes were removed, Blanks replaced with Underscores\n* Player is an i

In [6]:
def add_primitive_step(pipeline, primitive_path, arguments, hyperparams=None):
    """Build one primitive step, wire it up and append it to ``pipeline``.

    Parameters
    ----------
    pipeline : Pipeline
        Pipeline the new step is appended to.
    primitive_path : str
        Python path of the primitive, looked up with ``d3m.index.get_primitive``.
    arguments : dict
        Maps argument name to the data reference feeding it. Every argument is
        added as ``ArgumentType.CONTAINER``, in the dict's iteration order.
    hyperparams : dict, optional
        Maps hyper-parameter name to a constant value; each one is added as
        ``ArgumentType.VALUE``.

    Returns
    -------
    PrimitiveStep
        The step that was appended. It declares a single ``produce`` output.
    """
    step = PrimitiveStep(primitive=d3m.index.get_primitive(primitive_path))
    for argument_name, data_reference in arguments.items():
        step.add_argument(name=argument_name, argument_type=ArgumentType.CONTAINER,
                          data_reference=data_reference)
    step.add_output('produce')
    for hyperparam_name, value in (hyperparams or {}).items():
        step.add_hyperparameter(name=hyperparam_name, argument_type=ArgumentType.VALUE,
                                data=value)
    pipeline.add_step(step)
    return step


# Semantic types used to select columns in the extraction steps below.
ATTRIBUTE_TYPE = 'https://metadata.datadrivendiscovery.org/types/Attribute'
TARGET_TYPE = 'https://metadata.datadrivendiscovery.org/types/Target'
PRIMARY_KEY_TYPE = 'https://metadata.datadrivendiscovery.org/types/PrimaryKey'

pipeline_description = Pipeline(context=Context.TESTING)
pipeline_description.add_input(name='inputs')

# 0 denormalize
step0 = add_primitive_step(
    pipeline_description,
    'd3m.primitives.data_transformation.denormalize.Common',
    {'inputs': 'inputs.0'},
)

# 1 dataset_to_dataframe
step1 = add_primitive_step(
    pipeline_description,
    'd3m.primitives.data_transformation.dataset_to_dataframe.Common',
    {'inputs': 'steps.0.produce'},
)

# 2 column_parser
step2 = add_primitive_step(
    pipeline_description,
    'd3m.primitives.data_transformation.column_parser.DataFrameCommon',
    {'inputs': 'steps.1.produce'},
)

# 3 extract_columns_by_semantic_types Attribute
step3 = add_primitive_step(
    pipeline_description,
    'd3m.primitives.data_transformation.extract_columns_by_semantic_types.DataFrameCommon',
    {'inputs': 'steps.2.produce'},
    {'semantic_types': [ATTRIBUTE_TYPE]},
)

# 4 cast_to_type
step4 = add_primitive_step(
    pipeline_description,
    'd3m.primitives.data_transformation.cast_to_type.Common',
    {'inputs': 'steps.3.produce'},
)

# 5 imputer
step5 = add_primitive_step(
    pipeline_description,
    'd3m.primitives.data_cleaning.imputer.SKlearn',
    {'inputs': 'steps.4.produce'},
)

# 6 extract_columns_by_semantic_types Target (branches off the parsed frame of step 2)
step6 = add_primitive_step(
    pipeline_description,
    'd3m.primitives.data_transformation.extract_columns_by_semantic_types.DataFrameCommon',
    {'inputs': 'steps.2.produce'},
    {'semantic_types': [TARGET_TYPE]},
)

# 7 cast_to_type
step7 = add_primitive_step(
    pipeline_description,
    'd3m.primitives.data_transformation.cast_to_type.Common',
    {'inputs': 'steps.6.produce'},
)

# 8 random_forest.SKlearn: imputed attributes as inputs, cast targets as outputs
step8 = add_primitive_step(
    pipeline_description,
    'd3m.primitives.classification.random_forest.SKlearn',
    {'inputs': 'steps.5.produce', 'outputs': 'steps.7.produce'},
)

# 9 extract_columns_by_semantic_types Target, PrimaryKey
step9 = add_primitive_step(
    pipeline_description,
    'd3m.primitives.data_transformation.extract_columns_by_semantic_types.DataFrameCommon',
    {'inputs': 'steps.2.produce'},
    {'semantic_types': [TARGET_TYPE, PRIMARY_KEY_TYPE]},
)

# 10 construct_predictions: classifier output plus the step 9 frame as reference
step10 = add_primitive_step(
    pipeline_description,
    'd3m.primitives.data_transformation.construct_predictions.DataFrameCommon',
    {'inputs': 'steps.8.produce', 'reference': 'steps.9.produce'},
)

# Kept as the last expression so the cell still displays the new output's reference.
pipeline_description.add_output(name='output predictions', data_reference='steps.10.produce')

'outputs.0'

In [7]:
# Evaluate the pipeline using the split and scoring pipelines loaded earlier,
# scoring with macro F1. `number_of_folds` is handed over as the string '4' --
# presumably the runtime expects serialized hyper-parameter values; confirm
# against the d3m version in use.
result = d3m.runtime.evaluate(
    pipeline=pipeline_description,
    inputs=[dataset],
    problem_description=problem,
    data_pipeline=data_preparation_pipeline,
    data_params={'number_of_folds': '4'},
    scoring_pipeline=scoring_pipeline,
    metrics=[{'metric': PerformanceMetric.F1_MACRO}],
    context=Context.TESTING,
    random_seed=0,
    # Resolves to None when D3M_PRIMITIVE_STATIC is not set.
    volumes_dir=os.environ.get('D3M_PRIMITIVE_STATIC'),
)
result

Docker image environment variable not set: D3M_BASE_IMAGE_NAME
Docker image environment variable not set: D3M_BASE_IMAGE_DIGEST
Docker image environment variable not set: D3M_IMAGE_NAME
Docker image environment variable not set: D3M_IMAGE_DIGEST
Docker image environment variable not set: D3M_BASE_IMAGE_NAME
Docker image environment variable not set: D3M_BASE_IMAGE_DIGEST
Docker image environment variable not set: D3M_IMAGE_NAME
Docker image environment variable not set: D3M_IMAGE_DIGEST
Docker image environment variable not set: D3M_BASE_IMAGE_NAME
Docker image environment variable not set: D3M_BASE_IMAGE_DIGEST
Docker image environment variable not set: D3M_IMAGE_NAME
Docker image environment variable not set: D3M_IMAGE_DIGEST
Docker image environment variable not set: D3M_BASE_IMAGE_NAME
Docker image environment variable not set: D3M_BASE_IMAGE_DIGEST
Docker image environment variable not set: D3M_IMAGE_NAME
Docker image environment variable not set: D3M_IMAGE_DIGEST
Docker image env

[(     metric       targets     value
  0  F1_MACRO  Hall_of_Fame  0.564307,
  <d3m.metadata.pipeline_run.pipeline_run.PipelineRun at 0x7fd1846d0eb8>,
  <d3m.metadata.pipeline_run.pipeline_run.PipelineRun at 0x7fd184697eb8>),
 (     metric       targets     value
  0  F1_MACRO  Hall_of_Fame  0.650228,
  <d3m.metadata.pipeline_run.pipeline_run.PipelineRun at 0x7fd18447fe80>,
  <d3m.metadata.pipeline_run.pipeline_run.PipelineRun at 0x7fd1846e0c88>),
 (     metric       targets     value
  0  F1_MACRO  Hall_of_Fame  0.726984,
  <d3m.metadata.pipeline_run.pipeline_run.PipelineRun at 0x7fd184370e48>,
  <d3m.metadata.pipeline_run.pipeline_run.PipelineRun at 0x7fd1844319e8>),
 (     metric       targets     value
  0  F1_MACRO  Hall_of_Fame  0.688293,
  <d3m.metadata.pipeline_run.pipeline_run.PipelineRun at 0x7fd1845648d0>,
  <d3m.metadata.pipeline_run.pipeline_run.PipelineRun at 0x7fd184564978>)]

In [8]:
# Every entry of `result` is a tuple whose first item is that fold's scores table.
first_fold = result[0]
first_fold[0]

Unnamed: 0,metric,targets,value
0,F1_MACRO,Hall_of_Fame,0.564307


In [9]:
# Gather the scores table of every fold and merge them into a single table.
per_fold_scores = [fold_result[0] for fold_result in result]
d3m.runtime.combine_folds(per_fold_scores)

Unnamed: 0,metric,targets,value,fold
0,F1_MACRO,Hall_of_Fame,0.564307,0
1,F1_MACRO,Hall_of_Fame,0.650228,1
2,F1_MACRO,Hall_of_Fame,0.726984,2
3,F1_MACRO,Hall_of_Fame,0.688293,3


In [10]:
class CustomRuntime(d3m.runtime.Runtime):
    """Runtime whose target columns are given explicitly by the caller.

    ``targets`` is an iterable of ``(resource_id, column_index)`` pairs; all
    remaining keyword arguments are forwarded untouched to
    ``d3m.runtime.Runtime``.
    """

    def __init__(self, targets, **kwargs):
        super().__init__(**kwargs)

        self.__targets = targets

    def _mark_columns(self, dataset):
        """Return a copy of ``dataset`` with the configured columns marked as targets.

        For each configured column the ``Target`` and ``TrueTarget`` semantic
        types are added and ``Attribute`` is removed. The dataset passed in is
        left untouched because all changes are applied to a copy.

        NOTE(review): presumably this overrides a hook the base Runtime calls
        on its input datasets -- confirm against the d3m version in use.
        """
        marked = dataset.copy()

        for resource_id, column_index in self.__targets:
            # Add the two target-related types first, in this order.
            for added_type in (
                'https://metadata.datadrivendiscovery.org/types/Target',
                'https://metadata.datadrivendiscovery.org/types/TrueTarget',
            ):
                marked.metadata = marked.metadata.add_semantic_type(
                    [resource_id, d3m.metadata.base.ALL_ELEMENTS, column_index],
                    added_type,
                )
            # Then drop the Attribute type from the same column.
            marked.metadata = marked.metadata.remove_semantic_type(
                [resource_id, d3m.metadata.base.ALL_ELEMENTS, column_index],
                'https://metadata.datadrivendiscovery.org/types/Attribute',
            )
        # TODO: Set previous target as attribute?

        return marked

In [11]:
# Column 18 of the 'learningData' resource is the one marked as target --
# presumably Hall_of_Fame; confirm against the dataset's datasetDoc.json.
runtime = CustomRuntime(
    pipeline=pipeline_description,
    targets=[('learningData', 18)],
    context=Context.TESTING,
    is_standard_pipeline=True,
    # Resolves to None when D3M_PRIMITIVE_STATIC is not set.
    volumes_dir=os.environ.get('D3M_PRIMITIVE_STATIC'),
)

Docker image environment variable not set: D3M_BASE_IMAGE_NAME
Docker image environment variable not set: D3M_BASE_IMAGE_DIGEST
Docker image environment variable not set: D3M_IMAGE_NAME
Docker image environment variable not set: D3M_IMAGE_DIGEST


In [12]:
# Fit on the loaded dataset and ask for the pipeline's first output ('outputs.0').
runtime.fit(inputs=[dataset], return_values=['outputs.0'])

<d3m.runtime.Result at 0x7fd185fb5a20>