In [25]:
from kfp import dsl
from kfp.components import load_component_from_url, load_component_from_file

Helpers

In [14]:
def task_use_image(image_name):
    """
    Build a modifier that points an existing kfp task at a different image.

    Adapted from https://github.com/kaizentm/kubemlops/blob/master/code/utils/kfp_helper.py

    Args:
        image_name: Value assigned to the task's ``image`` attribute.

    Returns:
        A one-argument callable that sets ``task.image`` to ``image_name``
        and returns that same task object.

    Usage:
      my_task = task_op_factory()
      my_task.apply(task_use_image(my_image_name))
    """
    def _set_image(target_task):
        # Mutates the task that is passed in; no copy is made.
        setattr(target_task, "image", image_name)
        return target_task

    return _set_image

def component_use_image(component, image_name):
    """
    Helper to update the image used by a component (eg: task factory)

    Note that this edits component in place (and copying or deepcopying
    does not prevent that).  The same component is also returned, so both
    of the call styles below work.

    Args:
        component: Object exposing ``component_spec.implementation`` with
            ``to_dict()`` / ``from_dict()`` (eg: the factory returned by
            load_component_from_url or load_component_from_file).
        image_name: Image to store under ``['container']['image']`` of the
            component's implementation.

    Returns:
        The ``component`` object that was passed in, after modification.

    Raises:
        KeyError: If the implementation dict has no ``'container'`` entry.

    Usage:
      task_op_factory = load_component_from_url(...)
      component_use_image(task_op_factory, image_name)
      # or, equivalently
      task_op_factory = component_use_image(task_op_factory, image_name)
    """
    spec = component.component_spec
    # Round-trip through a plain dict: patch the image there, then rebuild the
    # implementation object from the patched dict.
    implementation = spec.implementation.to_dict()
    implementation['container']['image'] = image_name
    spec.implementation = spec.implementation.from_dict(implementation)
    # Previously nothing was returned, so the assignment form shown in the
    # usage example replaced the factory with None.
    return component

Components

In [2]:
# TODO: Pull these from a longer-term home for reusable components.
# NOTE(review): both URLs appear to reference a branch
# ("scribner-iowa-staging") rather than a pinned commit, so the loaded
# definitions may change over time -- confirm whether that is intended.
copy_to_minio_op = load_component_from_url(
    'https://raw.githubusercontent.com/StatCan/kubeflow-mlops/scribner-iowa-staging/components/copy_to_minio.yaml'
)
copy_from_minio_op = load_component_from_url(
    'https://raw.githubusercontent.com/StatCan/kubeflow-mlops/scribner-iowa-staging/components/copy_from_minio.yaml'
)

In [36]:
# DEBUGGING ONLY: loading the component at notebook scope just gives
# autocomplete while editing.
# DELETE THIS CELL
train_component = load_component_from_file(
    "../../containers/iowa-train/component.yaml"
)

In [37]:
def build_pipeline(svm_c: float, svm_gamma: float, container_tag: str = ":latest"):
    """
    Returns a function defining a training pipeline that is pinned to op containers and model params

    This pattern makes sense if you're using CI to produce a pipeline that will recreate
    the same trained model when fed the same data.  The resultant pipeline definition will
    clearly show the parameters/containers used. If doing a hyperparameter search with this
    pattern, the search must be done through the CI system (eg: make 10 branches, edit
    params on each branch, commit each branch, they all build independent containers and
    pipelines, then all run and report their results).

    For hyperparameter searches, a more efficient pattern might be to make the training
    pipeline accept hyperparameters as well as dataset.  But if we do this, it means the
    same pipeline can later be used with different args so maybe CD workflows are harder
    to trace?  Would need to try it out.

    Args:
        svm_c: Passed to the train component as its ``svm_c`` argument.
        svm_gamma: Passed to the train component as its ``svm_gamma`` argument.
        container_tag: Suffix appended to the train image name, including the
            leading colon (eg: ":latest").

    Returns:
        The ``dsl.pipeline``-decorated ``training_pipeline`` function, with the
        hyperparameters and train image baked in.
    """
    # TODO: Add default minio creds?
    image_repo_name = "k8scc01covidmlopsacr.azurecr.io/mlops"

    # Use train from a template component.yaml held elsewhere.
    # Alternatively we could define the component by:
    #     train_component = dsl.ContainerOp(
    #         "train",
    #         image=...,
    #         ...
    #     )
    # The component is loaded on every call because component_use_image edits
    # it in place.
    train_component = load_component_from_file("../../containers/iowa-train/component.yaml")
    train_component_image_name = f"{image_repo_name}/train{container_tag}"
    component_use_image(train_component, train_component_image_name)

    @dsl.pipeline(
        name="Iowa Liquor Training Pipeline",
        description="Trains a pipeline to classify liquor based on its name using a specific model and hyperparameters"
    )
    def training_pipeline(
        data_train: str,
        data_test: str,
        output_model: str,
        output_results: str,
        minio_url: str,
        minio_access_key: str,
        minio_secret_key: str,
    ):
        """
        Pipeline for training the Iowa liquor categorization pipeline

        NOTE(review): ``output_model`` and ``output_results`` are accepted but
        not used by any step yet, and the scoring data is fetched but not
        consumed by the train step -- TODO confirm the intended wiring.
        """

        operations = {}

        operations['get training data'] = copy_from_minio_op(
            minio_url=minio_url,
            minio_access_key=minio_access_key,
            minio_secret_key=minio_secret_key,
            minio_source=data_train,
            local_destination="train.csv",
        )

        # Fetch the *test* dataset here; this step previously re-downloaded
        # data_train, leaving data_test unused.
        operations['get scoring data'] = copy_from_minio_op(
            minio_url=minio_url,
            minio_access_key=minio_access_key,
            minio_secret_key=minio_secret_key,
            minio_source=data_test,
            local_destination="test.csv",
        )

        # svm_gamma / svm_c come from the enclosing build_pipeline call, so
        # they are fixed when the pipeline is built rather than per run.
        operations['train'] = train_component(
            data_train=operations['get training data'].output,
            svm_gamma=svm_gamma,
            svm_c=svm_c
        )

    return training_pipeline

In [38]:
# Build the pipeline function with hyperparameters and image tag baked in.
pipeline = build_pipeline(
    svm_c=110.0,
    svm_gamma=0.11,
    container_tag=":latest",
)

In [39]:
import kfp.compiler as compiler

# Compile the pipeline function into a package on disk.  The call's result is
# deliberately not assigned to `compiler`: doing so rebinds the name and
# shadows the kfp.compiler module imported above for every later cell.
compiler.Compiler().compile(
    pipeline_func=pipeline,
    # Intended once this lives in a script: package_path=f"{__file__}.tar.gz"
    package_path="training_pipeline.tar.gz",  # TEMP WAY WHILE IN NOTEBOOK
    type_check=False,  # TEMP WHILE I FIGURE OUT HOW TO USE TYPES FOR MINIO URL
)

type name String is different from expected: URL
type name String is different from expected: URL


