In [None]:
import os

import kfp
from kfp import dsl
from kfp.components import load_component_from_url, load_component_from_file

# Parameters

The next cell holds the notebook parameters; papermill overrides these values at run time.

In [None]:
# Model hyperparameters and container tag: None here, expected to be supplied by papermill
svm_c = None
svm_gamma = None
container_tag = None
output_pipeline_filename = "training_pipeline.tar.gz"

# For debugging until multi-tenant working.
# Credentials are read from the environment instead of being committed to the notebook;
# they are None when the corresponding variable is not set.
minio_url = os.environ.get("MINIO_URL", "http://minimal-tenant1-minio.minio:9000")
minio_access_key = os.environ.get("MINIO_ACCESS_KEY")
minio_secret_key = os.environ.get("MINIO_SECRET_KEY")

Default values for debugging or running locally (uncomment to use them).

In [None]:
# svm_c = 100.
# svm_gamma = 0.1
# container_tag = "e276f46d4b74a280d3e82992bf6a1d7ed02c1fa7"

# Helpers

In [None]:
def task_use_image(image_name):
    """
    Build a modifier that points an existing kfp task at a different image

    From https://github.com/kaizentm/kubemlops/blob/master/code/utils/kfp_helper.py

    Args:
        image_name: value assigned to the task's ``image`` attribute

    Returns:
        A one-argument callable that sets ``image`` on the task it is given
        and returns that same task

    Usage:
      my_task = task_op_factory()
      my_task.apply(task_use_image(my_image_name))
    """
    def _apply_image(op):
        # Mutates the task in place and hands it back to the caller
        setattr(op, "image", image_name)
        return op

    return _apply_image

def component_use_image(component, image_name):
    """
    Helper to update the image used by a component (eg: task factory)

    Note that this edits component in place (and copying or deepcopying
    does not prevent that).  The same component object is also returned so
    the call can be used in an assignment, as in the usage below.

    Args:
        component: object exposing ``component_spec.implementation`` with
            ``to_dict()`` / ``from_dict()``
        image_name: value written to ``implementation['container']['image']``

    Returns:
        The component that was passed in, after its image has been replaced

    Usage:
      task_op_factory = load_component_from_url(...)
      task_op_factory = component_use_image(task_op_factory, image_name)
    """
    spec = component.component_spec
    # Edit a dict form of the implementation, then rebuild the implementation from it
    implementation = spec.implementation.to_dict()
    implementation['container']['image'] = image_name
    spec.implementation = spec.implementation.from_dict(implementation)
    return component
    
def load_component_from_file_and_pin_version(component_filename, image_repo_name, image_name, tag):
    """
    Load a component from file and point it at image_repo_name/image_name:tag

    Args:
        component_filename: path of the component definition to load
        image_repo_name: repository prefix of the image
        image_name: name of the image inside the repository
        tag: image tag to pin; any leading ':' is stripped

    Returns:
        The loaded component, updated in place to use the pinned image
    """
    pinned_component = load_component_from_file(component_filename)

    # Strip stray ':' on either side of the join so exactly one separates name and tag
    untagged = f"{image_repo_name}/{image_name}".rstrip(':')
    pinned_image = ":".join((untagged, tag.lstrip(':')))

    component_use_image(pinned_component, pinned_image)
    return pinned_component

# Components

In [None]:
# TODO: Pull these from a more long-term home for reusable components
copy_to_minio_op = load_component_from_url(
    'https://raw.githubusercontent.com/StatCan/kubeflow-mlops/scribner-iowa-staging/components/copy_to_minio.yaml'
)
copy_from_minio_op = load_component_from_url(
    'https://raw.githubusercontent.com/StatCan/kubeflow-mlops/scribner-iowa-staging/components/copy_from_minio.yaml'
)

In [None]:
# DEBUGGING ONLY: these module-level loads just give me autocompletes in the notebook.
# build_pipeline loads its own components from the same files, so DELETE THIS CELL
train_component = load_component_from_file(
    "../../containers/iowa-train/component.yaml"
)
score_component = load_component_from_file(
    "../../containers/iowa-score/component.yaml"
)

# Pipeline

In [None]:
def build_pipeline(svm_c: float, svm_gamma: float, container_tag: str = "latest"):
    """
    Returns a pipeline function whose op containers and model params are fixed at build time

    svm_c and svm_gamma are captured from this call and passed to the train op, and
    container_tag selects the image tag used for the train and score ops.  None of
    these are arguments of the returned pipeline.

    Pinning like this makes sense if you're using CI to produce a pipeline that will
    recreate the same trained model when fed the same data: the resultant pipeline
    definition clearly shows the parameters/containers used.  The cost is that a
    hyperparameter search must be done through the CI system (eg: make 10 branches,
    edit params on each branch, commit each branch; they all build independent
    containers and pipelines, then all run and report their results).

    For hyperparameter searches, a more efficient pattern might be to make the training
    pipeline accept hyperparameters as well as the dataset.  The same pipeline could then
    be used later with different args, so CD workflows may be harder to trace.  This
    would need to be tried out.
    """
    # TODO: Add default minio creds?
    image_repo_name = "k8scc01covidmlopsacr.azurecr.io/mlops"

    # Set up the components that require version pinning.
    # train/score come from template component.yaml files held elsewhere.
    # Alternatively we could define a component directly, eg:
    #     train_component = dsl.ContainerOp(
    #         "train",
    #         image=...,
    #         ...
    #     )
    train_component, score_component = (
        load_component_from_file_and_pin_version(
            component_filename=component_filename,
            image_repo_name=image_repo_name,
            image_name=image_name,
            tag=container_tag,
        )
        for component_filename, image_name in (
            ("../../containers/iowa-train/component.yaml", "iowa-train"),
            ("../../containers/iowa-score/component.yaml", "iowa-score"),
        )
    )

    # Define the pipeline
    @dsl.pipeline(
        name="Iowa Liquor Training Pipeline",
        description="Trains a pipeline to classify liquor based on its name using a specific model and hyperparameters"
    )
    def training_pipeline(
        data_train: str,
        data_test: str,
        # TODO: Handle these automatically once multitenancy is available
        minio_url: str,
        minio_access_key: str,
        minio_secret_key: str,
    ):
        """
        Pipeline for training the Iowa liquor categorization pipeline
        """

        def fetch_from_minio(minio_source, display_name):
            # Both downloads use the same minio connection settings
            return copy_from_minio_op(
                minio_url=minio_url,
                minio_access_key=minio_access_key,
                minio_secret_key=minio_secret_key,
                minio_source=minio_source,
            ).set_display_name(display_name)

        training_data = fetch_from_minio(data_train, "get training data")
        scoring_data = fetch_from_minio(data_test, "get scoring data")

        # Hyperparameters come from the enclosing build_pipeline call
        trained = train_component(
            data_train=training_data.output,
            svm_gamma=svm_gamma,
            svm_c=svm_c,
        )

        score_component(
            data=scoring_data.output,
            model=trained.outputs['model'],
        )

    return training_pipeline

In [None]:
# Build the pipeline function from the values set in the parameters cell
pipeline = build_pipeline(
    svm_c=svm_c,
    svm_gamma=svm_gamma,
    container_tag=container_tag,
)

In [None]:
import kfp.compiler as compiler

# Compile the pipeline function into a package at output_pipeline_filename.
# The call's result is deliberately not assigned: binding it to `compiler` would
# overwrite the module alias imported above.
compiler.Compiler().compile(
    pipeline_func=pipeline,
    package_path=output_pipeline_filename,
    type_check=False,  # TEMP WHILE I FIGURE OUT HOW TO USE TYPES FOR MINIO URL
)

# For debugging in notebook

Use the code below to submit the compiled pipeline directly from this notebook.

In [None]:
# from utilities import get_minio_credentials

# # Get minio credentials using a helper
# minio_tenant = 'minimal'
# minio_settings = get_minio_credentials(minio_tenant, strip_http=False)
# minio_url = minio_settings["url"]
# minio_access_key = minio_settings["access_key"]
# minio_secret_key = minio_settings["secret_key"]

# arguments = {
#     'data_train': 'andrew-scribner/iowa/processed/train/2020-08-13_18:02:01_train.csv',
#     'data_test': 'andrew-scribner/iowa/processed/test/2020-08-13_18:02:01_test.csv',
#     'minio_url': minio_url,
#     'minio_access_key': minio_access_key,
#     'minio_secret_key': minio_secret_key,
# }

# client = kfp.Client()
# client.create_run_from_pipeline_package(
#     pipeline_file=output_pipeline_filename,
#     arguments=arguments,
# )