In [1]:
!python -m pip install --user --upgrade pip

/usr/bin/python: No module named pip


In [2]:
!python -m pip install --user --upgrade pip

/usr/bin/python: No module named pip


In [3]:
!pip3 install kfp --upgrade --user

Collecting kfp
  Downloading kfp-1.6.4.tar.gz (225 kB)
[K     |████████████████████████████████| 225 kB 27.3 MB/s eta 0:00:01
Collecting google-api-python-client<2,>=1.7.8
  Downloading google_api_python_client-1.12.8-py2.py3-none-any.whl (61 kB)
[K     |████████████████████████████████| 61 kB 32 kB/s s eta 0:00:01
Collecting kfp-pipeline-spec<0.2.0,>=0.1.8
  Downloading kfp_pipeline_spec-0.1.8-py3-none-any.whl (27 kB)
Collecting uritemplate<4dev,>=3.0.0
  Downloading uritemplate-3.0.1-py2.py3-none-any.whl (15 kB)
Collecting google-auth-httplib2>=0.0.3
  Downloading google_auth_httplib2-0.1.0-py2.py3-none-any.whl (9.3 kB)
Collecting httplib2<1dev,>=0.15.0
  Downloading httplib2-0.19.1-py3-none-any.whl (95 kB)
[K     |████████████████████████████████| 95 kB 5.2 MB/s  eta 0:00:01
Building wheels for collected packages: kfp
  Building wheel for kfp (setup.py) ... [?25ldone
[?25h  Created wheel for kfp: filename=kfp-1.6.4-py3-none-any.whl size=309236 sha256=fc284daf4762106c8b7e5039911

In [381]:
import kfp
from kfp import dsl
import kfp.components as comp

In [382]:
def load_data_op():
    """Create the 'Load Data' pipeline step.

    Returns:
        A ``dsl.ContainerOp`` that runs the load image with no arguments and
        declares the container file ``/load_data/data`` as output ``data``.
    """
    declared_outputs = {'data': '/load_data/data'}
    return dsl.ContainerOp(
        name='Load Data',
        image='mavendevv/load_telemetry:v.0.2',
        arguments=[],
        file_outputs=declared_outputs,
    )

In [383]:
def stat_op(data):
    """Create the 'Dataset Statistics' pipeline step.

    Args:
        data: Value passed to the container after the ``--data`` flag.

    Returns:
        A ``dsl.ContainerOp`` declaring two outputs: ``stats``
        (``/statgen/stats``) and ``mlpipeline-ui-metadata``
        (``/mlpipeline-ui-metadata.json``).
    """
    # An alternative image was previously noted here:
    # 'mavencodev/stat_customer:v.0.18'
    stats_image = 'mavendevv/stat2_customer:v.0.1'
    declared_outputs = {
        'stats': '/statgen/stats',
        'mlpipeline-ui-metadata': '/mlpipeline-ui-metadata.json',
    }
    return dsl.ContainerOp(
        name='Dataset Statistics',
        image=stats_image,
        arguments=['--data', data],
        file_outputs=declared_outputs,
    )

In [384]:
def schema_op(stats):
    """Create the 'Dataset Schema' pipeline step.

    Args:
        stats: Value passed to the container after the ``--stats`` flag.

    Returns:
        A ``dsl.ContainerOp`` declaring two outputs: ``schema``
        (``/schema/schema``) and ``mlpipeline-ui-metadata``
        (``/mlpipeline-ui-metadata.json``).
    """
    # NOTE(review): the image namespace 'mavencodevv' differs from the
    # 'mavendevv' and 'mavencodev' namespaces used by the other steps in this
    # notebook -- confirm it is intentional and not a typo.
    schema_image = 'mavencodevv/schema_heart:v.0.2'
    declared_outputs = {
        'schema': '/schema/schema',
        'mlpipeline-ui-metadata': '/mlpipeline-ui-metadata.json',
    }
    return dsl.ContainerOp(
        name='Dataset Schema',
        image=schema_image,
        arguments=['--stats', stats],
        file_outputs=declared_outputs,
    )

In [385]:
def val_op(stats, schema):
    """Create the 'Dataset Validation' pipeline step.

    Args:
        stats: Value passed to the container after the ``--stats`` flag.
        schema: Value passed to the container after the ``--schema`` flag.

    Returns:
        A ``dsl.ContainerOp`` whose only declared output is
        ``mlpipeline-ui-metadata`` (``/mlpipeline-ui-metadata.json``).
    """
    container_args = ['--stats', stats, '--schema', schema]
    declared_outputs = {'mlpipeline-ui-metadata': '/mlpipeline-ui-metadata.json'}
    return dsl.ContainerOp(
        name='Dataset Validation',
        image='mavencodev/valid_customer:v.0.3',
        arguments=container_args,
        file_outputs=declared_outputs,
    )

In [386]:
def preprocess_op(data):
    """Create the 'Preprocess Data' pipeline step.

    Args:
        data: Value passed to the container after the ``--data`` flag.

    Returns:
        A ``dsl.ContainerOp`` declaring the container file
        ``/preprocess/clean_data`` as output ``clean_data``.
    """
    declared_outputs = {'clean_data': '/preprocess/clean_data'}
    return dsl.ContainerOp(
        name='Preprocess Data',
        image='mavendevv/preprocess_telemetry2:v.0.2',
        arguments=['--data', data],
        file_outputs=declared_outputs,
    )

In [387]:
def rf_op(clean_data):
    """Create the 'Randomforest' pipeline step.

    Args:
        clean_data: Value passed to the container after the ``--clean_data``
            flag.

    Returns:
        A ``dsl.ContainerOp`` declaring the container file
        ``/randomforest/rf_metrics`` as output ``rf_metrics``.
    """
    declared_outputs = {'rf_metrics': '/randomforest/rf_metrics'}
    return dsl.ContainerOp(
        name='Randomforest',
        image='mavendevv/rf_telemetry:v.0.2',
        arguments=['--clean_data', clean_data],
        file_outputs=declared_outputs,
    )

In [388]:
def cat_op(clean_data):
    """Create the 'Catboost' pipeline step.

    Args:
        clean_data: Value passed to the container after the ``--clean_data``
            flag.

    Returns:
        A ``dsl.ContainerOp`` declaring the container file
        ``/cat/cat_metrics`` as output ``cat_metrics``.
    """
    declared_outputs = {'cat_metrics': '/cat/cat_metrics'}
    return dsl.ContainerOp(
        name='Catboost',
        image='mavendevv/cat_telemetry2:v.0.2',
        arguments=['--clean_data', clean_data],
        file_outputs=declared_outputs,
    )

In [389]:
def eval_op(rf_metrics, cat_metrics):
    """Create the 'Model Evaluation' pipeline step.

    Args:
        rf_metrics: Value passed to the container after the ``--rf_metrics``
            flag.
        cat_metrics: Value passed to the container after the
            ``--cat_metrics`` flag.

    Returns:
        A ``dsl.ContainerOp`` declaring the container file
        ``/eval/best_model`` as output ``best_model``.
    """
    container_args = [
        '--rf_metrics', rf_metrics,
        '--cat_metrics', cat_metrics,
    ]
    declared_outputs = {'best_model': '/eval/best_model'}
    return dsl.ContainerOp(
        name='Model Evaluation',
        image='mavendevv/eval_telemetry3:v.0.2',
        arguments=container_args,
        file_outputs=declared_outputs,
    )

In [390]:
def export_op(bucket_name, best_model):
    """Create the 'Export Model to MinIO Storage' pipeline step.

    Args:
        bucket_name: Value passed to the container after the
            ``--bucket_name`` flag.
        best_model: Value passed to the container after the ``--best_model``
            flag.

    Returns:
        A ``dsl.ContainerOp`` with no declared file outputs.
    """
    container_args = [
        '--bucket_name', bucket_name,
        '--best_model', best_model,
    ]
    return dsl.ContainerOp(
        name='Export Model to MinIO Storage',
        image='mavendevv/export_telemetry12:v.0.2',
        arguments=container_args,
    )

In [391]:
@dsl.pipeline(
    name='Temperature Prediction in Telemetry Dataset',
    description='An ML reusable pipeline that predicts temperature'
)
def telemetry_pipeline(bucket_name):
    # Pipeline parameter: bucket_name is forwarded unchanged to the export step.
    #
    # Step order wired below:
    #   load -> stats -> schema -> validation -> preprocess
    #        -> (random forest, catboost) -> evaluation -> export
    # Each upstream output is wrapped in dsl.InputArgumentPath before being
    # handed to the next step's factory.

    load_step = load_data_op()

    # Statistics are computed on the loaded data.
    stats_input = dsl.InputArgumentPath(load_step.outputs['data'])
    stats_step = stat_op(stats_input).after(load_step)

    # The schema step consumes the statistics output.
    schema_input = dsl.InputArgumentPath(stats_step.outputs['stats'])
    schema_step = schema_op(schema_input).after(stats_step)

    # Validation consumes both the statistics and the schema.
    validation_step = val_op(
        dsl.InputArgumentPath(stats_step.outputs['stats']),
        dsl.InputArgumentPath(schema_step.outputs['schema']),
    ).after(stats_step, schema_step)

    # Preprocessing reads the loaded data, and is explicitly ordered after
    # validation as well (it takes no output from the validation step).
    preprocess_input = dsl.InputArgumentPath(load_step.outputs['data'])
    preprocess_step = preprocess_op(preprocess_input).after(load_step, validation_step)

    # Both model steps take the same preprocessed data.
    rf_step = rf_op(
        dsl.InputArgumentPath(preprocess_step.outputs['clean_data'])
    ).after(preprocess_step)

    cat_step = cat_op(
        dsl.InputArgumentPath(preprocess_step.outputs['clean_data'])
    ).after(preprocess_step)

    # Evaluation receives the metrics output of each model step.
    evaluation_step = eval_op(
        dsl.InputArgumentPath(rf_step.outputs['rf_metrics']),
        dsl.InputArgumentPath(cat_step.outputs['cat_metrics']),
    ).after(rf_step, cat_step)

    # Export receives the bucket name and the evaluation step's best_model output.
    export_step = export_op(
        bucket_name,
        dsl.InputArgumentPath(evaluation_step.outputs['best_model']),
    ).after(evaluation_step)

In [392]:
# Compile telemetry_pipeline into a YAML pipeline definition written to
# '<experiment_name>.yaml' in the current working directory.
experiment_name = 'telemetry_pipeline'

kfp.compiler.Compiler().compile(
    pipeline_func=telemetry_pipeline,
    package_path=f'{experiment_name}.yaml',
)

In [None]:
# Submit a run of telemetry_pipeline through a default-configured client.
# The pipeline declares a `bucket_name` parameter with no default, so a value
# must be supplied here; an empty arguments dict leaves it unset.
client = kfp.Client()
client.create_run_from_pipeline_func(
    telemetry_pipeline,
    arguments={'bucket_name': 'telemetry'},  # bucket used in this project
)

In [None]:
# Note: s3 bucket_name used in this project is 'telemetry'