In [None]:
# Install the packages
! pip3 install --user --no-cache-dir --upgrade "kfp>2" "google-cloud-pipeline-components>2" \
                                        google-cloud-aiplatform

In [None]:
import os

# Restart the kernel after the install cell so that the newly installed
# packages are the ones that get imported. Skipped when the IS_TESTING
# environment variable is set to a non-empty value.
if not os.getenv("IS_TESTING"):
    import IPython

    # do_shutdown(True) asks the kernel to shut down with restart=True.
    IPython.Application.instance().kernel.do_shutdown(True)

In [None]:
import kfp
import typing
from typing import Dict
from typing import NamedTuple
from kfp import dsl
from kfp.dsl import (Artifact,
                        Dataset,
                        Input,
                        Model,
                        Output,
                        Metrics,
                        ClassificationMetrics,
                        component,
                        OutputPath,
                        InputPath)
import google.cloud.aiplatform as aip

### Pipeline config

In [None]:
# The Google Cloud project that this pipeline runs in.
PROJECT_ID = "assignment1-402316"
# The region that this pipeline runs in. Also used as the prefix of the
# `<region>-docker.pkg.dev` image host in the component definitions.
REGION = "us-central1"
# A Cloud Storage URI that the pipeline's service account can access.
# The artifacts of pipeline runs are stored under this pipeline root.
PIPELINE_ROOT = "gs://temp_de2023_group1"
# Image registry location: the repository segment of the component image URIs
# (`<REGION>-docker.pkg.dev/<PROJECT_ID>/<IMAGE_REG>/<image>:<tag>`).
IMAGE_REG = "image-repo-group1"

In [None]:
@dsl.container_component
def toxic_data_ingestion(project: str, bucket: str, data_file_name: str,  features: Output[Artifact]):
    """Container component that runs the ``toxic-data-ingestor:0.0.1`` image.

    The container executes ``/pipelines/component/src/component.py`` with the
    arguments listed below. NOTE(review): what the script does with them is
    defined inside the image, not in this notebook; presumably it reads
    ``data_file_name`` from ``bucket`` and writes the result to ``features``.
    Confirm against the component source.

    Args:
        project: Forwarded to the script as ``--project_id``.
        bucket: Forwarded to the script as ``--bucket``.
        data_file_name: Forwarded to the script as ``--file_name``.
        features: Output artifact; its path is forwarded as ``--feature_path``.

    Returns:
        The ``dsl.ContainerSpec`` describing image, command and arguments.
    """
    # Image URI is assembled from the notebook-level pipeline configuration.
    image_uri = f'{REGION}-docker.pkg.dev/{PROJECT_ID}/{IMAGE_REG}/toxic-data-ingestor:0.0.1'
    entrypoint = ['python3', '/pipelines/component/src/component.py']
    cli_args = [
        '--project_id', project,
        '--bucket', bucket,
        '--file_name', data_file_name,
        '--feature_path', features.path,
    ]
    return dsl.ContainerSpec(image=image_uri, command=entrypoint, args=cli_args)

In [None]:
@dsl.container_component
def toxic_data_cleaning(features: Input[Artifact], X_dtm: Output[Artifact], y: Output[Artifact]):
    """Container component that runs the ``toxic-data-cleaner:0.0.1`` image.

    The container executes ``/pipelines/component/src/component.py`` and is
    given only the path of the input artifact.

    NOTE(review): ``X_dtm`` and ``y`` are declared as outputs but their paths
    are never passed to the container, so the script has no way of learning
    where to write them from these arguments. Confirm whether the cleaner's
    CLI should also receive the two output paths.

    Args:
        features: Input artifact; its path is forwarded as ``--dataset``.
        X_dtm: Output artifact; declared but not referenced in the spec.
        y: Output artifact; declared but not referenced in the spec.

    Returns:
        The ``dsl.ContainerSpec`` describing image, command and arguments.
    """
    # Image URI is assembled from the notebook-level pipeline configuration.
    image_uri = f'{REGION}-docker.pkg.dev/{PROJECT_ID}/{IMAGE_REG}/toxic-data-cleaner:0.0.1'
    entrypoint = ['python3', '/pipelines/component/src/component.py']
    cli_args = ['--dataset', features.path]
    return dsl.ContainerSpec(image=image_uri, command=entrypoint, args=cli_args)

In [None]:
@dsl.container_component
def multilabel_classifier(project: str, features: Input[Artifact], model_bucket: str,  metrics: OutputPath(str)):
    """Container component that runs the ``toxic-multilabel-trainer:0.0.1`` image.

    The container executes ``/pipelines/component/src/component.py`` with the
    arguments listed below. NOTE(review): the training and upload logic is
    defined inside the image, not in this notebook; presumably the script
    stores the trained model in ``model_bucket``. Confirm against the
    component source.

    Args:
        project: Forwarded to the script as ``--project_id``.
        features: Input artifact; its path is forwarded as ``--feature_path``.
        model_bucket: Forwarded to the script as ``--model_repo``.
        metrics: Output path (string-typed); forwarded as ``--metrics_path``.

    Returns:
        The ``dsl.ContainerSpec`` describing image, command and arguments.
    """
    # Image URI is assembled from the notebook-level pipeline configuration.
    image_uri = f'{REGION}-docker.pkg.dev/{PROJECT_ID}/{IMAGE_REG}/toxic-multilabel-trainer:0.0.1'
    entrypoint = ['python3', '/pipelines/component/src/component.py']
    cli_args = [
        '--project_id', project,
        '--feature_path', features.path,
        '--model_repo', model_bucket,
        '--metrics_path', metrics,
    ]
    return dsl.ContainerSpec(image=image_uri, command=entrypoint, args=cli_args)