In [1]:
!pip install -U google-cloud-aiplatform "shapely<2"



In [1]:
PROJECT_ID = 'dt-tu-sandbox-dev'
BUCKET_URI = 'gs://x-demos-us'
REGION = "us-central1"

In [2]:
from kfp import compiler, dsl
from kfp.dsl import pipeline, importer_node, Input, component, Metrics, Output, Artifact, Dataset

from google.cloud import aiplatform
from google_cloud_pipeline_components.v1 import dataset, model, custom_job, batch_predict_job
from google_cloud_pipeline_components.types import artifact_types
from datetime import datetime

In [3]:
!gsutil mb -l $REGION -p $PROJECT_ID $BUCKET_URI 

Creating gs://x-demos-us/...
ServiceException: 409 A Cloud Storage bucket named 'x-demos-us' already exists. Try another name. Bucket names must be globally unique across all Google Cloud projects, including those outside of your organization.


In [4]:
PATH=%env PATH
%env PATH={PATH}:/home/jupyter/.local/bin

PIPELINE_ROOT = f"{BUCKET_URI}/pipeline_root/"
PIPELINE_ROOT

env: PATH=/usr/local/cuda/bin:/opt/conda/bin:/opt/conda/condabin:/usr/local/bin:/usr/bin:/bin:/usr/local/games:/usr/games:/home/jupyter/.local/bin


'gs://ovo-demos-us/pipeline_root/'

In [5]:
!mkdir traincontainer
!mkdir traincontainer/trainer

In [6]:
%%writefile traincontainer/Dockerfile
FROM gcr.io/deeplearning-platform-release/sklearn-cpu.0-23
WORKDIR /

# Copies the trainer code to the docker image.
COPY trainer /trainer

RUN pip install sklearn google-cloud-bigquery joblib pandas google-cloud-storage

# Sets up the entry point to invoke the trainer.
ENTRYPOINT ["python", "-m", "trainer.train"]

Writing traincontainer/Dockerfile


In [7]:
%%writefile traincontainer/trainer/train.py
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_curve
from sklearn.model_selection import train_test_split
from google.cloud import bigquery
from google.cloud import storage
from joblib import dump

import os
import pandas as pd

bqclient = bigquery.Client()
storage_client = storage.Client()

def download_table(bq_table_uri: str):
    prefix = "bq://"
    if bq_table_uri.startswith(prefix):
        bq_table_uri = bq_table_uri[len(prefix):]

    table = bigquery.TableReference.from_string(bq_table_uri)
    rows = bqclient.list_rows(
        table,
    )
    return rows.to_dataframe(create_bqstorage_client=False)

# These environment variables are from Vertex AI managed datasets
training_data_uri = os.environ["AIP_TRAINING_DATA_URI"]
test_data_uri = os.environ["AIP_TEST_DATA_URI"]

# Download data into Pandas DataFrames, split into train / test
df = download_table(training_data_uri)
test_df = download_table(test_data_uri)
labels = df.pop("Class").tolist()
data = df.values.tolist()
test_labels = test_df.pop("Class").tolist()
test_data = test_df.values.tolist()

# Define and train the Scikit model
skmodel = DecisionTreeClassifier()
skmodel.fit(data, labels)
score = skmodel.score(test_data, test_labels)
print('accuracy is:',score)

# Save the model to a local file
dump(skmodel, "model.joblib")

# Upload the saved model file to GCS
bucket = storage_client.get_bucket("YOUR_GCS_BUCKET")
model_directory = os.environ["AIP_MODEL_DIR"]
storage_path = os.path.join(model_directory, "model.joblib")
blob = storage.blob.Blob.from_string(storage_path, client=storage_client)
blob.upload_from_filename("model.joblib")

Writing traincontainer/trainer/train.py


In [8]:
BUCKET = BUCKET_URI[5:] # Trim the 'gs://' before adding to train script
!sed -i -r 's@YOUR_GCS_BUCKET@'"$BUCKET"'@' traincontainer/trainer/train.py

In [9]:
IMAGE_URI=f"gcr.io/{PROJECT_ID}/scikit:v1"

In [10]:
!docker build ./traincontainer -t $IMAGE_URI

Sending build context to Docker daemon   5.12kB
Step 1/5 : FROM gcr.io/deeplearning-platform-release/sklearn-cpu.0-23
 ---> 834b2407ec81
Step 2/5 : WORKDIR /
 ---> Using cache
 ---> b19575cc807b
Step 3/5 : COPY trainer /trainer
 ---> 86aa09b559b7
Step 4/5 : RUN pip install sklearn google-cloud-bigquery joblib pandas google-cloud-storage
 ---> Running in 64b87f88d232
Collecting sklearn
  Downloading sklearn-0.0.post11.tar.gz (3.6 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'error'
[91m  error: subprocess-exited-with-error
  
  × python setup.py egg_info did not run successfully.
  │ exit code: 1
  ╰─> [18 lines of output]
      The 'sklearn' PyPI package is deprecated, use 'scikit-learn'
      rather than 'sklearn' for pip commands.
      
      Here is how to fix this error in the main use cases:
      - use 'pip install scikit-learn' rather than 'pip install sklearn'
      - replace 'sklearn' by 'scikit-learn' in your pip require

In [11]:
!docker push $IMAGE_URI

The push refers to repository [gcr.io/dt-tu-sandbox-dev/scikit]

[1Bb21ef996: Preparing 
[1Bf5655919: Preparing 
[1B306932b9: Preparing 
[1Bb913d7bd: Preparing 
[1Bce19c9dd: Preparing 
[1Bfaab67e8: Preparing 
[1B2b5f8a22: Preparing 
[1B6cf8673e: Preparing 
[1B2cf21221: Preparing 
[1B24722d3f: Preparing 
[1Bb9d4bbb9: Preparing 
[1B0533bd85: Preparing 
[1Be1652f88: Preparing 
[1B17ce4d28: Preparing 
[1B70dfb97d: Preparing 
[1B63ac1279: Preparing 
[1B158f7ef1: Preparing 
[1B07a648ba: Preparing 
[1B75d94c35: Preparing 
[1Bbf18a086: Preparing 
[1Bfc16734e: Preparing 
[3Bbf18a086: Preparing 
[1Bd63887b6: Preparing 
[1Bd3c0b431: Preparing 
[1B82bdeb5f: Layer already exists [21A[2K[20A[2K[19A[2K[15A[2K[11A[2K[10A[2K[9A[2K[5A[2K[4A[2K[3A[2Kv1: digest: sha256:c0768ae21cd40547ea9769e74498d360ec9b6ebe7b3018347008dc0e0e8f9a31 size: 5755


In [12]:
%%writefile batch_examples.csv
Area,Perimeter,MajorAxisLength,MinorAxisLength,AspectRation,Eccentricity,ConvexArea,EquivDiameter,Extent,Solidity,roundness,Compactness,ShapeFactor1,ShapeFactor2,ShapeFactor3,ShapeFactor4
23288,558.113,207.567738,143.085693,1.450653336,0.7244336162,23545,172.1952453,0.8045881703,0.9890847314,0.9395021523,0.8295857874,0.008913077034,0.002604069884,0.6882125787,0.9983578734
23689,575.638,205.9678003,146.7475015,1.403552348,0.7016945718,24018,173.6714472,0.7652721693,0.9863019402,0.8983750474,0.8431970773,0.00869465998,0.002711119968,0.7109813112,0.9978994889
23727,559.503,189.7993849,159.3717704,1.190922235,0.5430731512,24021,173.8106863,0.8037601626,0.9877607094,0.952462433,0.9157600082,0.007999299741,0.003470231343,0.8386163926,0.9987269085
31158,641.105,212.0669751,187.1929601,1.132879009,0.4699241567,31474,199.1773023,0.7813134733,0.989959967,0.9526231013,0.9392188582,0.0068061806,0.003267009878,0.8821320637,0.9993488983
32514,649.012,221.4454899,187.1344232,1.183349841,0.5346736437,32843,203.4652564,0.7849831,0.9899826447,0.9700068737,0.9188051492,0.00681077351,0.002994124691,0.8442029022,0.9989873701
33078,659.456,235.5600775,178.9312328,1.316483846,0.6503915309,33333,205.2223615,0.7877214708,0.9923499235,0.9558229607,0.8712102818,0.007121351881,0.002530662194,0.7590073551,0.9992209221
33680,683.09,256.203255,167.9334938,1.525623324,0.7552213942,34019,207.081404,0.80680321,0.9900349805,0.9070392732,0.8082699962,0.007606985006,0.002002710402,0.6533003868,0.9966903078
33954,716.75,277.3684803,156.3563259,1.773951126,0.825970469,34420,207.9220419,0.7994819873,0.9864613597,0.8305492781,0.7496238998,0.008168948587,0.001591181142,0.5619359911,0.996846984
36322,719.437,272.0582306,170.8914975,1.591993952,0.7780978465,36717,215.0502424,0.7718560075,0.9892420405,0.8818487005,0.7904566678,0.007490177594,0.001803782407,0.6248217437,0.9947124371
36675,742.917,285.8908964,166.8819538,1.713132487,0.8119506999,37613,216.0927123,0.7788277766,0.9750618137,0.8350248381,0.7558572692,0.0077952528,0.001569528272,0.5713202115,0.9787472145
37454,772.679,297.6274753,162.1493177,1.835514817,0.8385619338,38113,218.3756257,0.8016695205,0.9827093118,0.7883332637,0.7337213257,0.007946480356,0.001420623993,0.5383469838,0.9881438654
37789,766.378,313.5680678,154.3409867,2.031657789,0.8704771226,38251,219.3500608,0.7805870567,0.9879218844,0.8085170916,0.6995293312,0.008297866252,0.001225659709,0.4893412853,0.9941740339
47883,873.536,327.9986493,186.5201272,1.758516115,0.822571799,48753,246.9140116,0.7584464543,0.9821549443,0.7885506623,0.7527897207,0.006850002074,0.00135695419,0.5666923636,0.9965376533
49777,861.277,300.7570338,211.6168613,1.42123379,0.7105823885,50590,251.7499649,0.8019106536,0.9839296304,0.843243269,0.8370542883,0.00604208839,0.001829706116,0.7006598815,0.9958014989
49882,891.505,357.1890036,179.8346914,1.986207449,0.8640114945,51042,252.0153467,0.7260210171,0.9772736178,0.7886896753,0.7055518063,0.007160679276,0.001094585314,0.4978033513,0.9887407248
53249,919.923,325.3866286,208.9174205,1.557489212,0.7666552108,54195,260.3818974,0.6966846347,0.9825445152,0.7907120655,0.8002231025,0.00611066177,0.001545654241,0.6403570138,0.9973491406
61129,964.969,369.3481688,210.9473449,1.750902193,0.8208567513,61796,278.9836198,0.7501135067,0.9892064211,0.8249553283,0.7553404711,0.006042110436,0.001213219664,0.5705392272,0.9989583843
61918,960.372,353.1381442,224.0962377,1.575832543,0.7728529173,62627,280.7782864,0.7539207091,0.9886790043,0.8436218213,0.7950947556,0.005703319619,0.00140599258,0.6321756704,0.9962029945
141953,1402.05,524.2311633,346.3974998,1.513380332,0.7505863011,143704,425.1354762,0.7147107987,0.9878152313,0.9074598849,0.8109694843,0.003692991084,0.0009853172185,0.6576715044,0.9953071199
145285,1440.991,524.9567463,353.0769977,1.486805285,0.7400216694,146709,430.0960442,0.7860466375,0.9902937107,0.8792413513,0.8192980608,0.003613289371,0.001004269363,0.6712493125,0.9980170255
146153,1476.383,526.1933264,356.528288,1.475881001,0.7354662103,149267,431.3789276,0.7319360978,0.9791380546,0.8425962592,0.8198107159,0.003600290972,0.001003163512,0.6720896099,0.991924286

Writing batch_examples.csv


In [13]:
!gsutil cp batch_examples.csv $BUCKET_URI

Copying file://batch_examples.csv [Content-Type=text/csv]...
/ [1 files][  4.0 KiB/  4.0 KiB]                                                
Operation completed over 1 objects/4.0 KiB.                                      


In [3]:
@component(
    base_image="python:3.10",
    packages_to_install=["google-cloud-aiplatform==1.36.4","google-cloud-pipeline-components==2.6.0"]
)
def custom_training_job(
    project: str,
    location: str,
    staging_bucket: str,
    display_name: str,
    container_uri: str,
    model_serving_container_image_uri: str,
    dataset: Input[artifact_types.VertexDataset],
    model_display_name: str,
    bigquery_destination: str,
    model: Output[Artifact],
    metrics: Output[Metrics],
    replica_count: int = 1,
    machine_type: str = "n1-standard-4",
    training_fraction_split: float = 0.8,
    validation_fraction_split: float = 0.1,
    test_fraction_split: float = 0.1,
    sync: bool = True,
    
):
    
    from google.cloud import aiplatform
    
    aiplatform.init(project=project, location=location, staging_bucket=staging_bucket)
    
    job = aiplatform.CustomContainerTrainingJob(
        display_name=display_name,
        container_uri=container_uri,
        model_serving_container_image_uri=model_serving_container_image_uri,
    )
    
    vertex_dataset = aiplatform.TabularDataset(dataset_name=dataset.metadata["resourceName"])
    
    uploaded_model = job.run(
        dataset=vertex_dataset,
        model_display_name=model_display_name,
        replica_count=replica_count,
        machine_type=machine_type,
        training_fraction_split=training_fraction_split,
        validation_fraction_split=validation_fraction_split,
        test_fraction_split=test_fraction_split,
        sync=sync,
        bigquery_destination=bigquery_destination
    )
    
    resource_name = f"{uploaded_model.resource_name}@{uploaded_model.version_id}"
    model.metadata["resourceName"] = resource_name
    model.metadata["containerSpec"] = {"imageUri": model_serving_container_image_uri}
    model.uri = uploaded_model.uri
    model.TYPE_NAME = "google.VertexModel"
    

## Define pipeline

In [4]:
@pipeline(name="test-pipeline")
def pipeline(
    bq_source: str = "bq://sara-vertex-demos.beans_demo.large_dataset",
    bucket: str = BUCKET_URI,
    project: str = PROJECT_ID,
    gcp_region: str = REGION,
    bq_dest: str = "",
    container_uri: str = "",
    batch_destination: str = ""
):
    dataset_create_op = dataset.TabularDatasetCreateOp(
        display_name="tabular-beans-dataset",
        bq_source=bq_source,
        project=project,
        location=gcp_region
    )
    
    training_op = custom_training_job(
        project=project,
        location=gcp_region,
        staging_bucket=bucket,
        display_name="pipeline-beans-custom-train",
        container_uri=container_uri,
        model_serving_container_image_uri="us-docker.pkg.dev/vertex-ai/prediction/sklearn-cpu.0-24:latest",
        dataset=dataset_create_op.outputs["dataset"],
        model_display_name="scikit-beans-model-pipeline",
        bigquery_destination=bq_dest,
    )
   

    batch_predict_op = batch_predict_job.ModelBatchPredictOp(
        project=project,
        location=gcp_region,
        job_display_name="beans-batch-predict",
        model=training_op.outputs["model"],
        gcs_source_uris=["{0}/batch_examples.csv".format(BUCKET_URI)],
        instances_format="csv",
        gcs_destination_output_uri_prefix=batch_destination,
        machine_type="n1-standard-4"
    )

In [5]:
compiler.Compiler().compile(
    pipeline_func=pipeline, package_path="custom_train_pipeline.json"
)

In [6]:
TIMESTAMP = datetime.now().strftime("%Y%m%d%H%M%S")

pipeline_job = aiplatform.PipelineJob(
    display_name="custom-train-pipeline",
    template_path="custom_train_pipeline.json",
    job_id="custom-train-pipeline-{0}".format(TIMESTAMP),
    parameter_values={
        "project": PROJECT_ID,
        "bucket": BUCKET_URI,
        "bq_dest": "bq://{0}".format(PROJECT_ID),
        "container_uri": "gcr.io/{0}/scikit:v1".format(PROJECT_ID),
        "batch_destination": "{0}/batchpredresults".format(BUCKET_URI),
        "gcp_region":REGION
    },
    enable_caching=True,
)

In [7]:
pipeline_job.submit()

Creating PipelineJob
PipelineJob created. Resource name: projects/435046587974/locations/us-central1/pipelineJobs/custom-train-pipeline-20231122170729
To use this PipelineJob in another session:
pipeline_job = aiplatform.PipelineJob.get('projects/435046587974/locations/us-central1/pipelineJobs/custom-train-pipeline-20231122170729')
View Pipeline Job:
https://console.cloud.google.com/vertex-ai/locations/us-central1/pipelines/runs/custom-train-pipeline-20231122170729?project=435046587974
