In [1]:
# Step 2 — Verify environment (Workbench)

import sys, subprocess, os

def pip_install(pkgs):
    """Install or upgrade packages using the running interpreter's pip.

    Runs ``<sys.executable> -m pip install -U <requirements...>``.

    Args:
        pkgs: A single requirement specifier string (e.g. ``"numpy<2"``) or an
            iterable of such strings.

    Raises:
        subprocess.CalledProcessError: If pip exits with a non-zero status.
    """
    # A bare string would otherwise be unpacked into one argument per character.
    requirements = [pkgs] if isinstance(pkgs, str) else list(pkgs)
    if not requirements:
        # pip refuses an install command with no requirements; nothing to do.
        return
    subprocess.check_call([sys.executable, "-m", "pip", "install", "-U", *requirements])

# Install the SDKs if needed (safe to re-run).
# The aiplatform and kfp pins match the versions printed by the recorded run of
# this cell, so a re-run resolves the same SDK releases rather than whatever is
# newest at that moment.
pip_install([
    "google-cloud-aiplatform==1.138.0",
    "kfp==2.15.2",
    "google-cloud-pipeline-components",  # left unpinned: its version is not recorded in this notebook
    "numpy<2",
])

# Basic imports check
import google.cloud.aiplatform as aiplatform
import kfp
from kfp import dsl
from kfp.dsl import component

# Report the interpreter and SDK versions that were actually imported.
for label, version in (
    ("Python:", sys.version.split()[0]),
    ("aiplatform:", aiplatform.__version__),
    ("kfp:", kfp.__version__),
):
    print(label, version)





Python: 3.10.19
aiplatform: 1.138.0
kfp: 2.15.2


In [2]:
# Step 3 — Initialize Vertex AI SDK

# Fill these in:
PROJECT_ID = "vertex-ai-487907"
LOCATION = "us-central1"
STAGING_BUCKET = "gs://vertex-mlops-vinzur"
# Passed as `pipeline_root` when the pipeline job is submitted.
PIPELINE_ROOT = f"{STAGING_BUCKET}/phase2-artifact-io"
# Built from PROJECT_ID so changing the project cannot leave a stale account email.
SERVICE_ACCOUNT = f"vertex-pipeline-sa@{PROJECT_ID}.iam.gserviceaccount.com"

# Point the SDK at the project, region and staging bucket configured above.
aiplatform.init(
    staging_bucket=STAGING_BUCKET,
    location=LOCATION,
    project=PROJECT_ID,
)

# Echo the effective settings so they are visible in the cell output.
print("Initialized Vertex AI with:")
for label, value in (
    ("PROJECT_ID:", PROJECT_ID),
    ("LOCATION:", LOCATION),
    ("STAGING_BUCKET:", STAGING_BUCKET),
    ("PIPELINE_ROOT:", PIPELINE_ROOT),
    ("SERVICE_ACCOUNT:", SERVICE_ACCOUNT),
):
    print(label, value)

Initialized Vertex AI with:
PROJECT_ID: vertex-ai-487907
LOCATION: us-central1
STAGING_BUCKET: gs://vertex-mlops-vinzur
PIPELINE_ROOT: gs://vertex-mlops-vinzur/phase2-artifact-io
SERVICE_ACCOUNT: vertex-pipeline-sa@vertex-ai-487907.iam.gserviceaccount.com


In [3]:
# Step 3 (cont.) — Define lightweight components with artifact I/O

from kfp import dsl, compiler
from kfp.dsl import component, Dataset, Model, Metrics, Output, Input
from google.cloud import aiplatform
import time

@component(
    base_image="python:3.10-slim",
    # scikit-learn is never imported by this step, so it is not installed here.
    packages_to_install=["pandas", "numpy<2"],
)
def preprocess(output_data: Output[Dataset], n_rows: int = 200, seed: int = 42):
    """Generate a synthetic binary-classification dataset and write it as CSV.

    Two features (``x1``, ``x2``) are drawn from a standard normal distribution.
    The label ``y`` is 1 where ``x1 + 0.5 * x2`` plus Gaussian noise (scale 0.3)
    is positive, and 0 otherwise.

    Args:
        output_data: Output dataset artifact; the CSV is written to its ``path``.
        n_rows: Number of rows to generate. Must be at least 1.
        seed: Seed for the NumPy random generator.

    Raises:
        ValueError: If ``n_rows`` is less than 1.
    """
    import numpy as np
    import pandas as pd

    # Fail here with a clear message instead of handing an empty CSV downstream.
    if n_rows < 1:
        raise ValueError(f"n_rows must be >= 1, got {n_rows}")

    rng = np.random.default_rng(seed)
    # Draw order (x1, x2, then noise) is kept fixed so a given seed is reproducible.
    x1 = rng.normal(size=n_rows)
    x2 = rng.normal(size=n_rows)
    y = (x1 + 0.5 * x2 + rng.normal(scale=0.3, size=n_rows) > 0).astype(int)

    df = pd.DataFrame({"x1": x1, "x2": x2, "y": y})
    df.to_csv(output_data.path, index=False)

    print(f"Wrote dataset to: {output_data.path}")
    print(df.head())


@component(
    packages_to_install=["pandas", "scikit-learn", "joblib", "numpy<2"],
    base_image="python:3.10-slim",
)
def train(input_data: Input[Dataset], output_model: Output[Model]):
    """Fit a logistic-regression classifier on a CSV dataset and save it.

    Args:
        input_data: Dataset artifact whose ``path`` is a CSV with columns
            ``x1``, ``x2`` and ``y``.
        output_model: Model artifact; the fitted estimator is serialized with
            joblib to its ``path``.
    """
    import joblib
    import pandas as pd
    from sklearn.linear_model import LogisticRegression

    frame = pd.read_csv(input_data.path)
    features, labels = frame[["x1", "x2"]], frame["y"]

    classifier = LogisticRegression()
    classifier.fit(features, labels)

    # Persist the estimator at the location provided by the output artifact.
    joblib.dump(classifier, output_model.path)

    print(f"Read dataset from: {input_data.path}")
    print(f"Saved model to: {output_model.path}")


@component(
    packages_to_install=["pandas", "scikit-learn", "joblib", "numpy<2"],
    base_image="python:3.10-slim",
)
def evaluate(input_data: Input[Dataset], input_model: Input[Model], metrics: Output[Metrics]):
    """Score a saved classifier on a CSV dataset and log its accuracy.

    Note: ``phase2_pipeline`` in this notebook passes the same dataset to both
    ``train`` and ``evaluate``, so the logged value is accuracy on the training data.

    Args:
        input_data: Dataset artifact whose ``path`` is a CSV with columns
            ``x1``, ``x2`` and ``y``.
        input_model: Model artifact whose ``path`` holds a joblib-serialized estimator.
        metrics: Metrics artifact that receives the ``accuracy`` value.
    """
    import joblib
    import pandas as pd
    from sklearn.metrics import accuracy_score

    frame = pd.read_csv(input_data.path)
    features, labels = frame[["x1", "x2"]], frame["y"]

    classifier = joblib.load(input_model.path)
    predictions = classifier.predict(features)

    # Cast to a plain Python float before logging the metric.
    acc = float(accuracy_score(labels, predictions))
    metrics.log_metric("accuracy", acc)

    print(f"Loaded model from: {input_model.path}")
    print(f"Accuracy: {acc}")


In [4]:
# Step 4 — Define the pipeline wiring

@dsl.pipeline(
    description="Phase 2: lightweight components + Dataset/Model/Metrics artifacts",
    name="phase2-artifact-io",
)
def phase2_pipeline(n_rows: int = 200):
    """Chain preprocess -> train -> evaluate, handing artifacts between the steps.

    Args:
        n_rows: Forwarded to ``preprocess`` as its ``n_rows`` argument.
    """
    prep_step = preprocess(n_rows=n_rows)
    dataset = prep_step.outputs["output_data"]

    fit_step = train(input_data=dataset)

    # Nothing consumes the evaluate task afterwards, so it is not bound to a name.
    evaluate(
        input_data=dataset,
        input_model=fit_step.outputs["output_model"],
    )

In [5]:
# Step 5 — Compile to YAML

PIPELINE_YAML = "phase2_artifact_io.yaml"

# Write the pipeline definition to a local YAML file; the submit step loads it by path.
pipeline_compiler = compiler.Compiler()
pipeline_compiler.compile(pipeline_func=phase2_pipeline, package_path=PIPELINE_YAML)

# Last expression of the cell: shows the output file name.
PIPELINE_YAML

'phase2_artifact_io.yaml'

In [6]:
# Step 6 — Submit pipeline run to Vertex

# PROJECT_ID, LOCATION, STAGING_BUCKET, PIPELINE_ROOT and SERVICE_ACCOUNT are
# reused from the configuration cell earlier in this notebook.

aiplatform.init(staging_bucket=STAGING_BUCKET, location=LOCATION, project=PROJECT_ID)

# The current Unix time (whole seconds) is appended to the display name.
run_display_name = f"phase2-artifact-io-{int(time.time())}"

job = aiplatform.PipelineJob(
    template_path=PIPELINE_YAML,
    display_name=run_display_name,
    pipeline_root=PIPELINE_ROOT,
    parameter_values={"n_rows": 200},
    enable_caching=False,
)

# Run synchronously under the pipeline service account, then report the job state.
job.run(sync=True, service_account=SERVICE_ACCOUNT)
print("State:", job.state)

Creating PipelineJob
PipelineJob created. Resource name: projects/208722280565/locations/us-central1/pipelineJobs/phase2-artifact-io-20260220065227
To use this PipelineJob in another session:
pipeline_job = aiplatform.PipelineJob.get('projects/208722280565/locations/us-central1/pipelineJobs/phase2-artifact-io-20260220065227')
View Pipeline Job:
https://console.cloud.google.com/vertex-ai/locations/us-central1/pipelines/runs/phase2-artifact-io-20260220065227?project=208722280565
PipelineJob projects/208722280565/locations/us-central1/pipelineJobs/phase2-artifact-io-20260220065227 current state:
PipelineState.PIPELINE_STATE_RUNNING
PipelineJob projects/208722280565/locations/us-central1/pipelineJobs/phase2-artifact-io-20260220065227 current state:
PipelineState.PIPELINE_STATE_RUNNING
PipelineJob projects/208722280565/locations/us-central1/pipelineJobs/phase2-artifact-io-20260220065227 current state:
PipelineState.PIPELINE_STATE_RUNNING
PipelineJob projects/208722280565/locations/us-centra