In [None]:
# Authenticate the Colab user so later cells can call Google Cloud APIs.
# This import only resolves inside a Google Colab runtime.
from google.colab import auth
auth.authenticate_user()

In [None]:
# Point the gcloud CLI at the project used by this notebook; the configuration
# cell further down reads the project ID back from this gcloud setting.
!gcloud config set project gcp-ml-specialization-demo

In [None]:
# Interactive login for the gcloud CLI commands issued from this notebook.
# NOTE(review): possibly redundant with the Colab authentication in the first cell — confirm.
!gcloud auth login

In [None]:
!pip3 install google-cloud-aiplatform kfp google_cloud_pipeline_components

In [None]:
PATH=%env PATH
%env PATH={PATH}:/home/jupyter/.local/bin
REGION="us-central1" # you can specify any other location of your choice

# Get projet name
shell_output=!gcloud config get-value project 2> /dev/null
PROJECT_ID=shell_output[0]

# Set bucket name
BUCKET_NAME="gs://"+PROJECT_ID+"-black-friday-sales"

# Create bucket
PIPELINE_ROOT = f"{BUCKET_NAME}/pipeline_root_blackfridaysales/"
PIPELINE_ROOT

SERVICE_ACCOUNT = "296237320026-compute@developer.gserviceaccount.com"


In [None]:
from typing import NamedTuple
import typing
from kfp import dsl
from kfp.dsl import (Artifact,
                        Dataset,
                        Input,
                        Model,
                        Output,
                        Metrics,
                        ClassificationMetrics,
                        component,
                        OutputPath,
                        InputPath)

from kfp import compiler
from google.cloud import bigquery
from google.cloud import aiplatform
from google.cloud.aiplatform import pipeline_jobs
# from google_cloud_pipeline_components import aiplatform as gcc_aip
from google_cloud_pipeline_components.v1.custom_job import utils
import kfp

In [None]:
@component(
  packages_to_install=["pandas", "pyarrow", "scikit-learn==1.3.0"],
  base_image="python:3.10",
  output_component_file="get_blackfriday_data.yaml"
)
def get_blackfriday_data(
  # An input parameter of type str.
  url: str,
  # Use Output[T] to get a metadata-rich handle to the output artifact of type `Dataset`.
  # The artifact already carries the path it must be written to.
  dataset_train: Output[Dataset],
  dataset_test: Output[Dataset]
):
  """Download the sales CSV and write a 70/30 train/test split.

  The `Purchase` column is renamed to `target` (kept as the last column).
  Both splits are written as CSV to `<artifact path>.csv`; downstream
  components read them back using the same `.csv` suffix.

  Args:
    url: Location of the source CSV, readable by `pandas.read_csv`.
    dataset_train: Output artifact receiving the training split.
    dataset_test: Output artifact receiving the test split.
  """
  import pandas as pd
  from sklearn.model_selection import train_test_split

  # Fixed seed so that re-running the pipeline yields the same split.
  SPLIT_SEED = 42

  df_raw = pd.read_csv(url)
  # Build a new frame instead of mutating in place: add `target` at the end,
  # then drop the original `Purchase` column.
  df_sales = df_raw.assign(target=df_raw["Purchase"]).drop(columns=["Purchase"])

  train, test = train_test_split(df_sales, test_size=0.3, random_state=SPLIT_SEED)
  train.to_csv(dataset_train.path + ".csv", index=False)
  test.to_csv(dataset_test.path + ".csv", index=False)

In [None]:
@component(
  packages_to_install=[
      "pandas",
      "scikit-learn==1.3.0"
  ], base_image="python:3.10",
)
def train_blackfriday(
  # Use Input[T] to get a metadata-rich handle to the
  # input artifact of type `Dataset`.
  dataset: Input[Dataset],
  model: Output[Model],
):
  """Fit a random-forest regressor on the training split and pickle it.

  The training CSV is read from `<dataset path>.csv`; every column except
  `target` is used as a feature. The fitted estimator is written to
  `<model path>.pkl`.

  Args:
    dataset: Training split artifact produced by the data component.
    model: Output artifact that receives the pickled estimator.
  """
  import pickle
  import pandas as pd
  from sklearn.ensemble import RandomForestRegressor

  # Fixed seed so that repeated runs on the same data give the same model.
  MODEL_SEED = 42

  data = pd.read_csv(dataset.path + ".csv")
  features = data.drop(columns=["target"])
  labels = data.target

  model_rf = RandomForestRegressor(n_estimators=10, random_state=MODEL_SEED)
  model_rf.fit(features, labels)

  model.metadata["framework"] = "scikit-learn"
  file_name = model.path + ".pkl"
  with open(file_name, 'wb') as file:
      pickle.dump(model_rf, file)

In [None]:
@component(
  packages_to_install = [
      "pandas",
      "xgboost==1.7.1",
      "scikit-learn==1.3.0"
  ], base_image="python:3.10",
)
def blackfriday_evaluation(
  test_set:  Input[Dataset],
  rf_blackfriday_model: Input[Model],
  kpi: Output[Metrics]
)-> NamedTuple("output", [("deploy", str)]):
  """Score the trained model on the test split and log regression metrics.

  Reads `<test_set path>.csv` and the pickled estimator at
  `<model path>.pkl`, then logs MAE, MSE and R_squared to `kpi`.

  Args:
    test_set: Test split artifact; must contain a `target` column.
    rf_blackfriday_model: Model artifact holding the pickled estimator.
    kpi: Metrics artifact that receives the logged values.

  Returns:
    A one-element tuple `(deploy,)`. `deploy` is currently always "true",
    so the deployment condition in the pipeline never blocks a model.
    TODO: compare the metrics against thresholds before approving.
  """
  import pickle

  import pandas as pd
  from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

  data = pd.read_csv(test_set.path + ".csv")
  file_name = rf_blackfriday_model.path + ".pkl"
  # SECURITY: pickle.load executes arbitrary code from the file. This is only
  # acceptable because the artifact is produced by the upstream training step
  # of this same pipeline; never point it at an untrusted file.
  with open(file_name, 'rb') as file:
      model = pickle.load(file)

  X_test = data.drop(columns=["target"])
  y_target = data.target
  y_pred = model.predict(X_test)

  mae = mean_absolute_error(y_target, y_pred)
  mse = mean_squared_error(y_target, y_pred)
  r2 = r2_score(y_target, y_pred)

  # Cast to plain floats before logging the metric values.
  kpi.log_metric("MAE", float(mae))
  kpi.log_metric("MSE", float(mse))
  kpi.log_metric("R_squared", float(r2))

  deploy = "true"
  return (deploy,)

In [None]:
@component(
  packages_to_install=["google-cloud-aiplatform", "scikit-learn==1.3.0", "xgboost==1.7.1", "kfp"],
  base_image="python:3.10",
  output_component_file="model_blackfriday_coponent.yml"
)
def deploy_blackfriday(
  model: Input[Model],
  project: str,
  region: str,
  serving_container_image_uri : str,
  vertex_endpoint: Output[Artifact],
  vertex_model: Output[Model]
):
  """Upload the trained model to Vertex AI and deploy it to an endpoint.

  Reuses the most recently created endpoint named `blackfriday_endpoint`
  when one exists, otherwise creates it. All traffic of that endpoint is
  routed to the newly deployed model.

  Args:
    model: Trained model artifact; the folder containing it is uploaded.
    project: Google Cloud project ID.
    region: Vertex AI location.
    serving_container_image_uri: Container image used to serve the model.
    vertex_endpoint: Output artifact; its URI is set to the endpoint
      resource name.
    vertex_model: Output artifact; its URI is set to the uploaded model's
      resource name.
  """
  from google.cloud import aiplatform
  aiplatform.init(project=project, location=region)

  DISPLAY_NAME  = "blackfridaysales"
  MODEL_NAME = "blackfriday-rf"
  ENDPOINT_NAME = "blackfriday_endpoint"

  def get_or_create_endpoint():
      """Return the newest endpoint named ENDPOINT_NAME, creating it if absent."""
      endpoints = aiplatform.Endpoint.list(
        filter='display_name="{}"'.format(ENDPOINT_NAME),
        order_by='create_time desc',
        project=project,
        location=region,
      )
      if endpoints:
          return endpoints[0]  # most recently created
      return aiplatform.Endpoint.create(
        display_name=ENDPOINT_NAME, project=project, location=region
      )
  endpoint = get_or_create_endpoint()

  # The artifact directory is the model URI without its last path segment.
  # Only the final segment is stripped: a blanket replace("model", "") would
  # also corrupt any bucket or folder name that happens to contain "model".
  artifact_dir = model.uri.rsplit("/", 1)[0] + "/"

  # Import a model programmatically
  model_upload = aiplatform.Model.upload(
      display_name = DISPLAY_NAME,
      artifact_uri = artifact_dir,
      serving_container_image_uri = serving_container_image_uri,
      serving_container_health_route=f"/v1/models/{MODEL_NAME}",
      serving_container_predict_route=f"/v1/models/{MODEL_NAME}:predict",
      serving_container_environment_variables={
          "MODEL_NAME": MODEL_NAME,
      },
  )
  # Model.deploy returns the Endpoint the model was deployed to, not a Model.
  deployed_endpoint = model_upload.deploy(
      machine_type="n1-standard-4",
      endpoint=endpoint,
      traffic_split={"0": 100},  # "0" designates the model being deployed
      deployed_model_display_name=DISPLAY_NAME,
  )

  # Save the resource names to the matching output artifacts.
  vertex_endpoint.uri = deployed_endpoint.resource_name
  vertex_model.uri = model_upload.resource_name

In [None]:
from datetime import datetime

# Second-resolution timestamp (YYYYmmddHHMMSS) used to make the display name unique.
TIMESTAMP = f"{datetime.now():%Y%m%d%H%M%S}"
DISPLAY_NAME = f"pipeline-blackfriday-job{TIMESTAMP}"

In [None]:
@dsl.pipeline(
  # Name of the pipeline; used to determine the pipeline Context.
  name="pipeline-blackfriday",
  # Default artifact root; it can be overridden when the pipeline is submitted.
  pipeline_root=PIPELINE_ROOT,
)
def pipeline(
  url: str = "https://storage.googleapis.com/randomforest-blackfriday/train.csv",
  project: str = PROJECT_ID,
  region: str = REGION,
  display_name: str = DISPLAY_NAME,
  api_endpoint: str = REGION+"-aiplatform.googleapis.com",
  serving_container_image_uri: str = "us-docker.pkg.dev/vertex-ai/prediction/sklearn-cpu.1-3:latest"
):
  # Step 1: download the data and split it into train and test sets.
  split_task = get_blackfriday_data(url=url)

  # Step 2: fit the model on the training split.
  training_task = train_blackfriday(
      dataset=split_task.outputs["dataset_train"],
  )

  # Step 3: score the fitted model on the held-out split.
  # (A metrics threshold argument, so the model is deployed only if its
  # performance is above the threshold, is not wired in yet.)
  evaluation_task = blackfriday_evaluation(
      rf_blackfriday_model=training_task.outputs["model"],
      test_set=split_task.outputs["dataset_test"],
  )

  # Step 4: deploy only when the evaluation step outputs "true".
  with dsl.Condition(
      evaluation_task.outputs["deploy"] == "true",
      name="deploy-blackfriday",
  ):
      deploy_blackfriday(
          model=training_task.outputs["model"],
          project=project,
          region=region,
          serving_container_image_uri=serving_container_image_uri,
      )

# pipeline()

In [None]:
# Compile the pipeline function into the spec file that the PipelineJob cell loads.
pipeline_compiler = compiler.Compiler()
pipeline_compiler.compile(pipeline_func=pipeline, package_path='ml_blackfriday.json')

In [None]:
# Set the default project and location for subsequent Vertex AI SDK calls.
aiplatform.init(project=PROJECT_ID, location=REGION)

In [None]:
# Run Colab authentication again before submitting the pipeline job.
# NOTE(review): same call as in the first cell; presumably refreshes
# credentials — confirm whether it is still needed.
from google.colab import auth
auth.authenticate_user()

In [None]:
# Build the Vertex AI pipeline job from the compiled spec file.
# The job uses a fixed display name; the timestamped DISPLAY_NAME defined
# earlier is not used here.
start_pipeline = pipeline_jobs.PipelineJob(
  display_name="blackfriday-pipeline",
  template_path="ml_blackfriday.json",
  enable_caching=True,
  location=REGION
)

In [None]:
# Run the pipeline job under the configured service account.
start_pipeline.run(service_account=SERVICE_ACCOUNT)

In [None]:
# Load the first 10 rows of a test split written by a pipeline run.
import gcsfs
import pandas as pd

fs = gcsfs.GCSFileSystem()

# NOTE(review): this path is hard-coded to the output of one specific pipeline
# run (project, run name and task ID are embedded in it); update it to the run
# you want to inspect.
data_path = 'gs://gcp-ml-specialization-demo-black-friday-sales/pipeline_root_blackfridaysales/296237320026/pipeline-blackfriday-20240308121252/get-blackfriday-data_-3824834746941177856/dataset_test.csv'
with fs.open(data_path, 'rb') as f:
    test_df = pd.read_csv(f, nrows=10)

In [None]:
# create instances
# Build the prediction payload: one list of feature values per row,
# with the label column removed.
feature_frame = test_df.drop(columns='target')
instances = feature_frame.to_numpy().tolist()

instances

In [None]:
ENDPOINT_ID = !(gcloud ai endpoints list --region=$REGION \
              --format='value(ENDPOINT_ID)'\
              --filter=display_name=$ENDPOINT_NAME \
              --sort-by=creationTimeStamp)

In [None]:
# Send the sample instances to the deployed endpoint and show the predictions.
# NOTE(review): this hard-coded ID overrides the value looked up by the
# previous cell; update it when the endpoint is recreated.
ENDPOINT_ID = '474208369943511040' # the most recent endpoint


# test = instances[0]
# print(test)
aiplatform.init(project=PROJECT_ID, location=REGION)
endpoint = aiplatform.Endpoint(ENDPOINT_ID)
prediction = endpoint.predict(instances)

prediction.predictions