## MLOps With Kubeflow Pipelines (Part 1)

In [None]:
# Importing libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import metrics
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split

# Feature matrix (DataFrame) and target (Series) used for model training.
X, y = load_breast_cancer(as_frame=True, return_X_y=True)

# Full dataset bundle, kept around for quick exploratory analysis.
data = load_breast_cancer()

# Flatten the bundle into one DataFrame: feature columns plus a `target` column.
df = pd.DataFrame(data.data, columns=data.feature_names)
df["target"] = pd.Series(data.target)

# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=123,
)


In [None]:
# Preview the first rows of the EDA DataFrame.
# `data` is the Bunch returned by load_breast_cancer() and has no `.head()`;
# the tabular view built from it is `df`.
df.head()

In [None]:
# Print the dimensions of the EDA DataFrame, the feature matrix and the target.
# `data` is a Bunch (no `.shape` attribute), so report the shape of `df` instead.
print(df.shape)
print(X.shape)
print(y.shape)

In [None]:

# Load Dataset and Train Model
@component(
    base_image="northamerica-northeast1-docker.pkg.dev/cio-workbench-image-np-0ddefe/wb-platform/pipelines/kubeflow-pycaret:latest",
    output_component_file="train_model.yaml",
)
def model_training(
    dataset_id: str,
    file_bucket: str,
    save_path: str,
    model: Output[Model],
    metrics: Output[Metrics],
    metricsc: Output[ClassificationMetrics],
    col_list: list
) -> NamedTuple(
    "Outputs",
    [
        ("accuracy", float),  # Return parameters
        ("f1_score", float),
        ("roc_auc", float),
        ("X_y_val_index", list),
        ("model_location", str)
    ],
):
    """Save the trained model's artifacts to GCS and report its metrics.

    The trained model, its creation timestamp and ``col_list`` are pickled
    into ``model_artifacts.pkl``, uploaded to ``file_bucket`` under
    ``breast_cancer_models/``, and ``model.uri`` is pointed at the uploaded
    object.

    Args:
        dataset_id: Not used in the visible part of this component.
        file_bucket: Name of the GCS bucket that receives the model artifacts.
        save_path: Not used in the visible part of this component.
        model: Output model artifact; its ``uri`` is set to the GCS location.
        metrics: Output metrics artifact (not written in the visible part).
        metricsc: Output classification-metrics artifact (not written in the
            visible part).
        col_list: Column list stored alongside the model in the pickle.

    Returns:
        Tuple ``(accuracy, f1_score, roc_auc, X_y_val_index, model_location)``
        where ``model_location`` is the object path inside ``file_bucket``.
    """
    ### Added to model_training component: save model artifacts in GCS bucket

    # Import Libraries (kept inside the function body, next to the code that uses them)
    import pickle
    import logging
    import time
    from datetime import datetime
    from google.cloud import storage

    # NOTE(review): the data-loading / training / evaluation step is not part
    # of this excerpt. It must define `xgb_model`, `accuracy`, `f1_score`,
    # `roc_auc` and `X_val` before the code below runs.

    # define model artifacts and assign elements to the dict
    model_artifacts = {}
    create_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    model_artifacts['create_time'] = create_time
    model_artifacts['model'] = xgb_model
    model_artifacts['col_list'] = col_list

    # create and write model_artifacts.pkl
    # The artifacts are serialized exactly once: a second dump into the same
    # file would only append a duplicate copy that a single pickle.load()
    # never reads.
    with open('model_artifacts.pkl', 'wb') as pkl_file:
        pickle.dump(model_artifacts, pkl_file)

    # create a gcs bucket instance
    storage_client = storage.Client()
    bucket = storage_client.get_bucket(file_bucket)

    # define the folder path where the models will be saved. create one if not found.
    model_path = 'breast_cancer_models/'
    blob = bucket.blob(model_path)
    if not blob.exists(storage_client):
        # an empty object whose name ends in '/' acts as the folder placeholder
        blob.upload_from_string('')

    # set model name and upload 'model_artifacts.pkl' to the folder in gcs bucket
    model_name = 'breast_cancer_models_{}'.format(model_artifacts['create_time'])
    model_location = f'{model_path}{model_name}'
    blob = bucket.blob(model_location)
    blob.upload_from_filename('model_artifacts.pkl')

    print(f"Model artifacts loaded to GCS Bucket: {model_location}")

    #     model.metadata['accuracy'] = accuracy
    #     model.metadata['precision'] = precision
    #     model.metadata['recall'] = recall
    #     model.metadata['f1_score'] = f1_score
    #     model.metadata['auc'] = roc_auc

    # expose the uploaded object to downstream components through the artifact
    model.uri = f'gs://{file_bucket}/{model_location}'

    #     # Log additional model details
    #     with open(model.path, 'w') as output_file:
    #         output_file.write(f'You can enter additional model details here')
    #     output_file.close()

    # NOTE(review): fixed two-minute pause kept from the original; the reason
    # for it is not shown in this excerpt.
    time.sleep(120)

    return (accuracy, f1_score, roc_auc, list(X_val.index), model_location)
    

In [None]:
# import required libraries
import kfp
from kfp import dsl
from kfp.v2.dsl import (Model, Input, Output, component)

# Component for uploading model to Vertex Model Registry
@component(
    base_image="northamerica-northeast1-docker.pkg.dev/cio-workbench-image-np-0ddefe/wb-platform/pipelines/kubeflow-pycaret:latest",
    output_component_file="model-upload.yaml",
)
def upload_model_to_mr(
    model: Input[Model],
    vertex_model: Output[Model],
    model_name: str,
    prediction_image: str,
    col_list: list,
    result: str
):
    """Upload the trained model to the Vertex AI Model Registry.

    The upload only happens when ``result`` equals ``"Pass"``. If a model
    with display name ``model_name`` is already registered, the upload is
    added as a new default version of the most recently updated match;
    otherwise a new model is created.

    Args:
        model: Input model artifact; the directory part of its ``uri`` is
            used as the artifact location for the upload.
        vertex_model: Output artifact; receives the registry resource name
            of the uploaded model.
        model_name: Display name used to look up and register the model.
        prediction_image: Serving container image URI for the uploaded model.
        col_list: Not used by this component.
        result: Training verdict; anything other than ``"Pass"`` skips the
            upload.
    """
    from google.cloud import aiplatform
    import os
    from datetime import datetime

    # NOTE(review): `project_id` and `region` are neither parameters nor
    # defined inside this function, so they must be resolvable when the
    # component body runs. If the component executes in isolation from the
    # notebook globals they will raise NameError -- confirm, and consider
    # passing them in as component parameters.
    aiplatform.init(project=project_id, location=region)

    if result != "Pass":
        print("Training performance is not satisfactory. Upload to the Model Registry revoked.")
        return

    ## check for existing models
    # An explicit emptiness check replaces the former bare `except:`, which
    # relied on an IndexError from `[-1]` and also hid every other failure
    # (permissions, bad arguments, ...) behind a silent "upload as new model".
    existing_models = aiplatform.Model.list(
        filter=f'display_name={model_name}',
        order_by="update_time",
        location=region,
    )

    upload_kwargs = {
        "display_name": model_name,
        "artifact_uri": os.path.dirname(model.uri),
        # `prediction_image` was accepted but never forwarded; the registry
        # entry needs a serving container image to be deployable.
        "serving_container_image_uri": prediction_image,
    }
    if existing_models:
        # if model already exists, update version (last item = most recently updated)
        upload_kwargs["parent_model"] = existing_models[-1].resource_name
        upload_kwargs["is_default_version"] = True
    # if model does not exist, the same call uploads it as a new model
    uploaded_model = aiplatform.Model.upload(**upload_kwargs)

    vertex_model.uri = uploaded_model.resource_name
    vertex_model.version_create_time = datetime.now()
    vertex_model.version_description = "breast cancer model"
        
    