### Make sure to first upload the Iris.csv file to your data bucket in cloud storage to be able to run this file

And if you want to do the batch prediction, also add the iris_batch.csv file to that data bucket. Or another batch file you want to use

In [1]:
# Install the packages
! pip3 install --user --no-cache-dir --upgrade "kfp>2" "google-cloud-pipeline-components>2" \
                                        google-cloud-aiplatform

Collecting kfp>2
  Downloading kfp-2.9.0.tar.gz (595 kB)
     ------------------------------------- 595.6/595.6 kB 20.8 MB/s eta 0:00:00
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Collecting google-cloud-pipeline-components>2
  Downloading google_cloud_pipeline_components-2.17.0-py3-none-any.whl.metadata (5.9 kB)
Collecting google-cloud-aiplatform


ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
anaconda-project 0.9.1 requires ruamel-yaml, which is not installed.
jupyter-server 1.4.1 requires pyzmq>=17, which is not installed.
notebook 6.3.0 requires pyzmq>=17, which is not installed.
spyder 4.2.5 requires pyqt5<5.13, which is not installed.
spyder 4.2.5 requires pyqtwebengine<5.13, which is not installed.
spyder 4.2.5 requires pyzmq>=17, which is not installed.
tensorflow-intel 2.13.0 requires typing-extensions<4.6.0,>=3.6.6, but you have typing-extensions 4.7.1 which is incompatible.
sphinx 4.0.1 requires Jinja2<3.0,>=2.3, but you have jinja2 3.1.4 which is incompatible.
sphinx 4.0.1 requires MarkupSafe<2.0, but you have markupsafe 2.1.5 which is incompatible.


  Downloading google_cloud_aiplatform-1.70.0-py2.py3-none-any.whl.metadata (32 kB)
Collecting docstring-parser<1,>=0.7.3 (from kfp>2)
  Downloading docstring_parser-0.16-py3-none-any.whl.metadata (3.0 kB)
Collecting google-api-core!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.0,<3.0.0dev,>=1.31.5 (from kfp>2)
  Downloading google_api_core-2.21.0-py3-none-any.whl.metadata (2.8 kB)
Collecting google-cloud-storage<3,>=2.2.1 (from kfp>2)
  Downloading google_cloud_storage-2.18.2-py2.py3-none-any.whl.metadata (9.1 kB)
Collecting kfp-pipeline-spec==0.4.0 (from kfp>2)
  Downloading kfp_pipeline_spec-0.4.0-py3-none-any.whl.metadata (301 bytes)
Collecting kfp-server-api<2.4.0,>=2.1.0 (from kfp>2)
  Downloading kfp_server_api-2.3.0.tar.gz (84 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Collecting kubernetes<31,>=8.0.0 (from kfp>2)
  Downloading kubernetes-30.1.0-py2.py3-none-any.whl.metadata (1.5 kB)
Collecting requests-toolbelt<1,>=0.8.0 (from kfp>2

In [2]:
import os

if not os.getenv("IS_TESTING"):
    # Automatically restart kernel after installs
    import IPython

    app = IPython.Application.instance()
    app.kernel.do_shutdown(True)

In [3]:
! python3 -c "import kfp; print('KFP SDK version: {}'.format(kfp.__version__))"
! pip3 freeze | grep aiplatform
! python3 -c "import google_cloud_pipeline_components; print('google_cloud_pipeline_components version: {}'.format(google_cloud_pipeline_components.__version__))"

Python was not found; run without arguments to install from the Microsoft Store, or disable this shortcut from Settings > Manage App Execution Aliases.
'grep' is not recognized as an internal or external command,
operable program or batch file.
Python was not found; run without arguments to install from the Microsoft Store, or disable this shortcut from Settings > Manage App Execution Aliases.


In [4]:
import kfp
import typing
from typing import Dict
from typing import NamedTuple
from kfp import dsl
from kfp.dsl import (Artifact,
                        Dataset,
                        Input,
                        Model,
                        Output,
                        Metrics,
                        ClassificationMetrics,
                        component, 
                        OutputPath, 
                        InputPath)
import google.cloud.aiplatform as aip
from google_cloud_pipeline_components.v1.model import ModelUploadOp
from google_cloud_pipeline_components.v1.endpoint import (EndpointCreateOp,ModelDeployOp)
from google_cloud_pipeline_components.types import artifact_types

  from google_cloud_pipeline_components.v1.model import ModelUploadOp


In [5]:
#The Google Cloud project that this pipeline runs in.
PROJECT_ID = "your project id"
# The region that this pipeline runs in
REGION = "us-central1"
# Specify a Cloud Storage URI that your pipelines service account can access. The artifacts of your pipeline runs are stored within the pipeline root.
PIPELINE_ROOT = "your url to pipeline root"   # e.g., gs://temp_de2024

## First create a function that trains a SVM model on dataset

In [6]:
@dsl.component(
    packages_to_install=['pandas', 'scikit-learn==1.3.2'],
    base_image="python:3.10.7-slim"
)
def train_svm(features: Input[Dataset], out_model: Output[Model]) -> NamedTuple('outputs', metrics=dict):
    '''train a SVM with default parameters'''
    import pandas as pd
    from sklearn import svm
    from sklearn import metrics
    from sklearn.model_selection import train_test_split
    from sklearn.metrics import recall_score, accuracy_score
    import json
    import logging 
    import sys
    import os
    import pickle  
       
    logging.basicConfig(stream=sys.stdout, level=logging.INFO)
    
    df = pd.read_csv(features.path)
    df = df.drop(columns=['id']) # have to drop the variable
    
    logging.info(df.columns)        
    
    x_train, x_test, y_train, y_test = train_test_split(df.drop('CLASS_LABEL',axis=1), 
                                                    df['CLASS_LABEL'], test_size=0.20, 
                                                    random_state=42)
    model_svm = svm.SVC()
    model_svm.fit(x_train,y_train)
    y_pred = model_svm.predict(x_test)

    metrics_dict = {
        "accuracy": accuracy_score(y_test, y_pred)
        "recall": recall_score(y_test, y_pred)
    }
    logging.info(metrics_dict)  
    
    out_model.metadata["file_type"] = ".pkl"
    out_model.metadata["algorithm"] = "svm"
   # Save the model
    model_file = out_model.path + ".pkl"
    with open(model_file, 'wb') as f:  
        pickle.dump(model_svm, f)   
    
    outputs = NamedTuple('outputs', metrics=dict)
    return outputs(metrics_dict)

OSError: [WinError 123] The filename, directory name, or volume label syntax is incorrect: '<ipython-input-6-64505418d412>'

## Then train a Random Forest Classifier on the dataset

In [None]:
@dsl.component(
    packages_to_install=['pandas', 'scikit-learn==1.3.2'],
    base_image="python:3.10.7-slim"
)
def train_rf(features: Input[Dataset], out_model: Output[Model]) -> NamedTuple('outputs', metrics=dict):
    '''train a Random Forest with default parameters'''
    import pandas as pd
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.model_selection import train_test_split
    from sklearn.metrics import recall_score, accuracy_score
    import json
    import logging 
    import sys
    import os
    import pickle  
       
    logging.basicConfig(stream=sys.stdout, level=logging.INFO)
    
    df = pd.read_csv(features.path)
    df = df.drop(columns=['id'])
    
    logging.info(df.columns)        
    
    x_train, x_test, y_train, y_test = train_test_split(df.drop('CLASS_LABEL',axis=1), 
                                                    df['CLASS_LABEL'], test_size=0.20, 
                                                    random_state=42)
    
    model_rf = RandomForestClassifier(n_estimators=100, random_state=42) 
    model_rf.fit(x_train,y_train)
    y_pred = model_rf.predict(x_test)

    metrics_dict = {
        "accuracy": accuracy_score(y_test, y_pred)
        "recall": recall_score(y_test, y_pred)
    }
    logging.info(metrics_dict)  
    
    out_model.metadata["file_type"] = ".pkl"
    out_model.metadata["algorithm"] = "dt"
   # Save the model
    model_file = out_model.path + ".pkl"
    with open(model_file, 'wb') as f:  
        pickle.dump(model_rf, f)   
    
    outputs = NamedTuple('outputs', metrics=dict)
    return outputs(metrics_dict)

## Then train a Logistic Regression on the dataset

In [None]:
@dsl.component(
    packages_to_install=['pandas', 'scikit-learn==1.3.2'],
    base_image="python:3.10.7-slim"
)
def train_lr(features: Input[Dataset], out_model: Output[Model]) -> NamedTuple('outputs', metrics=dict):
    '''train a Logistic Regression with default parameters'''
    import pandas as pd
    from sklearn.linear_model import LogisticRegression
    from sklearn.model_selection import train_test_split
    from sklearn.metrics import recall_score, accuracy_score
    import json
    import logging 
    import sys
    import os
    import pickle  
       
    logging.basicConfig(stream=sys.stdout, level=logging.INFO)
    
    df = pd.read_csv(features.path)
    df = df.drop(columns=['id'])
    
    logging.info(df.columns)        
    
    x_train, x_test, y_train, y_test = train_test_split(df.drop('CLASS_LABEL',axis=1), 
                                                    df['CLASS_LABEL'], test_size=0.20, 
                                                    random_state=42)
    
    model_lr = LogisticRegression(random_state=42)
    model_lr.fit(x_train,y_train)
    y_pred = model_lr.predict(x_test)

    metrics_dict = {
        "accuracy": accuracy_score(y_test, y_pred)
        "recall": recall_score(y_test, y_pred)
    }
    logging.info(metrics_dict)  
    
    out_model.metadata["file_type"] = ".pkl"
    out_model.metadata["algorithm"] = "dt"
   # Save the model
    model_file = out_model.path + ".pkl"
    with open(model_file, 'wb') as f:  
        pickle.dump(model_lr, f)   
    
    outputs = NamedTuple('outputs', metrics=dict)
    return outputs(metrics_dict)

## Then we compare the two model's performances to decide which one to use

In [None]:
@dsl.component(
    base_image="python:3.10.7-slim"
)
def compare_model(svm_metrics: dict, rf_metrics: dict, lr_metrics: dict) -> str:
    import logging
    import json
    import sys
    logging.basicConfig(stream=sys.stdout, level=logging.INFO)
    logging.info(svm_metrics)
    logging.info(rf_metrics)
    logging.info(lr_metrics)
    if svm_metrics.get("accuracy") > dt_metrics.get("accuracy"):
        return "SVM"
    else :
        return "DT"

## Then we upload the model to google cloud storage 

In [None]:
@dsl.component(
    packages_to_install=["google-cloud-storage"],
    base_image="python:3.10.7-slim"
)
def upload_model_to_gcs(project_id: str, model_repo: str, model: Input[Model], model_name: str):
    '''upload model to gsc'''
    from google.cloud import storage   
    import logging 
    import sys
    
    logging.basicConfig(stream=sys.stdout, level=logging.INFO)    
  
    # upload the model to GCS
    client = storage.Client(project=project_id)
    bucket = client.bucket(model_repo)
    blob = bucket.blob('model.pkl')
    source_file_name= model.path + '.pkl'
   
    blob.upload_from_filename(source_file_name)    
    
    print(f"File {source_file_name} uploaded to {model_repo}.")

## Then we create the complete pipeline

In [None]:
# Define the workflow of the pipeline.
@kfp.dsl.pipeline(
    name="phishing-predictor-training-pipeline-v2")
def pipeline(project_id: str, data_bucket: str, dataset_uri: str, model_repo: str, model_repo_uri: str):
    
    # load the dataset
    dataset_op = kfp.dsl.importer(
        artifact_uri=dataset_uri,
        artifact_class=Dataset,
        reimport=False,
    )
    
    # svm model
    training_svm_job_run_op = train_svm(
        features=dataset_op.output
    )
    
    # rf model
    training_rf_job_run_op = train_rf(
        features=dataset_op.output
    )
    
    # lr model
    training_lr_job_run_op = train_lr(
        features=dataset_op.output
    )
        
    # compare models
    comp_model_op = compare_model(svm_metrics=training_svm_job_run_op.outputs["metrics"],
                                    rf_metrics=training_rf_job_run_op.outputs["metrics"],
                                    lr_metrics=training_lr_job_run_op.outputs["metrics"]).after(training_svm_job_run_op, training_rf_job_run_op, training_lr_job_run_op)  
    
    # This part of the code did not work, so right now, we just use the DT model, whether it was better or not
#     with dsl.If(comp_model_op.output=='SVM'):
#         final_model_op = training_svm_job_run_op
#     with dsl.If(comp_model_op.output=='DT'):
#         final_model_op = training_dt_job_run_op
    
    # upload model to storage
    upload_model_to_gc_op = upload_model_to_gcs(
        project_id=project_id,
        model_repo=model_repo,
        model=training_rf_job_run_op.outputs['out_model'],
        model_name=comp_model_op.output
    ) 
    
    # dont know why we do this but is for deploying the model i believe
    import_unmanaged_model_task = dsl.importer(
            artifact_uri=model_repo_uri,
            artifact_class=artifact_types.UnmanagedContainerModel,
            metadata={
                "containerSpec": {
                    "imageUri": "us-docker.pkg.dev/vertex-ai/prediction/sklearn-cpu.1-3:latest",  # see https://cloud.google.com/vertex-ai/docs/predictions/pre-built-containers  
                },
            },
        ).after(upload_model_to_gc_op)  
    
    # upload model to model registry
    model_upload_op = ModelUploadOp(
            project=project_id,
            display_name=f"phishing-prediction-model-{comp_model_op.output}",
            unmanaged_container_model=import_unmanaged_model_task.outputs["artifact"],
        ).after(import_unmanaged_model_task) 
    
    # create endpoint for predictions
    create_endpoint_op = EndpointCreateOp(
            project=project_id,
            display_name="phishing-prediction-service",
        ).after(model_upload_op) 
    
    # deploy full model
    model_deploy_op = ModelDeployOp(
            model=model_upload_op.outputs["model"],
            endpoint=create_endpoint_op.outputs['endpoint'],
            deployed_model_display_name=f"phishing-prediction-model-{comp_model_op.output}",
            dedicated_resources_machine_type="n1-standard-2",
            dedicated_resources_min_replica_count=1,
            dedicated_resources_max_replica_count=1,
            traffic_split={"0": 100},
        ).after(create_endpoint_op)

## Then we compile the YAML file, which is used to run the pipeline

In [None]:
from kfp import compiler
compiler.Compiler().compile(pipeline_func=pipeline,
        package_path='iris_predictor_training_pipeline.yaml')

## Then we run the pipeline job

In [None]:
import google.cloud.aiplatform as aip

# Before initializing, make sure to set the GOOGLE_APPLICATION_CREDENTIALS
# environment variable to the path of your service account.
aip.init(
    project=PROJECT_ID,
    location=REGION,
)

# Prepare the pipeline job
job = aip.PipelineJob(
    display_name="phishing-predictor",
    enable_caching=False,
    template_path="phishing_predictor_training_pipeline.yaml",
    pipeline_root=PIPELINE_ROOT,
    location=REGION,
    parameter_values={
        'project_id': PROJECT_ID, # makesure to use your project id 
        'data_bucket': 'data_de2024_2083033',  # makesure to use your data bucket name 
        'dataset_uri':'gs://data_de2024_2083033/phishing.csv',
        'model_repo':'models_de2024_2083033', # makesure to use your model bucket name 
        'model_repo_uri':'gs://models_de2024_2083033' # makesure to use your model bucket name 
    }
)

job.run()

## Small test to see if model is there

In [None]:
DISPLAY_NAME = "phishing-prediction-model"
! gcloud ai models list --region={REGION} --filter={DISPLAY_NAME}