### Make sure to first upload the phishing.csv file to your data bucket in cloud storage to be able to run this file

In [2]:
# Install the packages
! pip3 install --user --no-cache-dir --upgrade "kfp>2" "google-cloud-pipeline-components>2" \
                                        google-cloud-aiplatform

Collecting kfp>2
  Downloading kfp-2.9.0.tar.gz (595 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m595.6/595.6 kB[0m [31m28.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
Collecting kfp-pipeline-spec==0.4.0 (from kfp>2)
  Downloading kfp_pipeline_spec-0.4.0-py3-none-any.whl.metadata (301 bytes)
Collecting kfp-server-api<2.4.0,>=2.1.0 (from kfp>2)
  Downloading kfp_server_api-2.3.0.tar.gz (84 kB)
  Preparing metadata (setup.py) ... [?25ldone


In [3]:
import os

if not os.getenv("IS_TESTING"):
    # Automatically restart kernel after installs
    import IPython

    app = IPython.Application.instance()
    app.kernel.do_shutdown(True)

In [2]:
! python3 -c "import kfp; print('KFP SDK version: {}'.format(kfp.__version__))"
! pip3 freeze | grep aiplatform
! python3 -c "import google_cloud_pipeline_components; print('google_cloud_pipeline_components version: {}'.format(google_cloud_pipeline_components.__version__))"

KFP SDK version: 2.7.0
google-cloud-aiplatform==1.70.0
google_cloud_pipeline_components version: 2.17.0


In [3]:
import kfp
import typing
from typing import Dict
from typing import NamedTuple
from kfp import dsl
from kfp.dsl import (Artifact,
                        Dataset,
                        Input,
                        Model,
                        Output,
                        Metrics,
                        ClassificationMetrics,
                        component, 
                        OutputPath, 
                        InputPath)
import google.cloud.aiplatform as aip
from google_cloud_pipeline_components.v1.model import ModelUploadOp
from google_cloud_pipeline_components.v1.endpoint import (EndpointCreateOp,ModelDeployOp)
from google_cloud_pipeline_components.types import artifact_types

In [4]:
#The Google Cloud project that this pipeline runs in.
PROJECT_ID = "tranquil-lore-435419-m2"
# The region that this pipeline runs in
REGION = "us-central1"
# Specify a Cloud Storage URI that your pipelines service account can access. The artifacts of your pipeline runs are stored within the pipeline root.
PIPELINE_ROOT = "gs://temp_de2024_2083033"   # e.g., gs://temp_de2024

## First load the data

In [6]:
@dsl.component(
    packages_to_install=["pandas","google-cloud-storage"],
    base_image="python:3.10.7-slim"
)
def load_data(project_id: str, bucket: str, filename: str, dataset: Output[Dataset]):
    '''download data'''
    from google.cloud import storage
    import pandas as pd
    import logging 
    import sys
    
    logging.basicConfig(stream=sys.stdout, level=logging.INFO)
    
    # Downloaing the phishing dataset file from the google storage data bucket 
    client = storage.Client(project=project_id)
    bucket = client.bucket(bucket)
    blob = bucket.blob(filename)
    blob.download_to_filename(dataset.path + ".csv")
    logging.info('Downloaded Data!')

## Then split the data into a train and test set

In [7]:
@dsl.component(
    packages_to_install=["pandas", "scikit-learn==1.3.2"],
    base_image="python:3.10.7-slim"
)
def train_test_split(dataset: Input[Dataset], dataset_train: Output[Dataset], dataset_test: Output[Dataset]):
    '''train_test_split'''
    import pandas as pd
    import logging 
    import sys
    from sklearn.model_selection import train_test_split as tts
    
    logging.basicConfig(stream=sys.stdout, level=logging.INFO) 
    
    # Split the phishing dataset into a train set (80%) and test set (20%) and store those datasets
    alldata = pd.read_csv(dataset.path+".csv", index_col=None)
    train, test = tts(alldata, test_size=0.2, random_state=42)
    train.to_csv(dataset_train.path + ".csv" , index=False, encoding='utf-8-sig')
    test.to_csv(dataset_test.path + ".csv" , index=False, encoding='utf-8-sig')

## Then create a function that trains a SVM model on dataset

In [8]:
@dsl.component(
    packages_to_install=['pandas', 'scikit-learn==1.3.2'],
    base_image="python:3.10.7-slim"
)
def train_svm(train_set: Input[Dataset], test_set: Input[Dataset], out_model: Output[Model]) -> NamedTuple('outputs', metrics=dict):
    '''train a SVM with default parameters'''
    import pandas as pd
    from sklearn import svm
    from sklearn import metrics
    from sklearn.metrics import recall_score, accuracy_score
    import json
    import logging 
    import sys
    import os
    import pickle  
       
    logging.basicConfig(stream=sys.stdout, level=logging.INFO)
    
    # Load the train and test sets into dataframes
    df_train = pd.read_csv(train_set.path+".csv")
    df_test = pd.read_csv(test_set.path+".csv")

    logging.info(df_train.columns)
    logging.info(df_test.columns)  
    
    # Split the train and test sets into the X variables and the y variable
    x_train, y_train = df_train.drop('CLASS_LABEL', axis=1), df_train['CLASS_LABEL']
    x_test, y_test = df_test.drop('CLASS_LABEL', axis=1), df_test['CLASS_LABEL']
    
    # Train a SVM Classifier on the train data and predict the test data
    model_svm = svm.SVC(random_state=42)
    model_svm.fit(x_train,y_train)
    y_pred = model_svm.predict(x_test)
    
    # Store the accuracy and recall of the model on the test set in a dictionary
    metrics_dict = {
        "accuracy": accuracy_score(y_test, y_pred),
        "recall": recall_score(y_test, y_pred)
    }
    logging.info(metrics_dict)  
    
    # Store some metadata
    out_model.metadata["file_type"] = ".pkl"
    out_model.metadata["algorithm"] = "SVM"
    
    # Save the model to a pickle file
    model_file = out_model.path + ".pkl"
    with open(model_file, 'wb') as f:  
        pickle.dump(model_svm, f)   
    
    # Return the metrics dictionary as an output
    outputs = NamedTuple('outputs', metrics=dict)
    return outputs(metrics_dict)

## Then train a Random Forest Classifier on the dataset

In [9]:
@dsl.component(
    packages_to_install=['pandas', 'scikit-learn==1.3.2'],
    base_image="python:3.10.7-slim"
)
def train_rf(train_set: Input[Dataset], test_set: Input[Dataset], out_model: Output[Model]) -> NamedTuple('outputs', metrics=dict):
    '''train a Random Forest with default parameters'''
    import pandas as pd
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.metrics import recall_score, accuracy_score
    import json
    import logging 
    import sys
    import os
    import pickle  
       
    logging.basicConfig(stream=sys.stdout, level=logging.INFO)

    # Load the train and test sets into dataframes
    df_train = pd.read_csv(train_set.path+".csv")
    df_test = pd.read_csv(test_set.path+".csv")

    logging.info(df_train.columns)
    logging.info(df_test.columns)  
    
    # Split the train and test sets into the X variables and the y variable
    x_train, y_train = df_train.drop('CLASS_LABEL', axis=1), df_train['CLASS_LABEL']
    x_test, y_test = df_test.drop('CLASS_LABEL', axis=1), df_test['CLASS_LABEL']
    
    # Train a RF Classifier on the train data and predict the test data
    model_rf = RandomForestClassifier(random_state=42) 
    model_rf.fit(x_train,y_train)
    y_pred = model_rf.predict(x_test)

    # Store the accuracy and recall of the model on the test set in a dictionary
    metrics_dict = {
        "accuracy": accuracy_score(y_test, y_pred),
        "recall": recall_score(y_test, y_pred)
    }
    logging.info(metrics_dict)  
    
    # Store some metadata
    out_model.metadata["file_type"] = ".pkl"
    out_model.metadata["algorithm"] = "RF"

    # Save the model to a pickle file
    model_file = out_model.path + ".pkl"
    with open(model_file, 'wb') as f:  
        pickle.dump(model_rf, f)   

    # Return the metrics dictionary as an output
    outputs = NamedTuple('outputs', metrics=dict)
    return outputs(metrics_dict)

## Then train a Logistic Regression on the dataset

In [10]:
@dsl.component(
    packages_to_install=['pandas', 'scikit-learn==1.3.2'],
    base_image="python:3.10.7-slim"
)
def train_lr(train_set: Input[Dataset], test_set: Input[Dataset], out_model: Output[Model]) -> NamedTuple('outputs', metrics=dict):
    '''train a Logistic Regression with default parameters'''
    import pandas as pd
    from sklearn.linear_model import LogisticRegression
    from sklearn.metrics import recall_score, accuracy_score
    import json
    import logging 
    import sys
    import os
    import pickle  
       
    logging.basicConfig(stream=sys.stdout, level=logging.INFO)

    # Load the train and test sets into dataframes
    df_train = pd.read_csv(train_set.path+".csv")
    df_test = pd.read_csv(test_set.path+".csv")

    logging.info(df_train.columns)
    logging.info(df_test.columns)  
    
    # Split the train and test sets into the X variables and the y variable
    x_train, y_train = df_train.drop('CLASS_LABEL', axis=1), df_train['CLASS_LABEL']
    x_test, y_test = df_test.drop('CLASS_LABEL', axis=1), df_test['CLASS_LABEL']

    # Train a Logistic Regression on the train data and predict the test data    
    model_lr = LogisticRegression(random_state=42)
    model_lr.fit(x_train,y_train)
    y_pred = model_lr.predict(x_test)

    # Store the accuracy and recall of the model on the test set in a dictionary
    metrics_dict = {
        "accuracy": accuracy_score(y_test, y_pred),
        "recall": recall_score(y_test, y_pred)
    }
    logging.info(metrics_dict)  
    
    # Store some metadata
    out_model.metadata["file_type"] = ".pkl"
    out_model.metadata["algorithm"] = "LR"

    # Save the model to a pickle file
    model_file = out_model.path + ".pkl"
    with open(model_file, 'wb') as f:  
        pickle.dump(model_lr, f)   

    # Return the metrics dictionary as an output
    outputs = NamedTuple('outputs', metrics=dict)
    return outputs(metrics_dict)

## Then we compare the two model's performances to decide which one to use

In [11]:
@dsl.component(
    base_image="python:3.10.7-slim"
)
def compare_model(svm_metrics: dict, rf_metrics: dict, lr_metrics: dict) -> str:
    '''Compare the SVM, RF, and LR model's performances on the test set with each other
    and return the best model on recall (with at least an accuracy of 0.7). We want to optimize
    recall, because the higher the recall, the minimal the type II error. The type II error in our
    case means predicting that it is not a phishing link when in reality it is. This is a bigger 
    problem than predicting that it is a phishing link, when in reality it is not (better safe than sorry).
    '''
    import logging
    import json
    import sys
    logging.basicConfig(stream=sys.stdout, level=logging.INFO)
    logging.info(svm_metrics)
    logging.info(rf_metrics)
    logging.info(lr_metrics)

    # Obtain the recall and accuracy for the 3 models
    svm_recall = svm_metrics.get('recall')
    svm_accuracy = svm_metrics.get('accuracy')
    
    rf_recall = rf_metrics.get('recall')
    rf_accuracy = rf_metrics.get('accuracy')
    
    lr_recall = lr_metrics.get('recall')
    lr_accuracy = lr_metrics.get('accuracy')
    
    # We iteratively update the best_recall score and in this way can find the best model
    best_model = None
    best_recall = 0
    
    # Compare SVM model
    if svm_accuracy >= 0.7 and svm_recall > best_recall:
        best_model = 'SVM'
        best_recall = svm_recall
    
    # Compare Random Forest model
    if rf_accuracy >= 0.7 and rf_recall > best_recall:
        best_model = 'RF'
        best_recall = rf_recall
    
    # Compare Logistic Regression model
    if lr_accuracy >= 0.7 and lr_recall > best_recall:
        best_model = 'LR'
        best_recall = lr_recall

    # It could be the case that none of the models had an accuracy higher than 0.7
    # In that case, we want to use the model with the highest accuracy
    if best_model is None:
        if svm_accuracy > lr_accuracy:
            if svm_accuracy > rf_accuracy:
                best_model = "SVM"
            else:
                best_model = "RF"
        else:
            if lr_accuracy > rf_accuracy:
                best_model = "LR"
            else:
                best_model = "RF"
            
    return best_model

## Then we upload the model to google cloud storage 

In [12]:
@dsl.component(
    packages_to_install=["google-cloud-storage"],
    base_image="python:3.10.7-slim"
)
def upload_model_to_gcs(project_id: str, model_repo: str, model: Input[Model], model_name: str):
    '''Upload the best model to Google Cloud Storage models bucket'''
    from google.cloud import storage   
    import logging 
    import sys
    
    logging.basicConfig(stream=sys.stdout, level=logging.INFO)    
  
    # Upload the model to GCS, and store it as model-A1.pkl
    client = storage.Client(project=project_id)
    bucket = client.bucket(model_repo)
    blob = bucket.blob('model-A1.pkl')
    source_file_name= model.path + '.pkl'
   
    blob.upload_from_filename(source_file_name)    
    
    print(f"File {source_file_name} uploaded to {model_repo}.")

## Then we create the complete pipeline

In [13]:
# Define the workflow of the pipeline.
@kfp.dsl.pipeline(
    name="phishing-predictor-training-pipeline-final",
    description="This pipeline downloads the phishing dataset, trains 3 models, compares those models,\
        chooses the best model, and uploads that model to the model bucket in cloud storage")
def pipeline(project_id: str, data_bucket: str, filename:str, model_repo: str, model_repo_uri: str):
    
    # load the dataset
    dataset_op = load_data(
        project_id=project_id,
        bucket=data_bucket,
        filename=filename)

    # Split the dataset into a train (80%) and test (20%) set
    train_test_split_op = train_test_split(dataset=dataset_op.outputs["dataset"]).after(dataset_op)
    
    # Train and test a SVM Classifier on the data
    training_svm_job_run_op = train_svm(
        train_set=train_test_split_op.outputs["dataset_train"],
        test_set=train_test_split_op.outputs["dataset_test"]
    ).after(train_test_split_op)
    
    # Train and test a RF Classifier on the data
    training_rf_job_run_op = train_rf(
        train_set=train_test_split_op.outputs["dataset_train"],
        test_set=train_test_split_op.outputs["dataset_test"]
    ).after(train_test_split_op)
    
    # Train and test a Logistic Regression on the data
    training_lr_job_run_op = train_lr(
        train_set=train_test_split_op.outputs["dataset_train"],
        test_set=train_test_split_op.outputs["dataset_test"]
    ).after(train_test_split_op)
        
    # Compare the models based on their performance metrics and choose the best model
    comp_model_op = compare_model(svm_metrics=training_svm_job_run_op.outputs["metrics"],
                                    rf_metrics=training_rf_job_run_op.outputs["metrics"],
                                    lr_metrics=training_lr_job_run_op.outputs["metrics"]).after(training_svm_job_run_op, training_rf_job_run_op, training_lr_job_run_op)  
    
    # Conditional deployment based on the best-performing model
    with kfp.dsl.Condition(comp_model_op.output == 'SVM'):
        # If SVM is the best, upload the SVM model to GCS
        upload_model_to_gc_op = upload_model_to_gcs(
            project_id=project_id,
            model_repo=model_repo,
            model=training_svm_job_run_op.outputs['out_model'],
            model_name="SVM"
        ).after(comp_model_op)
    
    with kfp.dsl.Condition(comp_model_op.output == 'RF'):
        # If Random Forest is the best, upload the RF model to GCS
        upload_model_to_gc_op = upload_model_to_gcs(
            project_id=project_id,
            model_repo=model_repo,
            model=training_rf_job_run_op.outputs['out_model'],
            model_name="RF"
        ).after(comp_model_op)

    with kfp.dsl.Condition(comp_model_op.output == 'LR'):
        # If Logistic Regression is the best, upload the LR model to GCS
        upload_model_to_gc_op = upload_model_to_gcs(
            project_id=project_id,
            model_repo=model_repo,
            model=training_lr_job_run_op.outputs['out_model'],
            model_name="LR"
        ).after(comp_model_op)

  with kfp.dsl.Condition(comp_model_op.output == 'SVM'):
  with kfp.dsl.Condition(comp_model_op.output == 'RF'):
  with kfp.dsl.Condition(comp_model_op.output == 'LR'):


## Then we compile the YAML file, which is used to run the pipeline

In [14]:
from kfp import compiler
compiler.Compiler().compile(pipeline_func=pipeline,
        package_path='phishing_predictor_training_pipeline_final.yaml')

## Then we run the pipeline job

In [15]:
import google.cloud.aiplatform as aip

# Before initializing, make sure to set the GOOGLE_APPLICATION_CREDENTIALS
# environment variable to the path of your service account.
aip.init(
    project=PROJECT_ID,
    location=REGION,
)

# Prepare the pipeline job
job = aip.PipelineJob(
    display_name="phishing-predictor-final",
    enable_caching=False,
    template_path="phishing_predictor_training_pipeline_final.yaml",
    pipeline_root=PIPELINE_ROOT,
    location=REGION,
    parameter_values={
        'project_id': PROJECT_ID, # makesure to use your project id 
        'data_bucket': 'data_de2024_2083033',  # makesure to use your data bucket name 
        'filename': 'phishing.csv',
        'model_repo':'models_de2024_2083033', # makesure to use your model bucket name 
        'model_repo_uri':'gs://models_de2024_2083033' # makesure to use your model bucket name 
    }
)

job.run()

Creating PipelineJob
PipelineJob created. Resource name: projects/454935101788/locations/us-central1/pipelineJobs/phishing-predictor-training-pipeline-final-20241024131622
To use this PipelineJob in another session:
pipeline_job = aiplatform.PipelineJob.get('projects/454935101788/locations/us-central1/pipelineJobs/phishing-predictor-training-pipeline-final-20241024131622')
View Pipeline Job:
https://console.cloud.google.com/vertex-ai/locations/us-central1/pipelines/runs/phishing-predictor-training-pipeline-final-20241024131622?project=454935101788
PipelineJob projects/454935101788/locations/us-central1/pipelineJobs/phishing-predictor-training-pipeline-final-20241024131622 current state:
PipelineState.PIPELINE_STATE_RUNNING
PipelineJob projects/454935101788/locations/us-central1/pipelineJobs/phishing-predictor-training-pipeline-final-20241024131622 current state:
PipelineState.PIPELINE_STATE_RUNNING
PipelineJob projects/454935101788/locations/us-central1/pipelineJobs/phishing-predictor-

## Small test to see if model is there

In [None]:
DISPLAY_NAME = "phishing-prediction-model-final"
! gcloud ai models list --region={REGION} --filter={DISPLAY_NAME}

In [3]:
import pickle
with open('model.pkl', 'rb') as f:
    model = pickle.load(f)
model

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
