In [None]:
# Copyright 2021 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

<table align="left">

  <td>
    <a href="https://colab.research.google.com/github/GoogleCloudPlatform/ai-platform-samples/blob/master/ai-platform-unified/notebooks/notebook_template.ipynb"">
      <img src="https://cloud.google.com/ml-engine/images/colab-logo-32px.png" alt="Colab logo"> Run in Colab
    </a>
  </td>
  <td>
    <a href="https://github.com/GoogleCloudPlatform/ai-platform-samples/blob/master/ai-platform-unified/notebooks/notebook_template.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/github-logo-32px.png" alt="GitHub logo">
      View on GitHub
    </a>
  </td>
</table>

## Overview

This notebook shows how to get your training dataset from Vertex AI Feature Store, train your model using Vertex AI managed training pipeline, and deployment it into Vertex AI endpoint. You will learn how to use your own custom code for ML training on Vertex AI.

### Objective

In the following notebook, you will learn how to:

    * Build a containers to run your own custom code on Vertex AI
    * Use Vertex AI to train your model at scale
    * Use Vertex AI to create an endpoint

### Load config settings

In [1]:
GCP_PROJECTS = !gcloud config get-value project
PROJECT_ID = GCP_PROJECTS[0]
BUCKET_NAME = PROJECT_ID
config = !gsutil cat gs://{BUCKET_NAME}/config/notebook_env.py
print(config.n)
exec(config.n)


BUCKET_NAME          = "transaction-detective22-1559"
PROJECT              = "transaction-detective22-1559"
REGION               = "us-central1"
SUFFIX               = "aiplatform.googleapis.com"
API_ENDPOINT         = "us-central1-aiplatform.googleapis.com"
PREDICT_API_ENDPOINT = "us-central1-prediction-aiplatform.googleapis.com"
FS_NAME              = "transaction-detective22-1559"
ID                   = "7551"
FEATURESTORE_ID      = "fraud_finder_7551"
TIMESTAMP            = "1647254678"
TRAINING_DS_SIZE     = "1000"



### Select an experiment name

Let us define the experiment name to store . If EXEPERIMENT_NAME is not set, set a default one below:

In [2]:
EXPERIMENT_NAME = ""  # @param {type:"string"}
if EXPERIMENT_NAME == "" or EXPERIMENT_NAME is None:
    EXPERIMENT_NAME = "fd-experiment-" + TIMESTAMP

### Import libraries and define constants

In [3]:
#General
import os
import sys
from typing import Union, List

#Data Preprocessing
import numpy as np
import pandas as pd

#Model Training with Vertex AI
from google.cloud import bigquery
from google.cloud import aiplatform as vertex_ai
from google.cloud.aiplatform_v1 import ModelServiceClient
from google.cloud.aiplatform_v1.types import ListModelEvaluationsRequest
from google.protobuf.json_format import MessageToDict
from google.cloud.aiplatform import gapic as aip
from google.cloud import storage


#Feature Store
from google.cloud import aiplatform as vertex_ai
from google.cloud.aiplatform import Featurestore, EntityType, Feature

In [4]:
# General
DATA_DIR = os.path.join(os.pardir, 'data')
TRAIN_DATA_DIR = os.path.join(DATA_DIR, 'train')
# CONFIG_DIR = os.path.join(os.pardir, 'config')
DATA_URI = f'gs://{BUCKET_NAME}/data'
TRAIN_DATA_URI = f'{DATA_URI}/train'
CONFIG_URI =  f'gs://{BUCKET_NAME}/config' 
BQ_DATASET = "tx"

#Feature Store
START_DATE_TRAIN = "2022-01-01" #consider few days for training (demo)
END_DATE_TRAIN = "2022-01-31"
EVENTS_TABLE_NAME = f'events_{END_DATE_TRAIN}'
CUSTOMERS_TABLE_NAME = f'customers_{END_DATE_TRAIN}'
TERMINALS_TABLE_NAME = f'terminals_{END_DATE_TRAIN}'
DATA_ENDPOINT = f"{REGION}-featurestore-aiplatform.googleapis.com"
ADMIN_ENDPOINT = f"{REGION}-aiplatform.googleapis.com"
EVENT_ENTITY = 'event'
CUSTOMER_ENTITY = 'customer'
TERMINAL_ENTITY = 'terminal'
SERVING_FEATURE_IDS = {CUSTOMER_ENTITY: ['*'], EVENT_ENTITY: ['*'], TERMINAL_ENTITY: ['*']}
READ_INSTANCES_TABLE = f"ground_truth_{TIMESTAMP}"
READ_INSTANCES_URI = f"bq://{PROJECT_ID}.{BQ_DATASET}.{READ_INSTANCES_TABLE}"

# Training
COLUMNS_IGNORE = ['terminal_id', 'customer_id', 'entity_type_event', 'entity_type_customer', 'entity_type_terminal']
TARGET = 'tx_fraud'

## Custom Training
DATASET_NAME=f"sample_train-{ID}-{END_DATE_TRAIN}"
TRAIN_JOB_NAME=f"fraudfinder_xgb_train_frmlz-{ID}-{TIMESTAMP}"
MODEL_NAME=f"fraudfinder_xgb_model_frmlz-{ID}-{TIMESTAMP}"
DEPLOYED_NAME = f"fraudfinder_xgb_prediction_frmlz-{ID}-{TIMESTAMP}"
MODEL_SERVING_IMAGE_URI = 'us-docker.pkg.dev/vertex-ai/prediction/xgboost-cpu.1-1:latest'
IMAGE_REPOSITORY = f'fraudfinder-{ID}'
IMAGE_NAME='dask-xgb-classificator'
IMAGE_TAG='v1'
IMAGE_URI=f"us-central1-docker.pkg.dev/{PROJECT_ID}/{IMAGE_REPOSITORY}/{IMAGE_NAME}:{IMAGE_TAG}"
TRAIN_COMPUTE='e2-standard-4'
DEPLOY_COMPUTE='n1-standard-4'

### Inizialize clients

In [5]:
bq_client = bigquery.Client(project=PROJECT_ID, location=REGION)

In [6]:
vertex_ai.init(project=PROJECT_ID, location=REGION, staging_bucket=BUCKET_NAME, experiment=EXPERIMENT_NAME)

### Helpers

In [7]:
def create_gcs_dataset(client,
                       display_name: str, 
                       gcs_source: Union[str, List[str]]):
    
    dataset = client.TabularDataset.create(
        display_name=display_name, gcs_source=gcs_source,
    )

    dataset.wait()
    return dataset
    
def get_evaluation_metrics(client, model_resource_name):
    model_evalution_request = ListModelEvaluationsRequest(parent=model_resource_name)
    model_evaluation_list = client.list_model_evaluations(request=model_evalution_request)
    metrics_strlist = []
    for evaluation in model_evaluation_list:
        metrics = MessageToDict(evaluation._pb.metrics)
    return metrics

def gcs_list(gcs_uri):
    obj_list=[]
    storage_client = storage.Client()
    bucket, key = gcs_uri.replace("gs://", "").split("/", 1)
    for blob in storage_client.list_blobs(bucket, prefix=key):
        obj_list.append("gs://"+bucket+"/"+str(blob.name))
    return obj_list

## Fetching feature values for model training

To fetch training data, we have to specify the following inputs to batch serving:

- A file containing a "query", with the entities and timestamps for each label.
- List of features to fetch values for
- Destination location and format


### Read-instance list

In our case, we need a csv file with content formatted like the table below

|event|customer                     |terminal|timestamp                                    |
|-----|-----------------------------|--------|---------------------------------------------|
|xxx57538|xxx3859                         |xxx8811    |2021-07-07 00:01:10 UTC                      |
|xxx57539|xxx4165                         |xxx8810    |2021-07-07 00:01:55 UTC                      |
|xxx57540|xxx2289                         |xxx2081    |2021-07-07 00:02:12 UTC                      |
|xxx57541|xxx3227                         |xxx3011    |2021-07-07 00:03:23 UTC                      |
|xxx57542|xxx2819                         |xxx6263    |2021-07-07 00:05:30 UTC                      |

where column names are the name of entities in Feature Store and the timestamp represents the time of occured event

In [8]:
sql_query = f"""
CREATE OR REPLACE TABLE
  `{PROJECT_ID}.{BQ_DATASET}.{READ_INSTANCES_TABLE}` AS
SELECT
  e.TX_ID AS event,
  e.CUSTOMER_ID AS customer,
  e.TERMINAL_ID AS terminal,
  e.TX_TS AS timestamp
FROM
  `{BQ_DATASET}.{EVENTS_TABLE_NAME}` AS e
WHERE
  e.TX_TS BETWEEN "{START_DATE_TRAIN}" AND "{END_DATE_TRAIN}"
LIMIT {TRAINING_DS_SIZE}
"""

try:
    job = bq_client.query(sql_query)
    _ = job.result()
except RuntimeError as error:
    print(error)

### Get Feature Store

In [9]:
try:
    ff_feature_store = Featurestore(FEATURESTORE_ID)
except NameError:
    print(f"""The feature store {FEATURESTORE_ID} does not exist!""") 

### Export a sample of data to a bucket 
In this section, we will use Batch Serving of feature store, to prepare dataset for training examples by calling the BatchReadFeatureValues API. Batch Serving is used to fetch a large batch of feature values for high-throughput, typically for training a model or batch prediction. 

In [10]:
!gsutil uniformbucketlevelaccess set on gs://{BUCKET_NAME}

Enabling Uniform bucket-level access for gs://transaction-detective22-1559...


In [11]:
ff_feature_store.batch_serve_to_gcs(
    gcs_destination_output_uri_prefix = TRAIN_DATA_URI,
    gcs_destination_type = 'csv',
    serving_feature_ids = SERVING_FEATURE_IDS, 
    read_instances_uri = READ_INSTANCES_URI
)

INFO:google.cloud.aiplatform.featurestore.featurestore:Serving Featurestore feature values: projects/130114571864/locations/us-central1/featurestores/fraud_finder_7551
INFO:google.cloud.aiplatform.featurestore.featurestore:Serve Featurestore feature values backing LRO: projects/130114571864/locations/us-central1/featurestores/fraud_finder_7551/operations/5029479065225003008
INFO:google.cloud.aiplatform.featurestore.featurestore:Featurestore feature values served. Resource name: projects/130114571864/locations/us-central1/featurestores/fraud_finder_7551


<google.cloud.aiplatform.featurestore.featurestore.Featurestore object at 0x7f6ca5c2dd10> 
resource name: projects/130114571864/locations/us-central1/featurestores/fraud_finder_7551

In [12]:
!gsutil uniformbucketlevelaccess set off gs://{BUCKET_NAME}

E0314 13:23:59.643261984   25880 fork_posix.cc:70]           Fork support is only compatible with the epoll1 and poll polling strategies


Disabling Uniform bucket-level access for gs://transaction-detective22-1559...


Lets create a copy of the training data in our local notebook instance. We need it later for testing our model.

In [13]:
!gsutil ls $TRAIN_DATA_URI

E0314 13:24:04.760137943   25880 fork_posix.cc:70]           Fork support is only compatible with the epoll1 and poll polling strategies


gs://transaction-detective22-1559/data/train/000000000000.csv


In [14]:
!sudo gsutil cp -r $TRAIN_DATA_URI $TRAIN_DATA_DIR

E0314 13:24:10.893078716   25880 fork_posix.cc:70]           Fork support is only compatible with the epoll1 and poll polling strategies


Copying gs://transaction-detective22-1559/data/train/000000000000.csv...
/ [1 files][158.2 KiB/158.2 KiB]                                                
Operation completed over 1 objects/158.2 KiB.                                    


Exporting the features into cloud storage, will generate multiple csv file. 

In [15]:
obj_list = gcs_list(TRAIN_DATA_URI)



## Builing a custom fraud detection model

### Fixing Imbalanced Dataset
In this section, we will deal with imbalance dataset. Specifically, we will randomly delete some of the instances from the non-fraud class in order to match the numbers with the fraud class. This technique is called undersampling. We will skip this step for our workshop.

In [None]:
# from urllib.parse import urlparse
# obj_list = gcs_list(TRAIN_DATA_URI)

# TRAIN_DATA_URI_BALANCED = f'{DATA_URI}/balanced_train'
# TRAIN_DATA_DIR_BALANCED = os.path.join(DATA_DIR, 'balanced_train')
# if not os.path.exists(TRAIN_DATA_DIR_BALANCED):
#     os.makedirs(TRAIN_DATA_DIR_BALANCED)
# for ds_csv_uri in obj_list:
#     tx_df = pd.read_csv(ds_csv_uri)
#     shuffled_df = tx_df.sample(frac=1,random_state=4)
#     fraud_df = shuffled_df.loc[shuffled_df['tx_fraud'] == 1]
#     non_fraud_df = shuffled_df.loc[shuffled_df['tx_fraud'] == 0].sample(n=fraud_df.shape[0],random_state=42)
#     balanced_df = pd.concat([fraud_df, non_fraud_df])
#     balanced_df.to_csv(os.path.join(TRAIN_DATA_DIR_BALANCED,  os.path.basename(urlparse(ds_csv_uri).path)), index=False)
#     break

# !gsutil cp -r  $TRAIN_DATA_DIR_BALANCED $TRAIN_DATA_URI_BALANCED
# obj_list = gcs_list(TRAIN_DATA_URI_BALANCED)

#### Builing Vertex AI dataset
In this section, we will build a Vertex AI dataset from our tabular. Vertex AI datasets can be used to train AutoML models or custom-trained models.  

In [16]:
dataset = create_gcs_dataset(client=vertex_ai, display_name=DATASET_NAME, gcs_source=obj_list[0]) #obj_list

print("Dataset:", f"{dataset.display_name}")
print("Name: \t", f"{dataset.resource_name}")

INFO:google.cloud.aiplatform.datasets.dataset:Creating TabularDataset
INFO:google.cloud.aiplatform.datasets.dataset:Create TabularDataset backing LRO: projects/130114571864/locations/us-central1/datasets/4325905354182361088/operations/6720017775349202944
INFO:google.cloud.aiplatform.datasets.dataset:TabularDataset created. Resource name: projects/130114571864/locations/us-central1/datasets/4325905354182361088
INFO:google.cloud.aiplatform.datasets.dataset:To use this TabularDataset in another session:
INFO:google.cloud.aiplatform.datasets.dataset:ds = aiplatform.TabularDataset('projects/130114571864/locations/us-central1/datasets/4325905354182361088')
Dataset: sample_train-7551-2022-01-31
Name: 	 projects/130114571864/locations/us-central1/datasets/4325905354182361088


### Train the model with custom model
In this section, we will use xgboost algorithm. Specifically, we will perform custom training with a pre-built xgboost container.

#### Create the training application
Typically, to perform custom training you ccan either use a pre-built container, or buid a new container. In this secion we will build a container for xgboost, and use it for training through VertexAI Manged Training serivice.



In [17]:
!mkdir -p -m 777 build_training

E0314 13:26:18.380018914   25880 fork_posix.cc:70]           Fork support is only compatible with the epoll1 and poll polling strategies


In [18]:
%%writefile build_training/train_xgb.py

"""
train_gb.py is the module for training a XGBClassifier pipeline
"""

# Libraries --------------------------------------------------------------------------------------------------------------------------

import argparse
import numpy as np
import os
import json
import logging
from pathlib import Path
import dask.dataframe as dask_df
from dask.distributed import LocalCluster, Client
import xgboost as xgb
from sklearn.metrics import roc_curve, confusion_matrix, average_precision_score, f1_score, log_loss, precision_score, recall_score

# Variables --------------------------------------------------------------------------------------------------------------------------
## Read environmental variables
TRAINING_DATA_PATH = os.environ["AIP_TRAINING_DATA_URI"].replace('gs://', '/gcs/')
TEST_DATA_PATH = os.environ["AIP_TEST_DATA_URI"].replace('gs://', '/gcs/')
MODEL_DIR = os.environ['AIP_MODEL_DIR'].replace('gs://', '/gcs/')
MODEL_PATH = MODEL_DIR + 'model.bst'


## Training variables
LABEL_COLUMN = "tx_fraud"
UNUSED_COLUMNS = ["timestamp","entity_type_event","terminal_id","customer_id","entity_type_customer","entity_type_terminal"]
DATA_SCHEMA = {
"timestamp": "object",
"entity_type_event": "object",
"tx_amount": "float64",
"customer_id": "int64",
"tx_fraud": "int64",
"terminal_id": "int64",
"entity_type_customer": "int64",
"customer_id_nb_tx_7day_window": "int64",
"customer_id_nb_tx_14day_window": "int64",
"customer_id_nb_tx_1day_window": "int64",
"customer_id_avg_amount_7day_window": "float64",
"customer_id_avg_amount_14day_window": "float64",
"customer_id_avg_amount_1day_window": "float64",
"entity_type_terminal": "int64",
"terminal_id_risk_1day_window": "float64",
"terminal_id_risk_14day_window": "float64",
"terminal_id_risk_7day_window": "float64",
"terminal_id_nb_tx_7day_window": "int64",
"terminal_id_nb_tx_14day_window": "int64",
"terminal_id_nb_tx_1day_window": "int64"
}

# Helpers -----------------------------------------------------------------------------------------------------------------------------
def get_args():
    parser = argparse.ArgumentParser()

    # Data files arguments
    parser.add_argument('--bucket', dest='bucket', type=str,
                        required=True, help='Bucket uri')
    parser.add_argument('--max_depth', dest='max_depth',
                        default=6, type=int,
                        help='max_depth value.')
    parser.add_argument('--eta', dest='eta',
                        default=0.4, type=float,
                        help='eta.')
    parser.add_argument('--gamma', dest='gamma',
                        default=0.0, type=float,
                        help='eta value')
    parser.add_argument("-v", "--verbose", 
                        help="increase output verbosity", 
                        action="store_true")
    
    return parser.parse_args()

def set_logging():
    #TODO
    pass

def resample(df, replace, frac=1, random_state = 8):
    shuffled_df = df.sample(frac=frac, replace=replace, random_state=random_state)
    return shuffled_df

def preprocess(df):
    
    df = df.drop(columns=UNUSED_COLUMNS)

    # Drop rows with NaN's
    df = df.dropna()

    # Convert integer valued (numeric) columns to floating point
    numeric_columns = df.select_dtypes(["int32", "float32", "float64"]).columns
    numeric_format = {col:"float32" for col in numeric_columns}
    df.astype(numeric_format)

    return df

def evaluate_model(model, x_true, y_true):
    
    y_true = y_true.compute()
    
    #calculate metrics
    metrics={}
    
    y_score =  model.predict_proba(x_true)[:, 1]
    y_score = y_score.compute()
    fpr, tpr, thr = roc_curve(
         y_true=y_true, y_score=y_score, pos_label=True
    )
    fpr_list = fpr.tolist()[::1000]
    tpr_list = tpr.tolist()[::1000]
    thr_list = thr.tolist()[::1000]

    y_pred = model.predict(x_true)
    y_pred.compute()
    c_matrix = confusion_matrix(y_true, y_pred)
    
    avg_precision_score = round(average_precision_score(y_true, y_score), 3)
    f1 = round(f1_score(y_true, y_pred), 3)
    lg_loss = round(log_loss(y_true, y_pred), 3)
    prec_score = round(precision_score(y_true, y_pred), 3)
    rec_score = round(recall_score(y_true, y_pred), 3)
    
    
    metrics['fpr'] = [round(f, 3) for f in fpr_list]
    metrics['tpr'] = [round(f, 3) for f in tpr_list]
    metrics['thrs'] = [round(f, 3) for f in thr_list]
    metrics['confusion_matrix'] = c_matrix.tolist()
    metrics['avg_precision_score'] = avg_precision_score
    metrics['f1_score'] = f1
    metrics['log_loss'] = lg_loss
    metrics['precision_score'] = prec_score
    metrics['recall_score'] = rec_score
    
    return metrics


def main():
    args = get_args()
    if args.verbose:
        set_logging()
        
    #variables
    bucket = args.bucket.replace('gs://', '/gcs/')
    deliverable_uri = (Path(bucket)/'deliverables')
    metrics_uri = (deliverable_uri/'metrics.json')

    #read data
    train_df = dask_df.read_csv(TRAINING_DATA_PATH, dtype=DATA_SCHEMA)
    test_df = dask_df.read_csv(TEST_DATA_PATH, dtype=DATA_SCHEMA)
    
    #downsampling
    train_nfraud_df = train_df[train_df[LABEL_COLUMN]==0]
    train_fraud_df = train_df[train_df[LABEL_COLUMN]==1]
    train_nfraud_downsample = resample(train_nfraud_df,
                          replace=True, 
                          frac=len(train_fraud_df)/len(train_df))
    
    downsampled_train_df = dask_df.multi.concat([train_nfraud_downsample, train_fraud_df])
    
    #preprocessing
    preprocessed_train_df = preprocess(downsampled_train_df)
    preprocessed_test_df = preprocess(test_df)
    
    #target, features split
    x_train = preprocessed_train_df[preprocessed_train_df.columns.difference([LABEL_COLUMN])]
    y_train = preprocessed_train_df.loc[:, LABEL_COLUMN].astype(int)
    x_true = preprocessed_test_df[preprocessed_test_df.columns.difference([LABEL_COLUMN])]
    y_true = preprocessed_test_df.loc[:, LABEL_COLUMN].astype(int)
    
    #train model
    cluster =  LocalCluster()
    client = Client(cluster)
    model = xgb.dask.DaskXGBClassifier(objective='reg:logistic', eval_metric='logloss')
    model.client = client  # assign the client
    model.fit(x_train, y_train, eval_set=[(x_true, y_true)])
    if not Path(MODEL_DIR).exists():
        Path(MODEL_DIR).mkdir(parents=True, exist_ok=True)
    model.save_model(MODEL_PATH)
    
    #generate metrics
    metrics = evaluate_model(model, x_true, y_true)
    if not Path(deliverable_uri).exists():
        Path(deliverable_uri).mkdir(parents=True, exist_ok=True)
    with open(metrics_uri, 'w') as file:
        json.dump(metrics, file, sort_keys = True, indent = 4)
    file.close()
    
if __name__ == '__main__':
    main()

Writing build_training/train_xgb.py


#### Define a custom image for dask model training

In [19]:
# Create image repo
!gcloud artifacts repositories create $IMAGE_REPOSITORY \
    --repository-format=docker \
    --location=us-central1 \
    --description="Fraud Finder Docker Image repository"

# List repositories under the project
!gcloud artifacts repositories list

E0314 13:26:58.172367902   25880 fork_posix.cc:70]           Fork support is only compatible with the epoll1 and poll polling strategies


Create request issued for: [fraudfinder-7551]
Waiting for operation [projects/transaction-detective22-1559/locations/us-centr
al1/operations/711a4f11-ac47-42ed-98b8-254e80c1706a] to complete...done.       
Created repository [fraudfinder-7551].


E0314 13:27:10.764017976   25880 fork_posix.cc:70]           Fork support is only compatible with the epoll1 and poll polling strategies


Listing items under project transaction-detective22-1559, across all locations.

                                                                ARTIFACT_REGISTRY
REPOSITORY        FORMAT  DESCRIPTION                           LOCATION     LABELS  ENCRYPTION          CREATE_TIME          UPDATE_TIME
fraudfinder-7551  DOCKER  Fraud Finder Docker Image repository  us-central1          Google-managed key  2022-03-14T13:27:07  2022-03-14T13:27:07


In [20]:
!gcloud auth configure-docker us-central1-docker.pkg.dev -q

E0314 13:27:56.549409938   25880 fork_posix.cc:70]           Fork support is only compatible with the epoll1 and poll polling strategies



{
  "credHelpers": {
    "gcr.io": "gcloud",
    "us.gcr.io": "gcloud",
    "eu.gcr.io": "gcloud",
    "asia.gcr.io": "gcloud",
    "staging-k8s.gcr.io": "gcloud",
    "marketplace.gcr.io": "gcloud"
  }
}
Adding credentials for: us-central1-docker.pkg.dev
Docker configuration file updated.


In [21]:
%%writefile build_training/Dockerfile
# Specifies base image and tag
# FROM us-docker.pkg.dev/vertex-ai/training/xgboost-cpu.1-1:latest
FROM python:3.7
WORKDIR /root

# Installs additional packages
RUN pip install gcsfs numpy pandas scikit-learn dask distributed xgboost --upgrade

# Copies the trainer code to the docker image.
COPY ./train_xgb.py /root/train_xgb.py

# Sets up the entry point to invoke the trainer.
ENTRYPOINT ["python3", "train_xgb.py"]

Writing build_training/Dockerfile


In [22]:
# Build and push docker file
!docker build -t $IMAGE_URI ./build_training/
!docker push $IMAGE_URI

E0314 13:28:10.847847945   25880 fork_posix.cc:70]           Fork support is only compatible with the epoll1 and poll polling strategies


Sending build context to Docker daemon  9.728kB
Step 1/5 : FROM python:3.7
3.7: Pulling from library/python

[1B1adff207: Pulling fs layer 
[1B945c672b: Pulling fs layer 
[1B10aec998: Pulling fs layer 
[1B8c754e45: Pulling fs layer 
[1B762e7602: Pulling fs layer 
[1B2e030155: Pulling fs layer 
[1Bdc84acc9: Pulling fs layer 
[1B4992c7bf: Pulling fs layer 
[1BDigest: sha256:b48983bebd0fe1c09639fa008e4cb51aac6277af6c6762fc58ac9d2cb7fc24ef[5A[2K[6A[2K[9A[2K[6A[2K[5A[2K[9A[2K[4A[2K[6A[2K[9A[2K[6A[2K[9A[2K[9A[2K[3A[2K[9A[2K[3A[2K[9A[2K[1A[2K[1A[2K[5A[2K[9A[2K[5A[2K[9A[2K[5A[2K[5A[2K[9A[2K[5A[2K[9A[2KDownloading  170.4MB/196.5MB[9A[2K[5A[2K[9A[2K[5A[2K[9A[2K[9A[2K[9A[2K[9A[2K[9A[2K[8A[2K[8A[2K[8A[2K[8A[2K[7A[2K[7A[2K[7A[2K[7A[2K[6A[2K[6A[2K[6A[2K[6A[2K[6A[2K[6A[2K[6A[2K[6A[2K[6A[2K[6A[2K[6A[2K[6A[2K[6A[2K[6A[2K[6A[2K[6A[2K[6A[2K[6A[2K[5A[2K[5A[2K[5A[2K[5A

E0314 13:29:52.842527906   25880 fork_posix.cc:70]           Fork support is only compatible with the epoll1 and poll polling strategies


The push refers to repository [us-central1-docker.pkg.dev/transaction-detective22-1559/fraudfinder-7551/dask-xgb-classificator]

[1B7992c0a5: Preparing 
[1B3d4afda0: Preparing 
[1Bef05ed8e: Preparing 
[1Bde6b4d9c: Preparing 
[1Be231faac: Preparing 
[1B3ce3bf27: Preparing 
[1B3949bffa: Preparing 
[1B4a8cee1f: Preparing 
[1B4a6f44ae: Preparing 
[1B108b2cba: Preparing 
[10Bd4afda0: Pushed    1.23GB/1.22GBB[11A[2K[9A[2K[10A[2K[9A[2K[7A[2K[10A[2K[8A[2K[9A[2K[7A[2K[9A[2K[6A[2K[9A[2K[9A[2K[5A[2K[9A[2K[10A[2K[7A[2K[6A[2K[9A[2K[7A[2K[10A[2K[5A[2K[10A[2K[5A[2K[10A[2K[6A[2K[6A[2K[6A[2K[10A[2K[6A[2K[5A[2K[10A[2K[5A[2K[10A[2K[5A[2K[7A[2K[5A[2K[6A[2K[7A[2K[4A[2K[7A[2K[4A[2K[10A[2K[7A[2K[5A[2K[7A[2K[5A[2K[10A[2K[5A[2K[10A[2K[5A[2K[7A[2K[5A[2K[4A[2K[10A[2K[7A[2K[5A[2K[4A[2K[5A[2K[10A[2K[4A[2K[10A[2K[7A[2K[3A[2K[4A[2K[10A[2K[4A[2K[10A[2K[5A[2K[4A[2K[5A

#### Submit the script to run on Vertex AI
In this section, we create a training pipeline. It will create custom training jobs, load our dataset and upload the model to Vertex AI after the training job is successfully completed.

In [24]:
job = vertex_ai.CustomContainerTrainingJob(
    display_name=TRAIN_JOB_NAME,
    container_uri=IMAGE_URI,
    model_serving_container_image_uri=MODEL_SERVING_IMAGE_URI,
)

parameters = {"MAX_DEPTH": 4, "ETA": 0.3, "GAMMA": 0.1}

CMDARGS = [ f"""--bucket={BUCKET_NAME}""",
    "--max_depth=" + str(parameters["MAX_DEPTH"]),
    "--eta=" + str(parameters["ETA"]),
    "--gamma=" + str(parameters["GAMMA"]),
    "--verbose"
]


model = job.run(
    dataset=dataset,
    model_display_name=MODEL_NAME,
    args=CMDARGS,
    replica_count=1,
    machine_type=TRAIN_COMPUTE,
    accelerator_count=0)

INFO:google.cloud.aiplatform.training_jobs:Training Output directory:
gs://transaction-detective22-1559/aiplatform-custom-training-2022-03-14-13:40:34.334 
INFO:google.cloud.aiplatform.training_jobs:No dataset split provided. The service will use a default split.
INFO:google.cloud.aiplatform.training_jobs:View Training:
https://console.cloud.google.com/ai/platform/locations/us-central1/training/7855333831052623872?project=130114571864
INFO:google.cloud.aiplatform.training_jobs:CustomContainerTrainingJob projects/130114571864/locations/us-central1/trainingPipelines/7855333831052623872 current state:
PipelineState.PIPELINE_STATE_RUNNING
INFO:google.cloud.aiplatform.training_jobs:CustomContainerTrainingJob projects/130114571864/locations/us-central1/trainingPipelines/7855333831052623872 current state:
PipelineState.PIPELINE_STATE_RUNNING
INFO:google.cloud.aiplatform.training_jobs:CustomContainerTrainingJob projects/130114571864/locations/us-central1/trainingPipelines/7855333831052623872 c

#### Evaluate the model locally

Before you can run the data through the endpoint, you need to preprocess it to match the format that your custom model defined in task.py expects.

In [25]:
LABEL_COLUMN = "tx_fraud"
UNUSED_COLUMNS = ["timestamp","entity_type_event","terminal_id","customer_id","entity_type_customer","entity_type_terminal"]
NA_VALUES = ["NA", "."]
def preprocess(df):
    """Converts categorical features to numeric. Removes unused columns.

    Args:
      df: Pandas df with raw data

    Returns:
      df with preprocessed data
    """
    df = df.drop(columns=UNUSED_COLUMNS)

    # Drop rows with NaN's
    df = df.dropna()

    # Convert integer valued (numeric) columns to floating point
    numeric_columns = df.select_dtypes(["int32", "float32", "float64"]).columns
    df[numeric_columns] = df[numeric_columns].astype("float32")

    dummy_columns = list(df.dtypes[df.dtypes == 'category'].index)
    df = pd.get_dummies(df, columns=dummy_columns)

    return df
#test set
train_sample_path = os.path.join(TRAIN_DATA_DIR, '000000000000.csv')
df_test = pd.read_csv(train_sample_path)
preprocessed_test_Data = preprocess(df_test)

x_test = preprocessed_test_Data[preprocessed_test_Data.columns.drop(LABEL_COLUMN).to_list()].values
y_test = preprocessed_test_Data.loc[:,LABEL_COLUMN].astype(int)

Here we copy the mdoel artifact to the local directory

In [26]:
!gsutil cp -r $model.uri .

Copying gs://transaction-detective22-1559/aiplatform-custom-training-2022-03-14-13:40:34.334/model/model.bst...
/ [1 files][ 81.9 KiB/ 81.9 KiB]                                                
Operation completed over 1 objects/81.9 KiB.                                     


In [27]:
import xgboost as xgb

bst = xgb.Booster()  # init model
bst.load_model('./model/model.bst') 
xgtest = xgb.DMatrix(x_test)
y_pred_prob = bst.predict(xgtest)
y_pred = y_pred_prob.round().astype(int)
y_pred_prob[0:10]

array([0.9832954, 0.9832954, 0.9832954, 0.9832954, 0.9832954, 0.9832954,
       0.9832954, 0.9832954, 0.9832954, 0.9832954], dtype=float32)

In [28]:
from sklearn.metrics import precision_recall_fscore_support
precision_recall_fscore_support(y_test.values, y_pred, average='weighted')

(0.5706887328652624, 0.211, 0.07671063395430816, None)

#### Deploy the model
Before you use your model to make predictions, you need to deploy it to an Endpoint. You can do this by calling the deploy function on the Model resource. This will do two things:

Create an Endpoint resource for deploying the Model resource to.
Deploy the Model resource to the Endpoint resource.

In [29]:
DEPLOY_COMPUTE='n1-standard-4'
TRAFFIC_SPLIT = {"0": 100}

MIN_NODES = 1
MAX_NODES = 1


endpoint = model.deploy(
    deployed_model_display_name=DEPLOYED_NAME,
    traffic_split=TRAFFIC_SPLIT,
    machine_type=DEPLOY_COMPUTE,
    accelerator_count=0,
    min_replica_count=MIN_NODES,
    max_replica_count=MAX_NODES,
)

INFO:google.cloud.aiplatform.models:Creating Endpoint
INFO:google.cloud.aiplatform.models:Create Endpoint backing LRO: projects/130114571864/locations/us-central1/endpoints/2144461090535243776/operations/2828907697301094400
INFO:google.cloud.aiplatform.models:Endpoint created. Resource name: projects/130114571864/locations/us-central1/endpoints/2144461090535243776
INFO:google.cloud.aiplatform.models:To use this Endpoint in another session:
INFO:google.cloud.aiplatform.models:endpoint = aiplatform.Endpoint('projects/130114571864/locations/us-central1/endpoints/2144461090535243776')
INFO:google.cloud.aiplatform.models:Deploying model to Endpoint : projects/130114571864/locations/us-central1/endpoints/2144461090535243776
INFO:google.cloud.aiplatform.models:Deploy Endpoint model backing LRO: projects/130114571864/locations/us-central1/endpoints/2144461090535243776/operations/4405167566880768000
INFO:google.cloud.aiplatform.models:Endpoint model deployed. Resource name: projects/13011457186

#### Test the deployed model (Make an online prediction request)
Send an online prediction request to your deployed model. To make sure your deployed model is working, test it out by sending a request to the endpoint
Let's first get test data

prediction input instances need to be formatted as JSON.

In [30]:
DEPLOYED_NAME

'fraudfinder_xgb_prediction_frmlz-7551-1647254678'

In [31]:
payload = {
  "instances": [
    [0., 0., 0., 0., 1., 0., 1., 0., 0., 0., 0., 3., 0.],
    [1., 2., 0., 2., 1., 0., 1., 0., 0., 0., 0., 3., 0.]]
  
}
#In case you want to test it in the console
import json
with open('predictions.json', 'w', encoding='utf-8') as f:
    json.dump(payload, f, ensure_ascii=False, indent=4)

In [35]:
endpoint.predict(instances = payload['instances'])

Prediction(predictions=[0.09744301438331604, 0.5049852728843689], deployed_model_id='1052531694944387072', explanations=None)

## (DO NOT RUN) Cleaning up

To clean up all Google Cloud resources used in this project, you can [delete the Google Cloud
project](https://cloud.google.com/resource-manager/docs/creating-managing-projects#shutting_down_projects) you used for the tutorial.

Otherwise, you can delete the individual resources you created in this tutorial:



In [34]:
# Delete endpoint resource
#! gcloud ai endpoints delete $ENDPOINT_NAME --quiet --region $REGION_NAME

# Delete model resource
#! gcloud ai models delete $MODEL_NAME --quiet

# Delete Cloud Storage objects that were created
#! gsutil -m rm -r $JOB_DIR