In [1]:
from google.cloud import aiplatform
# Set the Vertex AI SDK defaults (project, region, staging bucket) for the calls made in this notebook.
aiplatform.init(
    project='protean-pipe-385122',
    location='northamerica-northeast2',
    # The SDK documents staging_bucket in the form "gs://..."; a bare bucket name is not a
    # valid Cloud Storage URI. This is the same bucket the notebook uses as BUCKET_URI.
    staging_bucket='gs://doremon123',
)

In [2]:

import random
import string


# Generate a random lowercase-alphanumeric identifier (8 characters by default)
def generate_uuid(length: int = 8) -> str:
    """Return a random string of lowercase ASCII letters and digits.

    Despite the name this is not an RFC 4122 UUID; it is a short random
    suffix built with the non-cryptographic ``random`` module.

    Args:
        length: Number of characters to generate. Defaults to 8.

    Returns:
        A string of exactly ``length`` characters drawn from ``[a-z0-9]``.

    Raises:
        ValueError: If ``length`` is negative.
    """
    if length < 0:
        raise ValueError("length must be non-negative")
    alphabet = string.ascii_lowercase + string.digits
    return "".join(random.choices(alphabet, k=length))


# Per-session suffix appended to resource display names.
UUID = generate_uuid()

In [3]:
from google.cloud import aiplatform as aip



In [4]:
# Cloud Storage URI of the bank-marketing CSV; passed as gcs_source when the dataset is created
# and read again with gsutil to build the batch-prediction sample.
IMPORT_FILE = "gs://cloud-ml-tables-data/bank-marketing.csv"

In [5]:
from google.cloud import aiplatform as aip

# Point the Vertex AI SDK at the project and region used by this notebook.
aip.init(
    project='protean-pipe-385122',
    location='northamerica-northeast2',
)

# Create a tabular dataset from the CSV in Cloud Storage; the session suffix
# is appended to the display name.
dataset_display_name = "_".join(["Bank Marketing", UUID])
dataset = aip.TabularDataset.create(
    display_name=dataset_display_name,
    gcs_source=[IMPORT_FILE],
)

# Print the dataset's resource name.
print(dataset.resource_name)


Creating TabularDataset
Create TabularDataset backing LRO: projects/918002543242/locations/northamerica-northeast2/datasets/8268327440875520000/operations/2497655011702472704
TabularDataset created. Resource name: projects/918002543242/locations/northamerica-northeast2/datasets/8268327440875520000
To use this TabularDataset in another session:
ds = aiplatform.TabularDataset('projects/918002543242/locations/northamerica-northeast2/datasets/8268327440875520000')
projects/918002543242/locations/northamerica-northeast2/datasets/8268327440875520000


In [6]:
# Define (but do not yet run) an AutoML tabular classification job that
# optimizes for log loss.
training_job_name = "bank_" + UUID
dag = aip.AutoMLTabularTrainingJob(
    display_name=training_job_name,
    optimization_objective="minimize-log-loss",
    optimization_prediction_type="classification",
)

print(dag)

<google.cloud.aiplatform.training_jobs.AutoMLTabularTrainingJob object at 0x7f726eae6310>


In [None]:

# Train/validation/test fractions (60% / 20% / 20%).
split_fractions = dict(
    training_fraction_split=0.6,
    validation_fraction_split=0.2,
    test_fraction_split=0.2,
)

# Run the training job on the dataset with POutcome as the label column.
# Early stopping stays enabled (disable_early_stopping=False).
model = dag.run(
    dataset=dataset,
    target_column='POutcome',
    model_display_name="bank_" + UUID,
    budget_milli_node_hours=8000,
    disable_early_stopping=False,
    **split_fractions,
)


No column transformations provided, so now retrieving columns from dataset in order to set default column transformations.
The column transformation of type 'auto' was set for the following columns: ['Age', 'Duration', 'Previous', 'Education', 'MaritalStatus', 'Day', 'Deposit', 'Campaign', 'Contact', 'Housing', 'PDays', 'Loan', 'Month', 'Default', 'Balance', 'Job'].
View Training:
https://console.cloud.google.com/ai/platform/locations/northamerica-northeast2/training/4453576246353199104?project=918002543242
AutoMLTabularTrainingJob projects/918002543242/locations/northamerica-northeast2/trainingPipelines/4453576246353199104 current state:
PipelineState.PIPELINE_STATE_RUNNING
AutoMLTabularTrainingJob projects/918002543242/locations/northamerica-northeast2/trainingPipelines/4453576246353199104 current state:
PipelineState.PIPELINE_STATE_RUNNING
AutoMLTabularTrainingJob projects/918002543242/locations/northamerica-northeast2/trainingPipelines/4453576246353199104 current state:
PipelineSta

In [None]:

# Fetch the evaluations attached to the trained model.
model_evaluations = model.list_model_evaluations()

# Print each evaluation as a plain dictionary.
for evaluation in model_evaluations:
    evaluation_as_dict = evaluation.to_dict()
    print(evaluation_as_dict)

In [10]:
# Cloud Storage bucket URI: holds the batch-prediction input file, receives the prediction
# output, and is what the cleanup cell removes when bucket deletion is enabled.
BUCKET_URI = "gs://doremon123"

In [67]:
# Build a small local sample of the source CSV: the header row followed by its last 10 rows.
! gsutil cat $IMPORT_FILE | head -n 1 > tmp.csv
! gsutil cat $IMPORT_FILE | tail -n 10 >> tmp.csv

# Keep comma-separated fields 1 through 17 as the batch-prediction input file.
# NOTE(review): the recorded df.shape and df_tmp.shape outputs are both (10, 17), so this keeps
# every column, including POutcome (the target_column used for training). Confirm whether the
# target column was meant to be dropped from the batch input here.
! cut -d, -f1-17 tmp.csv > batch.csv

# Object path inside BUCKET_URI where the batch input is uploaded.
gcs_input_uri = BUCKET_URI + "/test.csv"

# Copy the local batch file to Cloud Storage.
! gsutil cp batch.csv $gcs_input_uri

Copying file://batch.csv [Content-Type=text/csv]...
/ [1 files][  949.0 B/  949.0 B]                                                
Operation completed over 1 objects/949.0 B.                                      


In [68]:
# Start a batch prediction over the uploaded CSV. Results are written as JSONL
# under the bucket, with explanations requested. sync=False returns
# immediately instead of blocking until the job finishes.
batch_job_name = "bank_" + UUID
batch_predict_job = model.batch_predict(
    job_display_name=batch_job_name,
    instances_format="csv",
    gcs_source=gcs_input_uri,
    predictions_format="jsonl",
    gcs_destination_prefix=BUCKET_URI,
    generate_explanation=True,
    sync=False,
)

print(batch_predict_job)

Creating BatchPredictionJob
<google.cloud.aiplatform.jobs.BatchPredictionJob object at 0x7f728fb6d150> is waiting for upstream dependencies to complete.


In [None]:
# The job was started with sync=False; wait here for it to complete before reading its outputs.
batch_predict_job.wait()

In [None]:
import pandas as pd

# Load the local batch-prediction input file (written by the `cut` step) for inspection.
df = pd.read_csv("batch.csv")


In [None]:
# Preview the first rows of the batch-prediction input.
df.head()

In [57]:
# Load the intermediate sample (header + last 10 source rows) to compare against batch.csv.
df_tmp = pd.read_csv("tmp.csv")

In [58]:
# Preview the first rows of the intermediate sample.
df_tmp.head()

Unnamed: 0,Age,Job,MaritalStatus,Education,Default,Balance,Housing,Loan,Contact,Day,Month,Duration,Campaign,PDays,Previous,POutcome,Deposit
0,53,management,married,tertiary,no,583,no,no,cellular,17,nov,226,1,184,4,success,2
1,34,admin.,single,secondary,no,557,no,no,cellular,17,nov,224,1,-1,0,unknown,2
2,23,student,single,tertiary,no,113,no,no,cellular,17,nov,266,1,-1,0,unknown,2
3,73,retired,married,secondary,no,2850,no,no,cellular,17,nov,300,1,40,8,failure,2
4,25,technician,single,secondary,no,505,no,yes,cellular,17,nov,386,2,-1,0,unknown,2


In [59]:
# (rows, columns) of the batch-prediction input.
df.shape

(10, 17)

In [60]:
# (rows, columns) of the intermediate sample, for comparison with df.shape.
df_tmp.shape

(10, 17)

In [61]:
# Column names, non-null counts and dtypes of the batch-prediction input.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 17 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Age            10 non-null     int64 
 1   Job            10 non-null     object
 2   MaritalStatus  10 non-null     object
 3   Education      10 non-null     object
 4   Default        10 non-null     object
 5   Balance        10 non-null     int64 
 6   Housing        10 non-null     object
 7   Loan           10 non-null     object
 8   Contact        10 non-null     object
 9   Day            10 non-null     int64 
 10  Month          10 non-null     object
 11  Duration       10 non-null     int64 
 12  Campaign       10 non-null     int64 
 13  PDays          10 non-null     int64 
 14  Previous       10 non-null     int64 
 15  POutcome       10 non-null     object
 16  Deposit        10 non-null     int64 
dtypes: int64(8), object(9)
memory usage: 1.5+ KB


In [62]:
# Column names, non-null counts and dtypes of the intermediate sample.
df_tmp.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 17 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Age            10 non-null     int64 
 1   Job            10 non-null     object
 2   MaritalStatus  10 non-null     object
 3   Education      10 non-null     object
 4   Default        10 non-null     object
 5   Balance        10 non-null     int64 
 6   Housing        10 non-null     object
 7   Loan           10 non-null     object
 8   Contact        10 non-null     object
 9   Day            10 non-null     int64 
 10  Month          10 non-null     object
 11  Duration       10 non-null     int64 
 12  Campaign       10 non-null     int64 
 13  PDays          10 non-null     int64 
 14  Previous       10 non-null     int64 
 15  POutcome       10 non-null     object
 16  Deposit        10 non-null     int64 
dtypes: int64(8), object(9)
memory usage: 1.5+ KB


In [63]:
# Display the batch prediction job object (its repr includes the job's resource name).
batch_predict_job

<google.cloud.aiplatform.jobs.BatchPredictionJob object at 0x7f71ee1c2610> 
resource name: projects/918002543242/locations/northamerica-northeast2/batchPredictionJobs/2533635430209814528

In [None]:
import tensorflow as tf

# Iterate over the blobs the batch prediction job wrote.
bp_iter_outputs = batch_predict_job.iter_outputs()

# Keep only the blob names whose final path component starts with "explanation".
explanation_results = [
    output_blob.name
    for output_blob in bp_iter_outputs
    if output_blob.name.split("/")[-1].startswith("explanation")
]

tags = []
for explanation_result in explanation_results:
    # Rebuild the full Cloud Storage URI from the iterator's bucket and the blob name.
    gfile_name = f"gs://{bp_iter_outputs.bucket.name}/{explanation_result}"
    with tf.io.gfile.GFile(name=gfile_name, mode="r") as gfile:
        # Print every line of the explanation file.
        output_lines = gfile.readlines()
        for line in output_lines:
            print(line)

In [None]:
import os
# Set this to true only if you'd like to delete your bucket
delete_bucket = False

dataset.delete()
model.delete()
batch_predict_job.delete()

if delete_bucket or os.getenv("IS_TESTING"):
    ! gsutil rm -r $BUCKET_URI