In [4]:
!pip3 install google-cloud-aiplatform --upgrade



In [5]:
!pip3 install kfp google-cloud-pipeline-components==0.1.5 --upgrade



In [15]:
PROJECT_ID = "my-project-timeseries-pp"

DATA_BUCKET = 'new-hub-data-bucket'
REGION = "us-central1"
SETTING_FILE = 'Model_Settings.txt'

In [16]:
from datetime import datetime
from google.cloud import storage

TIMESTAMP = datetime.now().strftime("%Y%m%d%H%M%S")

BUCKET_NAME = TIMESTAMP + "-bucket"
print(BUCKET_NAME)
# !gsutil mb -l $REGION $BUCKET_NAME
storage_client = storage.Client(project=PROJECT_ID)
bucket = storage_client.bucket(BUCKET_NAME)
bucket.create(location = REGION)

20210907074707-bucket


In [17]:
from typing import NamedTuple

import kfp
from kfp import dsl
from kfp.v2 import compiler
from kfp.v2.dsl import (Artifact, Dataset, Input, InputPath, Model, Output,
                        OutputPath, component)
from kfp.v2.google.client import AIPlatformClient
import google.cloud
from google.cloud import aiplatform as ap
from google_cloud_pipeline_components import aiplatform as gcc_aip



In [18]:
PATH=%env PATH
%env PATH={PATH}:/home/jupyter/.local/bin


PIPELINE_ROOT = f"gs://{BUCKET_NAME}/pipeline_root/"
PIPELINE_ROOT

env: PATH=/usr/local/cuda/bin:/opt/conda/bin:/opt/conda/condabin:/usr/local/bin:/usr/bin:/bin:/usr/local/games:/usr/games:/home/jupyter/.local/bin:/home/jupyter/.local/bin


'gs://20210907074707-bucket/pipeline_root/'

In [68]:
@component(
    base_image="gcr.io/deeplearning-platform-release/tf2-cpu.2-3:latest"
)
def initialize(
    PROJECT_ID: str,
    setting_file: str,
    REGION: str,
    bucket_name: str
        
) -> NamedTuple(
    "Outputs",
    [
        ("PROJECT_ID", str),  # Return parameters
        ("REGION", str),
        ("TIMESTAMP", str),  # Return parameters
        ("API_ENDPOINT", str),
        ("BQ_PATH", str),
        ("forecast_file", str),  # Return parameters
        ("number_prediciton_units", str),
        ("forecasts_start_date", str),
        ("actuals_file", str),
        ("drivers_file", str),
        ("calendar_file", str),
        ("files_from_datahub_in", str),
        ("model_display_name", str),
        ("dataset_train_name", str),
        ("training_pipeline_name", str),
        ("bq_dataset_id_preds", str),
        ("bq_table_name_preds", str),
        ("budget_milli_node_hours", str),
        ("data_granularity_unit", str),
        ("data_granularity_count", str),
        ("optimization_objective", str),
        ("pred_op_path", str),
        ("past_pred_op_path", str),
        ("pred_batchjob_disp_name", str),
        ("past_pred_batchjob_disp_name", str)
    ],
):
    from datetime import datetime
    from google.cloud import storage
    TIMESTAMP = datetime.now().strftime("%Y%m%d%H%M%S")
    
    # API Endpoint
    #API_ENDPOINT = "us-central1-aiplatform.googleapis.com"
    API_ENDPOINT = "{}-aiplatform.googleapis.com".format(REGION)

    
    BQ_PATH = "bq://"+PROJECT_ID+":"+bucket_name.replace("-", "_")+ ":evaluated_data"
    
    
    # reading settings.txt file from google cloud storage

    storage_client = storage.Client()
    bucket = storage_client.get_bucket(bucket_name)

    blob = bucket.blob(setting_file)
    blob = blob.download_as_string()
    blob = blob.decode('utf-8')
    lines = blob.splitlines()
        
    values = {k: v for k, v in (line.split('\t') for line in lines)}   
        
    actuals_file = values["GCP Day Actual File Name"].strip()

    drivers_file = values["GCP Day Driver File"].strip()
    
    forecast_file = "prediction_output.csv"

    calendar_file = values["GCP Calendar File"].strip()

    actuals_start_date = values["First Day of Actual"].strip()

    forecasts_start_date = values["First Day of Forecast"].strip()

    forecasts_last_date = values["Last Day of Forecast"].strip()

    number_prediciton_units = values["Final: # of Periods to Forecast"].strip()
    
    files_from_datahub_in  = values["Load Actuals and Drivers from Data Hub"].strip()
    
    model_display_name  = values["GCP Model Name"].strip()
    
    dataset_train_name = "automl-train-" + values["GCP Dataset Name"].strip()
    
    training_pipeline_name = dataset_train_name + "-pipeline" 
    
    bq_dataset_id_preds = values["GCP Dataset Name"].strip()
    
    bq_table_name_preds = bq_dataset_id_preds + "_predictions"
    
    budget_milli_node_hours = str(int(values["GCP Computing hours"].strip())*1000)
    
    data_granularity_unit = values["Time Level"].strip().lower() #day, minute, hour, week, month, year
    
    data_granularity_count = "1"
    
    optimization_objective = values["GCP Optimization"].strip()
    
    
    
    #create temporary bucket
    bucket_name_temp = bucket_name + "-temp" 
    storage_client = storage.Client(project=PROJECT_ID)
    bucket_temp = storage_client.bucket(bucket_name_temp)
    bucket_temp.location = REGION
    if bucket_temp.exists():
        pass
    else: 
        bucket_temp.create()
    
    pred_op_path = 'gs://' + bucket_name_temp + "/" + TIMESTAMP + "/" + "prediction"
    
    past_pred_op_path = 'gs://' + bucket_name_temp + "/" +  TIMESTAMP + "/" + "past-prediction"
    
    pred_batchjob_disp_name = model_display_name + '-batch'
    
    past_pred_batchjob_disp_name = model_display_name + '-past-batch'

   
 
    return (PROJECT_ID, REGION, TIMESTAMP, API_ENDPOINT, BQ_PATH, forecast_file, number_prediciton_units, forecasts_start_date, actuals_file, drivers_file, calendar_file, files_from_datahub_in, model_display_name, dataset_train_name, training_pipeline_name, bq_dataset_id_preds, bq_table_name_preds, budget_milli_node_hours, data_granularity_unit, data_granularity_count, optimization_objective, pred_op_path, past_pred_op_path, pred_batchjob_disp_name, past_pred_batchjob_disp_name)

In [69]:
@component(
    base_image="gcr.io/deeplearning-platform-release/tf2-cpu.2-3:latest"
)
def transform_data(
    bucket_name: str,
    project_id: str,
    region: str,
    TIMESTAMP: str,
    actuals_file: str,
    drivers_file: str,
    calendar_file: str,
    forecasts_start_date: str,
    number_prediciton_units: str,
    files_from_datahub_in: str
    
) -> NamedTuple(
    "Outputs",
    [
        ("drivers_train_file", str),  # Return parameters
        ("drivers_prediction_file", str),
        ("drivers_list_attrib", str),
        ("drivers_past_prediction_file", str),
        ("transformations_attrib", str)
    ],
):
    
    import os
    import pandas as pd
    import numpy  as np
    import datetime
    import calendar
    from dateutil.relativedelta import relativedelta
    #initializing variables
    
    
    bucket_path = 'gs://' + bucket_name + '/'
   
    bucket_preprocessed_path = 'gs://' + bucket_name + '-temp' + '/' + TIMESTAMP+ '/' + 'preprocessed-data' + '/'
    
    
    
    if files_from_datahub_in == 'true':
        files_from_datahub = True
    else:
        files_from_datahub = False

    
    # reading the actuals file
    df_actuals = pd.read_csv(bucket_path + actuals_file, sep=',', skiprows=[0], header= [0], low_memory=False)
    print("test")
    #converting the column's format
    new_columns = []
    for i in df_actuals.columns:
        if i == "Unnamed: 0":
            new_columns.append("id")
        else:
            new_columns.append(datetime.datetime.strptime(i, '%d %b %y'))
    
    df_actuals.columns = new_columns
    #Find number of days from forecast start date to last day in the actuals file. 
    # This step is done to get the actual only till one day before forecasting start date. 
    # E.g. m5 data contains data until May 22, 2019. We are omitting the data starting from forecasting period start date.
    # Max date of the list
    max_date = max(d for d in new_columns if isinstance(d, datetime.date))
    formatted_forecast_start_date = datetime.datetime.strptime(forecasts_start_date, '%Y-%m-%d')
    diff_days = max_date - formatted_forecast_start_date

    # Find number of date columns in actuals
    full_number_columns = len(df_actuals.columns)
    number_columns = full_number_columns - diff_days.days - 1
    
    # transforming the data format and writing it to "_transformed" file
    df_actuals_transformed = pd.melt(df_actuals, id_vars=df_actuals.columns[0], value_vars=df_actuals.columns[1:number_columns], var_name='date', value_name='label')
    df_actuals_transformed.to_csv(bucket_preprocessed_path + actuals_file.split('.')[0] + '_transformed.csv', index=False)


    #Work with Calendar file to get the events
    calendar_input_file = bucket_path + calendar_file
    #calendar_transformed_file = bucket_preprocessed_path + calendar_file.split(' ')[0] + '_'+ calendar_file.split(' ')[1].split('.')[0] + TIMESTAMP + '_transformed.csv'
    calendar_transformed_file = bucket_preprocessed_path + calendar_file.split('.')[0] + '_transformed.csv'
    # reading the calendar file
    df_calendar = pd.read_csv(calendar_input_file, sep=',', skiprows=[-1], header= [0], low_memory=False)
   
    # transform calendar file
    df_calendar_transformed = df_calendar.rename(columns = {'Unnamed: 0': 'date'}, inplace = False)
    df_calendar_transformed['date'] = pd.to_datetime(df_calendar_transformed['date'])

    #Dropping the event_types if not necessary.
    df_calendar_transformed = df_calendar_transformed.drop(columns=['event_type_1', 'event_type_2'])

    #Save transformed events per date to csv
    df_calendar_transformed.to_csv(calendar_transformed_file, index=False)

    drivers_input_file = bucket_path + drivers_file
    drivers_train_file = bucket_preprocessed_path + drivers_file.split('.')[0] + '_train.csv'
    drivers_prediction_file = bucket_preprocessed_path + drivers_file.split('.')[0] + '_prediction.csv'

    # reading the drivers file
    # skipping first 2 rows
    if files_from_datahub:
        df_drivers = pd.read_csv( drivers_input_file, sep=',', skiprows=[-1], header= [0], low_memory=False)
    else:
        df_drivers = pd.read_csv( drivers_input_file, sep=',', skiprows=[0], header= [0], low_memory=False)


    #converting the column's format
    new_columns = []
    for i in df_drivers.columns:
        if i == "Unnamed: 0":
            new_columns.append("drivers")
        elif i == "Unnamed: 1":
            new_columns.append("id")
        else:
            new_columns.append(datetime.datetime.strptime(i, '%d %b %y'))

    df_drivers.columns = new_columns

    # splitting the drivers data to train and prediction data
    last_column_index_train = number_columns + 1

    #Find the indices of labelled and unlabelled prediction data to send to prediction batch job
    first_column_index_test = last_column_index_train - int(number_prediciton_units)
    last_column_index_test = last_column_index_train + int(number_prediciton_units)


    calendar_columns = []
    drivers_train = []
    drivers_prediction = []
    drivers_list = []
    transformations = []

    for i in df_calendar_transformed.columns:
        if i != "date":
            calendar_columns.append(i.lower().replace(' ', '_'))
        else:
            pass

    for driver, df_driver in df_drivers.groupby('drivers'):
        drivers_list.append(driver.lower().replace(' ', '_'))
        drivers_train.append(pd.melt(df_driver, id_vars=df_driver.columns[1], value_vars=df_driver.columns[2:last_column_index_train], var_name='date', value_name=driver))
        drivers_prediction.append(pd.melt(df_driver, id_vars=df_driver.columns[1], value_vars=df_driver.columns[first_column_index_test:last_column_index_test], var_name='date', value_name=driver))

    #Add more events/features/drivers from calendar dataset to the events in the drivers dataset.
    drivers_list.extend(calendar_columns)

    #Transforming and merging the drivers data together

    drivers_train_merged_1 = drivers_train[0]

    for df_ in drivers_train[1:]:  
        drivers_train_merged_1 = drivers_train_merged_1.merge(df_, how="outer", on =['id', 'date'])

    # add more drivers to training file
    drivers_train_merged = drivers_train_merged_1.merge(df_calendar_transformed, how="left", on =['date'])

    # transform prediction file
    drivers_prediction_merged_1 = drivers_prediction[0]


    for df_ in drivers_prediction[1:]:  
        drivers_prediction_merged_1 = drivers_prediction_merged_1.merge(df_, how="outer", on =['id', 'date'])

    # add calendar events to prediction drivers
    drivers_prediction_merged = drivers_prediction_merged_1.merge(df_calendar_transformed, how="left", on =['date'])


    # writing the drivers for the prediction part directly to a file 
    #NaN = np.nan
    #drivers_prediction_merged["label"] = NaN
    drivers_predictions_merged_labels = drivers_prediction_merged.merge(df_actuals_transformed, how="left", on =['id', 'date'])
    drivers_predictions_merged_labels.columns = [x.lower().replace(' ', '_') for x in drivers_predictions_merged_labels.columns]
    drivers_predictions_merged_labels.to_csv(drivers_prediction_file, index=False)


    # merging the drivers train part with Actuals data and write it to GCS
    drivers_train_final = df_actuals_transformed.merge(drivers_train_merged, how="outer", on=["id", "date"])
    drivers_train_final.columns = [x.lower().replace(' ', '_') for x in drivers_train_final.columns]
    drivers_train_final.to_csv(drivers_train_file, index=False)

    ## Prepare past prediction file
    drivers_past_prediction_file = bucket_preprocessed_path + drivers_file.split('.')[0] + '_past_prediction.csv'

    # reading the actuals file
    df_actuals = pd.read_csv(bucket_path + actuals_file, sep=',', skiprows=[0], header= [0], low_memory=False)

    #converting the column's format
    new_columns = []
    for i in df_actuals.columns:
        if i == "Unnamed: 0":
            new_columns.append("id")
        else:
            new_columns.append(datetime.datetime.strptime(i, '%d %b %y'))

    df_actuals.columns = new_columns

    max_date_past = max(d for d in new_columns if isinstance(d, datetime.date))
    formatted_past_forecast_start_date = datetime.datetime.strptime(forecasts_start_date, '%Y-%m-%d')
    diff_days_past = max_date_past - formatted_past_forecast_start_date


    # Find number of date columns in actuals
    full_number_columns_past = len(df_actuals.columns)
    number_columns_past = full_number_columns_past - diff_days_past.days - int(number_prediciton_units)


    # transforming the data format and writing it to "_transformed" file
    df_actuals_transformed_past = pd.melt(df_actuals, id_vars=df_actuals.columns[0], value_vars=df_actuals.columns[1:number_columns_past], var_name='date', value_name='label')


    # splitting the drivers data to train and prediction data
    last_column_index_train = number_columns_past 
    first_column_index_test = last_column_index_train - int(number_prediciton_units)
    last_column_index_test = last_column_index_train + int(number_prediciton_units)

    # Now prepare the predictions file for the past period.
    # Create past data prediction file indices for labelled and unlabelled data.
    past_first_column_index_test = first_column_index_test - int(number_prediciton_units)
    past_last_column_index_test = last_column_index_test + int(number_prediciton_units)



    #  Working with the past predictions file for the past period.

    drivers_prediction_past = []

    for driver, df_driver in df_drivers.groupby('drivers'):
        #drivers_prediction_past.append(pd.melt(df_driver, id_vars=df_driver.columns[1], value_vars=df_driver.columns[first_column_index_test:past_last_column_index_test], var_name='date', value_name=driver))
        drivers_prediction_past.append(pd.melt(df_driver, id_vars=df_driver.columns[1], value_vars=df_driver.columns[first_column_index_test:last_column_index_test], var_name='date', value_name=driver))

    # transform prediction file
    drivers_prediction_past_merged = drivers_prediction_past[0]

    for df_ in drivers_prediction_past[1:]:  
        drivers_prediction_past_merged = drivers_prediction_past_merged.merge(df_, how="outer", on =['id', 'date'])

    # add calendar events to prediction drivers
    drivers_prediction_past_merged = drivers_prediction_past_merged.merge(df_calendar_transformed, how="left", on =['date'])

    # writing the drivers for the prediction part directly to a file 
    #NaN = np.nan
    #drivers_prediction_merged["label"] = NaN
    drivers_prediction_past_merged_labels = drivers_prediction_past_merged.merge(df_actuals_transformed_past, how="left", on =['id', 'date'])
    drivers_prediction_past_merged_labels.columns = [x.lower().replace(' ', '_') for x in drivers_prediction_past_merged_labels.columns]
    drivers_prediction_past_merged_labels.to_csv(drivers_past_prediction_file, index=False)
    
    #import copy
    import json
    #drivers = copy.copy(drivers_list)
    transformations = [
        #{"auto": {"column_name": "id"}},
        {"auto": {"column_name": "date"}},
        {"auto": {"column_name": "label"}}
        ]

    for driver in drivers_list:
        driver_json = '{"auto": { "column_name": "'+ driver.lower().replace(' ', '_') + '"}}'
        transformations.append(json.loads(driver_json))
    
    transformations_attrib = json.dumps(transformations)
    #transformations_1 = str(transformations).replace("[]","")
    
    driverslist = drivers_list
    driverslist.append("date")
    
    drivers_list_attrib = json.dumps(driverslist)
    
    return (drivers_train_file, drivers_prediction_file, drivers_list_attrib,  drivers_past_prediction_file, transformations_attrib)

In [70]:
@component(
    base_image="gcr.io/deeplearning-platform-release/tf2-cpu.2-3:latest",
    packages_to_install = [
        "google-cloud-aiplatform"
    ],
)
def model_evaluation(
    model: Input[Model],
    API_ENDPOINT: str
):
    from google.cloud import aiplatform
    from google.cloud.aiplatform import gapic as aip
    from google.protobuf import json_format
    import json
    import logging
    
    client_options = {"api_endpoint": API_ENDPOINT}
    print("model name parameter: ", model)
    def create_model_client():
        client = aip.ModelServiceClient(
            client_options=client_options
        )
        return client
    client = create_model_client()
    
    def list_model_evaluations(name):
        response = client.list_model_evaluations(parent=name)
        for evaluation in response:
            print("model_evaluation")
            print(" name:", evaluation.name)
            print(" metrics_schema_uri:", evaluation.metrics_schema_uri)
            metrics = json_format.MessageToDict(evaluation._pb.metrics)
            for metric in metrics.keys():
                print(metric)
            print('rootMeanSquaredError', metrics['rootMeanSquaredError'])
            print('meanAbsoluteError', metrics['meanAbsoluteError'])

        return evaluation.name

    logging.getLogger().setLevel(logging.INFO)
    #aiplatform.init(project=project)
    # extract the model resource name from the input Model Artifact
    model_resource_path = model.uri.replace("aiplatform://v1/", "")
    logging.info("model path: %s", model_resource_path)

    #last_evaluation = list_model_evaluations(model_to_deploy_name)
    last_evaluation = list_model_evaluations(model_resource_path)
    def model_evaluation(name):
        response = client.get_model_evaluation(name=name)
        print("response")
        print(" name:", response.name)
        print(" metrics_schema_uri:", response.metrics_schema_uri)
        print(" metrics:", json_format.MessageToDict(response._pb.metrics))
        print(" create_time:", response.create_time)


    model_evaluation(last_evaluation)

In [75]:
@component(
    base_image="gcr.io/deeplearning-platform-release/tf2-cpu.2-3:latest"
)
def append_results(
    MODEL_NAME: str,
    forecast_file: str,
    BUCKET_NAME: str,
    BUCKET_NAME_PAST: str,
    TIMESTAMP: str,
    current_batch_prediction_job: Input[Artifact],
    past_batch_prediction_job: Input[Artifact]
    
) -> str:
    import pandas as pd

    prediction_path = BUCKET_NAME + "/prediction-"+MODEL_NAME.replace("-", "_")+"*/predictions*.csv"

    print(prediction_path)
    predictions = pd.read_csv(prediction_path, usecols= ['date','id','predicted_label'])
    predictions.columns = ['date','id',MODEL_NAME]

    past_prediction_path = BUCKET_NAME_PAST + "/prediction-"+MODEL_NAME.replace("-", "_")+"*/predictions*.csv"
    past_predictions = pd.read_csv(past_prediction_path, usecols= ['date','id','predicted_label'])
    past_predictions.columns = ['date','id',MODEL_NAME]


    final_result = predictions.append(past_predictions)
    final_result.sort_values(by=['date']).to_csv(BUCKET_NAME + "/" + forecast_file, index=False)

    # Final result file path:
    final_result_path = BUCKET_NAME + "/" + forecast_file
    return final_result_path

In [76]:
@component(
    base_image="gcr.io/deeplearning-platform-release/tf2-cpu.2-3:latest"
)
def load_final_result_to_big_query(
    forecasts_start_date: str,
    REGION: str,
    final_result_path: str,
    project_id: str,
    bq_dataset_id_preds: str,
    bq_table_name_preds: str
):
    from google.cloud import bigquery
    from google.cloud import storage
    from google.cloud.exceptions import NotFound
    


    # Make clients.
    bq_client = bigquery.Client(project=project_id, location=REGION)
   
    print("Client creating using default project: {}".format(bq_client.project))
    
  
    # Define a name for the new dataset.
    dataset_id = bq_dataset_id_preds # BigQuery dataset name. Replace with your BQ dataset name.


    try:
        dataset = bq_client.get_dataset(dataset_id)  # Make an API request.
        print("Dataset {} already exists".format(dataset_id))
    except NotFound:
        print("Dataset {} is not found".format(dataset_id))
        #The project defaults to the Client's project if not specified.
        dataset = bq_client.create_dataset(dataset_id)  # API request
        print("Created Dataset {}".format(dataset_id))

      
    ## Set up variables for BigQuery load: 
    FINAL_TABLE_NAME = bq_table_name_preds
    
    # Configure the load job to load from Google Cloud Storage bucket. Here, we create the schema of table and load the data from a GCS bucket.
    job_config = bigquery.LoadJobConfig(
        schema=[
            bigquery.SchemaField('date', 'DATE'),
            bigquery.SchemaField('id', 'STRING'),
            bigquery.SchemaField('predictions', 'FLOAT')
        ],
        skip_leading_rows=1,
        # The source format defaults to CSV. The line below is optional.
        source_format=bigquery.SourceFormat.CSV
    )
    uri = final_result_path
    destination_table_ref = dataset.table(FINAL_TABLE_NAME)
    table_ref = dataset.table(FINAL_TABLE_NAME)
    
    table_id="{}.{}.{}".format(project_id,dataset_id,FINAL_TABLE_NAME)
    try:
        bq_client.get_table(table_id)  # Make an API request.
        print("Table {} already exists.".format(table_id))
        bq_client.delete_table(table_id)
    except NotFound:
        print("Table {} is not found.".format(table_id))

    # Start the load job
    load_job = bq_client.load_table_from_uri(
        uri, destination_table_ref, job_config=job_config)
    print('Starting job {}'.format(load_job.job_id))

    load_job.result()  # Waits for table load to complete.
    print('Job finished.')

    # Retreive the destination table
    destination_table = bq_client.get_table(table_ref)
    print('Loaded {} rows.'.format(destination_table.num_rows))

In [77]:
@dsl.pipeline(
    # Default pipeline root. You can override it when submitting the pipeline.
    pipeline_root=PIPELINE_ROOT,
    # A name for the pipeline. Use to determine the pipeline Context.
    name="pipeline-test-1"
)
def pipeline( 
    project: str, 
    bucket_name: str,
    region: str,
    setting_file: str
):
    variables = initialize(project, setting_file, region, bucket_name)
    transformed_data = transform_data(
        bucket_name,  
        project, 
        region, 
        variables.outputs["TIMESTAMP"],
        variables.outputs["actuals_file"],
        variables.outputs["drivers_file"],
        variables.outputs["calendar_file"],
        variables.outputs["forecasts_start_date"],
        variables.outputs["number_prediciton_units"],
        variables.outputs["files_from_datahub_in"]).set_memory_limit('64G')
    dataset_create_op = gcc_aip.TimeSeriesDatasetCreateOp(
        project=project, 
        display_name=variables.outputs["dataset_train_name"], 
        gcs_source=transformed_data.outputs["drivers_train_file"], 
        location=region)
    training_op = gcc_aip.AutoMLForecastingTrainingJobRunOp(
        project=project,
        location=region,
        display_name=variables.outputs["training_pipeline_name"],
        dataset=dataset_create_op.outputs["dataset"],
        column_transformations=transformed_data.outputs["transformations_attrib"],
        optimization_objective=variables.outputs["optimization_objective"],
        budget_milli_node_hours=variables.outputs["budget_milli_node_hours"], #needs to be modified using new parameters
        target_column="label", 
        time_column="date",
        time_series_identifier_column="id",
        unavailable_at_forecast_columns=["label"],
        available_at_forecast_columns=transformed_data.outputs["drivers_list_attrib"],
        forecast_horizon=variables.outputs["number_prediciton_units"],
        data_granularity_unit=variables.outputs["data_granularity_unit"],
        data_granularity_count=variables.outputs["data_granularity_count"],
        #predefined_split_column_name=
        #weight_column=
        time_series_attribute_columns=None,
        context_window=variables.outputs["number_prediciton_units"],
        export_evaluated_data_items=True, #False
        export_evaluated_data_items_bigquery_destination_uri=variables.outputs["BQ_PATH"],
        #"bq://"+project+":"+bucket_name.replace("-", "_")+ ":evaluated_data",
        export_evaluated_data_items_override_destination=True,
        #quantiles=
        #validation_options=
        model_display_name=variables.outputs["model_display_name"])
    model_evaluation_op = model_evaluation(training_op.outputs["model"], variables.outputs["API_ENDPOINT"])
    current_batch_predict_op = gcc_aip.ModelBatchPredictOp(
        model=training_op.outputs["model"], 
        gcs_source=transformed_data.outputs["drivers_prediction_file"], 
        gcs_destination_prefix=variables.outputs["pred_op_path"],
        instances_format= 'csv',
        predictions_format= 'csv',
        job_display_name=variables.outputs["pred_batchjob_disp_name"])
    past_batch_predict_op = gcc_aip.ModelBatchPredictOp(
        model=training_op.outputs["model"], 
        gcs_source=transformed_data.outputs["drivers_past_prediction_file"], 
        gcs_destination_prefix=variables.outputs["past_pred_op_path"],
        instances_format= 'csv',
        predictions_format= 'csv',
        job_display_name=variables.outputs["past_pred_batchjob_disp_name"])
    append_results_op = append_results(
        variables.outputs["model_display_name"],
        variables.outputs["forecast_file"],
        variables.outputs["pred_op_path"],
        variables.outputs["past_pred_op_path"],
        variables.outputs["TIMESTAMP"],
        current_batch_predict_op.outputs["batchpredictionjob"],
        past_batch_predict_op.outputs["batchpredictionjob"]
    )
    load_final_result_to_big_query(
        variables.outputs["forecasts_start_date"],
        variables.outputs["REGION"],
        append_results_op.output,
        variables.outputs["PROJECT_ID"],
        variables.outputs["bq_dataset_id_preds"],
        variables.outputs["bq_table_name_preds"]
    )
compiler.Compiler().compile(pipeline_func=pipeline,
        package_path='test_pipe.json')

In [78]:
api_client = ap.PipelineJob(
            display_name="pipeline-job-test",
            template_path="test_pipe.json",
            pipeline_root=PIPELINE_ROOT,
            project=PROJECT_ID,
            location=REGION,
            enable_caching= False,
            parameter_values={"project": PROJECT_ID, "bucket_name": DATA_BUCKET, "region": REGION, "setting_file": SETTING_FILE}
            
        )

# Run the pipeline specifying the Service Account
api_client.run()

INFO:google.cloud.aiplatform.pipeline_jobs:Creating PipelineJob
INFO:google.cloud.aiplatform.pipeline_jobs:PipelineJob created. Resource name: projects/932447320071/locations/us-central1/pipelineJobs/pipeline-test-1-20210907124011
INFO:google.cloud.aiplatform.pipeline_jobs:To use this PipelineJob in another session:
INFO:google.cloud.aiplatform.pipeline_jobs:pipeline_job = aiplatform.PipelineJob.get('projects/932447320071/locations/us-central1/pipelineJobs/pipeline-test-1-20210907124011')
INFO:google.cloud.aiplatform.pipeline_jobs:View Pipeline Job:
https://console.cloud.google.com/vertex-ai/locations/us-central1/pipelines/runs/pipeline-test-1-20210907124011?project=932447320071
INFO:google.cloud.aiplatform.pipeline_jobs:PipelineJob projects/932447320071/locations/us-central1/pipelineJobs/pipeline-test-1-20210907124011 current state:
PipelineState.PIPELINE_STATE_RUNNING
INFO:google.cloud.aiplatform.pipeline_jobs:PipelineJob projects/932447320071/locations/us-central1/pipelineJobs/pipel