## SageMaker Pipelines: Train, Register, and Batch Inference

In [None]:
import os
import boto3
import re
import time
import json
from sagemaker import get_execution_role, session
import pandas as pd

from time import gmtime, strftime
import sagemaker
from sagemaker.model import Model
from sagemaker.image_uris import retrieve
from sagemaker.workflow.pipeline_context import PipelineSession
from sagemaker.workflow.model_step import ModelStep
from sagemaker.inputs import TrainingInput
from sagemaker.workflow.steps import TrainingStep
from sagemaker.workflow.parameters import ParameterString
from sagemaker.estimator import Estimator

# Resolve the AWS context the rest of the notebook relies on: region,
# SageMaker session, execution role and the session's default S3 bucket.
region = boto3.Session().region_name
sagemaker_session = sagemaker.Session()
role = sagemaker.get_execution_role()
default_bucket = sagemaker_session.default_bucket()

# Key prefix used further down when composing the model output location.
s3_prefix = "xgboost-example"

print(f"RoleArn: {role}")
from sagemaker.workflow.pipeline import Pipeline

In [None]:
# Session handed to the estimator and model below; the results of their
# fit()/create() calls are then passed to pipeline steps as `step_args`.
pipeline_session = PipelineSession()

In [None]:
# Download the abalone training CSV from the sagemaker-sample-files bucket
# into the current working directory.
!aws s3 cp s3://sagemaker-sample-files/datasets/tabular/uci_abalone/train_csv/abalone_dataset1_train.csv .

In [None]:
# Copy the downloaded CSV into the default bucket. The destination key must
# stay in sync with `training_path`, which is built from the same key.
!aws s3 cp abalone_dataset1_train.csv s3://{default_bucket}/xgboost-regression/train.csv

In [None]:
# S3 URI of the training CSV; same bucket and key as the `aws s3 cp` upload.
training_path = f"s3://{default_bucket}/xgboost-regression/train.csv"
training_path

In [None]:
import pandas as pd

# Build the batch-inference input from the training CSV by dropping its first
# column (presumably the label, per SageMaker XGBoost's CSV convention --
# confirm against the dataset).
#
# The same file is fed unchanged to the XGBoost training job as "text/csv",
# i.e. it carries no header row. Read it with header=None so pandas does not
# consume the first data row as column names (renaming duplicated values along
# the way), and write it with header=False so no header line ends up in the
# file sent to batch transform.
test = pd.read_csv('abalone_dataset1_train.csv', header=None)
test = test.iloc[:, 1:]
test.to_csv('test.csv', index=False, header=False)

In [None]:
# Create a SageMaker session to be able to upload data to S3.
import boto3
import sagemaker

sagemaker_session = sagemaker.Session()

# Upload test.csv under the "<prefix>/test" key prefix. No bucket is passed to
# upload_data, so the SDK chooses the destination (the session's default
# bucket, per the SDK documentation). The returned S3 URI is used as the
# default value of the pipeline's test-input parameter.
prefix = "xgb-test-batch-abalone"
test_data_path = sagemaker_session.upload_data("test.csv", key_prefix=f"{prefix}/test")

In [None]:
# Pipeline parameters. All four are registered on the Pipeline object, and the
# defaults below apply when an execution does not override them.

# S3 URI of the training CSV.
training_input_param = ParameterString(name="training_input", default_value=training_path)

# S3 URI of the batch-transform input file.
test_data_param = ParameterString(name="test_input", default_value=test_data_path)

# Instance type for the training job.
training_instance_param = ParameterString(name="training_instance", default_value="ml.c5.xlarge")

# Instance type for the batch-transform job.
batch_transform_param = ParameterString(name="batch_inference", default_value="ml.m5.xlarge")

In [None]:
# S3 location the training job writes its model artifacts to.
model_path = f"s3://{default_bucket}/{s3_prefix}/xgb_model"

# Resolve the built-in XGBoost container image. This lookup is evaluated while
# the pipeline is being defined, not while it executes, so it needs a concrete
# instance type: pass the parameter's default value explicitly instead of the
# ParameterString object itself.
image_uri = sagemaker.image_uris.retrieve(
    framework="xgboost",
    region=region,
    version="1.0-1",
    py_version="py3",
    instance_type=training_instance_param.default_value,
)

image_uri

In [None]:
# Estimator for the XGBoost container resolved above. It is bound to the
# pipeline session, and its instance type is a pipeline parameter so it can be
# overridden per execution.
xgb_train = Estimator(
    image_uri=image_uri,
    instance_type=training_instance_param,
    instance_count=1,
    output_path=model_path,
    sagemaker_session=pipeline_session,
    role=role,
)

xgb_train.set_hyperparameters(
    # "reg:linear" is the deprecated alias of this objective; use the current
    # name so the training job does not depend on the old spelling.
    objective="reg:squarederror",
    num_round=40,
    max_depth=4,
    eta=0.1,
    gamma=3,
    min_child_weight=5,
    subsample=0.6,
    silent=0,
)

In [None]:
# Describe the training job: one "train" channel reading CSV from the
# parameterised S3 location. The return value is handed to TrainingStep as
# its step arguments.
train_args = xgb_train.fit(
    inputs=dict(
        train=TrainingInput(s3_data=training_input_param, content_type="text/csv"),
    ),
)

In [None]:
# Pipeline step wrapping the training-job arguments built from xgb_train.fit().
training_step = TrainingStep(name="Training", step_args=train_args)

In [None]:
# Model built from the training step's output artifacts. `model_data` is a
# step property, which makes this model depend on the "Training" step.
model = Model(
    sagemaker_session=pipeline_session,
    role=role,
    image_uri=image_uri,
    model_data=training_step.properties.ModelArtifacts.S3ModelArtifacts,
)

In [None]:
# Pipeline step that creates the SageMaker model; its ModelName property is
# what the batch transformer refers to.
create_model_step = ModelStep(name="CreateXGBoostModel", step_args=model.create())

In [None]:
from sagemaker.workflow.step_collections import RegisterModel

# Register the trained model as a model package in the "batchgroup" model
# package group, declaring CSV in/out and the instance types it may run on.
register_step = RegisterModel(
    name="AbaloneRegisterModel",
    model=model,
    model_package_group_name="batchgroup",
    content_types=["text/csv"],
    response_types=["text/csv"],
    inference_instances=["ml.t2.medium", "ml.m5.xlarge"],
    transform_instances=["ml.m5.xlarge"],
)

In [None]:
from sagemaker.workflow.pipeline_context import PipelineSession

from sagemaker.transformer import Transformer
from sagemaker.inputs import TransformInput
from sagemaker.workflow.steps import TransformStep

# Batch transformer for the model created by `create_model_step`. It reuses
# the notebook's existing `pipeline_session` (the same one the estimator and
# model are bound to) instead of constructing a second PipelineSession.
transformer = Transformer(
    model_name=create_model_step.properties.ModelName,
    instance_count=1,
    instance_type=batch_transform_param,
    assemble_with="Line",
    accept="text/csv",
    sagemaker_session=pipeline_session,
)

In [None]:
# Pipeline step that runs batch transform over the parameterised test input,
# sent to the model as CSV.
transform_step = TransformStep(
    step_args=transformer.transform(content_type="text/csv", data=test_data_param),
    name="AbaloneTransform",
)

In [None]:
# Assemble the pipeline from the four steps and register every parameter the
# steps reference so each can be overridden per execution.
pipeline = Pipeline(
    name="batch-pipeline-abalone",
    parameters=[
        training_input_param,
        training_instance_param,
        test_data_param,
        batch_transform_param,
    ],
    steps=[
        training_step,
        create_model_step,
        register_step,
        transform_step,
    ],
)

In [None]:
# Create the pipeline in SageMaker, or update it if one with this name already
# exists, using the notebook's execution role.
pipeline.upsert(role_arn=role)

In [None]:
# Start an execution; no parameter overrides are passed, so the defaults
# declared on the ParameterString objects apply.
execution = pipeline.start()

In [None]:
# Block this cell until the pipeline execution finishes.
execution.wait()