In [1]:
# Enable IPython's autoreload extension so edits to imported modules are picked up
# without restarting the kernel (mode 2 is presumably "reload all modules" -- see IPython docs).
%load_ext autoreload
%autoreload 2

In [2]:
import os
import json
import pprint
import warnings
import boto3
import sagemaker
import pandas as pd
from sagemaker.estimator import Estimator
from sagemaker.inputs import CreateModelInput, TrainingInput, TransformInput
from sagemaker.lineage.visualizer import LineageTableVisualizer
from sagemaker.model import Model
from sagemaker.model_metrics import MetricsSource, ModelMetrics
from sagemaker.processing import ProcessingInput, ProcessingOutput
from sagemaker.sklearn.processing import ScriptProcessor, SKLearnProcessor
from sagemaker.transformer import Transformer
from sagemaker.workflow.condition_step import ConditionStep, JsonGet
from sagemaker.workflow.conditions import (
    ConditionGreaterThanOrEqualTo,
    ConditionLessThanOrEqualTo,
)
from sagemaker.workflow.parameters import ParameterInteger, ParameterString
from sagemaker.workflow.properties import PropertyFile
from sagemaker.workflow.step_collections import RegisterModel
from sagemaker.workflow.steps import (
    CacheConfig,
    CreateModelStep,
    ProcessingStep,
    TrainingStep,
    TransformStep,
)
from sagemaker.workflow.pipeline import Pipeline

# Silence every Python warning for the rest of the kernel session.
# NOTE(review): this also hides SDK deprecation warnings; narrow the filter if those matter.
warnings.filterwarnings(action="ignore")

In [3]:
# --- Static names and paths -------------------------------------------------
# Local folder holding the raw competition CSVs (relative to this notebook).
RAW_DATA_PATH = "../../data/ieee-fraud-detection"
# Root directory used for inputs/outputs inside the processing containers.
BASE_DIR = "/opt/ml/processing"
# Shared prefix for S3 keys and job names created by this notebook.
BASE_JOB_PREFIX = "ieee-fraud-detection"
MODEL_PACKAGE_GROUP_NAME = "ieee-fraud-detection"
PIPELINE_NAME = "ieee-fraud-detection-pipeline"

# --- AWS / SageMaker handles ------------------------------------------------
sagemaker_session = sagemaker.Session()
BUCKET = sagemaker_session.default_bucket()
role = sagemaker.get_execution_role()
region = boto3.Session().region_name

In [4]:
%%time
# Copy the four raw CSVs from RAW_DATA_PATH to the default bucket with the AWS CLI:
# train_* files go under <BASE_JOB_PREFIX>/training, test_* files under <BASE_JOB_PREFIX>/prediction.
# `--quiet` suppresses the CLI's per-file progress output.
!aws s3 cp {RAW_DATA_PATH}/train_identity.csv s3://{BUCKET}/{BASE_JOB_PREFIX}/training/train_identity.csv  --quiet
!aws s3 cp {RAW_DATA_PATH}/train_transaction.csv s3://{BUCKET}/{BASE_JOB_PREFIX}/training/train_transaction.csv --quiet
!aws s3 cp {RAW_DATA_PATH}/test_identity.csv s3://{BUCKET}/{BASE_JOB_PREFIX}/prediction/test_identity.csv --quiet 
!aws s3 cp {RAW_DATA_PATH}/test_transaction.csv s3://{BUCKET}/{BASE_JOB_PREFIX}/prediction/test_transaction.csv --quiet

In [5]:
# S3 locations of the data uploaded above; they seed the defaults of the pipeline parameters.
_s3_prefix = f"s3://{BUCKET}/{BASE_JOB_PREFIX}"
training_data_uri = _s3_prefix + "/training"
prediction_data_uri = _s3_prefix + "/prediction"

# Pipeline parameters: each one can be overridden per execution, otherwise the default applies.
processing_instance_count = ParameterInteger(name="ProcessingInstanceCount", default_value=1)
processing_instance_type = ParameterString(
    name="ProcessingInstanceType",
    default_value="ml.m5.2xlarge",
)
training_data = ParameterString(
    name="TrainingData",
    default_value=training_data_uri,
)
training_instance_type = ParameterString(
    name="TrainingInstanceType",
    default_value="ml.m5.2xlarge",
)
prediction_data = ParameterString(name="PredictionData", default_value=prediction_data_uri)
model_approval_status = ParameterString(
    name="ModelApprovalStatus",
    default_value="PendingManualApproval",
)

# Step caching shared by the steps below; "PT1H" is an ISO-8601 duration (one hour).
cache_config = CacheConfig(expire_after="PT1H", enable_caching=True)

In [6]:
# Hold-out fractions; kept as strings because they are forwarded as command-line arguments.
valid_size = str(0.1)
test_size = str(0.1)

sklearn_processor = SKLearnProcessor(
    base_job_name=f"{BASE_JOB_PREFIX}-sklearn-processing",
    framework_version="0.23-1",
    instance_count=processing_instance_count,
    instance_type=processing_instance_type,
    role=role,
)

# The training data is mounted under BASE_DIR/training; each split is read back from
# its own BASE_DIR/<split> directory.
_preprocess_inputs = [
    ProcessingInput(source=training_data, destination=f"{BASE_DIR}/training"),
]
_preprocess_outputs = [
    ProcessingOutput(source=f"{BASE_DIR}/{split}", output_name=split)
    for split in ("train", "valid", "test")
]

step_preprocess = ProcessingStep(
    name="PreprocessData",
    processor=sklearn_processor,
    code=os.path.join("scripts", "preprocessing.py"),
    inputs=_preprocess_inputs,
    outputs=_preprocess_outputs,
    job_arguments=[
        "--base_dir", BASE_DIR,
        "--valid_size", valid_size,
        "--test_size", test_size,
    ],
    cache_config=cache_config,
)

In [7]:
# NOTE: this cell re-creates `sklearn_processor` and `step_preprocess`, replacing the
# objects built in the previous cell -- the pipeline below uses the objects defined HERE.
# The redefinition previously omitted `job_arguments`, so the configured split sizes were
# silently not passed to the script; they are forwarded again so both definitions agree.
sklearn_processor = SKLearnProcessor(
    framework_version="0.23-1",
    role=role,
    instance_type=processing_instance_type,
    instance_count=processing_instance_count,
    base_job_name=f"{BASE_JOB_PREFIX}-sklearn-processing",
)

step_preprocess = ProcessingStep(
    name="PreprocessData",
    processor=sklearn_processor,
    # Raw training CSVs are made available to the script under BASE_DIR/training.
    inputs=[ProcessingInput(source=training_data, destination=BASE_DIR + "/training")],
    # One output per split; later steps look these up by `output_name`.
    outputs=[
        ProcessingOutput(source=BASE_DIR + "/train", output_name="train"),
        ProcessingOutput(source=BASE_DIR + "/valid", output_name="valid"),
        ProcessingOutput(source=BASE_DIR + "/test", output_name="test"),
    ],
    code=os.path.join("scripts", "preprocessing.py"),
    cache_config=cache_config,
    # `valid_size` / `test_size` are the string-valued fractions defined in the previous cell.
    job_arguments=[
        "--base_dir",
        BASE_DIR,
        "--valid_size",
        valid_size,
        "--test_size",
        test_size,
    ],
)

In [8]:
# Model artifacts of every training job are written below this S3 prefix.
model_output_uri = "s3://{}/{}/models".format(BUCKET, BASE_JOB_PREFIX)

# Container image of the built-in XGBoost algorithm; also reused by the evaluation step.
training_image_uri = sagemaker.image_uris.retrieve(
    "xgboost",
    region=region,
    version="1.2-1",
    py_version="py3",
    instance_type=training_instance_type,
)

estimator = Estimator(
    image_uri=training_image_uri,
    role=role,
    instance_type=training_instance_type,
    instance_count=1,
    output_path=model_output_uri,
    # On-demand instances only, hence no spot wait budget.
    use_spot_instances=False,
    max_wait=None,
)

# XGBoost hyperparameters; `params` is reused for the re-training estimator further down.
params = dict(
    booster="gbtree",
    verbosity=0,
    objective="binary:logistic",
    seed=42,
    max_depth=6,
    eta=0.3,
    gamma=0.0,
    min_child_weight=1.0,
    subsample=1.0,
    colsample_bytree=1.0,
    scale_pos_weight=1.0,
    eval_metric="auc",
    num_round=1000,
    early_stopping_rounds=10,
)
estimator.set_hyperparameters(**params)

In [9]:
# Outputs of the preprocessing step, addressed by their `output_name`.
_preprocess_output_props = step_preprocess.properties.ProcessingOutputConfig.Outputs

# Training channel -> preprocessing output feeding it (both are CSV).
step_train = TrainingStep(
    name="TrainModel",
    estimator=estimator,
    inputs={
        channel: TrainingInput(
            s3_data=_preprocess_output_props[output_name].S3Output.S3Uri,
            content_type="text/csv",
        )
        for channel, output_name in (("train", "train"), ("validation", "valid"))
    },
    cache_config=cache_config,
)

In [10]:
# The evaluation script runs inside the same XGBoost image that trained the model.
script_processor = ScriptProcessor(
    image_uri=training_image_uri,
    command=["python3"],
    role=role,
    instance_type=processing_instance_type,
    instance_count=1,
    base_job_name=f"{BASE_JOB_PREFIX}-script-processing",
)

# Declares eval_metrics.json (inside the "evaluation" output) as a property file so the
# condition step can read values out of it.
evaluation = PropertyFile(
    name="ModelEvaluation",
    output_name="evaluation",
    path="eval_metrics.json",
)

_trained_model_input = ProcessingInput(
    source=step_train.properties.ModelArtifacts.S3ModelArtifacts,
    destination=f"{BASE_DIR}/models",
)
_test_split_input = ProcessingInput(
    source=step_preprocess.properties.ProcessingOutputConfig.Outputs["test"].S3Output.S3Uri,
    destination=f"{BASE_DIR}/test",
)

step_evaluate = ProcessingStep(
    name="EvaluateModel",
    processor=script_processor,
    code=os.path.join("scripts", "evaluation.py"),
    inputs=[_trained_model_input, _test_split_input],
    outputs=[ProcessingOutput(source=f"{BASE_DIR}/eval", output_name="evaluation")],
    job_arguments=["--base_dir", BASE_DIR],
    property_files=[evaluation],
    cache_config=cache_config,
)

In [11]:
sklearn_re_processor = SKLearnProcessor(
    base_job_name=f"{BASE_JOB_PREFIX}-sklearn-re-processing",
    framework_version="0.23-1",
    instance_count=processing_instance_count,
    instance_type=processing_instance_type,
    role=role,
)

# The second preprocessing pass receives both the training and the prediction data.
_re_preprocess_inputs = [
    ProcessingInput(source=training_data, destination=f"{BASE_DIR}/training"),
    ProcessingInput(source=prediction_data, destination=f"{BASE_DIR}/prediction"),
]
_re_preprocess_outputs = [
    ProcessingOutput(source=f"{BASE_DIR}/{split}", output_name=split)
    for split in ("re_train", "re_valid", "re_test")
]

step_re_preprocess = ProcessingStep(
    name="Re-preprocessData",
    processor=sklearn_re_processor,
    code=os.path.join("scripts", "re_preprocessing.py"),
    inputs=_re_preprocess_inputs,
    outputs=_re_preprocess_outputs,
    job_arguments=["--base_dir", BASE_DIR, "--test_size", test_size],
    cache_config=cache_config,
)

In [12]:
# Second estimator, configured like `estimator` and sharing the same hyperparameters.
full_estimator = Estimator(
    image_uri=training_image_uri,
    role=role,
    instance_type=training_instance_type,
    instance_count=1,
    output_path=model_output_uri,
    use_spot_instances=False,
    max_wait=None,
)
full_estimator.set_hyperparameters(**params)

# Outputs of the re-preprocessing step, addressed by their `output_name`.
_re_preprocess_output_props = step_re_preprocess.properties.ProcessingOutputConfig.Outputs

step_re_train = TrainingStep(
    name="Re-trainModel",
    estimator=full_estimator,
    inputs={
        channel: TrainingInput(
            s3_data=_re_preprocess_output_props[output_name].S3Output.S3Uri,
            content_type="text/csv",
        )
        for channel, output_name in (("train", "re_train"), ("validation", "re_valid"))
    },
    cache_config=cache_config,
)

In [13]:
# S3 prefix of the EvaluateModel step's only declared output ("evaluation").
evaluation_output_uri = step_evaluate.arguments["ProcessingOutputConfig"]["Outputs"][0][
    "S3Output"
]["S3Uri"]

# Attach the evaluation report to the registered model package.
# The URI is assembled from a named prefix: the previous backslash continuation inside the
# f-string embedded the next line's indentation in the key ("<prefix>/        eval_metrics.json"),
# so the metrics source pointed at an object that does not exist.
model_metrics = ModelMetrics(
    model_statistics=MetricsSource(
        content_type="application/json",
        s3_uri=f"{evaluation_output_uri}/eval_metrics.json",
    ),
)

# Register the re-trained model in the model package group; the approval status comes
# from the `ModelApprovalStatus` pipeline parameter.
step_register = RegisterModel(
    name="RegisterModel",
    estimator=full_estimator,
    model_data=step_re_train.properties.ModelArtifacts.S3ModelArtifacts,
    content_types=["text/csv"],
    response_types=["text/csv"],
    inference_instances=["ml.t2.medium", "ml.m5.2xlarge"],
    transform_instances=["ml.m5.2xlarge"],
    model_package_group_name=MODEL_PACKAGE_GROUP_NAME,
    model_metrics=model_metrics,
    approval_status=model_approval_status,
)

In [14]:
# SageMaker model built from the re-trained artifacts and the XGBoost image.
model = Model(
    model_data=step_re_train.properties.ModelArtifacts.S3ModelArtifacts,
    image_uri=training_image_uri,
    sagemaker_session=sagemaker_session,
    role=role,
)

_create_model_input = CreateModelInput(
    instance_type="ml.m5.2xlarge",
    accelerator_type="ml.eia2.medium",
)

# Despite its name, "DeployModel" is a CreateModelStep: it creates the model that the
# batch-transform step below refers to by name.
step_deploy = CreateModelStep(
    name="DeployModel",
    model=model,
    inputs=_create_model_input,
)

In [15]:
# Batch transformer bound to the model created by `step_deploy`; predictions land under .../pred.
full_transformer = Transformer(
    model_name=step_deploy.properties.ModelName,
    instance_type="ml.m5.2xlarge",
    instance_count=1,
    output_path="s3://{}/{}/pred".format(BUCKET, BASE_JOB_PREFIX),
)

# Input is the "re_test" output of the re-preprocessing step, CSV split line by line.
_predict_input = TransformInput(
    data=step_re_preprocess.properties.ProcessingOutputConfig.Outputs["re_test"].S3Output.S3Uri,
    content_type="text/csv",
    split_type="Line",
)

step_predict = TransformStep(
    name="PredictData",
    transformer=full_transformer,
    inputs=_predict_input,
    cache_config=cache_config,
)

In [16]:
# Quality gate: which metric to read from the evaluation report, the threshold to compare
# against, and whether lower (True) or higher (False) values are better.
target_metric = "auroc"
target_value = 0.9
target_minimize = False

# NOTE: `step` holds the comparison *class* (not a pipeline step): "<=" when the metric
# should be minimized, ">=" otherwise.
if target_minimize:
    step = ConditionLessThanOrEqualTo
else:
    step = ConditionGreaterThanOrEqualTo

# Looks up eval_metric.<target_metric> in the property file produced by the evaluation step.
_metric_lookup = JsonGet(
    step=step_evaluate,
    property_file=evaluation,
    json_path="eval_metric." + target_metric,
)
condition = step(left=_metric_lookup, right=target_value)

# When the condition holds, run the re-train / register / create-model / predict branch;
# otherwise nothing further is executed.
step_check = ConditionStep(
    name="CheckCondition",
    conditions=[condition],
    if_steps=[step_re_preprocess, step_re_train, step_register, step_deploy, step_predict],
    else_steps=[],
)

In [17]:
# Every parameter referenced by any step has to be declared on the pipeline.
_pipeline_parameters = [
    processing_instance_count,
    processing_instance_type,
    training_data,
    training_instance_type,
    prediction_data,
    model_approval_status,
]

# Only the top-level steps are listed here; the remaining steps are attached through the
# `if_steps` of `step_check`.
_pipeline_steps = [step_preprocess, step_train, step_evaluate, step_check]

pipeline = Pipeline(
    name=PIPELINE_NAME,
    parameters=_pipeline_parameters,
    steps=_pipeline_steps,
)

In [18]:
# Render the pipeline definition (a JSON document) and parse it into a dict for inspection.
_definition_json = pipeline.definition()
definition = json.loads(_definition_json)
# pprint.pprint(definition)

No finished training job found associated with this estimator. Please make sure this estimator is only used for building workflow config


In [19]:
# Create the pipeline under PIPELINE_NAME (or update it, as the method name suggests);
# the response is assigned to `_` so it is not echoed as cell output.
_ = pipeline.upsert(role_arn=role)
# Start an execution; no parameter overrides are passed, so the declared defaults apply.
execution = pipeline.start()
# Snapshot of the execution's metadata right after starting it.
description = execution.describe()
# pprint.pprint(description)

No finished training job found associated with this estimator. Please make sure this estimator is only used for building workflow config
No finished training job found associated with this estimator. Please make sure this estimator is only used for building workflow config


In [20]:
# Block until the pipeline execution has finished; the recorded step timestamps below
# span roughly an hour, so expect this cell to run for a long time.
execution.wait()

In [21]:
# execution.list_steps()

In [22]:
# S3 prefix of the EvaluateModel step's only declared output ("evaluation").
evaluation_s3_prefix = step_evaluate.arguments["ProcessingOutputConfig"]["Outputs"][0][
    "S3Output"
]["S3Uri"]

# Download the evaluation report written by the evaluation script.
eval_metrics = sagemaker.s3.S3Downloader.read_file(
    f"{evaluation_s3_prefix}/eval_metrics.json"
)

# One "NAME: xx.xx%" entry per metric, comma-separated. Joining (instead of appending
# ", " and slicing off the last two characters) keeps the header intact even when the
# report contains no metrics.
metric_summary = ", ".join(
    f"{key.upper()}: {value:.2%}"
    for key, value in json.loads(eval_metrics)["eval_metric"].items()
)
string = "<MODEL EVALUATION>\n" + metric_summary
print(string)

<MODEL EVALUATION>
ACCURACY: 98.26%, PRECISION: 92.40%, RECALL: 54.84%, F1: 68.83%, AUROC: 95.11%, AUPRC: 76.30%


In [23]:
viz = LineageTableVisualizer(sagemaker_session)

# Walk the steps in reverse of the order returned by list_steps(); in the recorded run
# this shows them from PreprocessData (first) to PredictData (last).
for execution_step in reversed(execution.list_steps()):
    # Step metadata flattened into a one-row table.
    step_summary = pd.json_normalize(execution_step)
    display(step_summary)
    # Lineage table for the step; rendered as None for some steps in the recorded run
    # (CheckCondition, DeployModel).
    step_lineage = viz.show(pipeline_execution_step=execution_step)
    display(step_lineage)
    print()

Unnamed: 0,StepName,StartTime,EndTime,StepStatus,Metadata.ProcessingJob.Arn
0,PreprocessData,2021-06-05 23:19:02.981000+09:00,2021-06-05 23:26:08.049000+09:00,Succeeded,arn:aws:sagemaker:us-east-1:...:proce...


Unnamed: 0,Name/Source,Direction,Type,Association Type,Lineage Type
0,s3://...14-18-55-619/input/code/preprocessing.py,Input,DataSet,ContributedTo,artifact
1,s3://...8601677581/ieee-fraud-detection/training,Input,DataSet,ContributedTo,artifact
2,68331...om/sagemaker-scikit-learn:0.23-1-cpu-py3,Input,Image,ContributedTo,artifact
3,s3://...sing-2021-06-05-14-18-42-185/output/test,Output,DataSet,Produced,artifact
4,s3://...ing-2021-06-05-14-18-42-185/output/valid,Output,DataSet,Produced,artifact
5,s3://...ing-2021-06-05-14-18-42-185/output/train,Output,DataSet,Produced,artifact





Unnamed: 0,StepName,StartTime,EndTime,StepStatus,Metadata.TrainingJob.Arn
0,TrainModel,2021-06-05 23:26:08.755000+09:00,2021-06-05 23:34:38.148000+09:00,Succeeded,arn:aws:sagemaker:us-east-1:...:train...


Unnamed: 0,Name/Source,Direction,Type,Association Type,Lineage Type
0,s3://...ing-2021-06-05-14-18-42-185/output/valid,Input,DataSet,ContributedTo,artifact
1,s3://...ing-2021-06-05-14-18-42-185/output/train,Input,DataSet,ContributedTo,artifact
2,68331...-1.amazonaws.com/sagemaker-xgboost:1.2-1,Input,Image,ContributedTo,artifact
3,s3://...rainModel-1zL97L8LQv/output/model.tar.gz,Output,Model,Produced,artifact





Unnamed: 0,StepName,StartTime,EndTime,StepStatus,Metadata.ProcessingJob.Arn
0,EvaluateModel,2021-06-05 23:34:38.748000+09:00,2021-06-05 23:39:32.616000+09:00,Succeeded,arn:aws:sagemaker:us-east-1:...:proce...


Unnamed: 0,Name/Source,Direction,Type,Association Type,Lineage Type
0,s3://...05-14-18-57-272/input/code/evaluation.py,Input,DataSet,ContributedTo,artifact
1,s3://...sing-2021-06-05-14-18-42-185/output/test,Input,DataSet,ContributedTo,artifact
2,s3://...rainModel-1zL97L8LQv/output/model.tar.gz,Input,Model,ContributedTo,artifact
3,68331...-1.amazonaws.com/sagemaker-xgboost:1.2-1,Input,Image,ContributedTo,artifact
4,s3://...021-06-05-14-18-39-018/output/evaluation,Output,DataSet,Produced,artifact





Unnamed: 0,StepName,StartTime,EndTime,StepStatus,Metadata.Condition.Outcome
0,CheckCondition,2021-06-05 23:39:42.013000+09:00,2021-06-05 23:39:42.527000+09:00,Succeeded,True


None




Unnamed: 0,StepName,StartTime,EndTime,StepStatus,Metadata.ProcessingJob.Arn
0,Re-preprocessData,2021-06-05 23:39:42.927000+09:00,2021-06-05 23:48:28.295000+09:00,Succeeded,arn:aws:sagemaker:us-east-1:...:proce...


Unnamed: 0,Name/Source,Direction,Type,Association Type,Lineage Type
0,s3://...18-58-420/input/code/re_preprocessing.py,Input,DataSet,ContributedTo,artifact
1,s3://...01677581/ieee-fraud-detection/prediction,Input,DataSet,ContributedTo,artifact
2,s3://...8601677581/ieee-fraud-detection/training,Input,DataSet,ContributedTo,artifact
3,68331...om/sagemaker-scikit-learn:0.23-1-cpu-py3,Input,Image,ContributedTo,artifact
4,s3://...s-2021-06-05-14-18-46-417/output/re_test,Output,DataSet,Produced,artifact
5,s3://...-2021-06-05-14-18-46-417/output/re_valid,Output,DataSet,Produced,artifact
6,s3://...-2021-06-05-14-18-46-417/output/re_train,Output,DataSet,Produced,artifact





Unnamed: 0,StepName,StartTime,EndTime,StepStatus,Metadata.TrainingJob.Arn
0,Re-trainModel,2021-06-05 23:48:28.981000+09:00,2021-06-06 00:07:20.175000+09:00,Succeeded,arn:aws:sagemaker:us-east-1:...:train...


Unnamed: 0,Name/Source,Direction,Type,Association Type,Lineage Type
0,s3://...-2021-06-05-14-18-46-417/output/re_valid,Input,DataSet,ContributedTo,artifact
1,s3://...-2021-06-05-14-18-46-417/output/re_train,Input,DataSet,ContributedTo,artifact
2,68331...-1.amazonaws.com/sagemaker-xgboost:1.2-1,Input,Image,ContributedTo,artifact
3,s3://...rainModel-6TbaVKVaCx/output/model.tar.gz,Output,Model,Produced,artifact





Unnamed: 0,StepName,StartTime,EndTime,StepStatus,Metadata.Model.Arn
0,DeployModel,2021-06-06 00:07:20.501000+09:00,2021-06-06 00:07:21.696000+09:00,Succeeded,arn:aws:sagemaker:us-east-1:...:model...


None




Unnamed: 0,StepName,StartTime,EndTime,StepStatus,Metadata.RegisterModel.Arn
0,RegisterModel,2021-06-06 00:07:20.519000+09:00,2021-06-06 00:07:21.646000+09:00,Succeeded,arn:aws:sagemaker:us-east-1:...:model...


Unnamed: 0,Name/Source,Direction,Type,Association Type,Lineage Type
0,s3://...rainModel-6TbaVKVaCx/output/model.tar.gz,Input,Model,ContributedTo,artifact
1,68331...-1.amazonaws.com/sagemaker-xgboost:1.2-1,Input,Image,ContributedTo,artifact
2,ieee-fraud-detection-13-PendingManualApproval-...,Input,Approval,ContributedTo,action
3,ieee-fraud-detection-1620038248-aws-model-pack...,Output,ModelGroup,AssociatedWith,context





Unnamed: 0,StepName,StartTime,EndTime,StepStatus,Metadata.TransformJob.Arn
0,PredictData,2021-06-06 00:07:31.826000+09:00,2021-06-06 00:13:07.213000+09:00,Succeeded,arn:aws:sagemaker:us-east-1:...:trans...


Unnamed: 0,Name/Source,Direction,Type,Association Type,Lineage Type
0,s3://...rainModel-6TbaVKVaCx/output/model.tar.gz,Input,Model,ContributedTo,artifact
1,68331...-1.amazonaws.com/sagemaker-xgboost:1.2-1,Input,Image,ContributedTo,artifact
2,s3://...s-2021-06-05-14-18-46-417/output/re_test,Input,DataSet,ContributedTo,artifact
3,s3://...1-.../ieee-fraud-detection/pred,Output,DataSet,Produced,artifact





In [24]:
# pipeline.delete()