In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
import json
import pprint
import boto3
import sagemaker
from sagemaker.estimator import Estimator
from sagemaker.inputs import CreateModelInput, TrainingInput, TransformInput
from sagemaker.model import Model
from sagemaker.model_metrics import MetricsSource, ModelMetrics 
from sagemaker.processing import ProcessingInput, ProcessingOutput
from sagemaker.sklearn.processing import ScriptProcessor, SKLearnProcessor
from sagemaker.transformer import Transformer
from sagemaker.workflow.condition_step import ConditionStep, JsonGet
from sagemaker.workflow.conditions import ConditionGreaterThanOrEqualTo, ConditionLessThanOrEqualTo
from sagemaker.workflow.parameters import ParameterInteger, ParameterString
from sagemaker.workflow.properties import PropertyFile
from sagemaker.workflow.step_collections import RegisterModel
from sagemaker.workflow.steps import CreateModelStep, ProcessingStep, TrainingStep, TransformStep
from sagemaker.workflow.pipeline import Pipeline

In [3]:
RAW_DATA_PATH = '../../Data/ieee-fraud-detection'

sagemaker_session = sagemaker.session.Session()
BUCKET = sagemaker_session.default_bucket()
PREFIX = 'ieee-fraud-detection-pipeline'
JOB_NAME = 'ieee-fraud-detection'
MODEL_PACKAGE_GROUP_NAME = 'ieee-fraud-detection'

TARGET_METRIC = 'auroc'
TARGET_VALUE = 0.9
MINIMIZE = True

region = boto3.Session().region_name
role = 'arn:aws:iam::998601677581:role/service-role/AmazonSageMaker-ExecutionRole-20210114T163887' # sagemaker.get_execution_role()

In [4]:
%%time

!aws s3 cp {RAW_DATA_PATH}/train_identity.csv s3://{BUCKET}/{PREFIX}/training/train_identity.csv  --quiet
!aws s3 cp {RAW_DATA_PATH}/train_transaction.csv s3://{BUCKET}/{PREFIX}/training/train_transaction.csv --quiet
!aws s3 cp {RAW_DATA_PATH}/test_identity.csv s3://{BUCKET}/{PREFIX}/prediction/test_identity.csv --quiet 
!aws s3 cp {RAW_DATA_PATH}/test_transaction.csv s3://{BUCKET}/{PREFIX}/prediction/test_transaction.csv --quiet

CPU times: user 12.6 s, sys: 4.84 s, total: 17.5 s
Wall time: 14min 11s


In [5]:
training_data_uri = f's3://{BUCKET}/{PREFIX}/training'
prediction_data_uri = f's3://{BUCKET}/{PREFIX}/prediction'

processing_instance_count = ParameterInteger(
    name='ProcessingInstanceCount',
    default_value=1
)
processing_instance_type = ParameterString(
    name='ProcessingInstanceType',
    default_value='ml.m5.2xlarge'
)
training_data = ParameterString(
    name='TrainingData',
    default_value=training_data_uri
)
training_instance_type = ParameterString(
    name='TrainingInstanceType',
    default_value='ml.m5.2xlarge'
)
prediction_data = ParameterString(
    name='TransformData',
    default_value=prediction_data_uri,
)
model_approval_status = ParameterString(
    name='ModelApprovalStatus',
    default_value='PendingManualApproval'
)

In [6]:
sklearn_processor = SKLearnProcessor(
    framework_version='0.23-1',
    role=role,
    instance_type=processing_instance_type,
    instance_count=processing_instance_count,
    base_job_name=f'{JOB_NAME}-sklearn-processing'
)

step_preprocess = ProcessingStep(
    name=f'{JOB_NAME}-preprocess_data',
    processor=sklearn_processor,
    inputs=[
      ProcessingInput(source=training_data, destination='/opt/ml/processing/training'),  
    ],
    outputs=[
        ProcessingOutput(source='/opt/ml/processing/train', output_name='train'),
        ProcessingOutput(source='/opt/ml/processing/valid', output_name='valid'),
        ProcessingOutput(source='/opt/ml/processing/test', output_name='test')
    ],
    code='scripts/preprocessing.py'
)

In [7]:
model_output_uri = f's3://{BUCKET}/{PREFIX}/models'
image_uri = sagemaker.image_uris.retrieve(
    framework='xgboost',
    region=region,
    version='1.2-1',
    py_version='py3',
    instance_type=training_instance_type
)

clf = Estimator(
    image_uri=image_uri,
    role=role,
    instance_count=1,
    instance_type=training_instance_type,
    output_path=model_output_uri,
    use_spot_instances=False,
    max_wait=None
)
clf.set_hyperparameters(
    booster='gbtree',
    verbosity=0,
    objective='binary:logistic',
    seed=42,
    max_depth=6,
    eta=0.3,
    gamma=0.0,
    min_child_weight=1.0,
    subsample=1.0,
    colsample_bytree=1.0,
    scale_pos_weight=1.0,
    eval_metric='auc',
    num_round=1000,
    early_stopping_rounds=10
)

In [8]:
step_train = TrainingStep(
    name=f'{JOB_NAME}-train_model',
    estimator=clf,
    inputs={
        'train': TrainingInput(
            s3_data=step_preprocess.properties.ProcessingOutputConfig.Outputs['train'].S3Output.S3Uri,
            content_type='text/csv'
        ),
        'validation': TrainingInput(
            s3_data=step_preprocess.properties.ProcessingOutputConfig.Outputs['valid'].S3Output.S3Uri,
            content_type='text/csv'
        )
    }
)

In [9]:
script_processor = ScriptProcessor(
    role=role,
    image_uri=image_uri,
    command=['python3'],
    instance_count=1,
    instance_type=processing_instance_type,
    base_job_name=f'{JOB_NAME}-script-processing'
)
evaluation = PropertyFile(
    name=f'{JOB_NAME}-evaluation',
    output_name='evaluation',
    path='evaluation.json'
)

step_evaluate = ProcessingStep(
    name=f'{PREFIX}-evaluate_model',
    processor=script_processor,
    inputs=[
        ProcessingInput(
            source=step_train.properties.ModelArtifacts.S3ModelArtifacts,
            destination='/opt/ml/processing/models'
        ),
        ProcessingInput(
            source=step_preprocess.properties.ProcessingOutputConfig.Outputs['test'].S3Output.S3Uri,
            destination='/opt/ml/processing/test'
        )
    ],
    outputs=[
        ProcessingOutput(source='/opt/ml/processing/evaluation', output_name='evaluation'),
    ],
    code='scripts/evaluation.py',
    property_files=[evaluation]
)

In [10]:
model_metrics = ModelMetrics(
    model_statistics=MetricsSource(
        content_type='application/json',
        s3_uri='{}/evaluation.json'.format(
            step_evaluate.arguments['ProcessingOutputConfig']['Outputs'][0]['S3Output']['S3Uri']
        )
    )
)

step_register = RegisterModel(
    name=f'{JOB_NAME}-register_model',
    estimator=clf,
    model_data=step_train.properties.ModelArtifacts.S3ModelArtifacts,
    content_types=['text/csv'],
    response_types=['text/csv'],
    inference_instances=['ml.t2.medium', 'ml.m5.2xlarge'],
    transform_instances=['ml.m5.2xlarge'],
    model_package_group_name=MODEL_PACKAGE_GROUP_NAME,
    model_metrics=model_metrics,
    approval_status=model_approval_status
)

In [11]:
model = Model(
    image_uri=image_uri,
    model_data=step_train.properties.ModelArtifacts.S3ModelArtifacts,
    role=role,
    sagemaker_session=sagemaker_session
)

step_deploy = CreateModelStep(
    name=f'{JOB_NAME}-deploy_model',
    model=model,
    inputs=CreateModelInput(
        instance_type='ml.m5.2xlarge',
        accelerator_type='ml.eia2.medium'
    )
)

In [12]:
transformer = Transformer(
    model_name=step_deploy.properties.ModelName,
    instance_count=1,
    instance_type='ml.m5.2xlarge',
    output_path=f's3://{BUCKET}/{PREFIX}/predictions'
)

step_predict = TransformStep(
    name=f'{JOB_NAME}-predict_data',
    transformer=transformer,
    inputs=TransformInput(data=prediction_data)
)

In [13]:
keys = ['left', 'right'] if MINIMIZE else ['right', 'left']    
values = [
    JsonGet(
        step=step_evaluate,
        property_file=evaluation,
        json_path=f'eval_metric.{TARGET_METRIC}'
    ), 
    TARGET_VALUE
]
condition = ConditionLessThanOrEqualTo(
    **{key: value for key, value in zip(keys, values)}
)

step_check = ConditionStep(
    name=f'{JOB_NAME}-check_condition',
    conditions=[condition],
    if_steps=[step_register, step_deploy, step_predict],
    else_steps=[]
)

In [14]:
pipeline_name = f'{JOB_NAME}-pipeline'
pipeline = Pipeline(
    name=pipeline_name,
    parameters=[
        processing_instance_count,
        processing_instance_type, 
        training_data,
        training_instance_type,
        prediction_data,
        model_approval_status
    ],
    steps=[step_preprocess, step_train, step_evaluate, step_check]
)

In [15]:
definition = json.loads(pipeline.definition())
# pprint.pprint(definition)

No finished training job found associated with this estimator. Please make sure this estimator is only used for building workflow config


In [20]:
_ = pipeline.upsert(role_arn=role)
execution = pipeline.start()
description = execution.describe()
# pprint.pprint(description)

No finished training job found associated with this estimator. Please make sure this estimator is only used for building workflow config
No finished training job found associated with this estimator. Please make sure this estimator is only used for building workflow config


In [48]:
%%time
execution.wait()

WaiterError: Waiter PipelineExecutionComplete failed: Waiter encountered a terminal failure state: For expression "PipelineExecutionStatus" we matched expected path: "Failed"

In [49]:
steps_list = execution.list_steps()
pprint.pprint(steps_list)

[{'EndTime': datetime.datetime(2021, 5, 3, 18, 7, 23, 926000, tzinfo=tzlocal()),
  'FailureReason': 'ClientError: Cannot access S3 key.',
  'Metadata': {'ProcessingJob': {'Arn': 'arn:aws:sagemaker:us-east-1:998601677581:processing-job/pipelines-titglejsenlp-ieee-fraud-detection-hx5kbhfiqe'}},
  'StartTime': datetime.datetime(2021, 5, 3, 18, 2, 55, 871000, tzinfo=tzlocal()),
  'StepName': 'ieee-fraud-detection-pipeline-evaluate_model',
  'StepStatus': 'Failed'},
 {'EndTime': datetime.datetime(2021, 5, 3, 18, 2, 55, 442000, tzinfo=tzlocal()),
  'Metadata': {'TrainingJob': {'Arn': 'arn:aws:sagemaker:us-east-1:998601677581:training-job/pipelines-titglejsenlp-ieee-fraud-detection-jiekwqbmca'}},
  'StartTime': datetime.datetime(2021, 5, 3, 17, 50, 38, 4000, tzinfo=tzlocal()),
  'StepName': 'ieee-fraud-detection-train_model',
  'StepStatus': 'Succeeded'},
 {'EndTime': datetime.datetime(2021, 5, 3, 17, 50, 25, 195000, tzinfo=tzlocal()),
  'Metadata': {'ProcessingJob': {'Arn': 'arn:aws:sagemake

In [24]:
description = execution.describe()

In [25]:
description

{'PipelineArn': 'arn:aws:sagemaker:us-east-1:998601677581:pipeline/ieee-fraud-detection-pipeline',
 'PipelineExecutionArn': 'arn:aws:sagemaker:us-east-1:998601677581:pipeline/ieee-fraud-detection-pipeline/execution/q4l913tyk00k',
 'PipelineExecutionDisplayName': 'execution-1620025918561',
 'PipelineExecutionStatus': 'Failed',
 'CreationTime': datetime.datetime(2021, 5, 3, 16, 11, 58, 442000, tzinfo=tzlocal()),
 'LastModifiedTime': datetime.datetime(2021, 5, 3, 16, 37, 3, 677000, tzinfo=tzlocal()),
 'CreatedBy': {},
 'LastModifiedBy': {},
 'ResponseMetadata': {'RequestId': 'cff1a0cf-bb36-4dbb-8f6c-978c2a0f5285',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': 'cff1a0cf-bb36-4dbb-8f6c-978c2a0f5285',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '546',
   'date': 'Mon, 03 May 2021 07:39:11 GMT'},
  'RetryAttempts': 0}}

In [None]:
s3_input_train = TrainingInput(s3_data='s3://{}/{}/train/'.format(default_bucket, 'ifd-sklearn-process-2021-04-29-02-26-39-184/output'), content_type='csv')
s3_input_valid = TrainingInput(s3_data='s3://{}/{}/valid/'.format(default_bucket, 'ifd-sklearn-process-2021-04-29-02-26-39-184/output'), content_type='csv')


In [None]:
transformer = clf.transformer(
    instance_count=1, 
    instance_type='ml.m5.2xlarge', 
    output_path=f's3://{default_bucket}/{prefix}/prediction'
)

_ = transformer.transform(
    data=f's3://{default_bucket}/{prefix}/test/',
    content_type='text/csv', 
    split_type='Line'
)

In [None]:
hyperparameter_ranges = {'max_depth': IntegerParameter(1, 10),
                         'eta': ContinuousParameter(0.01, 1.0),
                         'gamma': ContinuousParameter(0.0, 1.0),
                         'min_child_weight': ContinuousParameter(1e-06, 1000),
                         'colsample_bytree': ContinuousParameter(0.1, 1.0)}

In [None]:
tuner = HyperparameterTuner(xgb_clf,
                            'validation:auc',
                            hyperparameter_ranges,
                            objective_type='Maximize',
                            max_jobs=20,
                            max_parallel_jobs=3,
                            base_tuning_job_name='ifd-xgboost-hpo',
                            early_stopping_type='Auto')

In [None]:
from sagemaker.inputs import TrainingInput

In [None]:
s3_input_train = TrainingInput(s3_data='s3://{}/{}/train/'.format(default_bucket, 'ifd-sklearn-process-2021-04-29-02-26-39-184/output'), content_type='csv')
s3_input_valid = TrainingInput(s3_data='s3://{}/{}/valid/'.format(default_bucket, 'ifd-sklearn-process-2021-04-29-02-26-39-184/output'), content_type='csv')

tuner.fit({'train': s3_input_train, 'validation': s3_input_valid}, include_cls_metadata=False)

In [None]:
tuner.estimator.

In [None]:
sklearn_processor.run(
    code='ieee-fraud-detection/preprocessing.py',
    inputs=[
        ProcessingInput(source=input_data, destination="/opt/ml/processing/input"),
    ],
    outputs=[
        ProcessingOutput(output_name="train", source="/opt/ml/processing/train"),
        ProcessingOutput(output_name="validation", source="/opt/ml/processing/valid"),
        ProcessingOutput(output_name="test", source="/opt/ml/processing/test")
    ]
)

In [None]:
step_process = ProcessingStep(
    name='IFD-Split_and_Preprocess',
    processor=sklearn_processor,
    inputs=[
      ProcessingInput(source=input_data, destination='/opt/ml/processing/input'),  
    ],
    outputs=[
        ProcessingOutput(output_name='train', source='/opt/ml/processing/train'),
        ProcessingOutput(output_name='valid', source='/opt/ml/processing/valid')
    ],
    code='ieee-fraud-detection/preprocessing.py'
)

In [None]:
s3 = boto3.resource('s3')

base_uri = f's3://{default_bucket}/ieee-fraud-detection'
input_data_uri = sagemaker.s3.S3Uploader.upload(
    local_path=DATA_DIR, 
    desired_s3_uri=base_uri,
)
print(input_data_uri)