In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# local modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import os
import json
import pprint
import warnings
import boto3
import sagemaker
import pandas as pd
from sagemaker.estimator import Estimator
from sagemaker.inputs import CreateModelInput, TrainingInput, TransformInput
from sagemaker.lineage.visualizer import LineageTableVisualizer
from sagemaker.model import Model
from sagemaker.model_metrics import MetricsSource, ModelMetrics
from sagemaker.processing import ProcessingInput, ProcessingOutput
from sagemaker.sklearn.processing import ScriptProcessor, SKLearnProcessor
from sagemaker.transformer import Transformer
from sagemaker.workflow.condition_step import ConditionStep, JsonGet
from sagemaker.workflow.conditions import ConditionGreaterThanOrEqualTo, ConditionLessThanOrEqualTo
from sagemaker.workflow.parameters import ParameterInteger, ParameterString
from sagemaker.workflow.properties import PropertyFile
from sagemaker.workflow.step_collections import RegisterModel
from sagemaker.workflow.steps import CacheConfig, CreateModelStep, ProcessingStep, TrainingStep, TransformStep
from sagemaker.workflow.pipeline import Pipeline
warnings.filterwarnings(action='ignore')

In [3]:
# Local directory with the raw ieee-fraud-detection CSVs; it is only referenced by
# the (commented-out) S3 upload cell below.
RAW_DATA_PATH = '../../Data/ieee-fraud-detection'

# Session-wide handles: the default SageMaker bucket is used for every S3 artifact
# this notebook produces.
sagemaker_session = sagemaker.session.Session()
BUCKET = sagemaker_session.default_bucket()
BASE_JOB_PREFIX = 'ieee-fraud-detection'
MODEL_PACKAGE_GROUP_NAME = 'ieee-fraud-detection'
PIPELINE_NAME = 'ieee-fraud-detection-pipeline'

region = boto3.Session().region_name

# IAM role assumed by the pipeline and its jobs. Set SAGEMAKER_ROLE_ARN to run the
# notebook against another account without editing this cell; the original ARN stays
# as the fallback so existing behaviour is unchanged.
# (Inside SageMaker Studio/notebook instances, sagemaker.get_execution_role() is an alternative.)
role = os.environ.get(
    'SAGEMAKER_ROLE_ARN',
    'arn:aws:iam::998601677581:role/service-role/AmazonSageMaker-ExecutionRole-20210114T163887'
)

In [4]:
# %%time
# !aws s3 cp {RAW_DATA_PATH}/train_identity.csv s3://{BUCKET}/{BASE_JOB_PREFIX}/raw_data/training/train_identity.csv  --quiet
# !aws s3 cp {RAW_DATA_PATH}/train_transaction.csv s3://{BUCKET}/{BASE_JOB_PREFIX}/raw_data/training/train_transaction.csv --quiet
# !aws s3 cp {RAW_DATA_PATH}/test_identity.csv s3://{BUCKET}/{BASE_JOB_PREFIX}/raw_data/prediction/test_identity.csv --quiet 
# !aws s3 cp {RAW_DATA_PATH}/test_transaction.csv s3://{BUCKET}/{BASE_JOB_PREFIX}/raw_data/prediction/test_transaction.csv --quiet

In [5]:
# Default S3 locations of the raw data: one prefix for training, one for scoring.
raw_data_prefix = f's3://{BUCKET}/{BASE_JOB_PREFIX}/raw_data'
training_data_uri = f'{raw_data_prefix}/training'
prediction_data_uri = f'{raw_data_prefix}/prediction'

# Pipeline parameters: each can be overridden per execution, otherwise the default applies.
processing_instance_count = ParameterInteger(name='ProcessingInstanceCount', default_value=1)
processing_instance_type = ParameterString(name='ProcessingInstanceType', default_value='ml.m5.2xlarge')
training_data = ParameterString(name='TrainingData', default_value=training_data_uri)
training_instance_type = ParameterString(name='TrainingInstanceType', default_value='ml.m5.2xlarge')
prediction_data = ParameterString(name='PredictionData', default_value=prediction_data_uri)
model_approval_status = ParameterString(name='ModelApprovalStatus', default_value='PendingManualApproval')

# Shared step cache configuration; 'PT1H' is an ISO 8601 duration (one hour).
cache_config = CacheConfig(enable_caching=True, expire_after='PT1H')

In [6]:
# Processor that runs scripts/preprocessing.py; its instance type and count are
# pipeline parameters.
sklearn_processor = SKLearnProcessor(
    framework_version='0.23-1',
    role=role,
    instance_type=processing_instance_type,
    instance_count=processing_instance_count,
    base_job_name=f'{BASE_JOB_PREFIX}-sklearn-processing',
)

# The raw training data is mounted into the container under /opt/ml/processing/training.
preprocess_inputs = [
    ProcessingInput(source=training_data, destination='/opt/ml/processing/training'),
]

# One output channel per split; the channel name matches the container sub-directory.
preprocess_outputs = [
    ProcessingOutput(source=f'/opt/ml/processing/{split}', output_name=split)
    for split in ('train', 'valid', 'test')
]

step_preprocess = ProcessingStep(
    name='PreprocessData',
    processor=sklearn_processor,
    inputs=preprocess_inputs,
    outputs=preprocess_outputs,
    code='scripts/preprocessing.py',
    cache_config=cache_config,
)

In [7]:
# S3 prefix under which training jobs write their model artifacts.
model_output_uri = f's3://{BUCKET}/{BASE_JOB_PREFIX}/models'

# Resolve the built-in XGBoost container image. The image URI has to be known when the
# pipeline is defined, so it is resolved from the parameter's concrete default value
# rather than from the pipeline variable itself: newer SDK releases reject pipeline
# variables in image_uris.retrieve, while older releases resolved the parameter to
# this same default string.
image_uri = sagemaker.image_uris.retrieve(
    framework='xgboost',
    region=region,
    version='1.2-1',
    py_version='py3',
    instance_type=training_instance_type.default_value
)

# Estimator for the first training pass (fit on the train split, validated on the valid split).
estimator = Estimator(
    image_uri=image_uri,
    role=role,
    instance_count=1,
    instance_type=training_instance_type,
    output_path=model_output_uri,
    use_spot_instances=False,
    max_wait=None
)

# XGBoost hyperparameters; `params` is reused by the retraining estimator in a later cell.
params = {
    'booster': 'gbtree',
    'verbosity': 0,
    'objective': 'binary:logistic',
    'seed': 42,
    'max_depth': 6,
    'eta': 0.3,
    'gamma': 0.0,
    'min_child_weight': 1.0,
    'subsample': 1.0,
    'colsample_bytree': 1.0,
    'scale_pos_weight': 1.0,
    'eval_metric': 'auc',
    'num_round': 1000,
    'early_stopping_rounds': 10
}
estimator.set_hyperparameters(**params)

In [8]:
# Training channels are wired to the S3 outputs of the preprocessing step.
preprocess_output_props = step_preprocess.properties.ProcessingOutputConfig.Outputs
train_channel = TrainingInput(
    s3_data=preprocess_output_props['train'].S3Output.S3Uri,
    content_type='text/csv',
)
validation_channel = TrainingInput(
    s3_data=preprocess_output_props['valid'].S3Output.S3Uri,
    content_type='text/csv',
)

step_train = TrainingStep(
    name='TrainModel',
    estimator=estimator,
    inputs={'train': train_channel, 'validation': validation_channel},
    cache_config=cache_config,
)

In [9]:
# The evaluation script runs with python3 inside the same image that was resolved
# for training.
script_processor = ScriptProcessor(
    role=role,
    image_uri=image_uri,
    command=['python3'],
    instance_count=1,
    instance_type=processing_instance_type,
    base_job_name=f'{BASE_JOB_PREFIX}-script-processing',
)

# Property file exposing eval_metrics.json (from the 'evaluation' output) to the
# pipeline, so that a later condition can read values from it.
evaluation = PropertyFile(name='ModelEvaluation', output_name='evaluation', path='eval_metrics.json')

# Inputs: the trained model artifact and the held-out test split.
model_artifact_input = ProcessingInput(
    source=step_train.properties.ModelArtifacts.S3ModelArtifacts,
    destination='/opt/ml/processing/models',
)
test_split_input = ProcessingInput(
    source=step_preprocess.properties.ProcessingOutputConfig.Outputs['test'].S3Output.S3Uri,
    destination='/opt/ml/processing/test',
)
evaluation_output = ProcessingOutput(source='/opt/ml/processing/eval', output_name='evaluation')

step_evaluate = ProcessingStep(
    name='EvaluateModel',
    processor=script_processor,
    inputs=[model_artifact_input, test_split_input],
    outputs=[evaluation_output],
    code='scripts/evaluation.py',
    property_files=[evaluation],
    cache_config=cache_config,
)

In [10]:
# Second processor, configured like the first one but with its own job-name prefix;
# it runs scripts/repreprocessing.py over both the training and the prediction data.
sklearn_reprocessor = SKLearnProcessor(
    framework_version='0.23-1',
    role=role,
    instance_type=processing_instance_type,
    instance_count=processing_instance_count,
    base_job_name=f'{BASE_JOB_PREFIX}-sklearn-reprocessing',
)

repreprocess_inputs = [
    ProcessingInput(source=training_data, destination='/opt/ml/processing/training'),
    ProcessingInput(source=prediction_data, destination='/opt/ml/processing/prediction'),
]

# One output channel per regenerated split; the channel name matches the container sub-directory.
repreprocess_outputs = [
    ProcessingOutput(source=f'/opt/ml/processing/{split}', output_name=split)
    for split in ('retrain', 'revalid', 'retest')
]

step_repreprocess = ProcessingStep(
    name='RepreprocessData',
    processor=sklearn_reprocessor,
    inputs=repreprocess_inputs,
    outputs=repreprocess_outputs,
    code='scripts/repreprocessing.py',
    cache_config=cache_config,
)

In [11]:
# Estimator for the retraining pass: same image, instance settings and
# hyperparameters as the first estimator.
full_estimator = Estimator(
    image_uri=image_uri,
    role=role,
    instance_count=1,
    instance_type=training_instance_type,
    output_path=model_output_uri,
    use_spot_instances=False,
    max_wait=None,
)
full_estimator.set_hyperparameters(**params)

# Map each training channel to the matching output of the re-preprocessing step.
repreprocess_output_props = step_repreprocess.properties.ProcessingOutputConfig.Outputs
retrain_inputs = {
    channel: TrainingInput(
        s3_data=repreprocess_output_props[output_name].S3Output.S3Uri,
        content_type='text/csv',
    )
    for channel, output_name in (('train', 'retrain'), ('validation', 'revalid'))
}

step_retrain = TrainingStep(
    name='RetrainModel',
    estimator=full_estimator,
    inputs=retrain_inputs,
    cache_config=cache_config,
)

In [12]:
# Attach the evaluation report written by the evaluation step as the model's statistics.
evaluation_output_uri = step_evaluate.arguments['ProcessingOutputConfig']['Outputs'][0]['S3Output']['S3Uri']
evaluation_report = MetricsSource(
    content_type='application/json',
    s3_uri='{}/eval_metrics.json'.format(evaluation_output_uri),
)
model_metrics = ModelMetrics(model_statistics=evaluation_report)

# Register the retrained model artifact in the model package group with the
# approval status taken from the pipeline parameter.
step_register = RegisterModel(
    name='RegisterModel',
    estimator=full_estimator,
    model_data=step_retrain.properties.ModelArtifacts.S3ModelArtifacts,
    content_types=['text/csv'],
    response_types=['text/csv'],
    inference_instances=['ml.t2.medium', 'ml.m5.2xlarge'],
    transform_instances=['ml.m5.2xlarge'],
    model_package_group_name=MODEL_PACKAGE_GROUP_NAME,
    model_metrics=model_metrics,
    approval_status=model_approval_status,
)

In [13]:
# Model object wrapping the retrained artifact and the training image.
retrained_model_data = step_retrain.properties.ModelArtifacts.S3ModelArtifacts
model = Model(
    image_uri=image_uri,
    model_data=retrained_model_data,
    role=role,
    sagemaker_session=sagemaker_session,
)

# Despite its name, this step is a CreateModelStep; the resulting model name is
# consumed by the batch-transform step.
create_model_input = CreateModelInput(
    instance_type='ml.m5.2xlarge',
    accelerator_type='ml.eia2.medium',
)
step_deploy = CreateModelStep(name='DeployModel', model=model, inputs=create_model_input)

In [14]:
# Batch transformer bound to the model created by the previous step; predictions
# are written under the 'pred' prefix of the project bucket.
prediction_output_uri = f's3://{BUCKET}/{BASE_JOB_PREFIX}/pred'
full_transformer = Transformer(
    model_name=step_deploy.properties.ModelName,
    instance_count=1,
    instance_type='ml.m5.2xlarge',
    output_path=prediction_output_uri,
)

# Score the 'retest' output of the re-preprocessing step, splitting the CSV input by line.
transform_input = TransformInput(
    data=step_repreprocess.properties.ProcessingOutputConfig.Outputs['retest'].S3Output.S3Uri,
    content_type='text/csv',
    split_type='Line',
)

step_predict = TransformStep(
    name='PredictData',
    transformer=full_transformer,
    inputs=transform_input,
    cache_config=cache_config,
)

In [15]:
# Quality gate: the retrain/register/create-model/predict branch only runs when the
# evaluation metric meets the target.
target_metric = 'auroc'
target_value = 0.9
target_minimize = False  # True for metrics where lower is better

# Pick the comparison that matches the metric's direction. The class is bound to a
# descriptive name (not `step`) so it cannot be confused with the pipeline steps or
# with the `step=` keyword of JsonGet.
condition_cls = ConditionLessThanOrEqualTo if target_minimize else ConditionGreaterThanOrEqualTo

# Left-hand side: the metric read from the evaluation step's property file.
metric_value = JsonGet(
    step=step_evaluate,
    property_file=evaluation,
    json_path=f'eval_metric.{target_metric}'
)
condition = condition_cls(left=metric_value, right=target_value)

step_check = ConditionStep(
    name='CheckCondition',
    conditions=[condition],
    if_steps=[step_repreprocess, step_retrain, step_register, step_deploy, step_predict],
    else_steps=[]
)

In [16]:
# Parameters exposed by the pipeline (overridable per execution).
pipeline_parameters = [
    processing_instance_count,
    processing_instance_type,
    training_data,
    training_instance_type,
    prediction_data,
    model_approval_status,
]

# Top-level steps; the remaining steps are attached through step_check's if-branch.
pipeline_steps = [step_preprocess, step_train, step_evaluate, step_check]

pipeline = Pipeline(
    name=PIPELINE_NAME,
    parameters=pipeline_parameters,
    steps=pipeline_steps,
)

In [17]:
# Render the pipeline definition (a JSON document) and parse it for inspection.
definition_json = pipeline.definition()
definition = json.loads(definition_json)
# pprint.pprint(definition)

No finished training job found associated with this estimator. Please make sure this estimator is only used for building workflow config


In [18]:
# Create the pipeline (or update it if it already exists), then start an execution.
# The upsert response is deliberately not bound to `_`: assigning to `_` shadows
# IPython's last-output variable, and the value is not displayed anyway because the
# call is not the cell's last expression.
pipeline.upsert(role_arn=role)
execution = pipeline.start()
description = execution.describe()
# pprint.pprint(description)

No finished training job found associated with this estimator. Please make sure this estimator is only used for building workflow config
No finished training job found associated with this estimator. Please make sure this estimator is only used for building workflow config


In [19]:
# Block until the pipeline execution finishes. The SDK defaults (delay=30 s,
# max_attempts=60, i.e. about 30 minutes) are too short for this pipeline and end in
# "WaiterError: ... Max attempts exceeded", so poll once a minute for up to 3 hours.
execution.wait(delay=60, max_attempts=180)

WaiterError: Waiter PipelineExecutionComplete failed: Max attempts exceeded

In [None]:
# execution.list_steps()

In [None]:
# Download the evaluation report written by the evaluation step.
eval_metrics = sagemaker.s3.S3Downloader.read_file(
    '{}/eval_metrics.json'.format(
        step_evaluate.arguments['ProcessingOutputConfig']['Outputs'][0]['S3Output']['S3Uri'])
)

# Format every metric as "NAME: 12.34%". Joining with ', ' (instead of appending a
# trailing ', ' and slicing it off) keeps the header intact when the report has no metrics.
metric_summary = ', '.join(
    '{}: {:.2%}'.format(metric_name.upper(), metric_value)
    for metric_name, metric_value in json.loads(eval_metrics)['eval_metric'].items()
)
string = '<MODEL EVALUATION>\n' + metric_summary
print(string)

In [None]:
# Show, for every execution step (in reverse of the order returned by list_steps),
# its flattened description followed by its lineage table.
viz = LineageTableVisualizer(sagemaker_session)
execution_steps = execution.list_steps()
for execution_step in reversed(execution_steps):
    step_frame = pd.json_normalize(execution_step)
    display(step_frame)
    lineage_table = viz.show(pipeline_execution_step=execution_step)
    display(lineage_table)
    print()

In [None]:
# pipeline.delete()