# Pipelines

## Setup

### Environment

* Base Python 3.0
* ml.t3.medium
* 2 vCPU + 4 GiB

### Dependencies

In [2]:
!pip install sagemaker
!pip install -U scikit-learn

[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m23.1.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m23.1.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


### Imports

In [3]:
import pathlib
import json
import sagemaker
import sagemaker.sklearn
import sklearn.model_selection
import sagemaker.workflow.parameters
import sagemaker.workflow.steps
import sagemaker.workflow.pipeline
import sagemaker.sklearn
from scripts import shared_constants
import logging

In [4]:
# Configure root logging at INFO level so informational records (for example the
# `INFO:sagemaker.image_uris:...` lines emitted by later cells) appear in cell output.
logging.basicConfig(level=logging.INFO)

### Clients

In [5]:
# SageMaker session created with no arguments, i.e. from the default configuration.
sagemaker_session = sagemaker.Session()

In [6]:
# Execution role reported by the SDK for this environment. It is passed below to
# the processor, the estimator and `pipeline.upsert`.
role = sagemaker.get_execution_role()

### Constants

In [7]:
# Framework version passed to the SKLearnProcessor used by the processing step.
SKLEARN_FRAMEWORK_VERSION = "1.2-1"
# Base job name shared by the processor and the estimator.
BASE_JOB_NAME = "birds-200-pipeline"
# Name of the pipeline; also used as the prefix of every step name.
PIPELINE_NAME = "Birds200Pipeline"
# True selects LocalPipelineSession, False selects PipelineSession.
IS_LOCAL_PIPELINE = False
# Region used to look up the training image. It is taken from the session instead
# of being hard-coded so the image always comes from the region the jobs run in.
REGION = sagemaker_session.boto_region_name

#### Parameter Names

In [8]:
# Names under which the pipeline parameters are registered; they appear verbatim
# in the "Parameters" section of the pipeline definition.
PROC_INSTANCE_TYPE = "ProcessingInstanceType"
PROC_INSTANCE_COUNT = "ProcessingInstanceCount"
TRAIN_INSTANCE_TYPE = "TrainingInstanceType"

## Build Model Pipeline

### Parameters

In [9]:
# Pipeline parameters and their default values.

# Processing step: instance type and number of instances.
processing_instance_type = sagemaker.workflow.parameters.ParameterString(
    name=PROC_INSTANCE_TYPE,
    default_value="ml.m5.large",
)
processing_instance_count = sagemaker.workflow.parameters.ParameterInteger(
    name=PROC_INSTANCE_COUNT,
    default_value=1,
)

# Training step: instance type.
training_instance_type = sagemaker.workflow.parameters.ParameterString(
    name=TRAIN_INSTANCE_TYPE,
    default_value="ml.g4dn.4xlarge",
)

### Helpers

In [10]:
def _step_name(step):
    """Return the pipeline-scoped name for a step.

    The result is ``PIPELINE_NAME`` and ``step`` joined by a hyphen, e.g.
    ``"Birds200Pipeline-Processing"`` for ``step="Processing"``.
    """
    return "{}-{}".format(PIPELINE_NAME, step)

### Pipeline Session

Set `IS_LOCAL_PIPELINE = True` to use `LocalPipelineSession` during initial development. Once you are confident in the changes, set it back to `False` to switch to `PipelineSession`.

In [11]:
# Pick the session flavour from the IS_LOCAL_PIPELINE flag.
# NOTE(review): `sagemaker.workflow.pipeline_context` is not imported explicitly in
# the imports cell; this relies on it being loaded by the other sagemaker imports.
pipeline_session = (
    sagemaker.workflow.pipeline_context.LocalPipelineSession()
    if IS_LOCAL_PIPELINE
    else sagemaker.workflow.pipeline_context.PipelineSession()
)

### Workflow Steps

#### Processing Step

In [12]:
# scikit-learn processor bound to the pipeline session; instance type and count
# come from the pipeline parameters rather than fixed values.
sklearn_processor = sagemaker.sklearn.processing.SKLearnProcessor(
    role=role,
    sagemaker_session=pipeline_session,
    base_job_name=BASE_JOB_NAME,
    framework_version=SKLEARN_FRAMEWORK_VERSION,
    instance_count=processing_instance_count,
    instance_type=processing_instance_type,
)

INFO:sagemaker.image_uris:Defaulting to only available Python version: py3


In [13]:
# The local ./scripts directory is the single processing input, placed at ML_INPUT_DIR.
scripts_input = sagemaker.processing.ProcessingInput(
    input_name="scripts",
    source="./scripts",
    destination=str(shared_constants.ML_INPUT_DIR),
)
inputs = [scripts_input]

# One processing output per data split, given as (output name, source path).
split_outputs = (
    (shared_constants.TRAIN_CHANNEL, shared_constants.ML_TRAIN_DIR),
    (shared_constants.VAL_CHANNEL, shared_constants.ML_VAL_DIR),
    (shared_constants.TEST_CHANNEL, shared_constants.ML_TEST_DIR),
)
outputs = [
    sagemaker.processing.ProcessingOutput(
        source=str(split_dir),
        output_name=str(channel),
    )
    for channel, split_dir in split_outputs
]

# The processor is bound to the pipeline session, so the result of run() is
# handed to the step as its arguments.
processing_step_args = sklearn_processor.run(
    code="./scripts/install_packages.py",
    inputs=inputs,
    outputs=outputs,
)
processing_step = sagemaker.workflow.steps.ProcessingStep(
    name=_step_name("Processing"),
    step_args=processing_step_args,
)



#### Model Step

This step uses SageMaker's built-in object-detection algorithm and its Docker image. Alternatively, you can run a custom model inside a prebuilt image or deploy a fully custom image. Here are some helpful resources:
* Learn more about training with Amazon SageMaker: [link](https://docs.aws.amazon.com/sagemaker/latest/dg/how-it-works-training.html)
* Explore an example of using PyTorch for MNIST classification in SageMaker: [link](https://github.com/aws/amazon-sagemaker-examples/blob/main/sagemaker-python-sdk/pytorch_mnist/pytorch_mnist.ipynb)
* Understand Docker containers in SageMaker: [link](https://docs.aws.amazon.com/sagemaker/latest/dg/docker-containers.html)

In [14]:
# Look up the URI of the built-in object-detection image for REGION.
training_image = sagemaker.image_uris.retrieve(region=REGION, framework="object-detection", version="latest")

INFO:sagemaker.image_uris:Same images used for training and inference. Defaulting to image scope: inference.
INFO:sagemaker.image_uris:Ignoring unnecessary instance type: None.


In [15]:
# Estimator for the built-in object-detection image, bound to the pipeline session.
# NOTE(review): `output_path` is expected to be an S3 URI; confirm that
# shared_constants.S3_OUTPUT_OBJECT_KEY holds a full "s3://..." location.
estimator = sagemaker.estimator.Estimator(
    image_uri=training_image,
    role=role,
    sagemaker_session=pipeline_session,
    base_job_name=BASE_JOB_NAME,
    instance_type=training_instance_type,
    instance_count=1,
    volume_size=50,
    max_run=5400,  # 1.5 hours in seconds, i.e. int(3600 * 1.5)
    input_mode="File",
    output_path=shared_constants.S3_OUTPUT_OBJECT_KEY,
)

For information on object-detection hyperparameters, refer to the [documentation](https://docs.aws.amazon.com/sagemaker/latest/dg/object-detection-api-config.html).

In [16]:
# Hyperparameters derived from the shared dataset constants: the number of
# classes and the number of training samples.
object_detection_hyperparameters = {
    "num_classes": len(shared_constants.CLASS_IDS),
    "num_training_samples": shared_constants.NUM_TRAINING_SAMPLES,
}
estimator.set_hyperparameters(**object_detection_hyperparameters)

In [17]:
# Wire each training channel to the S3 URI of the matching processing-step output.
# Both channels are declared with the RecordIO content type.
processing_outputs = processing_step.properties.ProcessingOutputConfig.Outputs
estimator_inputs = {
    channel: sagemaker.inputs.TrainingInput(
        s3_data=processing_outputs[output_name].S3Output.S3Uri,
        content_type="application/x-recordio",
    )
    for channel, output_name in (
        ("train", shared_constants.TRAIN_CHANNEL),
        ("validation", shared_constants.VAL_CHANNEL),
    )
}

In [18]:
# The estimator is bound to the pipeline session, so the result of fit() is
# handed to the step as its arguments.
training_step_args = estimator.fit(inputs=estimator_inputs)
training_step = sagemaker.workflow.steps.TrainingStep(
    name=_step_name("Training"),
    step_args=training_step_args,
)

### Create the Pipeline

In [19]:
# Assemble the pipeline from its parameters and its two steps.
pipeline_parameters = [
    processing_instance_type,
    processing_instance_count,
    training_instance_type,
]
pipeline_steps = [processing_step, training_step]

pipeline = sagemaker.workflow.pipeline.Pipeline(
    name=PIPELINE_NAME,
    sagemaker_session=pipeline_session,
    parameters=pipeline_parameters,
    steps=pipeline_steps,
)

#### Inspect the Pipeline Definition

In [20]:
# Parse the pipeline definition (a JSON string) so the notebook displays it as a nested dict.
json.loads(pipeline.definition())

{'Version': '2020-12-01',
 'Metadata': {},
 'Parameters': [{'Name': 'ProcessingInstanceType',
   'Type': 'String',
   'DefaultValue': 'ml.m5.large'},
  {'Name': 'ProcessingInstanceCount', 'Type': 'Integer', 'DefaultValue': 1},
  {'Name': 'TrainingInstanceType',
   'Type': 'String',
   'DefaultValue': 'ml.g4dn.4xlarge'}],
 'PipelineExperimentConfig': {'ExperimentName': {'Get': 'Execution.PipelineName'},
  'TrialName': {'Get': 'Execution.PipelineExecutionId'}},
 'Steps': [{'Name': 'Birds200Pipeline-Processing',
   'Type': 'Processing',
   'Arguments': {'ProcessingResources': {'ClusterConfig': {'InstanceType': {'Get': 'Parameters.ProcessingInstanceType'},
      'InstanceCount': {'Get': 'Parameters.ProcessingInstanceCount'},
      'VolumeSizeInGB': 30}},
    'AppSpecification': {'ImageUri': '683313688378.dkr.ecr.us-east-1.amazonaws.com/sagemaker-scikit-learn:1.2-1-cpu-py3',
     'ContainerEntrypoint': ['python3',
      '/opt/ml/processing/input/code/install_packages.py']},
    'RoleArn': '

### Build the Pipeline

In [21]:
# Create the pipeline, or update it if it already exists, using `role` as the role ARN.
pipeline.upsert(role_arn=role)

{'PipelineArn': 'arn:aws:sagemaker:us-east-1:180797159824:pipeline/birds200pipeline',
 'ResponseMetadata': {'RequestId': '407a320c-6c16-4588-9d52-65672cd276f4',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '407a320c-6c16-4588-9d52-65672cd276f4',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '84',
   'date': 'Sat, 13 May 2023 21:10:38 GMT'},
  'RetryAttempts': 0}}

In [22]:
# Start an execution; no parameter overrides are passed, so the defaults apply.
execution = pipeline.start()

In [23]:
# Block until the pipeline execution has finished.
execution.wait()

In [24]:
# Show each step of the execution with its status, timing and job ARN.
execution.list_steps()

[{'StepName': 'Birds200Pipeline-Training',
  'StartTime': datetime.datetime(2023, 5, 13, 21, 16, 56, 804000, tzinfo=tzlocal()),
  'EndTime': datetime.datetime(2023, 5, 13, 21, 25, 29, 392000, tzinfo=tzlocal()),
  'StepStatus': 'Succeeded',
  'AttemptCount': 0,
  'Metadata': {'TrainingJob': {'Arn': 'arn:aws:sagemaker:us-east-1:180797159824:training-job/pipelines-ell1gp7g1fih-Birds200Pipeline-Tra-WjX1uMhOZj'}}},
 {'StepName': 'Birds200Pipeline-Processing',
  'StartTime': datetime.datetime(2023, 5, 13, 21, 10, 40, 935000, tzinfo=tzlocal()),
  'EndTime': datetime.datetime(2023, 5, 13, 21, 16, 55, 787000, tzinfo=tzlocal()),
  'StepStatus': 'Succeeded',
  'AttemptCount': 0,
  'Metadata': {'ProcessingJob': {'Arn': 'arn:aws:sagemaker:us-east-1:180797159824:processing-job/pipelines-ell1gp7g1fih-Birds200Pipeline-Pro-nBkUKdeWy6'}}}]