In [2]:
# Create the local scratch directory used for downloads; -p makes this a no-op
# when the directory already exists.
!mkdir -p tmp

In [3]:
# Components of the raw-content URL for the Chapter01 sample files.
g, p = "raw.githubusercontent.com", "PacktPublishing"
a, mc = "Amazon-SageMaker-Cookbook", "master/Chapter01"

# Base URL of the directory that holds the sample data files.
path = "https://" + "/".join([g, p, a, mc, "files"])

In [4]:
csv = "management_experience_and_salary.csv"

!wget -P tmp {path}/{csv}

--2021-06-08 02:43:54--  https://raw.githubusercontent.com/PacktPublishing/Amazon-SageMaker-Cookbook/master/Chapter01/files/management_experience_and_salary.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.111.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 331 [text/plain]
Saving to: ‘tmp/management_experience_and_salary.csv.4’


2021-06-08 02:43:54 (12.2 MB/s) - ‘tmp/management_experience_and_salary.csv.4’ saved [331/331]



In [5]:
# Bucket and key prefix under which this notebook stores its data and model.
s3_bucket = 'sagemaker-cookbook-bucket'
prefix = 'chapter09'

# Full S3 URI of the uploaded input CSV.
input_data_uri = "s3://" + "/".join([s3_bucket, prefix, "input", csv])

In [6]:
# Upload the downloaded CSV from the local tmp directory to the S3 input URI.
!aws s3 cp tmp/{csv} {input_data_uri}

upload: tmp/management_experience_and_salary.csv to s3://sagemaker-cookbook-bucket/chapter09/input/management_experience_and_salary.csv


In [7]:
from sagemaker.workflow.parameters import (
    ParameterInteger,
    ParameterString,
)

# Named pipeline parameters and their default values. All three are registered
# on the Pipeline object through its `parameters` list.

# S3 URI of the CSV consumed by the processing step.
input_data = ParameterString(name="InputData", default_value=input_data_uri)

# Instance type for the processing job.
processing_instance_type = ParameterString(
    name="ProcessingInstanceType", default_value="ml.m5.xlarge"
)

# Instance type intended for the training job.
training_instance_type = ParameterString(
    name="TrainingInstanceType", default_value="ml.m5.xlarge"
)

In [8]:
from sagemaker import get_execution_role 

# Execution role that is handed to the processor, the estimator and
# pipeline.upsert() further down.
role = get_execution_role()

In [9]:
# Components of the raw-content URL for the Chapter09 scripts.
# NOTE: this rebinds `path`, which previously pointed at the Chapter01 files.
g, p = "raw.githubusercontent.com", "PacktPublishing"
a, mc = "Amazon-SageMaker-Cookbook", "master/Chapter09"

# Base URL of the directory that holds the processing script.
path = "https://" + "/".join([g, p, a, mc, "scripts"])

In [10]:
!wget -P tmp {path}/preprocessing.py

--2021-06-08 02:43:56--  https://raw.githubusercontent.com/PacktPublishing/Amazon-SageMaker-Cookbook/master/Chapter09/scripts/preprocessing.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.108.133, 185.199.111.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 793 [text/plain]
Saving to: ‘tmp/preprocessing.py.2’


2021-06-08 02:43:56 (24.7 MB/s) - ‘tmp/preprocessing.py.2’ saved [793/793]



In [11]:
from sagemaker.sklearn.processing import SKLearnProcessor

# Version string passed to the scikit-learn processor.
framework_version = "0.23-1"

# Single-instance scikit-learn processor. Its instance type comes from the
# ProcessingInstanceType pipeline parameter.
sklearn_processor = SKLearnProcessor(
    role=role,
    instance_count=1,
    instance_type=processing_instance_type,
    framework_version=framework_version,
)

In [12]:
from sagemaker.processing import ProcessingInput, ProcessingOutput
from sagemaker.workflow.steps import ProcessingStep

# Input: the CSV referenced by the InputData parameter, placed under the
# container's input directory.
raw_data_input = ProcessingInput(
    source=input_data,
    destination="/opt/ml/processing/input",
)

# Output: the container's output directory, published under the name "output"
# (the training step looks it up by this name).
processed_output = ProcessingOutput(
    output_name="output",
    source="/opt/ml/processing/output",
)

# Pipeline step that runs the downloaded preprocessing script.
step_process = ProcessingStep(
    name="ProcessingStep",
    processor=sklearn_processor,
    inputs=[raw_data_input],
    outputs=[processed_output],
    code="tmp/preprocessing.py",
)

In [13]:
import sagemaker 
import boto3
from sagemaker import get_execution_role 

# Handles used by the estimator cell: SageMaker session, current region
# and execution role.
session = sagemaker.Session()
region_name = boto3.Session().region_name
role = get_execution_role()

In [14]:
from sagemaker.image_uris import retrieve 

# S3 prefix that receives the training job's output.
model_path = f"s3://{s3_bucket}/{prefix}/model"

# Image URI of the "linear-learner" algorithm, version "1", for this region.
container = retrieve(
    framework="linear-learner",
    region=region_name,
    version="1",
)

# The instance type is taken from the TrainingInstanceType pipeline parameter
# instead of a hard-coded string, so that the declared parameter (default
# "ml.m5.xlarge") actually controls the training job.
estimator = sagemaker.estimator.Estimator(
    image_uri=container,
    role=role,
    instance_count=1,
    instance_type=training_instance_type,
    output_path=model_path,
    sagemaker_session=session,
)

# Regression task; the mini-batch size is presumably kept this small because
# the sample dataset is tiny (TODO confirm).
estimator.set_hyperparameters(
    predictor_type='regressor',
    mini_batch_size=4,
)

In [15]:
from sagemaker.inputs import TrainingInput
from sagemaker.workflow.steps import TrainingStep

# S3 URI of the processing step's output named "output". This is a step
# property reference, which also ties the training step to the processing step.
s3_input_data = (
    step_process.properties
    .ProcessingOutputConfig
    .Outputs["output"]
    .S3Output
    .S3Uri
)

# CSV-formatted data for the "train" channel.
train_channel = TrainingInput(
    s3_data=s3_input_data,
    content_type="text/csv",
)

# Pipeline step that trains the estimator on the processed data.
step_train = TrainingStep(
    name="TrainStep",
    estimator=estimator,
    inputs={"train": train_channel},
)

In [16]:
from sagemaker.workflow.pipeline import Pipeline

# Name under which the pipeline is registered (a plain literal; no
# interpolation is needed here).
pipeline_name = "Pipeline"

# Pipeline made of the processing step followed by the training step, exposing
# the three parameters declared earlier.
pipeline = Pipeline(
    name=pipeline_name,
    parameters=[
        processing_instance_type,
        training_instance_type,
        input_data,
    ],
    steps=[step_process, step_train],
)

In [17]:
# Register the pipeline definition with SageMaker under the execution role
# (upsert: presumably create-or-update); the response carries the PipelineArn.
pipeline.upsert(role_arn=role)

{'PipelineArn': 'arn:aws:sagemaker:us-east-1:581320662326:pipeline/pipeline',
 'ResponseMetadata': {'RequestId': 'f94129c9-cf8b-4d5f-a1bf-1b9aa364f5ab',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': 'f94129c9-cf8b-4d5f-a1bf-1b9aa364f5ab',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '76',
   'date': 'Tue, 08 Jun 2021 02:43:56 GMT'},
  'RetryAttempts': 0}}

In [18]:
# Start a pipeline execution; no parameter overrides are passed here.
execution = pipeline.start()

In [19]:
# Show the execution's current details (status, ARNs, timestamps).
execution.describe()

{'PipelineArn': 'arn:aws:sagemaker:us-east-1:581320662326:pipeline/pipeline',
 'PipelineExecutionArn': 'arn:aws:sagemaker:us-east-1:581320662326:pipeline/pipeline/execution/eg9wnguv5caj',
 'PipelineExecutionDisplayName': 'execution-1623120238141',
 'PipelineExecutionStatus': 'Executing',
 'CreationTime': datetime.datetime(2021, 6, 8, 2, 43, 58, 4000, tzinfo=tzlocal()),
 'LastModifiedTime': datetime.datetime(2021, 6, 8, 2, 43, 58, 4000, tzinfo=tzlocal()),
 'CreatedBy': {'UserProfileArn': 'arn:aws:sagemaker:us-east-1:581320662326:user-profile/d-rgvubtsq1vug/arvs',
  'UserProfileName': 'arvs',
  'DomainId': 'd-rgvubtsq1vug'},
 'LastModifiedBy': {'UserProfileArn': 'arn:aws:sagemaker:us-east-1:581320662326:user-profile/d-rgvubtsq1vug/arvs',
  'UserProfileName': 'arvs',
  'DomainId': 'd-rgvubtsq1vug'},
 'ResponseMetadata': {'RequestId': 'de75cb4a-912c-4ab1-836a-2526f6e2f2b1',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': 'de75cb4a-912c-4ab1-836a-2526f6e2f2b1',
   'content-ty

In [20]:
# Wait for the pipeline execution before inspecting its steps
# (presumably blocks until the execution reaches a terminal state).
execution.wait()

In [21]:
# List the executed steps with their status and job ARNs.
execution.list_steps()

[{'StepName': 'TrainStep',
  'StartTime': datetime.datetime(2021, 6, 8, 2, 48, 41, 895000, tzinfo=tzlocal()),
  'EndTime': datetime.datetime(2021, 6, 8, 2, 52, 20, 663000, tzinfo=tzlocal()),
  'StepStatus': 'Succeeded',
  'Metadata': {'TrainingJob': {'Arn': 'arn:aws:sagemaker:us-east-1:581320662326:training-job/pipelines-eg9wnguv5caj-trainstep-3slug8gveh'}}},
 {'StepName': 'ProcessingStep',
  'StartTime': datetime.datetime(2021, 6, 8, 2, 43, 59, 417000, tzinfo=tzlocal()),
  'EndTime': datetime.datetime(2021, 6, 8, 2, 48, 41, 498000, tzinfo=tzlocal()),
  'StepStatus': 'Succeeded',
  'Metadata': {'ProcessingJob': {'Arn': 'arn:aws:sagemaker:us-east-1:581320662326:processing-job/pipelines-eg9wnguv5caj-processingstep-3xmqm1lfef'}}}]

In [22]:
import time
from sagemaker.lineage.visualizer import LineageTableVisualizer

# Session and visualizer used for the lineage queries.
session = sagemaker.session.Session()
viz = LineageTableVisualizer(session)

# Walk the steps in the reverse of the order list_steps() returns them; in the
# recorded run this shows ProcessingStep before TrainStep.
for step in reversed(execution.list_steps()):
    print(step)
    lineage_table = viz.show(pipeline_execution_step=step)
    display(lineage_table)
    # Pause between lineage queries (presumably to avoid API throttling).
    time.sleep(5)

{'StepName': 'ProcessingStep', 'StartTime': datetime.datetime(2021, 6, 8, 2, 43, 59, 417000, tzinfo=tzlocal()), 'EndTime': datetime.datetime(2021, 6, 8, 2, 48, 41, 498000, tzinfo=tzlocal()), 'StepStatus': 'Succeeded', 'Metadata': {'ProcessingJob': {'Arn': 'arn:aws:sagemaker:us-east-1:581320662326:processing-job/pipelines-eg9wnguv5caj-processingstep-3xmqm1lfef'}}}


Unnamed: 0,Name/Source,Direction,Type,Association Type,Lineage Type
0,s3://...02-43-57-070/input/code/preprocessing.py,Input,DataSet,ContributedTo,artifact
1,s3://...put/management_experience_and_salary.csv,Input,DataSet,ContributedTo,artifact
2,68331...om/sagemaker-scikit-learn:0.23-1-cpu-py3,Input,Image,ContributedTo,artifact
3,s3://...rn-2021-06-08-02-43-57-070/output/output,Output,DataSet,Produced,artifact


{'StepName': 'TrainStep', 'StartTime': datetime.datetime(2021, 6, 8, 2, 48, 41, 895000, tzinfo=tzlocal()), 'EndTime': datetime.datetime(2021, 6, 8, 2, 52, 20, 663000, tzinfo=tzlocal()), 'StepStatus': 'Succeeded', 'Metadata': {'TrainingJob': {'Arn': 'arn:aws:sagemaker:us-east-1:581320662326:training-job/pipelines-eg9wnguv5caj-trainstep-3slug8gveh'}}}


Unnamed: 0,Name/Source,Direction,Type,Association Type,Lineage Type
0,s3://...rn-2021-06-08-02-43-57-070/output/output,Input,DataSet,ContributedTo,artifact
1,38241...us-east-1.amazonaws.com/linear-learner:1,Input,Image,ContributedTo,artifact
2,s3://...TrainStep-3sLug8gVEh/output/model.tar.gz,Output,Model,Produced,artifact


In [23]:
from pprint import pprint

# Fetch the registered pipeline's description and pretty-print it.
pipeline_description = pipeline.describe()
pprint(pipeline_description)

{'CreatedBy': {'DomainId': 'd-rgvubtsq1vug',
               'UserProfileArn': 'arn:aws:sagemaker:us-east-1:581320662326:user-profile/d-rgvubtsq1vug/arvs',
               'UserProfileName': 'arvs'},
 'CreationTime': datetime.datetime(2021, 6, 8, 2, 43, 57, 600000, tzinfo=tzlocal()),
 'LastModifiedBy': {'DomainId': 'd-rgvubtsq1vug',
                    'UserProfileArn': 'arn:aws:sagemaker:us-east-1:581320662326:user-profile/d-rgvubtsq1vug/arvs',
                    'UserProfileName': 'arvs'},
 'LastModifiedTime': datetime.datetime(2021, 6, 8, 2, 52, 27, 428000, tzinfo=tzlocal()),
 'PipelineArn': 'arn:aws:sagemaker:us-east-1:581320662326:pipeline/pipeline',
 'PipelineDefinition': '{"Version": "2020-12-01", "Metadata": {}, '
                       '"Parameters": [{"Name": "ProcessingInstanceType", '
                       '"Type": "String", "DefaultValue": "ml.m5.xlarge"}, '
                       '{"Name": "TrainingInstanceType", "Type": "String", '
                       '"DefaultValue":

In [24]:
# Cleanup: uncomment the next line to delete the pipeline once you are done.
# pipeline.delete()