In [62]:
# Create the local scratch directory that the wget cells download into.
# -p makes the command succeed even when the directory already exists.
!mkdir -p tmp

In [63]:
# Base URL of the Chapter01 sample files in the cookbook's GitHub repository.
path = "https://raw.githubusercontent.com/PacktPublishing/Amazon-SageMaker-Cookbook/master/Chapter01/files"

In [64]:
!wget -P tmp {path}/management_experience_and_salary.csv

--2021-06-08 01:59:53--  https://raw.githubusercontent.com/PacktPublishing/Amazon-SageMaker-Cookbook/master/Chapter01/files/management_experience_and_salary.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.108.133, 185.199.111.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 331 [text/plain]
Saving to: ‘tmp/management_experience_and_salary.csv.2’


2021-06-08 01:59:53 (10.3 MB/s) - ‘tmp/management_experience_and_salary.csv.2’ saved [331/331]



In [65]:
# Bucket and key prefix under which this notebook stores its S3 objects.
s3_bucket = "sagemaker-cookbook-bucket"
prefix = "chapter09"

# S3 URI the sample CSV is uploaded to; also the default of the InputData
# pipeline parameter.
input_data_uri = "s3://{}/{}/input/management_experience_and_salary.csv".format(
    s3_bucket, prefix
)

In [66]:
# Copy the downloaded CSV from the local tmp directory to input_data_uri.
!aws s3 cp tmp/management_experience_and_salary.csv {input_data_uri}

upload: tmp/management_experience_and_salary.csv to s3://sagemaker-cookbook-bucket/chapter09/input/management_experience_and_salary.csv


In [67]:
from sagemaker.workflow.parameters import (
    ParameterInteger,
    ParameterString,
)

# Pipeline parameters and their default values. All three are registered in
# the Pipeline's `parameters` list further down.

# Default input location: the CSV uploaded to S3 above.
input_data = ParameterString(name="InputData", default_value=input_data_uri)

# Instance type for the processing job.
processing_instance_type = ParameterString(
    name="ProcessingInstanceType", default_value="ml.m5.xlarge"
)

# Instance type intended for the training job.
training_instance_type = ParameterString(
    name="TrainingInstanceType", default_value="ml.m5.xlarge"
)

In [68]:
from sagemaker import get_execution_role 

# Execution role returned by the SageMaker SDK; passed as `role=` to the
# SKLearnProcessor below.
role = get_execution_role()

In [69]:
# Base URL of the Chapter09 scripts in the cookbook's GitHub repository.
# NOTE(review): this rebinds `path`, which previously held the Chapter01 data
# URL. Re-running the CSV download cell after this one would request the CSV
# from the scripts folder instead.
path = "https://raw.githubusercontent.com/PacktPublishing/Amazon-SageMaker-Cookbook/master/Chapter09/scripts"

In [70]:
!wget -P tmp {path}/preprocessing.py

--2021-06-08 01:59:55--  https://raw.githubusercontent.com/PacktPublishing/Amazon-SageMaker-Cookbook/master/Chapter09/scripts/preprocessing.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.109.133, 185.199.108.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 793 [text/plain]
Saving to: ‘tmp/preprocessing.py.1’


2021-06-08 01:59:55 (29.1 MB/s) - ‘tmp/preprocessing.py.1’ saved [793/793]



In [71]:
from sagemaker.sklearn.processing import SKLearnProcessor

# scikit-learn container version used by the processing job.
framework_version = "0.23-1"

# Processor that runs the preprocessing script on a single instance. The
# instance type comes from the ProcessingInstanceType pipeline parameter.
sklearn_processor = SKLearnProcessor(
    role=role,
    instance_count=1,
    instance_type=processing_instance_type,
    framework_version=framework_version,
)

In [72]:
from sagemaker.processing import ProcessingInput, ProcessingOutput
from sagemaker.workflow.steps import ProcessingStep


# The InputData parameter is made available to the script under
# /opt/ml/processing/input.
processing_inputs = [
    ProcessingInput(source=input_data, destination="/opt/ml/processing/input")
]

# Whatever the script writes to /opt/ml/processing/output is exposed as the
# output named "output", which the training step reads later.
processing_outputs = [
    ProcessingOutput(output_name="output", source="/opt/ml/processing/output")
]

# Pipeline step that runs tmp/preprocessing.py with the processor above.
step_process = ProcessingStep(
    name="ProcessingStep",
    processor=sklearn_processor,
    inputs=processing_inputs,
    outputs=processing_outputs,
    code="tmp/preprocessing.py",
)

In [73]:
import sagemaker 
import boto3
from sagemaker import get_execution_role 

# Region of the default boto3 session; used to look up the training image.
region_name = boto3.Session().region_name

# SageMaker session handed to the estimator.
session = sagemaker.Session()

# Execution role used by the estimator and by pipeline.upsert().
role = get_execution_role()

In [74]:
from sagemaker.image_uris import retrieve 

# S3 prefix passed to the estimator as its output path.
model_path = f"s3://{s3_bucket}/{prefix}/model"

# Image URI of the built-in Linear Learner algorithm (version "1") for the
# current region.
container = retrieve(
    "linear-learner", 
    region_name, "1"
)

# The instance type is taken from the TrainingInstanceType pipeline parameter
# instead of a hard-coded literal. The parameter is declared and registered
# with the pipeline, but was previously never consumed, so overriding it for
# an execution had no effect. Its default ("ml.m5.xlarge") matches the value
# that was hard-coded here.
estimator = sagemaker.estimator.Estimator(
    container,
    role,
    instance_count=1,
    instance_type=training_instance_type,
    output_path=model_path,
    sagemaker_session=session,
)

# Train a regression model.
# NOTE(review): mini_batch_size=4 is presumably this small because the sample
# dataset is tiny - confirm against the data.
estimator.set_hyperparameters(
    predictor_type='regressor',
    mini_batch_size=4
)

In [75]:
from sagemaker.inputs import TrainingInput
from sagemaker.workflow.steps import TrainingStep

# Reference to the S3 URI of the processing step's "output" output. Consuming
# this step property is what links the training step to the processing step.
processed_output = step_process.properties.ProcessingOutputConfig.Outputs["output"]
s3_input_data = processed_output.S3Output.S3Uri

# The processed data is fed to the "train" channel as CSV.
train_channel = TrainingInput(s3_data=s3_input_data, content_type="text/csv")

# Pipeline step that trains the estimator on the processed data.
step_train = TrainingStep(
    name="TrainStep",
    estimator=estimator,
    inputs={"train": train_channel},
)

In [76]:
from sagemaker.workflow.pipeline import Pipeline

# Name under which the pipeline is registered.
pipeline_name = "Pipeline"

# Parameters exposed by the pipeline and the steps it runs.
pipeline_parameters = [processing_instance_type, training_instance_type, input_data]
pipeline_steps = [step_process, step_train]

pipeline = Pipeline(
    name=pipeline_name,
    parameters=pipeline_parameters,
    steps=pipeline_steps,
)

In [77]:
# Create-or-update (upsert) the pipeline definition in SageMaker, passing
# `role` as the pipeline's role ARN.
pipeline.upsert(role_arn=role)

{'PipelineArn': 'arn:aws:sagemaker:us-east-1:581320662326:pipeline/pipeline',
 'ResponseMetadata': {'RequestId': '706ca26a-c936-4c92-93be-2b79d9c2f253',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '706ca26a-c936-4c92-93be-2b79d9c2f253',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '76',
   'date': 'Tue, 08 Jun 2021 01:59:56 GMT'},
  'RetryAttempts': 0}}

In [78]:
# Start a pipeline execution; no parameter overrides are passed.
execution = pipeline.start()

In [79]:
# Show the execution's metadata; in the captured run the status is 'Executing'.
execution.describe()

{'PipelineArn': 'arn:aws:sagemaker:us-east-1:581320662326:pipeline/pipeline',
 'PipelineExecutionArn': 'arn:aws:sagemaker:us-east-1:581320662326:pipeline/pipeline/execution/5tq9cptlv67x',
 'PipelineExecutionDisplayName': 'execution-1623117597042',
 'PipelineExecutionStatus': 'Executing',
 'CreationTime': datetime.datetime(2021, 6, 8, 1, 59, 56, 930000, tzinfo=tzlocal()),
 'LastModifiedTime': datetime.datetime(2021, 6, 8, 1, 59, 56, 930000, tzinfo=tzlocal()),
 'CreatedBy': {'UserProfileArn': 'arn:aws:sagemaker:us-east-1:581320662326:user-profile/d-rgvubtsq1vug/arvs',
  'UserProfileName': 'arvs',
  'DomainId': 'd-rgvubtsq1vug'},
 'LastModifiedBy': {'UserProfileArn': 'arn:aws:sagemaker:us-east-1:581320662326:user-profile/d-rgvubtsq1vug/arvs',
  'UserProfileName': 'arvs',
  'DomainId': 'd-rgvubtsq1vug'},
 'ResponseMetadata': {'RequestId': '806a83a6-a7b6-496b-8379-69f018828ef2',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '806a83a6-a7b6-496b-8379-69f018828ef2',
   'conten

In [80]:
# Wait for the execution to finish before inspecting its steps.
execution.wait()

In [81]:
# List the execution's steps. In the captured output the latest step
# (TrainStep) comes first and ProcessingStep second.
execution.list_steps()

[{'StepName': 'TrainStep',
  'StartTime': datetime.datetime(2021, 6, 8, 2, 4, 27, 529000, tzinfo=tzlocal()),
  'EndTime': datetime.datetime(2021, 6, 8, 2, 8, 24, 184000, tzinfo=tzlocal()),
  'StepStatus': 'Succeeded',
  'Metadata': {'TrainingJob': {'Arn': 'arn:aws:sagemaker:us-east-1:581320662326:training-job/pipelines-5tq9cptlv67x-trainstep-owbouy86xs'}}},
 {'StepName': 'ProcessingStep',
  'StartTime': datetime.datetime(2021, 6, 8, 1, 59, 58, 6000, tzinfo=tzlocal()),
  'EndTime': datetime.datetime(2021, 6, 8, 2, 4, 27, 180000, tzinfo=tzlocal()),
  'StepStatus': 'Succeeded',
  'Metadata': {'ProcessingJob': {'Arn': 'arn:aws:sagemaker:us-east-1:581320662326:processing-job/pipelines-5tq9cptlv67x-processingstep-pahg14xjvs'}}}]

In [82]:
import time
from sagemaker.lineage.visualizer import LineageTableVisualizer

# Table visualizer for the lineage of each pipeline step.
viz = LineageTableVisualizer(sagemaker.session.Session())

# In the captured run list_steps() returns the latest step first, so the list
# is walked backwards to show the steps in the order they ran.
steps_latest_first = execution.list_steps()
for execution_step in steps_latest_first[::-1]:
    print(execution_step)
    lineage_table = viz.show(pipeline_execution_step=execution_step)
    display(lineage_table)
    # NOTE(review): the pause is presumably there to avoid API throttling
    # between lineage queries - confirm.
    time.sleep(5)

{'StepName': 'ProcessingStep', 'StartTime': datetime.datetime(2021, 6, 8, 1, 59, 58, 6000, tzinfo=tzlocal()), 'EndTime': datetime.datetime(2021, 6, 8, 2, 4, 27, 180000, tzinfo=tzlocal()), 'StepStatus': 'Succeeded', 'Metadata': {'ProcessingJob': {'Arn': 'arn:aws:sagemaker:us-east-1:581320662326:processing-job/pipelines-5tq9cptlv67x-processingstep-pahg14xjvs'}}}


Unnamed: 0,Name/Source,Direction,Type,Association Type,Lineage Type
0,s3://...01-59-56-071/input/code/preprocessing.py,Input,DataSet,ContributedTo,artifact
1,s3://...put/management_experience_and_salary.csv,Input,DataSet,ContributedTo,artifact
2,68331...om/sagemaker-scikit-learn:0.23-1-cpu-py3,Input,Image,ContributedTo,artifact
3,s3://...rn-2021-06-08-01-59-56-071/output/output,Output,DataSet,Produced,artifact


{'StepName': 'TrainStep', 'StartTime': datetime.datetime(2021, 6, 8, 2, 4, 27, 529000, tzinfo=tzlocal()), 'EndTime': datetime.datetime(2021, 6, 8, 2, 8, 24, 184000, tzinfo=tzlocal()), 'StepStatus': 'Succeeded', 'Metadata': {'TrainingJob': {'Arn': 'arn:aws:sagemaker:us-east-1:581320662326:training-job/pipelines-5tq9cptlv67x-trainstep-owbouy86xs'}}}


Unnamed: 0,Name/Source,Direction,Type,Association Type,Lineage Type
0,s3://...rn-2021-06-08-01-59-56-071/output/output,Input,DataSet,ContributedTo,artifact
1,38241...us-east-1.amazonaws.com/linear-learner:1,Input,Image,ContributedTo,artifact
2,s3://...TrainStep-OWBouy86XS/output/model.tar.gz,Output,Model,Produced,artifact


In [83]:
from pprint import pprint

# Fetch the pipeline's metadata (including its JSON definition) and
# pretty-print it.
pipeline_description = pipeline.describe()
pprint(pipeline_description)

{'CreatedBy': {'DomainId': 'd-rgvubtsq1vug',
               'UserProfileArn': 'arn:aws:sagemaker:us-east-1:581320662326:user-profile/d-rgvubtsq1vug/arvs',
               'UserProfileName': 'arvs'},
 'CreationTime': datetime.datetime(2021, 6, 8, 1, 59, 56, 608000, tzinfo=tzlocal()),
 'LastModifiedBy': {'DomainId': 'd-rgvubtsq1vug',
                    'UserProfileArn': 'arn:aws:sagemaker:us-east-1:581320662326:user-profile/d-rgvubtsq1vug/arvs',
                    'UserProfileName': 'arvs'},
 'LastModifiedTime': datetime.datetime(2021, 6, 8, 2, 8, 24, 639000, tzinfo=tzlocal()),
 'PipelineArn': 'arn:aws:sagemaker:us-east-1:581320662326:pipeline/pipeline',
 'PipelineDefinition': '{"Version": "2020-12-01", "Metadata": {}, '
                       '"Parameters": [{"Name": "ProcessingInstanceType", '
                       '"Type": "String", "DefaultValue": "ml.m5.xlarge"}, '
                       '{"Name": "TrainingInstanceType", "Type": "String", '
                       '"DefaultValue": 

In [84]:
# Optional cleanup: uncomment the call below to delete the pipeline when done.
# NOTE(review): the output captured under this cell appears to come from a run
# in which the call was active.
# pipeline.delete()

{'PipelineArn': 'arn:aws:sagemaker:us-east-1:581320662326:pipeline/pipeline',
 'ResponseMetadata': {'RequestId': '5037a9bf-b611-4ccb-b4fc-86a498b12a08',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '5037a9bf-b611-4ccb-b4fc-86a498b12a08',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '76',
   'date': 'Tue, 08 Jun 2021 02:08:39 GMT'},
  'RetryAttempts': 0}}