# Managing ML workflows with AWS Step Functions and the Data Science SDK

<img align="left" width="130" src="https://raw.githubusercontent.com/PacktPublishing/Amazon-SageMaker-Cookbook/master/Extra/cover-small-padded.png"/>

This notebook contains the code to help readers work through one of the recipes of the book [Machine Learning with Amazon SageMaker Cookbook: 80 proven recipes for data scientists and developers to perform ML experiments and deployments](https://www.amazon.com/Machine-Learning-Amazon-SageMaker-Cookbook/dp/1800567030).

### How to do it...

In [None]:
# Create the local scratch directory used for downloaded and generated files.
# The -p flag makes the command succeed even when tmp already exists.
!mkdir -p tmp

In [None]:
# Components of the raw-GitHub URL: host, organisation, repository and
# the branch/chapter folder that holds the sample files.
g, p, a, mc = (
    "raw.githubusercontent.com",
    "PacktPublishing",
    "Amazon-SageMaker-Cookbook",
    "master/Chapter01",
)

# Base URL of the remote "files" directory the dataset is downloaded from.
path = "https://" + "/".join((g, p, a, mc, "files"))

In [None]:
# File name of the CSV dataset; it is reused below to build the local path.
fname = "management_experience_and_salary.csv"

# Download {path}/{fname}; -P tells wget to save the file inside the tmp directory.
!wget -P tmp {path}/{fname}

In [None]:
import pandas as pd
# Local path of the CSV file that was downloaded into tmp/.
filename = "tmp/{}".format(fname)

# Read the complete dataset into a DataFrame.
df_all_data = pd.read_csv(filepath_or_buffer=filename)

In [None]:
# Display the loaded dataset so its columns and values can be inspected.
df_all_data

In [None]:
from sklearn.model_selection import train_test_split

# Short alias for the full dataset.
dad = df_all_data

# Input feature (months of management experience) and target
# (monthly salary), taken as the underlying array values of each column.
X = dad["management_experience_months"].values
y = dad["monthly_salary"].values

# Hold out 30% of the rows for testing; the fixed random_state makes the
# split reproducible from run to run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

In [None]:
import pandas as pd

# Build the training frame from the train split.
# NOTE(review): the target column (monthly_salary) is deliberately placed
# first; presumably the training algorithm expects the label in the first
# CSV column -- confirm against the algorithm's input format.
df_training_data = pd.DataFrame(
    dict(
        monthly_salary=y_train,
        management_experience_months=X_train,
    )
)

# Show the resulting frame.
df_training_data

In [None]:
# Write the training set to disk without a header row and without the
# index column, so the file holds only the two data columns.
df_training_data.to_csv(path_or_buf='tmp/training_data.csv', index=False, header=False)

In [None]:
# Name of the S3 bucket used by this notebook; the bucket must exist in the
# account running the notebook.
s3_bucket = "sagemaker-cookbook-bucket"
# Key prefix under which this notebook's input and output objects are stored.
prefix = "chapter09"

In [None]:
tn = "training_data.csv"
source = f"tmp/{tn}"
dest = f"s3://{s3_bucket}/{prefix}/input/{tn}"

!aws s3 cp {source} {dest}

In [None]:
import sagemaker 
import boto3
from sagemaker import get_execution_role 

# Region of the default boto3 session; used later to look up the
# algorithm container image.
region_name = boto3.Session().region_name

# SageMaker session that is handed to the estimator.
session = sagemaker.Session()

# Execution role reported by SageMaker for the current environment;
# passed to the estimator below.
role = get_execution_role()

In [None]:
# S3 URI of the uploaded training CSV.
training_s3_input_location = (
    f"s3://{s3_bucket}/{prefix}"
    "/input/training_data.csv"
)

# S3 URI prefix given to the estimator as its output path.
training_s3_output_location = (
    f"s3://{s3_bucket}/{prefix}"
    "/output/"
)

In [None]:
from sagemaker.inputs import TrainingInput

# Describe the training channel: CSV data stored at the uploaded S3 location.
train = TrainingInput(
    s3_data=training_s3_input_location,
    content_type="text/csv",
)

In [None]:
from sagemaker.image_uris import retrieve 

# Look up the image URI of the "linear-learner" algorithm, version "1",
# for the region resolved earlier.
container = retrieve(
    framework="linear-learner",
    region=region_name,
    version="1",
)

# Show the resolved image URI.
container

In [None]:
# Estimator that runs the algorithm container on a single ml.m5.xlarge
# instance and writes its output under the S3 output location.
estimator = sagemaker.estimator.Estimator(
    image_uri=container,
    role=role,
    instance_type='ml.m5.xlarge',
    instance_count=1,
    output_path=training_s3_output_location,
    sagemaker_session=session,
)

In [None]:
# Configure the algorithm as a regressor.
# NOTE(review): mini_batch_size=4 is presumably chosen because the training
# set is very small -- confirm before reusing with a larger dataset.
estimator.set_hyperparameters(predictor_type='regressor', mini_batch_size=4)

In [None]:
!pip -q install --upgrade stepfunctions

In [None]:
import os

# ARN of the IAM role that is passed to the Workflow as its `role`.
# It can be supplied through the STEP_FUNCTIONS_EXECUTION_ROLE_ARN environment
# variable, so the notebook can run in another AWS account without editing
# this cell; the original hard-coded ARN is kept as the fallback value.
execution_role = os.environ.get(
    'STEP_FUNCTIONS_EXECUTION_ROLE_ARN',
    'arn:aws:iam::581320662326:role/test-002',
)

In [None]:
from stepfunctions.inputs import ExecutionInput

In [None]:
# Placeholders whose values are provided when the workflow is executed;
# each one is declared as a string.
execution_input = ExecutionInput(
    schema=dict(ModelName=str, EndpointName=str, JobName=str)
)

# Short alias used when wiring the placeholders into the steps.
ei = execution_input

In [None]:
from stepfunctions.steps import TrainingStep

In [None]:
# Workflow step that trains the estimator on the "train" channel; the job
# name comes from the JobName execution input.
training_step = TrainingStep(
    state_id='Training Step',
    estimator=estimator,
    data=dict(train=train),
    job_name=ei['JobName'],
)

In [None]:
from stepfunctions.steps import ModelStep

In [None]:
# Workflow step built from the model the training step is expected to
# produce; the model name comes from the ModelName execution input.
model_step = ModelStep(
    state_id='Model Step',
    model=training_step.get_expected_model(),
    model_name=ei['ModelName'],
)

In [None]:
from stepfunctions.steps import EndpointConfigStep

In [None]:
# Workflow step for the endpoint configuration: one ml.m5.xlarge instance
# for the model. The configuration reuses the ModelName input as its own name.
endpoint_config_step = EndpointConfigStep(
    state_id="Create Endpoint Configuration",
    endpoint_config_name=ei['ModelName'],
    model_name=ei['ModelName'],
    instance_type='ml.m5.xlarge',
    initial_instance_count=1,
)

In [None]:
from stepfunctions.steps import EndpointStep

In [None]:
# Workflow step for the endpoint. The configuration name is the ModelName
# input, matching the name given to the endpoint configuration step.
endpoint_step = EndpointStep(
    state_id="Deploy Endpoint",
    endpoint_config_name=ei['ModelName'],
    endpoint_name=ei['EndpointName'],
)

In [None]:
from stepfunctions.steps import Chain

In [None]:
# Chain the steps in order: training, model, endpoint configuration, endpoint.
workflow_definition = Chain(
    [training_step, model_step, endpoint_config_step, endpoint_step]
)

In [None]:
import uuid

# Example of the kind of value used below for unique names:
# a random UUID rendered as a hexadecimal string.
uuid.uuid4().hex

In [None]:
def generate_random_string() -> str:
    """Return the hexadecimal string form of a newly generated random UUID."""
    random_uuid = uuid.uuid4()
    return random_uuid.hex


# Short alias used when building unique resource names.
grs = generate_random_string

In [None]:
import uuid
from stepfunctions.workflow import Workflow

# Workflow object with a unique "Workflow-<random hex>" name, wrapping the
# chained definition, the execution role and the execution input schema.
workflow = Workflow(
    name=f"Workflow-{grs()}",
    definition=workflow_definition,
    role=execution_role,
    execution_input=execution_input,
)

In [None]:
# Create the workflow defined above; the call's return value is shown as the cell output.
workflow.create()

In [None]:
# Start an execution, giving each declared input its own unique
# 'll-<random hex>' value.
execution = workflow.execute(
    inputs={
        key: 'll-{}'.format(grs())
        for key in ('JobName', 'ModelName', 'EndpointName')
    }
)

In [None]:
# Show the events returned for the execution started above.
execution.list_events()

In [None]:
import pandas as pd

In [None]:
# Fetch the execution events again and flatten the nested records into a
# table for easier reading.
events = execution.list_events()
pd.json_normalize(data=events)

In [None]:
# Inspect the instance attributes stored on the workflow object.
vars(workflow)

In [None]:
# Print the workflow definition serialised as pretty-printed JSON.
print(workflow.definition.to_json(pretty=True))