## SKLearn E2E Step Decorator Pipelines

This example shows how to lift and shift local Python code into SageMaker Pipelines using the `@step` decorator. It takes a local scikit-learn (SKLearn) workflow that creates a dummy dataset, trains a model, and performs sample inference/evaluation, and runs each stage as a pipeline step.

### Setup

In [None]:
#%pip install -r ./requirements.txt

In [None]:
import os

# Point the SageMaker SDK's user-config override at the current working
# directory (presumably where this example's config file lives -- confirm).
os.environ.update({"SAGEMAKER_USER_CONFIG_OVERRIDE": os.getcwd()})

In [None]:
import sagemaker
from sagemaker.workflow.function_step import step
from sagemaker.workflow.parameters import ParameterString

# Session-derived settings used by the rest of the notebook.
sagemaker_session = sagemaker.Session()
role = sagemaker.get_execution_role()
bucket = sagemaker_session.default_bucket()
region = sagemaker_session.boto_region_name

# Pipeline parameter so the instance type can be overridden per execution
# without editing the step definitions.
instance_type = ParameterString(
    name="TrainInstanceType",
    default_value="ml.m5.xlarge",
)

### Step Orchestration

In [None]:
# step one: synthetic data generation
@step(
    name="preprocess",
    instance_type=instance_type,
    keep_alive_period_in_seconds=300,
)
def create_data() -> tuple:
    """Generate a small synthetic dataset for linear regression.

    Returns:
        tuple: ``(X, y)`` where ``X`` is a 100x1 array of uniform random
        samples and ``y`` is ``2 * X + 1`` plus Gaussian noise scaled by 0.1.
    """
    import numpy as np

    # Fixed seed so the generated data is the same on every run.
    np.random.seed(0)
    features = np.random.rand(100, 1)
    noise = 0.1 * np.random.randn(100, 1)
    targets = 2 * features + 1 + noise
    return (features, targets)

In [None]:
# step two: model training
@step(
    name = "training",
    instance_type = instance_type,
    keep_alive_period_in_seconds=300
)
def train_model(data: tuple) -> tuple:
    """Train a linear regression model and upload the serialized model to S3.

    Args:
        data: ``(X, y)`` tuple of features and targets.

    Returns:
        tuple: ``(model_filename, bucket_name, s3_file_name, X_test, y_test)``
        -- the local model file name, the S3 bucket and key the model was
        uploaded to, and the held-out test split.

    Raises:
        botocore.exceptions.ClientError: if the bucket cannot be created for
            any reason other than it already being owned by this account
            (for example, the name is taken by another AWS account).
    """
    import joblib
    from sklearn.model_selection import train_test_split
    from sklearn.linear_model import LinearRegression
    import boto3
    from botocore.exceptions import ClientError

    s3 = boto3.client("s3")

    # S3 bucket names are globally unique; change this if the name is
    # already taken by another account.
    bucket_name = "unique-bucket-step-pipelines-example-two"

    # Outside us-east-1, CreateBucket must carry a LocationConstraint that
    # matches the client's region, otherwise S3 rejects the request.
    create_kwargs = {"Bucket": bucket_name}
    region_name = s3.meta.region_name
    if region_name and region_name != "us-east-1":
        create_kwargs["CreateBucketConfiguration"] = {
            "LocationConstraint": region_name
        }

    try:
        s3.create_bucket(**create_kwargs)
    except ClientError as err:
        # Re-running the pipeline should reuse a bucket we already own;
        # any other failure is a real error.
        if err.response["Error"]["Code"] != "BucketAlreadyOwnedByYou":
            raise

    # unpack data
    X = data[0]
    y = data[1]

    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Create a Linear Regression model and train it on the training data
    model = LinearRegression()
    model.fit(X_train, y_train)

    # Serialize trained model for inference
    model_filename = "model.joblib"
    joblib.dump(model, model_filename)

    # Upload model artifact to s3
    s3_file_name = "model-artifacts/model.joblib" #key to store model artifacts
    s3.upload_file(model_filename, bucket_name, s3_file_name)

    # Hand the artifact location and the test split to the evaluation step.
    artifacts = (model_filename, bucket_name, s3_file_name, X_test, y_test)
    return artifacts

In [None]:
# step three: inference and evaluation
@step(
    name="inference_evaluation",
    instance_type=instance_type,
    keep_alive_period_in_seconds=300,
)
def model_inference(artifacts: tuple) -> float:
    """Download the trained model, predict on the test split, and score it.

    Args:
        artifacts: tuple whose first five items are the local model file
            name, the S3 bucket name, the S3 key of the model, the test
            features and the test targets.

    Returns:
        float: root mean squared error of the predictions on the test split.
    """
    import joblib
    from sklearn.metrics import mean_squared_error
    import numpy as np
    import boto3

    # Only the first five items are used; anything after them is ignored.
    model_filename, bucket_name, s3_file_name, X_test, y_test, *_ = artifacts

    # Fetch model.joblib from S3 into the local working directory.
    boto3.client("s3").download_file(bucket_name, s3_file_name, model_filename)

    # Load the model and predict on the held-out features.
    regressor = joblib.load(model_filename)
    predictions = regressor.predict(X_test)

    # RMSE, converted to a plain Python float.
    return float(np.sqrt(mean_squared_error(y_test, predictions)))

### Pipeline Orchestration and Execution

In [None]:
# Stitch the steps together into a pipeline.
from sagemaker.workflow.pipeline import Pipeline

# Chain the decorated functions: each call's result is fed to the next step.
data = create_data()
artifacts = train_model(data)
rmse = model_inference(artifacts)

# Only the final step's result is listed under `steps`.
pipeline = Pipeline(
    name="sklearn-pipeline",
    parameters=[instance_type],
    steps=[rmse],
)

In [None]:
# Create or update the pipeline definition, then start an execution.
pipeline.upsert(role_arn=role)
execution = pipeline.start()

# Wait for the execution to finish first, then describe it as the cell's last
# expression so the notebook displays the final status. Previously describe()
# ran before completion and its result was discarded.
execution.wait()
execution.describe()

In [None]:
# List the steps of this pipeline execution (displayed as the cell output).
execution.list_steps()