# Inference Pipeline with Scikit-learn and Linear Learner

In [1]:
!pwd

/home/ec2-user/SageMaker/aws-ml/pipeline/sklearn-pipeline-linear


In [2]:
import sagemaker
from sagemaker import get_execution_role
import pandas as pd

sagemaker_session = sagemaker.Session()

# Get a SageMaker-compatible role used by this Notebook Instance.
role = get_execution_role()

# S3 prefix
bucket = sagemaker_session.default_bucket()
prefix = 'Scikit-LinearLearner-pipeline-abalone-example'

In [3]:
bucket

'sagemaker-us-east-1-120286446822'

# 1. Get raw data

In [4]:
!wget --directory-prefix=./abalone_data https://s3-us-west-2.amazonaws.com/sparkml-mleap/data/abalone/abalone.csv

--2020-09-16 18:23:55--  https://s3-us-west-2.amazonaws.com/sparkml-mleap/data/abalone/abalone.csv
Resolving s3-us-west-2.amazonaws.com (s3-us-west-2.amazonaws.com)... 52.218.237.112
Connecting to s3-us-west-2.amazonaws.com (s3-us-west-2.amazonaws.com)|52.218.237.112|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 191873 (187K) [binary/octet-stream]
Saving to: ‘./abalone_data/abalone.csv.3’


2020-09-16 18:23:56 (971 KB/s) - ‘./abalone_data/abalone.csv.3’ saved [191873/191873]



In [5]:
RAW_FILE = 'abalone.csv'
WORK_DIRECTORY = 'abalone_data'

RAW_FILE_PATH  = WORK_DIRECTORY + "/abalone.csv"
RAW_TRAIN_PATH = WORK_DIRECTORY + "/abalone_train.csv"
RAW_TEST_PATH  = WORK_DIRECTORY + "/abalone_test.csv"
RAW_VAL_PATH  = WORK_DIRECTORY + "/abalone_val.csv"


X = pd.read_csv(filepath_or_buffer=RAW_FILE_PATH, header=None)

train_data = X.head(int(len(X)*0.8)).copy()
test_data  = X.tail(int(len(X)*0.2)).copy()
val_data   = X.tail(int(len(X)*0.2)).drop(columns=[8]).copy()

train_data.to_csv(path_or_buf=RAW_TRAIN_PATH, index=False)
test_data.to_csv(path_or_buf=RAW_TEST_PATH, index=False)
val_data.to_csv(path_or_buf=RAW_VAL_PATH, index=False)

print(train_data.shape, test_data.shape, val_data.shape)
print(len(train_data)+len(test_data))

print(X.shape)
X.head(2)

(3341, 9) (835, 9) (835, 8)
4176
(4177, 9)


Unnamed: 0,0,1,2,3,4,5,6,7,8
0,M,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15
1,M,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7


In [6]:
test_data.head(1)

Unnamed: 0,0,1,2,3,4,5,6,7,8
3342,M,0.43,0.33,0.095,0.34,0.1315,0.085,0.112,14


## Upload the data for training

In [7]:
train_input = sagemaker_session.upload_data(
    path=RAW_TRAIN_PATH, 
    bucket=bucket,
    key_prefix='{}/{}'.format(prefix, 'train'))

test_input = sagemaker_session.upload_data(
    path=RAW_TEST_PATH, 
    bucket=bucket,
    key_prefix='{}/{}'.format(prefix, 'test'))

val_input = sagemaker_session.upload_data(
    path=RAW_VAL_PATH, 
    bucket=bucket,
    key_prefix='{}/{}'.format(prefix, 'val'))

train_input, test_input, val_input

('s3://sagemaker-us-east-1-120286446822/Scikit-LinearLearner-pipeline-abalone-example/train/abalone_train.csv',
 's3://sagemaker-us-east-1-120286446822/Scikit-LinearLearner-pipeline-abalone-example/test/abalone_test.csv',
 's3://sagemaker-us-east-1-120286446822/Scikit-LinearLearner-pipeline-abalone-example/val/abalone_val.csv')

# Data pre-processing

## train preprocessor

In [9]:
# preprocessor setup
from sagemaker.sklearn.estimator import SKLearn

FRAMEWORK_VERSION = "0.23-1"
script_path = 'sklearn_abalone_featurizer.py'

sklearn_preprocessor = SKLearn(
    entry_point=script_path,
    role=role,
    framework_version=FRAMEWORK_VERSION,
    train_instance_type="ml.c4.xlarge",
    sagemaker_session=sagemaker_session)

In [10]:
sklearn_preprocessor.fit({'train': train_input})

's3_input' class will be renamed to 'TrainingInput' in SageMaker Python SDK v2.


2020-09-16 18:24:36 Starting - Starting the training job...
2020-09-16 18:24:39 Starting - Launching requested ML instances.........
2020-09-16 18:26:12 Starting - Preparing the instances for training......
2020-09-16 18:27:12 Downloading - Downloading input data...
2020-09-16 18:27:56 Training - Downloading the training image..[34m2020-09-16 18:28:19,975 sagemaker-training-toolkit INFO     Imported framework sagemaker_sklearn_container.training[0m
[34m2020-09-16 18:28:19,977 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2020-09-16 18:28:19,986 sagemaker_sklearn_container.training INFO     Invoking user training script.[0m
[34m2020-09-16 18:28:20,469 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2020-09-16 18:28:23,533 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2020-09-16 18:28:23,545 sagemaker-training-toolkit INFO     No GPUs detected (norm

### batch transform the raw data

In [11]:
# Define a SKLearn Transformer from the trained SKLearn Estimator
transformer = sklearn_preprocessor.transformer(
    instance_count=1, 
    instance_type='ml.m5.xlarge',
    assemble_with = 'Line',
    accept = 'text/csv')

Parameter image will be renamed to image_uri in SageMaker Python SDK v2.


In [12]:
"""
raw data + label : 9
features + label : 12

raw data : 8
features : 11 (this is pred model required data)

"""
print()




In [13]:
# Preprocess training input
transformer.transform(train_input, content_type="text/csv")
print("Waiting for transform job: " + transformer.latest_transform_job.job_name)
transformer.wait()
preprocessed_train = transformer.output_path

Waiting for transform job: sagemaker-scikit-learn-2020-09-16-18-28-50-643
..............................
[34m2020-09-16 18:33:39,321 INFO - sagemaker-containers - No GPUs detected (normal if no gpus installed)[0m
[34m2020-09-16 18:33:39,323 INFO - sagemaker-containers - No GPUs detected (normal if no gpus installed)[0m
[34m2020-09-16 18:33:39,324 INFO - sagemaker-containers - nginx config: [0m
[34mworker_processes auto;[0m
[34mdaemon off;[0m
[34mpid /tmp/nginx.pid;[0m
[34merror_log  /dev/stderr;
[0m
[34mworker_rlimit_nofile 4096;
[0m
[34mevents {
  worker_connections 2048;[0m
[34m}
[0m
[35m2020-09-16 18:33:39,321 INFO - sagemaker-containers - No GPUs detected (normal if no gpus installed)[0m
[35m2020-09-16 18:33:39,323 INFO - sagemaker-containers - No GPUs detected (normal if no gpus installed)[0m
[35m2020-09-16 18:33:39,324 INFO - sagemaker-containers - nginx config: [0m
[35mworker_processes auto;[0m
[35mdaemon off;[0m
[35mpid /tmp/nginx.pid;[0m
[35mer

In [14]:
"""
# Preprocess test input
transformer.transform(test_input, content_type="text/csv")
print("Waiting for transform job: " + transformer.latest_transform_job.job_name)
transformer.wait()
preprocessed_test = transformer.output_path
"""
preprocessed_test = preprocessed_train

In [15]:
"""
# Preprocess test input
transformer.transform(val_input, content_type="text/csv")
print("Waiting for transform job: " + transformer.latest_transform_job.job_name)
transformer.wait()
preprocessed_val = transformer.output_path
"""
preprocessed_val = None

In [16]:
preprocessed_train, preprocessed_test, preprocessed_val

('s3://sagemaker-us-east-1-120286446822/sagemaker-scikit-learn-2020-09-16-18-28-50-643',
 's3://sagemaker-us-east-1-120286446822/sagemaker-scikit-learn-2020-09-16-18-28-50-643',
 None)

# ML model (sklearn)

## train sklearn ml model

In [17]:
from sagemaker.sklearn.estimator import SKLearn

FRAMEWORK_VERSION = '0.23-1'

sklearn_estimator = SKLearn(
    entry_point='script.py',
    role = get_execution_role(),
    train_instance_count=1,
    train_instance_type='ml.c5.xlarge',
    framework_version=FRAMEWORK_VERSION,
    base_job_name='rf-scikit',
    metric_definitions=[
        {'Name': 'median-AE',
         'Regex': "AE-at-50th-percentile: ([0-9.]+).*$"}],
    hyperparameters = {'n-estimators': 100,
                       'min-samples-leaf': 2,
                       'features': 'CRIM ZN INDUS CHAS NOX RM AGE DIS RAD TAX PTRATIO B LSTAT',
                       'target': 'target'})

In [18]:
# TRAIN the model
# launch training job, with asynchronous call
sklearn_estimator.fit({'train':preprocessed_train, 'test': preprocessed_test}, wait=True)

's3_input' class will be renamed to 'TrainingInput' in SageMaker Python SDK v2.
's3_input' class will be renamed to 'TrainingInput' in SageMaker Python SDK v2.


2020-09-16 18:34:06 Starting - Starting the training job...
2020-09-16 18:34:16 Starting - Launching requested ML instances......
2020-09-16 18:35:32 Starting - Preparing the instances for training...
2020-09-16 18:36:06 Downloading - Downloading input data...
2020-09-16 18:36:30 Training - Downloading the training image...
2020-09-16 18:36:54 Training - Training image download completed. Training in progress.[34m2020-09-16 18:36:54,149 sagemaker-training-toolkit INFO     Imported framework sagemaker_sklearn_container.training[0m
[34m2020-09-16 18:36:54,151 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2020-09-16 18:36:54,160 sagemaker_sklearn_container.training INFO     Invoking user training script.[0m
[34m2020-09-16 18:37:09,775 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2020-09-16 18:37:09,785 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34

In [None]:
stop here

## predict : only using model

#### deploy model

In [None]:
predictor = sklearn_estimator.deploy(instance_type='ml.m4.xlarge', initial_instance_count=1)

Parameter image will be renamed to image_uri in SageMaker Python SDK v2.


-------------

#### get test data

In [24]:
import json
import io
from urllib.parse import urlparse
import boto3

def get_csv_output_from_s3(s3uri, file_name):
    parsed_url = urlparse(s3uri)
    bucket_name = parsed_url.netloc
    prefix = parsed_url.path[1:]
    s3 = boto3.resource('s3')
    print(bucket_name)
    print(prefix)
    print(file_name)
    obj = s3.Object(bucket_name, '{}/{}'.format(prefix, file_name))
    return obj.get()["Body"].read().decode('utf-8')   

In [25]:
import pandas as pd

path       = preprocessed_val
batch_file = 'abalone_val.csv' # imp
output = get_csv_output_from_s3(path, '{}.out'.format(batch_file))
validate_df = pd.read_csv(io.StringIO(output), sep=",", header=None)
print(validate_df.shape)
validate_df.sample(2) 

sagemaker-us-east-1-120286446822
sagemaker-scikit-learn-2020-09-16-16-08-33-347
abalone_val.csv.out
(836, 11)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
377,-1.344091,-1.520743,-0.387194,-0.72716,-0.62461,-0.508931,-0.614029,0.0,0.0,1.0,0.0
731,-0.186211,-0.457556,-0.31073,-0.779739,-0.719999,-0.508931,-0.600388,0.0,0.0,1.0,0.0


#### prediction

In [33]:
# `data` is a NumPy array or a Python list.
# `response` is a NumPy array.

#data = validate_df.drop(columns=[0]).values
data = validate_df.values

response = predictor.predict(data)
response

array([12.45246429,  8.03694048,  9.55503571, 10.1077381 , 13.27410714,
       13.00064286,  8.69838095,  8.68016667,  9.20690079, 14.02083333,
       11.11721429,  9.38420238,  9.61866667,  8.01614286,  8.00316667,
        7.289     ,  7.09097619,  6.00842857, 14.47003571, 14.14786508,
        9.09646825,  8.80054113,  9.13007143,  7.70307143,  8.82632143,
        5.76028571, 17.77535714, 12.88803571, 13.6364127 ,  9.23141667,
        7.73933333, 11.36795238, 15.36401587, 10.67302381, 10.16031349,
        7.23818254,  6.00183333,  9.5252381 ,  5.90111905,  4.56783333,
       14.31067063,  9.93507143, 10.18045238,  6.65067857,  9.09302381,
       14.99533333, 12.45389286, 11.0537381 , 13.24965476, 10.05202381,
       14.02027778, 11.91478571, 13.37496429, 12.15009524, 16.47415476,
       11.48481746, 12.55232143,  7.44984848,  8.71553571, 13.50642857,
       15.21750794, 14.5859145 ,  7.80533333,  8.64614286,  6.20102381,
        6.59116667,  5.99966667,  7.29283333,  7.33830952,  6.66

# Serial Inference Pipeline

In [19]:
from sagemaker.model import Model
from sagemaker.pipeline import PipelineModel
import boto3
from time import gmtime, strftime

timestamp_prefix = strftime("%Y-%m-%d-%H-%M-%S", gmtime())

# step_1 : get models
scikit_learn_inferencee_model = sklearn_preprocessor.create_model()
sklearn_estimator_RF = sklearn_estimator.create_model()

# step_2 : set-up pipeline
model_name = 'sklearn-inference-pipeline-' + timestamp_prefix
endpoint_name = 'sklearn-inference-pipeline-ep-' + timestamp_prefix
sm_model = PipelineModel(
    name=model_name, 
    role=role, 
    models=[
        scikit_learn_inferencee_model, 
        sklearn_estimator_RF])

Parameter image will be renamed to image_uri in SageMaker Python SDK v2.
Parameter image will be renamed to image_uri in SageMaker Python SDK v2.


In [25]:
# batch prediction job

pipeline_tf = sm_model.transformer(
                            instance_count=1, 
                            instance_type='ml.m5.xlarge',
                            assemble_with = 'Line',
                            accept = 'text/csv')

# input : val_input
pipeline_tf.transform(val_input, content_type="text/csv")
print("Waiting for transform job: " + transformer.latest_transform_job.job_name)
pipeline_tf.wait()
predictions_path = pipeline_tf.output_path
predictions_path

Waiting for transform job: sagemaker-scikit-learn-2020-09-16-17-34-23-067
.............................[34m2020-09-16 17:57:39,747 INFO - sagemaker-containers - No GPUs detected (normal if no gpus installed)[0m
[34m2020-09-16 17:57:39,750 INFO - sagemaker-containers - No GPUs detected (normal if no gpus installed)[0m
[34m2020-09-16 17:57:39,751 INFO - sagemaker-containers - nginx config: [0m
[34mworker_processes auto;[0m
[34mdaemon off;[0m
[34mpid /tmp/nginx.pid;[0m
[34merror_log  /dev/stderr;
[0m
[34mworker_rlimit_nofile 4096;
[0m
[34mevents {
  worker_connections 2048;[0m
[35m2020-09-16 17:57:39,747 INFO - sagemaker-containers - No GPUs detected (normal if no gpus installed)[0m
[35m2020-09-16 17:57:39,750 INFO - sagemaker-containers - No GPUs detected (normal if no gpus installed)[0m
[35m2020-09-16 17:57:39,751 INFO - sagemaker-containers - nginx config: [0m
[35mworker_processes auto;[0m
[35mdaemon off;[0m
[35mpid /tmp/nginx.pid;[0m
[35merror_log  /dev/


[34m2020-09-16 17:57:45,025 INFO - sagemaker-containers - No GPUs detected (normal if no gpus installed)[0m
[35m2020-09-16 17:57:45,025 INFO - sagemaker-containers - No GPUs detected (normal if no gpus installed)[0m
[32m169.254.255.130 - - [16/Sep/2020:17:57:44 +0000] "GET /ping HTTP/1.1" 200 0 "-" "Go-http-client/1.1"[0m
[32m2020-09-16 17:57:44,528 INFO - sagemaker-containers - No GPUs detected (normal if no gpus installed)[0m
[32m169.254.255.130 - - [16/Sep/2020:17:57:45 +0000] "GET /execution-parameters HTTP/1.1" 404 232 "-" "Go-http-client/1.1"[0m
[36m2020-09-16T17:57:45.005:[sagemaker logs]: MaxConcurrentTransforms=1, MaxPayloadInMB=6, BatchStrategy=MULTI_RECORD[0m
[34mThis is an unlabelled example.[0m
[34mdf.shape :  (836, 8)[0m
[34mimp : raw data shape : [0m
[34mtest/pred job[0m
[34mimp : pp feature data shape : (836, 11)[0m
[34msample data : 
 [ 3.94907139 15.39359754 43.73256666  6.42153636 19.68260568 39.09370325
 36.88224197  1.          0.          0

's3://sagemaker-us-east-1-120286446822/sklearn-inference-pipeline-2020-09-16-1-2020-09-16-17-52-56-265'

### deploy pipeline model

In [21]:
#sm_model.deploy(initial_instance_count=1, instance_type='ml.c4.xlarge', endpoint_name=endpoint_name)
sm_model.deploy(initial_instance_count=1, instance_type='ml.m4.xlarge', endpoint_name=endpoint_name)

Using already existing model: sklearn-inference-pipeline-2020-09-16-18-37-48


---------------!

## predict from pipeline endpoint

In [22]:
endpoint_name

'sklearn-inference-pipeline-ep-2020-09-16-18-37-48'

In [23]:
test_data.head(1).values

array([['M', 0.43, 0.33, 0.095, 0.34, 0.1315, 0.085, 0.11199999999999999,
        14]], dtype=object)

In [24]:
from sagemaker.predictor import json_serializer, csv_serializer, json_deserializer, RealTimePredictor
from sagemaker.content_types import CONTENT_TYPE_CSV, CONTENT_TYPE_JSON


payload = 'M, 0.43, 0.33, 0.095, 0.34, 0.1315, 0.085, 0.11' # 14
# b'[7.8421190476190485]'
# b'[8.006166666666667]'

predictor = RealTimePredictor(
    endpoint=endpoint_name,
    sagemaker_session=sagemaker_session,
    serializer=csv_serializer,
    content_type=CONTENT_TYPE_CSV,
    accept=CONTENT_TYPE_JSON)

print(predictor.predict(payload))

b'[8.006166666666667]'


In [None]:
stop

## Delete Endpoint <a class="anchor" id="delete_endpoint"></a>
Once we are finished with the endpoint, we clean up the resources!

In [None]:
sm_client = sagemaker_session.boto_session.client('sagemaker')
sm_client.delete_endpoint(EndpointName=endpoint_name)

In [15]:
sm_client = sagemaker_session.boto_session.client('sagemaker')
sm_client.delete_endpoint(EndpointName=endpoint_name)

'inference-pipeline-ep-2020-09-14-10-41-27'