# Inference Pipeline with Scikit-learn and Linear Learner

1. fit/train a sklearn pre-processor
   
   it will perform preprocessing of numeric cat cols
   
** numeric : imputation, scaling

** categoric : imputation, one-hot-encoding
   
** also perform batch transformation of train/test data to be used for ml_model training
   
   
2. train sklearn ml model (RF regressor)

3. build up inference-ml-pipeline
    raw_data --> [preprocessing ==> ml_model] --> prediction
    
4. deploy inference-ml-pipeline as an endpoint

5. prediction using the endpoint

# Utils

In [16]:
import json
import io
from urllib.parse import urlparse
import boto3

def get_csv_output_from_s3(s3uri, file_name):
    parsed_url = urlparse(s3uri)
    bucket_name = parsed_url.netloc
    prefix = parsed_url.path[1:]
    s3 = boto3.resource('s3')
    print(bucket_name)
    print(prefix)
    print(file_name)
    obj = s3.Object(bucket_name, '{}/{}'.format(prefix, file_name))
    return obj.get()["Body"].read().decode('utf-8')   

In [17]:
!pwd

/home/ec2-user/SageMaker/aws-ml/pipeline/sklearn-pipeline-classification


In [18]:
import sagemaker
from sagemaker import get_execution_role

import os
import numpy as np
import pandas as pd

sagemaker_session = sagemaker.Session()

# Get a SageMaker-compatible role used by this Notebook Instance.
role = get_execution_role()

# S3 prefix
S3_BUCKET = "sklearn-pipeline"
S3_PREFIX = 'sklearn-pipeline-linearRegression'

S3_BUCKET, S3_PREFIX

('sklearn-pipeline', 'sklearn-pipeline-linearRegression')

# 1. Get raw data

In [19]:
#!wget --directory-prefix=./abalone_data_dir https://s3-us-west-2.amazonaws.com/sparkml-mleap/data/abalone/abalone.csv

In [20]:
RAW_FILE       = 'abalone.csv'
WORK_DIRECTORY = 'abalone_data_dir'

RAW_FILE_PATH  = "{}/{}".format(WORK_DIRECTORY, RAW_FILE)
RAW_TRAIN_PATH = "{}/train/train_{}".format(WORK_DIRECTORY, RAW_FILE)
RAW_TEST_PATH  = "{}/test/test_{}".format(WORK_DIRECTORY, RAW_FILE)
RAW_VAL_PATH   = "{}/val/val_{}".format(WORK_DIRECTORY, RAW_FILE)

PP_TRAIN_PATH = "{}/pp_train/train_{}".format(WORK_DIRECTORY, RAW_FILE)
PP_TEST_PATH  = "{}/pp_test/test_{}".format(WORK_DIRECTORY, RAW_FILE)
PP_VAL_PATH   = "{}/pp_val/val_{}".format(WORK_DIRECTORY, RAW_FILE)

ARTIFACTS_PATH = "{}/artifacts".format(WORK_DIRECTORY)

X = pd.read_csv(filepath_or_buffer=RAW_FILE_PATH, header=None)
X[8] = np.where(X[8]>=8, 0, 1)

train_data = X.head(int(len(X)*0.8)).copy()
test_data  = X.tail(int(len(X)*0.2)).copy()
val_data   = X.tail(int(len(X)*0.2)).drop(columns=[8]).copy()

train_data.to_csv(path_or_buf=RAW_TRAIN_PATH, index=False, header=None)
test_data.to_csv(path_or_buf=RAW_TEST_PATH, index=False, header=None)
val_data.to_csv(path_or_buf=RAW_VAL_PATH, index=False, header=None)

print(train_data.shape, test_data.shape, val_data.shape)
print(len(train_data)+len(test_data))

print(X.shape)
X.head(2)

(3341, 9) (835, 9) (835, 8)
4176
(4177, 9)


Unnamed: 0,0,1,2,3,4,5,6,7,8
0,M,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,0
1,M,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,1


In [21]:
train_data[8].value_counts(), test_data[8].value_counts()

(0    2670
 1     671
 Name: 8, dtype: int64,
 0    667
 1    168
 Name: 8, dtype: int64)

In [22]:
train_data.head(1)

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,M,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,0


## Upload the data for training

In [23]:
s3_input_raw_train = sagemaker_session.upload_data(
    path=RAW_TRAIN_PATH, 
    bucket=S3_BUCKET,
    key_prefix='{}/{}'.format(S3_PREFIX, 'data_train'))

s3_input_raw_test = sagemaker_session.upload_data(
    path=RAW_TEST_PATH, 
    bucket=S3_BUCKET,
    key_prefix='{}/{}'.format(S3_PREFIX, 'data_test'))

s3_input_raw_val = sagemaker_session.upload_data(
    path=RAW_VAL_PATH, 
    bucket=S3_BUCKET,
    key_prefix='{}/{}'.format(S3_PREFIX, 'data_val'))

s3_input_raw_train, s3_input_raw_test, s3_input_raw_val

('s3://sklearn-pipeline/sklearn-pipeline-linearRegression/data_train/train_abalone.csv',
 's3://sklearn-pipeline/sklearn-pipeline-linearRegression/data_test/test_abalone.csv',
 's3://sklearn-pipeline/sklearn-pipeline-linearRegression/data_val/val_abalone.csv')

# Data pre-processing

## setup

In [24]:
PP_SCRIPT_NAME = 'sklearn_abalone_featurizer.py'

# preprocessor setup
from sagemaker.sklearn.estimator import SKLearn

FRAMEWORK_VERSION = "0.23-1"
sklearn_preprocessor = SKLearn(
                            entry_point=PP_SCRIPT_NAME,
                            role=role,
                            framework_version=FRAMEWORK_VERSION,
                            train_instance_type="ml.c4.xlarge",
                            sagemaker_session=sagemaker_session
                            )

## local pp

In [25]:
train_data_local_dir = "/".join(RAW_TRAIN_PATH.split("/")[:-1])
artifacts_local_dir  = ARTIFACTS_PATH

! python sklearn_abalone_featurizer.py --output-data-dir abalone_data_dir/artifacts/ \
                                       --model-dir abalone_data_dir/artifacts/ \
                                       --train abalone_data_dir/train/

extracting arguments
data shape :  (3341, 9)
to_predict_col : [0. 1.] : [2670  671]
imp : shape of data before pp:  (3341, 8)
sample data : 
 [['M' 0.455 0.365 0.095 0.514 0.2245 0.101 0.15]]
imp : shape of data after pp:  (3341, 10)
sample data : 
 [-0.55842639 -0.41971975 -1.03973418 -0.62995301 -0.59257744 -0.71598627
 -0.62714171  0.          0.          1.        ]
saved model!


In [26]:
# once local run is succesfull, train the container based model

## train

In [27]:
sklearn_preprocessor.fit({'train': s3_input_raw_train})

's3_input' class will be renamed to 'TrainingInput' in SageMaker Python SDK v2.


2020-09-17 11:57:53 Starting - Starting the training job...
2020-09-17 11:57:56 Starting - Launching requested ML instances......
2020-09-17 11:59:11 Starting - Preparing the instances for training......
2020-09-17 12:00:13 Downloading - Downloading input data......
2020-09-17 12:01:00 Training - Downloading the training image..
2020-09-17 12:01:38 Uploading - Uploading generated training model
2020-09-17 12:01:38 Completed - Training job completed
Training seconds: 85
Billable seconds: 85


### batch transform the raw data to train/test data
required for training the ML model

In [28]:
"""
raw data + label : 9
features + label : 12

raw data : 8
features : 11 (this is pred model required data)

"""
print()




In [29]:
# Define a SKLearn Transformer from the trained SKLearn Estimator
pp_transformer = sklearn_preprocessor.transformer(
                                                    instance_count=1, 
                                                    instance_type='ml.m5.xlarge',
                                                    assemble_with = 'Line',
                                                    accept = 'text/csv'
                                                 )

Parameter image will be renamed to image_uri in SageMaker Python SDK v2.


In [30]:
# Preprocess training data : s3_input_raw_train
pp_transformer.transform(s3_input_raw_train, content_type="text/csv")
print("Waiting for transform job: " + pp_transformer.latest_transform_job.job_name)
pp_transformer.wait()
s3_pp_train = pp_transformer.output_path

Waiting for transform job: sagemaker-scikit-learn-2020-09-17-12-02-06-346
..................................[34m2020-09-17 12:07:38,677 INFO - sagemaker-containers - No GPUs detected (normal if no gpus installed)[0m
[34m2020-09-17 12:07:38,679 INFO - sagemaker-containers - No GPUs detected (normal if no gpus installed)[0m
[34m2020-09-17 12:07:38,679 INFO - sagemaker-containers - nginx config: [0m
[34mworker_processes auto;[0m
[34mdaemon off;[0m
[34mpid /tmp/nginx.pid;[0m
[34merror_log  /dev/stderr;
[0m
[34mworker_rlimit_nofile 4096;
[0m
[34mevents {
  worker_connections 2048;[0m
[34m}
[0m
[34mhttp {
  include /etc/nginx/mime.types;
  default_type application/octet-stream;
  access_log /dev/stdout combined;

  upstream gunicorn {
    server unix:/tmp/gunicorn.sock;
  }

  server {
    listen 8080 deferred;
    client_max_body_size 0;

    keepalive_timeout 3;

    location ~ ^/(ping|invocations|execution-parameters) {
      proxy_set_header X-Forwarded-For $proxy_ad

[32m2020-09-17T12:07:42.101:[sagemaker logs]: MaxConcurrentTransforms=1, MaxPayloadInMB=6, BatchStrategy=MULTI_RECORD[0m



In [31]:
"""
# batch preprocess test data : s3_input_raw_test
pp_transformer.transform(s3_input_raw_test, content_type="text/csv")
print("Waiting for transform job: " + pp_transformer.latest_transform_job.job_name)
pp_transformer.wait()
s3_pp_test = pp_transformer.output_path
"""
s3_pp_test = s3_pp_train

In [32]:
# only useful to assessing the ml_model only endpoint
"""
# batch preprocess val data : s3_input_raw_val
pp_transformer.transform(s3_input_raw_val, content_type="text/csv")
print("Waiting for transform job: " + pp_transformer.latest_transform_job.job_name)
pp_transformer.wait()
s3_pp_val = pp_transformer.output_path
"""
s3_pp_val = None

In [33]:
s3_input_raw_train

's3://sklearn-pipeline/sklearn-pipeline-linearRegression/data_train/train_abalone.csv'

In [34]:
s3_pp_train, s3_pp_test, s3_pp_val

('s3://sagemaker-us-east-1-120286446822/sagemaker-scikit-learn-2020-09-17-12-02-06-346',
 's3://sagemaker-us-east-1-120286446822/sagemaker-scikit-learn-2020-09-17-12-02-06-346',
 None)

# ML model (sklearn)

## setup

In [35]:
ML_MODEL_SCRIPT_NAME = "model_script.py"

from sagemaker.sklearn.estimator import SKLearn

FRAMEWORK_VERSION = '0.23-1'
ml_estimator = SKLearn(
                    entry_point=ML_MODEL_SCRIPT_NAME,
                    role = get_execution_role(),
                    train_instance_count=1,
                    train_instance_type='ml.c5.xlarge',
                    framework_version=FRAMEWORK_VERSION,
                    base_job_name='rf-scikit',
                    metric_definitions=[
                                        {'Name': 'median-AE',
                                         'Regex': "AE-at-50th-percentile: ([0-9.]+).*$"}
                                        ],
                    hyperparameters = {'n-estimators': 100,
                                       'min-samples-leaf': 2,
                                       'features': 'CRIM ZN INDUS CHAS NOX RM AGE DIS RAD TAX PTRATIO B LSTAT',
                                       'target': 'target'
                                      }
                    )

## local train

In [36]:
# fetch data
s3uri     = s3_pp_train
file_name = '{}.out'.format(RAW_TRAIN_PATH.split("/")[-1])
s3_obj = get_csv_output_from_s3(s3uri, file_name)
train_df_pp  = pd.read_csv(io.StringIO(s3_obj), sep=",", header=None)

"""
s3uri     = s3_pp_test
file_name = '{}.out'.format(RAW_TEST_PATH.split("/")[-1])
s3_obj = get_csv_output_from_s3(s3uri, file_name)
test_df_pp  = pd.read_csv(io.StringIO(s3_obj), sep=",", header=None)
"""
test_df_pp = train_df_pp.copy()


train_df_pp.to_csv(path_or_buf=PP_TRAIN_PATH, index=False, header=None)
test_df_pp.to_csv(path_or_buf=PP_TEST_PATH, index=False, header=None)
#val_df_pp.to_csv(path_or_buf=PP_VAL_PATH, index=False, header=None)

print(train_df_pp.shape, test_df_pp.shape)
train_df_pp.sample(2)

sagemaker-us-east-1-120286446822
sagemaker-scikit-learn-2020-09-17-12-02-06-346
train_abalone.csv.out
(3341, 11) (3341, 11)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
1564,1.0,-0.516983,-0.570109,-0.688968,-0.861892,-0.81123,-0.880144,-0.836163,0.0,1.0,0.0
1572,0.0,-0.351209,-0.36959,-0.338202,-0.711335,-0.797705,-0.693187,-0.450005,0.0,1.0,0.0


In [37]:
pp_train_data_local_dir = "/".join(PP_TRAIN_PATH.split("/")[:-1])
pp_test_data_local_dir = "/".join(PP_TRAIN_PATH.split("/")[:-1])
artifacts_local_dir  = ARTIFACTS_PATH
pp_train_data_local_dir, pp_test_data_local_dir, artifacts_local_dir

('abalone_data_dir/pp_train',
 'abalone_data_dir/pp_train',
 'abalone_data_dir/artifacts')

In [57]:
! python model_script.py --n-estimators 100 \
                         --min-samples-leaf 2 \
                         --model-dir 'abalone_data_dir/artifacts' \
                         --train 'abalone_data_dir/pp_train/' \
                         --test 'abalone_data_dir/pp_test/' \
                         --features 'CRIM ZN INDUS CHAS NOX RM AGE DIS RAD TAX PTRATIO B LSTAT' \
                         --target 'abc'

You should consider upgrading via the '/home/ec2-user/anaconda3/envs/python3/bin/python -m pip install --upgrade pip' command.[0m
You should consider upgrading via the '/home/ec2-user/anaconda3/envs/python3/bin/python -m pip install --upgrade pip' command.[0m
extracting arguments
args.train abalone_data_dir/pp_train/
args.test abalone_data_dir/pp_test/
reading train data
args.train :  abalone_data_dir/pp_train/
(3341, 11)
    0         1        2         3   ...        7    8    9    10
0  0.0 -0.558426 -0.41972 -1.039734  ... -0.627142  0.0  0.0  1.0

[1 rows x 11 columns]
reading test data
args.test :  abalone_data_dir/pp_test/
(3341, 11)
building training and testing datasets
columns :  [ 0  1  2  3  4  5  6  7  8  9 10]
col_to_predict : 0, arg_type : <class 'numpy.int64'>
<class 'pandas.core.series.Series'>
train : [0. 1.] : [2670  671]
test  : [0. 1.] : [2670  671]
training model
----------------------------------------------------------------------------------------------------

In [58]:
# once local run is succesfull, train the container based model

## train

In [59]:
# TRAIN the model
ml_estimator.fit({'train':s3_pp_train, 'test': s3_pp_test}, wait=True)

's3_input' class will be renamed to 'TrainingInput' in SageMaker Python SDK v2.
's3_input' class will be renamed to 'TrainingInput' in SageMaker Python SDK v2.


2020-09-17 13:21:02 Starting - Starting the training job...
2020-09-17 13:21:04 Starting - Launching requested ML instances......
2020-09-17 13:22:21 Starting - Preparing the instances for training......
2020-09-17 13:23:07 Downloading - Downloading input data...
2020-09-17 13:23:57 Training - Training image download completed. Training in progress.[34m2020-09-17 13:23:57,593 sagemaker-training-toolkit INFO     Imported framework sagemaker_sklearn_container.training[0m
[34m2020-09-17 13:23:57,595 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2020-09-17 13:23:57,604 sagemaker_sklearn_container.training INFO     Invoking user training script.[0m
[34m2020-09-17 13:23:57,965 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2020-09-17 13:24:04,231 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2020-09-17 13:24:04,241 sagemaker-training-toolkit INFO     N

[34mextracting arguments[0m
[34margs.train /opt/ml/input/data/train[0m
[34margs.test /opt/ml/input/data/test[0m
[34mreading train data[0m
[34margs.train :  /opt/ml/input/data/train[0m
[34m(3341, 11)
    0         1        2         3   ...        7    8    9    10[0m
[34m0  0.0 -0.558426 -0.41972 -1.039734  ... -0.627142  0.0  0.0  1.0
[0m
[34m[1 rows x 11 columns][0m
[34mreading test data[0m
[34margs.test :  /opt/ml/input/data/test[0m
[34m(3341, 11)[0m
[34mbuilding training and testing datasets[0m
[34mcolumns :  [ 0  1  2  3  4  5  6  7  8  9 10][0m
[34mcol_to_predict : 0, arg_type : <class 'numpy.int64'>[0m
[34m<class 'pandas.core.series.Series'>[0m
[34mtrain : [0. 1.] : [2670  671][0m
[34mtest  : [0. 1.] : [2670  671][0m
[34mtraining model[0m
[34m----------------------------------------------------------------------------------------------------[0m
[34mX_train.shape :  (3341, 10)[0m
[34mmodel training on num features :  10[0m
[34msample dat

# Serial Inference Pipeline

In [60]:
from sagemaker.model import Model
from sagemaker.pipeline import PipelineModel
import boto3
from time import gmtime, strftime

timestamp_prefix = strftime("%Y-%m-%d-%H-%M-%S", gmtime())

# step_1 : get models
pp_transformer_model = sklearn_preprocessor.create_model()
ml_estimator_model   = ml_estimator.create_model()

# step_2 : set-up pipeline
model_name    = 'sklearn-inference-pipeline-' + timestamp_prefix
endpoint_name = 'sklearn-inference-pipeline-ep-' + timestamp_prefix
ml_pipeline_model = PipelineModel(
                                    name=model_name, 
                                    role=role, 
                                    models=[
                                            pp_transformer_model, 
                                            ml_estimator_model
                                            ]
                                    )

Parameter image will be renamed to image_uri in SageMaker Python SDK v2.
Parameter image will be renamed to image_uri in SageMaker Python SDK v2.


In [61]:
# batch prediction job

"""
ml_pipeline_tf = ml_pipeline_model.transformer(
                                            instance_count=1, 
                                            instance_type='ml.m5.xlarge',
                                            assemble_with = 'Line',
                                            accept = 'text/csv')

# input : s3_input_raw_val (raw input data)
ml_pipeline_tf.transform(s3_input_raw_val, content_type="text/csv")
print("Waiting for transform job: " + ml_pipeline_tf.latest_transform_job.job_name)
ml_pipeline_tf.wait()
s3_pred_val = ml_pipeline_tf.output_path
s3_pred_val
"""
print()




## deploy pipeline model

In [62]:
#sm_model.deploy(initial_instance_count=1, instance_type='ml.c4.xlarge', endpoint_name=endpoint_name)
ml_pipeline_model.deploy(initial_instance_count=1, instance_type='ml.m4.xlarge', endpoint_name=endpoint_name)

-----------------!

# predict from pipeline endpoint

In [63]:
endpoint_name

'sklearn-inference-pipeline-ep-2020-09-17-13-25-55'

In [64]:
test_data.head(1).values

array([['M', 0.43, 0.33, 0.095, 0.34, 0.1315, 0.085, 0.11199999999999999,
        0]], dtype=object)

In [65]:
from sagemaker.predictor import json_serializer, csv_serializer, json_deserializer, RealTimePredictor
from sagemaker.content_types import CONTENT_TYPE_CSV, CONTENT_TYPE_JSON


payload = 'M, 0.43, 0.33, 0.095, 0.34, 0.1315, 0.085, 0.11' # 14
# b'[7.8421190476190485]'
# b'[8.006166666666667]'
#[1.0]

predictor = RealTimePredictor(
    endpoint=endpoint_name,
    sagemaker_session=sagemaker_session,
    serializer=csv_serializer,
    content_type=CONTENT_TYPE_CSV,
    accept=CONTENT_TYPE_JSON)

print(predictor.predict(payload))

b'[1.0]'


In [None]:
stop here

## predict : only using model

#### deploy model

In [None]:
# deploy only the - ml model
ml_predictor = ml_estimator.deploy(instance_type='ml.m4.xlarge', initial_instance_count=1)

#### get test data

In [None]:
import json
import io
from urllib.parse import urlparse
import boto3

def get_csv_output_from_s3(s3uri, file_name):
    parsed_url = urlparse(s3uri)
    bucket_name = parsed_url.netloc
    prefix = parsed_url.path[1:]
    s3 = boto3.resource('s3')
    print(bucket_name)
    print(prefix)
    print(file_name)
    obj = s3.Object(bucket_name, '{}/{}'.format(prefix, file_name))
    return obj.get()["Body"].read().decode('utf-8')   

In [None]:
import pandas as pd

path       = preprocessed_val
batch_file = 'abalone_val.csv' # imp
output = get_csv_output_from_s3(path, '{}.out'.format(batch_file))
validate_df = pd.read_csv(io.StringIO(output), sep=",", header=None)
print(validate_df.shape)
validate_df.sample(2) 

#### prediction

In [None]:
# `data` is a NumPy array or a Python list.
# `response` is a NumPy array.

#payload = validate_df.drop(columns=[0]).values
payload = validate_df.values

response = ml_predictor.predict(payload)
response

In [None]:
stop

## Delete Endpoint <a class="anchor" id="delete_endpoint"></a>
Once we are finished with the endpoint, we clean up the resources!

In [None]:
sm_client = sagemaker_session.boto_session.client('sagemaker')
sm_client.delete_endpoint(EndpointName=endpoint_name)

In [None]:
sm_client = sagemaker_session.boto_session.client('sagemaker')
sm_client.delete_endpoint(EndpointName=endpoint_name)