# Inference Pipeline with Scikit-learn and Linear Learner

In [1]:
!pwd

/home/ec2-user/SageMaker/aws-ml/pipeline/sklearn-pipeline-linear


In [2]:
import sagemaker
from sagemaker import get_execution_role
import pandas as pd

sagemaker_session = sagemaker.Session()

# Get a SageMaker-compatible role used by this Notebook Instance.
role = get_execution_role()

# S3 prefix
S3_BUCKET = "sklearn-pipeline"
S3_PREFIX = 'sklearn-pipeline-linearRegression'

S3_BUCKET, S3_PREFIX

('sklearn-pipeline', 'sklearn-pipeline-linearRegression')

# 1. Get raw data

In [3]:
#!wget --directory-prefix=./abalone_data_dir https://s3-us-west-2.amazonaws.com/sparkml-mleap/data/abalone/abalone.csv

In [4]:
RAW_FILE       = 'abalone.csv'
WORK_DIRECTORY = 'abalone_data_dir/'

RAW_FILE_PATH  = "{}/{}".format(WORK_DIRECTORY, RAW_FILE)
RAW_TRAIN_PATH = "{}/train_{}".format(WORK_DIRECTORY, RAW_FILE)
RAW_TEST_PATH  = "{}/test_{}".format(WORK_DIRECTORY, RAW_FILE)
RAW_VAL_PATH   = "{}/val_{}".format(WORK_DIRECTORY, RAW_FILE)

X = pd.read_csv(filepath_or_buffer=RAW_FILE_PATH, header=None)

train_data = X.head(int(len(X)*0.8)).copy()
test_data  = X.tail(int(len(X)*0.2)).copy()
val_data   = X.tail(int(len(X)*0.2)).drop(columns=[8]).copy()

train_data.to_csv(path_or_buf=RAW_TRAIN_PATH, index=False)
test_data.to_csv(path_or_buf=RAW_TEST_PATH, index=False)
val_data.to_csv(path_or_buf=RAW_VAL_PATH, index=False)

print(train_data.shape, test_data.shape, val_data.shape)
print(len(train_data)+len(test_data))

print(X.shape)
X.head(2)

(3341, 9) (835, 9) (835, 8)
4176
(4177, 9)


Unnamed: 0,0,1,2,3,4,5,6,7,8
0,M,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15
1,M,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7


In [5]:
test_data.head(1)

Unnamed: 0,0,1,2,3,4,5,6,7,8
3342,M,0.43,0.33,0.095,0.34,0.1315,0.085,0.112,14


## Upload the data for training

In [6]:
s3_input_raw_train = sagemaker_session.upload_data(
    path=RAW_TRAIN_PATH, 
    bucket=S3_BUCKET,
    key_prefix='{}/{}'.format(S3_PREFIX, 'data_train'))

s3_input_raw_test = sagemaker_session.upload_data(
    path=RAW_TEST_PATH, 
    bucket=S3_BUCKET,
    key_prefix='{}/{}'.format(S3_PREFIX, 'data_test'))

s3_input_raw_val = sagemaker_session.upload_data(
    path=RAW_VAL_PATH, 
    bucket=S3_BUCKET,
    key_prefix='{}/{}'.format(S3_PREFIX, 'data_val'))

s3_input_raw_train, s3_input_raw_test, s3_input_raw_val

('s3://sklearn-pipeline/sklearn-pipeline-linearRegression/data_train/train_abalone.csv',
 's3://sklearn-pipeline/sklearn-pipeline-linearRegression/data_test/test_abalone.csv',
 's3://sklearn-pipeline/sklearn-pipeline-linearRegression/data_val/val_abalone.csv')

# Data pre-processing

## setup

In [7]:
PP_SCRIPT_NAME = 'sklearn_abalone_featurizer.py'

# preprocessor setup
from sagemaker.sklearn.estimator import SKLearn

FRAMEWORK_VERSION = "0.23-1"
sklearn_preprocessor = SKLearn(
                            entry_point=PP_SCRIPT_NAME,
                            role=role,
                            framework_version=FRAMEWORK_VERSION,
                            train_instance_type="ml.c4.xlarge",
                            sagemaker_session=sagemaker_session
                            )

## train

In [None]:
sklearn_preprocessor.fit({'train': s3_input_raw_train})

's3_input' class will be renamed to 'TrainingInput' in SageMaker Python SDK v2.


2020-09-16 19:54:01 Starting - Starting the training job...
2020-09-16 19:54:04 Starting - Launching requested ML instances......

### batch transform the raw data
only useful to test the ml_model individually
else not needed

In [None]:
"""
raw data + label : 9
features + label : 12

raw data : 8
features : 11 (this is pred model required data)

"""
print()

In [None]:
# Define a SKLearn Transformer from the trained SKLearn Estimator
pp_transformer = sklearn_preprocessor.transformer(
                                                    instance_count=1, 
                                                    instance_type='ml.m5.xlarge',
                                                    assemble_with = 'Line',
                                                    accept = 'text/csv'
                                                 )

In [None]:
# Preprocess training data : s3_input_raw_train
pp_transformer.transform(s3_input_raw_train, content_type="text/csv")
print("Waiting for transform job: " + pp_transformer.latest_transform_job.job_name)
pp_transformer.wait()
s3_pp_train = pp_transformer.output_path

In [None]:
"""
# batch preprocess test data : s3_input_raw_test
pp_transformer.transform(s3_input_raw_test, content_type="text/csv")
print("Waiting for transform job: " + pp_transformer.latest_transform_job.job_name)
pp_transformer.wait()
s3_pp_test = pp_transformer.output_path
"""
s3_pp_test = s3_pp_train

In [None]:
"""
# batch preprocess val data : s3_input_raw_val
pp_transformer.transform(s3_input_raw_val, content_type="text/csv")
print("Waiting for transform job: " + pp_transformer.latest_transform_job.job_name)
pp_transformer.wait()
s3_pp_val = pp_transformer.output_path
"""
s3_pp_val = None

In [None]:
s3_input_raw_train

In [None]:
s3_pp_train, s3_pp_test, s3_pp_val

# ML model (sklearn)

## setup

In [None]:
ML_MODEL_SCRIPT_NAME = "model_script.py"

from sagemaker.sklearn.estimator import SKLearn

FRAMEWORK_VERSION = '0.23-1'
ml_estimator = SKLearn(
                    entry_point=ML_MODEL_SCRIPT_NAME,
                    role = get_execution_role(),
                    train_instance_count=1,
                    train_instance_type='ml.c5.xlarge',
                    framework_version=FRAMEWORK_VERSION,
                    base_job_name='rf-scikit',
                    metric_definitions=[
                                        {'Name': 'median-AE',
                                         'Regex': "AE-at-50th-percentile: ([0-9.]+).*$"}
                                        ],
                    hyperparameters = {'n-estimators': 100,
                                       'min-samples-leaf': 2,
                                       'features': 'CRIM ZN INDUS CHAS NOX RM AGE DIS RAD TAX PTRATIO B LSTAT',
                                       'target': 'target'
                                      }
                    )

## train

In [None]:
# TRAIN the model
ml_estimator.fit({'train':s3_pp_train, 'test': s3_pp_test}, wait=True)

# Serial Inference Pipeline

In [None]:
from sagemaker.model import Model
from sagemaker.pipeline import PipelineModel
import boto3
from time import gmtime, strftime

timestamp_prefix = strftime("%Y-%m-%d-%H-%M-%S", gmtime())

# step_1 : get models
pp_transformer_model = sklearn_preprocessor.create_model()
ml_estimator_model   = ml_estimator.create_model()

# step_2 : set-up pipeline
model_name    = 'sklearn-inference-pipeline-' + timestamp_prefix
endpoint_name = 'sklearn-inference-pipeline-ep-' + timestamp_prefix
ml_pipeline_model = PipelineModel(
                                    name=model_name, 
                                    role=role, 
                                    models=[
                                            pp_transformer_model, 
                                            ml_estimator_model
                                            ]
                                    )

In [None]:
# batch prediction job

"""
ml_pipeline_tf = ml_pipeline_model.transformer(
                                            instance_count=1, 
                                            instance_type='ml.m5.xlarge',
                                            assemble_with = 'Line',
                                            accept = 'text/csv')

# input : s3_input_raw_val (raw input data)
ml_pipeline_tf.transform(s3_input_raw_val, content_type="text/csv")
print("Waiting for transform job: " + ml_pipeline_tf.latest_transform_job.job_name)
ml_pipeline_tf.wait()
s3_pred_val = ml_pipeline_tf.output_path
s3_pred_val
"""
print()

## deploy pipeline model

In [None]:
#sm_model.deploy(initial_instance_count=1, instance_type='ml.c4.xlarge', endpoint_name=endpoint_name)
ml_pipeline_model.deploy(initial_instance_count=1, instance_type='ml.m4.xlarge', endpoint_name=endpoint_name)

# predict from pipeline endpoint

In [None]:
endpoint_name

In [None]:
test_data.head(1).values

In [None]:
from sagemaker.predictor import json_serializer, csv_serializer, json_deserializer, RealTimePredictor
from sagemaker.content_types import CONTENT_TYPE_CSV, CONTENT_TYPE_JSON


payload = 'M, 0.43, 0.33, 0.095, 0.34, 0.1315, 0.085, 0.11' # 14
# b'[7.8421190476190485]'
# b'[8.006166666666667]'

predictor = RealTimePredictor(
    endpoint=endpoint_name,
    sagemaker_session=sagemaker_session,
    serializer=csv_serializer,
    content_type=CONTENT_TYPE_CSV,
    accept=CONTENT_TYPE_JSON)

print(predictor.predict(payload))

In [None]:
stop here

## predict : only using model

#### deploy model

In [None]:
# deploy only the - ml model
ml_predictor = ml_estimator.deploy(instance_type='ml.m4.xlarge', initial_instance_count=1)

#### get test data

In [None]:
import json
import io
from urllib.parse import urlparse
import boto3

def get_csv_output_from_s3(s3uri, file_name):
    parsed_url = urlparse(s3uri)
    bucket_name = parsed_url.netloc
    prefix = parsed_url.path[1:]
    s3 = boto3.resource('s3')
    print(bucket_name)
    print(prefix)
    print(file_name)
    obj = s3.Object(bucket_name, '{}/{}'.format(prefix, file_name))
    return obj.get()["Body"].read().decode('utf-8')   

In [None]:
import pandas as pd

path       = preprocessed_val
batch_file = 'abalone_val.csv' # imp
output = get_csv_output_from_s3(path, '{}.out'.format(batch_file))
validate_df = pd.read_csv(io.StringIO(output), sep=",", header=None)
print(validate_df.shape)
validate_df.sample(2) 

#### prediction

In [None]:
# `data` is a NumPy array or a Python list.
# `response` is a NumPy array.

#payload = validate_df.drop(columns=[0]).values
payload = validate_df.values

response = ml_predictor.predict(payload)
response

In [None]:
stop

## Delete Endpoint <a class="anchor" id="delete_endpoint"></a>
Once we are finished with the endpoint, we clean up the resources!

In [None]:
sm_client = sagemaker_session.boto_session.client('sagemaker')
sm_client.delete_endpoint(EndpointName=endpoint_name)

In [None]:
sm_client = sagemaker_session.boto_session.client('sagemaker')
sm_client.delete_endpoint(EndpointName=endpoint_name)