# Inference Pipeline with Scikit-learn and Linear Learner

1. fit/train a sklearn pre-processor
   
   it will perform preprocessing of numeric cat cols
   
** numeric : imputation, scaling

** categoric : imputation, one-hot-encoding
   
** also perform batch transformation of train/test data to be used for ml_model training
   
   
2. train sklearn ml model (RF regressor)

3. build up inference-ml-pipeline
    raw_data --> [preprocessing ==> ml_model] --> prediction
    
4. deploy inference-ml-pipeline as an endpoint

5. prediction using the endpoint

In [44]:
# 1. SHAP
# 2. store artifacts of sm train job

# Utils

In [45]:
import json
import io
from urllib.parse import urlparse
import boto3

def get_csv_output_from_s3(s3uri, file_name):
    parsed_url = urlparse(s3uri)
    bucket_name = parsed_url.netloc
    prefix = parsed_url.path[1:]
    s3 = boto3.resource('s3')
    print(bucket_name)
    print(prefix)
    print(file_name)
    obj = s3.Object(bucket_name, '{}/{}'.format(prefix, file_name))
    return obj.get()["Body"].read().decode('utf-8')   

In [46]:
!pwd

/home/ec2-user/SageMaker/aws-ml/pipeline/sklearn-pipeline-classification-titanic


In [47]:
import sagemaker
from sagemaker import get_execution_role

import os
import numpy as np
import pandas as pd

sagemaker_session = sagemaker.Session()

# Get a SageMaker-compatible role used by this Notebook Instance.
role = get_execution_role()

# S3 prefix
S3_BUCKET = "sklearn-pipeline"
S3_PREFIX = 'sklearn-pipeline-titanic'

S3_BUCKET, S3_PREFIX

('sklearn-pipeline', 'sklearn-pipeline-titanic')

In [48]:
col_to_predict = "survived"

# 1. Get raw data

In [49]:
SEED = 100

RAW_FILE       = 'titanic_dataset.csv'
WORK_DIRECTORY = '/home/ec2-user/SageMaker/aws-ml/pipeline/sklearn-pipeline-classification-titanic/data_dir'

RAW_FILE_PATH  = "{}/raw_data/{}".format(WORK_DIRECTORY, RAW_FILE)
RAW_TRAIN_PATH = "{}/train/train_{}".format(WORK_DIRECTORY, RAW_FILE)
RAW_TEST_PATH  = "{}/test/test_{}".format(WORK_DIRECTORY, RAW_FILE)
RAW_VAL_PATH   = "{}/val/val_{}".format(WORK_DIRECTORY, RAW_FILE)

PP_TRAIN_PATH = "{}/pp_train/train_{}".format(WORK_DIRECTORY, RAW_FILE)
PP_TEST_PATH  = "{}/pp_test/test_{}".format(WORK_DIRECTORY, RAW_FILE)
PP_VAL_PATH   = "{}/pp_val/val_{}".format(WORK_DIRECTORY, RAW_FILE)

ARTIFACTS_PATH = "{}/artifacts".format(WORK_DIRECTORY)

In [50]:
from sklearn.datasets import fetch_openml

X, y = fetch_openml("titanic", version=1, as_frame=True, return_X_y=True)
X[col_to_predict] = y
X["user_id"] = ["user_"+str(i) for i in range(len(X))]

from pre_processing_script import COLLIST_ALL, COLLIST_FEATURE
X = X[COLLIST_ALL]

# train, test, val : 65%, 25%, 10%
train_data, test_data, val_data = np.split(X.sample(frac=1, random_state=SEED), [int(.65*len(X)), int(.9*len(X))])
val_data = val_data[COLLIST_FEATURE]

# save the data
train_data.to_csv(path_or_buf=RAW_TRAIN_PATH, index=False, header=None)
test_data.to_csv(path_or_buf=RAW_TEST_PATH, index=False, header=None)
val_data.to_csv(path_or_buf=RAW_VAL_PATH, index=False, header=None)

print("raw_data: {}, train: {}, test: {}, val: {}".format(X.shape, train_data.shape, test_data.shape, val_data.shape))
X.head(2)

raw_data: (1309, 7), train: (850, 7), test: (328, 7), val: (131, 5)


Unnamed: 0,survived,user_id,age,fare,embarked,sex,pclass
0,1,user_0,29.0,211.3375,S,female,1.0
1,1,user_1,0.9167,151.55,S,male,1.0


In [8]:
train_data[col_to_predict].value_counts(), test_data[col_to_predict].value_counts()

(0    526
 1    324
 Name: survived, dtype: int64,
 0    202
 1    126
 Name: survived, dtype: int64)

In [9]:
# reverify that the train/test/data are successfully created and saved
train_data.head(1)

Unnamed: 0,survived,user_id,age,fare,embarked,sex,pclass
173,0,user_173,32.5,211.5,C,male,1.0


In [10]:
# reverify that the train/test/data are successfully created and saved
temp_train_data = pd.read_csv(filepath_or_buffer=RAW_TRAIN_PATH, header=None)
temp_train_data.head(1)

Unnamed: 0,0,1,2,3,4,5,6
0,0,user_173,32.5,211.5,C,male,1.0


## Upload the data for training

In [11]:
s3_input_raw_train = sagemaker_session.upload_data(
    path=RAW_TRAIN_PATH, 
    bucket=S3_BUCKET,
    key_prefix='{}/{}'.format(S3_PREFIX, 'data_train'))

s3_input_raw_test = sagemaker_session.upload_data(
    path=RAW_TEST_PATH, 
    bucket=S3_BUCKET,
    key_prefix='{}/{}'.format(S3_PREFIX, 'data_test'))

s3_input_raw_val = sagemaker_session.upload_data(
    path=RAW_VAL_PATH, 
    bucket=S3_BUCKET,
    key_prefix='{}/{}'.format(S3_PREFIX, 'data_val'))

s3_input_raw_train, s3_input_raw_test, s3_input_raw_val

('s3://sklearn-pipeline/sklearn-pipeline-titanic/data_train/train_titanic_dataset.csv',
 's3://sklearn-pipeline/sklearn-pipeline-titanic/data_test/test_titanic_dataset.csv',
 's3://sklearn-pipeline/sklearn-pipeline-titanic/data_val/val_titanic_dataset.csv')

# Data pre-processing

## local pp

In [12]:
#from sklearn import set_config
#set_config(display='diagram')
#preprocessor

In [13]:
train_data_local_dir = "/".join(RAW_TRAIN_PATH.split("/")[:-1])
artifacts_local_dir  = ARTIFACTS_PATH
artifacts_local_dir, train_data_local_dir

! python pre_processing_script.py --output-data-dir /home/ec2-user/SageMaker/aws-ml/pipeline/sklearn-pipeline-classification-titanic/data_dir/artifacts/ \
                                  --model-dir /home/ec2-user/SageMaker/aws-ml/pipeline/sklearn-pipeline-classification-titanic/data_dir/artifacts/ \
                                  --train /home/ec2-user/SageMaker/aws-ml/pipeline/sklearn-pipeline-classification-titanic/data_dir/train/

extracting arguments
data loading completed:
data shape :  (850, 7)
columns : ['survived', 'user_id', 'age', 'fare', 'embarked', 'sex', 'pclass']
loaded RAW data : 
 [[0 'user_173' 32.5 211.5 'C' 'male' 1]] 

to_predict_col : [0 1] : [526 324]

before pp : FEATURE data shape :  (850, 5)
columns (5 columns): ['age', 'fare', 'embarked', 'sex', 'pclass']
sample data : 
 [[32.5 211.5 'C' 'male' 1]] 

[ColumnTransformer] ........... (1 of 2) Processing num, total=   0.0s
[ColumnTransformer] ........... (2 of 2) Processing cat, total=   0.0s

after pp : FEATURE data shape :  (850, 10)
(IMP) column (10 columns): ['age' 'fare' 'x0_C' 'x0_Q' 'x0_S' 'x1_female' 'x1_male' 'x2_1' 'x2_2'
 'x2_3']
sample data : 
 [0.23750065 3.71563628 1.         0.         0.         0.
 1.         1.         0.         0.        ]

saved model at :  /home/ec2-user/SageMaker/aws-ml/pipeline/sklearn-pipeline-classification-titanic/data_dir/artifacts/model.joblib


In [14]:
def get_s3uri_df_data(s3uri=None, return_df=False, header=None):
    parsed_url = urlparse(s3uri)
    bucket_name = parsed_url.netloc
    prefix = parsed_url.path[1:]
    s3 = boto3.resource('s3')
    print(bucket_name)
    print(prefix)
    obj = s3.Object(bucket_name, prefix)
    data_stream = obj.get()["Body"].read().decode('utf-8')   
    
    if return_df:
        df = pd.read_csv(StringIO(input_data), header=header)
        return df
    
    return data_stream

In [15]:
%load_ext autoreload
%autoreload 2

In [16]:
%reload_ext autoreload
from pre_processing_script import model_fn as pp_model_fn
from pre_processing_script import predict_fn as pp_predict_fn
from pre_processing_script import input_fn as pp_input_fn

In [17]:
#s3_data_path = s3_input_raw_train
s3_data_path = s3_input_raw_val
model_dir = artifacts_local_dir

# verify model_fn
model     = pp_model_fn(model_dir)
print(type(model))

# verify input_fn
input_data = get_s3uri_df_data(s3uri=s3_data_path, return_df=False, header=None)
df = pp_input_fn(input_data, content_type="text/csv")
print(df.shape)

<class 'sklearn.compose._column_transformer.ColumnTransformer'>
sklearn-pipeline
sklearn-pipeline-titanic/data_val/val_titanic_dataset.csv
This pred/val data i.e unlabelled, not include col_to_predict
df.shape :  (131, 5)
(131, 5)


In [18]:
# verify predict_fn
pp_data = pp_predict_fn(input_data=df, model=model)

print(pp_data.shape)
print(type(pp_data))

before pp :  data shape : (131, 5)
input data type : <class 'pandas.core.frame.DataFrame'>
test/pred job
only contain the feature data
after pp : data shape : (131, 10)
sample data : 
 [-1.69835544  0.08898475  0.          0.          1.          0.
  1.          0.          1.          0.        ]
(131, 10)
<class 'numpy.ndarray'>


In [19]:
import matplotlib.pyplot as plt
from sklearn import set_config

In [20]:
#model
f = plt.figure()
set_config(display='diagram')
model

<Figure size 432x288 with 0 Axes>

In [21]:
# once local run is succesfull, train the container based model

## setup

In [22]:
PP_SCRIPT_NAME = 'pre_processing_script.py'

# preprocessor setup
from sagemaker.sklearn.estimator import SKLearn

FRAMEWORK_VERSION = "0.23-1"
sklearn_preprocessor = SKLearn(
                            entry_point=PP_SCRIPT_NAME,
                            role=role,
                            framework_version=FRAMEWORK_VERSION,
                            train_instance_type="ml.c4.xlarge",
                            sagemaker_session=sagemaker_session
                            )

## train

In [23]:
sklearn_preprocessor.fit({'train': s3_input_raw_train})

's3_input' class will be renamed to 'TrainingInput' in SageMaker Python SDK v2.


2020-09-19 11:09:50 Starting - Starting the training job...
2020-09-19 11:09:52 Starting - Launching requested ML instances.........
2020-09-19 11:11:22 Starting - Preparing the instances for training......
2020-09-19 11:12:29 Downloading - Downloading input data...
2020-09-19 11:13:09 Training - Downloading the training image..[34m2020-09-19 11:13:32,813 sagemaker-training-toolkit INFO     Imported framework sagemaker_sklearn_container.training[0m
[34m2020-09-19 11:13:32,815 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2020-09-19 11:13:32,825 sagemaker_sklearn_container.training INFO     Invoking user training script.[0m
[34m2020-09-19 11:13:33,187 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m

2020-09-19 11:13:50 Uploading - Uploading generated training model
2020-09-19 11:13:50 Completed - Training job completed
[34m2020-09-19 11:13:39,453 sagemaker-training-toolkit INFO     No GPUs detecte

### batch transform the raw data to train/test data
required for training the ML model

In [24]:
"""
raw data + label : 9
features + label : 12

raw data : 8
features : 11 (this is pred model required data)

"""
print()




In [25]:
# Define a SKLearn Transformer from the trained SKLearn Estimator
pp_transformer = sklearn_preprocessor.transformer(
                                                    instance_count=1, 
                                                    instance_type='ml.m5.xlarge',
                                                    assemble_with = 'Line',
                                                    accept = 'text/csv'
                                                 )

Parameter image will be renamed to image_uri in SageMaker Python SDK v2.


In [26]:
# Preprocess training data : s3_input_raw_train
pp_transformer.transform(s3_input_raw_train, content_type="text/csv")
print("Waiting for transform job: " + pp_transformer.latest_transform_job.job_name)
pp_transformer.wait()
s3_pp_train = pp_transformer.output_path

Waiting for transform job: sagemaker-scikit-learn-2020-09-19-11-14-04-030
.............................[34m2020-09-19 11:18:48,026 INFO - sagemaker-containers - No GPUs detected (normal if no gpus installed)[0m
[34m2020-09-19 11:18:48,028 INFO - sagemaker-containers - No GPUs detected (normal if no gpus installed)[0m
[34m2020-09-19 11:18:48,029 INFO - sagemaker-containers - nginx config: [0m
[34mworker_processes auto;[0m
[34mdaemon off;[0m
[34mpid /tmp/nginx.pid;[0m
[34merror_log  /dev/stderr;
[0m
[34mworker_rlimit_nofile 4096;
[0m
[34mevents {
  worker_connections 2048;[0m
[34m}
[0m
[34mhttp {
  include /etc/nginx/mime.types;
  default_type application/octet-stream;
  access_log /dev/stdout combined;

  upstream gunicorn {
    server unix:/tmp/gunicorn.sock;
  }

  server {
    listen 8080 deferred;
    client_max_body_size 0;

    keepalive_timeout 3;

    location ~ ^/(ping|invocations|execution-parameters) {
      proxy_set_header X-Forwarded-For $proxy_add_x_f


[32m2020-09-19T11:18:51.685:[sagemaker logs]: MaxConcurrentTransforms=1, MaxPayloadInMB=6, BatchStrategy=MULTI_RECORD[0m


In [27]:
"""
# batch preprocess test data : s3_input_raw_test
pp_transformer.transform(s3_input_raw_test, content_type="text/csv")
print("Waiting for transform job: " + pp_transformer.latest_transform_job.job_name)
pp_transformer.wait()
s3_pp_test = pp_transformer.output_path
"""
s3_pp_test = s3_pp_train

In [28]:
# only useful for assessing the ml_model only endpoint

# batch preprocess val data : s3_input_raw_val
pp_transformer.transform(s3_input_raw_val, content_type="text/csv")
print("Waiting for transform job: " + pp_transformer.latest_transform_job.job_name)
pp_transformer.wait()
s3_pp_val = pp_transformer.output_path
"""
s3_pp_val = None
"""

Waiting for transform job: sagemaker-scikit-learn-2020-09-19-11-19-16-956
.............................[34m2020-09-19 11:23:50,770 INFO - sagemaker-containers - No GPUs detected (normal if no gpus installed)[0m
[34m2020-09-19 11:23:50,772 INFO - sagemaker-containers - No GPUs detected (normal if no gpus installed)[0m
[34m2020-09-19 11:23:50,773 INFO - sagemaker-containers - nginx config: [0m
[34mworker_processes auto;[0m
[34mdaemon off;[0m
[34mpid /tmp/nginx.pid;[0m
[34merror_log  /dev/stderr;
[0m
[34mworker_rlimit_nofile 4096;
[0m
[34mevents {
  worker_connections 2048;[0m
[34m}
[0m
[34mhttp {
  include /etc/nginx/mime.types;
  default_type application/octet-stream;
  access_log /dev/stdout combined;

  upstream gunicorn {
    server unix:/tmp/gunicorn.sock;
  }

  server {
    listen 8080 deferred;
    client_max_body_size 0;

    keepalive_timeout 3;

    location ~ ^/(ping|invocations|execution-parameters) {
      proxy_set_header X-Forwarded-For $proxy_add_x_f




'\ns3_pp_val = None\n'

In [29]:
s3_input_raw_train

's3://sklearn-pipeline/sklearn-pipeline-titanic/data_train/train_titanic_dataset.csv'

In [30]:
s3_pp_train, s3_pp_test, s3_pp_val

('s3://sagemaker-us-east-1-120286446822/sagemaker-scikit-learn-2020-09-19-11-14-04-030',
 's3://sagemaker-us-east-1-120286446822/sagemaker-scikit-learn-2020-09-19-11-14-04-030',
 's3://sagemaker-us-east-1-120286446822/sagemaker-scikit-learn-2020-09-19-11-19-16-956')

# ML model (sklearn)

## local train

In [31]:
# fetch data
s3uri     = s3_pp_train
file_name = '{}.out'.format(RAW_TRAIN_PATH.split("/")[-1])
s3_obj = get_csv_output_from_s3(s3uri, file_name)
train_df_pp  = pd.read_csv(io.StringIO(s3_obj), sep=",", header=None)

"""
s3uri     = s3_pp_test
file_name = '{}.out'.format(RAW_TEST_PATH.split("/")[-1])
s3_obj = get_csv_output_from_s3(s3uri, file_name)
test_df_pp  = pd.read_csv(io.StringIO(s3_obj), sep=",", header=None)
"""
test_df_pp = train_df_pp.copy()


train_df_pp.to_csv(path_or_buf=PP_TRAIN_PATH, index=False, header=None)
test_df_pp.to_csv(path_or_buf=PP_TEST_PATH, index=False, header=None)
#val_df_pp.to_csv(path_or_buf=PP_VAL_PATH, index=False, header=None)

print(train_df_pp.shape, test_df_pp.shape)
train_df_pp.sample(2)

sagemaker-us-east-1-120286446822
sagemaker-scikit-learn-2020-09-19-11-14-04-030
train_titanic_dataset.csv.out
(850, 12) (850, 12)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
76,0,user_615,-0.750181,-0.509231,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0
80,1,user_953,-0.592152,-0.523759,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0


In [32]:
predict_col = 0
train_df_pp[predict_col].value_counts()

0    526
1    324
Name: 0, dtype: int64

In [33]:
pp_train_data_local_dir = "/".join(PP_TRAIN_PATH.split("/")[:-1])
pp_test_data_local_dir = "/".join(PP_TRAIN_PATH.split("/")[:-1])
artifacts_local_dir  = ARTIFACTS_PATH

artifacts_local_dir, pp_train_data_local_dir, pp_test_data_local_dir

('/home/ec2-user/SageMaker/aws-ml/pipeline/sklearn-pipeline-classification-titanic/data_dir/artifacts',
 '/home/ec2-user/SageMaker/aws-ml/pipeline/sklearn-pipeline-classification-titanic/data_dir/pp_train',
 '/home/ec2-user/SageMaker/aws-ml/pipeline/sklearn-pipeline-classification-titanic/data_dir/pp_train')

In [34]:
! python model_script.py --n-estimators 100 \
                         --min-samples-leaf 2 \
                         --model-dir '/home/ec2-user/SageMaker/aws-ml/pipeline/sklearn-pipeline-classification-titanic/data_dir/artifacts/' \
                         --train '/home/ec2-user/SageMaker/aws-ml/pipeline/sklearn-pipeline-classification-titanic/data_dir/pp_train/' \
                         --test '/home/ec2-user/SageMaker/aws-ml/pipeline/sklearn-pipeline-classification-titanic/data_dir/pp_train/' \
                         --target '0' \
                         --col_index_to_drop '0, 1'

You should consider upgrading via the '/home/ec2-user/anaconda3/envs/python3/bin/python -m pip install --upgrade pip' command.[0m
You should consider upgrading via the '/home/ec2-user/anaconda3/envs/python3/bin/python -m pip install --upgrade pip' command.[0m
extracting arguments
loading train data
args.train :  /home/ec2-user/SageMaker/aws-ml/pipeline/sklearn-pipeline-classification-titanic/data_dir/pp_train/
loading test data
args.test :  /home/ec2-user/SageMaker/aws-ml/pipeline/sklearn-pipeline-classification-titanic/data_dir/pp_train/

building training and testing datasets
!! below data print includes col_to_predict/col_primary_identifer !!

col_to_predict: 0
col_index_to_drop: [0, 1]
training data shape :  (850, 12)
train data head(1) : 
   0         1         2         3    4    5    6    7    8    9    10   11
0   0  user_173  0.237501  3.715636  1.0  0.0  0.0  0.0  1.0  1.0  0.0  0.0
train : [0 1] : [526 324]
test  : [0 1] : [526 324]
model training started!!
model training 

In [35]:
# once local run is succesfull, train the container based model
# uncomment install('matplotlib')

## setup

In [36]:
ML_MODEL_SCRIPT_NAME = "model_script.py"

from sagemaker.sklearn.estimator import SKLearn

FRAMEWORK_VERSION = '0.23-1'
ml_estimator = SKLearn(
                    entry_point=ML_MODEL_SCRIPT_NAME,
                    role = get_execution_role(),
                    train_instance_count=1,
                    train_instance_type='ml.c5.xlarge',
                    framework_version=FRAMEWORK_VERSION,
                    base_job_name='rf-scikit',
                    metric_definitions=[
                                        {'Name': 'median-AE',
                                         'Regex': "AE-at-50th-percentile: ([0-9.]+).*$"}
                                        ],
                    hyperparameters = {'n-estimators': 100,
                                       'min-samples-leaf': 2,
                                       'target': '0',
                                       'col_index_to_drop': '0, 1'
                                      }
                    )

## train

In [37]:
# TRAIN the model
ml_estimator.fit({'train':s3_pp_train, 'test': s3_pp_test}, wait=True)

's3_input' class will be renamed to 'TrainingInput' in SageMaker Python SDK v2.
's3_input' class will be renamed to 'TrainingInput' in SageMaker Python SDK v2.


2020-09-19 11:24:37 Starting - Starting the training job...
2020-09-19 11:24:40 Starting - Launching requested ML instances.........
2020-09-19 11:26:10 Starting - Preparing the instances for training...
2020-09-19 11:27:02 Downloading - Downloading input data
2020-09-19 11:27:02 Training - Downloading the training image...
2020-09-19 11:27:32 Training - Training image download completed. Training in progress..[34m2020-09-19 11:27:33,018 sagemaker-training-toolkit INFO     Imported framework sagemaker_sklearn_container.training[0m
[34m2020-09-19 11:27:33,019 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2020-09-19 11:27:33,028 sagemaker_sklearn_container.training INFO     Invoking user training script.[0m
[34m2020-09-19 11:27:33,280 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2020-09-19 11:27:33,498 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[3


2020-09-19 11:27:50 Uploading - Uploading generated training model
2020-09-19 11:27:50 Completed - Training job completed
Training seconds: 65
Billable seconds: 65


# Serial Inference Pipeline

In [38]:
from sagemaker.model import Model
from sagemaker.pipeline import PipelineModel
import boto3
from time import gmtime, strftime

timestamp_prefix = strftime("%Y-%m-%d-%H-%M-%S", gmtime())

# step_1 : get models
pp_transformer_model = sklearn_preprocessor.create_model()
ml_estimator_model   = ml_estimator.create_model()

# step_2 : set-up pipeline
model_name    = 'sip-clf-' + timestamp_prefix
endpoint_name = 'sip-clf-ep-' + timestamp_prefix
ml_pipeline_model = PipelineModel(
                                    name=model_name, 
                                    role=role, 
                                    models=[
                                            pp_transformer_model, 
                                            ml_estimator_model
                                            ]
                                    )

Parameter image will be renamed to image_uri in SageMaker Python SDK v2.
Parameter image will be renamed to image_uri in SageMaker Python SDK v2.


In [39]:
# batch prediction job

"""
ml_pipeline_tf = ml_pipeline_model.transformer(
                                            instance_count=1, 
                                            instance_type='ml.m5.xlarge',
                                            assemble_with = 'Line',
                                            accept = 'text/csv')

# input : s3_input_raw_val (raw input data)
ml_pipeline_tf.transform(s3_input_raw_val, content_type="text/csv")
print("Waiting for transform job: " + ml_pipeline_tf.latest_transform_job.job_name)
ml_pipeline_tf.wait()
s3_pred_val = ml_pipeline_tf.output_path
s3_pred_val
"""
print()




## deploy pipeline model

In [52]:
#sm_model.deploy(initial_instance_count=1, instance_type='ml.c4.xlarge', endpoint_name=endpoint_name)
ml_pipeline_model.deploy(initial_instance_count=1, instance_type='ml.m4.xlarge', endpoint_name=endpoint_name)

Using already existing model: sip-clf-2020-09-19-11-28-20


-----------------!

# predict from pipeline endpoint

In [53]:
endpoint_name

'sip-clf-ep-2020-09-19-11-28-20'

In [54]:
temp_test_data = pd.read_csv(filepath_or_buffer=RAW_TEST_PATH, header=None)
temp_test_data.head(2)

Unnamed: 0,0,1,2,3,4,5,6
0,0,user_1132,,7.8958,S,male,3.0
1,1,user_245,33.0,86.5,S,female,1.0


In [55]:
temp_test_data.head(4).values

array([[0, 'user_1132', nan, 7.8958, 'S', 'male', 3.0],
       [1, 'user_245', 33.0, 86.5, 'S', 'female', 1.0],
       [0, 'user_1298', 36.0, 9.5, 'S', 'male', 3.0],
       [0, 'user_426', 30.0, 13.0, 'S', 'male', 2.0]], dtype=object)

In [None]:
'33.0, 86.5, S, female, 1.0\n30.0, 13.0, S, male, 2'

In [59]:
from sagemaker.predictor import json_serializer, csv_serializer, json_deserializer, RealTimePredictor
from sagemaker.content_types import CONTENT_TYPE_CSV, CONTENT_TYPE_JSON


#payload = '33.0, 86.5, S, female, 1.0' # user_245, 1, pred=1
#payload = '30.0, 13.0, S, male, 2' # user_426, 0, pred=1
#payload = ', 7.89, S, male, 3.0' # user_1132, 0, pred=0
payload = '33.0, 86.5, S, female, 1.0\n30.0, 13.0, S, male, 2' #user_245,user_426 ;1,1; '[1, 1]'

predictor = RealTimePredictor(
    endpoint=endpoint_name,
    sagemaker_session=sagemaker_session,
    serializer=csv_serializer,
    content_type=CONTENT_TYPE_CSV,
    accept=CONTENT_TYPE_JSON)

print(predictor.predict(payload))

b'[1, 1]'


In [None]:
stop here

## predict : only using model

#### deploy model

In [None]:
# deploy only the - ml model
ml_predictor = ml_estimator.deploy(instance_type='ml.m4.xlarge', initial_instance_count=1)

#### get test data

In [None]:
import json
import io
from urllib.parse import urlparse
import boto3

def get_csv_output_from_s3(s3uri, file_name):
    parsed_url = urlparse(s3uri)
    bucket_name = parsed_url.netloc
    prefix = parsed_url.path[1:]
    s3 = boto3.resource('s3')
    print(bucket_name)
    print(prefix)
    print(file_name)
    obj = s3.Object(bucket_name, '{}/{}'.format(prefix, file_name))
    return obj.get()["Body"].read().decode('utf-8')   

In [None]:
import pandas as pd

path       = preprocessed_val
batch_file = 'abalone_val.csv' # imp
output = get_csv_output_from_s3(path, '{}.out'.format(batch_file))
validate_df = pd.read_csv(io.StringIO(output), sep=",", header=None)
print(validate_df.shape)
validate_df.sample(2) 

#### prediction

In [None]:
# `data` is a NumPy array or a Python list.
# `response` is a NumPy array.

#payload = validate_df.drop(columns=[0]).values
payload = validate_df.values

response = ml_predictor.predict(payload)
response

In [None]:
stop

## Delete Endpoint <a class="anchor" id="delete_endpoint"></a>
Once we are finished with the endpoint, we clean up the resources!

In [None]:
sm_client = sagemaker_session.boto_session.client('sagemaker')
sm_client.delete_endpoint(EndpointName=endpoint_name)

In [None]:
sm_client = sagemaker_session.boto_session.client('sagemaker')
sm_client.delete_endpoint(EndpointName=endpoint_name)