# Pre-Trained Sklearn Model Batch Transform

Setup: Studio ml.3.medium Data Science Kernel

## Locally Train and Serialize Model

In [18]:
import argparse, os
import boto3
import json
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn import metrics
import joblib
import pickle

In [None]:
df = pd.read_csv('petrol_consumption.csv')
    
############
#Preprocessing data
############
X = df.drop('Petrol_Consumption', axis = 1)
y = df['Petrol_Consumption']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [None]:
###########
#Model Building
###########
regressor = RandomForestRegressor(n_estimators=20)
regressor.fit(X_train, y_train)

In [None]:
regressor.predict(X_test)

In [None]:
joblib.dump(regressor, open("model.joblib", 'wb')) #serialize model

In [None]:
loaded_model = joblib.load(open("model.joblib", 'rb'))

In [None]:
result = loaded_model.predict(X_test) #test the serialized model to ensure working properly
result

## Tar the model artifact

This can be a joblib or a pkl file as long as it is captured in the model.tar.gz

In [None]:
!tar -cvpzf model.tar.gz model.joblib

In [None]:
!aws s3 cp model.tar.gz s3://pre-trained-sklearn/model.tar.gz #replace this path with where your S3 path (can be anything)

In [2]:
!aws s3 ls s3://pre-trained-sklearn/ #make sure model.tar.gz was uploaded

2023-02-01 02:49:24      12498 model.tar.gz


## Creating Test Data for Batch Inference

The train part of this is not necessary for this example as we are not doing a SageMaker training job.

In [None]:
import pandas as pd
import numpy as np

df = pd.read_csv("petrol_consumption.csv")
df.head()

In [None]:
#Splitting data in 80-20 split to use testing data for model inference later
train = df.iloc[:35,:]
test = df.iloc[36:,:]

In [None]:
#Train and test csv
train.to_csv('train.csv', index=False)
test.to_csv('test.csv', index=False)

In [4]:
#Create a sagemaker session to be able to upload data to s3
import boto3
import sagemaker
sagemaker_session = sagemaker.Session()

#Uploading data to S3 bucket titled "tf-iris-data"
prefix = "sklearn-petrol-data"

In [5]:
test_data_path = sagemaker_session.upload_data('test.csv', key_prefix=prefix + '/test') #replace with your s3 path if needed

In [6]:
test_data_path #this is what we pass into batch inference

's3://sagemaker-us-east-1-474422712127/sklearn-petrol-data/test/test.csv'

In [7]:
!aws s3 ls s3://sagemaker-us-east-1-474422712127/sklearn-petrol-data/test/ #replace with your s3 path make sure data is present

2023-02-01 03:13:18        378 test.csv


In [8]:
import sagemaker

role = sagemaker.get_execution_role()

## Create SKLearn Model

Here we pass the model data and the inference script which contains our pre and post processing. This is located in the same directory as this notebook. Adjust FWK version as needed.

In [9]:
from sagemaker.sklearn import SKLearnModel

sk_estimator = SKLearnModel(model_data = 's3://pre-trained-sklearn/model.tar.gz',
                       entry_point= 'inference.py',
                       role=role,
                       framework_version='0.23-1')

## Batch Inference

In [10]:
# Define a SKLearn Transformer from the trained SKLearn Estimator
transformer = sk_estimator.transformer(
    instance_count=1, instance_type="ml.m5.xlarge", assemble_with="Line", accept="text/csv"
)

In [11]:
# Feed the test data
transformer.transform(test_data_path, content_type="text/csv")
print("Waiting for transform job: " + transformer.latest_transform_job.job_name)
transformer.wait()
output = transformer.output_path

...........................
[34m2023-02-01 03:17:50,438 INFO - sagemaker-containers - No GPUs detected (normal if no gpus installed)[0m
[34m2023-02-01 03:17:50,440 INFO - sagemaker-containers - No GPUs detected (normal if no gpus installed)[0m
[34m2023-02-01 03:17:50,441 INFO - sagemaker-containers - nginx config: [0m
[34mworker_processes auto;[0m
[34mdaemon off;[0m
[34mpid /tmp/nginx.pid;[0m
[34merror_log  /dev/stderr;[0m
[34mworker_rlimit_nofile 4096;[0m
[34mevents {
  worker_connections 2048;[0m
[34m}[0m
[34mhttp {
  include /etc/nginx/mime.types;
  default_type application/octet-stream;
  access_log /dev/stdout combined;
  upstream gunicorn {
    server unix:/tmp/gunicorn.sock;
  }
  server {
    listen 8080 deferred;
    client_max_body_size 0;
    keepalive_timeout 3;
    location ~ ^/(ping|invocations|execution-parameters) {
      proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
      proxy_set_header Host $http_host;
      proxy_redirect off;
  

## View Results

In [12]:
import boto3
client = boto3.client('sagemaker')

In [13]:
output_path = client.describe_transform_job(TransformJobName = "sagemaker-scikit-learn-2023-02-01-03-13-32-143")['TransformOutput']['S3OutputPath']
output_path

's3://sagemaker-us-east-1-474422712127/sagemaker-scikit-learn-2023-02-01-03-13-32-143'

In [14]:
!aws s3 ls 's3://sagemaker-us-east-1-474422712127/sagemaker-scikit-learn-2023-02-01-03-13-32-143/'

2023-02-01 03:17:57         78 test.csv.out


In [16]:
!aws s3 cp s3://sagemaker-us-east-1-474422712127/sagemaker-scikit-learn-2023-02-01-03-13-32-143/test.csv.out results.csv

download: s3://sagemaker-us-east-1-474422712127/sagemaker-scikit-learn-2023-02-01-03-13-32-143/test.csv.out to ./results.csv


In [17]:
import pandas as pd

res = pd.read_csv("results.csv")
res

Unnamed: 0,605.25
0,651.2
1,741.3
2,816.35
3,578.0
4,662.7
5,629.25
6,581.3
7,732.2
8,491.25
9,639.65
