# SkLearn Batch Transform + Custom Inference Script

- [Sklearn SM Documentation](https://sagemaker.readthedocs.io/en/stable/frameworks/sklearn/using_sklearn.html)
- [SageMaker Batch Transform](https://github.com/awsdocs/amazon-sagemaker-developer-guide/blob/master/doc_source/batch-transform.md)
- Dataset: [Petrol Consumption](https://www.kaggle.com/harinir/petrol-consumption)
- Setting: SageMaker Studio Data Science Kernel (ml.c5.large)

# Data Reading

In [None]:
import pandas as pd
import numpy as np

# Load the petrol-consumption dataset from the notebook's working directory.
df = pd.read_csv(filepath_or_buffer="petrol_consumption.csv")
# Preview the first five rows.
df.head(n=5)

In [None]:
# Report how many rows the dataset contains.
print(df.shape[0])

In [None]:
# Split the rows 80-20 in file order; the held-out 20% is used for model
# inference later. Both slices share one index derived from the row count,
# so they are contiguous and no row is dropped between them.
split_index = int(len(df) * 0.8)
train = df.iloc[:split_index, :]
test = df.iloc[split_index:, :]

In [None]:
# Write each split to its own CSV file, leaving out the DataFrame index.
for frame, csv_name in ((train, 'train.csv'), (test, 'test.csv')):
    frame.to_csv(csv_name, index=False)

# Upload Data to S3

In [None]:
# A SageMaker session performs the S3 uploads below.
import boto3
import sagemaker
sagemaker_session = sagemaker.Session()

# Key prefix under which both CSV files are stored. No bucket is passed to
# upload_data, so the session chooses the destination bucket.
prefix = "sklearn-petrol-data"

# Upload the train and test CSVs; the test file is the batch-inference input.
training_input_path = sagemaker_session.upload_data('train.csv', key_prefix=f"{prefix}/training")
test_data_path = sagemaker_session.upload_data('test.csv', key_prefix=f"{prefix}/test")

In [None]:
# Display the value upload_data returned for train.csv.
training_input_path

In [None]:
# Display the value upload_data returned for test.csv.
test_data_path

## Locally Test (Optional)

In [None]:
import argparse, os
import boto3
import json
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn import metrics
import joblib
import pickle
from io import StringIO

# Load the training split written earlier.
train = pd.read_csv("train.csv")

# --- Preprocessing: separate the feature columns from the target column ---
target_column = 'Petrol_Consumption'
X = train.drop(columns=target_column)
y = train[target_column]
# Hold out 20% of the training rows for a quick local check.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# --- Model building: a random forest with five trees ---
regressor = RandomForestRegressor(n_estimators=5)
regressor.fit(X_train, y_train)

In [None]:
# Predict on the rows held out by the local train/test split and display the result.
y_pred = regressor.predict(X_test)
y_pred

In [None]:
# Reload the test split from disk and preview its first five rows.
test = pd.read_csv(filepath_or_buffer="test.csv")
test.head(n=5)

In [None]:
# Display the column labels of the test split.
test.columns

In [None]:
# Keep only the four feature columns; the target column is left out.
feature_columns = [
    'Petrol_tax',
    'Average_income',
    'Paved_Highways',
    'Population_Driver_licence(%)',
]
testInput = test.loc[:, feature_columns]
testInput

In [None]:
# Run the locally trained regressor on the selected test features and display the predictions.
regressor.predict(testInput)

## Create Estimator & Training

In [None]:
# SageMaker execution role; make sure it has been granted access to the S3 bucket in use.
role = sagemaker.get_execution_role()
# Display the role so it can be checked.
role

In [None]:
#Docs: https://sagemaker.readthedocs.io/en/stable/frameworks/sklearn/sagemaker.sklearn.html
from sagemaker.sklearn import SKLearn

# Estimator that runs train.py as the training entry point.
# The training set is only a few dozen rows, so a small general-purpose
# instance is plenty; ml.m5.xlarge also matches the instance type used for
# the batch transform job later in this notebook.
sk_estimator = SKLearn(
    entry_point='train.py',
    role=role,
    instance_count=1,
    instance_type='ml.m5.xlarge',
    py_version='py3',
    framework_version='0.23-1',
    script_mode=True,
    # Forwarded to the training script; train.py must accept this name.
    hyperparameters={
        'estimators': 20,
    },
)

# Launch the training job; the 'train' channel points at the uploaded training CSV.
sk_estimator.fit({'train': training_input_path})

## Batch Transform

[Transformer](https://sagemaker.readthedocs.io/en/stable/api/inference/transformer.html)

In [None]:
# Build a batch Transformer from the trained SKLearn estimator.
transformer = sk_estimator.transformer(
    instance_count=1,
    instance_type="ml.m5.xlarge",
    assemble_with="Line",
    accept="text/csv",
)

In [None]:
# Start a batch transform job over the uploaded test CSV.
transformer.transform(test_data_path, content_type="text/csv")
print(f"Waiting for transform job: {transformer.latest_transform_job.job_name}")
# Wait on the job, then keep the transformer's output path for later use.
transformer.wait()
output = transformer.output_path

## Process Output With SM Boto3 Client

In [None]:
# Low-level SageMaker API client from boto3.
import boto3
client = boto3.client(service_name='sagemaker')

In [None]:
# Look up the transform job launched above by name instead of relying on a
# hand-edited placeholder. To inspect a different job, assign its name
# (as shown in the console) to transform_job_name instead.
transform_job_name = transformer.latest_transform_job.job_name
job_description = client.describe_transform_job(TransformJobName=transform_job_name)
# S3 location under which the job wrote its results.
output_path = job_description['TransformOutput']['S3OutputPath']
output_path

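Take this path and use it in the `aws s3 cp` call below to download your output data from S3. Batch Transform names each result object after its input file with `.out` appended; if the copy does not work, get the object URI from the S3 console.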

In [None]:
!aws s3 cp 'Replace with your S3 output path' output.csv

In [None]:
# Load the downloaded transform output and display it.
results = pd.read_csv(filepath_or_buffer='output.csv')
results