## LightGBM BYOC Real-Time SageMaker Inference

Dockerfile: https://github.com/microsoft/LightGBM/blob/master/docker/dockerfile-python

In [12]:
%%sh

# Name of algo -> ECR
algorithm_name=byoc-light-gbm

cd container

#executable for training + serve
chmod +x regressor/train
chmod +x regressor/serve

account=$(aws sts get-caller-identity --query Account --output text)

# Region, defaults to us-west-2
region=$(aws configure get region)
region=${region:-us-east-1}

fullname="${account}.dkr.ecr.${region}.amazonaws.com/${algorithm_name}:latest"

# If the repository doesn't exist in ECR, create it.
aws ecr describe-repositories --repository-names "${algorithm_name}" > /dev/null 2>&1

if [ $? -ne 0 ]
then
    aws ecr create-repository --repository-name "${algorithm_name}" > /dev/null
fi

# Get the login command from ECR and execute it directly
aws ecr get-login-password --region ${region}|docker login --username AWS --password-stdin ${fullname}

# Build the docker image locally with the image name and then push it to ECR
# with the full name.

docker build  -t ${algorithm_name} .
docker tag ${algorithm_name} ${fullname}

docker push ${fullname}

Login Succeeded
Sending build context to Docker daemon  31.23kB
Step 1/12 : FROM ubuntu:18.04
 ---> 5a214d77f5d7
Step 2/12 : ARG CONDA_DIR=/opt/conda
 ---> Using cache
 ---> 8c9a1632e184
Step 3/12 : ENV PATH $CONDA_DIR/bin:$PATH
 ---> Using cache
 ---> 12c4b3fa71e5
Step 4/12 : RUN apt-get -y update && apt-get install -y --no-install-recommends          wget          python3-pip          python3-setuptools          nginx          ca-certificates     && rm -rf /var/lib/apt/lists/*
 ---> Running in 241eb50b715e
Get:1 http://security.ubuntu.com/ubuntu bionic-security InRelease [88.7 kB]
Get:2 http://archive.ubuntu.com/ubuntu bionic InRelease [242 kB]
Get:3 http://security.ubuntu.com/ubuntu bionic-security/multiverse amd64 Packages [26.8 kB]
Get:4 http://archive.ubuntu.com/ubuntu bionic-updates InRelease [88.7 kB]
Get:5 http://security.ubuntu.com/ubuntu bionic-security/main amd64 Packages [2430 kB]
Get:6 http://archive.ubuntu.com/ubuntu bionic-backports InRelease [74.6 kB]
Get:7 http://arch

https://docs.docker.com/engine/reference/commandline/login/#credentials-store



## Setup & Prepare Data For Training

In [13]:
# S3 prefix
prefix = "byo-lgbm"

# Define IAM role
import boto3
import re

import os
import numpy as np
import pandas as pd
from sagemaker import get_execution_role

role = get_execution_role() #ensure sagemaker full access, ECR, S3
print(role)

arn:aws:iam::474422712127:role/sagemaker-role-BYOC


In [14]:
import sagemaker as sage
from time import gmtime, strftime

sess = sage.Session()

In [15]:
WORK_DIRECTORY = "data" #local directory with data
data_location = sess.upload_data(WORK_DIRECTORY, key_prefix=prefix)

## Training

In [16]:
account = sess.boto_session.client("sts").get_caller_identity()["Account"]
region = sess.boto_session.region_name

#replace image with your ECR Image URI created in first step
image = "474422712127.dkr.ecr.us-east-1.amazonaws.com/byoc-light-gbm:latest".format(account, region)

gbm = sage.estimator.Estimator(
    image,
    role,
    1,
    "ml.c4.2xlarge",
    output_path="s3://{}/output".format(sess.default_bucket()),
    sagemaker_session=sess,
)

gbm.fit(data_location)

2021-11-22 00:44:54 Starting - Starting the training job...
2021-11-22 00:45:17 Starting - Launching requested ML instancesProfilerReport-1637541894: InProgress
......
2021-11-22 00:46:18 Starting - Preparing the instances for training......
2021-11-22 00:47:18 Downloading - Downloading input data...
2021-11-22 00:47:38 Training - Downloading the training image......
2021-11-22 00:48:50 Uploading - Uploading generated training model
2021-11-22 00:48:50 Completed - Training job completed
[34mImports completed[0m
[34mStarting the training.[0m
[34merror with reading in dataset[0m
[34m2[0m
[34m/opt/ml/input/data/training/.ipynb_checkpoints[0m
[34m<class 'str'>[0m
[34m/opt/ml/input/data/training/petrol_consumption.csv[0m
[34m<class 'str'>[0m
[34m['/opt/ml/input/data/training/.ipynb_checkpoints', '/opt/ml/input/data/training/petrol_consumption.csv'][0m
[34m['/opt/ml/input/data/training/petrol_consumption.csv'][0m
[34m48[0m
[34mIndex(['Petrol_tax', 'Average_income', 'Pa

## Deploy Endpoint

In [17]:
from sagemaker.predictor import csv_serializer
lgbm_pred = gbm.deploy(1, "ml.m4.xlarge", serializer=csv_serializer)

---------!

## Inference

In [18]:
shape = pd.read_csv("data/petrol_consumption.csv", header=None)
shape.drop(shape.columns[[4]], axis=1, inplace=True)
shape.sample(3)

Unnamed: 0,0,1,2,3
12,7.5,5126,14186,0.525
4,7.5,4870,2351,0.529
34,7.5,3357,4121,0.547


In [19]:
import itertools

a = [10*i for i in range(3)]
#print(a)
b = [10+i for i in range(5)]
#print(b)
indices = [i+j for i,j in itertools.product(a,b)]
#print(indices)

test_data = shape.iloc[indices[:-1]]
test_data

Unnamed: 0,0,1,2,3
10,7.0,4512,8507,0.552
11,8.0,4391,5939,0.53
12,7.5,5126,14186,0.525
13,7.0,4817,6930,0.574
14,7.0,4207,6580,0.545
20,8.5,4341,6010,0.677
21,7.0,4593,7834,0.663
22,8.0,4983,602,0.602
23,9.0,4897,2449,0.511
24,9.0,4258,4686,0.517


In [20]:
df = pd.read_csv("data/petrol_consumption.csv")
res = df.iloc[indices, : ]['Petrol_Consumption']
res

10    580
11    471
12    525
13    508
14    566
20    649
21    540
22    464
23    547
24    460
30    571
31    554
32    577
33    628
34    487
Name: Petrol_Consumption, dtype: int64

In [23]:
print(lgbm_pred.predict(test_data.values).decode('utf-8'))

The csv_serializer has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


585.5343618802953
534.9980447531753
534.9980447531753
619.8918568764199
585.5343618802953
569.3555397493
619.8918568764199
569.3555397493
534.9980447531753
534.9980447531753
534.9980447531753
585.5343618802953
585.5343618802953
569.3555397493



## Cleanup

In [24]:
lgbm_pred.delete_endpoint()