# Prepare data with SageMaker Processing

## Setup environment

In [1]:
import os
import boto3
import sagemaker
from sagemaker.processing import Processor, ProcessingInput, ProcessingOutput

sagemaker_session = sagemaker.Session()
role = sagemaker.get_execution_role() # we are using the notebook instance role for training in this example
bucket = sagemaker_session.default_bucket() # you can specify a bucket name here

## Get data

In [2]:
!wget https://aws-mlops-workshop.s3-eu-west-1.amazonaws.com/reviews/workshop_data/reviews.csv

--2020-06-26 21:41:27--  https://aws-mlops-workshop.s3-eu-west-1.amazonaws.com/reviews/workshop_data/reviews.csv
Resolving aws-mlops-workshop.s3-eu-west-1.amazonaws.com (aws-mlops-workshop.s3-eu-west-1.amazonaws.com)... 52.218.105.187
Connecting to aws-mlops-workshop.s3-eu-west-1.amazonaws.com (aws-mlops-workshop.s3-eu-west-1.amazonaws.com)|52.218.105.187|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4445279 (4.2M) [text/csv]
Saving to: ‘reviews.csv’


2020-06-26 21:41:28 (7.93 MB/s) - ‘reviews.csv’ saved [4445279/4445279]



In [3]:
prefix = 'data/input'
s3_input = sagemaker_session.upload_data('reviews.csv', bucket, prefix)
print(s3_input)

s3://sagemaker-us-east-1-175748383800/data/input/reviews.csv


## Build and push container

In [4]:
image_name = 'data-processing-containers'

In [5]:
!sh ./docker/build_and_push.sh $image_name

https://docs.docker.com/engine/reference/commandline/login/#credentials-store

Login Succeeded
Building image with name data-processing-containers
Sending build context to Docker daemon   7.68kB
Step 1/6 : FROM ubuntu:18.04
18.04: Pulling from library/ubuntu

[1B167c320d: Pulling fs layer 
[1B805ec7fd: Pulling fs layer 
[1Bd380e680: Pulling fs layer 
[1BDigest: sha256:86510528ab9cd7b64209cbbe6946e094a6d10c6db21def64a93ebdd20011de1d[K[4A[1K[K[4A[1K[K[4A[1K[K[4A[1K[K[4A[1K[K[4A[1K[K[4A[1K[K[3A[1K[K[2A[1K[K[2A[1K[K[1A[1K[K[1A[1K[K
Status: Downloaded newer image for ubuntu:18.04
 ---> 8e4ce0a6ce69
Step 2/6 : RUN apt-get update && apt-get install -y --no-install-recommends         wget         zip         unzip         git         ca-certificates         curl         python3.6         python3-pip         && rm -rf /var/lib/apt-get/lists/*
 ---> Running in 41bdcdd0905c
Get:1 http://security.ubuntu.com/ubuntu bionic-security InRelease [88.7 kB]
Get:2 h

Get:38 http://archive.ubuntu.com/ubuntu bionic-updates/main amd64 libsasl2-modules-db amd64 2.1.27~101-g0780600+dfsg-3ubuntu2.1 [14.8 kB]
Get:39 http://archive.ubuntu.com/ubuntu bionic-updates/main amd64 libsasl2-2 amd64 2.1.27~101-g0780600+dfsg-3ubuntu2.1 [49.2 kB]
Get:40 http://archive.ubuntu.com/ubuntu bionic-updates/main amd64 libldap-common all 2.4.45+dfsg-1ubuntu1.5 [16.9 kB]
Get:41 http://archive.ubuntu.com/ubuntu bionic-updates/main amd64 libldap-2.4-2 amd64 2.4.45+dfsg-1ubuntu1.5 [155 kB]
Get:42 http://archive.ubuntu.com/ubuntu bionic/main amd64 libnghttp2-14 amd64 1.30.0-1ubuntu1 [77.8 kB]
Get:43 http://archive.ubuntu.com/ubuntu bionic/main amd64 librtmp1 amd64 2.4+20151223.gitfa8646d.1-1 [54.2 kB]
Get:44 http://archive.ubuntu.com/ubuntu bionic-updates/main amd64 libcurl4 amd64 7.58.0-2ubuntu3.9 [214 kB]
Get:45 http://archive.ubuntu.com/ubuntu bionic-updates/main amd64 curl amd64 7.58.0-2ubuntu3.9 [159 kB]
Get:46 http://archive.ubuntu.com/ubuntu bionic-updates/main amd64 libc

Selecting previously unselected package libroken18-heimdal:amd64.
Preparing to unpack .../15-libroken18-heimdal_7.5.0+dfsg-1_amd64.deb ...
Unpacking libroken18-heimdal:amd64 (7.5.0+dfsg-1) ...
Selecting previously unselected package libasn1-8-heimdal:amd64.
Preparing to unpack .../16-libasn1-8-heimdal_7.5.0+dfsg-1_amd64.deb ...
Unpacking libasn1-8-heimdal:amd64 (7.5.0+dfsg-1) ...
Selecting previously unselected package libheimbase1-heimdal:amd64.
Preparing to unpack .../17-libheimbase1-heimdal_7.5.0+dfsg-1_amd64.deb ...
Unpacking libheimbase1-heimdal:amd64 (7.5.0+dfsg-1) ...
Selecting previously unselected package libhcrypto4-heimdal:amd64.
Preparing to unpack .../18-libhcrypto4-heimdal_7.5.0+dfsg-1_amd64.deb ...
Unpacking libhcrypto4-heimdal:amd64 (7.5.0+dfsg-1) ...
Selecting previously unselected package libwind0-heimdal:amd64.
Preparing to unpack .../19-libwind0-heimdal_7.5.0+dfsg-1_amd64.deb ...
Unpacking libwind0-heimdal:amd64 (7.5.0+dfsg-1) ...
Selecting previously unselected pac

 ---> Running in 57fe0b5b40ae
Collecting numpy (from -r /tmp/requirements.txt (line 1))
  Downloading https://files.pythonhosted.org/packages/00/16/476826a84d545424084499763248abbbdc73d065168efed9aa71cdf2a7dc/numpy-1.19.0-cp36-cp36m-manylinux1_x86_64.whl (13.5MB)
Collecting pandas (from -r /tmp/requirements.txt (line 2))
  Downloading https://files.pythonhosted.org/packages/c0/95/cb9820560a2713384ef49060b0087dfa2591c6db6f240215c2bce1f4211c/pandas-1.0.5-cp36-cp36m-manylinux1_x86_64.whl (10.1MB)
Collecting scikit-learn (from -r /tmp/requirements.txt (line 3))
  Downloading https://files.pythonhosted.org/packages/d9/3a/eb8d7bbe28f4787d140bb9df685b7d5bf6115c0e2a969def4027144e98b6/scikit_learn-0.23.1-cp36-cp36m-manylinux1_x86_64.whl (6.8MB)
Collecting python-dateutil>=2.6.1 (from pandas->-r /tmp/requirements.txt (line 2))
  Downloading https://files.pythonhosted.org/packages/d4/70/d60450c3dd48ef87586924207ae8907090de0b306af2bce5d134d78615cb/python_dateutil-2.8.1-py2.py3-none-any.whl (227kB)

In [6]:
# Replace by your ECR image ID
container = '175748383800.dkr.ecr.us-east-1.amazonaws.com/data-processing-containers:latest' 

## Launch data processing job

In [7]:
data_processor = Processor(role=role, 
                           image_uri=container, 
                           instance_count=1, 
                           instance_type='ml.m5.xlarge',
                           volume_size_in_gb=30, 
                           max_runtime_in_seconds=1200,
                           base_job_name='data-processing')

In [8]:
input_folder = '/opt/ml/processing/input'
output_folder = '/opt/ml/processing/output'

data_processor.run(
    arguments= [
        f'--input={input_folder}',
        f'--output={output_folder}'
    ],
    inputs = [
        ProcessingInput(
            input_name='input',
            source=s3_input,
            destination=input_folder
        )
    ],
    outputs= [
        ProcessingOutput(
            output_name='preprocessed',
            source=output_folder,
            destination=bucket
        )
    ]
)


Job Name:  data-processing-2020-06-26-21-44-08-917
Inputs:  [{'InputName': 'input', 'S3Input': {'S3Uri': 's3://sagemaker-us-east-1-175748383800/data/input/reviews.csv', 'LocalPath': '/opt/ml/processing/input', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}]
Outputs:  [{'OutputName': 'preprocessed', 'S3Output': {'S3Uri': 's3://sagemaker-us-east-1-175748383800/data-processing-2020-06-26-21-44-08-917/output/preprocessed', 'LocalPath': '/opt/ml/processing/output', 'S3UploadMode': 'EndOfJob'}}]
.....................
..