In [None]:
%%sh
# Print the Dockerfile that defines the custom processing image built below.
pygmentize Dockerfile

In [None]:
%%sh
# Create the ECR repository that will hold the custom processing image.
#
# `aws ecr create-repository` returns an error when the repository already
# exists, so the call is guarded by `describe-repositories`: the repository
# is only created when the lookup fails. This keeps the cell re-runnable.
# NOTE: the region must match the region of the SageMaker session used later,
# because the image URI given to the Processor is built from the session region.
export REGION=eu-west-1
aws ecr describe-repositories --repository-names sm-processing-custom --region "$REGION" > /dev/null 2>&1 \
    || aws ecr create-repository --repository-name sm-processing-custom --region "$REGION"

In [None]:
%%sh
# Build the custom processing image and push it to ECR.
# This cell will not run on SageMaker Studio
# The simplest option is to run these commands on your local machine

# Stop at the first failing command so a broken build is never tagged or pushed.
set -e

REGION=eu-west-1
# Plain assignment (no `export`) so that a failing `aws sts` call aborts the
# script: `export VAR=$(cmd)` would report the exit status of `export` instead.
ACCOUNT_ID=$(aws sts get-caller-identity --query Account --output text)
# Registry host shared by the tag, login and push commands.
REGISTRY="$ACCOUNT_ID.dkr.ecr.$REGION.amazonaws.com"

docker build -t sm-processing-custom:latest -f Dockerfile .
# Tag by name; looking up the image ID first is not needed.
docker tag sm-processing-custom:latest "$REGISTRY/sm-processing-custom:latest"
# `docker login` authenticates against the registry host, not a repository:tag path.
aws ecr get-login-password --region "$REGION" | docker login --username AWS --password-stdin "$REGISTRY"
docker push "$REGISTRY/sm-processing-custom:latest"

In [None]:
import sagemaker

# Show the installed SDK version so readers can compare it with their own.
print(sagemaker.__version__)

# Session object reused by the following cells (data upload, account/region lookup).
sess = sagemaker.Session()

In [None]:
# S3 key prefix under which the raw dataset is stored.
prefix = 'headlines-lda-ntm'

# Upload the compressed headlines file through the session; no bucket is
# passed, so the choice of bucket is left to the session.
# The returned value is printed and later used as the processing job's input source.
input_data = sess.upload_data(
    path='../../ch6/lda-ntm/abcnews-date-text.csv.gz',
    key_prefix=prefix,
)
print(input_data)

In [None]:
# Look up the AWS account ID and region from the session's boto session;
# both are needed to build the ECR image URI for the processor.
sts_client = sess.boto_session.client('sts')
account_id = sts_client.get_caller_identity()['Account']
region = sess.boto_session.region_name

In [None]:
from sagemaker.processing import Processor

# URI of the custom image in ECR, assembled from the account ID and region
# looked up earlier. `join` (like `+`) raises TypeError if either is not a string.
processing_image_uri = ''.join((
    account_id,
    '.dkr.ecr.',
    region,
    '.amazonaws.com/sm-processing-custom:latest',
))

# Generic Processor bound to the custom image, running on a single instance.
sklearn_processor = Processor(
    image_uri=processing_image_uri,
    role=sagemaker.get_execution_role(),
    instance_type='ml.c5.2xlarge',
    instance_count=1,
)

In [None]:
%%time

from sagemaker.processing import ProcessingInput, ProcessingOutput

sklearn_processor.run(    
    inputs=[
        ProcessingInput(
                source=input_data,
                destination='/opt/ml/processing/input')
    ],
    
    outputs=[
        ProcessingOutput(
                output_name='train_data',
                source='/opt/ml/processing/train/')
    ],
    
    arguments=[
               '--filename', 'abcnews-date-text.csv.gz'
    ]
)

In [None]:
# Describe the most recent job launched by the processor and print the
# S3 URI of each of its outputs.
latest_job = sklearn_processor.jobs[-1]
preprocessing_job_description = latest_job.describe()

output_config = preprocessing_job_description['ProcessingOutputConfig']
for job_output in output_config['Outputs']:
    s3_output = job_output['S3Output']
    print(s3_output['S3Uri'])