# Managed Data Processing with SageMaker Processing in R

<img align="left" width="130" src="https://raw.githubusercontent.com/PacktPublishing/Amazon-SageMaker-Cookbook/master/Extra/cover-small-padded.png"/>

This notebook contains the code to help readers work through one of the recipes of the book [Machine Learning with Amazon SageMaker Cookbook: 80 proven recipes for data scientists and developers to perform ML experiments and deployments](https://www.amazon.com/Machine-Learning-Amazon-SageMaker-Cookbook/dp/1800567030).

### How to do it...

In [None]:
# Load reticulate and use it to import the Python `sagemaker` and `boto3`
# modules so they can be called from R with `$`.
library('reticulate')
sagemaker <- import('sagemaker')
boto3 <- import('boto3')
# Value returned by sagemaker's get_execution_role(); it is passed as `role`
# when the ScriptProcessor is constructed later in this notebook.
role <- sagemaker$get_execution_role()

In [None]:
# Container image URI that is handed to the ScriptProcessor as `image_uri`.
# NOTE(review): the account ID and region are hard-coded here — presumably this
# has to be replaced with the URI of an image in your own ECR repository;
# confirm before running.
processing_repository_uri <- "581320662326.dkr.ecr.us-east-1.amazonaws.com/sagemaker-processing-r:2"

# Create a boto3 session with default arguments and wrap it in a SageMaker
# session, which the ScriptProcessor receives as `sagemaker_session`.
session <- boto3$session$Session()
sagemaker_session <- sagemaker$Session(
    boto_session=session)

In [None]:
# Short alias for the Python ScriptProcessor class.
ScriptProcessor <- sagemaker$processing$ScriptProcessor

# Build a processor that uses `Rscript` as its command, the custom image URI,
# the execution role and the SageMaker session defined in earlier cells.
script_processor <- ScriptProcessor(
    command=list('Rscript'),
    image_uri=processing_repository_uri,
    role=role,
    sagemaker_session=sagemaker_session,
    # 1L (integer literal) so reticulate passes a Python int, not a float.
    instance_count=1L,
    instance_type='ml.m5.xlarge')

In [None]:
# Short aliases for the Python ProcessingInput and ProcessingOutput classes,
# used in the next cell to describe the job's input and output.
ProcessingInput <- sagemaker$processing$ProcessingInput
ProcessingOutput <- sagemaker$processing$ProcessingOutput

In [None]:
# Relative path of the input CSV.
# NOTE(review): `source` is also the name of a base R function; this variable
# shadows that name in the notebook's global environment.
source <- 'tmp/dataset.processing.csv'
# Input definition: `source` is paired with the container path
# /opt/ml/processing/input as its destination.
pinput1 <- ProcessingInput(source=source, destination='/opt/ml/processing/input')
# Output definition: the container path /opt/ml/processing/output is the source
# of the job's output.
poutput1 <- ProcessingOutput(source='/opt/ml/processing/output')

In [None]:
# Run processing.r with the input and output definitions created above and the
# script arguments `--sample-argument 3`. wait=TRUE asks run() to wait for the
# job rather than return immediately.
script_processor$run(code='processing.r',
                     inputs=list(pinput1),
                     outputs=list(poutput1),
                     arguments=list('--sample-argument','3'),
                     wait=TRUE)

In [None]:
# Run a shell command, echo its output, and return the last output line.
#
# Args:
#   bash_command: Command string passed unchanged to system().
#
# Returns:
#   The last line captured from the command's output, or "" when the command
#   produced no output.
cmd <- function(bash_command) {
    # Show the command first so the notebook output records what was run.
    print(bash_command)

    # intern=TRUE makes system() return the captured output as a character
    # vector with one element per line instead of the exit status.
    output <- system(bash_command, intern=TRUE)

    # Print every captured line followed by a newline (nothing when empty).
    writeLines(output)

    if (length(output) > 0) output[[length(output)]] else ""
}

In [None]:
# Install the `awslogs` command-line tool with pip; the next cell uses it to
# fetch the processing job logs.
cmd('pip install awslogs')

In [None]:
# Make sure SageMaker Execution Role has CloudWatchLogsReadOnlyAccess
# Fetch the logs of the /aws/sagemaker/ProcessingJobs log group in us-east-1,
# starting from one hour ago (-s1h).
cmd("awslogs get /aws/sagemaker/ProcessingJobs -s1h --aws-region=us-east-1")

In [None]:
# Take the processor's `latest_job` and read the `destination` of its first
# output ([[1]] is R's 1-based index into the outputs list).
latest_job <- script_processor$latest_job
destination <- latest_job$outputs[[1]]$destination
# Display the destination in the notebook.
destination

In [None]:
# Location of output.csv under the job's output destination
# (presumably an S3 URI, since it is passed to `aws s3 cp` — confirm).
csv_path <- paste0(destination, "/output.csv")
# Shell command that copies that file to tmp/output.processing.r.csv.
command <- paste("aws s3 cp", csv_path, "tmp/output.processing.r.csv")

In [None]:
# Run the copy command built in the previous cell through the cmd() helper.
cmd(command)

In [None]:
# Read the copied CSV without treating the first row as a header and show its
# first column.
read.csv("tmp/output.processing.r.csv", header=FALSE)[[1]]