In [None]:
%%sh
pip install -q sagemaker --upgrade

In [None]:
import sagemaker

# Show which SDK version is actually loaded in this kernel.
print(sagemaker.__version__)

# `session` is used below to upload the dataset to S3;
# `role` is the execution role passed to the processing job.
session = sagemaker.Session()
role = sagemaker.get_execution_role()

In [None]:
%%sh
# Copy the Camera reviews archive (gzipped TSV) from the amazon-reviews-pds
# S3 bucket to /tmp on the notebook host.
# Dataset description: https://s3.amazonaws.com/amazon-reviews-pds/readme.html
aws s3 cp s3://amazon-reviews-pds/tsv/amazon_reviews_us_Camera_v1_00.tsv.gz /tmp

In [None]:
# S3 key prefix under which the dataset archive is uploaded.
prefix = 'amazon-reviews-camera'

# Upload the archive downloaded to /tmp exactly once. `input_data` receives the
# value returned by `upload_data` and is used as the source of the processing
# job input further down.
# We could also use the Athena output file
input_data = session.upload_data(
    path='/tmp/amazon_reviews_us_Camera_v1_00.tsv.gz',
    key_prefix=prefix)

In [None]:
# Show the value returned by `session.upload_data` for the uploaded archive.
print(input_data)

In [None]:
from sagemaker.sklearn.processing import SKLearnProcessor

# Processor for the preprocessing step: scikit-learn framework 0.23-1,
# one ml.m5.2xlarge instance, running under the notebook's execution role.
sklearn_processor = SKLearnProcessor(
    framework_version='0.23-1',
    instance_count=1,
    instance_type='ml.m5.2xlarge',
    role=role,
)

In [None]:
%%time

from sagemaker.processing import ProcessingInput, ProcessingOutput

sklearn_processor.run(
    code='preprocessing.py',
    
    inputs=[
        ProcessingInput(
                source=input_data,
                destination='/opt/ml/processing/input')
    ],
    
    outputs=[
        ProcessingOutput(
                output_name='bt_data',
                source='/opt/ml/processing/output/bt'),
        ProcessingOutput(
                output_name='fs_data',
                source='/opt/ml/processing/output/fs')
    ],
    
    arguments=[
               '--filename', 'amazon_reviews_us_Camera_v1_00.tsv.gz',
               '--num-reviews', '10000',
               '--library', 'spacy'  # 'spacy' or 'nltk'
    ]
)

In [None]:
# Describe the most recent job launched by `sklearn_processor`
# (the last entry of its `jobs` list).
preprocessing_job_description = sklearn_processor.jobs[-1].describe()

# Print the S3 URI of every output listed in the job's output configuration.
output_config = preprocessing_job_description['ProcessingOutputConfig']
for output in output_config['Outputs']:
    print(output['S3Output']['S3Uri'])