## Inspecting and processing data manually

In [None]:
%%sh
# Copy the gzipped Camera-category reviews TSV from the amazon-reviews-pds bucket into /tmp.
aws s3 cp s3://amazon-reviews-pds/tsv/amazon_reviews_us_Camera_v1_00.tsv.gz /tmp

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Load the gzipped, tab-separated reviews file, keeping every column as a string.
# `on_bad_lines='warn'` skips malformed rows and reports them; it replaces
# `error_bad_lines=False`, which was deprecated in pandas 1.3 and removed in 2.0.
data = pd.read_csv('/tmp/amazon_reviews_us_Camera_v1_00.tsv.gz', sep='\t',
                   compression='gzip', on_bad_lines='warn', dtype='str')
# Discard every row that has a missing value in any column.
data = data.dropna()

In [None]:
# Report the dataset's dimensions, then its column index, one per line.
print(data.shape, data.columns, sep='\n')

In [None]:
# Preview the first five rows of the loaded data.
data.head(n=5)

In [None]:
# Keep only the first 100,000 rows (by position) to limit processing time.
data = data.iloc[:100000]

In [None]:
# Only the rating and the review text are needed from here on.
data = data.loc[:, ['star_rating', 'review_body']]

In [None]:
# List the distinct rating values present in the data.
data['star_rating'].unique()

In [None]:
# Collapse the five star ratings into three sentiment labels:
# 1-2 stars -> negative, 3 stars -> neutral, 4-5 stars -> positive.
rating_to_label = {
    **dict.fromkeys(('1', '2'), '__label__negative__'),
    '3': '__label__neutral__',
    **dict.fromkeys(('4', '5'), '__label__positive__'),
}
data['label'] = data['star_rating'].map(rating_to_label)

In [None]:
# The rating has been converted to a label, so the original column can go.
data = data.drop(columns=['star_rating'])

In [None]:
# Put the label first so it precedes the review text when rows are written out.
data = data.loc[:, ['label', 'review_body']]

In [None]:
# Preview the first five labelled rows.
data.head(n=5)

In [None]:
import nltk

# `nltk.word_tokenize` relies on the Punkt sentence-tokenizer data. Recent NLTK
# releases (3.9+) load it from the 'punkt_tab' resource, while older releases use
# 'punkt'; fetching both keeps the tokenization cell working on either.
for resource in ('punkt', 'punkt_tab'):
    nltk.download(resource)

In [None]:
%%time
data['review_body'] = data['review_body'].apply(nltk.word_tokenize)

In [None]:
%%time
data['review_body'] = data.apply(lambda row: " ".join(row['review_body']).lower(), axis=1)

In [None]:
# Preview the first five rows after tokenization and lower-casing.
data.head(n=5)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 5% of the rows for validation; the remaining rows form the training set.
# No random_state is passed, so the split is not pinned to a fixed seed.
training, validation = train_test_split(data, test_size=0.05)

In [None]:
# Report the size of each split, training first.
print(training.shape, validation.shape, sep='\n')

In [None]:
# Write each split as plain text, one row per line: the label, a space (savetxt's
# default delimiter), then the review text. The encoding is pinned to UTF-8 so the
# files do not depend on the platform default when reviews contain non-ASCII text.
np.savetxt('/tmp/training.txt', training.values, fmt='%s', encoding='utf-8')
np.savetxt('/tmp/validation.txt', validation.values, fmt='%s', encoding='utf-8')

In [None]:
# Show the first five lines of the training file to check its format.
!head -5 /tmp/training.txt

## Training

In [None]:
import sagemaker

# Show the installed SageMaker Python SDK version.
print(sagemaker.__version__)

# Create the SDK session and look up its default S3 bucket; the cells below use
# this bucket for the input data and the training output.
session = sagemaker.Session()
bucket = session.default_bucket()

In [None]:
# Run this cell if you want to use the data you processed manually

prefix = 'amazon-reviews'

# Upload the two local files under <prefix>/input/ in the session bucket and
# keep the values returned by upload_data for the training channels.
s3_train_path = session.upload_data(
    path='/tmp/training.txt',
    bucket=bucket,
    key_prefix=f'{prefix}/input/train',
)
s3_val_path = session.upload_data(
    path='/tmp/validation.txt',
    bucket=bucket,
    key_prefix=f'{prefix}/input/validation',
)
# Location where the training job will write its output.
s3_output = f's3://{bucket}/{prefix}/output/'

print(s3_train_path, s3_val_path, s3_output, sep='\n')

In [None]:
# Run this cell if you want to use the data processed by SageMaker Processing

prefix = 'amazon-reviews'

# NOTE(review): the next two assignments have no right-hand side, so this cell is a
# SyntaxError as written. Fill in the S3 URIs of the training and validation data
# (presumably the outputs of the SageMaker Processing job) before running it.
s3_train_path = 
s3_val_path = 
# Location where the training job will write its output.
s3_output = 's3://{}/{}/output/'.format(bucket, prefix)

print(s3_train_path)
print(s3_val_path)
print(s3_output)

In [None]:
from sagemaker.image_uris import retrieve

# Look up the BlazingText container image for the region the session runs in.
region = session.boto_session.region_name
container = retrieve(framework='blazingtext', region=region)
print(container)

In [None]:
# Configure a training job that runs the BlazingText image on a single
# ml.c5.2xlarge instance and writes its output to s3_output.
bt = sagemaker.estimator.Estimator(
    image_uri=container,
    role=sagemaker.get_execution_role(),
    instance_count=1,
    instance_type='ml.c5.2xlarge',
    output_path=s3_output,
)

In [None]:
# Select supervised mode, i.e. train a text classifier on the labelled lines.
bt.set_hyperparameters(mode='supervised')

In [None]:
# Describe both datasets as plain-text training inputs.
train_data = sagemaker.TrainingInput(s3_data=s3_train_path, content_type='text/plain')
validation_data = sagemaker.TrainingInput(s3_data=s3_val_path, content_type='text/plain')

# Map each input to the channel name it is passed under.
s3_channels = dict(train=train_data, validation=validation_data)

In [None]:
# Launch the training job on the train and validation channels.
bt.fit(inputs=s3_channels)

In [None]:
# Deploy the trained model to an endpoint backed by one ml.t2.medium instance.
bt_predictor = bt.deploy(initial_instance_count=1, instance_type='ml.t2.medium')

In [None]:
import json
import pprint

sentences = ['This is a bad camera it doesnt work at all , i want a refund  .', 
             'The camera works , the pictures are decent quality, nothing special to say about it .',
             'Very happy to have bought this , exactly what I needed']

# The manually processed training text was lower-cased, so lower-case the
# samples too; otherwise capitalized tokens would not match the training data.
# "k": 3 is passed in the request configuration alongside the instances.
payload = {"instances": [sentence.lower() for sentence in sentences],
           "configuration": {"k": 3}}

# Send the request body as JSON.
bt_predictor.serializer = sagemaker.serializers.JSONSerializer()
response = bt_predictor.predict(payload)

# Decode the JSON response and pretty-print it rather than printing the raw
# response body.
pprint.pprint(json.loads(response))

In [None]:
# Delete the endpoint once the predictions are done so it does not keep running.
bt_predictor.delete_endpoint()