## Inspecting and processing data manually

In [None]:
%%sh
# Copy the gzipped Camera reviews TSV from S3 into /tmp and unpack it there.
# Abort on the first failing command so a failed download is not followed by
# a misleading gzip error.
set -e
aws s3 cp s3://amazon-reviews-pds/tsv/amazon_reviews_us_Camera_v1_00.tsv.gz /tmp
# -f overwrites a .tsv left over from a previous run, so the cell can be re-run.
gzip -df /tmp/amazon_reviews_us_Camera_v1_00.tsv.gz

In [None]:
import pandas as pd

In [None]:
# Load the raw reviews with every column read as a string, skipping malformed rows.
# `error_bad_lines` was deprecated in pandas 1.3 and removed in 2.0 in favour of
# `on_bad_lines`; 'warn' matches the old behaviour (skip the row, report it).
# On older pandas the unknown keyword raises TypeError before the file is read,
# so fall back to the legacy keyword there.
_read_options = dict(sep='\t', dtype='str')
try:
    data = pd.read_csv('/tmp/amazon_reviews_us_Camera_v1_00.tsv', on_bad_lines='warn', **_read_options)
except TypeError:
    data = pd.read_csv('/tmp/amazon_reviews_us_Camera_v1_00.tsv', error_bad_lines=False, **_read_options)

In [None]:
# Preview the first rows to check that the file was parsed into the expected columns.
data.head()

In [None]:
# Discard every row that has a missing value in any column, then report
# how many rows and columns are left.
data = data.dropna(axis=0, how='any')
print(data.shape)

In [None]:
# Keep only the first 1,000 rows so the remaining manual steps run quickly.
data = data.head(1000)

In [None]:
# Remove the metadata columns that are not needed further on; the cells below
# only work with 'review_body'.
unused_columns = [
    'marketplace', 'customer_id', 'review_id', 'product_id', 'product_parent',
    'product_title', 'product_category', 'helpful_votes', 'total_votes', 'vine',
    'verified_purchase', 'review_headline', 'review_date', 'star_rating',
]
data = data.drop(columns=unused_columns)

In [None]:
# Check which columns remain after the drop.
data.head()

In [None]:
import nltk

# nltk.word_tokenize relies on the Punkt sentence tokenizer models. Recent NLTK
# releases load them from the 'punkt_tab' package instead of the pickled 'punkt'
# one, so fetch both to keep the tokenization cell working across NLTK versions.
for resource in ('punkt', 'punkt_tab'):
    nltk.download(resource)

In [None]:
%%time
data['review_body'] = data['review_body'].apply(nltk.word_tokenize)
data['review_body'] = data.apply(lambda row: " ".join(row['review_body']).lower(), axis=1)

In [None]:
# Inspect the tokenized, lower-cased review text.
data.head()

In [None]:
import numpy as np

# Write the frame to a plain-text file, one row per line ('%s' formats each value
# as-is). The encoding is pinned to UTF-8 so the output does not depend on the
# locale of the machine running the notebook and non-ASCII review text is kept.
np.savetxt('/tmp/training.txt', data.values, fmt='%s', encoding='utf-8')

In [None]:
!head -5 /tmp/training.txt

## Training

In [None]:
import boto3
import sagemaker

# SageMaker session reused below for the S3 upload and for the estimator,
# plus its default bucket, which holds the training input and output.
session = sagemaker.Session()
bucket = session.default_bucket()

In [None]:
# Run this cell if you want to use the data you processed manually.
# It uploads the local training file to the default bucket and derives the
# S3 location for the training output.

prefix = 'amazon-reviews-word2vec'
train_key_prefix = prefix + '/input/train'

s3_train_path = session.upload_data(
    path='/tmp/training.txt',
    bucket=bucket,
    key_prefix=train_key_prefix,
)
s3_output = 's3://{}/{}/output/'.format(bucket, prefix)

# Print both locations, one per line.
for s3_location in (s3_train_path, s3_output):
    print(s3_location)

In [None]:
# Run this cell if you want to use the data processed by SageMaker Processing.
# It sets the same two variables as the manual-upload cell, so when both cells
# are run the values assigned here are the ones used for training.
# NOTE(review): the training path is hard-coded to one specific bucket and
# processing-job output; replace it with the output location of your own job.

prefix = 'amazon-reviews-word2vec'

s3_train_path = 's3://sagemaker-eu-west-1-613904931467/sagemaker-scikit-learn-2020-05-09-09-23-47-787/output/train_data'
s3_output = 's3://{}/{}/output/'.format(bucket, prefix)

# Print both locations, one per line.
for s3_location in (s3_train_path, s3_output):
    print(s3_location)

In [None]:
from sagemaker.amazon.amazon_estimator import get_image_uri

# Look up the "blazingtext" container image for the region configured on the
# default boto3 session.
# NOTE(review): get_image_uri appears to be the SageMaker SDK v1 helper; newer
# SDKs favour sagemaker.image_uris.retrieve. Confirm against the installed SDK.
boto_session = boto3.Session()
region_name = boto_session.region_name
container = get_image_uri(region_name, "blazingtext", "latest")
print(container)

In [None]:
# IAM role passed to the estimator for the training job.
role = sagemaker.get_execution_role()

# Training infrastructure and output settings, kept together so they are easy to tune.
estimator_settings = dict(
    train_instance_count=1,
    train_instance_type='ml.p3.2xlarge',
    output_path=s3_output,
    sagemaker_session=session,
)

# Generic estimator wrapping the container image resolved earlier.
bt = sagemaker.estimator.Estimator(container, role, **estimator_settings)

In [None]:
# Select the 'skipgram' training mode; every other hyperparameter is left unset here.
bt.set_hyperparameters(mode='skipgram')

In [None]:
from sagemaker.session import s3_input

# Options for the training input: plain-text objects under an S3 prefix,
# requested with the 'FullyReplicated' distribution.
channel_options = dict(
    distribution='FullyReplicated',
    content_type='text/plain',
    s3_data_type='S3Prefix',
)
train_data = s3_input(s3_train_path, **channel_options)

# Single-channel mapping passed to the estimator's fit().
s3_channels = dict(train=train_data)

In [None]:
# Start training with the 'train' channel defined above; the result is written
# under the estimator's output_path.
bt.fit(inputs=s3_channels)