In [None]:
%%sh
pip -q install spacy
python -m spacy download en_core_web_sm
python -m spacy validate

## Inspecting and processing data manually

In [None]:
%%sh
# Copy the gzipped Camera reviews TSV from the amazon-reviews-pds bucket into /tmp.
SRC=s3://amazon-reviews-pds/tsv/amazon_reviews_us_Camera_v1_00.tsv.gz
aws s3 cp "$SRC" /tmp

In [None]:
import pandas as pd

In [None]:
# Read the gzipped, tab-separated review dump, keeping every column as a string.
# `error_bad_lines` was deprecated in pandas 1.3 and removed in 2.0;
# `on_bad_lines='warn'` keeps the same behaviour (malformed rows are dropped and
# each one is reported).
data = pd.read_csv(
    '/tmp/amazon_reviews_us_Camera_v1_00.tsv.gz',
    sep='\t',
    compression='gzip',
    on_bad_lines='warn',
    dtype='str',
)

In [None]:
# Preview the first five rows of the freshly loaded table.
data.head(n=5)

In [None]:
# Discard every row that has at least one missing field, then report the remaining size.
data = data.dropna(axis=0, how='any')
print(data.shape)

In [None]:
# Keep only the first 100,000 rows.
data = data.head(100000)

In [None]:
# Only the free-text review column is kept from here on.
review_columns = ['review_body']
data = data[review_columns]

In [None]:
# Check the single-column frame before tokenizing.
data.head(n=5)

In [None]:
import spacy

# Load the en_core_web_sm pipeline; tokenize() below uses only its tokenizer.
spacy_nlp = spacy.load(name='en_core_web_sm')

def tokenize(text):
    """Tokenize ``text`` with the spaCy tokenizer.

    Returns the token texts joined by single spaces and lower-cased.
    """
    words = (token.text for token in spacy_nlp.tokenizer(text))
    return " ".join(words).lower()

In [None]:
%%time
data['review_body'] = data['review_body'].apply(tokenize)

In [None]:
# Inspect a few tokenized reviews.
data.head(n=5)

In [None]:
import numpy as np

# Write one tokenized review per line. The encoding is pinned to UTF-8 so that
# non-ASCII characters in reviews are written the same way regardless of the
# locale of the machine running the notebook.
np.savetxt('/tmp/training.txt', data.values, fmt='%s', encoding='utf-8')

In [None]:
!head -n 5 /tmp/training.txt

## Training

In [None]:
import sagemaker

# Show which SageMaker SDK release is installed.
print(sagemaker.__version__)

# Session object and the bucket it reports as its default; both are reused by later cells.
session = sagemaker.session.Session()
bucket = session.default_bucket()

In [None]:
# Run this cell if you want to use the data you processed manually

prefix = 'amazon-reviews-word2vec'

# Upload the locally written training file under <prefix>/input/train in the bucket.
train_key_prefix = prefix+'/input/train'
s3_train_path = session.upload_data(
    path='/tmp/training.txt',
    bucket=bucket,
    key_prefix=train_key_prefix)

# S3 location later passed to the estimator as its output path.
s3_output = 's3://{}/{}/output/'.format(bucket, prefix)

print(s3_train_path, s3_output, sep='\n')

In [None]:
# Run this cell if you want to use the data processed by SageMaker Processing

prefix = 'amazon-reviews-word2vec'

# Set this to the S3 URI of the training data written by the SageMaker Processing job.
# The original cell left the assignment empty, which is a SyntaxError and stops
# "Restart & Run All". While this stays None, an s3_train_path defined by an
# earlier cell is kept unchanged.
processing_output_s3_uri = None

if processing_output_s3_uri is not None:
    s3_train_path = processing_output_s3_uri
elif 's3_train_path' not in globals():
    # No earlier cell defined a training path either: fail with a clear message.
    raise ValueError(
        'Set processing_output_s3_uri to the S3 URI of the processed training data')

s3_output = 's3://{}/{}/output/'.format(bucket, prefix)

print(s3_train_path)
print(s3_output)

In [None]:
from sagemaker import image_uris

# Look up the 'blazingtext' image URI for the region of the current session.
region = session.boto_session.region_name
container = image_uris.retrieve(framework='blazingtext', region=region)
print(container)

In [None]:
# Fetch the execution role once and reuse it; the original looked it up twice
# and never used the `role` variable.
role = sagemaker.get_execution_role()

# Single-instance estimator for the image retrieved above; results go under s3_output.
bt = sagemaker.estimator.Estimator(
    image_uri=container,
    role=role,
    instance_count=1,
    instance_type='ml.p3.2xlarge',
    output_path=s3_output)

In [None]:
# Hyperparameters for the training job: skipgram mode with subwords enabled.
bt_hyperparameters = {'mode': 'skipgram', 'subwords': True}
bt.set_hyperparameters(**bt_hyperparameters)

In [None]:
# Describe the training data as plain text and expose it as the 'train' channel.
train_data = sagemaker.TrainingInput(s3_data=s3_train_path, content_type='text/plain')
s3_channels = dict(train=train_data)

In [None]:
# Start the training job with the channel mapping built above.
bt.fit(s3_channels)

In [None]:
%%bash -s "$s3_output"
# $1 is the s3_output value passed in on the magic line. It is quoted so the URI
# reaches the CLI as a single argument even if it contains spaces or glob characters.
aws s3 ls --recursive "$1"