In [1]:
import sagemaker
from sagemaker import get_execution_role
import json
import boto3
import pandas as pd
# Session object reused below for the S3 uploads and passed to the estimator.
sess = sagemaker.Session()

# IAM role that SageMaker uses to access AWS resources (S3, CloudWatch) on your behalf.
role = get_execution_role()
print(role)

# S3 bucket for the data and model output. Replace with your own bucket name,
# or use sess.default_bucket() instead of a hard-coded name.
bucket = 'qanv-aws-1'
print(bucket)

# Key prefix under which the data is stored in the bucket; replace if needed
# (a previous value was 'blazingtext/supervised').
prefix = 'nlp/supervised'

arn:aws:iam::023375022819:role/service-role/AmazonSageMaker-ExecutionRole-20181029T121824
qanv-aws-1


In [9]:
!mkdir Data

In [2]:
!wget https://s3.amazonaws.com/fast-ai-nlp/amazon_review_polarity_csv.tgz

--2019-03-13 00:46:13--  https://s3.amazonaws.com/fast-ai-nlp/amazon_review_polarity_csv.tgz
Resolving s3.amazonaws.com (s3.amazonaws.com)... 52.216.233.13
Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.216.233.13|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 688339454 (656M) [application/x-tar]
Saving to: ‘amazon_review_polarity_csv.tgz.1’


2019-03-13 00:46:26 (49.7 MB/s) - ‘amazon_review_polarity_csv.tgz.1’ saved [688339454/688339454]



In [3]:
 
# Unpack the downloaded archive into amazon_review_polarity_csv/ (-v lists each extracted file).
!tar -xvzf amazon_review_polarity_csv.tgz
# Disabled: load the training split into a DataFrame for inspection.
#df = pd.read_csv("amazon_review_polarity_csv/train.csv", names=["Label", "Title", "Review"])

amazon_review_polarity_csv/
amazon_review_polarity_csv/train.csv
amazon_review_polarity_csv/readme.txt
amazon_review_polarity_csv/test.csv


In [2]:
# The CSV has no header row, so the column names are supplied explicitly.
test_df = pd.read_csv("amazon_review_polarity_csv/test.csv", names=["Label", "Title", "Review"])

In [3]:
# Class balance of the test split: number of rows per label value.
test_df.Label.value_counts()

2    200000
1    200000
Name: Label, dtype: int64

In [4]:
# Column dtypes as inferred by pandas when reading the CSV.
test_df.dtypes

Label      int64
Title     object
Review    object
dtype: object

In [5]:
# Eyeball 10 randomly chosen rows (no random_state is set, so the rows differ between runs).
test_df.sample(10)


Unnamed: 0,Label,Title,Review
149518,1,Poor Gilbert :-(,Poor Gilbert! He must have turned over in his ...
90156,1,too much of bad things can't be good,Usually I am able to fly through books by Larr...
32609,2,This is one nice cd.,Nas has done it again. This cd is all that. It...
175748,1,dont buy,This is trash I dont know who rates this devic...
221435,1,Not for technical use.,"If you want read guitar amp appreciation, this..."
316393,2,Written from the heart...,Mr. Sapoznik not only does an enviable job of ...
175026,1,Functional but not made to withstand travel,Its silly that this bag is so flimsy and not m...
309768,1,One of the worst....,I like her aerobics classes on TV but this boo...
71884,2,The Beasties have done it again!,"Wow, what a good CD! I really like the Beastie..."
143618,1,Sorry I bought it -- a one-time read at most,I saw several good reviews of this book but I ...


In [6]:
from random import shuffle
import multiprocessing
from multiprocessing import Pool
import csv
import nltk
# Download the 'punkt' tokenizer data; nltk.word_tokenize (used in transform_instance) needs it.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/ec2-user/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [7]:
# Maps the CSV's label column to a class name. The keys are strings because
# csv.reader yields every field as a string.
index_to_label = {'1': 'Negative', '2': 'Positive'}

In [8]:
def transform_instance(row):
    """Turn one CSV row into a list: a ``__label__`` tag followed by tokens.

    ``row[0]`` is looked up in ``index_to_label``; ``row[1]`` and ``row[2]``
    (title and review text in the polarity CSV) are lower-cased and tokenized
    with ``nltk.word_tokenize``.

    Returns:
        ``["__label__<Name>", <title tokens...>, <review tokens...>]``.
    """
    tagged_label = "__label__" + index_to_label[row[0]]
    title_tokens = nltk.word_tokenize(row[1].lower())
    review_tokens = nltk.word_tokenize(row[2].lower())
    return [tagged_label] + title_tokens + review_tokens

In [9]:
def preprocess(input_file, output_file, keep=1):
    """Shuffle, subsample and tokenize a CSV file into a labelled text file.

    Every CSV row is passed to ``transform_instance`` in a process pool, and
    the resulting token lists are written one per line, space-separated.

    Args:
        input_file: Path of the CSV file to read.
        output_file: Path of the text file to write (overwritten if present).
        keep: Fraction of the shuffled rows to keep; the first
            ``int(keep * row_count)`` rows are used, so 1 keeps everything.
    """
    # newline='' is what the csv module asks for, so that line breaks inside
    # quoted fields are parsed correctly; the explicit encoding avoids
    # depending on the platform default.
    with open(input_file, 'r', newline='', encoding='utf-8') as csvinfile:
        all_rows = list(csv.reader(csvinfile, delimiter=','))

    shuffle(all_rows)
    all_rows = all_rows[:int(keep * len(all_rows))]

    # The context manager tears the workers down even if a row fails to
    # transform, instead of leaking the pool.
    with Pool(processes=multiprocessing.cpu_count()) as pool:
        transformed_rows = pool.map(transform_instance, all_rows)

    with open(output_file, 'w', newline='', encoding='utf-8') as csvoutfile:
        csv_writer = csv.writer(csvoutfile, delimiter=' ', lineterminator='\n')
        csv_writer.writerows(transformed_rows)

In [10]:
%%timeit
# Preparing the training dataset

# Since preprocessing the whole dataset might take a couple of mintutes,
# we keep 20% of the training dataset for this demo.
# Set keep to 1 if you want to use the complete dataset
preprocess('amazon_review_polarity_csv/train.csv', 'polarity.train', keep=.2)
        
# Preparing the validation dataset        
preprocess('amazon_review_polarity_csv/test.csv', 'polarity.validation')

1min 27s ± 1.88 s per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [11]:
%%time

# S3 key prefixes for the two data channels.
train_channel = prefix + '/train'
validation_channel = prefix + '/validation'

# Upload the preprocessed files under their channel prefixes.
sess.upload_data(path='polarity.train', bucket=bucket, key_prefix=train_channel)
sess.upload_data(path='polarity.validation', bucket=bucket, key_prefix=validation_channel)

# S3 URIs of the uploaded data, plus the location later given to the estimator as output_path.
s3_train_data = 's3://{}/{}'.format(bucket, train_channel)
s3_validation_data = 's3://{}/{}'.format(bucket, validation_channel)
s3_output_location = 's3://{}/{}/output'.format(bucket, prefix)

CPU times: user 4.56 s, sys: 1.25 s, total: 5.81 s
Wall time: 15.5 s


In [12]:
# Look up the BlazingText container image URI ("latest" tag) for the region of the current boto3 session.
region_name = boto3.Session().region_name
container = sagemaker.amazon.amazon_estimator.get_image_uri(region_name, "blazingtext", "latest")
print('Using SageMaker BlazingText container: {} ({})'.format(container, region_name))


Using SageMaker BlazingText container: 811284229777.dkr.ecr.us-east-1.amazonaws.com/blazingtext:latest (us-east-1)


In [13]:
# Estimator that runs the BlazingText container on one ml.c4.4xlarge instance and
# writes its output under s3_output_location; job names start with 'qanv'.
# NOTE(review): in the SageMaker Python SDK v1, train_volume_size is presumably in GB and
# train_max_run in seconds (360000 s = 100 h) — confirm against the installed SDK version.
bt_model = sagemaker.estimator.Estimator(container,
                                         role, 
                                         base_job_name = 'qanv',
                                         train_instance_count=1, 
                                         train_instance_type='ml.c4.4xlarge',
                                         train_volume_size = 30,
                                         train_max_run = 360000,
                                         input_mode= 'File',
                                         output_path=s3_output_location,
                                         sagemaker_session=sess)
# Hyperparameters for the training job: supervised mode, with early stopping enabled.
bt_model.set_hyperparameters(mode="supervised",
                            epochs=10,
                            min_count=2,
                            learning_rate=0.05,
                            vector_dim=10,
                            early_stopping=True,
                            patience=4,
                            min_epochs=5,
                            word_ngrams=2)


In [14]:
# Describe the two inputs: plain-text objects under each S3 prefix, with
# distribution='FullyReplicated'.
train_data = sagemaker.session.s3_input(s3_train_data, distribution='FullyReplicated', 
                        content_type='text/plain', s3_data_type='S3Prefix')
validation_data = sagemaker.session.s3_input(s3_validation_data, distribution='FullyReplicated', 
                             content_type='text/plain', s3_data_type='S3Prefix')
# Channel name -> input description, as passed to fit().
data_channels = {'train': train_data, 'validation': validation_data}

In [15]:
# Launch the training job on the two channels and stream its logs into the notebook.
bt_model.fit(inputs=data_channels, logs=True)

INFO:sagemaker:Creating training-job with name: qanv-2019-03-13-15-30-42-360


2019-03-13 15:30:42 Starting - Starting the training job...
2019-03-13 15:30:44 Starting - Launching requested ML instances......
2019-03-13 15:31:51 Starting - Preparing the instances for training......
2019-03-13 15:32:59 Downloading - Downloading input data
2019-03-13 15:32:59 Training - Training image download completed. Training in progress.
[31mArguments: train[0m
[31m[03/13/2019 15:32:59 INFO 139623346853696] nvidia-smi took: 0.0252201557159 secs to identify 0 gpus[0m
[31m[03/13/2019 15:32:59 INFO 139623346853696] Running single machine CPU BlazingText training using supervised mode.[0m
[31m[03/13/2019 15:32:59 INFO 139623346853696] Processing /opt/ml/input/data/train/polarity.train . File size: 316 MB[0m
[31m[03/13/2019 15:32:59 INFO 139623346853696] Processing /opt/ml/input/data/validation/polarity.validation . File size: 176 MB[0m
[31mRead 10M words[0m
[31mRead 20M words[0m
[31mRead 30M words[0m
[31mRead 40M words[0m
[31mRead 50M words[0m
[31mRead 60M wor

In [16]:
# Deploy the trained model to an endpoint backed by a single ml.m4.xlarge instance.
text_classifier = bt_model.deploy(initial_instance_count = 1,instance_type = 'ml.m4.xlarge')

INFO:sagemaker:Creating model with name: blazingtext-2019-03-13-15-36-04-081
INFO:sagemaker:Creating endpoint with name qanv-2019-03-13-15-30-42-360


---------------------------------------------------------------------------!

In [19]:
sentences = ["this python book is easy to use and it has number of useful examples."]
#sentences = ["this book is very bad and full of errors."]

# Apply the same preprocessing that transform_instance applied to the training
# data: lower-case first, then nltk tokenization. Without .lower(), mixed-case
# input would yield tokens that never appeared in the lower-cased training text.
tokenized_sentences = [' '.join(nltk.word_tokenize(sent.lower())) for sent in sentences]

# Request body: the tokenized sentences under the "instances" key.
payload = {"instances" : tokenized_sentences}

response = text_classifier.predict(json.dumps(payload))

# The response is JSON text; parse it and pretty-print the predicted labels and probabilities.
predictions = json.loads(response)
print(json.dumps(predictions, indent=2))

[
  {
    "prob": [
      0.9999558925628662
    ],
    "label": [
      "__label__Positive"
    ]
  }
]


In [None]:
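# Uncomment to delete the endpoint once you are done with predictions; it stays deployed until deleted.
#sess.delete_endpoint(text_classifier.endpoint)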

## unsupervised