In [2]:
import sagemaker
import json
import boto3

# SageMaker bootstrap: execution role, session, and the session's region and
# default S3 bucket. These names are reused by later cells.
role = sagemaker.get_execution_role()
sess = sagemaker.Session()
region = sess.boto_region_name
bucket = sess.default_bucket()
# Base S3 key prefix; the (commented-out) test-data upload further down nests
# its keys under it.
prefix = 'sagemaker-studio-book/chapter07'

In [3]:
import numpy as np
import tensorflow as tf
import os

from tensorflow.keras.preprocessing import sequence
# Import the dataset helper from the public Keras namespace. `tensorflow.python.*`
# is TensorFlow's private implementation package and is not a supported import
# path; the public module exposes the same `load_data` / `get_word_index` API.
from tensorflow.keras.datasets import imdb

In [11]:
# Dataset sizing: `max_features` is passed to load_data as `num_words`, and
# `maxlen` is the length every test review is padded/truncated to.
max_features = 20000
maxlen = 400

# Load the IMDB reviews as sequences of integer word ids plus their labels.
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=max_features)
print(len(x_train), 'train sequences')
print(len(x_test), 'test sequences')

# Only the test split is padded here; x_train is left as loaded, which is why
# its printed shape below is one-dimensional.
x_test = sequence.pad_sequences(x_test, maxlen=maxlen)
print('x_train shape:', x_train.shape)
print('x_test shape:', x_test.shape)

# The disabled block below writes x_test to a local CSV and uploads that
# directory to S3 under `{prefix}/data/csv_test`. The batch-transform cell
# further down needs that data to exist in S3, so re-enable this block when
# running against a bucket that does not have it yet.
# NOTE(review): later cells read 'csv-test.csv.out', which does not match the
# 'test.csv' filename used here -- confirm which file was actually uploaded.
# csv_test_dir_prefix = 'imdb_data/test'
# csv_test_filename = 'test.csv'
# csv_test_dir = os.path.join(os.getcwd(), csv_test_dir_prefix)
# os.makedirs(csv_test_dir, exist_ok=True)

# np.savetxt(os.path.join(csv_test_dir, csv_test_filename), 
#            np.array(x_test, dtype=np.int32), fmt='%d', delimiter=",")

# test_data_s3prefix = f'{prefix}/data/csv_test'
# test_data_s3 = sagemaker.Session().upload_data(path=csv_test_dir, 
#                                                key_prefix=test_data_s3prefix)
# print(test_data_s3)

25000 train sequences
25000 test sequences
x_train shape: (25000,)
x_test shape: (25000, 400)


In [None]:
from sagemaker.tensorflow import TensorFlow

# Name of the existing training job whose estimator is re-attached below.
training_job_name = 'imdb-tf-2021-09-21-17-37-20'

# Rebuild the estimator object from that job instead of training again.
estimator = TensorFlow.attach(training_job_name=training_job_name)

In [None]:
from smexperiments.experiment import Experiment
from smexperiments.trial import Trial
from botocore.exceptions import ClientError
from time import gmtime, strftime

experiment_name = 'imdb-sentiment-analysis'

# A UTC timestamp (second resolution) makes the transform job name distinct per run.
exp_datetime = strftime('%Y-%m-%d-%H-%M-%S', gmtime())
jobname = f'imdb-tf-bt-{exp_datetime}'

# Load the existing trial that carries the training job's name (no new trial
# is created here), so the transform job is recorded alongside it.
exp_trial = Trial.load(trial_name=training_job_name)

# Passed to transformer.transform() to attach the job to the experiment/trial.
experiment_config = {
    'ExperimentName': experiment_name,
    'TrialName': exp_trial.trial_name,
    'TrialComponentDisplayName': 'Inference-BatchTransform',
}

In [None]:
# S3 location of the CSV test set. `test_data_s3` used to be assigned only by
# the (disabled) upload step in the data-loading cell, so this cell raised
# NameError on a fresh kernel. The URI below has the same form that
# `upload_data` returns for a directory upload to the default bucket.
# NOTE(review): assumes the CSV already exists under this prefix -- confirm,
# or re-enable the upload step first.
test_data_s3 = f's3://{bucket}/{prefix}/data/csv_test'

# Build a batch transformer from the attached estimator's model.
transformer = estimator.transformer(instance_count=1,
                                    instance_type='ml.c5.xlarge',
                                    max_payload=6,          # request size cap, in MB
                                    accept='text/csv',
                                    assemble_with='Line')   # join responses line by line

# Send the CSV to the model, splitting the input on newlines, and record the
# job under the experiment/trial configured earlier.
transformer.transform(test_data_s3,
                      content_type='text/csv',
                      split_type='Line',
                      job_name=jobname,
                      experiment_config=experiment_config)
print('Waiting for transform job: ' + transformer.latest_transform_job.job_name)

In [5]:
# Local directory that holds the downloaded batch-transform output.
# The commented-out commands are the download step (transformer output in S3
# -> `output_prefix`). NOTE(review): with them disabled, the '.out' file read
# by the next cell must already exist locally.
# output = transformer.output_path
output_prefix = 'imdb_data/test_output'
# !mkdir -p {output_prefix}
# !aws s3 cp --recursive {output} {output_prefix}
# !head {output_prefix}/csv-test.csv.out

In [6]:
def _iter_json_documents(text):
    """Yield each top-level JSON value found in ``text``, in order.

    A batch-transform ``.out`` file assembled from several mini-batches can
    contain more than one JSON document back to back. ``json.load`` accepts
    exactly one document and raises "Extra data" on anything after it, so the
    documents are decoded one at a time instead.
    """
    decoder = json.JSONDecoder()
    pos, end = 0, len(text)
    while pos < end:
        if text[pos].isspace():
            # raw_decode does not skip leading whitespace on its own.
            pos += 1
            continue
        document, pos = decoder.raw_decode(text, pos)
        yield document


with open(f'{output_prefix}/csv-test.csv.out', 'r') as f:
    raw_output = f.read()

documents = list(_iter_json_documents(raw_output))
if not documents:
    # Keep failing loudly on an empty file, as json.load did.
    raise ValueError(f'No JSON content found in {output_prefix}/csv-test.csv.out')

# Merge all documents into the single {'predictions': [...]} shape that a
# one-document file would have produced.
json_output = {'predictions': [row
                               for document in documents
                               for row in document['predictions']]}

# Flatten the per-record score lists and round each score to 3 decimals.
results = [float('%.3f' % item)
           for sublist in json_output['predictions']
           for item in sublist]
print(results)

[0.004, 1.0, 1.0, 0.4, 1.0, 1.0, 0.164, 0.101, 0.793, 1.0, 1.0, 0.0, 0.0, 0.4, 1.0, 0.0, 1.0, 0.4, 0.0, 0.0, 1.0, 1.0, 0.037, 1.0, 0.476, 1.0, 0.008, 0.476, 1.0, 0.0, 1.0, 0.4, 0.4, 0.0, 0.0, 0.0, 1.0, 1.0, 0.228, 0.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.4, 0.0, 0.4, 1.0, 1.0, 0.824, 0.4, 0.4, 1.0, 0.0, 0.0, 0.4, 0.0, 1.0, 0.0, 0.0, 1.0, 0.4, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.4, 0.0, 1.0, 0.4, 0.0, 0.991, 0.4, 1.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.4, 0.023, 0.784, 0.0, 1.0, 1.0, 0.0, 1.0, 1.0, 0.015, 0.0, 0.4, 0.4, 0.0, 0.0, 0.0, 0.015, 0.009, 0.058, 0.4, 1.0, 1.0, 0.0, 0.4, 0.027, 0.149, 0.4, 0.713, 0.0, 1.0, 1.0, 0.0, 0.4, 0.0, 1.0, 1.0, 0.988, 1.0, 0.0, 0.4, 0.0, 0.84, 0.022, 0.0, 1.0, 1.0, 0.0, 0.4, 0.0, 0.0, 0.008, 1.0, 0.0, 1.0, 0.0, 0.4, 0.4, 0.0, 0.003, 1.0, 0.0, 0.0, 0.0, 0.4, 1.0, 0.999, 1.0, 0.867, 1.0, 0.008, 1.0, 0.0, 0.002, 0.4, 0.0, 1.0, 0.0, 0.4, 0.4, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 1.0, 0.032, 0.4, 0.0, 0.4, 0.4, 0.0, 0.0, 0.4, 0.0, 1.0, 0.0, 0.0, 0.4, 

In [7]:
# Number of scores parsed from the transform output (shown as the cell result).
len(results)

1000

In [8]:
def get_sentiment(score, threshold=0.5):
    """Map a numeric score to a sentiment label.

    Args:
        score: Value to classify; anything comparable with ``threshold``
            (the notebook passes both predicted scores and ``y_test`` labels).
        threshold: Cut-off above which the score counts as positive.
            Defaults to 0.5, the value that was previously hard-coded.

    Returns:
        'positive' if ``score`` is strictly greater than ``threshold``,
        otherwise 'negative' (a score equal to the threshold is negative).
    """
    return 'positive' if score > threshold else 'negative'

In [9]:
import re

# Matches the run of '?' and whitespace characters at the very start of a string.
regex = re.compile(r'^[\?\s]+')
# Vocabulary mapping from the IMDB dataset helper; it is inverted (word <-> id)
# in the decoding cell below.
word_index = imdb.get_word_index()

In [12]:
# Position of the review to inspect.
data_index = 199

# Invert the vocabulary so integer ids can be looked up as words.
reverse_word_index = {index: word for word, index in word_index.items()}

# Token ids are shifted by 3 relative to word_index; ids with no vocabulary
# entry are rendered as '?'.
first_decoded_review = ' '.join(
    reverse_word_index.get(token_id - 3, '?') for token_id in x_test[data_index]
)

# Drop the leading run of '?'/whitespace before displaying the text.
regex.sub('', first_decoded_review)

"i watched this movie purely for the setting it was filmed in an old hotel that a friend owns shares of the plot was predictable the acting was ? at best the scares were all gross outs not true scares br br i don't remember much of the plot and i think that's because there wasn't much of one to remember they didn't even use the hotel to it's fullest potential the beaches are fantastic and the hotel is situated on a ? at low tide you can walk almost 1 4 mile into the bay which is actually an eerie sight first thing in the morning or late at night when the wind is howling through the cracks br br the best way to see this movie is with the remote in your hand so you can fast forward through the action and i'm using that term ? scenes and pause at the beauty of the surroundings"

In [13]:
# Compare the ground-truth label with the model's score for the same review.
# NOTE(review): assumes results[i] corresponds to x_test[i] / y_test[i] --
# confirm the transform input preserved the x_test row order.
print(f'Labeled sentiment for this review is {get_sentiment(y_test[data_index])}')
print(f'Predicted sentiment is {get_sentiment(results[data_index])}')

Labeled sentiment for this review is negative
Predicted sentiment is negative
