# Building a text classification model on the Amazon Reviews dataset
1. Inspect and process data with pandas and nltk
2. Store engineered features in Amazon SageMaker Feature Store (offline and online)
3. Build a dataset from the offline feature store with Amazon Athena
4. Train and deploy a classification model with Amazon SageMaker and BlazingText
5. Predict a few samples
6. Clean up

# 1 - Inspect and process data

In [None]:
import pandas as pd
import numpy as np
import time
from time import gmtime, strftime

In [None]:
# S3 URI of the tab-separated feature file that the next cell loads with pandas.
# NOTE(review): the bucket and key look specific to one AWS account and one
# processing job run — point this at your own fs_data.tsv before running.
fs_training_output_path = 's3://sagemaker-us-east-1-613904931467/sagemaker-scikit-learn-2021-07-05-07-54-15-145/output/fs_data/fs_data.tsv'

In [None]:
# Load the tab-separated feature file into a DataFrame, reading every column
# as a string. Malformed rows are skipped (with a warning) instead of aborting
# the load.
try:
    # pandas >= 1.3: `on_bad_lines` replaces `error_bad_lines`/`warn_bad_lines`
    # (both removed in pandas 2.0). 'warn' matches the old
    # error_bad_lines=False behaviour with its default warn_bad_lines=True.
    data = pd.read_csv(fs_training_output_path, sep='\t',
                       on_bad_lines='warn', dtype='str')
except TypeError:
    # Older pandas does not know `on_bad_lines`; fall back to the legacy flag.
    data = pd.read_csv(fs_training_output_path, sep='\t',
                       error_bad_lines=False, dtype='str')

In [None]:
# Preview the first rows of the freshly loaded DataFrame.
data.head()

# 2 - Create Feature Group and load data

In [None]:
import boto3, sagemaker 
from sagemaker.session import Session

# Default SageMaker session; its region is reused for every client below.
sagemaker_session = Session()
region = sagemaker_session.boto_region_name
boto_session = boto3.Session(region_name=region)
# IAM role of the current execution environment, passed to Feature Store and training.
role = sagemaker.get_execution_role()

# Session default bucket plus the key prefix used for the offline feature store.
default_bucket = sagemaker_session.default_bucket()
prefix = 'amazon-reviews-featurestore'

# Two boto3 clients from the same boto session: the SageMaker API and the
# Feature Store runtime API.
sagemaker_client = boto_session.client('sagemaker', region_name=region)
featurestore_runtime = boto_session.client(
    'sagemaker-featurestore-runtime',
    region_name=region,
)

# Dedicated SageMaker session wired to both clients; used for all
# FeatureGroup objects in this notebook.
feature_store_session = Session(
    boto_session=boto_session,
    sagemaker_client=sagemaker_client,
    sagemaker_featurestore_runtime_client=featurestore_runtime,
)

### Define the feature group name

In [None]:
from sagemaker.feature_store.feature_group import FeatureGroup

# Suffix the group name with the current UTC day-hour-minute-second.
creation_stamp = strftime('%d-%H-%M-%S', gmtime())
feature_group_name = f'amazon-reviews-feature-group-{creation_stamp}'

# Handle on the (not yet created) feature group, bound to the Feature Store session.
feature_group = FeatureGroup(
    name=feature_group_name,
    sagemaker_session=feature_store_session,
)

### Define the name of the column storing the unique record id (i.e. the primary key)

In [None]:
# Column passed as record_identifier_name when the feature group is created.
record_identifier_feature_name = 'review_id'

### Add a column to store feature timestamps

In [None]:
# Name of the timestamp column; the same name is later passed to
# feature_group.create() as event_time_feature_name.
event_time_feature_name = 'event_time'
# One timestamp (Unix epoch, rounded to whole seconds) shared by every record.
current_time_sec = int(round(time.time()))

# Add the timestamp column under the name held in event_time_feature_name,
# so the column and the name given to Feature Store cannot drift apart.
data = data.assign(**{event_time_feature_name: current_time_sec})

In [None]:
# Show every row that has a missing value in any column, which includes the
# timestamp column. An empty result means nothing is missing.
# NaN timestamps will cause ingestion errors.

data.loc[data.isnull().any(axis='columns')]

In [None]:
# Preview the first rows again, now including the event_time column.
data.head()

### Infer feature definitions from the pandas dataframe

In [None]:
# Give every column an explicit pandas dtype before the feature definitions
# are loaded from the DataFrame in the next cell.
for text_column in ('review_id', 'product_id', 'review_body', 'label'):
    data[text_column] = data[text_column].astype('str').astype('string')

data['star_rating'] = data['star_rating'].astype('int64')
data['event_time'] = data['event_time'].astype('float64')
# NOTE(review): event_time already holds Unix epoch seconds (hence float64).
# The string-typed alternative is presumably an ISO-8601 timestamp rather than
# "UNIX" format — confirm against the Feature Store documentation.

In [None]:
# Derive the feature definitions from the DataFrame, whose column dtypes were
# set explicitly in the previous cell.
feature_group.load_feature_definitions(data_frame=data)

### Create the feature group

In [None]:
# S3 location handed to Feature Store as s3_uri for this feature group.
offline_store_uri = f's3://{default_bucket}/{prefix}'

# Descriptive tags attached to the feature group.
feature_group_tags = [
    {'Key': 'Dataset', 'Value': 'amazon customer reviews'},
    {'Key': 'Subset', 'Value': 'cameras'},
    {'Key': 'Owner', 'Value': 'Julien Simon'},
]

# Create the feature group with the online store enabled, keyed on the record
# identifier and event-time columns defined earlier.
feature_group.create(
    s3_uri=offline_store_uri,
    record_identifier_name=record_identifier_feature_name,
    event_time_feature_name=event_time_feature_name,
    role_arn=role,
    enable_online_store=True,
    description="1.8M+ tokenized camera reviews from the Amazon Customer Reviews dataset",
    tags=feature_group_tags,
)

In [None]:
from time import sleep
import sys

# Poll once per second while the feature group is still being created,
# printing a dot per poll as a progress indicator.
# boto3 doesn't have waiters for Feature Store
# Please +1 this issue on GitHub https://github.com/boto/boto3/issues/2788
description = feature_group.describe()
while description.get("FeatureGroupStatus") == 'Creating':
    sys.stdout.write('.')
    sleep(1)
    description = feature_group.describe()

# Any status other than 'Created' (e.g. 'CreateFailed') is terminal here:
# fail loudly instead of looping forever as the previous
# `while status != 'Created'` loop did.
status = description.get("FeatureGroupStatus")
if status != 'Created':
    raise RuntimeError(
        'Feature group {} ended in status {!r}: {}'.format(
            feature_group.name,
            status,
            description.get('FailureReason', 'no failure reason reported'),
        )
    )

### Ingest features into our feature group, directly from the pandas dataframe

In [None]:
# Ingest the whole DataFrame with 10 workers; wait=True blocks until the
# ingestion call completes.
feature_group.ingest(data_frame=data, max_workers=10, wait=True)

# 3 - Use Amazon Athena to build a training dataset

In [None]:
# Athena query helper for this feature group, and the name of the table it
# queries; the table name is spliced into the SQL strings below.
feature_group_query = feature_group.athena_query()
feature_group_table = feature_group_query.table_name

print(feature_group_table)

### Build and run SQL query

In [None]:
# Per-product average star rating and review count, most-reviewed first.
# This query is only printed here, not executed.
# The run of spaces inside the last literal is kept so the SQL text is unchanged.

query_string = (
    'SELECT product_id, avg(star_rating), count(*) as review_count FROM "'
    f'{feature_group_table}"'
    ' GROUP BY product_id     ORDER BY review_count desc;'
)

print(query_string)

In [None]:
# Keep only best-selling cameras: products with more than 1,000 reviews
# (the filter is `review_count > 1000`). This query is only printed here.
# Whitespace inside the literals is kept so the SQL text is unchanged.

query_string = (
    'SELECT * FROM '
    '    (SELECT product_id, avg(star_rating) as avg_rating, count(*) as review_count '
    f'    FROM "{feature_group_table}"'
    '     GROUP BY product_id) '
    'WHERE review_count > 1000 '
    'ORDER BY review_count DESC;'
)

print(query_string)

In [None]:
# Find the corresponding labeled reviews, ready for BlazingText training.
# Selects label and review_body for every review whose product_id appears in
# the inner sub-query, i.e. products with more than 1,000 reviews
# (review_count > 1000). This is the query that the next cell actually runs.
# The statement relies on backslash continuations, one of them inside a string
# literal, so the indentation of that continued line is part of the SQL text.

query_string = 'SELECT label,review_body FROM "' \
+ feature_group_table+'"' \
+ ' INNER JOIN (SELECT product_id FROM (SELECT product_id, avg(star_rating) as avg_rating, count(*) as review_count \
                FROM "' + feature_group_table+'"' \
                + ' GROUP BY product_id) WHERE review_count > 1000) tmp ON "' \
+ feature_group_table+'"'+ '.product_id=tmp.product_id;'

print(query_string)

In [None]:
# Start from an empty DataFrame, then run the last query_string built above
# and block until the query finishes. Results are written under
# query_results/ in the default bucket.
dataset = pd.DataFrame()
query_output_location = f's3://{default_bucket}/query_results/'
feature_group_query.run(
    query_string=query_string,
    output_location=query_output_location,
)
feature_group_query.wait()

In [None]:
# Load the finished query's results into a DataFrame and preview the first rows.
dataset = feature_group_query.as_dataframe()
dataset.head()

In [None]:
# (rows, columns) of the query result.
dataset.shape

In [None]:
# Number of reviews per label value, to check how balanced the classes are.
dataset['label'].value_counts()

### Split dataset for training and validation, and save it to text files

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the rows for validation; the remaining 90% is the training set.
# NOTE(review): no random_state is passed, so the split differs on every run —
# consider fixing a seed if reproducible results matter.
training, validation = train_test_split(dataset, test_size=0.1)

In [None]:
# Report the (rows, columns) of each split, one per line.
print(training.shape, validation.shape, sep='\n')

In [None]:
# Write each split as plain text, one DataFrame row per line, with the column
# values formatted as strings and joined by np.savetxt's default delimiter.
# The encoding is set explicitly so non-ASCII review text is written as UTF-8
# rather than whatever NumPy's default encoding produces.
np.savetxt('/tmp/training.txt', training.values, fmt='%s', encoding='utf-8')
np.savetxt('/tmp/validation.txt', validation.values, fmt='%s', encoding='utf-8')

In [None]:
!head -5 /tmp/training.txt

# 4 - Train a classification model on Amazon SageMaker with the BlazingText algorithm

In [None]:
# From here on `prefix` is the S3 key prefix for the BlazingText job
# (this rebinds the value used earlier for the feature store).
prefix = 'blazing-text-amazon-reviews'

# Upload both text files to the default bucket and keep the returned S3 URIs.
s3_train_path = sagemaker_session.upload_data(
    path='/tmp/training.txt',
    bucket=default_bucket,
    key_prefix=f'{prefix}/input/train',
)
s3_val_path = sagemaker_session.upload_data(
    path='/tmp/validation.txt',
    bucket=default_bucket,
    key_prefix=f'{prefix}/input/validation',
)
# S3 location passed to the estimator as its output path.
s3_output = f's3://{default_bucket}/{prefix}/output/'

print(s3_train_path, s3_val_path, s3_output, sep='\n')

In [None]:
from sagemaker import image_uris

# Look up the BlazingText container image URI for the current region.
container = image_uris.retrieve(framework='blazingtext', region=region)
print(container)

In [None]:
# Generic estimator around the BlazingText container: a single ml.p3.2xlarge
# training instance, with output written to s3_output.
bt = sagemaker.estimator.Estimator(
    image_uri=container,
    role=role,
    instance_count=1,
    instance_type='ml.p3.2xlarge',
    output_path=s3_output,
)

In [None]:
# Only one hyperparameter is set: mode='supervised'.
# NOTE(review): presumably this selects BlazingText's text-classification
# mode — confirm against the BlazingText hyperparameter documentation.
bt.set_hyperparameters(mode='supervised')

In [None]:
# Describe both uploaded files as plain-text training inputs.
train_data = sagemaker.TrainingInput(s3_data=s3_train_path, content_type='text/plain')
validation_data = sagemaker.TrainingInput(s3_data=s3_val_path, content_type='text/plain')

# Channel name -> input mapping handed to fit().
s3_channels = dict(train=train_data, validation=validation_data)

In [None]:
# Launch the training job on the 'train' and 'validation' channels.
bt.fit(inputs=s3_channels)

In [None]:
# Deploy the trained model to an endpoint backed by one ml.t2.medium instance
# and keep the returned predictor for the inference calls below.
bt_predictor = bt.deploy(initial_instance_count=1, instance_type='ml.t2.medium')

In [None]:
# Three hand-written sample reviews to send to the endpoint. Tokens and
# punctuation are separated by spaces.
instances = [
    ' I really love this camera , it takes amazing pictures . ',
    ' this camera is ok , it gets the job done . Nothing fancy . ',
    ' Poor quality , the camera stopped working after a couple of days .',
]

In [None]:
import pprint 

# Exchange JSON with the endpoint in both directions.
bt_predictor.serializer = sagemaker.serializers.JSONSerializer()
bt_predictor.deserializer = sagemaker.deserializers.JSONDeserializer()

# Request body: the sample reviews plus a configuration with k=3.
# NOTE(review): presumably k is the number of top labels returned per
# instance — confirm against the BlazingText inference documentation.
payload = dict(instances=instances, configuration=dict(k=3))

response = bt_predictor.predict(payload)

pprint.pprint(response)

# 6 - Clean up

In [None]:
# Delete the inference endpoint created by bt.deploy().
bt_predictor.delete_endpoint()

In [None]:
# Delete the feature group created in section 2.
# NOTE(review): check whether data already written to the offline store in S3
# needs to be removed separately.
feature_group.delete()

In [None]:
# How to remove old feature groups: build a FeatureGroup handle from the
# group's name and call delete() on it.
# NOTE(review): the name below is a hard-coded example from an earlier run —
# replace it with an existing group name; deleting a group that does not exist
# will presumably raise an error.
old_feature_group_name = 'amazon-reviews-feature-group-19-09-49-20'
old_feature_group = FeatureGroup(name=old_feature_group_name, sagemaker_session=feature_store_session)
old_feature_group.delete()