# Getting started with Hugging Face and Amazon SageMaker

## Sentiment analysis on product reviews

* https://huggingface.co/distilbert-base-uncased
* https://huggingface.co/transformers/model_doc/distilbert.html
* https://huggingface.co/datasets/generated_reviews_enth

# Setup

In [None]:
!pip -q install sagemaker --upgrade

In [None]:
!pip -q install "transformers>=4.4.2" "datasets[s3]==1.5.0" --upgrade

In [None]:
!pip -q install widgetsnbextension ipywidgets

In [None]:
#!pip -q install torch --upgrade

In [None]:
import sagemaker

# Show the installed SageMaker SDK version.
print(sagemaker.__version__)

# Create a SageMaker session and look up its default S3 bucket; `bucket` is
# used further down as the upload destination for the preprocessed datasets.
sess = sagemaker.Session()
bucket = sess.default_bucket()

In [None]:
import transformers
import datasets

# Show the installed versions of the two Hugging Face libraries used below.
print(transformers.__version__)
print(datasets.__version__)

# Preprocessing

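We are using the `datasets` library to download and preprocess the `generated_reviews_enth` dataset. After preprocessing, the dataset will be uploaded to the default SageMaker bucket (the `bucket` variable defined above) to be used within our training job. The [generated_reviews_enth](https://huggingface.co/datasets/generated_reviews_enth) dataset contains product reviews in English with Thai translations, each with a star rating; we keep the English text and map the star rating to a binary sentiment label.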

In [None]:
from datasets import load_dataset

# Load the 'train' and 'validation' splits of generated_reviews_enth; the list
# passed to `split` is unpacked into one dataset per requested split.
train_dataset, valid_dataset = load_dataset('generated_reviews_enth', split=['train', 'validation'])

# Show the shape of each split.
print(train_dataset.shape)
print(valid_dataset.shape)

In [None]:
# Display the first training record as loaded.
train_dataset[0]

In [None]:
def map_stars_to_sentiment(row):
    """Derive a binary sentiment label from a review's star rating.

    Args:
        row: Mapping that contains a 'review_star' entry.

    Returns:
        A dict with a single 'labels' key: 1 when the rating is 4 or
        higher, 0 otherwise.
    """
    if row['review_star'] >= 4:
        return {'labels': 1}
    return {'labels': 0}

In [None]:
# Apply map_stars_to_sentiment to every row of both splits so that each record
# gets a 'labels' entry derived from its star rating.
train_dataset = train_dataset.map(map_stars_to_sentiment)
valid_dataset = valid_dataset.map(map_stars_to_sentiment)

In [None]:
# Display the first training record again to check the new 'labels' entry.
train_dataset[0]

In [None]:
# Flatten nested fields into top-level columns; the following cells address
# the results by dotted names such as 'translation.en' and 'translation.th'.
train_dataset = train_dataset.flatten()
valid_dataset = valid_dataset.flatten()

In [None]:
# Display the first training record after flattening.
train_dataset[0]

In [None]:
# Drop the columns that are not used for training: 'correct', the Thai
# translation, and the star rating (already converted into 'labels').
train_dataset = train_dataset.remove_columns(['correct', 'translation.th', 'review_star'])
valid_dataset = valid_dataset.remove_columns(['correct', 'translation.th', 'review_star'])

In [None]:
# Rename the English review column to 'text', the key the tokenize function
# reads from each batch.
train_dataset = train_dataset.rename_column('translation.en', 'text')
valid_dataset = valid_dataset.rename_column('translation.en', 'text')

In [None]:
# Display the first training record after the column clean-up.
train_dataset[0]

## Tokenize

In [None]:
from transformers import AutoTokenizer

# Load the tokenizer published with the 'distilbert-base-uncased' checkpoint.
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')

def tokenize(batch):
    """Tokenize the 'text' entries of a batch with the module-level tokenizer.

    Args:
        batch: Mapping whose 'text' entry holds the review text(s) to encode.

    Returns:
        The tokenizer's output for the batch, produced with truncation
        enabled and padding set to 'max_length'.
    """
    texts = batch['text']
    return tokenizer(texts, truncation=True, padding='max_length')

In [None]:
# Tokenize the training split in batches, using the default batch size of
# `map` instead of a single batch that spans the entire split. `tokenize` pads
# every example with padding='max_length', so the output does not depend on
# how rows are batched, and smaller batches avoid holding the whole tokenized
# split in memory at once.
train_dataset = train_dataset.map(tokenize, batched=True)

In [None]:
# Tokenize the validation split in batches, using the default batch size of
# `map` instead of a single batch that spans the entire split. `tokenize` pads
# every example with padding='max_length', so the output does not depend on
# how rows are batched, and smaller batches avoid holding the whole tokenized
# split in memory at once.
valid_dataset = valid_dataset.map(tokenize, batched=True)

In [None]:
import json

# Serialize the first tokenized training record to a JSON string so its full
# contents can be inspected.
json.dumps(train_dataset[0])

In [None]:
# Drop the raw 'text' column from both splits now that they are tokenized.
train_dataset = train_dataset.remove_columns(['text'])
valid_dataset = valid_dataset.remove_columns(['text'])

# Upload data to S3

In [None]:
from datasets.filesystems import S3FileSystem

# Filesystem handle passed to save_to_disk so the datasets are written to S3.
s3 = S3FileSystem()  

# Key prefix under the default bucket for everything this notebook uploads.
s3_prefix = 'hugging-face/sentiment-analysis'

# Save the training split under <bucket>/<prefix>/training.
train_input_path = 's3://{}/{}/training'.format(bucket, s3_prefix)
train_dataset.save_to_disk(train_input_path, fs=s3)

# Save the validation split under <bucket>/<prefix>/validation.
valid_input_path = 's3://{}/{}/validation'.format(bucket, s3_prefix)
valid_dataset.save_to_disk(valid_input_path, fs=s3)

In [None]:
# Optional: if your datasets were produced by a SageMaker Processing job
# instead of the upload cell above, uncomment the two lines below and fill in
# the job's S3 output URIs. They are left commented out so that running every
# cell in order does not replace the paths computed above with the bare
# 's3://' placeholder, which is not a usable training input.
#train_input_path = 's3://'
#valid_input_path = 's3://'

In [None]:
# Show the S3 locations that will be passed to the training job.
print(train_input_path)
print(valid_input_path)

# Fine-tuning & starting SageMaker Training Job

In [None]:
!pygmentize train.py

## Fine-tune the Hugging Face model on SageMaker

In [None]:
# Hyperparameters handed to the estimator for the fine-tuning script.
# NOTE(review): 'train-batch_size' mixes '-' and '_' while 'model-name' uses
# only '-'; confirm that each key matches the argument name train.py parses,
# otherwise the value may not be picked up.
hyperparameters={
    'epochs': 1,
    'train-batch_size': 32,
    'model-name':'distilbert-base-uncased'
}

In [None]:
from sagemaker.huggingface import HuggingFace

# Estimator describing the fine-tuning job: the script to run, its
# hyperparameters, the framework versions and the training hardware.
huggingface_estimator = HuggingFace(
    # Execution role returned by the SageMaker SDK for the current environment
    role=sagemaker.get_execution_role(),
    # Fine-tuning script
    entry_point='train.py',
    hyperparameters=hyperparameters,
    # Infrastructure
    transformers_version='4.6.1',
    pytorch_version='1.7.1',
    py_version='py36',
    instance_type='ml.p3.2xlarge',
    instance_count=1
)

In [None]:
# Start the training job with two named inputs, 'train' and 'valid', pointing
# at the S3 locations of the two splits.
# NOTE(review): presumably train.py reads its data from channels with exactly
# these names; confirm against the script.
huggingface_estimator.fit({'train': train_input_path, 'valid': valid_input_path})

In [None]:
# Display the estimator's model_data value; it is reused further down as the
# model_data argument of the PyTorch model.
huggingface_estimator.model_data

# Deploy with the Hugging Face container

In [None]:
# Deploy the fine-tuned model to an endpoint backed by a single ml.m5.xlarge
# instance and keep the returned predictor for the test requests below.
huggingface_predictor = huggingface_estimator.deploy(
    initial_instance_count=1,
    instance_type='ml.m5.xlarge')

In [None]:
# Sample request with a positive-sounding review, passed under the 'inputs' key.
test_data = {
   "inputs": "This is a very nice camera, I'm super happy with it."
}

In [None]:
# Send the current test payload to the endpoint and print the response.
prediction = huggingface_predictor.predict(test_data)
print(prediction)

In [None]:
# Sample request with a negative-sounding review, passed under the 'inputs' key.
test_data = {
   "inputs": "Terrible purchase, I want my money back!"
}

In [None]:
# Send the current test payload to the endpoint and print the response.
prediction = huggingface_predictor.predict(test_data)
print(prediction)

In [None]:
# Delete the endpoint once testing is done so it does not keep running.
huggingface_predictor.delete_endpoint()

# Deploy with the PyTorch container

In [None]:
from sagemaker.pytorch import PyTorchModel 
from sagemaker.predictor import Predictor
from sagemaker.serializers import JSONSerializer
from sagemaker.deserializers import JSONDeserializer

In [None]:
class SentimentAnalysis(Predictor):
    """Predictor that sends and receives JSON for the sentiment endpoint."""

    def __init__(self, endpoint_name, sagemaker_session):
        """Bind the predictor to an endpoint with JSON (de)serialization.

        Args:
            endpoint_name: Name of the endpoint to invoke.
            sagemaker_session: Session forwarded to the base Predictor.
        """
        json_io = {
            'serializer': JSONSerializer(),
            'deserializer': JSONDeserializer(),
        }
        super().__init__(
            endpoint_name,
            sagemaker_session=sagemaker_session,
            **json_io,
        )

In [None]:
# Wrap the artifact from the training job in a PyTorch model. Inference code
# comes from 'torchserve-predictor.py' in the local 'src' directory, and
# predictors are created with the SentimentAnalysis class defined above.
model = PyTorchModel(
    model_data=huggingface_estimator.model_data,
    role=sagemaker.get_execution_role(), 
    entry_point='torchserve-predictor.py',
    source_dir='src',
    framework_version='1.7.1',
    py_version='py36',
    predictor_cls=SentimentAnalysis)

In [None]:
# Deploy the PyTorch model to an endpoint backed by a single ml.m5.xlarge
# instance and keep the returned predictor for the test request below.
pytorch_predictor = model.deploy(
    initial_instance_count=1,
    instance_type='ml.m5.xlarge')

In [None]:
# Sample request for the PyTorch endpoint; the review is passed under 'text'.
# NOTE(review): presumably this is the key src/torchserve-predictor.py reads;
# confirm against that script.
test_data = {'text': "This is a very nice camera, I'm super happy with it."}

In [None]:
# Send the test payload to the PyTorch endpoint and print the response.
prediction = pytorch_predictor.predict(test_data)
print(prediction)

In [None]:
# Delete the endpoint once testing is done so it does not keep running.
pytorch_predictor.delete_endpoint()