## Classifying news with HuggingFace and PyTorch on Amazon SageMaker

In [None]:
#make sure you have the updated SageMaker SDK
!pip install "sagemaker>=2.48.0" --upgrade

## Preparation

In [39]:
import sagemaker
from sagemaker.huggingface import HuggingFace
import boto3
import pandas as pd
import os
import numpy as np

In [3]:
# Key prefix under which this notebook stores its objects in S3
prefix = "news"

# Region of the current boto3 session and the execution role used by the SageMaker jobs
region = boto3.Session().region_name
role = sagemaker.get_execution_role()

# SageMaker SDK session and its default S3 bucket
sagemaker_session = sagemaker.Session()
bucket = sagemaker_session.default_bucket()

For this sample notebook we will use Kaggle's News Category Dataset. Download the dataset to your local environment and extract the JSON file (`News_Category_Dataset_v2.json`) next to this notebook.

https://www.kaggle.com/rmisra/news-category-dataset

In [30]:
# Read the News Category Dataset (JSON Lines) and keep only the news category and headline.
# The two columns are selected and renamed by name rather than by position, so the result
# does not depend on the order of the fields inside the JSON file.
df = (
    pd.read_json('News_Category_Dataset_v2.json', lines=True)
    [['category', 'headline']]
    .rename(columns={'category': 'label', 'headline': 'sentence'})
)

In [31]:
# Turn empty strings into NaN, then discard every row left without a sentence
nan_value = float("NaN")
df = df.replace("", nan_value).dropna(subset=["sentence"])

In [32]:
# preview the cleaned data: one row per news item, with its category in `label`
# and its headline text in `sentence`
df

Unnamed: 0,label,sentence
0,CRIME,There Were 2 Mass Shootings In Texas Last Week...
1,ENTERTAINMENT,Will Smith Joins Diplo And Nicky Jam For The 2...
2,ENTERTAINMENT,Hugh Grant Marries For The First Time At Age 57
3,ENTERTAINMENT,Jim Carrey Blasts 'Castrato' Adam Schiff And D...
4,ENTERTAINMENT,Julianna Margulies Uses Donald Trump Poop Bags...
...,...,...
200848,TECH,RIM CEO Thorsten Heins' 'Significant' Plans Fo...
200849,SPORTS,Maria Sharapova Stunned By Victoria Azarenka I...
200850,SPORTS,"Giants Over Patriots, Jets Over Colts Among M..."
200851,SPORTS,Aldon Smith Arrested: 49ers Linebacker Busted ...


In [49]:
# Keep a sorted array of the unique labels, so that the numeric ids returned by the
# models (LABEL_<n>) can be translated back to category names.
# The labels come from the in-memory dataframe: 'train-json.csv' is only written by the
# split cell further down, so reading it here fails on a fresh Restart & Run All.
# NOTE(review): assumes every category also appears in the training split, and that the
# training script numbers labels in sorted order — confirm against run_glue.py.
labels = df
map_label = np.sort(labels["label"].unique())
print(map_label)

['ARTS' 'ARTS & CULTURE' 'BLACK VOICES' 'BUSINESS' 'COLLEGE' 'COMEDY'
 'CRIME' 'CULTURE & ARTS' 'DIVORCE' 'EDUCATION' 'ENTERTAINMENT'
 'ENVIRONMENT' 'FIFTY' 'FOOD & DRINK' 'GOOD NEWS' 'GREEN' 'HEALTHY LIVING'
 'HOME & LIVING' 'IMPACT' 'LATINO VOICES' 'MEDIA' 'MONEY' 'PARENTING'
 'PARENTS' 'POLITICS' 'QUEER VOICES' 'RELIGION' 'SCIENCE' 'SPORTS' 'STYLE'
 'STYLE & BEAUTY' 'TASTE' 'TECH' 'THE WORLDPOST' 'TRAVEL' 'WEDDINGS'
 'WEIRD NEWS' 'WELLNESS' 'WOMEN' 'WORLD NEWS' 'WORLDPOST']


In [48]:
# Split the dataset into a training set and an evaluation set, and write each one to CSV.
# A fixed random_state makes the split (and therefore the files uploaded to S3 and the
# resulting models) reproducible across notebook runs.
from sklearn.model_selection import train_test_split
train, test = train_test_split(df, random_state=42)
train.to_csv("train-json.csv", index=False)
test.to_csv("test-json.csv", index=False)

In [None]:
# Upload each CSV file to the bucket under its own key prefix and show the resulting S3 URIs
inputs_train = sagemaker_session.upload_data(
    "train-json.csv",
    bucket=bucket,
    key_prefix=f"{prefix}/training-json",
)
inputs_test = sagemaker_session.upload_data(
    "test-json.csv",
    bucket=bucket,
    key_prefix=f"{prefix}/testing-json",
)
print(inputs_train, inputs_test)

---------
## Fine-tuning

We are ready to train (fine-tune) our model with our data, leveraging the pre-trained models available on the Hugging Face Hub.

### Case #1: BERT-large (uncased)

We can start by training our text classifier on top of the BERT-large uncased model from the Hugging Face Hub.

https://huggingface.co/bert-large-uncased

In [79]:
# Hyperparameters handed to the fine-tuning script for the BERT-large run.
# Add any remaining hyperparameters here; more info at
# https://github.com/huggingface/transformers/tree/v4.6.1/examples/pytorch/text-classification
hyperparameters_bert = dict(
    # pre-trained checkpoint to start from, and where the fine-tuned model is written
    model_name_or_path='bert-large-uncased',
    output_dir='/opt/ml/model',
    # CSV files inside the "training" and "testing" input directories
    train_file='/opt/ml/input/data/training/train-json.csv',
    validation_file='/opt/ml/input/data/testing/test-json.csv',
    # run both training and evaluation, for a single epoch
    do_train=True,
    do_eval=True,
    num_train_epochs=1,
    save_total_limit=1,
)

In [80]:
# Git repository (and ref) from which the fine-tuning script is downloaded
git_config = dict(
    repo='https://github.com/huggingface/transformers.git',
    branch='v4.6.1',
)

In [81]:
# Hugging Face estimator describing the BERT fine-tuning job
huggingface_estimator_bert = HuggingFace(
    # training script, taken from the git repository described by git_config
    git_config=git_config,
    source_dir='./examples/pytorch/text-classification',
    entry_point='run_glue.py',
    hyperparameters=hyperparameters_bert,
    # framework and Python versions for the job
    transformers_version='4.6.1',
    pytorch_version='1.7.1',
    py_version='py36',
    # execution role and compute resources
    role=role,
    instance_type='ml.p3.16xlarge',
    instance_count=1,
    disable_profiler=True,
)

In [82]:
# S3 locations of the uploaded training and evaluation CSV files
training_path = f's3://{bucket}/{prefix}/training-json'
testing_path = f's3://{bucket}/{prefix}/testing-json'

# Start the training job. The channel names ("training", "testing") match the input
# directories referenced by train_file / validation_file in the hyperparameters.
# wait=False returns immediately instead of blocking until the job has finished.
huggingface_estimator_bert.fit(
    {"training": training_path, "testing": testing_path},
    wait=False,
)

### Case #2: Amazon BORT

We can now run another training, this time using the Amazon BORT model from the HuggingFace hub.

https://huggingface.co/amazon/bort

In [83]:
# Hyperparameters handed to the fine-tuning script for the Amazon BORT run.
# Add any remaining hyperparameters here; more info at
# https://github.com/huggingface/transformers/tree/v4.6.1/examples/pytorch/text-classification
hyperparameters_bort = dict(
    # pre-trained checkpoint to start from, and where the fine-tuned model is written
    model_name_or_path='amazon/bort',
    output_dir='/opt/ml/model',
    # CSV files inside the "training" and "testing" input directories
    train_file='/opt/ml/input/data/training/train-json.csv',
    validation_file='/opt/ml/input/data/testing/test-json.csv',
    # run both training and evaluation, for a single epoch
    do_train=True,
    do_eval=True,
    num_train_epochs=1,
    save_total_limit=1,
)

In [84]:
# Git repository (and ref) from which the fine-tuning script is downloaded;
# set again here with the same values used for the BERT case
git_config = dict(
    repo='https://github.com/huggingface/transformers.git',
    branch='v4.6.1',
)

In [85]:
# Hugging Face estimator describing the BORT fine-tuning job
huggingface_estimator_bort = HuggingFace(
    # training script, taken from the git repository described by git_config
    git_config=git_config,
    source_dir='./examples/pytorch/text-classification',
    entry_point='run_glue.py',
    hyperparameters=hyperparameters_bort,
    # framework and Python versions for the job
    transformers_version='4.6.1',
    pytorch_version='1.7.1',
    py_version='py36',
    # execution role and compute resources
    role=role,
    instance_type='ml.p3.16xlarge',
    instance_count=1,
    disable_profiler=True,
)

In [86]:
# S3 locations of the uploaded training and evaluation CSV files
training_path = f's3://{bucket}/{prefix}/training-json'
testing_path = f's3://{bucket}/{prefix}/testing-json'

# Start the training job. The channel names ("training", "testing") match the input
# directories referenced by train_file / validation_file in the hyperparameters.
# wait=False returns immediately instead of blocking until the job has finished.
huggingface_estimator_bort.fit(
    {"training": training_path, "testing": testing_path},
    wait=False,
)

---

## Inference

We now have two models trained/fine-tuned on our data. We can find those models' artifacts in Amazon S3 for downloading, or host them directly with an Amazon SageMaker Endpoint.

In [89]:
from sagemaker.huggingface import HuggingFaceModel

# Both training jobs were started with wait=False, so they may still be running when this
# cell is executed. The model artifacts referenced through `model_data` only exist once a
# job has completed, so block here until each job is done (without streaming its logs).
for estimator in (huggingface_estimator_bert, huggingface_estimator_bort):
    estimator.latest_training_job.wait(logs="None")

# Create one Hugging Face Model object per fine-tuned model, each pointing at the
# artifacts produced by its training job.
huggingface_model_bert = HuggingFaceModel(
    transformers_version='4.6.1',
    pytorch_version='1.7.1',
    py_version="py36",
    role=role,
    model_data=huggingface_estimator_bert.model_data)

huggingface_model_bort = HuggingFaceModel(
    transformers_version='4.6.1',
    pytorch_version='1.7.1',
    py_version="py36",
    role=role,
    model_data=huggingface_estimator_bort.model_data)


In [90]:
# Deploy the BERT model to a single-instance endpoint and keep the predictor used to query it
predictor_bert = huggingface_model_bert.deploy(
    instance_type="ml.g4dn.xlarge",
    initial_instance_count=1,
)

-----------------!

In [91]:
# Deploy the BORT model to a single-instance endpoint and keep the predictor used to query it
predictor_bort = huggingface_model_bort.deploy(
    instance_type="ml.g4dn.xlarge",
    initial_instance_count=1,
)

---------------!

In [120]:
# Example request: the text to classify always goes under the "inputs" key
data = dict(
    inputs="Hollywood's biggest night was celebrated last night with the Academy Awards.",
)

In [121]:
# classify the example headline with the BERT endpoint;
# the response holds the predicted label id (LABEL_<n>) and its score
predictor_bert.predict(data)

[{'label': 'LABEL_10', 'score': 0.9710166454315186}]

In [122]:
# classify the same headline with the BORT endpoint;
# the response holds the predicted label id (LABEL_<n>) and its score
predictor_bort.predict(data)

[{'label': 'LABEL_10', 'score': 0.6589199304580688}]

In [123]:
# translate the returned label id back to its category name:
# LABEL_10 corresponds to position 10 in the sorted label map
print(map_label[10])

ENTERTAINMENT


In [233]:
# Second example request, again with the text under the "inputs" key
data = dict(
    inputs="Novak Djokovic's bidding for the tennis Golden Slam this year.",
)

In [234]:
# classify the second headline with the BERT endpoint;
# the response holds the predicted label id (LABEL_<n>) and its score
predictor_bert.predict(data)

[{'label': 'LABEL_28', 'score': 0.9598392844200134}]

In [235]:
# classify the second headline with the BORT endpoint;
# the response holds the predicted label id (LABEL_<n>) and its score
predictor_bort.predict(data)

[{'label': 'LABEL_28', 'score': 0.5563121438026428}]

In [236]:
# translate the returned label id back to its category name:
# LABEL_28 corresponds to position 28 in the sorted label map
print(map_label[28])

SPORTS
