# Data Preparation and Training

In this notebook, we train a sentiment-extraction model, framed as a question-answering task, on Amazon SageMaker using the Hugging Face integration.

---

# Data Preparation

## Load required libraries

In [1]:
import pandas as pd
import numpy as np
import os

import sagemaker
import boto3
import json

import sagemaker
from sagemaker import get_execution_role
from sagemaker.huggingface import HuggingFace

from sagemaker.s3 import S3Downloader
from sagemaker.s3 import S3Uploader

# Folder (relative to this notebook) that holds the input files.
data_dir = "../inputs"

# One SageMaker session is reused for the region and default-bucket lookups.
sagemaker_session = sagemaker.Session()
bucket = sagemaker_session.default_bucket()
region = sagemaker_session.boto_region_name

# Execution role later handed to the HuggingFace estimator.
role = get_execution_role()

---

# Train a RoBERTa model using Amazon SageMaker

In [2]:
# Local training file and the S3 key prefix it is uploaded under.
train_file_location = f"{data_dir}/train.csv"
prefix = "sentiment_extraction/data"

# `inputs` is the S3 folder (not an individual object) that is later passed to
# `fit`, so the URI is built once here and reused as the upload destination.
# `bucket` comes from the setup cell at the top of the notebook.
inputs = f"s3://{bucket}/{prefix}"

# The return value of the upload is intentionally unused.
S3Uploader.upload(train_file_location, inputs)
print(inputs)

s3://sagemaker-ap-south-1-296512243111/sentiment_extraction/data


In [3]:
# Training settings handed to the HuggingFace estimator through its
# `hyperparameters` argument.
hyperparameters = dict(
    model_name="roberta-base",
    batch_size=16,
    epochs=5,
    lr=2e-5,
)

In [4]:
# Directory containing the training entry point (`train.py`).
local_script_location = "../src"

# Environment for the estimator: the NLP task to use for predictions.
hub = {"HF_TASK": "question-answering"}

# All estimator arguments gathered in one mapping so they can be reviewed at a
# glance before the estimator is built.
estimator_settings = dict(
    entry_point="train.py",
    source_dir=local_script_location,
    env=hub,
    instance_type="ml.g4dn.12xlarge",  # alternative: "ml.p2.xlarge"
    instance_count=1,
    role=role,
    transformers_version="4.6",
    pytorch_version="1.7",
    py_version="py36",
    hyperparameters=hyperparameters,
)
huggingface_estimator = HuggingFace(**estimator_settings)

# Start the training job on the data stored at the `inputs` S3 location.
huggingface_estimator.fit(inputs)

2023-01-09 13:24:47 Starting - Starting the training job...
2023-01-09 13:25:13 Starting - Preparing the instances for trainingProfilerReport-1673270687: InProgress
.........
2023-01-09 13:26:32 Downloading - Downloading input data...
2023-01-09 13:27:16 Training - Downloading the training image...
2023-01-09 13:27:46 Training - Training image download completed. Training in progress......[34mbash: cannot set terminal process group (-1): Inappropriate ioctl for device[0m
[34mbash: no job control in this shell[0m
[34m2023-01-09 13:28:32,372 sagemaker-training-toolkit INFO     Imported framework sagemaker_pytorch_container.training[0m
[34m2023-01-09 13:28:32,419 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.[0m
[34m2023-01-09 13:28:32,421 sagemaker_pytorch_container.training INFO     Invoking user training script.[0m
[34m2023-01-09 13:28:32,687 sagemaker-training-toolkit INFO     Invoking user script[0m
[34mTraining Env:[0m
[34m{
  

[34m  Attempting uninstall: huggingface-hub
    Found existing installation: huggingface-hub 0.0.8
    Uninstalling huggingface-hub-0.0.8:
      Successfully uninstalled huggingface-hub-0.0.8
  Attempting uninstall: transformers
    Found existing installation: transformers 4.6.1
    Uninstalling transformers-4.6.1:
      Successfully uninstalled transformers-4.6.1[0m
[34mSuccessfully installed huggingface-hub-0.4.0 tokenizers-0.12.1 transformers-4.18.0[0m
[34mModel will be saved in - /opt/ml/model[0m
[34mPath to data folder - /opt/ml/input/data/training[0m
[34mContents of folder /opt/ml/input/data/training - ['train.csv'][0m
[34m/opt/ml/input/data/training/train.csv[0m
[34mShape of training dataset - (27481, 5)[0m
[34mSome weights of the model checkpoint at roberta-base were not used when initializing RobertaForQuestionAnswering: ['lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.dense.weight', 'lm_head.decoder.weight', 'lm_head.dense.bias

[34mThe following columns in the evaluation set  don't have a corresponding argument in `RobertaForQuestionAnswering.forward` and have been ignored: text, sentiment, answer_start, textID, selected_text. If text, sentiment, answer_start, textID, selected_text are not expected by `RobertaForQuestionAnswering.forward`,  you can safely ignore this message.[0m
[34m***** Running Evaluation *****
  Num examples = 5497
  Batch size = 16[0m
[34m{'eval_loss': 1.0661251544952393, 'eval_runtime': 27.8458, 'eval_samples_per_second': 197.409, 'eval_steps_per_second': 3.088, 'epoch': 1.0}[0m
[34m{'loss': 1.4014, 'learning_rate': 1.4186046511627909e-05, 'epoch': 1.45}[0m
[34mThe following columns in the evaluation set  don't have a corresponding argument in `RobertaForQuestionAnswering.forward` and have been ignored: text, sentiment, answer_start, textID, selected_text. If text, sentiment, answer_start, textID, selected_text are not expected by `RobertaForQuestionAnswering.forward`,  you can 


2023-01-09 13:58:40 Uploading - Uploading generated training model
2023-01-09 13:59:40 Completed - Training job completed
ProfilerReport-1673270687: IssuesFound
Training seconds: 1998
Billable seconds: 1998


---

# Deploy an endpoint for online inference

In [5]:
# Deploy the trained estimator on a single ml.c5.xlarge instance; the returned
# predictor is used below to send requests.
predictor = huggingface_estimator.deploy(
    instance_type="ml.c5.xlarge",
    initial_instance_count=1,
)

----!

In [6]:
# Sample sentences that each mix a positive and a negative phrase.
sentences = [
    "Tommy is ridiculous but a good dog.",
    "Day went poor. Night was fantastic.",
    "What a slave this beautiful boy is.",
    "Chelsea lost inspite of their remarkable play.",
]

# Each sentence is queried once per sentiment: the sentence is the "context"
# and the sentiment label is the "question" of the request.
sentiments = ("positive", "negative")
for sentence in sentences:
    for sentiment in sentiments:
        data = dict(context=sentence, question=sentiment)
        print("Sentence ", sentence)
        print("Sentiment ", sentiment)
        print(predictor.predict(data))

Sentence  Tommy is ridiculous but a good dog.
Sentiment  positive
{'score': 0.3929924964904785, 'start': 26, 'end': 35, 'answer': 'good dog.'}
Sentence  Tommy is ridiculous but a good dog.
Sentiment  negative
{'score': 0.3233451843261719, 'start': 9, 'end': 19, 'answer': 'ridiculous'}
Sentence  Day went poor. Night was fantastic.
Sentiment  positive
{'score': 0.7062358260154724, 'start': 25, 'end': 35, 'answer': 'fantastic.'}
Sentence  Day went poor. Night was fantastic.
Sentiment  negative
{'score': 0.3059115707874298, 'start': 9, 'end': 14, 'answer': 'poor.'}
Sentence  What a slave this beautiful boy is.
Sentiment  positive
{'score': 0.4300776720046997, 'start': 18, 'end': 35, 'answer': 'beautiful boy is.'}
Sentence  What a slave this beautiful boy is.
Sentiment  negative
{'score': 0.3133225739002228, 'start': 7, 'end': 12, 'answer': 'slave'}
Sentence  Chelsea lost inspite of their remarkable play.
Sentiment  positive
{'score': 0.40357911586761475, 'start': 30, 'end': 40, 'answer': '