# Data Preparation and Training

In this notebook, we prepare the data, train a model on Amazon SageMaker, and deploy the trained model to an endpoint.

---

# Data Preparation

## Load required libraries

In [1]:
import pandas as pd
import numpy as np
import os

import pickle

import sagemaker
import boto3
import json
import uuid

import sagemaker
from sagemaker import get_execution_role
from sagemaker.huggingface import HuggingFace

from sagemaker.s3 import S3Downloader
from sagemaker.s3 import S3Uploader

from sklearn.model_selection import train_test_split

# Local directory holding the input files (train.tsv is read from here).
data_dir = "../input"

# Session object; used further down to look up the default S3 bucket.
sagemaker_session = sagemaker.Session()

# IAM role of the current execution environment; passed to the estimator.
role = get_execution_role()

# Region name reported by the session.
region = sagemaker_session.boto_region_name

## Load the data

In [2]:
# The raw training set is a tab-separated file under data_dir.
train_path = f"{data_dir}/train.tsv"
train = pd.read_csv(train_path, sep="\t")

## Prepare the data and upload it to S3

In [3]:
def extract_title_and_body(data):
    """Expand the JSON ``boilerplate`` column into title and body columns.

    Each value of ``data["boilerplate"]`` is parsed with ``json.loads`` and
    its ``"title"`` and ``"body"`` entries are stored in two new columns,
    ``boilerplate_title`` and ``boilerplate_body``. Missing or null entries
    become empty strings. The ``boilerplate`` column itself is dropped.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a ``boilerplate`` column of JSON-encoded strings.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same index as ``data``; the input frame is
        left unmodified.
    """
    records = data["boilerplate"].apply(json.loads).tolist()

    # Reuse the caller's index: a fresh 0..n-1 index would make the column
    # assignments below align by label and silently blank out (or misassign)
    # rows whenever `data` has been filtered, shuffled or re-indexed.
    parsed = pd.DataFrame(records, index=data.index)

    # Guarantee both columns exist even if no record carries the key.
    parsed = parsed.reindex(columns=["title", "body"])

    # drop() returns a new frame, so the columns are added to the copy
    # rather than to the caller's object.
    result = data.drop(columns=["boilerplate"])
    result["boilerplate_title"] = parsed["title"].fillna("")
    result["boilerplate_body"] = parsed["body"].fillna("")
    return result
# Only expand the JSON column while it is still present, so this cell can be
# re-run without reloading `train` (extract_title_and_body drops the column).
if "boilerplate" in train.columns:
    train = extract_title_and_body(train)

# extract_title_and_body already replaces missing titles/bodies with "",
# so the two columns can be concatenated directly.
train["text"] = (train["boilerplate_title"] + ". " + train["boilerplate_body"]).str.lower()

text_features = ["text"]

xtrain = train[text_features + ["label"]]

# 70 % of the rows go to training; the remaining 30 % are split in half
# into test and validation sets. The fixed seed keeps the splits reproducible.
x_train, x_holdout = train_test_split(xtrain, test_size=0.3, random_state=2020)
x_test, x_valid = train_test_split(x_holdout, test_size=0.5, random_state=2020)

# Separate labels from text; labels are read first because each x_* name is
# rebound from a DataFrame to the array of its "text" column.
y_train = x_train["label"].values
x_train = x_train["text"].values
y_valid = x_valid["label"].values
x_valid = x_valid["text"].values
y_test = x_test["label"].values
x_test = x_test["text"].values

# Bundle every split into one dictionary and write it out as a single pickle.
pklfile = f'{data_dir}/train_data_pre_processed.pkl'

splits = {
    'x_train': x_train,
    'x_valid': x_valid,
    'x_test': x_test,
    'y_train': y_train,
    'y_valid': y_valid,
    'y_test': y_test,
}

with open(pklfile, 'wb') as f:
    pickle.dump(splits, f)

bucket = sagemaker_session.default_bucket()
prefix = "stumbleUpon"

# Build the destination URI once. `inputs` is the S3 prefix (not the object
# key) and is what gets passed to the estimator's fit() call.
inputs = "s3://{}/{}/train-pkl_data".format(bucket, prefix)

# The upload's return value is not needed: the original stored it in
# `inputs` and then immediately overwrote it with the prefix.
S3Uploader.upload(pklfile, inputs)

print(inputs)

s3://sagemaker-ap-south-1-296512243111/stumbleUpon/train-pkl_data


---

# Train a BERT model using Amazon SageMaker

In [4]:
# Values handed to the HuggingFace estimator through its `hyperparameters`
# argument. How each one is consumed is up to train.py (not shown here).
hyperparameters = {
    "model_name": "bert-base-uncased",
    "batch_size": 8,
    "epochs": 2,
    "lr": 2e-5,
}


In [5]:
# Directory that contains the training entry point (train.py).
local_script_location = "../src"

# NLP task you want to use for predictions.
# NOTE(review): this is passed via the `env` keyword below; confirm that the
# estimator forwards it to the container as intended - TODO verify against
# the SageMaker SDK version in use.
hub = {"HF_TASK": "text-classification"}

huggingface_estimator = HuggingFace(
    # Training code
    entry_point="train.py",
    source_dir=local_script_location,
    hyperparameters=hyperparameters,
    env=hub,
    # Infrastructure
    role=role,
    instance_type="ml.g4dn.12xlarge",
    instance_count=1,
    # Framework versions
    transformers_version="4.6",
    pytorch_version="1.7",
    py_version="py36",
)

# Start the training job with the uploaded S3 prefix as its input.
huggingface_estimator.fit(inputs)



2022-11-27 11:25:46 Starting - Starting the training job...
2022-11-27 11:26:11 Starting - Preparing the instances for trainingProfilerReport-1669548346: InProgress
.....................
2022-11-27 11:29:35 Downloading - Downloading input data
2022-11-27 11:29:35 Training - Training image download completed. Training in progress.[34mbash: cannot set terminal process group (-1): Inappropriate ioctl for device[0m
[34mbash: no job control in this shell[0m
[34m2022-11-27 11:29:37,220 sagemaker-training-toolkit INFO     Imported framework sagemaker_pytorch_container.training[0m
[34m2022-11-27 11:29:37,266 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.[0m
[34m2022-11-27 11:29:37,268 sagemaker_pytorch_container.training INFO     Invoking user training script.[0m
[34m2022-11-27 11:29:37,540 sagemaker-training-toolkit INFO     Invoking user script[0m
[34mTraining Env:[0m
[34m{
    "additional_framework_parameters": {},
    "channel_input_d

[34mEpoch 1[0m
[34mTraining loss: 0.46980270395599527[0m
[34mValidation loss: 0.44482530932203473[0m
[34mValidation AUC: 0.883191976480995[0m
[34mEpoch 2[0m
[34mTraining loss: 0.3967237152804087[0m
[34mValidation loss: 0.4587415420322967[0m
[34mValidation AUC: 0.882174077061165[0m

2022-11-27 11:38:51 Uploading - Uploading generated training model[34m[0.16690026 0.58673298 0.49015713 ... 0.79235131 0.2160888  0.28396618]
 Test AUC -- 0.8650594610770652[0m
[34mSaving model to -- /opt/ml/model[0m
[34mSaving tokenizer to -- /opt/ml/model[0m
[34mTraining Finished -- [0m
[34m#015Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]#015Downloading: 100%|██████████| 570/570 [00:00<00:00, 689kB/s][0m
[34m#015Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]#015Downloading:   2%|▏         | 4.10k/232k [00:00<00:10, 22.1kB/s]#015Downloading:  16%|█▋        | 37.9k/232k [00:00<00:06, 30.0kB/s]#015Downloading:  42%|████▏     | 98.3k/232k [00:00<00:03, 41.2kB/s]#01

In [6]:
# Deploy the trained estimator on a single ml.c5.xlarge instance and keep the
# returned predictor object.
predictor = huggingface_estimator.deploy(
    instance_type="ml.c5.xlarge",
    initial_instance_count=1,
)

----!