In [2]:

import boto3
import sagemaker
from sagemaker import get_execution_role
from sagemaker.sklearn.processing import SKLearnProcessor

# region = boto3.session.Session().region_name
role = "arn:aws:iam::941656036254:role/service-role/AmazonSageMaker-ExecutionRole-20210904T193230"
sagemaker_session = sagemaker.Session()
# sagemaker_local_session = LocalSession()
s3_bucket_name = sagemaker_session.default_bucket()

# sklearn_processor = SKLearnProcessor(
#     framework_version="0.20.0", role=role, instance_type="ml.m5.xlarge", instance_count=1
# )

In [None]:
from datasets import load_dataset

dataset = load_dataset("imdb")

In [None]:
dataset_df = dataset['train'].to_pandas()

In [None]:
import pandas as pd
import time

current_time_sec = int(round(time.time()))
dataset_df["EventTime"] = pd.Series([current_time_sec]*len(dataset_df), dtype="float64")
dataset_df["ID"] = dataset_df.index


In [None]:
dataset_df["text"] = dataset_df["text"].astype('string')
dataset_df["text"] = dataset_df["text"].str.encode("utf8")
dataset_df["text"] = dataset_df["text"].astype('string')

In [None]:
print(dataset_df['text'].dtype)

In [None]:
from transformers import DistilBertTokenizerFast
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

In [None]:
# remove in future
tok_dataset = tokenizer(dataset_df["text"].tolist(), truncation=True, padding=True)

In [None]:
print(tok_dataset[0])

In [None]:
dataset_df["tokenized-text"] = tokenizer(dataset_df["text"].tolist(), truncation=True, padding=True)["input_ids"]
# dataset_df["tokenized-text"] = tokenizer(dataset_df["text"].tolist(), truncation=True)['input_ids']

In [None]:
dataset_df["tokenized-text"] = dataset_df["tokenized-text"].astype('string')

In [None]:
dataset_df

# Setup Feature Store

In [4]:
from sagemaker.feature_store.feature_group import FeatureGroup

imdb_feature_group_name = "imdb-reviews-tokenized-4"

imdb_feature_group = FeatureGroup(
    name=imdb_feature_group_name, sagemaker_session=sagemaker_session
)

In [None]:
imdb_feature_group.load_feature_definitions(data_frame=dataset_df)

In [None]:
s3_bucket_name

In [None]:
imdb_feature_group.create(
    s3_uri=f"s3://{s3_bucket_name}/{imdb_feature_group_name}",
    record_identifier_name="ID",
    event_time_feature_name="EventTime",
    role_arn=role,
    enable_online_store=True
)

# Waiter for FeatureGroup creation
def wait_for_feature_group_creation_complete(feature_group):
    status = feature_group.describe().get('FeatureGroupStatus')
    print(f'Initial status: {status}')
    while status == 'Creating':
        print(f'Waiting for feature group: {feature_group.name} to be created ...')
        time.sleep(5)
        status = feature_group.describe().get('FeatureGroupStatus')
    if status != 'Created':
        raise SystemExit(f'Failed to create feature group {feature_group.name}: {status}')
    print(f'FeatureGroup {feature_group.name} was successfully created.')

wait_for_feature_group_creation_complete(imdb_feature_group)

In [None]:
%%time
import os

# To disable Tokenizer warning
os.environ["TOKENIZERS_PARALLELISM"] = "true"

imdb_feature_group.ingest(data_frame=dataset_df, max_processes=16, wait=True)


### Take several minutes to populate the tables!

In [5]:
athena_query = imdb_feature_group.athena_query()
imdb_table_name = athena_query.table_name
# (catalog="AwsDataCatalog", database="sagemaker_featurestore", table_name=imdb_feature_group_name, sagemaker_session=sagemaker_session)
result = athena_query.run(f'SELECT "label", COUNT("label") as "Count" FROM "sagemaker_featurestore"."{imdb_table_name}" group by "label";', output_location=f"s3://{s3_bucket_name}/athena_output")
athena_query.wait()
print(f"Counting labels in dataset: \n {athena_query.as_dataframe()}")


Counting labels in dataset: 
    label  Count
0      0  12500
1      1  12500


In [31]:
athena_query = imdb_feature_group.athena_query()
imdb_table_name = athena_query.table_name
# (catalog="AwsDataCatalog", database="sagemaker_featurestore", table_name=imdb_feature_group_name, sagemaker_session=sagemaker_session)
result = athena_query.run(f'SELECT * FROM "sagemaker_featurestore"."{imdb_table_name}" limit 10;', output_location=f"s3://{s3_bucket_name}/athena_output")
athena_query.wait()
print(f"Counting labels in dataset: \n {athena_query.as_dataframe()}")

Counting labels in dataset: 
                                                 text  label     eventtime  \
0  b'b\'This is one of my favorites along with th...      1  1.636321e+09   
1  b'b"Enchanted April is a tone poem, an impress...      1  1.636321e+09   
2  b'b\'Have never understood why the MacDonald-E...      1  1.636321e+09   
3  b'b"I\'m going to say first off that I have gi...      0  1.636321e+09   
4  b'b"I am a big fan of Stephen King. I loved Th...      0  1.636321e+09   
5  b'b"Philo Vance (William Powell) helps solve m...      1  1.636321e+09   
6  b'b\'The first half of this movie is a pure de...      0  1.636321e+09   
7  b'b"This was filmed back-to-back with the 1992...      0  1.636321e+09   
8  b'b\'I read a small ad in some horror magazine...      1  1.636321e+09   
9  b'b"A disappointing film.<br /><br />The story...      0  1.636321e+09   

      id                                     tokenized-text  \
0   7819  [101, 1038, 1005, 1038, 1032, 1005, 2023, 2003...

In [5]:
train_dataset_uri = imdb_feature_group.describe()['OfflineStoreConfig']["S3StorageConfig"]["ResolvedOutputS3Uri"]

## Using data for training

In [18]:
from datasets import load_dataset, Dataset
import pandas as pd


local_data_dir = "./fs_data/"

df = pd.read_parquet(local_data_dir)
df["text"] = df["text"].astype('string')
df["input_ids"] = df["tokenized-text"].astype('string')

In [19]:
# dataset = Dataset.from_dict({"input_ids" : df["tokenized-text"].tolist()})

# dataset = Dataset.from_dict({"input_ids" : df["tokenized-text"].tolist()})
dataset = Dataset.from_pandas(df[["input_ids", "label"]])
# dataset = dataset.rename_column("tokenized-text", "input_ids")

In [20]:
def string_to_list(example):
    list_of_str = example["input_ids"].strip("][").split(", ")
    example["input_ids"] = [int(el) for el in list_of_str]
    return example

dataset = dataset.map(string_to_list)

100%|██████████| 25000/25000 [00:04<00:00, 5828.24ex/s]


In [21]:
dataset[0]

{'label': 1,
 'input_ids': [101,
  1038,
  1005,
  1038,
  1000,
  19957,
  1998,
  9414,
  2135,
  2209,
  2011,
  1996,
  2048,
  2402,
  3057,
  1010,
  28616,
  7507,
  12975,
  2004,
  12784,
  1010,
  1998,
  22093,
  24471,
  20755,
  2004,
  14015,
  1010,
  2348,
  1996,
  5436,
  2003,
  2738,
  1037,
  7683,
  1997,
  1996,
  9647,
  1012,
  2402,
  14015,
  2770,
  2005,
  3664,
  3849,
  2041,
  1997,
  2173,
  1010,
  2000,
  2022,
  7481,
  1012,
  1026,
  7987,
  1013,
  1028,
  1026,
  7987,
  1013,
  1028,
  2096,
  1996,
  3772,
  2003,
  2092,
  2589,
  2011,
  2035,
  4986,
  1996,
  3185,
  12102,
  2000,
  3768,
  1037,
  10218,
  7224,
  1997,
  3689,
  1012,
  3383,
  2057,
  1032,
  1005,
  2310,
  4961,
  2000,
  5987,
  24842,
  3723,
  4507,
  1999,
  5691,
  1010,
  2738,
  2066,
  13599,
  17543,
  25789,
  2000,
  2129,
  2665,
  2001,
  2026,
  3028,
  999,
  2196,
  2568,
  1010,
  2169,
  1997,
  2068,
  2024,
  2204,
  1999,
  2037,
  2219,
  2126,
 

In [22]:
NUM_LABELS = 2
VOCAB_SIZE = 30522

from transformers import (
    DistilBertForSequenceClassification,
    DistilBertConfig,
    TrainingArguments,
    Trainer,
)

config = DistilBertConfig()
config.num_labels=NUM_LABELS
config.vocab_size=VOCAB_SIZE

model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", config=config)
print(model.config)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.bias', 'vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_transform.weight', 'vocab_layer_norm.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'pre_classifier.bias', 'classifier

DistilBertConfig {
  "_name_or_path": "distilbert-base-uncased",
  "activation": "gelu",
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "transformers_version": "4.11.3",
  "vocab_size": 30522
}



In [23]:

training_args = TrainingArguments(
    output_dir="./drqs_distilbert/output",
    overwrite_output_dir=True,
    num_train_epochs=5,
    per_gpu_train_batch_size=1,
    save_steps=10_000,
    save_total_limit=2,
    max_steps=100,
    prediction_loss_only=True,
)

In [24]:
trainer = Trainer(
    model=model,  # model to be trained
    args=training_args,  # training arguments, defined above
    train_dataset=dataset,  # training dataset
#     eval_dataset=test_enc_dataset,  # evaluation dataset
)

max_steps is given, it will override any value given in num_train_epochs


In [25]:
trainer.train()

Using deprecated `--per_gpu_train_batch_size` argument which will be removed in a future version. Using `--per_device_train_batch_size` is preferred.
Using deprecated `--per_gpu_train_batch_size` argument which will be removed in a future version. Using `--per_device_train_batch_size` is preferred.
***** Running training *****
  Num examples = 25000
  Num Epochs = 1
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 1
  Gradient Accumulation steps = 1
  Total optimization steps = 100
Using deprecated `--per_gpu_train_batch_size` argument which will be removed in a future version. Using `--per_device_train_batch_size` is preferred.
100%|██████████| 100/100 [04:54<00:00,  3.10s/it]

Training completed. Do not forget to share your model on huggingface.co/models =)


100%|██████████| 100/100 [04:54<00:00,  2.95s/it]

{'train_runtime': 294.8451, 'train_samples_per_second': 0.339, 'train_steps_per_second': 0.339, 'train_loss': 0.7964998626708985, 'epoch': 0.0}





TrainOutput(global_step=100, training_loss=0.7964998626708985, metrics={'train_runtime': 294.8451, 'train_samples_per_second': 0.339, 'train_steps_per_second': 0.339, 'train_loss': 0.7964998626708985, 'epoch': 0.0})

In [None]:
## 

## Training Job


In [11]:
from sagemaker.huggingface.estimator import HuggingFace

estimator = HuggingFace(
    py_version="py36",
    entry_point="train.py",
    source_dir="1_sources",
    pytorch_version="1.7.1",
    transformers_version="4.6.1",
    hyperparameters={
        "model_name":"distilbert-base-uncased",
        "train_batch_size": 16,
        "epochs": 3
        # "max_steps": 100 # to shorten training cycle, remove in real scenario
    },
    instance_type="ml.p2.xlarge",
    debugger_hook_config=False,
    disable_profiler=True,
    instance_count=1,
    role=role
)


estimator.fit(train_dataset_uri)

2021-11-27 14:54:01 Starting - Starting the training job...
2021-11-27 14:54:04 Starting - Launching requested ML instances......
2021-11-27 14:55:23 Starting - Preparing the instances for training.........
2021-11-27 14:56:48 Downloading - Downloading input data...
2021-11-27 14:57:10 Training - Downloading the training image........................[34mbash: cannot set terminal process group (-1): Inappropriate ioctl for device[0m
[34mbash: no job control in this shell[0m
[34m2021-11-27 15:01:20,741 sagemaker-training-toolkit INFO     Imported framework sagemaker_pytorch_container.training[0m
[34m2021-11-27 15:01:20,767 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.[0m
[34m2021-11-27 15:01:23,802 sagemaker_pytorch_container.training INFO     Invoking user training script.[0m
[34m2021-11-27 15:01:24,518 sagemaker-training-toolkit INFO     Invoking user script[0m
[34mTraining Env:[0m
[34m{
    "additional_framework_parameters": {},

## Using data at inference time

In [17]:
import boto3

client = boto3.client('sagemaker-featurestore-runtime')

In [36]:
response = client.batch_get_record(
    Identifiers=[
        {
            'FeatureGroupName':imdb_feature_group.name,
            'RecordIdentifiersValueAsString': ["0", "1", "2"], # picking several records to run inference.
            'FeatureNames': [
                'tokenized-text', "label", 'text'
            ]
        },
    ]
)

# preparing the inference payload
labels = []
input_ids = []
texts = []

for record in response["Records"]:
    for feature in record["Record"]:
        if feature["FeatureName"]=="label":
            labels.append(feature["ValueAsString"])
        if feature["FeatureName"]=="tokenized-text":
            list_of_str = feature["ValueAsString"].strip("][").split(", ")
            input_ids.append([int(el) for el in list_of_str])
        if feature["FeatureName"]=="text":
            # list_of_str = feature["ValueAsString"].strip("][").split(", ")
            texts.append(feature["ValueAsString"])    

print(f"Sample label value: {labels[0]}")
print(f"Sample list of token ids:\n{input_ids[0]}")
print(f"Sample list of token ids:\n{texts[0]}")


Sample label value: 1
Sample list of token ids:
[101, 1038, 1005, 1038, 1032, 1005, 8235, 2058, 1011, 3772, 2011, 23920, 5754, 6031, 1012, 2190, 6918, 7570, 5092, 3203, 1045, 2031, 2412, 2464, 1010, 1998, 2293, 5019, 1999, 4253, 9746, 2024, 2117, 2000, 3904, 1012, 1996, 9781, 2006, 2227, 2003, 1037, 4438, 1010, 2004, 2204, 2004, 2505, 1999, 17162, 12279, 2015, 1012, 1996, 2202, 2006, 9559, 2003, 2036, 21688, 1012, 2044, 2108, 5496, 1997, 2108, 1037, 2735, 16531, 1010, 4855, 2041, 2010, 5795, 1010, 1998, 2108, 9841, 21821, 2102, 1996, 5160, 1997, 27233, 3406, 10053, 23822, 24436, 2135, 1000, 1045, 1032, 1032, 1032, 1005, 1049, 1037, 5160, 1000, 2002, 2758, 1012, 2093, 6057, 2616, 1012, 10799, 17214, 12821, 1010, 1037, 5440, 2013, 1996, 2101, 6554, 12055, 2265, 1010, 2003, 10392, 2182, 2205, 2004, 1037, 5506, 19965, 2040, 4122, 2000, 10188, 1996, 17276, 1012, 2010, 2839, 2003, 2062, 3287, 6767, 16136, 2084, 5156, 1012, 1996, 2902, 3496, 1010, 1998, 1996, 3496, 2073, 1996, 11573, 18445, 1

In [24]:
from sagemaker.huggingface.estimator import HuggingFaceModel

model = estimator.create_model(role=role, 
                               entry_point="inference.py", 
                               source_dir="1_sources",
                              )

In [25]:
predictor = model.deploy(
    initial_instance_count=1,
    instance_type="ml.m5.xlarge"
)

-----!

In [39]:
import random

for i in range(len(labels)):
    prediction = predictor.predict([texts[i]])
    print(f"Sample index: {i}; predicted label: {prediction[0]['label']}; confidence score: {prediction[0]['score']}")



Sample index: 0; predicted label: LABEL_1; confidence score: 0.5366891026496887
Sample index: 1; predicted label: LABEL_0; confidence score: 0.5006090998649597
Sample index: 2; predicted label: LABEL_1; confidence score: 0.516234815120697
