In [1]:
from datasets import load_dataset, DatasetDict, Dataset


## Load and preprocess the Dataset

In [2]:
fin_data = load_dataset('zeroshot/twitter-financial-news-sentiment')
fin_data

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 9543
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 2388
    })
})

In [3]:
fin_data['train'][0]

{'text': '$BYND - JPMorgan reels in expectations on Beyond Meat https://t.co/bd0xbFGjkT',
 'label': 0}

In [5]:
fin_data['validation'][0]

{'text': '$ALLY - Ally Financial pulls outlook https://t.co/G9Zdi1boy5',
 'label': 0}

In [7]:
for i in range(5):
    print(fin_data['validation'][i]['text'])

$ALLY - Ally Financial pulls outlook https://t.co/G9Zdi1boy5
$DELL $HPE - Dell, HPE targets trimmed on compute headwinds https://t.co/YRUHZw7cYl
$PRTY - Moody's turns negative on Party City https://t.co/MBD5TFGC4P
$SAN: Deutsche Bank cuts to Hold
$SITC: Compass Point cuts to Sell


In [10]:
from transformers import AutoTokenizer
from transformers import DataCollatorWithPadding

In [11]:
model_checkpoint = 'distilbert-base-uncased'

In [12]:
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, add_prefix_space=True)

In [13]:
# create tokenize function
def tokenize_function(examples):
    # extract text
    text = examples["text"]

    #tokenize and truncate text
    tokenizer.truncation_side = "left"
    tokenized_inputs = tokenizer(
        text,
        return_tensors="np",
        truncation=True,
        max_length=512
    )

    return tokenized_inputs

# add pad token if none exists
# if tokenizer.pad_token is None:
#     tokenizer.add_special_tokens({'pad_token': '[PAD]'})
#     model.resize_token_embeddings(len(tokenizer))


In [14]:
tokenized_dataset = fin_data.map(tokenize_function, batched=True)
tokenized_dataset

Map:   0%|          | 0/2388 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 9543
    })
    validation: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 2388
    })
})

In [15]:
train_data, test_data = load_dataset('zeroshot/twitter-financial-news-sentiment',split=['train','validation'])

In [18]:
# tokenize train and test datasets
train_data = train_data.map(tokenize_function, batched=True)
test_data = test_data.map(tokenize_function, batched=True)

Map:   0%|          | 0/9543 [00:00<?, ? examples/s]

In [19]:
train_data

Dataset({
    features: ['text', 'label', 'input_ids', 'attention_mask'],
    num_rows: 9543
})

In [20]:
# set dataset format for PyTorch
train_dataset =  train_data.rename_column("label", "labels")
train_dataset.set_format("torch", columns=["input_ids", "attention_mask", "labels"])
test_dataset = test_data.rename_column("label", "labels")
test_dataset.set_format("torch", columns=["input_ids", "attention_mask", "labels"])

In [24]:
train_dataset[0]

{'labels': tensor(0),
 'input_ids': tensor([  101,  1002,  2011,  4859,  1011, 16545,  5302, 16998, 15934,  2015,
          1999, 10908,  2006,  3458,  6240, 16770,  1024,  1013,  1013,  1056,
          1012,  2522,  1013,  1038,  2094,  2692,  2595, 29292,  2290, 15992,
          2102,   102]),
 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1])}

## Upload dataset to S3 bucket

In [29]:
import sagemaker
import boto3
import botocore
from datasets.filesystems import S3FileSystem

In [27]:
sm_boto3 = boto3.client("sagemaker")
sess = sagemaker.Session(default_bucket='findatasagemakerbucket')
region = sess.boto_session.region_name
bucket = 'findatasagemakerbucket'
print('Using bucket:' + bucket)

Using bucket:findatasagemakerbucket


In [28]:
sess.default_bucket()

'findatasagemakerbucket'

In [25]:
s3_prefix = 'sagemaker/datasets/findata'

In [30]:
training_input_path = f's3://{sess.default_bucket()}/{s3_prefix}/train'
test_input_path = f's3://{sess.default_bucket()}/{s3_prefix}/test'

# save train and test dataset to s3
train_dataset.save_to_disk(training_input_path)
test_dataset.save_to_disk(test_input_path)

severe performance issues, see also https://github.com/dask/dask/issues/10276

To fix, you should specify a lower version bound on s3fs, or
update the current installation.



Saving the dataset (0/1 shards):   0%|          | 0/9543 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/2388 [00:00<?, ? examples/s]

## Fine-tuning and starting Sagemaker Training job

In [49]:
import time
# define Training Job Name 
job_name = f'huggingface-peft-{time.strftime("%Y-%m-%d-%H-%M-%S", time.localtime())}'

from sagemaker.huggingface import HuggingFace

# hyperparameters, which are passed into the training job
hyperparameters ={
  'model_id': model_checkpoint,                                # pre-trained model
  'dataset_path': '/opt/ml/input/data/training', # path where sagemaker will save training dataset
  'epochs': 3,                                         # number of training epochs
  'per_device_train_batch_size': 1,                    # batch size for training
  'lr': 2e-4,                                          # learning rate used during training
}

# create the Estimator
huggingface_estimator = HuggingFace(
    entry_point          = 'train.py',      # train script
    source_dir           = 'scripts',         # directory which includes all the files needed for training
    instance_type        = 'ml.g5.2xlarge', # instances type used for the training job
    instance_count       = 1,                 # the number of instances used for training
    base_job_name        = job_name,          # the name of the training job
    role                 = 'arn:aws:iam::859226994255:role/service-role/AmazonSageMaker-ExecutionRole-20240204T170549',              # Iam role used in training job to access AWS ressources, e.g. S3
    volume_size          = 300,               # the size of the EBS volume in GB
    transformers_version = '4.26',            # the transformers version used in the training job
    pytorch_version      = '1.13',            # the pytorch_version version used in the training job
    py_version           = 'py39',            # the python version used in the training job
    hyperparameters      =  hyperparameters
)

In [45]:
training_input_path

's3://findatasagemakerbucket/sagemaker/datasets/findata/train'

In [51]:
# define a data input dictonary with our uploaded s3 uris
data = {'training': training_input_path}

# starting the train job with our uploaded datasets as input
huggingface_estimator.fit(data, wait=True)

INFO:sagemaker.image_uris:image_uri is not presented, retrieving image_uri based on instance_type, framework etc.
INFO:sagemaker:Creating training-job with name: huggingface-peft-2024-02-12-18-22-30-2024-02-12-23-32-55-549


ResourceLimitExceeded: An error occurred (ResourceLimitExceeded) when calling the CreateTrainingJob operation: The account-level service limit 'ml.g5.2xlarge for training job usage' is 0 Instances, with current utilization of 0 Instances and a request delta of 1 Instances. Please use AWS Service Quotas to request an increase for this quota. If AWS Service Quotas is not available, contact AWS support to request an increase for this quota.