In [2]:
from sagemaker.huggingface import HuggingFace


In [3]:
import sagemaker
import boto3
# Bootstrap session, used here only to look up the default bucket.
sess = sagemaker.Session()

# Bucket used for uploading data, models and logs.
# Leave as None to fall back to the session's default bucket
# (SageMaker creates that bucket automatically if it does not exist).
sagemaker_session_bucket = None
if sess is not None and sagemaker_session_bucket is None:
    sagemaker_session_bucket = sess.default_bucket()

# Resolve the IAM role for the training job. If `get_execution_role` raises
# ValueError, look the role up by name through the IAM API instead.
try:
    role = sagemaker.get_execution_role()
except ValueError:
    iam = boto3.client("iam")
    role_response = iam.get_role(RoleName="sagemaker_execution_role")
    role = role_response["Role"]["Arn"]

# Recreate the session so it is pinned to the bucket resolved above.
sess = sagemaker.Session(default_bucket=sagemaker_session_bucket)

# Summarise the resolved configuration (one line per item).
print(
    f"sagemaker role arn: {role}",
    f"sagemaker bucket: {sess.default_bucket()}",
    f"sagemaker session region: {sess.boto_region_name}",
    sep="\n",
)



sagemaker role arn: arn:aws:iam::311851111439:role/mod-6297809195fe4845-SageMakerExecutionRole-K05SDODC7XD2
sagemaker bucket: sagemaker-eu-west-1-311851111439
sagemaker session region: eu-west-1


In [4]:
# Hyperparameters for the training job: which pretrained checkpoint to
# fine-tune, how many epochs to run, and the training batch size.
hyperparameters = dict(
    model_name="facebook/bart-base",
    epochs=1,
    train_batch_size=8,
)

In [30]:
# Environment variables to set inside the training container.
env = {
    'SAGEMAKER_REQUIREMENTS': 'requirements.txt', # path relative to `source_dir` below.
}

# Configure the Hugging Face training job (nothing is launched until `.fit()`).
huggingface_estimator = HuggingFace(
    entry_point="train.py",                 # fine-tuning script to use in training job
    source_dir="./scripts/",                # directory where fine-tuning script is stored
    instance_type="ml.p3.2xlarge",          # instance type
    instance_count=1,                       # number of instances
    role=role,                              # IAM role used in training job to access AWS resources (S3)
    transformers_version='4.26.0',          # Transformers version
    pytorch_version='1.13.1',               # PyTorch version
    py_version="py39",                      # Python version
    hyperparameters=hyperparameters,        # hyperparameters to use in training job
    # The estimator keyword for container environment variables is
    # `environment`. The previous `env=env` is not an estimator argument, so
    # the variables above were not forwarded to the training job.
    environment=env,
)

In [31]:
# Launch the training job. Each input channel maps to a prefix under
# `samples/` in the session bucket; note the `validation` channel reads
# from the `val/` prefix.
huggingface_estimator.fit({
    channel: f's3://{sagemaker_session_bucket}/samples/{prefix}/'
    for channel, prefix in (
        ('train', 'train'),
        ('test', 'test'),
        ('validation', 'val'),
    )
})

INFO:sagemaker.image_uris:image_uri is not presented, retrieving image_uri based on instance_type, framework etc.
INFO:sagemaker.image_uris:image_uri is not presented, retrieving image_uri based on instance_type, framework etc.
INFO:sagemaker:Creating training-job with name: huggingface-pytorch-training-2023-06-22-14-47-01-648


2023-06-22 14:47:02 Starting - Starting the training job...
2023-06-22 14:47:28 Starting - Preparing the instances for training......
2023-06-22 14:48:34 Downloading - Downloading input data.........
2023-06-22 14:49:50 Training - Downloading the training image..................
2023-06-22 14:52:51 Training - Training image download completed. Training in progress..[34mbash: cannot set terminal process group (-1): Inappropriate ioctl for device[0m
[34mbash: no job control in this shell[0m
[34m2023-06-22 14:53:06,597 sagemaker-training-toolkit INFO     Imported framework sagemaker_pytorch_container.training[0m
[34m2023-06-22 14:53:06,616 sagemaker-training-toolkit INFO     No Neurons detected (normal if no neurons installed)[0m
[34m2023-06-22 14:53:06,628 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.[0m
[34m2023-06-22 14:53:06,631 sagemaker_pytorch_container.training INFO     Invoking user training script.[0m
[34m2023-06-22 14:53:06

KeyboardInterrupt: 