In [None]:
!pip install sagemaker==2.100.0
!pip install jedi==0.17  # this is a requirement for pygmentize to work

In [None]:
from sagemaker.huggingface import HuggingFace
from sagemaker import get_execution_role
from sagemaker import Session
from sagemaker.s3 import S3Downloader
import sagemaker
import logging

In [None]:
# Route DEBUG-and-above records of the 'sagemaker' logger to a stream handler.
# The handler is tagged with a name and only attached when no handler with that
# name is present, so re-running this cell does not stack up duplicate handlers
# (which would print every log line several times).
NOTEBOOK_HANDLER_NAME = 'notebook-stream'

logger = logging.getLogger('sagemaker')
logger.setLevel(logging.DEBUG)

if not any(h.get_name() == NOTEBOOK_HANDLER_NAME for h in logger.handlers):
    stream_handler = logging.StreamHandler()
    stream_handler.set_name(NOTEBOOK_HANDLER_NAME)
    logger.addHandler(stream_handler)

In [None]:
# Record which SageMaker SDK version this kernel is actually running.
sagemaker_version = sagemaker.__version__
logger.info(f'[Using SageMaker: {sagemaker_version}]')

In [None]:
# --- SageMaker session, execution role and default bucket -------------------
session = Session()
ROLE = get_execution_role()
S3_BUCKET = session.default_bucket()

# --- Training script --------------------------------------------------------
SOURCE_DIR = './scripts'
ENTRY_POINT = 'PretrainBERT.py'  # or PretrainRoBERTa.py

# --- Training infrastructure ------------------------------------------------
INSTANCE_COUNT = 1
INSTANCE_TYPE = 'ml.p3.2xlarge'
EBS_VOLUME_SIZE = 512  # handed to the estimator as `volume_size`

# --- Framework versions passed to the HuggingFace estimator -----------------
PYTHON_VERSION = 'py38'
PYTORCH_VERSION = '1.10.2'
TRANSFORMERS_VERSION = '4.17.0'

# --- Job naming -------------------------------------------------------------
BASE_JOB_NAME = 'Bert-pretrain'

In [None]:
# Show which bucket the session resolved as its default.
bucket_message = f'S3 bucket = {S3_BUCKET}'
logger.info(bucket_message)

In [None]:
# Log the SageMaker session object itself. The label used to read "S3 bucket",
# a copy-paste slip that made this line look like a second bucket name.
logger.info(f'SageMaker session = {session}')

In [None]:
!pygmentize ./scripts/pretrain.py

In [None]:
# Input channels for the training job: a single 'train' channel that points at
# the `data` prefix of the session's bucket.
DATA = dict(train=f's3://{S3_BUCKET}/data')

In [None]:
MAX_LENGTH = 512  # Context size for BERT tokenizer
CHUNK_SIZE = 128
TRAIN_EPOCHS = 40
BATCH_SIZE = 32
# Read the region from the SageMaker session instead of hard-coding it, so the
# value handed to the training script always matches the region this notebook's
# session is using and no manual edit is needed when running elsewhere.
REGION = session.boto_region_name

In [None]:
# Hyperparameters passed to the estimator (and so to the entry-point script).
# Key order is kept as originally written.
HYPERPARAMETERS = dict(
    s3_bucket=S3_BUCKET,
    max_len=MAX_LENGTH,
    chunk_size=CHUNK_SIZE,
    num_train_epochs=TRAIN_EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    region=REGION,
    vocab="yes",
    datasets="yes",
    corpus="yes",
)

In [None]:
# Gather every estimator setting in one mapping, then build the estimator from
# it. All values come from the configuration cells above.
estimator_settings = dict(
    # What to run
    entry_point=ENTRY_POINT,
    source_dir=SOURCE_DIR,
    hyperparameters=HYPERPARAMETERS,
    # Where and as whom to run it
    role=ROLE,
    instance_type=INSTANCE_TYPE,
    instance_count=INSTANCE_COUNT,
    volume_size=EBS_VOLUME_SIZE,
    # Framework versions
    transformers_version=TRANSFORMERS_VERSION,
    pytorch_version=PYTORCH_VERSION,
    py_version=PYTHON_VERSION,
    # Profiler and debugger hook are both switched off for this job
    disable_profiler=True,
    debugger_hook_config=False,
    base_job_name=BASE_JOB_NAME,
)
HuggingfaceEstimator = HuggingFace(**estimator_settings)

In [None]:
# Submit the training job with the DATA channels; wait=False is passed so the
# call is not asked to wait for the job to finish.
HuggingfaceEstimator.fit(inputs=DATA, wait=False)