In [1]:
import boto3
iam = boto3.client('iam')
response = iam.list_roles()
sagemaker_roles = [role for role in response['Roles'] if 'SageMaker' in role['RoleName']]
sagemaker_roles[0]['RoleName']

'AmazonSageMaker-ExecutionRole-20231030T210397'

In [2]:
import sagemaker
import boto3
sess = sagemaker.Session()
sagemaker_session_bucket=None
if sagemaker_session_bucket is None and sess is not None:
    sagemaker_session_bucket = sess.default_bucket()
try:
    role = sagemaker.get_execution_role()
except ValueError:
    iam = boto3.client('iam')
    role = iam.get_role(RoleName='AmazonSageMaker-ExecutionRole-20231030T210397')['Role']['Arn']
sess = sagemaker.Session(default_bucket=sagemaker_session_bucket)
print(f"sagemaker role arn: {role}")
print(f"sagemaker bucket: {sess.default_bucket()}")
print(f"sagemaker session region: {sess.boto_region_name}")

sagemaker.config INFO - Not applying SDK defaults from location: /Library/Application Support/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /Users/username/Library/Application Support/sagemaker/config.yaml


Couldn't call 'get_role' to get Role ARN from role name username to get Role path.


sagemaker role arn: arn:aws:iam::005418323977:role/service-role/AmazonSageMaker-ExecutionRole-20231030T210397
sagemaker bucket: sagemaker-ap-south-1-005418323977
sagemaker session region: ap-south-1


In [3]:
import pandas as pd
import numpy as np
import json

import re
from transformers import AutoTokenizer
from random import randint
import sys
sys.path.append("../utils")
from pack_dataset import pack_dataset
from datasets import Dataset

import requests

In [14]:
#teknium/OpenHermes-2.5-Mistral-7B
model_id = "NousResearch/Nous-Capybara-34B"
# tokenizer = AutoTokenizer.from_pretrained(model_id, use_auth_token=True)

In [5]:
# instance_type = "ml.g5.2xlarge"
# number_of_gpu = 1
# health_check_timeout = 600
# quantize="gptq"
# num_shard=1
# bits=4
# group_size=32
# revision = 'gptq-4bit-32g-actorder_True'

In [6]:
# config = {
#     'HF_MODEL_ID':model_id,
#     'SM_NUM_GPUS': json.dumps(number_of_gpu),
#     'QUANTIZE':  quantize,
#     'SHARDED': json.dumps(True),
#     'NUM_SHARD':json.dumps(num_shard),
#     'REVISION': revision,
#     'MAX_INPUT_LENGTH': json.dumps(3072), # Max length of input text
#     'MAX_TOTAL_TOKENS': json.dumps(4096), # Max length of the generation (including input text)
#     'MAX_BATCH_PREFILL_TOKENS': json.dumps(16384), # Max length of the generation (including input text)
#     'MAX_BATCH_TOTAL_TOKENS': json.dumps(16384),
#     'GPTQ_BITS': json.dumps(bits),
#     'GPTQ_GROUPSIZE': json.dumps(group_size)
#     # 'DISABLE_CUSTOM_KERNELS': json.dumps(True)
# }

In [15]:
training_input_path = 's3://sagemaker-ap-south-1-005418323977/fine_tuning_datasets/2024-01-03-ApricotTangerineScallop'

In [21]:
from huggingface_hub import HfFolder


# hyperparameters, which are passed into the training job
hyperparameters ={
  'model_id': model_id,                             # pre-trained model
  'dataset_path': '/opt/ml/input/data/training',    # path where sagemaker will save training dataset
  'num_train_epochs': 3,                            # number of training epochs
  'per_device_train_batch_size': 8,                 # batch size for training
  'gradient_accumulation_steps': 5,                 # Number of updates steps to accumulate
  'gradient_checkpointing': True,                   # save memory but slower backward pass
  'bf16': True,                                     # use bfloat16 precision
  'tf32': True,                                     # use tf32 precision
  'learning_rate': 2e-5,                            # learning rate
  'max_grad_norm': 0.3,                             # Maximum norm (for gradient clipping)
  'warmup_ratio': 0.03,                             # warmup ratio
  "lr_scheduler_type":"cosine_with_restarts",                   # learning rate scheduler
  'save_strategy': "epoch",                         # save strategy for checkpoints
  "logging_steps": 3,                              # log every x steps
  'merge_adapters': True,                           # wether to merge LoRA into the model (needs more memory)
  'use_flash_attn': True,                           # Whether to use Flash Attention
  'output_dir': '/tmp/run'                         # output directory, where to save assets during training
                                                    # model will always be saved to s3 at the end of training
}

if HfFolder.get_token() is not None:
    hyperparameters['hf_token'] = HfFolder.get_token() # huggingface token to access gated models, e.g. llama 2

In [22]:
model_id

'NousResearch/Nous-Capybara-34B'

In [18]:
# configuration for running training on smdistributed model parallel
mpi_options = {
    "enabled" : True,
    "processes_per_host" : 8
}

smp_options = {
    "enabled":True,
    "parameters": {
        "microbatches": 4,
        "placement_strategy": "spread",
        "pipeline": "interleaved",
        "optimize": "speed",
        "partitions": 4,
        "ddp": True,
    }
}

distribution={
    "smdistributed": {"modelparallel": smp_options},
    "mpi": mpi_options
}

In [24]:
from sagemaker.huggingface import HuggingFace

# define Training Job Name
job_name = f'huggingface-qlora-{hyperparameters["model_id"].replace("/","-").replace(".","-")}-ApricotTangerineScallop'

# create the Estimator
huggingface_estimator = HuggingFace(
    entry_point          = 'run_qlora.py',    # train script
    source_dir           = '../utils/',      # directory which includes all the files needed for training
    instance_type        = 'ml.g5.12xlarge',   # instances type used for the training job
    instance_count       = 1,                 # the number of instances used for training
    max_run              = 8*60*60,        # maximum runtime in seconds (days * hours * minutes * seconds)
    base_job_name        = job_name,          # the name of the training job
    role                 = role,              # Iam role used in training job to access AWS ressources, e.g. S3
    volume_size          = 150,               # the size of the EBS volume in GB
    transformers_version = '4.28',            # the transformers version used in the training job
    pytorch_version      = '2.0',             # the pytorch_version version used in the training job
    py_version           = 'py310',           # the python version used in the training job
    hyperparameters      =  hyperparameters,  # the hyperparameters passed to the training job
    environment          = { "HUGGINGFACE_HUB_CACHE": "/tmp/.cache" }, # set env variable to cache models in /tmp
    disable_output_compression = True         # not compress output to save training time and cost
    # distribution=distribution
    
)

In [25]:
# define a data input dictonary with our uploaded s3 uris
data = {'training': training_input_path}
# starting the train job with our uploaded datasets as input
huggingface_estimator.fit(data, wait=True)

INFO:sagemaker.image_uris:image_uri is not presented, retrieving image_uri based on instance_type, framework etc.
INFO:sagemaker:Creating training-job with name: huggingface-qlora-NousResearch-Nous-Cap-2024-01-22-18-19-24-632


2024-01-22 18:19:26 Starting - Starting the training job...
2024-01-22 18:19:52 Starting - Preparing the instances for training.........
2024-01-22 18:21:25 Downloading - Downloading the training image...........................
2024-01-22 18:25:46 Training - Training image download completed. Training in progress......[34mbash: cannot set terminal process group (-1): Inappropriate ioctl for device[0m
[34mbash: no job control in this shell[0m
[34m2024-01-22 18:26:42,431 sagemaker-training-toolkit INFO     Imported framework sagemaker_pytorch_container.training[0m
[34m2024-01-22 18:26:42,484 sagemaker-training-toolkit INFO     No Neurons detected (normal if no neurons installed)[0m
[34m2024-01-22 18:26:42,493 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.[0m
[34m2024-01-22 18:26:42,494 sagemaker_pytorch_container.training INFO     Invoking user training script.[0m
[34m2024-01-22 18:26:43,823 sagemaker-training-toolkit INFO     Instal

UnexpectedStatusException: Error for Training job huggingface-qlora-NousResearch-Nous-Cap-2024-01-22-18-19-24-632: Failed. Reason: AlgorithmError: ExecuteUserScriptError:
ExitCode 1
ErrorMessage "torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 2.50 GiB (GPU 0; 21.99 GiB total capacity; 16.60 GiB already allocated; 1007.00 MiB free; 20.69 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF
 0%|          | 0/111 [00:04<?, ?it/s]"
Command "/opt/conda/bin/python3.10 run_qlora.py --bf16 True --dataset_path /opt/ml/input/data/training --gradient_accumulation_steps 5 --gradient_checkpointing True --hf_token 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 --learning_rate 2e-05 --logging_steps 3 --lr_scheduler_type cosine_with_restarts --max_grad_norm 0.3 --merge_adapters True --model_id NousResearch/Nous-Capybara-34B --num_train_epochs 3 --output_dir /tmp/run --per_device_train_batch_size 8 --save_strategy epoch --tf32 True --use_flash_attn True --warmup_ratio 0.03", exit code: 1