In [1]:
import boto3

# Find the IAM roles whose name mentions SageMaker.
# list_roles() is a paginated API: a single call returns only the first page,
# so the role we are after could be missed in an account with many roles.
# The paginator walks every page and build_full_result() merges them into one
# response-shaped dict, so `response['Roles']` keeps working as before.
iam = boto3.client('iam')
response = iam.get_paginator('list_roles').paginate().build_full_result()
sagemaker_roles = [role for role in response['Roles'] if 'SageMaker' in role['RoleName']]

# Show the name of the first match (raises IndexError when nothing matched).
sagemaker_roles[0]['RoleName']

'AmazonSageMaker-ExecutionRole-20231030T210397'

In [2]:
import boto3
import sagemaker

# Step 1 - bucket: start from a plain session and, because no bucket name is
# configured here, fall back to that session's default bucket.
sess = sagemaker.Session()
sagemaker_session_bucket = None
if sess is not None and sagemaker_session_bucket is None:
    sagemaker_session_bucket = sess.default_bucket()

# Step 2 - role: ask the SDK for the execution role; when it signals failure
# with ValueError, look the ARN up in IAM by the role's name instead.
try:
    role = sagemaker.get_execution_role()
except ValueError:
    iam = boto3.client('iam')
    role = iam.get_role(
        RoleName='AmazonSageMaker-ExecutionRole-20231030T210397'
    )['Role']['Arn']

# Step 3 - session: rebuild the session bound to the chosen bucket.
sess = sagemaker.Session(default_bucket=sagemaker_session_bucket)

# Report the resolved role, bucket and region, one per line.
print(
    f"sagemaker role arn: {role}",
    f"sagemaker bucket: {sess.default_bucket()}",
    f"sagemaker session region: {sess.boto_region_name}",
    sep="\n",
)

sagemaker.config INFO - Not applying SDK defaults from location: /Library/Application Support/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /Users/username/Library/Application Support/sagemaker/config.yaml


Couldn't call 'get_role' to get Role ARN from role name username to get Role path.


sagemaker role arn: arn:aws:iam::005418323977:role/service-role/AmazonSageMaker-ExecutionRole-20231030T210397
sagemaker bucket: sagemaker-ap-south-1-005418323977
sagemaker session region: ap-south-1


In [3]:
import pandas as pd
import numpy as np
import json

import re
from transformers import AutoTokenizer
from random import randint
import sys
# Extend the module search path with ../utils (resolved against the current
# working directory) before importing pack_dataset on the next line —
# presumably that is where pack_dataset.py lives; confirm if the notebook moves.
sys.path.append("../utils")
from pack_dataset import pack_dataset
from datasets import Dataset

import requests

In [4]:
# Hub repository id of the base model; it is also forwarded to the training
# job as the 'model_id' hyperparameter.
model_id = "TheBloke/Nous-Hermes-2-Yi-34B-GPTQ"

# Load the tokenizer that belongs to that same repository.
tokenizer = AutoTokenizer.from_pretrained(
    model_id,
    trust_remote_code=True,
    use_auth_token=True,
)



In [5]:
# S3 prefix of the prepared fine-tuning dataset; it is later handed to the
# estimator as the 'training' input channel.
training_s3_path = "s3://sagemaker-ap-south-1-005418323977/fine_tuning_datasets/2024-01-30-Search_PurpleCherryPanda"

In [6]:
from datetime import datetime

In [7]:
# Metadata describing the fine-tuning dataset used for this run.
finetune_dataset_config = {
    'finetune_id': 'Search_' + 'PurpleCherryPanda',
    # Date on which this cell is executed, formatted as YYYY-MM-DD.
    'date': datetime.today().strftime('%Y-%m-%d'),
    'num_datapoints': 671,
    'data_source': 'gpt4',
}

In [13]:
from huggingface_hub import HfFolder


# Hyperparameters, which are passed into the training job.
hyperparameters = {
  'model_id': model_id,                             # pre-trained model
  'dataset_path': '/opt/ml/input/data/training',    # path where sagemaker will save training dataset
  'num_train_epochs': 1,                            # number of training epochs
  'per_device_train_batch_size': 6,                 # batch size for training
  'gradient_accumulation_steps': 3,                 # number of update steps to accumulate
  'gradient_checkpointing': True,                   # save memory but slower backward pass
  'bf16': True,                                     # use bfloat16 precision
  'tf32': True,                                     # use tf32 precision
  'learning_rate': 6e-6,                            # learning rate
  'max_grad_norm': 0.3,                             # maximum norm (for gradient clipping)
  'warmup_ratio': 0.03,                             # warmup ratio
  "lr_scheduler_type": "cosine_with_restarts",      # learning rate scheduler
  'save_strategy': "epoch",                         # save strategy for checkpoints
  "logging_steps": 10,                              # log every x steps
  'merge_adapters': False,                          # whether to merge LoRA into the model (needs more memory)
  'use_flash_attn': True,                           # whether to use Flash Attention
  # Output directory where assets are saved during training; could be used
  # for checkpointing. The final trained model will always be saved to s3 at
  # the end of training.
  'output_dir': '/tmp/run',
}

# Read the locally stored Hugging Face token once and reuse that value, so the
# None-check and the value that gets stored can never disagree.
# NOTE(review): the token travels as a plain hyperparameter, so it is presumably
# visible to anyone who can inspect the training job's configuration — confirm
# that this is acceptable before using a long-lived token.
hf_token = HfFolder.get_token()
if hf_token is not None:
    hyperparameters['hf_token'] = hf_token  # huggingface token to access gated models, e.g. llama 2

In [14]:
from sagemaker.huggingface import HuggingFace

# Define the training job name.
# Training job names accept only letters, digits and hyphens, so every other
# character is swapped one-for-one for "-": the "/" and "." of the model id as
# before, and now also the "_" that comes in through the finetune id.
# NOTE: for a model id this long the SDK trims the base name before appending
# its timestamp (the logged job name ends at "...Nous-Hermes--<timestamp>"), so
# the finetune id does not reach the final name; the "_" would only have
# surfaced, as an invalid name, with a shorter model id.
job_name = re.sub(
    r'[^a-zA-Z0-9-]',
    '-',
    f'huggingface-qlora-{hyperparameters["model_id"]}-{finetune_dataset_config["finetune_id"]}',
)

# Describe the training job: what to run, on which hardware, and in which container.
huggingface_estimator = HuggingFace(
    # -- code and its inputs --
    entry_point='run_qlora-gptq.py',                            # training script
    source_dir='../utils/',                                     # directory holding every file the script needs
    hyperparameters=hyperparameters,                            # forwarded to the training script
    environment={"HUGGINGFACE_HUB_CACHE": "/tmp/.cache"},       # cache downloaded models under /tmp
    # -- job identity and permissions --
    base_job_name=job_name,                                     # base for the generated training job name
    role=role,                                                  # IAM role the job uses to reach AWS resources, e.g. S3
    # -- hardware and limits --
    instance_type='ml.g5.12xlarge',                             # instance type used for training
    instance_count=1,                                           # number of training instances
    volume_size=50,                                             # EBS volume size in GB
    max_run=4 * 60 * 60,                                        # maximum runtime in seconds (hours * minutes * seconds = 4 h)
    # -- container versions --
    transformers_version='4.28',                                # transformers version in the training container
    pytorch_version='2.0',                                      # PyTorch version in the training container
    py_version='py310',                                         # Python version in the training container
    # -- output handling --
    disable_output_compression=True,                            # skip compressing the output to save time and cost
)

In [15]:
# Input channels for the job: the 'training' channel points at the uploaded
# dataset's S3 URI.
data = {
    'training': training_s3_path,
}

# Start the training job with that channel mapping and wait on it (wait=True).
huggingface_estimator.fit(inputs=data, wait=True)

INFO:sagemaker.image_uris:image_uri is not presented, retrieving image_uri based on instance_type, framework etc.
INFO:sagemaker:Creating training-job with name: huggingface-qlora-TheBloke-Nous-Hermes--2024-02-01-06-39-17-499


2024-02-01 06:39:18 Starting - Starting the training job...
2024-02-01 06:39:44 Starting - Preparing the instances for training.


KeyboardInterrupt



* Adapter output of the first run: s3://sagemaker-ap-south-1-005418323977/huggingface-qlora-TheBloke-Nous-Capybar-2024-01-31-15-32-29-439/output/model/
    ** Issues: the tokenizer was taken from Hermes while the model was Capybara (mismatched pair)
    ** The learning rate was very high (2e-4)
    ** The training batch size was 4

In [11]:
# NOTE(review): bare literal whose only effect is echoing 2 as the cell output;
# looks like a leftover scratch cell that can be deleted — confirm.
2

2