In [3]:
import json
import os
import time

import sagemaker
from sagemaker.huggingface import HuggingFace, get_huggingface_llm_image_uri

# Switch the kernel's working directory to the project folder; later cells use
# paths relative to it ('.', './configs/...', './scripts/...').
# NOTE(review): this is an absolute, user-specific path — adjust it (or make it
# configurable) before running on another machine.
%cd /home/jerife/krx/sagemaker

In [4]:
# S3 bucket for the session; leave as None to fall back to the session's default bucket.
sagemaker_session_bucket = None

# Bootstrap session, only needed to look up the default bucket.
sess = sagemaker.Session()
if sess is not None and sagemaker_session_bucket is None:
    sagemaker_session_bucket = sess.default_bucket()

# IAM role handed to the estimator further down.
role = "arn:aws:iam::904233131832:role/krx-sagemaker"

# Re-create the session so it is bound to the resolved bucket.
sess = sagemaker.Session(default_bucket=sagemaker_session_bucket)

# Echo the effective configuration as a quick sanity check.
print(
    f"sagemaker role arn: {role}",
    f"sagemaker bucket: {sess.default_bucket()}",
    f"sagemaker session region: {sess.boto_region_name}",
    sep="\n",
)

In [5]:
# Arguments for the DeepSpeed launcher entry point.
deepspeed_parameters = {
    "deepspeed": "./configs/z2_offload_bf16.json",  # deepspeed config file
    "training_script": "./scripts/train.py",        # real training script, not entrypoint
}

# Secrets are read from the environment so they never have to be typed into
# (and accidentally committed with) the notebook. When a variable is unset or
# empty the original placeholder is kept, so the cell still runs unchanged.
# NOTE: SageMaker keeps hyperparameters in plain text in the training-job
# description, so anyone who can describe the job can read these tokens.
_TOKEN_PLACEHOLDER = "[WRITE_YOUR_TOKEN_HERE]"
_wandb_token = os.environ.get("WANDB_API_KEY") or _TOKEN_PLACEHOLDER
_hf_token = os.environ.get("HF_TOKEN") or _TOKEN_PLACEHOLDER

# Shared interval so evaluation and checkpointing happen on the same steps.
steps = 200

# Hyperparameters forwarded to the training job (merged with
# `deepspeed_parameters` when the estimator is created).
training_hyperparameters = {
    # --- model, data and experiment tracking ---
    'hub_model_id': "WhipParty/krx-omni-3",
    'dataset_name': "jerife/krx-v4.3",
    'wandb_project': "krx-gemma2-fft-v1",
    'model_name_or_path': "google/gemma-2-9b-it",
    'wandb_token': _wandb_token,
    'hf_token': _hf_token,
    "output_dir": "/opt/ml/model",
    # --- optimisation ---
    'per_device_train_batch_size': 2,
    'per_device_eval_batch_size': 2,
    'gradient_accumulation_steps': 2,
    'num_train_epochs': 1,
    'learning_rate': 2e-5,
    'warmup_ratio': 0.1,
    'weight_decay': 0.01,
    # --- evaluation, checkpointing and logging cadence ---
    'eval_steps': steps,
    'save_steps': steps,
    'logging_steps': 50,
    'evaluation_strategy': "steps",
    'save_strategy': "steps",
    'load_best_model_at_end': False,
    'save_total_limit': 2,
    # --- runtime flags ---
    'bf16': True,
    'seed': 42,
    'is_debug': False,
    # --- Hugging Face Hub upload ---
    'push_to_hub': True,
    'hub_private_repo': True,
    'hub_strategy': "every_save",  # "every_save" or "end_of_training"
    "hub_token": _hf_token,
}

In [6]:
# Number as printed in the training log's metric dict: digits, an optional
# fractional part and an optional exponent (e.g. 0.4321, 3e-05, 1.5e-05).
# The outer group is the first capture group and holds the whole number; the
# previous pattern used an unescaped "." and an alternation that could not
# match a mantissa *and* an exponent, so "1.5e-05" was recorded as 1.5.
_NUMBER = r"([0-9]+\.?[0-9]*(e[-+]?[0-9]+)?)"

# One entry per metric to extract from the job logs; each regex looks for
# "'<name>': <number>".
metric_definitions = [
    {'Name': name, 'Regex': f"'{name}': {_NUMBER}"}
    for name in ('train_loss', 'eval_loss', 'eval_mcqa_accuracy')
]


In [7]:
# Base name for the training job: the repository part of the hub model id
# ("org/name" -> "name"). Taking the last segment also works for an id that
# has no organisation prefix.
job_name = training_hyperparameters["hub_model_id"].split("/")[-1]

# create the Estimator
# NOTE(review): google/gemma-2-* appears to need a newer transformers release
# than the 4.36.0 container provides — confirm that the code in source_dir
# upgrades it (e.g. via a requirements.txt).
huggingface_estimator = HuggingFace(
    entry_point          = 'deepspeed-launcher.py',  # launcher script; it receives the real training script as a hyperparameter
    source_dir           = '.',                      # directory uploaded with the job (launcher, scripts, configs)
    instance_type        = 'ml.p4de.24xlarge',       # ml.p4d.24xlarge or ml.g5.48xlarge
    instance_count       = 1,                        # the number of instances used for training
    base_job_name        = job_name,                 # prefix of the training job name
    role                 = role,                     # IAM role used in the training job to access AWS resources, e.g. S3
    sagemaker_session    = sess,                     # use the session (and bucket) configured above instead of an implicit default
    metric_definitions   = metric_definitions,       # regexes that extract the metrics from the job logs
    # volume_size can be set here (e.g. 500) if the default volume is too small
    transformers_version = '4.36.0',
    pytorch_version      = '2.1.0',
    py_version           = 'py310',
    hyperparameters      = {
        **training_hyperparameters,
        **deepspeed_parameters,
    },
)

In [None]:
# Submit the training job; with wait=True the call blocks until the job ends.
huggingface_estimator.fit(wait=True)