### imports

In [1]:
# %pip install "torch==2.2.2" tensorboard --quiet
# %pip install  --upgrade "transformers==4.40.0" "datasets==2.18.0" "accelerate==0.29.3" "evaluate==0.4.1" "bitsandbytes==0.43.1" "huggingface_hub==0.22.2" "trl==0.8.6" "peft==0.10.0"  --quiet

In [2]:
import json, boto3, sagemaker
from sagemaker.pytorch import PyTorch
from datasets import load_dataset

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ubuntu/.config/sagemaker/config.yaml


In [3]:
# S3 bucket used for the datasets, checkpoints and model artifacts of this run.
workspace_bucket_name = "applied-agi"

# Disabled alternative target (GGUF variant), kept for reference:
# s3_prefix = "wasmedge/llama3-8b-instruct-262k-gguf"
# model_id = "second-state/Llama-3-8B-Instruct-262k-GGUF"
# base_job_name = "fsdp-llama3-8b-instruct-262k-gguf"

# Active target: model identifier handed to the training script, key prefix
# inside the bucket, and the base name for the SageMaker training job.
model_id = "gradientai/Llama-3-8B-Instruct-262k"
s3_prefix = "wasmedge/llama3-8b-instruct-262k"
base_job_name = "fsdp-llama3-8b-instruct-262k"

# Local JSON files, one per split; all three sit in the same directory.
_dataset_dir = "/home/ubuntu/random-stuff/ashish/wasmedge/llm-coding-assistant/dataset"
train_dataset_file_path = f"{_dataset_dir}/train_dataset_processed_processed.json"
validation_dataset_file_path = f"{_dataset_dir}/validation_dataset_processed_processed.json"
test_dataset_file_path = f"{_dataset_dir}/test_dataset_processed_processed.json"

In [4]:
# IAM role resolved from the current credentials; it is passed to the
# training estimator further down.
role = sagemaker.get_execution_role()

# SageMaker session that uses the workspace bucket as its default bucket.
session = sagemaker.session.Session(default_bucket=workspace_bucket_name)

# AWS region of the session. Read it through the public `boto_region_name`
# accessor instead of the private `_region_name` attribute, which is an
# implementation detail of the SDK.
region = session.boto_region_name

print(f'role: {role} region: {region}')

role: arn:aws:iam::324622400514:role/ec2-vscode-role region: us-east-1


### dataset

In [5]:
def _load_and_upload_split(local_json_path, split_name):
    """Load one local JSON file as a dataset and save it under the run's S3 prefix.

    Args:
        local_json_path: Path of the JSON file to load.
        split_name: Final path component of the S3 destination
            (``s3://<bucket>/<prefix>/<split_name>``).

    Returns:
        A ``(dataset, s3_path)`` tuple: the loaded dataset and the S3 URI it
        was saved to.
    """
    # `split="train"` is intentional for every file: each file is loaded on
    # its own through `data_files`, and "train" is the split requested for it.
    dataset = load_dataset("json", data_files=local_json_path, split="train")
    s3_path = f's3://{workspace_bucket_name}/{s3_prefix}/{split_name}'
    dataset.save_to_disk(s3_path)
    return dataset, s3_path


# Same order as before: each split is loaded and uploaded before the next one.
train_dataset, s3_train_dataset_path = _load_and_upload_split(
    train_dataset_file_path, "train"
)
validation_dataset, s3_validation_dataset_path = _load_and_upload_split(
    validation_dataset_file_path, "validation"
)
test_dataset, s3_test_dataset_path = _load_and_upload_split(
    test_dataset_file_path, "test"
)

print(f"s3_train_dataset_path: {s3_train_dataset_path}")
print(f"s3_validation_dataset_path: {s3_validation_dataset_path}")
print(f"s3_test_dataset_path: {s3_test_dataset_path}")

severe performance issues, see also https://github.com/dask/dask/issues/10276

To fix, you should specify a lower version bound on s3fs, or
update the current installation.



Saving the dataset (0/1 shards):   0%|          | 0/854 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/107 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/107 [00:00<?, ? examples/s]

s3_train_dataset_path: s3://applied-agi/wasmedge/llama3-8b-instruct-262k/train
s3_validation_dataset_path: s3://applied-agi/wasmedge/llama3-8b-instruct-262k/validation
s3_test_dataset_path: s3://applied-agi/wasmedge/llama3-8b-instruct-262k/test


In [6]:
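# Disabled alternative: upload the raw JSON files with `session.upload_data`
# instead of saving the loaded datasets to S3 in the cell above.
# s3_train_dataset_path = session.upload_data(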
#     path=train_dataset_file_path, 
#     key_prefix=f"{s3_prefix}/train"
# )
# s3_validation_dataset_path = session.upload_data(
#     path=validation_dataset_file_path, 
#     key_prefix=f"{s3_prefix}/validation"
# )
# s3_test_dataset_path = session.upload_data(
#     path=test_dataset_file_path, 
#     key_prefix=f"{s3_prefix}/test"
# )

# print(f"s3_train_dataset_path: {s3_train_dataset_path}")
# print(f"s3_validation_dataset_path: {s3_validation_dataset_path}")
# print(f"s3_test_dataset_path: {s3_test_dataset_path}")

### finetune

In [7]:
# Directory handed to the estimator as `checkpoint_local_path`.
checkpoint_dir = "/opt/ml/checkpoints"

# Every artifact of this run lives under the same bucket/prefix root.
_s3_root = f"s3://{workspace_bucket_name}/{s3_prefix}"

# S3 location handed to the estimator as `checkpoint_s3_uri`.
checkpoint_s3_path = f"{_s3_root}/checkpoints"

# S3 path where model artifacts get stored (used when saving with s5cmd).
save_model_s3_path = f"{_s3_root}/model/"

# One call, three lines: same text and order as three separate prints.
print(
    f"base_job_name: {base_job_name}",
    f"checkpoint_s3_path: {checkpoint_s3_path}",
    f"save_model_s3_path: {save_model_s3_path}",
    sep="\n",
)


base_job_name: fsdp-llama3-8b-instruct-262k
checkpoint_s3_path: s3://applied-agi/wasmedge/llama3-8b-instruct-262k/checkpoints
save_model_s3_path: s3://applied-agi/wasmedge/llama3-8b-instruct-262k/model/


In [8]:
# Hyperparameters handed to the SageMaker estimator, which forwards them to
# the training entry point. Key order is kept stable so the printed JSON
# stays comparable between runs.
hyperparameters = {
    # --- script parameters ---
    "model_id": model_id,
    # "max_seq_len": 3072,
    "max_seq_len": 1024,
    "s3_train_dataset_path": s3_train_dataset_path,
    "s3_test_dataset_path": s3_test_dataset_path,
    # NOTE(review): presumably the container paths where the "train"/"test"
    # input channels are mounted; confirm against the training script.
    "sm_train_dataset_path": "/opt/ml/input/data/train",
    "sm_test_dataset_path": "/opt/ml/input/data/test",

    # --- training parameters ---
    # NOTE(review): hard-coded GPU count; presumably must match the number of
    # GPUs on the chosen instance type. Confirm how the script uses it.
    "_n_gpu": 4,
    "output_dir": ".",
    "report_to": "tensorboard",               # report metrics to tensorboard
    "learning_rate": 0.0002,                  # learning rate 2e-4
    "lr_scheduler_type": "constant",          # learning rate scheduler
    "num_train_epochs": 3,                    # number of training epochs
    "per_device_train_batch_size": 1,         # batch size per device during training
    "per_device_eval_batch_size": 1,          # batch size for evaluation
    # "gradient_accumulation_steps": 2,         # number of steps before performing a backward/update pass
    "gradient_accumulation_steps": 1,         # number of steps before performing a backward/update pass
    "optim": "adamw_torch",                   # use torch adamw optimizer
    "logging_steps": 10,                      # log every 10 steps
    "save_strategy": "epoch",                 # save checkpoint every epoch
    "evaluation_strategy": "epoch",           # evaluate every epoch
    "max_grad_norm": 0.3,                     # max gradient norm
    # NOTE(review): with lr_scheduler_type "constant" this warmup ratio likely
    # has no effect ("constant_with_warmup" would honour it); confirm intent.
    "warmup_ratio": 0.03,                     # warmup ratio
    "bf16": False,                            # bfloat16 mixed precision is disabled
    "tf32": True,                             # use tf32 precision
    "gradient_checkpointing": True,           # use gradient checkpointing to save memory

    # FSDP parameters: https://huggingface.co/docs/transformers/main/en/fsdp
    "fsdp": "full_shard auto_wrap offload",   # remove offload if enough GPU memory
    # "fsdp_config": {
    #     "backward_prefetch": "backward_pre",
    #     "forward_prefetch": "false",
    #     "use_orig_params": "false",
    # },

    "save_model_s3_path": save_model_s3_path,
    # Reuse the variable that is also passed to the estimator as
    # `checkpoint_local_path`, so the two cannot drift apart.
    "checkpoint_dir": checkpoint_dir,
}

print('Hyperparameters: \n', json.dumps(hyperparameters, indent=2, default=str))

Hyperparameters: 
 {
  "model_id": "gradientai/Llama-3-8B-Instruct-262k",
  "max_seq_len": 1024,
  "s3_train_dataset_path": "s3://applied-agi/wasmedge/llama3-8b-instruct-262k/train",
  "s3_test_dataset_path": "s3://applied-agi/wasmedge/llama3-8b-instruct-262k/test",
  "sm_train_dataset_path": "/opt/ml/input/data/train",
  "sm_test_dataset_path": "/opt/ml/input/data/test",
  "_n_gpu": 4,
  "output_dir": ".",
  "report_to": "tensorboard",
  "learning_rate": 0.0002,
  "lr_scheduler_type": "constant",
  "num_train_epochs": 3,
  "per_device_train_batch_size": 1,
  "per_device_eval_batch_size": 1,
  "gradient_accumulation_steps": 1,
  "optim": "adamw_torch",
  "logging_steps": 10,
  "save_strategy": "epoch",
  "evaluation_strategy": "epoch",
  "max_grad_norm": 0.3,
  "warmup_ratio": 0.03,
  "bf16": false,
  "tf32": true,
  "gradient_checkpointing": true,
  "fsdp": "full_shard auto_wrap offload",
  "save_model_s3_path": "s3://applied-agi/wasmedge/llama3-8b-instruct-262k/model/",
  "checkpoint

In [9]:
# Input channels for the training job: channel name -> S3 location.
data = {
    'train': s3_train_dataset_path,
    'test': s3_test_dataset_path,
}

estimator = PyTorch(
    # what to run
    entry_point="run_fsdp_trl.py",
    source_dir="./scripts",
    hyperparameters=hyperparameters,
    # container / framework
    framework_version="2.2.0",
    py_version="py310",
    # job identity and permissions
    base_job_name=base_job_name,
    role=role,
    # compute (trailing numbers are the original author's per-instance counts)
    instance_type="ml.g5.12xlarge",  # 4
    # instance_type="ml.g5.48xlarge",  # 8
    # instance_type="ml.p4d.24xlarge",  # 8
    instance_count=1,
    keep_alive_period_in_seconds=1800,
    # checkpoints: local directory paired with its S3 location
    checkpoint_local_path=checkpoint_dir,
    checkpoint_s3_uri=checkpoint_s3_path,
    # profiler and debugger hooks are switched off
    disable_profiler=True,
    debugger_hook_config=False,
    # enable torchrun
    distribution={"torch_distributed": {"enabled": True}},
)

# Blocks until the job finishes. # logs='None'
estimator.fit(inputs=data, wait=True)

2024-05-27 19:41:31 Starting - Starting the training job...
2024-05-27 19:41:45 Downloading - Downloading the training image
2024-05-27 19:41:45 Training - Training image download completed. Training in progress.bash: cannot set terminal process group (-1): Inappropriate ioctl for device
bash: no job control in this shell
2024-05-27 19:41:46,414 sagemaker-training-toolkit INFO     Imported framework sagemaker_pytorch_container.training
2024-05-27 19:41:46,451 sagemaker-training-toolkit INFO     No Neurons detected (normal if no neurons installed)
2024-05-27 19:41:46,463 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.
2024-05-27 19:41:46,465 sagemaker_pytorch_container.training INFO     Invoking TorchDistributed...
2024-05-27 19:41:46,465 sagemaker_pytorch_container.training INFO     Invoking user training script.
2024-05-27 19:41:47,804 sagemaker-training-toolkit INFO     Installing dependencies from requirements.txt:
/opt/conda/bin/python3.10 -m

KeyboardInterrupt: 

In [None]:
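# Disabled alternative configuration, kept for reference: bf16 enabled,
# max_seq_len 3072, gradient_accumulation_steps 2, and an estimator without
# the torch_distributed `distribution` setting.
# # define hyperparameters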
# hyperparameters = {
#     # script parameters
#     'model_id': model_id,
    
#     'sm_train_dataset_path': "/opt/ml/input/data/train",
#     'sm_test_dataset_path' : "/opt/ml/input/data/test",
#     "epochs": 3,                    # number of training epochs
#     "per_device_train_batch_size": 1,         # batch size per device during training
#     "optimizer": "adamw_torch",                   # use torch adamw optimizer
#     "gradient_checkpointing": True,           # use gradient checkpointing to save memory
#     "bf16": True,                             # use bfloat16 precision
#     "fsdp": "full_shard auto_wrap offload", # remove offload if enough GPU memory
    
    
#     "max_seq_len": 3072,
#     "output_dir": ".",
#     "report_to": "tensorboard",               # report metrics to tensorboard
#     "learning_rate": 0.0002,                  # learning rate 2e-4
#     "lr_scheduler_type": "constant",          # learning rate scheduler
#     # "per_device_eval_batch_size": 1,          # batch size for evaluation
#     "gradient_accumulation_steps": 2,         # number of steps before performing a backward/update pass
#     "logging_steps": 10,                      # log every 10 steps
#     "save_strategy": "epoch",                 # save checkpoint every epoch
#     "evaluation_strategy": "epoch",           # evaluate every epoch
#     "max_grad_norm": 0.3,                     # max gradient norm
#     "warmup_ratio": 0.03,                     # warmup ratio
#     "tf32": True,                             # use tf32 precision
    
    
#     # FSDP parameters: https://huggingface.co/docs/transformers/main/en/fsdp
    
#     "fsdp_config": {
#         "backward_prefetch": "backward_pre",
#         "forward_prefetch": "false",
#         "use_orig_params": "false",
#     },
    
#     'save_model_s3_path': save_model_s3_path,
#     'checkpoint_dir': "/opt/ml/checkpoints",
#     }

# print('Hyperparameters: \n', json.dumps(hyperparameters, indent=2, default=str))

# estimator = PyTorch(
#     base_job_name                = base_job_name,
#     source_dir                   = "./scripts",
#     entry_point                  = "run_fsdp_trl.py",
#     role                         = role,
#     framework_version            = "2.2.0",
#     py_version                   = "py310", 
#     instance_count               = 1,
#     instance_type                = "ml.g5.12xlarge",
#     hyperparameters              = hyperparameters,
#     checkpoint_local_path        = checkpoint_dir,   
#     checkpoint_s3_uri            = checkpoint_s3_path,
#     disable_profiler             = True,
#     keep_alive_period_in_seconds = 1800,
#     debugger_hook_config         = False,
# )

# data = {
#     'train': s3_train_dataset_path,
#     'test' : s3_test_dataset_path,
# }

# estimator.fit(data, wait=True) # logs='None'