
ZeRORuntimeException #596

Closed
wangha9264 opened this issue Jul 15, 2023 · 3 comments
Labels
invalid This doesn't seem right

Comments

@wangha9264

File "/xddata/home/wanghaha/LM/src/megatron/training.py", line 122, in pretrain
model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider)
File "/xddata/home/wanghaha/LM/src/megatron/training.py", line 338, in setup_model_and_optimizer
model, optimizer, _, lr_scheduler = deepspeed.initialize(
File "/xddata/home/wanghaha/LM/src/deepspeed/init.py", line 165, in initialize
engine = DeepSpeedEngine(args=args,
File "/xddata/home/wanghaha/LM/src/deepspeed/runtime/engine.py", line 309, in init
self._configure_optimizer(optimizer, model_parameters)
File "/xddata/home/wanghaha/LM/src/deepspeed/runtime/engine.py", line 1172, in _configure_optimizer
self._configure_optimizer(optimizer, model_parameters)
File "/xddata/home/wanghaha/LM/src/deepspeed/runtime/engine.py", line 1172, in _configure_optimizer
self._configure_optimizer(optimizer, model_parameters)
File "/xddata/home/wanghaha/LM/src/deepspeed/runtime/engine.py", line 1172, in _configure_optimizer
self._configure_optimizer(optimizer, model_parameters)
File "/xddata/home/wanghaha/LM/src/deepspeed/runtime/engine.py", line 1172, in _configure_optimizer
raise ZeRORuntimeException(msg)
deepspeed.runtime.zero.utils.ZeRORuntimeException: You are using ZeRO-Offload with a client provided optimizer (<class 'apex.optimizers.fused_adam.FusedAdam'>) which in most cases will yield poor performance. Please either use deepspeed.ops.adam.DeepSpeedCPUAdam or set an optimizer in your ds-config (https://www.deepspeed.ai/docs/config-json/#optimizer-parameters). If you really want to use a custom optimizer w. ZeRO-Offload and understand the performance impacts you can also set <"zero_force_ds_cpu_optimizer": false> in your configuration file.
raise ZeRORuntimeException(msg)

The error above occurs when using ZeRO-3 together with offload.
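From the exception text, the check fires because a client-provided apex FusedAdam instance reaches deepspeed.initialize while ZeRO-Offload is active. Below is a minimal, hedged sketch of the first remedy the message names (deepspeed.ops.adam.DeepSpeedCPUAdam); the model, learning rate, weight decay, and config path are placeholders, not values taken from this setup.

# Sketch: build the optimizer with DeepSpeed's CPU Adam instead of apex FusedAdam.
# All concrete values below are placeholders.
import torch
import deepspeed
from deepspeed.ops.adam import DeepSpeedCPUAdam

model = torch.nn.Linear(1024, 1024)                      # placeholder model
optimizer = DeepSpeedCPUAdam(model.parameters(),
                             lr=1e-4,                    # placeholder
                             weight_decay=0.01)          # placeholder

# With "offload_optimizer": {"device": "cpu"}, a CPU-capable optimizer like
# DeepSpeedCPUAdam is what the ZeRO-Offload check expects.
engine, optimizer, _, _ = deepspeed.initialize(model=model,
                                               optimizer=optimizer,
                                               config="ds_config.json")  # placeholder path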

@wangha9264
Author

def setup_model_and_optimizer(model_provider_func, load_lr_scheduler=True):
    """Setup model and optimizer."""
    args = get_args()

    model = get_model(model_provider_func)

    unwrapped_model = unwrap_model(model,
                                   (torchDDP, LocalDDP, Float16Module))

    optimizer = get_megatron_optimizer(unwrapped_model)
    lr_scheduler = get_learning_rate_scheduler(optimizer)
    print("-----------------optimizer--------------" + str(optimizer))

    if args.deepspeed:
        print_rank_0("DeepSpeed is enabled.")
        pp = mpu.get_pipeline_model_parallel_world_size()
        model, optimizer, _, lr_scheduler = deepspeed.initialize(
            model=model[0],
            optimizer=optimizer,
            args=args,
            lr_scheduler=lr_scheduler,
            mpu=mpu if args.no_pipeline_parallel else None
        )
        # print_rank_0("FinishInitialization.")
        if isinstance(model, deepspeed.PipelineEngine):
            # hack to get batch_fn from pretrain_gpt.py
            model.set_batch_fn(model.module._megatron_batch_fn)

            assert model.grid.get_pipe_parallel_rank() == mpu.get_pipeline_model_parallel_rank()
            assert model.grid.get_slice_parallel_rank() == mpu.get_tensor_model_parallel_rank()
            assert model.grid.get_data_parallel_rank() == mpu.get_data_parallel_rank()
        model = [model]

    if args.load is not None:
        timers = get_timers()
        # Extra barrier is added to make sure all ranks report the
        # max time.
        torch.distributed.barrier()
        timers('load-checkpoint').start()
        args.iteration = load_checkpoint(model, optimizer, lr_scheduler)
        torch.distributed.barrier()
        timers('load-checkpoint').stop()
        timers.log(['load-checkpoint'])
    else:
        args.iteration = 0

    # We only support local DDP with multiple micro-batches.
    if len(model) > 1 or mpu.get_pipeline_model_parallel_world_size() > 1:
        assert args.DDP_impl == 'local'

    # get model without FP16 and/or TorchDDP wrappers
    if args.iteration == 0 and len(unwrapped_model) == 1 \
            and hasattr(unwrapped_model[0], 'init_state_dict_from_bert'):
        print_rank_0("Initializing ICT from pretrained BERT model")
        unwrapped_model[0].init_state_dict_from_bert()
        if args.fp16:
            optimizer.reload_model_params()

    return model, optimizer, lr_scheduler

This is the function my code uses.
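Since the ds-config posted below already defines an AdamW optimizer and a WarmupLR scheduler, another option from the exception message is to not pass the Megatron/apex optimizer to deepspeed.initialize at all and let DeepSpeed build one from the config. A rough, untested sketch of how the DeepSpeed branch of the function above might look under that assumption (the rest of the Megatron optimizer/scheduler plumbing would have to tolerate this):

# Sketch (assumption): pass model parameters instead of the FusedAdam instance
# returned by get_megatron_optimizer(), so DeepSpeed constructs the optimizer
# and scheduler from the "optimizer"/"scheduler" blocks in $DS_CONFIG.
if args.deepspeed:
    print_rank_0("DeepSpeed is enabled.")
    model, optimizer, _, lr_scheduler = deepspeed.initialize(
        model=model[0],
        model_parameters=model[0].parameters(),  # parameters, not a client optimizer
        args=args,
        mpu=mpu if args.no_pipeline_parallel else None
    )
    model = [model]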

@wangha9264
Author

This is my config file:
cat <<EOT > $DS_CONFIG
{
  "train_micro_batch_size_per_gpu": $MICRO_BATCH,
  "steps_per_print": 100,
  "gradient_clipping": 4,
  "zero_optimization": {
    "stage": $ZERO_STAGE,
    "allgather_partitions": false,
    "reduce_bucket_size": 50000000,
    "allgather_bucket_size": 50000000,
    "offload_optimizer": {
      "device": "cpu",
      "pin_memory": true
    },
    "offload_param": {
      "device": "cpu",
      "pin_memory": true
    },
    "stage3_prefetch_bucket_size": "auto",
    "stage3_param_persistence_threshold": "auto",
    "stage3_max_live_parameters": 1e9,
    "stage3_max_reuse_distance": 1e9,
    "sub_group_size": 1e9,
    "stage3_gather_fp16_weights_on_model_save": true
  },
  "bfloat16": {
    "enabled": false
  },
  "fp16": {
    "enabled": true,
    "loss_scale": 0,
    "loss_scale_window": 500,
    "hysteresis": 2,
    "min_loss_scale": 1,
    "initial_scale_power": 12
  },
  "optimizer": {
    "type": "AdamW",
    "params": {
      "lr": "auto",
      "betas": "auto",
      "eps": "auto",
      "weight_decay": "auto"
    }
  },
  "scheduler": {
    "type": "WarmupLR",
    "params": {
      "warmup_min_lr": "auto",
      "warmup_max_lr": "auto",
      "warmup_num_steps": "auto"
    }
  }
}
EOT
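If keeping apex FusedAdam together with ZeRO-Offload is really intended, the exception also allows opting out of the check by setting the top-level key "zero_force_ds_cpu_optimizer": false in this config, accepting the performance hit it warns about. The key can be added directly inside the heredoc above; the following is just a small Python sketch of patching an already-written config file, with the path as a placeholder.

# Sketch (assumption): disable the ZeRO-Offload optimizer check by adding the
# flag named in the exception to the generated DeepSpeed config file.
import json

ds_config_path = "ds_config.json"              # placeholder for $DS_CONFIG
with open(ds_config_path) as f:
    cfg = json.load(f)
cfg["zero_force_ds_cpu_optimizer"] = False     # acknowledge the expected slowdown
with open(ds_config_path, "w") as f:
    json.dump(cfg, f, indent=2)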

@TimDettmers
Owner

It seems this is a problem with DeepSpeed and the Apex optimizer. It is unrelated to bitsandbytes.

@TimDettmers TimDettmers added the invalid This doesn't seem right label Jul 15, 2023