
ZeRORuntimeException #596

Closed
wangha9264 opened this issue Jul 15, 2023 · 3 comments
Labels
invalid This doesn't seem right

Comments

@wangha9264

File "/xddata/home/wanghaha/LM/src/megatron/training.py", line 122, in pretrain
model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider)
File "/xddata/home/wanghaha/LM/src/megatron/training.py", line 338, in setup_model_and_optimizer
model, optimizer, _, lr_scheduler = deepspeed.initialize(
File "/xddata/home/wanghaha/LM/src/deepspeed/init.py", line 165, in initialize
engine = DeepSpeedEngine(args=args,
File "/xddata/home/wanghaha/LM/src/deepspeed/runtime/engine.py", line 309, in init
self._configure_optimizer(optimizer, model_parameters)
File "/xddata/home/wanghaha/LM/src/deepspeed/runtime/engine.py", line 1172, in _configure_optimizer
self._configure_optimizer(optimizer, model_parameters)
File "/xddata/home/wanghaha/LM/src/deepspeed/runtime/engine.py", line 1172, in _configure_optimizer
self._configure_optimizer(optimizer, model_parameters)
File "/xddata/home/wanghaha/LM/src/deepspeed/runtime/engine.py", line 1172, in _configure_optimizer
self._configure_optimizer(optimizer, model_parameters)
File "/xddata/home/wanghaha/LM/src/deepspeed/runtime/engine.py", line 1172, in _configure_optimizer
raise ZeRORuntimeException(msg)
deepspeed.runtime.zero.utils.ZeRORuntimeException: You are using ZeRO-Offload with a client provided optimizer (<class 'apex.optimizers.fused_adam.FusedAdam'>) which in most cases will yield poor performance. Please either use deepspeed.ops.adam.DeepSpeedCPUAdam or set an optimizer in your ds-config (https://www.deepspeed.ai/docs/config-json/#optimizer-parameters). If you really want to use a custom optimizer w. ZeRO-Offload and understand the performance impacts you can also set <"zero_force_ds_cpu_optimizer": false> in your configuration file.
raise ZeRORuntimeException(msg)

The error above occurs when using ZeRO-3 together with offload.
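From the exception text, the check fires because a client-provided apex FusedAdam instance reaches deepspeed.initialize while ZeRO-Offload is active. Below is a minimal, hedged sketch of the first remedy the message names (deepspeed.ops.adam.DeepSpeedCPUAdam); the model, learning rate, weight decay, and config path are placeholders, not values taken from this setup.

# Sketch: build the optimizer with DeepSpeed's CPU Adam instead of apex FusedAdam.
# All concrete values below are placeholders.
import torch
import deepspeed
from deepspeed.ops.adam import DeepSpeedCPUAdam

model = torch.nn.Linear(1024, 1024)                      # placeholder model
optimizer = DeepSpeedCPUAdam(model.parameters(),
                             lr=1e-4,                    # placeholder
                             weight_decay=0.01)          # placeholder

# With "offload_optimizer": {"device": "cpu"}, a CPU-capable optimizer like
# DeepSpeedCPUAdam is what the ZeRO-Offload check expects.
engine, optimizer, _, _ = deepspeed.initialize(model=model,
                                               optimizer=optimizer,
                                               config="ds_config.json")  # placeholder path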

@wangha9264
Author

def setup_model_and_optimizer(model_provider_func, load_lr_scheduler=True):
    """Setup model and optimizer."""
    args = get_args()

    model = get_model(model_provider_func)

    unwrapped_model = unwrap_model(model,
                                   (torchDDP, LocalDDP, Float16Module))

    optimizer = get_megatron_optimizer(unwrapped_model)
    lr_scheduler = get_learning_rate_scheduler(optimizer)
    print("-----------------optimizer--------------" + str(optimizer))

    if args.deepspeed:
        print_rank_0("DeepSpeed is enabled.")
        pp = mpu.get_pipeline_model_parallel_world_size()
        model, optimizer, _, lr_scheduler = deepspeed.initialize(
            model=model[0],
            optimizer=optimizer,
            args=args,
            lr_scheduler=lr_scheduler,
            mpu=mpu if args.no_pipeline_parallel else None
        )
        # print_rank_0("FinishInitialization.")
        if isinstance(model, deepspeed.PipelineEngine):
            # hack to get batch_fn from pretrain_gpt.py
            model.set_batch_fn(model.module._megatron_batch_fn)

            assert model.grid.get_pipe_parallel_rank() == mpu.get_pipeline_model_parallel_rank()
            assert model.grid.get_slice_parallel_rank() == mpu.get_tensor_model_parallel_rank()
            assert model.grid.get_data_parallel_rank() == mpu.get_data_parallel_rank()
        model = [model]

    if args.load is not None:
        timers = get_timers()
        # Extra barrier is added to make sure all ranks report the
        # max time.
        torch.distributed.barrier()
        timers('load-checkpoint').start()
        args.iteration = load_checkpoint(model, optimizer, lr_scheduler)
        torch.distributed.barrier()
        timers('load-checkpoint').stop()
        timers.log(['load-checkpoint'])
    else:
        args.iteration = 0

    # We only support local DDP with multiple micro-batches.
    if len(model) > 1 or mpu.get_pipeline_model_parallel_world_size() > 1:
        assert args.DDP_impl == 'local'

    # get model without FP16 and/or TorchDDP wrappers
    if args.iteration == 0 and len(unwrapped_model) == 1 \
            and hasattr(unwrapped_model[0], 'init_state_dict_from_bert'):
        print_rank_0("Initializing ICT from pretrained BERT model")
        unwrapped_model[0].init_state_dict_from_bert()
        if args.fp16:
            optimizer.reload_model_params()

    return model, optimizer, lr_scheduler

This is the function my code uses.
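Since the ds-config posted below already defines an AdamW optimizer and a WarmupLR scheduler, another option from the exception message is to not pass the Megatron/apex optimizer to deepspeed.initialize at all and let DeepSpeed build one from the config. A rough, untested sketch of how the DeepSpeed branch of the function above might look under that assumption (the rest of the Megatron optimizer/scheduler plumbing would have to tolerate this):

# Sketch (assumption): pass model parameters instead of the FusedAdam instance
# returned by get_megatron_optimizer(), so DeepSpeed constructs the optimizer
# and scheduler from the "optimizer"/"scheduler" blocks in $DS_CONFIG.
if args.deepspeed:
    print_rank_0("DeepSpeed is enabled.")
    model, optimizer, _, lr_scheduler = deepspeed.initialize(
        model=model[0],
        model_parameters=model[0].parameters(),  # parameters, not a client optimizer
        args=args,
        mpu=mpu if args.no_pipeline_parallel else None
    )
    model = [model]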

@wangha9264
Author

This is my config file:
cat <<EOT > $DS_CONFIG
{
  "train_micro_batch_size_per_gpu": $MICRO_BATCH,
  "steps_per_print": 100,
  "gradient_clipping": 4,
  "zero_optimization": {
    "stage": $ZERO_STAGE,
    "allgather_partitions": false,
    "reduce_bucket_size": 50000000,
    "allgather_bucket_size": 50000000,
    "offload_optimizer": {
      "device": "cpu",
      "pin_memory": true
    },
    "offload_param": {
      "device": "cpu",
      "pin_memory": true
    },
    "stage3_prefetch_bucket_size": "auto",
    "stage3_param_persistence_threshold": "auto",
    "stage3_max_live_parameters": 1e9,
    "stage3_max_reuse_distance": 1e9,
    "sub_group_size": 1e9,
    "stage3_gather_fp16_weights_on_model_save": true
  },
  "bfloat16": {
    "enabled": false
  },
  "fp16": {
    "enabled": true,
    "loss_scale": 0,
    "loss_scale_window": 500,
    "hysteresis": 2,
    "min_loss_scale": 1,
    "initial_scale_power": 12
  },
  "optimizer": {
    "type": "AdamW",
    "params": {
      "lr": "auto",
      "betas": "auto",
      "eps": "auto",
      "weight_decay": "auto"
    }
  },
  "scheduler": {
    "type": "WarmupLR",
    "params": {
      "warmup_min_lr": "auto",
      "warmup_max_lr": "auto",
      "warmup_num_steps": "auto"
    }
  }
}
EOT
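If keeping apex FusedAdam together with ZeRO-Offload is really intended, the exception also allows opting out of the check by setting the top-level key "zero_force_ds_cpu_optimizer": false in this config, accepting the performance hit it warns about. The key can be added directly inside the heredoc above; the following is just a small Python sketch of patching an already-written config file, with the path as a placeholder.

# Sketch (assumption): disable the ZeRO-Offload optimizer check by adding the
# flag named in the exception to the generated DeepSpeed config file.
import json

ds_config_path = "ds_config.json"              # placeholder for $DS_CONFIG
with open(ds_config_path) as f:
    cfg = json.load(f)
cfg["zero_force_ds_cpu_optimizer"] = False     # acknowledge the expected slowdown
with open(ds_config_path, "w") as f:
    json.dump(cfg, f, indent=2)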

@TimDettmers
Owner

It seems this is a problem with DeepSpeed and the Apex optimizer. It is unrelated to bitsandbytes.

@TimDettmers TimDettmers added the invalid This doesn't seem right label Jul 15, 2023