[bugfix] DeepSpeed with no schedulers (#8580)
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
tchaton and pre-commit-ci[bot] committed Jul 27, 2021
1 parent 39de7fe commit c7f8c8c
Showing 3 changed files with 27 additions and 3 deletions.
3 changes: 3 additions & 0 deletions CHANGELOG.md
@@ -236,6 +236,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
- Fixed `BackboneFinetuning` restoration ([#8501](https://github.com/PyTorchLightning/pytorch-lightning/pull/8501))
- Fixed `lr_scheduler` with metric (e.g. `torch.optim.lr_scheduler.ReduceLROnPlateau`) when using `automatic_optimization = False` ([#7643](https://github.com/PyTorchLightning/pytorch-lightning/pull/7643))

- Fixed `DeepSpeed` breaking with no schedulers ([#8580](https://github.com/PyTorchLightning/pytorch-lightning/pull/8580))



## [1.3.8] - 2021-07-01

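Note on the entry above: the fix covers the case where `configure_optimizers` returns an optimizer but no LR scheduler. A minimal sketch of the two return shapes follows — the module is illustrative only (its `self.layer` and hyperparameters are assumptions, not part of this commit); the scheduler-free variant is the one that previously crashed under the DeepSpeed plugin:

```python
import torch
from pytorch_lightning import LightningModule


class NoSchedulerSketch(LightningModule):
    """Illustrative module; not part of the commit."""

    def __init__(self):
        super().__init__()
        self.layer = torch.nn.Linear(32, 2)

    def configure_optimizers(self):
        # Scheduler-free shape: return only the optimizer. Before this fix the
        # DeepSpeed plugin assumed a scheduler config dict was always present
        # and failed on this shape.
        return torch.optim.SGD(self.layer.parameters(), lr=0.1)

    # The with-scheduler shape, for contrast, would be:
    # def configure_optimizers(self):
    #     optimizer = torch.optim.SGD(self.layer.parameters(), lr=0.1)
    #     scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1)
    #     return [optimizer], [scheduler]
```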
8 changes: 5 additions & 3 deletions pytorch_lightning/plugins/training_type/deepspeed.py
@@ -397,7 +397,7 @@ def _init_optimizers(self) -> Tuple[Optimizer, Optional[Union[LRSchedulerTypeTup
            )
        return (
            optimizers[0],
-            schedulers[0] if schedulers else None,
+            schedulers[0] if schedulers else _get_default_scheduler_config(),
            optimizer_frequencies[0] if optimizer_frequencies else None,
        )

@@ -414,6 +414,7 @@ def _initialize_deepspeed_train(self, model):
                "Using `configure_optimizers` to define optimizer and scheduler."
            )
            optimizer, lr_scheduler, _ = self._init_optimizers()

+        scheduler = lr_scheduler["scheduler"]

        model_parameters = filter(lambda p: p.requires_grad, self.model.parameters())
@@ -430,8 +431,9 @@ def _initialize_deepspeed_train(self, model):

        # although we set these here, deepspeed manages the specific optimizer logic
        self.lightning_module.trainer.optimizers = [deepspeed_optimizer]
-        lr_scheduler["scheduler"] = deepspeed_scheduler
-        self.lightning_module.trainer.lr_schedulers = [lr_scheduler]
+        if deepspeed_scheduler is not None:
+            lr_scheduler["scheduler"] = deepspeed_scheduler
+            self.lightning_module.trainer.lr_schedulers = [lr_scheduler]
        self.model = model

    @contextlib.contextmanager
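Taken together, the hunks above mean `_init_optimizers` now always hands back a scheduler config dict: when the user configured no scheduler, the dict's `"scheduler"` entry is `None`, DeepSpeed therefore builds no scheduler, and the new `if deepspeed_scheduler is not None:` guard skips registering one on the trainer instead of item-assigning into `None`. A rough sketch of such a default config — the exact key set is an assumption about Lightning's internal `_get_default_scheduler_config` helper at the time, not something shown in this diff:

```python
from typing import Any, Dict


def default_scheduler_config_sketch() -> Dict[str, Any]:
    """Approximate shape of the fallback config; the key set is assumed."""
    return {
        "scheduler": None,          # no scheduler was configured by the user
        "name": None,
        "interval": "epoch",
        "frequency": 1,
        "reduce_on_plateau": False,
        "monitor": None,
        "strict": True,
        "opt_idx": None,
    }


# With the fallback in place, the plugin can index the config unconditionally:
lr_scheduler = default_scheduler_config_sketch()
scheduler = lr_scheduler["scheduler"]  # None when no scheduler was configured
assert scheduler is None               # so nothing gets registered on trainer.lr_schedulers
```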
19 changes: 19 additions & 0 deletions tests/plugins/test_deepspeed_plugin.py
@@ -33,6 +33,11 @@ def on_load_checkpoint(self, checkpoint: Dict[str, Any]) -> None:
        self.configure_sharded_model()


+class ModelParallelBoringModelNoSchedulers(ModelParallelBoringModel):
+    def configure_optimizers(self):
+        return torch.optim.SGD(self.layer.parameters(), lr=0.1)
+
+
class ModelParallelBoringModelManualOptim(BoringModel):
    def __init__(self):
        super().__init__()
@@ -687,3 +692,17 @@ def _assert_save_model_is_equal(model, tmpdir, trainer, cls=BoringModel):
    # Assert model parameters are identical after loading
    for orig_param, trained_model_param in zip(model.parameters(), saved_model.parameters()):
        assert torch.equal(orig_param, trained_model_param)
+
+
+@RunIf(min_gpus=2, deepspeed=True, special=True)
+def test_deepspeed_multigpu_no_schedulers(tmpdir):
+    """
+    Test to ensure ZeRO Stage 3 works with a parallel model and no schedulers.
+    """
+    model = ModelParallelBoringModelNoSchedulers()
+    trainer = Trainer(
+        default_root_dir=tmpdir, plugins=[DeepSpeedPlugin(stage=3)], gpus=2, fast_dev_run=True, precision=16
+    )
+    trainer.fit(model)
+
+    _assert_save_model_is_equal(model, tmpdir, trainer, cls=ModelParallelBoringModelNoSchedulers)

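The new test is gated by `@RunIf(min_gpus=2, deepspeed=True, special=True)`, so it needs two GPUs, a DeepSpeed install, and Lightning's "special test" mode. A hedged sketch of a local invocation — the `PL_RUNNING_SPECIAL_TESTS` variable is an assumption about how the test suite gated `special=True` tests around this release:

```python
import os

import pytest

# Assumption: special=True tests are skipped unless this variable is set, mirroring
# what the repository's special-tests CI script exported around this release.
os.environ["PL_RUNNING_SPECIAL_TESTS"] = "1"

# Run just the new test; requires 2 visible GPUs and deepspeed installed.
pytest.main([
    "tests/plugins/test_deepspeed_plugin.py::test_deepspeed_multigpu_no_schedulers",
    "-v",
])
```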