From e122e164568b65c5598a35d5214ba2dc2422c94c Mon Sep 17 00:00:00 2001 From: Zhou Wei <52485244+zhouwei25@users.noreply.github.com> Date: Tue, 13 Oct 2020 11:17:39 +0800 Subject: [PATCH] fix english doc, unittest, and remove useless alias of 2.0 lr_scheduler (#27686) * fix doc and unittest of 2.0 lr_scheduler * fix doc of 2.0 lr_scheduler * fix unittest * fix english doc of lr_scheduler * fix api name of lr scheduler * fix api name of lr scheduler --- python/paddle/__init__.py | 7 - python/paddle/fluid/dygraph/checkpoint.py | 2 +- python/paddle/fluid/executor.py | 8 +- python/paddle/fluid/optimizer.py | 24 +- .../fluid/tests/unittests/test_adam_op.py | 4 +- .../unittests/test_directory_migration.py | 24 +- .../unittests/test_imperative_optimizer_v2.py | 322 +++++++----- .../unittests/test_imperative_save_load_v2.py | 10 +- .../tests/unittests/test_lr_scheduler.py | 55 ++- python/paddle/framework/__init__.py | 13 - python/paddle/framework/io.py | 4 +- python/paddle/nn/__init__.py | 7 - python/paddle/nn/functional/__init__.py | 8 - python/paddle/nn/functional/learning_rate.py | 29 -- python/paddle/nn/layer/__init__.py | 7 - python/paddle/nn/layer/learning_rate.py | 25 - python/paddle/optimizer/__init__.py | 9 +- python/paddle/optimizer/adam.py | 4 +- python/paddle/optimizer/adamax.py | 4 +- python/paddle/optimizer/adamw.py | 4 +- .../optimizer/{lr_scheduler.py => lr.py} | 462 ++++++++++-------- python/paddle/optimizer/optimizer.py | 80 ++- python/paddle/optimizer/rmsprop.py | 4 +- 23 files changed, 570 insertions(+), 546 deletions(-) delete mode 100644 python/paddle/nn/functional/learning_rate.py delete mode 100644 python/paddle/nn/layer/learning_rate.py rename python/paddle/optimizer/{lr_scheduler.py => lr.py} (77%) diff --git a/python/paddle/__init__.py b/python/paddle/__init__.py index 0af32da4e690b..e1d9450cd5945 100755 --- a/python/paddle/__init__.py +++ b/python/paddle/__init__.py @@ -237,13 +237,6 @@ from .framework import load #DEFINE_ALIAS from .framework import DataParallel #DEFINE_ALIAS -from .framework import NoamDecay #DEFINE_ALIAS -from .framework import PiecewiseDecay #DEFINE_ALIAS -from .framework import NaturalExpDecay #DEFINE_ALIAS -from .framework import ExponentialDecay #DEFINE_ALIAS -from .framework import InverseTimeDecay #DEFINE_ALIAS -from .framework import PolynomialDecay #DEFINE_ALIAS -from .framework import CosineDecay #DEFINE_ALIAS from .framework import set_default_dtype #DEFINE_ALIAS from .framework import get_default_dtype #DEFINE_ALIAS diff --git a/python/paddle/fluid/dygraph/checkpoint.py b/python/paddle/fluid/dygraph/checkpoint.py index fb87ea4455d34..5ffbb8c1b51ce 100644 --- a/python/paddle/fluid/dygraph/checkpoint.py +++ b/python/paddle/fluid/dygraph/checkpoint.py @@ -164,7 +164,7 @@ def load_dygraph(model_path, **configs): state_dict = emb.state_dict() fluid.save_dygraph(state_dict, "paddle_dy") - scheduler = paddle.optimizer.lr_scheduler.NoamLR( + scheduler = paddle.optimizer.lr.NoamDecay( d_model=0.01, warmup_steps=100, verbose=True) adam = paddle.optimizer.Adam( learning_rate=scheduler, diff --git a/python/paddle/fluid/executor.py b/python/paddle/fluid/executor.py index f5660c3fc91a1..ab9dd5d919a14 100644 --- a/python/paddle/fluid/executor.py +++ b/python/paddle/fluid/executor.py @@ -855,7 +855,7 @@ def close(self): def _run_parallel(self, program, scope, feed, fetch_list, fetch_var_name, return_numpy, return_merged): - from paddle.optimizer.lr_scheduler import _LRScheduler + from paddle.optimizer.lr import LRScheduler exe = program._executor # 
TODO(zhenghuihuang): quantization uses Graph in CompiledProgram # instead of program. We will add support for checking Vars in Graph @@ -901,7 +901,7 @@ def _run_parallel(self, program, scope, feed, fetch_list, fetch_var_name, if hasattr(program._program, 'lr_sheduler'): lr_sheduler = program._program.lr_sheduler - assert isinstance(lr_sheduler, _LRScheduler), "must be _LRScheduler" + assert isinstance(lr_sheduler, LRScheduler), "must be LRScheduler" lr_value = lr_sheduler() lr_var = program._program.global_block().vars[lr_sheduler._var_name] lr_tensor = _as_lodtensor(lr_value, core.CPUPlace(), lr_var.dtype) @@ -1238,7 +1238,7 @@ def _run_impl(self, program, feed, fetch_list, feed_var_name, def _run_program(self, program, feed, fetch_list, feed_var_name, fetch_var_name, scope, return_numpy, use_program_cache): - from paddle.optimizer.lr_scheduler import _LRScheduler + from paddle.optimizer.lr import LRScheduler if feed is None: feed = {} elif isinstance(feed, (list, tuple)): @@ -1296,7 +1296,7 @@ def _run_program(self, program, feed, fetch_list, feed_var_name, self._feed_data(program, feed, feed_var_name, scope) if hasattr(program, 'lr_sheduler'): assert isinstance(program.lr_sheduler, - _LRScheduler), "must be _LRScheduler" + LRScheduler), "must be LRScheduler" lr_sheduler = program.lr_sheduler lr_value = lr_sheduler() lr_var = program.global_block().vars[lr_sheduler._var_name] diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py index 367be181f4725..cf49268a657e4 100755 --- a/python/paddle/fluid/optimizer.py +++ b/python/paddle/fluid/optimizer.py @@ -70,15 +70,15 @@ def __init__(self, grad_clip=None, name=None): # Because of the loop import, so place it in the function body - from paddle.optimizer.lr_scheduler import _LRScheduler + from paddle.optimizer.lr import LRScheduler self._parameter_list = list( parameter_list) if parameter_list is not None else None self._name = name if framework.in_dygraph_mode(): if not isinstance(learning_rate, - (float, LearningRateDecay, _LRScheduler)): + (float, LearningRateDecay, LRScheduler)): raise TypeError( - "learning rate should be float or _LRScheduler, got %s here" + "learning rate should be float or LRScheduler, got %s here" % type(learning_rate)) if self._parameter_list is None: raise AttributeError( @@ -94,9 +94,9 @@ def __init__(self, break else: if not isinstance(learning_rate, - (float, framework.Variable, _LRScheduler)): + (float, framework.Variable, LRScheduler)): raise TypeError( - "learning rate should be float or _LRScheduler, got %s here" + "learning rate should be float or LRScheduler, got %s here" % type(learning_rate)) if grad_clip is not None: @@ -147,13 +147,13 @@ def state_dict(self): state_dict = adam.state_dict() ''' - from paddle.optimizer.lr_scheduler import _LRScheduler + from paddle.optimizer.lr import LRScheduler state_dict = {} for k, v in self._accumulators.items(): for para_name, var_tmp in v.items(): state_dict[var_tmp.name] = var_tmp # global step if use lr decay - if isinstance(self._learning_rate, _LRScheduler): + if isinstance(self._learning_rate, LRScheduler): state_dict["LR_Scheduler"] = self._learning_rate.state_dict() return state_dict if isinstance(self._learning_rate, LearningRateDecay): @@ -193,7 +193,7 @@ def set_state_dict(self, state_dict): state_dict = emb.state_dict() fluid.save_dygraph(state_dict, "paddle_dy") - scheduler = paddle.optimizer.lr_scheduler.NoamLR( + scheduler = paddle.optimizer.lr.NoamDecay( d_model=0.01, warmup_steps=100, verbose=True) adam = 
paddle.optimizer.Adam( learning_rate=scheduler, @@ -203,8 +203,8 @@ def set_state_dict(self, state_dict): para_state_dict, opti_state_dict = fluid.load_dygraph("paddle_dy") ''' - from paddle.optimizer.lr_scheduler import _LRScheduler - if isinstance(self._learning_rate, _LRScheduler): + from paddle.optimizer.lr import LRScheduler + if isinstance(self._learning_rate, LRScheduler): self._learning_rate.set_dict(state_dict["LR_Scheduler"]) if isinstance(self._learning_rate, LearningRateDecay): @@ -269,8 +269,8 @@ def get_opti_var_name_list(self): return self._opti_name_list def _create_global_learning_rate(self): - from paddle.optimizer.lr_scheduler import _LRScheduler - if isinstance(self._learning_rate, _LRScheduler): + from paddle.optimizer.lr import LRScheduler + if isinstance(self._learning_rate, LRScheduler): lr_var = self._global_learning_rate() # only create global lr_var once if not isinstance(lr_var, framework.Variable): diff --git a/python/paddle/fluid/tests/unittests/test_adam_op.py b/python/paddle/fluid/tests/unittests/test_adam_op.py index 47bf8f49e39b6..5167922ccce43 100644 --- a/python/paddle/fluid/tests/unittests/test_adam_op.py +++ b/python/paddle/fluid/tests/unittests/test_adam_op.py @@ -455,8 +455,8 @@ def test_adam_op_with_state_dict(self): state_dict = adam.state_dict() adam.set_state_dict(state_dict) - #learning_rate is _LRScheduler - learning_rate = paddle.optimizer.CosineAnnealingLR( + #learning_rate is LRScheduler + learning_rate = paddle.optimizer.lr.CosineAnnealingDecay( learning_rate=0.1, T_max=10) adam = paddle.optimizer.Adam( learning_rate=learning_rate, diff --git a/python/paddle/fluid/tests/unittests/test_directory_migration.py b/python/paddle/fluid/tests/unittests/test_directory_migration.py index 28232e9ba4dc0..9dbbdeee31427 100644 --- a/python/paddle/fluid/tests/unittests/test_directory_migration.py +++ b/python/paddle/fluid/tests/unittests/test_directory_migration.py @@ -43,14 +43,22 @@ def test_new_directory(self): 'paddle.distributed.prepare_context', 'paddle.DataParallel', 'paddle.jit', 'paddle.jit.TracedLayer', 'paddle.jit.to_static', 'paddle.jit.ProgramTranslator', 'paddle.jit.TranslatedLayer', - 'paddle.jit.save', 'paddle.jit.load', 'paddle.NoamDecay', - 'paddle.PiecewiseDecay', 'paddle.NaturalExpDecay', - 'paddle.ExponentialDecay', 'paddle.InverseTimeDecay', - 'paddle.PolynomialDecay', 'paddle.CosineDecay', - 'paddle.static.Executor', 'paddle.static.global_scope', - 'paddle.static.scope_guard', 'paddle.static.append_backward', - 'paddle.static.gradients', 'paddle.static.BuildStrategy', - 'paddle.static.CompiledProgram', 'paddle.static.ExecutionStrategy', + 'paddle.jit.save', 'paddle.jit.load', + 'paddle.optimizer.lr.LRScheduler', 'paddle.optimizer.lr.NoamDecay', + 'paddle.optimizer.lr.PiecewiseDecay', + 'paddle.optimizer.lr.NaturalExpDecay', + 'paddle.optimizer.lr.ExponentialDecay', + 'paddle.optimizer.lr.InverseTimeDecay', + 'paddle.optimizer.lr.PolynomialDecay', + 'paddle.optimizer.lr.CosineAnnealingDecay', + 'paddle.optimizer.lr.MultiStepDecay', + 'paddle.optimizer.lr.StepDecay', 'paddle.optimizer.lr.LambdaDecay', + 'paddle.optimizer.lr.ReduceOnPlateau', + 'paddle.optimizer.lr.LinearWarmup', 'paddle.static.Executor', + 'paddle.static.global_scope', 'paddle.static.scope_guard', + 'paddle.static.append_backward', 'paddle.static.gradients', + 'paddle.static.BuildStrategy', 'paddle.static.CompiledProgram', + 'paddle.static.ExecutionStrategy', 'paddle.static.default_main_program', 'paddle.static.default_startup_program', 'paddle.static.Program', 
'paddle.static.name_scope', 'paddle.static.program_guard', diff --git a/python/paddle/fluid/tests/unittests/test_imperative_optimizer_v2.py b/python/paddle/fluid/tests/unittests/test_imperative_optimizer_v2.py index 887e50f07c55c..e1b7847a6e6dd 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_optimizer_v2.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_optimizer_v2.py @@ -23,7 +23,7 @@ import paddle import paddle.fluid as fluid from paddle.fluid import core -from paddle.fluid.optimizer import SGDOptimizer, Adam, MomentumOptimizer, LarsMomentumOptimizer, AdagradOptimizer, AdamaxOptimizer, DpsgdOptimizer, DecayedAdagradOptimizer, AdadeltaOptimizer, RMSPropOptimizer, FtrlOptimizer, LambOptimizer +from paddle.fluid.optimizer import MomentumOptimizer, LarsMomentumOptimizer, AdagradOptimizer, AdamaxOptimizer, DpsgdOptimizer, DecayedAdagradOptimizer, AdadeltaOptimizer, RMSPropOptimizer, FtrlOptimizer, LambOptimizer from paddle.fluid.optimizer import ModelAverage, DGCMomentumOptimizer, ExponentialMovingAverage, PipelineOptimizer, LookaheadOptimizer, RecomputeOptimizer from paddle.fluid.dygraph import Linear from paddle.fluid.dygraph.base import to_variable @@ -72,15 +72,17 @@ def _check_exception(self, exception_message, place=None): place = fluid.CUDAPlace(0) if core.is_compiled_with_cuda( ) else fluid.CPUPlace() - with fluid.dygraph.guard(place): - try: - paddle.manual_seed(seed) - paddle.framework.random._manual_program_seed(seed) - mlp = MLP() - optimizer = self.get_optimizer_dygraph( - parameter_list=mlp.parameters()) - except Exception as e: - assert str(e) == exception_message + try: + paddle.disable_static() + paddle.manual_seed(seed) + paddle.framework.random._manual_program_seed(seed) + mlp = MLP() + optimizer = self.get_optimizer_dygraph( + parameter_list=mlp.parameters()) + except Exception as e: + assert str(e) == exception_message + finally: + paddle.enable_static() def _check_mlp(self, place=None): seed = 90 @@ -90,47 +92,55 @@ def _check_mlp(self, place=None): place = fluid.CPUPlace() if not core.is_compiled_with_cuda( ) else fluid.CUDAPlace(0) - with fluid.dygraph.guard(place): - paddle.manual_seed(seed) - paddle.framework.random._manual_program_seed(seed) + paddle.disable_static(place) + paddle.manual_seed(seed) + paddle.framework.random._manual_program_seed(seed) - mlp = MLP() - optimizer = self.get_optimizer_dygraph( - parameter_list=mlp.parameters()) + mlp = MLP() + optimizer = self.get_optimizer_dygraph(parameter_list=mlp.parameters()) - batch_py_reader = fluid.io.PyReader(capacity=1) - batch_py_reader.decorate_sample_list_generator( - paddle.batch( - self.reader_decorator(paddle.dataset.mnist.train()), - batch_size=batch_size, - drop_last=True), - places=fluid.CPUPlace()) + batch_py_reader = fluid.io.PyReader(capacity=1) + batch_py_reader.decorate_sample_list_generator( + paddle.batch( + self.reader_decorator(paddle.dataset.mnist.train()), + batch_size=batch_size, + drop_last=True), + places=fluid.CPUPlace()) - dy_param_init_value = {} - for batch_id, data in enumerate(batch_py_reader()): - if batch_id >= self.batch_num: - break + dy_param_init_value = {} + for batch_id, data in enumerate(batch_py_reader()): + if batch_id >= self.batch_num: + break - img = data[0] - label = data[1] - label.stop_gradient = True + img = data[0] + label = data[1] - img = fluid.layers.reshape(img, shape=[batch_size, -1]) - cost = mlp(img) - avg_loss = fluid.layers.reduce_mean(cost) - dy_out = avg_loss.numpy() + label.stop_gradient = True - if batch_id == 0: - for 
param in mlp.parameters(): - dy_param_init_value[param.name] = param.numpy() + img = fluid.layers.reshape(img, shape=[batch_size, -1]) + cost = mlp(img) + avg_loss = fluid.layers.reduce_mean(cost) + dy_out = avg_loss.numpy() - avg_loss.backward() - optimizer.minimize(avg_loss) - mlp.clear_gradients() - dy_param_value = {} + if batch_id == 0: for param in mlp.parameters(): - dy_param_value[param.name] = param.numpy() + dy_param_init_value[param.name] = param.numpy() + avg_loss.backward() + optimizer.minimize(avg_loss) + if isinstance(optimizer._learning_rate, + paddle.optimizer.lr.LRScheduler): + if isinstance(optimizer._learning_rate, + paddle.optimizer.lr.ReduceOnPlateau): + optimizer._learning_rate.step(avg_loss) + else: + optimizer._learning_rate.step() + mlp.clear_gradients() + dy_param_value = {} + for param in mlp.parameters(): + dy_param_value[param.name] = param.numpy() + + paddle.enable_static() with new_program_scope(): paddle.manual_seed(seed) paddle.framework.random._manual_program_seed(seed) @@ -181,6 +191,13 @@ def _check_mlp(self, place=None): feed={"pixel": static_x_data, "label": y_data}, fetch_list=fetch_list) + if isinstance(optimizer._learning_rate, + paddle.optimizer.lr.LRScheduler): + if isinstance(optimizer._learning_rate, + paddle.optimizer.lr.ReduceOnPlateau): + optimizer._learning_rate.step(out[0]) + else: + optimizer._learning_rate.step() static_param_value = {} static_out = out[0] @@ -199,17 +216,19 @@ def _check_mlp(self, place=None): class TestImperativeOptimizerPiecewiseDecay(TestImperativeOptimizerBase): def get_optimizer_dygraph(self, parameter_list): bd = [3, 6, 9] - optimizer = SGDOptimizer( - learning_rate=paddle.optimizer.PiecewiseLR( + optimizer = paddle.optimizer.SGD( + learning_rate=paddle.optimizer.lr.PiecewiseDecay( boundaries=bd, values=[0.1 * (0.1**i) for i in range(len(bd) + 1)]), - parameter_list=parameter_list) + parameters=parameter_list) return optimizer def get_optimizer(self): bd = [3, 6, 9] - optimizer = SGDOptimizer(learning_rate=paddle.optimizer.PiecewiseLR( - boundaries=bd, values=[0.1 * (0.1**i) for i in range(len(bd) + 1)])) + optimizer = paddle.optimizer.SGD( + learning_rate=paddle.optimizer.lr.PiecewiseDecay( + boundaries=bd, + values=[0.1 * (0.1**i) for i in range(len(bd) + 1)])) return optimizer def test_sgd(self): @@ -218,21 +237,16 @@ def test_sgd(self): class TestImperativeOptimizerNaturalExpDecay(TestImperativeOptimizerBase): def get_optimizer_dygraph(self, parameter_list): - optimizer = SGDOptimizer( - learning_rate=fluid.layers.natural_exp_decay( - learning_rate=0.1, - decay_steps=10000, - decay_rate=0.5, - staircase=True), - parameter_list=parameter_list) + optimizer = paddle.optimizer.SGD( + learning_rate=paddle.optimizer.lr.NaturalExpDecay( + learning_rate=0.5, gamma=0.9), + parameters=parameter_list) return optimizer def get_optimizer(self): - optimizer = SGDOptimizer(learning_rate=fluid.layers.natural_exp_decay( - learning_rate=0.1, - decay_steps=10000, - decay_rate=0.5, - staircase=True)) + optimizer = paddle.optimizer.SGD( + learning_rate=paddle.optimizer.lr.NaturalExpDecay( + learning_rate=0.5, gamma=0.9)) return optimizer def test_sgd(self): @@ -241,21 +255,16 @@ def test_sgd(self): class TestImperativeOptimizerExponentialDecay(TestImperativeOptimizerBase): def get_optimizer_dygraph(self, parameter_list): - optimizer = SGDOptimizer( - learning_rate=fluid.layers.exponential_decay( - learning_rate=0.1, - decay_steps=10000, - decay_rate=0.5, - staircase=True), - parameter_list=parameter_list) + optimizer = 
paddle.optimizer.SGD( + learning_rate=paddle.optimizer.lr.ExponentialDecay( + learning_rate=0.5, gamma=0.9), + parameters=parameter_list) return optimizer def get_optimizer(self): - optimizer = SGDOptimizer(learning_rate=fluid.layers.exponential_decay( - learning_rate=0.1, - decay_steps=10000, - decay_rate=0.5, - staircase=True)) + optimizer = paddle.optimizer.SGD( + learning_rate=paddle.optimizer.lr.ExponentialDecay( + learning_rate=0.5, gamma=0.9)) return optimizer def test_sgd(self): @@ -264,21 +273,16 @@ def test_sgd(self): class TestImperativeOptimizerInverseTimeDecay(TestImperativeOptimizerBase): def get_optimizer_dygraph(self, parameter_list): - optimizer = Adam( - learning_rate=fluid.layers.inverse_time_decay( - learning_rate=0.1, - decay_steps=10000, - decay_rate=0.5, - staircase=True), - parameter_list=parameter_list) + optimizer = paddle.optimizer.Adam( + learning_rate=paddle.optimizer.lr.InverseTimeDecay( + learning_rate=0.5, gamma=0.9), + parameters=parameter_list) return optimizer def get_optimizer(self): - optimizer = Adam(learning_rate=fluid.layers.inverse_time_decay( - learning_rate=0.1, - decay_steps=10000, - decay_rate=0.5, - staircase=True)) + optimizer = paddle.optimizer.Adam( + learning_rate=paddle.optimizer.lr.InverseTimeDecay( + learning_rate=0.5, gamma=0.9)) return optimizer def test_adam(self): @@ -287,15 +291,16 @@ def test_adam(self): class TestImperativeOptimizerPolynomialDecay(TestImperativeOptimizerBase): def get_optimizer_dygraph(self, parameter_list): - optimizer = SGDOptimizer( - learning_rate=fluid.layers.polynomial_decay( - learning_rate=0.1, decay_steps=5, cycle=self.cycle), - parameter_list=parameter_list) + optimizer = paddle.optimizer.SGD( + learning_rate=paddle.optimizer.lr.PolynomialDecay( + learning_rate=0.5, decay_steps=5, cycle=self.cycle), + parameters=parameter_list) return optimizer def get_optimizer(self): - optimizer = SGDOptimizer(learning_rate=fluid.layers.polynomial_decay( - learning_rate=0.1, decay_steps=5, cycle=self.cycle)) + optimizer = paddle.optimizer.SGD( + learning_rate=paddle.optimizer.lr.PolynomialDecay( + learning_rate=0.5, decay_steps=5, cycle=self.cycle)) return optimizer def test_sgd_cycle(self): @@ -307,17 +312,18 @@ def test_sgd(self): self._check_mlp() -class TestImperativeOptimizerCosineDecay(TestImperativeOptimizerBase): +class TestImperativeOptimizerCosineAnnealingDecay(TestImperativeOptimizerBase): def get_optimizer_dygraph(self, parameter_list): - optimizer = SGDOptimizer( - learning_rate=fluid.layers.cosine_decay( - learning_rate=0.1, step_each_epoch=10000, epochs=120), - parameter_list=parameter_list) + optimizer = paddle.optimizer.SGD( + learning_rate=paddle.optimizer.lr.CosineAnnealingDecay( + learning_rate=0.5, T_max=5), + parameters=parameter_list) return optimizer def get_optimizer(self): - optimizer = SGDOptimizer(learning_rate=fluid.layers.cosine_decay( - learning_rate=0.1, step_each_epoch=10000, epochs=120)) + optimizer = paddle.optimizer.SGD( + learning_rate=paddle.optimizer.lr.CosineAnnealingDecay( + learning_rate=0.5, T_max=5)) return optimizer def test_sgd(self): @@ -326,15 +332,110 @@ def test_sgd(self): class TestImperativeOptimizerNoamDecay(TestImperativeOptimizerBase): def get_optimizer_dygraph(self, parameter_list): - optimizer = SGDOptimizer( - learning_rate=fluid.layers.noam_decay( - d_model=512, warmup_steps=8000), - parameter_list=parameter_list) + optimizer = paddle.optimizer.SGD( + learning_rate=paddle.optimizer.lr.NoamDecay( + d_model=0.01, warmup_steps=100, verbose=True), + 
parameters=parameter_list) + return optimizer + + def get_optimizer(self): + optimizer = paddle.optimizer.SGD( + learning_rate=paddle.optimizer.lr.NoamDecay( + d_model=0.01, warmup_steps=100)) + return optimizer + + def test_sgd(self): + self._check_mlp() + + +class TestImperativeOptimizerLambdaDecay(TestImperativeOptimizerBase): + def get_optimizer_dygraph(self, parameter_list): + optimizer = paddle.optimizer.SGD( + learning_rate=paddle.optimizer.lr.LambdaDecay( + learning_rate=0.5, lr_lambda=lambda epoch: 0.9**epoch), + parameters=parameter_list) + return optimizer + + def get_optimizer(self): + optimizer = paddle.optimizer.SGD( + learning_rate=paddle.optimizer.lr.LambdaDecay( + learning_rate=0.5, lr_lambda=lambda epoch: 0.9**epoch)) + return optimizer + + def test_sgd(self): + self._check_mlp() + + +class TestImperativeOptimizerLinearWarmup(TestImperativeOptimizerBase): + def get_optimizer_dygraph(self, parameter_list): + optimizer = paddle.optimizer.SGD( + learning_rate=paddle.optimizer.lr.LinearWarmup( + learning_rate=0.5, warmup_steps=20, start_lr=0, end_lr=0.5), + parameters=parameter_list) + return optimizer + + def get_optimizer(self): + optimizer = paddle.optimizer.SGD( + learning_rate=paddle.optimizer.lr.LinearWarmup( + learning_rate=0.5, + warmup_steps=20, + start_lr=0, + end_lr=0.5, + verbose=True)) + return optimizer + + def test_sgd(self): + self._check_mlp() + + +class TestImperativeOptimizerMultiStepDecay(TestImperativeOptimizerBase): + def get_optimizer_dygraph(self, parameter_list): + optimizer = paddle.optimizer.SGD( + learning_rate=paddle.optimizer.lr.MultiStepDecay( + learning_rate=0.5, milestones=[2, 4, 6], gamma=0.8), + parameters=parameter_list) + return optimizer + + def get_optimizer(self): + optimizer = paddle.optimizer.SGD( + learning_rate=paddle.optimizer.lr.MultiStepDecay( + learning_rate=0.5, milestones=[2, 4, 6], gamma=0.8)) + return optimizer + + def test_sgd(self): + self._check_mlp() + + +class TestImperativeOptimizerStepLR(TestImperativeOptimizerBase): + def get_optimizer_dygraph(self, parameter_list): + optimizer = paddle.optimizer.SGD( + learning_rate=paddle.optimizer.lr.StepDecay( + learning_rate=0.5, step_size=5, gamma=0.8), + parameters=parameter_list) + return optimizer + + def get_optimizer(self): + optimizer = paddle.optimizer.SGD( + learning_rate=paddle.optimizer.lr.StepDecay( + learning_rate=0.5, step_size=5, gamma=0.8)) + return optimizer + + def test_sgd(self): + self._check_mlp() + + +class TestImperativeOptimizerReduceOnPlateau(TestImperativeOptimizerBase): + def get_optimizer_dygraph(self, parameter_list): + optimizer = paddle.optimizer.SGD( + learning_rate=paddle.optimizer.lr.ReduceOnPlateau( + learning_rate=0.5), + parameters=parameter_list) return optimizer def get_optimizer(self): - optimizer = SGDOptimizer(learning_rate=fluid.layers.noam_decay( - d_model=512, warmup_steps=8000)) + optimizer = paddle.optimizer.SGD( + learning_rate=paddle.optimizer.lr.ReduceOnPlateau( + learning_rate=0.5)) return optimizer def test_sgd(self): @@ -381,7 +482,7 @@ def test_lr_decay(self): bd = [2, 4, 6, 8] value = [0.2, 0.4, 0.6, 0.8, 1.0] - scheduler = paddle.optimizer.PiecewiseLR(bd, value) + scheduler = paddle.optimizer.lr.PiecewiseDecay(bd, value) adam = paddle.optimizer.Adam( scheduler, parameters=linear.parameters()) @@ -396,7 +497,7 @@ def test_lr_decay(self): self.assertTrue(np.allclose(lr, ret[i], rtol=1e-06, atol=0.0)) scheduler.step() - def test_lr_decay_natural_exp(self): + def test_lr_scheduler_natural_exp(self): with fluid.dygraph.guard(): 
a = np.random.uniform(-0.1, 0.1, [10, 10]).astype("float32") @@ -407,8 +508,7 @@ def test_lr_decay_natural_exp(self): loss = fluid.layers.reduce_mean(b) base_lr = 1.0 - scheduler = paddle.optimizer.NaturalExpLR(1.0, gamma=0.5) - print("scheduler.last_lr", scheduler.last_lr) + scheduler = paddle.optimizer.lr.NaturalExpDecay(1.0, gamma=0.5) adam = paddle.optimizer.Adam( scheduler, parameters=linear.parameters()) @@ -453,7 +553,7 @@ def test_set_lr(self): with self.assertRaises(RuntimeError): adam = paddle.optimizer.Adam( - paddle.optimizer.NaturalExpLR( + paddle.optimizer.lr.NaturalExpDecay( learning_rate=0.1, gamma=0.5), parameters=linear.parameters()) adam.set_lr(0.01) @@ -695,10 +795,10 @@ def test_parameter_list(self): linear_1 = Linear(10, 10) linear_2 = Linear(10, 10) - sgd = SGDOptimizer( - 1.0, - parameter_list=itertools.chain(linear_1.parameters(), - linear_2.parameters())) + sgd = paddle.optimizer.SGD(1.0, + parameters=itertools.chain( + linear_1.parameters(), + linear_2.parameters())) in_np = np.random.uniform(-0.1, 0.1, [10, 10]).astype("float32") in_data = fluid.dygraph.to_variable(in_np) diff --git a/python/paddle/fluid/tests/unittests/test_imperative_save_load_v2.py b/python/paddle/fluid/tests/unittests/test_imperative_save_load_v2.py index 5b7998198efa8..0335fa547616e 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_save_load_v2.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_save_load_v2.py @@ -239,7 +239,7 @@ def setUp(self): place = fluid.CPUPlace() if not core.is_compiled_with_cuda( ) else fluid.CUDAPlace(0) - scheduler = paddle.optimizer.PiecewiseLR( + scheduler = paddle.optimizer.lr.PiecewiseDecay( boundaries=bd, values=lr_arr) adam = Adam( learning_rate=scheduler, parameters=ptb_model.parameters()) @@ -328,7 +328,7 @@ def testLoadAndSetVarBase(self): place = fluid.CPUPlace() if not core.is_compiled_with_cuda( ) else fluid.CUDAPlace(0) - scheduler = paddle.optimizer.PiecewiseLR( + scheduler = paddle.optimizer.lr.PiecewiseDecay( boundaries=bd, values=lr_arr) adam = Adam( learning_rate=scheduler, parameters=ptb_model.parameters()) @@ -436,7 +436,7 @@ def testSetVariable(self): place = fluid.CPUPlace() if not core.is_compiled_with_cuda( ) else fluid.CUDAPlace(0) - scheduler = paddle.optimizer.PiecewiseLR( + scheduler = paddle.optimizer.lr.PiecewiseDecay( boundaries=bd, values=lr_arr) adam = Adam( learning_rate=scheduler, parameters=ptb_model.parameters()) @@ -544,7 +544,7 @@ def testSetNumpy(self): place = fluid.CPUPlace() if not core.is_compiled_with_cuda( ) else fluid.CUDAPlace(0) - scheduler = paddle.optimizer.PiecewiseLR( + scheduler = paddle.optimizer.lr.PiecewiseDecay( boundaries=bd, values=lr_arr) adam = Adam( learning_rate=scheduler, parameters=ptb_model.parameters()) @@ -829,7 +829,7 @@ def testSetNumpyBeforeTrain(self): place = fluid.CPUPlace() if not core.is_compiled_with_cuda( ) else fluid.CUDAPlace(0) - scheduler = paddle.optimizer.PiecewiseLR( + scheduler = paddle.optimizer.lr.PiecewiseDecay( boundaries=bd, values=lr_arr) adam = Adam( learning_rate=scheduler, diff --git a/python/paddle/fluid/tests/unittests/test_lr_scheduler.py b/python/paddle/fluid/tests/unittests/test_lr_scheduler.py index f655e363e9648..21d1ba7e397db 100644 --- a/python/paddle/fluid/tests/unittests/test_lr_scheduler.py +++ b/python/paddle/fluid/tests/unittests/test_lr_scheduler.py @@ -56,22 +56,22 @@ def is_better(current, best, m, n): return var_list[1] -class TestReduceLROnPlateauDecay(object): +class TestReduceOnPlateauDecay(object): def test_ReduceLR(self): 
# the decay rate must be less than 1.0 with self.assertRaises(ValueError): - paddle.optimizer.ReduceLROnPlateau(learning_rate=1.0, factor=2.0) + paddle.optimizer.lr.ReduceOnPlateau(learning_rate=1.0, factor=2.0) # the mode must be "min" or "max" with self.assertRaises(ValueError): - paddle.optimizer.ReduceLROnPlateau(learning_rate=1.0, mode="test") + paddle.optimizer.lr.ReduceOnPlateau(learning_rate=1.0, mode="test") # the threshold_mode must be "rel" or "abs" with self.assertRaises(ValueError): - paddle.optimizer.ReduceLROnPlateau( + paddle.optimizer.lr.ReduceOnPlateau( learning_rate=1.0, threshold_mode="test") with self.assertRaises(TypeError): - paddle.optimizer.ReduceLROnPlateau(learning_rate="test") + paddle.optimizer.lr.ReduceOnPlateau(learning_rate="test") with self.assertRaises(TypeError): - paddle.optimizer.ReduceLROnPlateau(learning_rate=0.5).step("test") + paddle.optimizer.lr.ReduceOnPlateau(learning_rate=0.5).step("test") places = [paddle.CPUPlace()] if core.is_compiled_with_cuda(): @@ -114,7 +114,7 @@ def _test_static(self, place, kwargs): [1], 1, 'float32', persistable=True) paddle.increment(x) loss = paddle.sin(x) - scheduler = paddle.optimizer.ReduceLROnPlateau(**kwargs) + scheduler = paddle.optimizer.lr.ReduceOnPlateau(**kwargs) adam = paddle.optimizer.Adam(learning_rate=scheduler) adam.minimize(loss) lr_var = adam._global_learning_rate() @@ -158,7 +158,7 @@ def _test_dygraph(self, place, kwargs): var_list = [best, current_lr, cooldown_counter, num_bad_epochs] linear = paddle.nn.Linear(10, 10) - scheduler = paddle.optimizer.ReduceLROnPlateau(**kwargs) + scheduler = paddle.optimizer.lr.ReduceOnPlateau(**kwargs) adam = paddle.optimizer.Adam( learning_rate=scheduler, parameters=linear.parameters()) @@ -180,7 +180,7 @@ def _test_dygraph(self, place, kwargs): loss, var_list) self.assertEqual(current_lr, expected_lr) state_dict = adam.state_dict() - scheduler1 = paddle.optimizer.ReduceLROnPlateau(**kwargs) + scheduler1 = paddle.optimizer.lr.ReduceOnPlateau(**kwargs) adam1 = paddle.optimizer.Adam( learning_rate=scheduler1, parameters=linear.parameters()) adam1.set_state_dict(state_dict) @@ -420,7 +420,7 @@ def _test_dygraph(self, python_func, paddle_api, kwarg, place): adam.clear_grad() current_lr = adam.get_lr() expected_lr = python_func(epoch, **kwarg) - if paddle_api.__name__ != "CosineAnnealingLR": + if paddle_api.__name__ != "CosineAnnealingDecay": self.assertEqual(current_lr, expected_lr) scheduler.step() else: @@ -429,74 +429,75 @@ def _test_dygraph(self, python_func, paddle_api, kwarg, place): def test_scheduler(self): with self.assertRaises(NotImplementedError): - paddle.optimizer.lr_scheduler._LRScheduler().step() + paddle.optimizer.lr.LRScheduler().step() with self.assertRaises(TypeError): - paddle.optimizer.MultiStepLR( + paddle.optimizer.lr.MultiStepDecay( learning_rate="test", milestones=[1, 2, 3]) with self.assertRaises(TypeError): - paddle.optimizer.MultiStepLR(learning_rate=0.5, milestones='test') + paddle.optimizer.lr.MultiStepDecay( + learning_rate=0.5, milestones='test') with self.assertRaises(ValueError): - paddle.optimizer.MultiStepLR( + paddle.optimizer.lr.MultiStepDecay( learning_rate=0.5, milestones=[3, 2, 1]) with self.assertRaises(ValueError): - paddle.optimizer.MultiStepLR( + paddle.optimizer.lr.MultiStepDecay( learning_rate=0.5, milestones=[1, 2, 3], gamma=2) - func_api_kwargs = [(noam_lr, paddle.optimizer.NoamLR, { + func_api_kwargs = [(noam_lr, paddle.optimizer.lr.NoamDecay, { "d_model": 0.01, "warmup_steps": 100, "verbose": False - }), 
(piecewise_lr, paddle.optimizer.PiecewiseLR, { + }), (piecewise_lr, paddle.optimizer.lr.PiecewiseDecay, { "boundaries": [3, 6, 9, 15, 20], "values": [0.1, 0.2, 0.3, 0.4, 0.5, 0.6], "verbose": False - }), (natural_exp_lr, paddle.optimizer.NaturalExpLR, { + }), (natural_exp_lr, paddle.optimizer.lr.NaturalExpDecay, { "learning_rate": 0.5, "gamma": 0.1, "verbose": True - }), (inverse_time_lr, paddle.optimizer.InverseTimeLR, { + }), (inverse_time_lr, paddle.optimizer.lr.InverseTimeDecay, { "learning_rate": 0.5, "gamma": 0.1, "verbose": False - }), (polynomial_lr, paddle.optimizer.PolynomialLR, { + }), (polynomial_lr, paddle.optimizer.lr.PolynomialDecay, { "learning_rate": 0.5, "decay_steps": 20, "end_lr": 0, "power": 1.0, "cycle": False, "verbose": True - }), (polynomial_lr, paddle.optimizer.PolynomialLR, { + }), (polynomial_lr, paddle.optimizer.lr.PolynomialDecay, { "learning_rate": 0.5, "decay_steps": 20, "end_lr": 0, "power": 1.0, "cycle": True, "verbose": False - }), (linear_warmup_lr, paddle.optimizer.LinearLrWarmup, { + }), (linear_warmup_lr, paddle.optimizer.lr.LinearWarmup, { 'learning_rate': 0.5, 'warmup_steps': 20, 'start_lr': 0, 'end_lr': 0.5, "verbose": True - }), (exponential_lr, paddle.optimizer.ExponentialLR, { + }), (exponential_lr, paddle.optimizer.lr.ExponentialDecay, { "learning_rate": 0.5, "gamma": 0.9, "verbose": False - }), (multi_step_lr, paddle.optimizer.MultiStepLR, { + }), (multi_step_lr, paddle.optimizer.lr.MultiStepDecay, { "learning_rate": 0.5, "milestones": [3, 6, 9, 15, 20], "gamma": 0.8, "verbose": True - }), (step_lr, paddle.optimizer.StepLR, { + }), (step_lr, paddle.optimizer.lr.StepDecay, { "learning_rate": 0.5, "step_size": 2, "gamma": 0.8, "verbose": False - }), (lambda_lr, paddle.optimizer.LambdaLR, { + }), (lambda_lr, paddle.optimizer.lr.LambdaDecay, { "learning_rate": 0.5, "lr_lambda": lambda x: 0.95**x, "verbose": True - }), (cosine_annealing_lr, paddle.optimizer.CosineAnnealingLR, { + }), (cosine_annealing_lr, paddle.optimizer.lr.CosineAnnealingDecay, { "learning_rate": 0.5, "T_max": 10, "verbose": False diff --git a/python/paddle/framework/__init__.py b/python/paddle/framework/__init__.py index 7e2f0eb2fb8bb..c3a5e151f32f8 100644 --- a/python/paddle/framework/__init__.py +++ b/python/paddle/framework/__init__.py @@ -24,11 +24,6 @@ 'DataParallel' ] -__all__ += [ - 'NoamDecay', 'PiecewiseDecay', 'NaturalExpDecay', 'ExponentialDecay', - 'InverseTimeDecay', 'PolynomialDecay', 'CosineDecay' -] - from . 
import random from .random import manual_seed from .framework import get_default_dtype @@ -51,11 +46,3 @@ from .io import save from .io import load from ..fluid.dygraph.parallel import DataParallel #DEFINE_ALIAS - -from ..fluid.dygraph.learning_rate_scheduler import NoamDecay #DEFINE_ALIAS -from ..fluid.dygraph.learning_rate_scheduler import PiecewiseDecay #DEFINE_ALIAS -from ..fluid.dygraph.learning_rate_scheduler import NaturalExpDecay #DEFINE_ALIAS -from ..fluid.dygraph.learning_rate_scheduler import ExponentialDecay #DEFINE_ALIAS -from ..fluid.dygraph.learning_rate_scheduler import InverseTimeDecay #DEFINE_ALIAS -from ..fluid.dygraph.learning_rate_scheduler import PolynomialDecay #DEFINE_ALIAS -from ..fluid.dygraph.learning_rate_scheduler import CosineDecay #DEFINE_ALIAS diff --git a/python/paddle/framework/io.py b/python/paddle/framework/io.py index c196c1d689bfe..a5555a8494dfc 100644 --- a/python/paddle/framework/io.py +++ b/python/paddle/framework/io.py @@ -228,7 +228,7 @@ def save(obj, path): layer_state_dict = emb.state_dict() paddle.save(layer_state_dict, "emb.pdparams") - scheduler = paddle.optimizer.lr_scheduler.NoamLR( + scheduler = paddle.optimizer.lr.NoamDecay( d_model=0.01, warmup_steps=100, verbose=True) adam = paddle.optimizer.Adam( learning_rate=scheduler, @@ -320,7 +320,7 @@ def load(path, **configs): layer_state_dict = emb.state_dict() paddle.save(layer_state_dict, "emb.pdparams") - scheduler = paddle.optimizer.lr_scheduler.NoamLR( + scheduler = paddle.optimizer.lr.NoamDecay( d_model=0.01, warmup_steps=100, verbose=True) adam = paddle.optimizer.Adam( learning_rate=scheduler, diff --git a/python/paddle/nn/__init__.py b/python/paddle/nn/__init__.py index 1dddef0cace1d..184dd32776441 100644 --- a/python/paddle/nn/__init__.py +++ b/python/paddle/nn/__init__.py @@ -121,13 +121,6 @@ # from .layer.conv import TreeConv #DEFINE_ALIAS # from .layer.conv import Conv1D #DEFINE_ALIAS from .layer.extension import RowConv #DEFINE_ALIAS -# from .layer.learning_rate import CosineDecay #DEFINE_ALIAS -# from .layer.learning_rate import ExponentialDecay #DEFINE_ALIAS -# from .layer.learning_rate import InverseTimeDecay #DEFINE_ALIAS -# from .layer.learning_rate import NaturalExpDecay #DEFINE_ALIAS -# from .layer.learning_rate import NoamDecay #DEFINE_ALIAS -# from .layer.learning_rate import PiecewiseDecay #DEFINE_ALIAS -# from .layer.learning_rate import PolynomialDecay #DEFINE_ALIAS from .layer.common import Linear # from .layer.loss import NCELoss #DEFINE_ALIAS from .layer.loss import BCEWithLogitsLoss #DEFINE_ALIAS diff --git a/python/paddle/nn/functional/__init__.py b/python/paddle/nn/functional/__init__.py index 30eefb2c3912b..b12fa9a6c936f 100644 --- a/python/paddle/nn/functional/__init__.py +++ b/python/paddle/nn/functional/__init__.py @@ -95,14 +95,6 @@ from .extension import temporal_shift #DEFINE_ALIAS from .extension import warpctc #DEFINE_ALIAS from .extension import diag_embed #DEFINE_ALIAS -from .learning_rate import cosine_decay #DEFINE_ALIAS -from .learning_rate import exponential_decay #DEFINE_ALIAS -from .learning_rate import inverse_time_decay #DEFINE_ALIAS -from .learning_rate import natural_exp_decay #DEFINE_ALIAS -from .learning_rate import noam_decay #DEFINE_ALIAS -from .learning_rate import piecewise_decay #DEFINE_ALIAS -from .learning_rate import polynomial_decay #DEFINE_ALIAS -from .learning_rate import linear_lr_warmup #DEFINE_ALIAS # from .lod import sequence_concat #DEFINE_ALIAS # from .lod import sequence_conv #DEFINE_ALIAS # from .lod import 
sequence_enumerate #DEFINE_ALIAS diff --git a/python/paddle/nn/functional/learning_rate.py b/python/paddle/nn/functional/learning_rate.py deleted file mode 100644 index 83837fc5d46ac..0000000000000 --- a/python/paddle/nn/functional/learning_rate.py +++ /dev/null @@ -1,29 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# TODO: define learning rate decay -from ...fluid.layers import cosine_decay #DEFINE_ALIAS -from ...fluid.layers import exponential_decay #DEFINE_ALIAS -from ...fluid.layers import inverse_time_decay #DEFINE_ALIAS -from ...fluid.layers import natural_exp_decay #DEFINE_ALIAS -from ...fluid.layers import noam_decay #DEFINE_ALIAS -from ...fluid.layers import piecewise_decay #DEFINE_ALIAS -from ...fluid.layers import polynomial_decay #DEFINE_ALIAS -from ...fluid.layers import linear_lr_warmup #DEFINE_ALIAS - -__all__ = [ - 'cosine_decay', 'exponential_decay', 'inverse_time_decay', - 'natural_exp_decay', 'noam_decay', 'piecewise_decay', 'polynomial_decay', - 'linear_lr_warmup' -] diff --git a/python/paddle/nn/layer/__init__.py b/python/paddle/nn/layer/__init__.py index 3a5bcaa21fe5b..d5abaa4de5ef2 100644 --- a/python/paddle/nn/layer/__init__.py +++ b/python/paddle/nn/layer/__init__.py @@ -86,13 +86,6 @@ # from .conv import TreeConv #DEFINE_ALIAS # from .conv import Conv1D #DEFINE_ALIAS from .extension import RowConv #DEFINE_ALIAS -# from .learning_rate import CosineDecay #DEFINE_ALIAS -# from .learning_rate import ExponentialDecay #DEFINE_ALIAS -# from .learning_rate import InverseTimeDecay #DEFINE_ALIAS -# from .learning_rate import NaturalExpDecay #DEFINE_ALIAS -# from .learning_rate import NoamDecay #DEFINE_ALIAS -# from .learning_rate import PiecewiseDecay #DEFINE_ALIAS -# from .learning_rate import PolynomialDecay #DEFINE_ALIAS # from .loss import NCELoss #DEFINE_ALIAS from .loss import BCEWithLogitsLoss #DEFINE_ALIAS from .loss import CrossEntropyLoss #DEFINE_ALIAS diff --git a/python/paddle/nn/layer/learning_rate.py b/python/paddle/nn/layer/learning_rate.py deleted file mode 100644 index e91f755cb0615..0000000000000 --- a/python/paddle/nn/layer/learning_rate.py +++ /dev/null @@ -1,25 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -# TODO: define learning rate decay - -__all__ = [ - # 'CosineDecay', - # 'ExponentialDecay', - # 'InverseTimeDecay', - # 'NaturalExpDecay', - # 'NoamDecay', - # 'PiecewiseDecay', - # 'PolynomialDecay' -] diff --git a/python/paddle/optimizer/__init__.py b/python/paddle/optimizer/__init__.py index 6f485e2e9d62f..d041cb85d5115 100644 --- a/python/paddle/optimizer/__init__.py +++ b/python/paddle/optimizer/__init__.py @@ -16,10 +16,7 @@ 'Adadelta', 'AdadeltaOptimizer', 'Adagrad', 'AdagradOptimizer', 'Adam', 'Adamax', 'AdamW', 'DecayedAdagrad', 'DecayedAdagradOptimizer', 'Dpsgd', 'DpsgdOptimizer', 'Ftrl', 'FtrlOptimizer', 'Momentum', 'MomentumOptimizer', - 'RMSProp', 'SGD', 'SGDOptimizer', 'Optimizer', '_LRScheduler', 'NoamLR', - 'PiecewiseLR', 'NaturalExpLR', 'InverseTimeLR', 'PolynomialLR', - 'LinearLrWarmup', 'ExponentialLR', 'MultiStepLR', 'StepLR', 'LambdaLR', - 'ReduceLROnPlateau', 'CosineAnnealingLR' + 'RMSProp', 'SGD', 'SGDOptimizer', 'Optimizer' ] @@ -36,6 +33,4 @@ from .sgd import SGD from .momentum import Momentum -from . import lr_scheduler -from .lr_scheduler import _LRScheduler, NoamLR, PiecewiseLR, NaturalExpLR, InverseTimeLR, PolynomialLR, \ - LinearLrWarmup, ExponentialLR, MultiStepLR, StepLR, LambdaLR, ReduceLROnPlateau, CosineAnnealingLR +from . import lr diff --git a/python/paddle/optimizer/adam.py b/python/paddle/optimizer/adam.py index 366d8b953e3d4..79caa1583121d 100644 --- a/python/paddle/optimizer/adam.py +++ b/python/paddle/optimizer/adam.py @@ -48,8 +48,8 @@ class Adam(Optimizer): Related paper: `Adam: A Method for Stochastic Optimization `_ Args: - learning_rate (float|_LRScheduler, optional): The learning rate used to update ``Parameter``. - It can be a float value or a _LRScheduler. The default value is 0.001. + learning_rate (float|LRScheduler, optional): The learning rate used to update ``Parameter``. + It can be a float value or a LRScheduler. The default value is 0.001. beta1 (float|Tensor, optional): The exponential decay rate for the 1st moment estimates. It should be a float number or a Tensor with shape [1] and data type as float32. The default value is 0.9. diff --git a/python/paddle/optimizer/adamax.py b/python/paddle/optimizer/adamax.py index cca120efd4507..e5d1962d12625 100644 --- a/python/paddle/optimizer/adamax.py +++ b/python/paddle/optimizer/adamax.py @@ -47,8 +47,8 @@ class Adamax(Optimizer): it is added here for numerical stability to prevent the division by 0 error. Args: - learning_rate (float|_LRScheduler, optional): The learning rate used to update ``Parameter``. - It can be a float value or a _LRScheduler. The default value is 0.001. + learning_rate (float|LRScheduler, optional): The learning rate used to update ``Parameter``. + It can be a float value or a LRScheduler. The default value is 0.001. beta1 (float, optional): The exponential decay rate for the 1st moment estimates. The default value is 0.9. beta2 (float, optional): The exponential decay rate for the 2nd moment estimates. diff --git a/python/paddle/optimizer/adamw.py b/python/paddle/optimizer/adamw.py index 00c197a58b3dd..eaa0509029459 100644 --- a/python/paddle/optimizer/adamw.py +++ b/python/paddle/optimizer/adamw.py @@ -42,8 +42,8 @@ class AdamW(Adam): Args: - learning_rate (float|_LRScheduler, optional): The learning rate used to update ``Parameter``. - It can be a float value or a _LRScheduler. The default value is 0.001. + learning_rate (float|LRScheduler, optional): The learning rate used to update ``Parameter``. + It can be a float value or a LRScheduler. 
The default value is 0.001.
     parameters (list, optional):  List of ``Tensor`` names to update to minimize ``loss``. \
         This parameter is required in dygraph mode. \
         The default value is None in static mode, at this time all parameters will be updated.
diff --git a/python/paddle/optimizer/lr_scheduler.py b/python/paddle/optimizer/lr.py
similarity index 77%
rename from python/paddle/optimizer/lr_scheduler.py
rename to python/paddle/optimizer/lr.py
index 61391704061bd..fc7752c444843 100644
--- a/python/paddle/optimizer/lr_scheduler.py
+++ b/python/paddle/optimizer/lr.py
@@ -18,18 +18,62 @@
 from paddle import Tensor
 
 __all__ = [
-    'NoamLR', 'PiecewiseLR', 'NaturalExpLR', 'InverseTimeLR', 'PolynomialLR',
-    'LinearLrWarmup', 'ExponentialLR', 'MultiStepLR', 'StepLR', 'LambdaLR',
-    'ReduceLROnPlateau', 'CosineAnnealingLR'
+    'LRScheduler', 'NoamDecay', 'PiecewiseDecay', 'NaturalExpDecay',
+    'InverseTimeDecay', 'PolynomialDecay', 'LinearWarmup', 'ExponentialDecay',
+    'MultiStepDecay', 'StepDecay', 'LambdaDecay', 'ReduceOnPlateau',
+    'CosineAnnealingDecay'
 ]
 
 
-class _LRScheduler(object):
-    """LRScheduler Base class.
+class LRScheduler(object):
+    """
+
+    LRScheduler base class. Defines the common interface of a learning rate scheduler.
+
+    Users can import it via ``from paddle.optimizer.lr import LRScheduler`` ,
+
+    then subclass it and provide a custom implementation of ``get_lr()`` .
+
+    Otherwise, a ``NotImplementedError`` exception will be thrown.
+
+    Args:
+        learning_rate (float): The initial learning rate. It is a python float number.
+        last_epoch (int, optional):  The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate.
+        verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` .
+
+    Returns:
+        instance to schedule learning rate.
+
+    Examples:
+        Here is an example of a simple ``StepDecay`` implementation.
+
+        .. code-block:: python
+
+            import paddle
+            from paddle.optimizer.lr import LRScheduler
+
+            class StepDecay(LRScheduler):
+                def __init__(self,
+                            learning_rate,
+                            step_size,
+                            gamma=0.1,
+                            last_epoch=-1,
+                            verbose=False):
+                    if not isinstance(step_size, int):
+                        raise TypeError(
+                            "The type of 'step_size' must be 'int', but received %s." %
+                            type(step_size))
+                    if gamma >= 1.0:
+                        raise ValueError('gamma should be < 1.0.')
+
+                    self.step_size = step_size
+                    self.gamma = gamma
+                    super(StepDecay, self).__init__(learning_rate, last_epoch, verbose)
+
+                def get_lr(self):
+                    i = self.last_epoch // self.step_size
+                    return self.base_lr * (self.gamma**i)
 
-    Define the common interface of an LRScheduler.
-    User can 'form paddle.optimizer.lr_scheduler import _LRScheduler'
-    And inherit from it to have a custom implementation of get_lr().
     """
 
     def __init__(self, learning_rate=0.1, last_epoch=-1, verbose=False):
@@ -47,23 +91,22 @@ def __init__(self, learning_rate=0.1, last_epoch=-1, verbose=False):
 
     def __call__(self):
         """
-        Return last computed learning rate on current epoch.
+        Return the latest computed learning rate on the current epoch.
         """
         return self.last_lr
 
     def step(self, epoch=None):
         """
+
+        ``step`` should be called after ``optimizer.step`` . It will update the learning rate in the optimizer according to the current ``epoch`` .
+        The new learning rate will take effect on the next ``optimizer.step`` .
-        'step' should be called after 'minimize' . It will update the learning rate in optimizer according to 'epoch'.
-        The new learning rate will take effect on next epoch.
 
         Args:
            epoch (int, None): specify current epoch. Default: None.
Auto-increment from last_epoch=-1.
 
         Returns:
             None
-
-        Examples:
-            Please refer to the example of current _LRScheduler.
+
         """
         if epoch is None:
             self.last_epoch += 1
@@ -81,11 +124,12 @@ def step(self, epoch=None):
 
     def state_dict(self):
         """
+
         Returns the state of the scheduler as a :class:`dict`.
 
-        It is a subset of self.__dict__ .
+        It is a subset of ``self.__dict__`` .
         """
-        self._state_keys()
+        self.state_keys()
         state_dict = {}
         for key in self.keys:
             if key not in self.__dict__:
@@ -101,19 +145,26 @@ def state_dict(self):
 
         return state_dict
 
-    # For those subclass who overload _LRScheduler, "last_epoch, last_lr" will be saved by default.
+    # For subclasses that override LRScheduler, "last_epoch, last_lr" will be saved by default.
     # (Note): you can change it for your subclass.
-    def _state_keys(self):
+    def state_keys(self):
         """
-        set the keys in self.__dict__ that are needed to be saved.
+
+        For subclasses that override ``LRScheduler`` (the base class), ``last_epoch`` and ``last_lr`` will be saved by default through ``self.keys = ['last_epoch', 'last_lr']`` .
+
+        ``last_epoch`` is the current epoch number, and ``last_lr`` is the current learning rate.
+
+        If you want to change the default behavior, you should have a custom implementation of ``state_keys()`` to redefine ``self.keys`` .
+
         """
         self.keys = ['last_epoch', 'last_lr']
 
     def set_state_dict(self, state_dict):
         """
+
         Loads the schedulers state.
         """
-        self._state_keys()
+        self.state_keys()
         for key in self.keys:
             if key in state_dict:
                 self.__dict__[key] = state_dict[key]
@@ -130,14 +181,20 @@ def set_state_dict(self, state_dict):
     set_dict = set_state_dict
 
     def get_lr(self):
+        """
+
+        For subclasses that override ``LRScheduler`` (the base class), users should provide a custom implementation of ``get_lr()`` .
+
+        Otherwise, a ``NotImplementedError`` exception will be thrown.
+        """
         # calculate by python float
         raise NotImplementedError
 
 
-class NoamLR(_LRScheduler):
+class NoamDecay(LRScheduler):
     """
 
-    Applies Noam Lear to the initial learning rate.
+    Applies Noam Decay to the initial learning rate.
 
     The algorithm can be described as following.
 
     .. math::
@@ -164,23 +221,21 @@ class NoamLR(_LRScheduler):
     verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` .
 
     Returns:
-        ``NoamLR`` instance to schedule learning rate.
+        ``NoamDecay`` instance to schedule learning rate.
 
     Examples:
         .. 
code-block:: python @@ -164,23 +221,21 @@ class NoamLR(_LRScheduler): import paddle import numpy as np - # train on default dygraph mode - paddle.disable_static() - x = np.random.uniform(-1, 1, [10, 10]).astype("float32") + # train on default dynamic graph mode linear = paddle.nn.Linear(10, 10) - scheduler = paddle.optimizer.lr_scheduler.NoamLR(d_model=0.01, warmup_steps=100, verbose=True) - sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameter_list=linear.parameters()) + scheduler = paddle.optimizer.lr.NoamDecay(d_model=0.01, warmup_steps=100, verbose=True) + sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameters=linear.parameters()) for epoch in range(20): for batch_id in range(2): - x = paddle.to_tensor(x) + x = paddle.uniform([10, 10]) out = linear(x) loss = paddle.reduce_mean(out) loss.backward() - sgd.minimize(loss) - linear.clear_gradients() + sgd.step() + sgd.clear_gradients() scheduler.step() - # train on static mode + # train on static graph mode paddle.enable_static() main_prog = paddle.static.Program() start_prog = paddle.static.Program() @@ -189,7 +244,7 @@ class NoamLR(_LRScheduler): y = paddle.static.data(name='y', shape=[None, 4, 5]) z = paddle.static.nn.fc(x, 100) loss = paddle.mean(z) - scheduler = paddle.optimizer.lr_scheduler.NoamLR(d_model=0.01, warmup_steps=100, verbose=True) + scheduler = paddle.optimizer.lr.NoamDecay(d_model=0.01, warmup_steps=100, verbose=True) sgd = paddle.optimizer.SGD(learning_rate=scheduler) sgd.minimize(loss) @@ -216,7 +271,7 @@ def __init__(self, verbose=False): self.d_model = d_model self.warmup_steps = warmup_steps - super(NoamLR, self).__init__(learning_rate, last_epoch, verbose) + super(NoamDecay, self).__init__(learning_rate, last_epoch, verbose) def get_lr(self): if self.last_epoch == 0: @@ -227,7 +282,7 @@ def get_lr(self): return self.base_lr * (self.d_model**-0.5) * min(a, b) -class PiecewiseLR(_LRScheduler): +class PiecewiseDecay(LRScheduler): """ Piecewise learning rate scheduler. @@ -253,7 +308,7 @@ class PiecewiseLR(_LRScheduler): verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` . Returns: - ``PiecewiseLR`` instance to schedule learning rate. + ``PiecewiseDecay`` instance to schedule learning rate. 
Examples: @@ -262,23 +317,21 @@ class PiecewiseLR(_LRScheduler): import paddle import numpy as np - # train on default dygraph mode - paddle.disable_static() - x = np.random.uniform(-1, 1, [10, 10]).astype("float32") + # train on default dynamic graph mode linear = paddle.nn.Linear(10, 10) - scheduler = paddle.optimizer.lr_scheduler.PiecewiseLR(boundaries=[3, 6, 9], values=[0.1, 0.2, 0.3, 0.4], verbose=True) - sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameter_list=linear.parameters()) + scheduler = paddle.optimizer.lr.PiecewiseDecay(boundaries=[3, 6, 9], values=[0.1, 0.2, 0.3, 0.4], verbose=True) + sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameters=linear.parameters()) for epoch in range(20): for batch_id in range(2): - x = paddle.to_tensor(x) + x = paddle.uniform([10, 10]) out = linear(x) loss = paddle.reduce_mean(out) loss.backward() - sgd.minimize(loss) - linear.clear_gradients() + sgd.step() + sgd.clear_gradients() scheduler.step() - # train on static mode + # train on static graph mode paddle.enable_static() main_prog = paddle.static.Program() start_prog = paddle.static.Program() @@ -287,7 +340,7 @@ class PiecewiseLR(_LRScheduler): y = paddle.static.data(name='y', shape=[None, 4, 5]) z = paddle.static.nn.fc(x, 100) loss = paddle.mean(z) - scheduler = paddle.optimizer.lr_scheduler.PiecewiseLR(boundaries=[3, 6, 9], values=[0.1, 0.2, 0.3, 0.4], verbose=True) + scheduler = paddle.optimizer.lr.PiecewiseDecay(boundaries=[3, 6, 9], values=[0.1, 0.2, 0.3, 0.4], verbose=True) sgd = paddle.optimizer.SGD(learning_rate=scheduler) sgd.minimize(loss) @@ -308,7 +361,7 @@ class PiecewiseLR(_LRScheduler): def __init__(self, boundaries, values, last_epoch=-1, verbose=False): self.boundaries = boundaries self.values = values - super(PiecewiseLR, self).__init__( + super(PiecewiseDecay, self).__init__( last_epoch=last_epoch, verbose=verbose) def get_lr(self): @@ -319,7 +372,7 @@ def get_lr(self): return self.values[len(self.values) - 1] -class NaturalExpLR(_LRScheduler): +class NaturalExpDecay(LRScheduler): """ Applies natural exponential decay to the initial learning rate. @@ -328,7 +381,7 @@ class NaturalExpLR(_LRScheduler): .. math:: - new\_learning\_rate = learning\_rate * e^{- gama * epoch} + new\_learning\_rate = learning\_rate * e^{- gamma * epoch} Args: learning_rate (float): The initial learning rate. It is a python float number. @@ -337,7 +390,7 @@ class NaturalExpLR(_LRScheduler): verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` . Returns: - ``NaturalExpLR`` instance to schedule learning rate. + ``NaturalExpDecay`` instance to schedule learning rate. 
Examples: @@ -346,23 +399,21 @@ class NaturalExpLR(_LRScheduler): import paddle import numpy as np - # train on default dygraph mode - paddle.disable_static() - x = np.random.uniform(-1, 1, [10, 10]).astype("float32") + # train on default dynamic graph mode linear = paddle.nn.Linear(10, 10) - scheduler = paddle.optimizer.lr_scheduler.NaturalExpLR(learning_rate=0.5, gamma=0.1, verbose=True) - sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameter_list=linear.parameters()) + scheduler = paddle.optimizer.lr.NaturalExpDecay(learning_rate=0.5, gamma=0.1, verbose=True) + sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameters=linear.parameters()) for epoch in range(20): for batch_id in range(2): - x = paddle.to_tensor(x) + x = paddle.uniform([10, 10]) out = linear(x) loss = paddle.reduce_mean(out) loss.backward() - sgd.minimize(loss) - linear.clear_gradients() + sgd.step() + sgd.clear_gradients() scheduler.step() - # train on static mode + # train on static graph mode paddle.enable_static() main_prog = paddle.static.Program() start_prog = paddle.static.Program() @@ -371,7 +422,7 @@ class NaturalExpLR(_LRScheduler): y = paddle.static.data(name='y', shape=[None, 4, 5]) z = paddle.static.nn.fc(x, 100) loss = paddle.mean(z) - scheduler = paddle.optimizer.lr_scheduler.NaturalExpLR(learning_rate=0.5, gamma=0.1, verbose=True) + scheduler = paddle.optimizer.lr.NaturalExpDecay(learning_rate=0.5, gamma=0.1, verbose=True) sgd = paddle.optimizer.SGD(learning_rate=scheduler) sgd.minimize(loss) @@ -391,13 +442,14 @@ class NaturalExpLR(_LRScheduler): def __init__(self, learning_rate, gamma, last_epoch=-1, verbose=False): self.gamma = gamma - super(NaturalExpLR, self).__init__(learning_rate, last_epoch, verbose) + super(NaturalExpDecay, self).__init__(learning_rate, last_epoch, + verbose) def get_lr(self): return self.base_lr * math.exp(-1 * self.gamma * self.last_epoch) -class InverseTimeLR(_LRScheduler): +class InverseTimeDecay(LRScheduler): """ Applies inverse time decay to the initial learning rate. @@ -416,7 +468,7 @@ class InverseTimeLR(_LRScheduler): verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` . Returns: - ``InverseTimeLR`` instance to schedule learning rate. + ``InverseTimeDecay`` instance to schedule learning rate. 
Examples: @@ -425,23 +477,21 @@ class InverseTimeLR(_LRScheduler): import paddle import numpy as np - # train on default dygraph mode - paddle.disable_static() - x = np.random.uniform(-1, 1, [10, 10]).astype("float32") + # train on default dynamic graph mode linear = paddle.nn.Linear(10, 10) - scheduler = paddle.optimizer.lr_scheduler.InverseTimeLR(learning_rate=0.5, gamma=0.1, verbose=True) - sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameter_list=linear.parameters()) + scheduler = paddle.optimizer.lr.InverseTimeDecay(learning_rate=0.5, gamma=0.1, verbose=True) + sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameters=linear.parameters()) for epoch in range(20): for batch_id in range(2): - x = paddle.to_tensor(x) + x = paddle.uniform([10, 10]) out = linear(x) loss = paddle.reduce_mean(out) loss.backward() - sgd.minimize(loss) - linear.clear_gradients() + sgd.step() + sgd.clear_gradients() scheduler.step() - # train on static mode + # train on static graph mode paddle.enable_static() main_prog = paddle.static.Program() start_prog = paddle.static.Program() @@ -450,7 +500,7 @@ class InverseTimeLR(_LRScheduler): y = paddle.static.data(name='y', shape=[None, 4, 5]) z = paddle.static.nn.fc(x, 100) loss = paddle.mean(z) - scheduler = paddle.optimizer.lr_scheduler.InverseTimeLR(learning_rate=0.5, gamma=0.1, verbose=True) + scheduler = paddle.optimizer.lr.InverseTimeDecay(learning_rate=0.5, gamma=0.1, verbose=True) sgd = paddle.optimizer.SGD(learning_rate=scheduler) sgd.minimize(loss) @@ -471,13 +521,14 @@ class InverseTimeLR(_LRScheduler): def __init__(self, learning_rate, gamma, last_epoch=-1, verbose=False): self.gamma = gamma - super(InverseTimeLR, self).__init__(learning_rate, last_epoch, verbose) + super(InverseTimeDecay, self).__init__(learning_rate, last_epoch, + verbose) def get_lr(self): return self.base_lr / (1 + self.gamma * self.last_epoch) -class PolynomialLR(_LRScheduler): +class PolynomialDecay(LRScheduler): """ Applies polynomial decay to the initial learning rate. @@ -512,7 +563,7 @@ class PolynomialLR(_LRScheduler): verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` . Returns: - ``PolynomialLR`` instance to schedule learning rate. + ``PolynomialDecay`` instance to schedule learning rate. 
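A simplified sketch of the non-cyclic polynomial decay case, assuming the usual end_lr and power defaults and ignoring the cycle option (the helper name is made up for illustration):

    def polynomial_decay(base_lr, decay_steps, epoch, end_lr=0.0001, power=1.0):
        # non-cyclic case: clamp the epoch at decay_steps, then interpolate
        # (base_lr - end_lr) * (1 - epoch / decay_steps) ** power + end_lr
        epoch = min(epoch, decay_steps)
        return (base_lr - end_lr) * (1 - float(epoch) / decay_steps) ** power + end_lr

    # base_lr=0.5, decay_steps=20: 0.5 at epoch 0, about 0.25 at epoch 10, end_lr from epoch 20 on
    print(round(polynomial_decay(0.5, 20, 10), 4))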
Examples: @@ -521,23 +572,21 @@ class PolynomialLR(_LRScheduler): import paddle import numpy as np - # train on default dygraph mode - paddle.disable_static() - x = np.random.uniform(-1, 1, [10, 10]).astype("float32") + # train on default dynamic graph mode linear = paddle.nn.Linear(10, 10) - scheduler = paddle.optimizer.lr_scheduler.PolynomialLR(learning_rate=0.5, decay_steps=20, verbose=True) - sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameter_list=linear.parameters()) + scheduler = paddle.optimizer.lr.PolynomialDecay(learning_rate=0.5, decay_steps=20, verbose=True) + sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameters=linear.parameters()) for epoch in range(20): for batch_id in range(2): - x = paddle.to_tensor(x) + x = paddle.uniform([10, 10]) out = linear(x) loss = paddle.reduce_mean(out) loss.backward() - sgd.minimize(loss) - linear.clear_gradients() + sgd.step() + sgd.clear_gradients() scheduler.step() - # train on static mode + # train on static graph mode paddle.enable_static() main_prog = paddle.static.Program() start_prog = paddle.static.Program() @@ -546,7 +595,7 @@ class PolynomialLR(_LRScheduler): y = paddle.static.data(name='y', shape=[None, 4, 5]) z = paddle.static.nn.fc(x, 100) loss = paddle.mean(z) - scheduler = paddle.optimizer.lr_scheduler.PolynomialLR(learning_rate=0.5, decay_steps=20, verbose=True) + scheduler = paddle.optimizer.lr.PolynomialDecay(learning_rate=0.5, decay_steps=20, verbose=True) sgd = paddle.optimizer.SGD(learning_rate=scheduler) sgd.minimize(loss) @@ -576,7 +625,8 @@ def __init__(self, self.end_lr = end_lr self.power = power self.cycle = cycle - super(PolynomialLR, self).__init__(learning_rate, last_epoch, verbose) + super(PolynomialDecay, self).__init__(learning_rate, last_epoch, + verbose) def get_lr(self): tmp_epoch_num = self.last_epoch @@ -596,7 +646,7 @@ def get_lr(self): )**self.power) + self.end_lr -class LinearLrWarmup(_LRScheduler): +class LinearWarmup(LRScheduler): """ Linear learning rate warm up strategy. Update the learning rate preliminarily before the normal learning rate scheduler. @@ -604,22 +654,22 @@ class LinearLrWarmup(_LRScheduler): When epoch < warmup_steps, learning rate is updated as: - .. code-block:: text + .. math:: - lr = start_lr + (end_lr - start_lr) * (epoch / warmup_steps) + lr = start\_lr + (end\_lr - start\_lr) * \\frac{epoch}{warmup\_steps} where start_lr is the initial learning rate, and end_lr is the final learning rate; When epoch >= warmup_steps, learning rate is updated as: - .. code-block:: text + .. math:: lr = learning_rate - where lr is float or any subclass of ``_LRScheduler`` . + where ``learning_rate`` is float or any subclass of ``LRScheduler`` . Args: - learning_rate (float|_LRScheduler): The learning rate after warm-up. It is a python float number or any subclass of ``_LRScheduler`` . + learning_rate (float|LRScheduler): The learning rate after warm-up. It is a python float number or any subclass of ``LRScheduler`` . warmup_steps (int): total steps of warm up. start_lr (float): Initial learning rate of warm up. end_lr (float): Final learning rate of warm up. @@ -627,7 +677,7 @@ class LinearLrWarmup(_LRScheduler): verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` . Returns: - ``LinearLrWarmup`` instance to schedule learning rate. + ``LinearWarmup`` instance to schedule learning rate. 
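The warm-up rule above can be sketched in a few lines of plain Python (names are illustrative; the real class delegates to the wrapped scheduler once warm-up is over):

    def linear_warmup(epoch, warmup_steps, start_lr, end_lr, wrapped_lr):
        # ramp linearly from start_lr to end_lr, then hand over to the wrapped rate
        if epoch < warmup_steps:
            return start_lr + (end_lr - start_lr) * float(epoch) / warmup_steps
        return wrapped_lr

    # warmup_steps=20, start_lr=0, end_lr=0.5: epoch 10 -> 0.25, epoch 20 and later -> 0.5
    print(linear_warmup(10, 20, 0.0, 0.5, 0.5))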
Examples: @@ -636,24 +686,22 @@ class LinearLrWarmup(_LRScheduler): import paddle import numpy as np - # train on default dygraph mode - paddle.disable_static() - x = np.random.uniform(-1, 1, [10, 10]).astype("float32") + # train on default dynamic graph mode linear = paddle.nn.Linear(10, 10) - scheduler = paddle.optimizer.LinearLrWarmup( + scheduler = paddle.optimizer.lr.LinearWarmup( learning_rate=0.5, warmup_steps=20, start_lr=0, end_lr=0.5, verbose=True) - sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameter_list=linear.parameters()) + sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameters=linear.parameters()) for epoch in range(20): for batch_id in range(2): - x = paddle.to_tensor(x) + x = paddle.uniform([10, 10]) out = linear(x) loss = paddle.reduce_mean(out) loss.backward() - sgd.minimize(loss) - linear.clear_gradients() + sgd.step() + sgd.clear_gradients() scheduler.step() - # train on static mode + # train on static graph mode paddle.enable_static() main_prog = paddle.static.Program() start_prog = paddle.static.Program() @@ -662,7 +710,7 @@ class LinearLrWarmup(_LRScheduler): y = paddle.static.data(name='y', shape=[None, 4, 5]) z = paddle.static.nn.fc(x, 100) loss = paddle.mean(z) - scheduler = paddle.optimizer.lr_scheduler.LinearLrWarmup( + scheduler = paddle.optimizer.lr.LinearWarmup( learning_rate=0.5, warmup_steps=20, start_lr=0, end_lr=0.5, verbose=True) sgd = paddle.optimizer.SGD(learning_rate=scheduler) sgd.minimize(loss) @@ -678,7 +726,7 @@ class LinearLrWarmup(_LRScheduler): 'y': np.random.randn(3, 4, 5).astype('float32') }, fetch_list=loss.name) - scheduler.step() + scheduler.step() """ def __init__(self, @@ -689,10 +737,10 @@ def __init__(self, last_epoch=-1, verbose=False): type_check = isinstance(learning_rate, float) or isinstance( - learning_rate, int) or isinstance(learning_rate, _LRScheduler) + learning_rate, int) or isinstance(learning_rate, LRScheduler) if not type_check: raise TypeError( - "the type of learning_rate should be [int, float or _LRScheduler], the current type is {}". + "the type of learning_rate should be [int, float or LRScheduler], the current type is {}". format(learning_rate)) self.learning_rate = learning_rate self.warmup_steps = warmup_steps @@ -700,24 +748,24 @@ def __init__(self, self.end_lr = end_lr assert end_lr > start_lr, "end_lr {} must be greater than start_lr {}".format( end_lr, start_lr) - super(LinearLrWarmup, self).__init__(start_lr, last_epoch, verbose) + super(LinearWarmup, self).__init__(start_lr, last_epoch, verbose) def get_lr(self): if self.last_epoch < self.warmup_steps: return (self.end_lr - self.start_lr) * float( self.last_epoch) / float(self.warmup_steps) + self.start_lr else: - if isinstance(self.learning_rate, _LRScheduler): + if isinstance(self.learning_rate, LRScheduler): self.learning_rate.step() return self.learning_rate() return self.learning_rate -class ExponentialLR(_LRScheduler): +class ExponentialDecay(LRScheduler): """ - Update learning rate by 'gamma' each epoch. + Update learning rate by `gamma` each epoch. The algorithm can be described as following. @@ -733,7 +781,7 @@ class ExponentialLR(_LRScheduler): verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` . Returns: - ``ExponentialLR`` instance to schedule learning rate. + ``ExponentialDecay`` instance to schedule learning rate. 
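Exponential decay is simply new_lr = learning_rate * gamma ** epoch; a tiny sketch (illustrative helper name):

    def exponential_decay(base_lr, gamma, epoch):
        # new_lr = base_lr * gamma ** epoch, mirroring get_lr() above
        return base_lr * gamma ** epoch

    # base_lr=0.5, gamma=0.9 gives roughly 0.5, 0.45, 0.405, 0.3645, ...
    print([round(exponential_decay(0.5, 0.9, e), 4) for e in range(4)])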
Examples: @@ -742,23 +790,21 @@ class ExponentialLR(_LRScheduler): import paddle import numpy as np - # train on default dygraph mode - paddle.disable_static() - x = np.random.uniform(-1, 1, [10, 10]).astype("float32") + # train on default dynamic graph mode linear = paddle.nn.Linear(10, 10) - scheduler = paddle.optimizer.lr_scheduler.ExponentialLR(learning_rate=0.5, gamma=0.9, verbose=True) - sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameter_list=linear.parameters()) + scheduler = paddle.optimizer.lr.ExponentialDecay(learning_rate=0.5, gamma=0.9, verbose=True) + sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameters=linear.parameters()) for epoch in range(20): for batch_id in range(2): - x = paddle.to_tensor(x) + x = paddle.uniform([10, 10]) out = linear(x) loss = paddle.reduce_mean(out) loss.backward() - sgd.minimize(loss) - linear.clear_gradients() + sgd.step() + sgd.clear_gradients() scheduler.step() - # train on static mode + # train on static graph mode paddle.enable_static() main_prog = paddle.static.Program() start_prog = paddle.static.Program() @@ -767,7 +813,7 @@ class ExponentialLR(_LRScheduler): y = paddle.static.data(name='y', shape=[None, 4, 5]) z = paddle.static.nn.fc(x, 100) loss = paddle.mean(z) - scheduler = paddle.optimizer.lr_scheduler.ExponentialLR(learning_rate=0.5, gamma=0.9, verbose=True) + scheduler = paddle.optimizer.lr.ExponentialDecay(learning_rate=0.5, gamma=0.9, verbose=True) sgd = paddle.optimizer.SGD(learning_rate=scheduler) sgd.minimize(loss) @@ -787,15 +833,16 @@ class ExponentialLR(_LRScheduler): def __init__(self, learning_rate, gamma, last_epoch=-1, verbose=False): self.gamma = gamma - super(ExponentialLR, self).__init__(learning_rate, last_epoch, verbose) + super(ExponentialDecay, self).__init__(learning_rate, last_epoch, + verbose) def get_lr(self): return self.base_lr * (self.gamma**self.last_epoch) -class MultiStepLR(_LRScheduler): +class MultiStepDecay(LRScheduler): """ - Update the learning rate by ``gama`` once ``epoch`` reaches one of the milestones. + Update the learning rate by ``gamma`` once ``epoch`` reaches one of the milestones. The algorithm can be described as the code below. @@ -821,7 +868,7 @@ class MultiStepLR(_LRScheduler): Returns: - ``MultiStepLR`` instance to schedule learning rate. + ``MultiStepDecay`` instance to schedule learning rate. 
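A plain-Python sketch of the milestone logic in get_lr above (helper name invented; bisect_right counts how many milestones the current epoch has already reached):

    import bisect

    def multi_step_decay(base_lr, milestones, gamma, epoch):
        # exponent = number of milestones that epoch has reached or passed
        return base_lr * gamma ** bisect.bisect_right(milestones, epoch)

    # base_lr=0.5, milestones=[2, 4, 6], gamma=0.8:
    # epochs 0-1 -> 0.5, 2-3 -> 0.4, 4-5 -> 0.32, 6 and later -> 0.256
    print([round(multi_step_decay(0.5, [2, 4, 6], 0.8, e), 3) for e in range(8)])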
Examples: @@ -830,23 +877,21 @@ class MultiStepLR(_LRScheduler): import paddle import numpy as np - # train on default dygraph mode - paddle.disable_static() - x = np.random.uniform(-1, 1, [10, 10]).astype("float32") + # train on default dynamic graph mode linear = paddle.nn.Linear(10, 10) - scheduler = paddle.optimizer.lr_scheduler.MultiStepLR(learning_rate=0.5, milestones=[2, 4, 6], gamma=0.8, verbose=True) - sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameter_list=linear.parameters()) + scheduler = paddle.optimizer.lr.MultiStepDecay(learning_rate=0.5, milestones=[2, 4, 6], gamma=0.8, verbose=True) + sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameters=linear.parameters()) for epoch in range(20): for batch_id in range(2): - x = paddle.to_tensor(x) + x = paddle.uniform([10, 10]) out = linear(x) loss = paddle.reduce_mean(out) loss.backward() - sgd.minimize(loss) - linear.clear_gradients() + sgd.step() + sgd.clear_gradients() scheduler.step() - # train on static mode + # train on static graph mode paddle.enable_static() main_prog = paddle.static.Program() start_prog = paddle.static.Program() @@ -855,7 +900,7 @@ class MultiStepLR(_LRScheduler): y = paddle.static.data(name='y', shape=[None, 4, 5]) z = paddle.static.nn.fc(x, 100) loss = paddle.mean(z) - scheduler = paddle.optimizer.lr_scheduler.MultiStepLR(learning_rate=0.5, milestones=[2, 4, 6], gamma=0.8, verbose=True) + scheduler = paddle.optimizer.lr.MultiStepDecay(learning_rate=0.5, milestones=[2, 4, 6], gamma=0.8, verbose=True) sgd = paddle.optimizer.SGD(learning_rate=scheduler) sgd.minimize(loss) @@ -894,7 +939,7 @@ def __init__(self, self.milestones = milestones self.gamma = gamma - super(MultiStepLR, self).__init__(learning_rate, last_epoch, verbose) + super(MultiStepDecay, self).__init__(learning_rate, last_epoch, verbose) def get_lr(self): for i in range(len(self.milestones)): @@ -903,7 +948,7 @@ def get_lr(self): return self.base_lr * (self.gamma**len(self.milestones)) -class StepLR(_LRScheduler): +class StepDecay(LRScheduler): """ Update the learning rate of ``optimizer`` by ``gamma`` every ``step_size`` number of epoch. @@ -929,7 +974,7 @@ class StepLR(_LRScheduler): verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` . Returns: - ``StepLR`` instance to schedule learning rate. + ``StepDecay`` instance to schedule learning rate. 
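Step decay reduces to new_lr = learning_rate * gamma ** (epoch // step_size); a quick sketch (illustrative helper name):

    def step_decay(base_lr, step_size, gamma, epoch):
        # drop the rate by a factor of gamma every step_size epochs
        return base_lr * gamma ** (epoch // step_size)

    # base_lr=0.5, step_size=5, gamma=0.8: epochs 0-4 -> 0.5, 5-9 -> 0.4, 10-14 -> 0.32
    print([round(step_decay(0.5, 5, 0.8, e), 3) for e in (0, 5, 10)])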
Examples: @@ -939,23 +984,21 @@ class StepLR(_LRScheduler): import paddle import numpy as np - # train on default dygraph mode - paddle.disable_static() - x = np.random.uniform(-1, 1, [10, 10]).astype("float32") + # train on default dynamic graph mode linear = paddle.nn.Linear(10, 10) - scheduler = paddle.optimizer.lr_scheduler.StepLR(learning_rate=0.5, step_size=5, gamma=0.8, verbose=True) - sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameter_list=linear.parameters()) + scheduler = paddle.optimizer.lr.StepDecay(learning_rate=0.5, step_size=5, gamma=0.8, verbose=True) + sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameters=linear.parameters()) for epoch in range(20): for batch_id in range(2): - x = paddle.to_tensor(x) + x = paddle.uniform([10, 10]) out = linear(x) loss = paddle.reduce_mean(out) loss.backward() - sgd.minimize(loss) - linear.clear_gradients() + sgd.step() + sgd.clear_gradients() scheduler.step() - # train on static mode + # train on static graph mode paddle.enable_static() main_prog = paddle.static.Program() start_prog = paddle.static.Program() @@ -964,7 +1007,7 @@ class StepLR(_LRScheduler): y = paddle.static.data(name='y', shape=[None, 4, 5]) z = paddle.static.nn.fc(x, 100) loss = paddle.mean(z) - scheduler = paddle.optimizer.lr_scheduler.StepLR(learning_rate=0.5, step_size=5, gamma=0.8, verbose=True) + scheduler = paddle.optimizer.lr.StepDecay(learning_rate=0.5, step_size=5, gamma=0.8, verbose=True) sgd = paddle.optimizer.SGD(learning_rate=scheduler) sgd.minimize(loss) @@ -997,14 +1040,14 @@ def __init__(self, self.step_size = step_size self.gamma = gamma - super(StepLR, self).__init__(learning_rate, last_epoch, verbose) + super(StepDecay, self).__init__(learning_rate, last_epoch, verbose) def get_lr(self): i = self.last_epoch // self.step_size return self.base_lr * (self.gamma**i) -class LambdaLR(_LRScheduler): +class LambdaDecay(LRScheduler): """ Sets the learning rate of ``optimizer`` by function ``lr_lambda`` . ``lr_lambda`` is funciton which receives ``epoch`` . @@ -1015,9 +1058,9 @@ class LambdaLR(_LRScheduler): learning_rate = 0.5 # init learning_rate lr_lambda = lambda epoch: 0.95 ** epoch - learning_rate = 0.5 # epoch 0 - learning_rate = 0.475 # epoch 1 - learning_rate = 0.45125 # epoch 2 + learning_rate = 0.5 # epoch 0, 0.5*0.95**0 + learning_rate = 0.475 # epoch 1, 0.5*0.95**1 + learning_rate = 0.45125 # epoch 2, 0.5*0.95**2 Args: learning_rate (float): The initial learning rate. It is a python float number. @@ -1026,7 +1069,7 @@ class LambdaLR(_LRScheduler): verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` . Returns: - ``LambdaLR`` instance to schedule learning rate. + ``LambdaDecay`` instance to schedule learning rate. 
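LambdaDecay just evaluates the user-supplied lambda at the current epoch; a sketch reproducing the 0.5 / 0.475 / 0.45125 sequence from the docstring (helper name invented):

    def lambda_decay(base_lr, lr_lambda, epoch):
        # new_lr = base_lr * lr_lambda(epoch), mirroring get_lr() above
        return base_lr * lr_lambda(epoch)

    # lr_lambda = 0.95 ** epoch -> roughly 0.5, 0.475, 0.45125
    print([round(lambda_decay(0.5, lambda e: 0.95 ** e, e), 5) for e in range(3)])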
Examples: @@ -1035,23 +1078,21 @@ class LambdaLR(_LRScheduler): import paddle import numpy as np - # train on default dygraph mode - paddle.disable_static() - x = np.random.uniform(-1, 1, [10, 10]).astype("float32") + # train on default dynamic graph mode linear = paddle.nn.Linear(10, 10) - scheduler = paddle.optimizer.lr_scheduler.LambdaLR(learning_rate=0.5, lr_lambda=lambda x:0.95**x, verbose=True) - sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameter_list=linear.parameters()) + scheduler = paddle.optimizer.lr.LambdaDecay(learning_rate=0.5, lr_lambda=lambda x:0.95**x, verbose=True) + sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameters=linear.parameters()) for epoch in range(20): for batch_id in range(2): - x = paddle.to_tensor(x) + x = paddle.uniform([10, 10]) out = linear(x) loss = paddle.reduce_mean(out) loss.backward() - sgd.minimize(loss) - linear.clear_gradients() + sgd.step() + sgd.clear_gradients() scheduler.step() - # train on static mode + # train on static graph mode paddle.enable_static() main_prog = paddle.static.Program() start_prog = paddle.static.Program() @@ -1060,7 +1101,7 @@ class LambdaLR(_LRScheduler): y = paddle.static.data(name='y', shape=[None, 4, 5]) z = paddle.static.nn.fc(x, 100) loss = paddle.mean(z) - scheduler = paddle.optimizer.lr_scheduler.LambdaLR(learning_rate=0.5, lr_lambda=lambda x:0.95**x, verbose=True) + scheduler = paddle.optimizer.lr.LambdaDecay(learning_rate=0.5, lr_lambda=lambda x:0.95**x, verbose=True) sgd = paddle.optimizer.SGD(learning_rate=scheduler) sgd.minimize(loss) @@ -1082,17 +1123,17 @@ class LambdaLR(_LRScheduler): def __init__(self, learning_rate, lr_lambda, last_epoch=-1, verbose=False): if not callable(lr_lambda): raise TypeError( - "The type of 'lr_lambda' in 'LambdaLR' must be 'function', but received %s." + "The type of 'lr_lambda' in 'LambdaDecay' must be 'function', but received %s." % type(lr_lambda)) self.lr_lambda = lr_lambda - super(LambdaLR, self).__init__(learning_rate, last_epoch, verbose) + super(LambdaDecay, self).__init__(learning_rate, last_epoch, verbose) def get_lr(self): return self.base_lr * self.lr_lambda(self.last_epoch) -class ReduceLROnPlateau(_LRScheduler): +class ReduceOnPlateau(LRScheduler): """ Reduce learning rate when ``metrics`` has stopped descending. Models often benefit from reducing the learning rate by 2 to 10 times once model performance has no longer improvement. @@ -1126,7 +1167,7 @@ class ReduceLROnPlateau(_LRScheduler): Returns: - ``ReduceLROnPlateau`` instance to schedule learning rate. + ``ReduceOnPlateau`` instance to schedule learning rate. 
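A deliberately simplified sketch of the plateau logic for 'min' mode, keeping only the best/patience/factor bookkeeping and dropping the cooldown, threshold and min_lr handling; the class and attribute names below are invented for illustration:

    class PlateauSketch(object):
        """Simplified reduce-on-plateau ('min' mode only, no cooldown/threshold/min_lr)."""

        def __init__(self, lr, factor=0.5, patience=5):
            self.lr, self.factor, self.patience = lr, factor, patience
            self.best = None
            self.bad_epochs = 0

        def step(self, metric):
            if self.best is None or metric < self.best:
                # metric improved: remember it and reset the counter
                self.best = metric
                self.bad_epochs = 0
            else:
                self.bad_epochs += 1
                if self.bad_epochs > self.patience:
                    # no improvement for too long: shrink the rate
                    self.lr *= self.factor
                    self.bad_epochs = 0
            return self.lr

    sched = PlateauSketch(lr=1.0, factor=0.5, patience=2)
    for loss in [1.0, 0.9, 0.9, 0.9, 0.9, 0.8]:
        print(sched.step(loss))   # stays at 1.0 until the loss plateaus, then drops to 0.5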
Examples: @@ -1135,23 +1176,21 @@ class ReduceLROnPlateau(_LRScheduler): import paddle import numpy as np - # train on default dygraph mode - paddle.disable_static() - x = np.random.uniform(-1, 1, [10, 10]).astype("float32") + # train on default dynamic graph mode linear = paddle.nn.Linear(10, 10) - scheduler = paddle.optimizer.lr_scheduler.ReduceLROnPlateau(learning_rate=1.0, factor=0.5, patience=5, verbose=True) - sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameter_list=linear.parameters()) + scheduler = paddle.optimizer.lr.ReduceOnPlateau(learning_rate=1.0, factor=0.5, patience=5, verbose=True) + sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameters=linear.parameters()) for epoch in range(20): for batch_id in range(2): - x = paddle.to_tensor(x) + x = paddle.uniform([10, 10]) out = linear(x) loss = paddle.reduce_mean(out) loss.backward() - sgd.minimize(loss) - linear.clear_gradients() + sgd.step() + sgd.clear_gradients() scheduler.step(loss) - # train on static mode + # train on static graph mode paddle.enable_static() main_prog = paddle.static.Program() start_prog = paddle.static.Program() @@ -1160,7 +1199,7 @@ class ReduceLROnPlateau(_LRScheduler): y = paddle.static.data(name='y', shape=[None, 4, 5]) z = paddle.static.nn.fc(x, 100) loss = paddle.mean(z) - scheduler = paddle.optimizer.lr_scheduler.ReduceLROnPlateau(learning_rate=1.0, factor=0.5, patience=5, verbose=True) + scheduler = paddle.optimizer.lr.ReduceOnPlateau(learning_rate=1.0, factor=0.5, patience=5, verbose=True) sgd = paddle.optimizer.SGD(learning_rate=scheduler) sgd.minimize(loss) @@ -1207,7 +1246,7 @@ def __init__(self, self.threshold_mode = threshold_mode if not isinstance(learning_rate, (float, int)): raise TypeError( - "The type of 'learning_rate' in 'ReduceLROnPlateau' must be 'float', but received %s." + "The type of 'learning_rate' in 'ReduceOnPlateau' must be 'float', but received %s." % type(learning_rate)) self.verbose = verbose @@ -1230,7 +1269,7 @@ def __init__(self, self._var_name = None # "cooldown_counter / best / num_bad_epochs / last_epoch / last_lr" will be stored. - def _state_keys(self): + def state_keys(self): self.keys = [ 'cooldown_counter', 'best', 'num_bad_epochs', 'last_epoch', 'last_lr' @@ -1238,7 +1277,7 @@ def _state_keys(self): def step(self, metrics, epoch=None): """ - step should be called after 'minimize' . It will update the learning rate in optimizer according to ``metrics`` . + step should be called after `optimizer.step()` . It will update the learning rate in optimizer according to ``metrics`` . The new learning rate will take effect on next epoch. Args: @@ -1251,14 +1290,14 @@ def step(self, metrics, epoch=None): None Examples: - Please refer to the example of current _LRScheduler. + Please refer to the example of current LRScheduler. """ if epoch is None: self.last_epoch = self.last_epoch + 1 else: self.last_epoch = epoch - # loss must be 1-D Tensor with shape [1] + # loss must be float, numpy.ndarray or 1-D Tensor with shape [1] if isinstance(metrics, (Tensor, numpy.ndarray)): assert len(metrics.shape) == 1 and metrics.shape[0] == 1, "the metrics.shape " \ "should be (1L,), but the current metrics.shape is {}. 
Maybe that " \ @@ -1290,7 +1329,6 @@ def step(self, metrics, epoch=None): self.last_lr)) def _is_better(self, current, best): - print("mode", self.mode, 'threshold_mode', self.threshold_mode) if self.mode == 'min' and self.threshold_mode == 'rel': return current < best - best * self.threshold @@ -1304,31 +1342,23 @@ def _is_better(self, current, best): return current > best + self.threshold -class CosineAnnealingLR(_LRScheduler): +class CosineAnnealingDecay(LRScheduler): """ Set the learning rate using a cosine annealing schedule, where :math:`\eta_{max}` is set to the initial learning_rate. :math:`T_{cur}` is the number of epochs since the last restart in - SGDR: - - \begin{aligned} - \eta_t & = \eta_{min} + \frac{1}{2}(\eta_{max} - \eta_{min})\left(1 - + \cos\left(\frac{T_{cur}}{T_{max}}\pi\right)\right), - & T_{cur} \neq (2k+1)T_{max}; \\ - \eta_{t+1} & = \eta_{t} + \frac{1}{2}(\eta_{max} - \eta_{min}) - \left(1 - \cos\left(\frac{1}{T_{max}}\pi\right)\right), - & T_{cur} = (2k+1)T_{max}. - \end{aligned} + SGDR. The algorithm can be described as following. .. math:: - \begin{aligned} - \eta_t & = \eta_{min} + \frac{1}{2}(\eta_{max} - \eta_{min})\left(1 - + \cos\left(\frac{T_{cur}}{T_{max}}\pi\right)\right), - & T_{cur} \neq (2k+1)T_{max}; \\ - \eta_{t+1} & = \eta_{t} + \frac{1}{2}(\eta_{max} - \eta_{min}) - \left(1 - \cos\left(\frac{1}{T_{max}}\pi\right)\right), + + \\begin{aligned} + \eta_t & = \eta_{min} + \\frac{1}{2}(\eta_{max} - \eta_{min})\left(1 + + \cos\left(\\frac{T_{cur}}{T_{max}}\pi\\right)\\right), + & T_{cur} \\neq (2k+1)T_{max}; \\ + \eta_{t+1} & = \eta_{t} + \\frac{1}{2}(\eta_{max} - \eta_{min}) + \left(1 - \cos\left(\\frac{1}{T_{max}}\pi\\right)\\right), & T_{cur} = (2k+1)T_{max}. \end{aligned} @@ -1343,7 +1373,7 @@ class CosineAnnealingLR(_LRScheduler): verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` . Returns: - ``CosineAnnealingLR`` instance to schedule learning rate. + ``CosineAnnealingDecay`` instance to schedule learning rate. 
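For intuition, when step() is called once per epoch the cosine annealing rate works out to the closed form eta_min + (base_lr - eta_min) * (1 + cos(pi * epoch / T_max)) / 2; a sketch of that closed form (the class itself applies the recursive SGDR update, and the helper name is invented):

    import math

    def cosine_annealing(base_lr, T_max, epoch, eta_min=0.0):
        # closed form: eta_min + (base_lr - eta_min) * (1 + cos(pi * epoch / T_max)) / 2
        return eta_min + (base_lr - eta_min) * (1 + math.cos(math.pi * epoch / T_max)) / 2

    # base_lr=0.5, T_max=10: 0.5 at epoch 0, 0.25 at epoch 5, 0.0 at epoch 10
    print([round(cosine_annealing(0.5, 10, e), 4) for e in (0, 5, 10)])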
Examples: @@ -1352,23 +1382,21 @@ class CosineAnnealingLR(_LRScheduler): import paddle import numpy as np - # train on default dygraph mode - paddle.disable_static() - x = np.random.uniform(-1, 1, [10, 10]).astype("float32") + # train on default dynamic graph mode linear = paddle.nn.Linear(10, 10) - scheduler = paddle.optimizer.lr_scheduler.CosineAnnealingLR(learning_rate=0.5, T_max=10, verbose=True) - sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameter_list=linear.parameters()) + scheduler = paddle.optimizer.lr.CosineAnnealingDecay(learning_rate=0.5, T_max=10, verbose=True) + sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameters=linear.parameters()) for epoch in range(20): for batch_id in range(2): - x = paddle.to_tensor(x) + x = paddle.uniform([10, 10]) out = linear(x) loss = paddle.reduce_mean(out) loss.backward() - sgd.minimize(loss) - linear.clear_gradients() + sgd.step() + sgd.clear_gradients() scheduler.step() - # train on static mode + # train on static graph mode paddle.enable_static() main_prog = paddle.static.Program() start_prog = paddle.static.Program() @@ -1377,7 +1405,7 @@ class CosineAnnealingLR(_LRScheduler): y = paddle.static.data(name='y', shape=[None, 4, 5]) z = paddle.static.nn.fc(x, 100) loss = paddle.mean(z) - scheduler = paddle.optimizer.lr_scheduler.CosineAnnealingLR(learning_rate=0.5, T_max=10, verbose=True) + scheduler = paddle.optimizer.lr.CosineAnnealingDecay(learning_rate=0.5, T_max=10, verbose=True) sgd = paddle.optimizer.SGD(learning_rate=scheduler) sgd.minimize(loss) @@ -1403,16 +1431,16 @@ def __init__(self, verbose=False): if not isinstance(T_max, int): raise TypeError( - "The type of 'T_max' in 'CosineAnnealingLR' must be 'int', but received %s." + "The type of 'T_max' in 'CosineAnnealingDecay' must be 'int', but received %s." % type(T_max)) if not isinstance(eta_min, (float, int)): raise TypeError( - "The type of 'eta_min' in 'CosineAnnealingLR' must be 'float, int', but received %s." + "The type of 'eta_min' in 'CosineAnnealingDecay' must be 'float, int', but received %s." % type(eta_min)) self.T_max = T_max self.eta_min = float(eta_min) - super(CosineAnnealingLR, self).__init__(learning_rate, last_epoch, - verbose) + super(CosineAnnealingDecay, self).__init__(learning_rate, last_epoch, + verbose) def get_lr(self): if self.last_epoch == 0: diff --git a/python/paddle/optimizer/optimizer.py b/python/paddle/optimizer/optimizer.py index 15519cdd300e9..0f9795570619c 100644 --- a/python/paddle/optimizer/optimizer.py +++ b/python/paddle/optimizer/optimizer.py @@ -41,7 +41,7 @@ from functools import reduce from ..fluid.wrapped_decorator import signature_safe_contextmanager from .. import compat as cpt -from .lr_scheduler import _LRScheduler +from .lr import LRScheduler __all__ = ['Optimizer'] @@ -54,8 +54,8 @@ class Optimizer(object): but need to use one of it's implementation. Args: - learning_rate (float|_LRScheduler): The learning rate used to update ``Parameter``. - It can be a float value or any subclass of ``_LRScheduler`` . + learning_rate (float|LRScheduler): The learning rate used to update ``Parameter``. + It can be a float value or any subclass of ``LRScheduler`` . parameters (list, optional): List of ``Tensor`` names to update to minimize ``loss``. \ This parameter is required in dygraph mode. \ The default value is None in static mode, at this time all parameters will be updated. 
@@ -82,12 +82,8 @@ class Optimizer(object): #Take the subclass adam as an example import paddle - import numpy as np - - paddle.disable_static() - inp = np.random.uniform(-0.1, 0.1, [10, 10]).astype("float32") linear = paddle.nn.Linear(10, 10) - inp = paddle.to_tensor(inp) + inp = paddle.uniform(shape=[10, 10], min=-0.1, max=0.1) out = linear(inp) loss = paddle.mean(out) adam = paddle.optimizer.Adam(learning_rate=0.1, @@ -121,9 +117,9 @@ def __init__(self, "The weight_decay[%s] in Optimizer will not take effect, and it will only be applied to other Parameters!" % weight_decay.__str__()) break - if not isinstance(learning_rate, (float, _LRScheduler)): + if not isinstance(learning_rate, (float, LRScheduler)): raise TypeError( - "learning rate should be float or _LRScheduler, got %s here" % + "learning rate should be float or LRScheduler, got %s here" % type(learning_rate)) if grad_clip is not None: if not isinstance(grad_clip, GradientClipBase): @@ -156,7 +152,7 @@ def __init__(self, @framework.dygraph_only def state_dict(self): ''' - Get state dict information from optimizer. It contain all the tensor used by optimizer. For Adam optimizer, contains beta1, beta2, momentum etc. If _LRScheduler have been used, global_step will be include in state dict. + Get state dict information from optimizer. It contain all the tensor used by optimizer. For Adam optimizer, contains beta1, beta2, momentum etc. If LRScheduler have been used, global_step will be include in state dict. If the optimizer never be called(minimize function), the state_dict is empty. Args: @@ -169,7 +165,6 @@ def state_dict(self): .. code-block:: python import paddle - paddle.disable_static() emb = paddle.nn.Embedding(10, 10) adam = paddle.optimizer.Adam(0.001, parameters=emb.parameters()) @@ -181,14 +176,14 @@ def state_dict(self): for para_name, var_tmp in v.items(): state_dict[var_tmp.name] = var_tmp # global step if use lr decay - if isinstance(self._learning_rate, _LRScheduler): + if isinstance(self._learning_rate, LRScheduler): state_dict["LR_Scheduler"] = self._learning_rate.state_dict() return state_dict @framework.dygraph_only def set_state_dict(self, state_dict): ''' - Load optimizer state dict. For Adam optimizer, contains beta1, beta2, momentum etc. If _LRScheduler have been used, global_step will be changed. + Load optimizer state dict. For Adam optimizer, contains beta1, beta2, momentum etc. If LRScheduler have been used, global_step will be changed. Args: state_dict(dict) : Dict contains all the Tensor needed by optimizer @@ -199,26 +194,28 @@ def set_state_dict(self, state_dict): .. 
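The distinction described above can be sketched in isolation: a plain float is returned as-is, while a scheduler object is called for its current value (LRScheduler instances are callable, as the LinearWarmup code above relies on). The helper below is purely illustrative:

    def current_lr(learning_rate):
        # a plain float is a constant rate; a scheduler is called for its latest value
        if isinstance(learning_rate, float):
            return learning_rate
        return learning_rate()

    print(current_lr(0.001))                    # constant float case
    print(current_lr(lambda: 0.5 * 0.95 ** 3))  # any callable stands in for a scheduler here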
code-block:: python import paddle - paddle.disable_static() - emb = paddle.nn.Embedding(10, 10) - state_dict = emb.state_dict() - paddle.framework.save(state_dict, "paddle_dy") + emb = paddle.nn.Embedding(10, 10) - adam = paddle.optimizer.Adam(learning_rate=paddle.optimizer.NoamLR( 100, 10000), - parameters=emb.parameters()) - state_dict = adam.state_dict() - paddle.framework.save(state_dict, "paddle_dy") + layer_state_dict = emb.state_dict() + paddle.save(layer_state_dict, "emb.pdparams") - para_state_dict, opti_state_dict = paddle.framework.load( "paddle_dy") + scheduler = paddle.optimizer.lr.NoamDecay( + d_model=0.01, warmup_steps=100, verbose=True) + adam = paddle.optimizer.Adam( + learning_rate=scheduler, + parameters=emb.parameters()) + opt_state_dict = adam.state_dict() + paddle.save(opt_state_dict, "adam.pdopt") + opti_state_dict = paddle.load("adam.pdopt") adam.set_state_dict(opti_state_dict) ''' - if isinstance(self._learning_rate, _LRScheduler): + if isinstance(self._learning_rate, LRScheduler): self._learning_rate.set_dict(state_dict["LR_Scheduler"]) - if isinstance(self._learning_rate, _LRScheduler): + if isinstance(self._learning_rate, LRScheduler): self._learning_rate.set_state_dict(state_dict["LR_Scheduler"]) self._accumulators_holder = state_dict @@ -256,7 +253,7 @@ def get_opti_var_name_list(self): return self._opti_name_list def _create_global_learning_rate(self): - if isinstance(self._learning_rate, _LRScheduler): + if isinstance(self._learning_rate, LRScheduler): lr_var = self._global_learning_rate() # only create global lr_var once if not isinstance(lr_var, framework.Variable): @@ -299,7 +296,7 @@ def set_lr(self, value): """ :api_attr: imperative - Set the value of the learning rate manually in the optimizer. If the optimizer use _LRScheduler, + Set the value of the learning rate manually in the optimizer. If the optimizer use LRScheduler, this API cannot be invoked, because it will lead to conflict. Args: @@ -312,7 +309,6 @@ def set_lr(self, value): .. code-block:: python import paddle - paddle.disable_static() linear = paddle.nn.Linear(10, 10) adam = paddle.optimizer.Adam(0.1, parameters=linear.parameters()) @@ -335,9 +331,9 @@ def set_lr(self, value): raise TypeError( "The type of 'value' in optimizer.set_lr must be float, but received %s." % (type(value))) - if isinstance(self._learning_rate, _LRScheduler): + if isinstance(self._learning_rate, LRScheduler): raise RuntimeError( - "optimizer's learning rate can't be _LRScheduler when invoke this API, because this will lead to conflict." + "optimizer's learning rate can't be LRScheduler when invoke this API, because this will lead to conflict." ) self._learning_rate = float(value) current_lr = self._global_learning_rate() @@ -358,7 +354,7 @@ def get_lr(self): """ :api_attr: imperative - Get current step learning rate. The return value is all the same When _LRScheduler is not used, + Get current step learning rate. The return value is all the same When LRScheduler is not used, otherwise return the current step learning rate. 
@@ -370,15 +366,13 @@ def get_lr(self): import numpy as np import paddle - # example1: _LRScheduler is not used, return value is all the same - paddle.disable_static() + # example1: LRScheduler is not used, return value is all the same emb = paddle.nn.Embedding(10, 10) adam = paddle.optimizer.Adam(0.001, parameters = emb.parameters()) lr = adam.get_lr() print(lr) # 0.001 - # example2: PiecewiseLR is used, return the step learning rate - paddle.disable_static() + # example2: PiecewiseDecay is used, return the scheduled learning rate inp = np.random.uniform(-0.1, 0.1, [10, 10]).astype("float32") linear = paddle.nn.Linear(10, 10) inp = paddle.to_tensor(inp) @@ -387,7 +381,7 @@ def get_lr(self): bd = [2, 4, 6, 8] value = [0.2, 0.4, 0.6, 0.8, 1.0] - scheduler = paddle.optimizer.PiecewiseLR(bd, value, 0) + scheduler = paddle.optimizer.lr.PiecewiseDecay(bd, value, 0) adam = paddle.optimizer.Adam(scheduler, parameters=linear.parameters()) @@ -656,7 +650,6 @@ def backward(self, import paddle import numpy as np - paddle.disable_static() value = np.arange(26).reshape(2, 13).astype("float32") a = paddle.to_tensor(value) linear = paddle.nn.Linear(13, 5) @@ -727,7 +720,6 @@ def apply_gradients(self, params_grads): import paddle import numpy as np - paddle.disable_static() inp = np.random.uniform(-0.1, 0.1, [10, 10]).astype("float32") linear = paddle.nn.Linear(10, 10) inp = paddle.to_tensor(inp) @@ -805,7 +797,7 @@ def clear_grad(self): import numpy as np import paddle - paddle.disable_static() + value = np.arange(26).reshape(2, 13).astype("float32") a = paddle.to_tensor(value) linear = paddle.nn.Linear(13, 5) @@ -854,13 +846,9 @@ def minimize(self, .. code-block:: python import paddle - import numpy as np - - paddle.disable_static() - inp = np.random.uniform(-0.1, 0.1, [10, 10]).astype("float32") linear = paddle.nn.Linear(10, 10) - inp = paddle.to_tensor(inp) - out = linear(inp) + input = paddle.uniform(shape=[10, 10], min=-0.1, max=0.1) + out = linear(input) loss = paddle.mean(out) beta1 = paddle.to_tensor([0.9], dtype="float32") @@ -903,7 +891,7 @@ def step(self): import paddle import numpy as np - paddle.disable_static() + value = np.arange(26).reshape(2, 13).astype("float32") a = paddle.to_tensor(value) linear = paddle.nn.Linear(13, 5) diff --git a/python/paddle/optimizer/rmsprop.py b/python/paddle/optimizer/rmsprop.py index 5e17ca34ff218..a664b01595632 100644 --- a/python/paddle/optimizer/rmsprop.py +++ b/python/paddle/optimizer/rmsprop.py @@ -69,8 +69,8 @@ class RMSProp(Optimizer): Parameters: - learning_rate (float|_LRScheduler): The learning rate used to update ``Parameter``. - It can be a float value or a _LRScheduler. + learning_rate (float|LRScheduler): The learning rate used to update ``Parameter``. + It can be a float value or a LRScheduler. rho(float): rho is :math: `\\rho` in equation, default is 0.95. epsilon(float): :math: `\\epsilon` in equation is smoothing term to avoid division by zero, default is 1e-6.
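The rho and epsilon parameters documented above drive RMSProp's running average of squared gradients. A sketch of the basic (non-centered, momentum-free) update for a single scalar parameter; the function and variable names are invented, and the centered/momentum variants are ignored:

    def rmsprop_update(param, grad, mean_square, lr=0.01, rho=0.95, epsilon=1e-6):
        # accumulate a running average of squared gradients, then scale the step by it
        mean_square = rho * mean_square + (1.0 - rho) * grad * grad
        param = param - lr * grad / ((mean_square + epsilon) ** 0.5)
        return param, mean_square

    p, ms = 1.0, 0.0
    for g in [0.5, 0.4, 0.3]:
        p, ms = rmsprop_update(p, g, ms)
        print(round(p, 4), round(ms, 6))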