create learning rate variable for every parameter #5524

Merged · 3 commits · Nov 13, 2017
98 changes: 21 additions & 77 deletions python/paddle/v2/framework/optimizer.py
@@ -35,15 +35,21 @@ def _append_optimize_op(self, block, param_and_grad):
        """
        raise NotImplementedError()

-    def _initialize_tensors(self, block):
-        """Create all necessary tensors, that will be shared for all parameter updates.
-
-        Tensors like learning rate should be initialized here.
-
-        Args:
-            block: the block in which the loss variable is present
-        """
-        pass
+    def _create_param_lr(self, param_and_grad):
+        # create learning rate variable for every parameter
+        param = param_and_grad[0]
+        param_lr = param.optimize_attr['learning_rate']
+        param_lr_shape = [1]
+        param_lr_var = self.helper.create_global_variable(
+            name=unique_name("learning_rate"),
+            dtype='float32',
+            shape=param_lr_shape,
+            lod_level=1,
+            persistable=True)
+        param_lr = param_lr * self._learning_rate
+        self.helper.set_variable_initializer(
+            var=param_lr_var, initializer=ConstantInitializer(param_lr))
+        return param_lr_var

Review thread on the added line param_lr = param_lr * self._learning_rate:

Contributor: Are we sure that this should be multiplied? Is there a reference for this implementation, or is it taken from the old Paddle code?

Member (author): @lcy-seso @reyoung Could you please confirm this? Is each parameter's learning rate set relative to the global one, i.e. calculated by multiplying it with the global learning rate?

Contributor (@lcy-seso, Nov 13, 2017): This is how the old PaddlePaddle works: the learning rate of a parameter is determined by multiplying the global learning rate by the layer-wise learning rate (if one is set).

Contributor: @lcy-seso Do you know how this works together with the learning rate scheduler? Does the scheduler apply only to the global learning rate?

Contributor (@lcy-seso, Nov 13, 2017): From the code, it seems that the learning rate schedule only works on the global learning rate; in old PaddlePaddle the scheduler just calculates a scale factor that decreases the learning rate.
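A worked example of the rule discussed in the thread may help: a parameter's effective learning rate is its layer-wise rate multiplied by the global rate, so a schedule that rescales only the global rate scales every parameter's effective rate by the same factor. The sketch below is plain Python, not the Paddle API; the layer-wise settings and the decay schedule are illustrative assumptions.

def effective_lr(global_lr, layer_lr=1.0):
    # mirrors param_lr * self._learning_rate in _create_param_lr above
    return layer_lr * global_lr

def scheduled_global_lr(base_lr, step, decay=0.9, decay_every=100):
    # hypothetical exponential-decay schedule acting only on the global rate
    return base_lr * (decay ** (step // decay_every))

base_lr = 0.01
layer_lrs = {"fc_w": 1.0, "embedding": 0.1}  # illustrative layer-wise settings

for step in (0, 100, 200):
    g = scheduled_global_lr(base_lr, step)
    rates = {name: effective_lr(g, lr) for name, lr in layer_lrs.items()}
    # every parameter is scaled by the same schedule factor, so the 10x
    # ratio between "fc_w" and "embedding" is preserved at every step
    print(step, rates)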

    def _create_accumulators(self, block, parameters):
        """Create all accumulators needed by the parameters
@@ -161,8 +167,6 @@ def create_optimization_pass(self,
                startup_program=startup_program)
        self._create_accumulators(loss.block,
                                  [p[0] for p in parameters_and_grads])
-        # Create any necessary tensors
-        self._initialize_tensors(loss.block)

        optimize_ops = []
        for param_and_grad in parameters_and_grads:
@@ -214,27 +218,16 @@ def __init__(self, learning_rate, global_step=None):
        self.type = "sgd"
        self._learning_rate = learning_rate

-    def _initialize_tensors(self, block):
-        lr_shape = [1]
-        # create a variable for learning_rate
-        self._lr = self.helper.create_global_variable(
-            name=unique_name("learning_rate"),
-            dtype='float32',
-            shape=lr_shape,
-            lod_level=1,
-            persistable=True)
-        self.helper.set_variable_initializer(
-            var=self._lr, initializer=ConstantInitializer(self._learning_rate))

    def _append_optimize_op(self, block, param_and_grad):
        assert isinstance(block, framework.Block)

        # create the optimize op
        sgd_op = block.append_op(
            type=self.type,
            inputs={
                "Param": param_and_grad[0],
                "Grad": param_and_grad[1],
-                "LearningRate": self._lr
+                "LearningRate": self._create_param_lr(param_and_grad)
            },
            outputs={"ParamOut": param_and_grad[0]})

@@ -259,19 +252,6 @@ def __init__(self,
        self._momentum = momentum
        self._use_nesterov = bool(use_nesterov)

-    def _initialize_tensors(self, block):
-        assert isinstance(block, framework.Block)
-        lr_shape = [1]
-        # create a variable for learning_rate
-        self._lr = self.helper.create_global_variable(
-            name=unique_name("learning_rate"),
-            dtype='float32',
-            shape=lr_shape,
-            lod_level=1,
-            persistable=True)
-        self.helper.set_variable_initializer(
-            var=self._lr, initializer=ConstantInitializer(self._learning_rate))

    def _create_accumulators(self, block, parameters):
        assert isinstance(block, framework.Block)

@@ -290,7 +270,7 @@ def _append_optimize_op(self, block, param_and_grad):
                "Param": param_and_grad[0],
                "Grad": param_and_grad[1],
                "Velocity": velocity_acc,
-                "LearningRate": self._lr
+                "LearningRate": self._create_param_lr(param_and_grad)
            },
            outputs={
                "ParamOut": param_and_grad[0],
@@ -315,18 +295,6 @@ def __init__(self, learning_rate, epsilon=1.0e-6, global_step=None):
        self._learning_rate = learning_rate
        self._epsilon = epsilon

-    def _initialize_tensors(self, block):
-        lr_shape = [1]
-        # create a variable for learning_rate
-        self._lr = self.helper.create_global_variable(
-            name=unique_name("learning_rate"),
-            dtype='float32',
-            shape=lr_shape,
-            lod_level=1,
-            persistable=True)
-        self.helper.set_variable_initializer(
-            var=self._lr, initializer=ConstantInitializer(self._learning_rate))

    def _create_accumulators(self, block, parameters):
        assert isinstance(block, framework.Block)

@@ -346,7 +314,7 @@ def _append_optimize_op(self, block, param_and_grad):
                "Param": param_and_grad[0],
                "Grad": param_and_grad[1],
                "Moment": moment_acc,
-                "LearningRate": self._lr
+                "LearningRate": self._create_param_lr(param_and_grad)
            },
            outputs={"ParamOut": param_and_grad[0],
                     "MomentOut": moment_acc},
@@ -378,18 +346,6 @@ def __init__(self,
        self._beta2 = beta2
        self._epsilon = epsilon

-    def _initialize_tensors(self, block):
-        lr_shape = [1]
-        # create a variable for learning_rate
-        self._lr = self.helper.create_global_variable(
-            name=unique_name("learning_rate"),
-            dtype='float32',
-            shape=lr_shape,
-            lod_level=1,
-            persistable=True)
-        self.helper.set_variable_initializer(
-            var=self._lr, initializer=ConstantInitializer(self._learning_rate))

    def _create_accumulators(self, block, parameters):
        assert isinstance(block, framework.Block)

@@ -433,7 +389,7 @@ def _append_optimize_op(self, block, param_and_grad):
            inputs={
                "Param": param_and_grad[0],
                "Grad": param_and_grad[1],
-                "LearningRate": self._lr,
+                "LearningRate": self._create_param_lr(param_and_grad),
                "Moment1": moment1,
                "Moment2": moment2,
                "Beta1Pow": self._beta1_pow_acc,
@@ -495,18 +451,6 @@ def __init__(self,
        self._beta2 = beta2
        self._epsilon = epsilon

-    def _initialize_tensors(self, block):
-        lr_shape = [1]
-        # create a variable for learning_rate
-        self._lr = self.helper.create_global_variable(
-            name=unique_name("learning_rate"),
-            dtype='float32',
-            shape=lr_shape,
-            lod_level=1,
-            persistable=True)
-        self.helper.set_variable_initializer(
-            var=self._lr, initializer=ConstantInitializer(self._learning_rate))

    def _create_accumulators(self, block, parameters):
        # Create beta1 power accumulator tensor
        beta_shape = [1]
@@ -536,7 +480,7 @@ def _append_optimize_op(self, block, param_and_grad):
            inputs={
                "Param": param_and_grad[0],
                "Grad": param_and_grad[1],
-                "LearningRate": self._lr,
+                "LearningRate": self._create_param_lr(param_and_grad),
                "Moment": moment,
                "InfNorm": inf_norm,
                "Beta1Pow": self._beta1_pow_acc