From 1077256381adba5acba91f6863ebfc74e555322f Mon Sep 17 00:00:00 2001
From: qiaolongfei
Date: Mon, 9 Oct 2017 17:46:14 -0700
Subject: [PATCH 1/5] init optimizer design

---
 doc/design/optimizer.md | 89 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 89 insertions(+)
 create mode 100644 doc/design/optimizer.md

diff --git a/doc/design/optimizer.md b/doc/design/optimizer.md
new file mode 100644
index 0000000000000..9c1eafdcff528
--- /dev/null
+++ b/doc/design/optimizer.md
@@ -0,0 +1,89 @@
+## Optimizer Design
+In a deep learning system, an `Optimizer` is used to optimize (minimize) the loss by updating a list of parameters.
+
+### A typical training process:
+
+1. Run the forward pass to calculate the activations, using the data and the parameters.
+1. Run the backward pass to calculate the gradients of the activations and the parameters, using the cost, the activations, and the parameters.
+1. Run the optimize operators to apply/update the gradients to the corresponding parameters.
+
+### Python Interface to describe the training process
+
+1.
+Users write code to describe the network:
+
+```python
+images = layer.data("images")
+labels = layer.data("labels")
+w1 = pd.var("w1")
+hidden = layer.fc(images, W=w1)
+cost = layer.mse(hidden, labels)
+```
+
+The code above will generate forward operators in a [block](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/block.md).
+
+
+2.
+Users create an Optimizer and set the parameter list that it needs to update.
+
+```python
+optimizer = AdagradOptimizer(learning_rate=0.001)
+```
+
+3.
+Users use the optimizer to `minimize` a certain `cost` by updating the parameters in `parameter_list`.
+
+```python
+opt = optimizer.minimize(cost, parameter_list=[w1, ...])
+```
+
+The return value of `minimize()` is an Operator that depends on all the optimize operators.
+
+4.
+Use a Session/Executor to run this `opt` as the target.
+
+```python
+sess.run(target=[opt], ...)
+```
+
+### What does the optimizer do:
+
+In PaddlePaddle, we use a block of operators to describe computation. From the Python interface described above, we can see that the `Optimizer` should add some operators to the computation block:
+
+1. Gradient ops, used to calculate the gradients.
+2. Optimize ops, used to apply the gradients to the parameters.
+
+#### Optimizer Python interface:
+
+```python
+class Optimizer(object):
+    def _backward(loss):
+        """
+        Add operators to compute the gradients of `loss`.
+        It returns the variables that will be updated for this loss.
+        """
+        ...
+        return variables
+
+    def _update(var_list):
+        """
+        Add operators to apply the gradients to the variables
+        in var_list. It returns an update `Operator`.
+        Running this operator will trace back to all the related
+        update and backward ops.
+        """
+        ...
+        return update_op
+
+    def minimize(loss, var_list):
+        """Add operations to minimize `loss` by updating `var_list`.
+
+        This method simply combines calls to `_backward()` and
+        `_update()`.
+        """
+        variables = _backward(loss)
+        update_op = _update(variables)
+        return update_op
+```
+
+Because we do not want users to care about the steps of `_backward` and `_update`, we decided to expose only `minimize()` to users.
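To make the `_backward`/`_update` split described above concrete, here is a minimal, framework-free sketch of the control flow that `minimize()` is meant to hide. The `Block` class, the `"grad"`/`"sgd"` operator names, and `ToySGDOptimizer` are illustrative stand-ins only, not part of any existing PaddlePaddle API; the toy also passes the parameter list into `_backward()` explicitly, whereas the design expects `_backward()` to discover the variables to update from the loss.

```python
# Illustrative sketch only: the "block" is modeled as a plain list of op
# descriptions, so the effect of minimize() on the block can be inspected.

class Block(object):
    def __init__(self):
        self.ops = []

    def append_op(self, op_type, **attrs):
        op = {"type": op_type, "attrs": attrs}
        self.ops.append(op)
        return op


class ToySGDOptimizer(object):
    """Hypothetical optimizer following the _backward/_update design above."""

    def __init__(self, block, learning_rate=0.01):
        self.block = block
        self.learning_rate = learning_rate

    def _backward(self, loss, parameters):
        # Append one gradient op per parameter the loss depends on.
        for param in parameters:
            self.block.append_op("grad", param=param, loss=loss)
        return parameters

    def _update(self, var_list):
        # Append one update op per variable; the last op appended plays the
        # role of the single Operator that minimize() hands back.
        update_op = None
        for var in var_list:
            update_op = self.block.append_op(
                "sgd", param=var, learning_rate=self.learning_rate)
        return update_op

    def minimize(self, loss, parameter_list):
        variables = self._backward(loss, parameter_list)
        return self._update(variables)


block = Block()
opt = ToySGDOptimizer(block, learning_rate=0.001)
opt.minimize(loss="cost", parameter_list=["w1"])
print([op["type"] for op in block.ops])  # ['grad', 'sgd']
```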
From 94f559b35f74a46768265def056b9d55f30c7441 Mon Sep 17 00:00:00 2001
From: qiaolongfei
Date: Mon, 9 Oct 2017 17:47:23 -0700
Subject: [PATCH 2/5] fix index

---
 doc/design/optimizer.md | 12 ++++--------
 1 file changed, 4 insertions(+), 8 deletions(-)

diff --git a/doc/design/optimizer.md b/doc/design/optimizer.md
index 9c1eafdcff528..e79a058c3f2e2 100644
--- a/doc/design/optimizer.md
+++ b/doc/design/optimizer.md
@@ -9,8 +9,7 @@ In a deep learning system, an `Optimizer` is used to optimize (minimize) the lo
 
 ### Python Interface to describe the training process
 
-1.
-Users write code to describe the network:
+1. Users write code to describe the network:
 
 ```python
 images = layer.data("images")
@@ -23,15 +22,13 @@ cost = layer.mse(hidden, labels)
 The code above will generate forward operators in a [block](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/block.md).
 
 
-2.
-Users create an Optimizer and set the parameter list that it needs to update.
+2. Users create an Optimizer and set the parameter list that it needs to update.
 
 ```python
 optimizer = AdagradOptimizer(learning_rate=0.001)
 ```
 
-3.
-Users use the optimizer to `minimize` a certain `cost` by updating the parameters in `parameter_list`.
+3. Users use the optimizer to `minimize` a certain `cost` by updating the parameters in `parameter_list`.
 
 ```python
 opt = optimizer.minimize(cost, parameter_list=[w1, ...])
@@ -39,8 +36,7 @@ opt = optimizer.minimize(cost, parameter_list=[w1, ...])
 
 The return value of `minimize()` is an Operator that depends on all the optimize operators.
 
-4.
-Use a Session/Executor to run this `opt` as the target.
+4. Use a Session/Executor to run this `opt` as the target.
 
 ```python
 sess.run(target=[opt], ...)

From a30e4b4e9782984b9ea25758a3741445a86790ed Mon Sep 17 00:00:00 2001
From: qiaolongfei
Date: Tue, 10 Oct 2017 14:00:21 -0700
Subject: [PATCH 3/5] optimize the interface

---
 doc/design/optimizer.md | 117 ++++++++++++++++++++++------------------
 1 file changed, 65 insertions(+), 52 deletions(-)

diff --git a/doc/design/optimizer.md b/doc/design/optimizer.md
index e79a058c3f2e2..97a1067a79684 100644
--- a/doc/design/optimizer.md
+++ b/doc/design/optimizer.md
@@ -1,85 +1,98 @@
 ## Optimizer Design
-In a deep learning system, an `Optimizer` is used to optimize (minimize) the loss by updating a list of parameters.
 
-### A typical training process:
+### The Problem
 
-1. Run the forward pass to calculate the activations, using the data and the parameters.
-1. Run the backward pass to calculate the gradients of the activations and the parameters, using the cost, the activations, and the parameters.
-1. Run the optimize operators to apply/update the gradients to the corresponding parameters.
+A PaddlePaddle program, or a block, is a sequence of operators operating on variables. A training program needs to do three kinds of work:
 
-### Python Interface to describe the training process
+1. the forward pass, which computes intermediate results and the cost(s),
+1. the backward pass, which derives gradients from intermediate results and costs, and
+1. the optimization pass, which updates model parameters to optimize the cost(s).
 
-1. Users write code to describe the network:
+These passes rely on three kinds of operators:
 
-```python
-images = layer.data("images")
-labels = layer.data("labels")
-w1 = pd.var("w1")
-hidden = layer.fc(images, W=w1)
-cost = layer.mse(hidden, labels)
-```
+1. forward operators,
+1. gradient operators, and
+1. optimization operators.
 
-The code above will generate forward operators in a [block](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/block.md).
+It's true that users should be able to create all these operators manually by calling some low-level API, but it would be much more convenient if they only had to describe the forward pass and let PaddlePaddle create the backward and optimization operators automatically.
+
+In this design, we propose a high-level API that automatically derives the optimization pass and operators from the forward pass.
 
-2. Users create an Optimizer and set the parameter list that it needs to update.
 
-```python
-optimizer = AdagradOptimizer(learning_rate=0.001)
-```
+### High-level Python API to describe the training process
 
-3. Users use the optimizer to `minimize` a certain `cost` by updating the parameters in `parameter_list`.
+1. Users write code to describe the network:
 
-```python
-opt = optimizer.minimize(cost, parameter_list=[w1, ...])
-```
+    ```python
+    images = layer.data("images")
+    labels = layer.data("labels")
+    w1 = pd.var("w1")
+    b1 = pd.var("b1")
+    hidden = layer.fc(images, w=w1, b=b1)
+    cost = layer.mse(hidden, labels)
+    ```
 
-The return value of `minimize()` is an Operator that depends on all the optimize operators.
+    The above code snippet will create forward operators in [Block](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/block.md).
 
-4. Use a Session/Executor to run this `opt` as the target.
 
-```python
-sess.run(target=[opt], ...)
-```
+2. Users create a certain kind of Optimizer with some arguments.
+
+    ```python
+    optimizer = AdagradOptimizer(learning_rate=0.001)
+    ```
+
+3. Users use the optimizer to `minimize` a certain `cost` by updating the parameters in `parameter_list`.
 
-### What does the optimizer do:
+    ```python
+    opt_op_list = optimizer.minimize(cost, parameter_list=[w1, b1])
+    ```
+    The above code snippet will create gradient and optimization operators in the Block. The return value of `minimize()` is a list of optimization operators that will be run by the session.
 
-In PaddlePaddle, we use a block of operators to describe computation. From the Python interface described above, we can see that the `Optimizer` should add some operators to the computation block:
+4. Users use a Session/Executor to run this `opt_op_list` as the target to do training.
 
-1. Gradient ops, used to calculate the gradients.
-2. Optimize ops, used to apply the gradients to the parameters.
+    ```python
+    sess.run(target=opt_op_list, ...)
+    ```
 
 #### Optimizer Python interface:
 
 ```python
 class Optimizer(object):
-    def _backward(loss):
+    def create_backward_pass(loss, parameter_list=None):
         """
-        Add operators to compute the gradients of `loss`.
-        It returns the variables that will be updated for this loss.
-        """
-        ...
-        return variables
+        Add gradient operators into the Block to compute gradients of `loss`
+        for the parameters in parameter_list.
+
+        Args:
+          loss: a variable generated by the cost function.
+          parameter_list: parameters whose gradients will be computed and applied to minimize the loss.
+
+        Returns:
+          a list of (parameter, gradient) pairs.
+        """
+        return vars_grads
 
-    def _update(var_list):
-        """
-        Add operators to apply the gradients to the variables
-        in var_list. It returns an update `Operator`.
-        Running this operator will trace back to all the related
-        update and backward ops.
+    def create_optimization_pass(vars_grads):
+        """Add operators to apply gradients to variables.
+
+        Args:
+          vars_grads: a list of (variable, gradient) pairs to update.
+
+        Returns:
+          optimization_op_list: a list of optimization operators that will optimize the parameters with the gradients.
         """
         ...
-        return update_op
+        return optimization_op_list
 
-    def minimize(loss, var_list):
-        """Add operations to minimize `loss` by updating `var_list`.
+    def minimize(loss, parameter_list):
+        """Add operations to minimize `loss` by updating `parameter_list`.
 
-        This method simply combines calls to `_backward()` and
-        `_update()`.
+        This method simply combines calls to `create_backward_pass()` and
+        `create_optimization_pass()`.
         """
-        variables = _backward(loss)
-        update_op = _update(variables)
-        return update_op
+        vars_grads = create_backward_pass(loss)
+        update_ops = create_optimization_pass(vars_grads)
+        return update_ops
 ```
 
-Because we do not want users to care about the steps of `_backward` and `_update`, we decided to expose only `minimize()` to users.
+Users can inherit the Optimizer above to create their own Optimizer with some special logic, such as AdagradOptimizer.

From 9ba35a4214f60cdc281d0a437f496ca2e95ff713 Mon Sep 17 00:00:00 2001
From: qiaolongfei
Date: Tue, 10 Oct 2017 14:51:13 -0700
Subject: [PATCH 4/5] add a link to python_api.md

---
 doc/design/python_api.md | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/doc/design/python_api.md b/doc/design/python_api.md
index 6213da65c8c59..2963fcf9d64dd 100644
--- a/doc/design/python_api.md
+++ b/doc/design/python_api.md
@@ -214,3 +214,7 @@ def fc_layer(input, size, ...):
     out.writer = op
     return out
 ```
+
+## Optimizer
+
+[Optimizer Design Doc](./optimizer.md)

From 49318e82846dc34730dfdc9a279c14f0373d0da7 Mon Sep 17 00:00:00 2001
From: qiaolongfei
Date: Tue, 10 Oct 2017 15:40:32 -0700
Subject: [PATCH 5/5] optimize the code of Optimizer

---
 doc/design/optimizer.md | 77 ++++++++++++++++++++++------------------
 1 file changed, 42 insertions(+), 35 deletions(-)

diff --git a/doc/design/optimizer.md b/doc/design/optimizer.md
index 97a1067a79684..17440fae5028c 100644
--- a/doc/design/optimizer.md
+++ b/doc/design/optimizer.md
@@ -58,41 +58,48 @@ In this design, we propose a high-level API that automatically derives the optim
 
 ```python
 class Optimizer(object):
-    def create_backward_pass(loss, parameter_list=None):
-        """
-        Add gradient operators into the Block to compute gradients of `loss`
-        for the parameters in parameter_list.
-
-        Args:
-          loss: a variable generated by the cost function.
-          parameter_list: parameters whose gradients will be computed and applied to minimize the loss.
-
-        Returns:
-          a list of (parameter, gradient) pairs.
-        """
-        return vars_grads
-
-    def create_optimization_pass(vars_grads):
-        """Add operators to apply gradients to variables.
-
-        Args:
-          vars_grads: a list of (variable, gradient) pairs to update.
-
-        Returns:
-          optimization_op_list: a list of optimization operators that will optimize the parameters with the gradients.
-        """
-        ...
-        return optimization_op_list
-
-    def minimize(loss, parameter_list):
-        """Add operations to minimize `loss` by updating `parameter_list`.
-
-        This method simply combines calls to `create_backward_pass()` and
-        `create_optimization_pass()`.
-        """
-        vars_grads = create_backward_pass(loss)
-        update_ops = create_optimization_pass(vars_grads)
-        return update_ops
+    """Optimizer base class.
+    """
+
+    def __init__(self):
+        pass
+
+    def create_backward_pass(self, loss, parameter_list=None):
+        """
+        Create and add gradient operators in BlockDesc to compute
+        gradients of `loss` for the parameters in parameter_list.
+
+        Args:
+          loss: a variable generated by the cost function.
+          parameter_list: parameters whose gradients will be computed and applied to optimize the loss.
+
+        Returns:
+          a list of (parameter, gradient) pairs.
+        """
+        return None
+
+    def create_optimization_pass(self, parameters_and_grads):
+        """Add optimization operators to apply gradients to variables.
+
+        Args:
+          parameters_and_grads: a list of (variable, gradient) pairs to update.
+
+        Returns:
+          optimization_op_list: a list of optimization operators that will update the parameters using the gradients.
+        """
+        return None
+
+    def minimize(self, loss, parameter_list):
+        """Add operations to minimize `loss` by updating `parameter_list`.
+
+        This method combines `create_backward_pass()` and
+        `create_optimization_pass()` into one.
+        """
+        params_grads = self.create_backward_pass(loss, parameter_list)
+        update_ops = self.create_optimization_pass(params_grads)
+        return update_ops
+
 ```
 
 Users can inherit the Optimizer above to create their own Optimizer with some special logic, such as AdagradOptimizer.
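As a sketch of what such a subclass might look like under this design, the snippet below overrides only `create_optimization_pass()`. It is an assumption for illustration, not an existing PaddlePaddle class: `SGDOptimizer` is hypothetical, and the operator descriptions are plain dicts rather than real operators, so the example stays self-contained.

```python
class SGDOptimizer(Optimizer):
    """A minimal, hypothetical subclass that only customizes the optimization pass."""

    def __init__(self, learning_rate=0.01):
        super(SGDOptimizer, self).__init__()
        self.learning_rate = learning_rate

    def create_optimization_pass(self, parameters_and_grads):
        optimization_op_list = []
        for param, grad in parameters_and_grads:
            # One update "operator" per (parameter, gradient) pair, conceptually:
            #   param = param - learning_rate * grad
            optimization_op_list.append({
                "type": "sgd",
                "param": param,
                "grad": grad,
                "learning_rate": self.learning_rate,
            })
        return optimization_op_list


# Usage under the same assumptions:
optimizer = SGDOptimizer(learning_rate=0.001)
ops = optimizer.create_optimization_pass([("w1", "w1_grad"), ("b1", "b1_grad")])
```

An `AdagradOptimizer` would follow the same shape, additionally creating the per-parameter accumulator state it needs before emitting its update operators.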