diff --git a/paddle/framework/backward.cc b/paddle/framework/backward.cc
index c78e056071793..ac80879c54130 100644
--- a/paddle/framework/backward.cc
+++ b/paddle/framework/backward.cc
@@ -281,12 +281,16 @@ static void CreateGradVarInBlock(
  auto ops = block_desc->AllOps();
  for (size_t op_index = grad_op_start_index; op_index < ops.size();
       ++op_index) {
+    bool need_infer_shape = false;
    ForEachVarName(ops[op_index]->Outputs(),
                   [&](const std::string& grad_var_name) {
                     if (block_desc->HasVar(grad_var_name)) {
                       return false;
                     }
-                     block_desc->Var(grad_var_name);
+                     need_infer_shape = true;
+                     auto var = block_desc->Var(grad_var_name);
+                     // FIXME(qiao) infer the datatype
+                     var->SetDataType(framework::DataType::FP32);
                     auto it = param_name_map.find(grad_var_name);
                     if (it == param_name_map.end()) {
                       return false;
@@ -298,6 +302,9 @@ static void CreateGradVarInBlock(
      grad_record.op_idx_ = static_cast<int>(op_index);
      return false; /* not break */
    });
+    if (need_infer_shape) {
+      ops[op_index]->InferShape(*block_desc);
+    }
  }
}
@@ -428,10 +435,16 @@ ParamGradInfoMap AppendBackward(
  auto& all_ops = root_block->ops_;

  // insert fill one op for target
+  // TODO(qiao) add some check to the target.
  std::string fill_one_op_out = GradVarName(target.Name());
+  std::vector<int64_t> target_shape_desc = target.Shape();
+  std::vector<int> target_shape;
+  std::transform(target_shape_desc.begin(), target_shape_desc.end(),
+                 std::back_inserter(target_shape),
+                 [](int64_t dim) { return static_cast<int>(dim); });
  std::unique_ptr<OpDescBind> fill_one_op(
      new OpDescBind("fill_constant", {}, {{"Out", {fill_one_op_out}}},
-                     {{"shape", std::vector<int>{1}},
+                     {{"shape", target_shape},
                      {"value", static_cast<float>(1.0)},
                      {"data_type", framework::DataType::FP32}}));
  all_ops.push_back(std::move(fill_one_op));
@@ -443,13 +456,22 @@ ParamGradInfoMap AppendBackward(
  auto backward_op_descs = MakeBlockBackward(program_desc, root_block_idx,
                                             &no_grad_var_names, &grad_to_var);
-  std::unordered_map<std::string, GradVarInfo> retv;
-
-  // Create Variable
  for (auto& ptr : backward_op_descs) {
    all_ops.push_back(std::move(ptr));
  }
-  root_block->Var(fill_one_op_out);
+  // Create Variable
+
+  // Create target gradient variable
+  std::unordered_map<std::string, GradVarInfo> retv;
+
+  auto var = root_block->Var(fill_one_op_out);
+  // FIXME(qiao) infer the data type
+  var->SetDataType(framework::DataType::FP32);
+  var->SetShape(target.Shape());
+  auto& target_grad = retv[target.Name()];
+  target_grad.name_ = fill_one_op_out;
+  target_grad.block_idx_ = root_block_idx;
+  target_grad.op_idx_ = static_cast<int>(forward_op_num);

  // create grad_var for all blocks in this program
  CreateGradVarInBlock(forward_op_num, grad_to_var, root_block, &retv);
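
Taken together, the AppendBackward changes above mean that the fill_constant op is now created with the target's real shape and that the target's own gradient variable is created and recorded in the returned map. A rough Python-level sketch of the observable effect, using the v2 framework API exercised by the tests in this patch (this sketch is not part of the patch; the variable names and shapes are made up):

import paddle.v2.framework.framework as framework

prog = framework.g_program
block = prog.global_block()
x = block.create_var(dtype="float32", shape=[5, 10], lod_level=0, name="x")
y = block.create_var(dtype="float32", shape=[10, 8], lod_level=0, name="y")
out = block.create_var(dtype="float32", shape=[5, 8], lod_level=0, name="out")
block.append_op(
    type="mul", inputs={"X": x, "Y": y}, outputs={"Out": out},
    attrs={"x_num_col_dims": 1})

param_to_grad = prog.append_backward(out, set())
# "out" itself is now a key in the map, pointing at the "out@GRAD" variable
# that AppendBackward creates with out's shape; this is also why the expected
# var_to_grad sizes in backward_test.cc below each grow by one.
assert "out" in param_to_grad
assert param_to_grad["out"][0] == "out@GRAD"
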
diff --git a/paddle/framework/backward_test.cc b/paddle/framework/backward_test.cc
index 5302afcafb5c0..0c35a157bcfeb 100644
--- a/paddle/framework/backward_test.cc
+++ b/paddle/framework/backward_test.cc
@@ -26,6 +26,20 @@ namespace framework {

using DeviceContext = platform::DeviceContext;

+class NoneOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+ protected:
+  void InferShape(framework::InferShapeContext *ctx) const override {}
+};
+
+template <typename Place, typename T>
+class NoneKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext &context) const override {}
+};
+
class RowWiseAddOpMaker : public OpProtoAndCheckerMaker {
 public:
  RowWiseAddOpMaker(OpProto *proto, OpAttrChecker *op_checker)
@@ -215,19 +229,51 @@ class MinusOpMaker : public OpProtoAndCheckerMaker {
namespace f = paddle::framework;
namespace ops = paddle::operators;
using EnforceNotMet = paddle::platform::EnforceNotMet;
-REGISTER_OPERATOR(rowwise_add, f::NOP, f::RowWiseAddOpMaker,
+// rowwise_add
+REGISTER_OPERATOR(rowwise_add, f::NoneOp, f::RowWiseAddOpMaker,
                  f::RowWiseAddGradMaker);
-REGISTER_OPERATOR(rowwise_add_grad, f::NOP);
-REGISTER_OP(mul, f::NOP, f::MulOpMaker, mul_grad, f::NOP);
-REGISTER_OP(sigmoid, f::NOP, f::SigmoidOpMaker, sigmoid_grad, f::NOP);
-REGISTER_OP_WITHOUT_GRADIENT(nograd, f::NOP, f::NoGradOpMaker);
-REGISTER_OP_WITHOUT_GRADIENT(fill_zeros_like, f::NOP, f::FillZeroOpMaker);
-REGISTER_OP(sum, f::NOP, f::SumOpMaker, sum_grad, f::NOP);
+REGISTER_OP_CPU_KERNEL(rowwise_add,
+                       f::NoneKernel<paddle::platform::CPUPlace, float>);
+REGISTER_OPERATOR(rowwise_add_grad, f::NoneOp);
+REGISTER_OP_CPU_KERNEL(rowwise_add_grad,
+                       f::NoneKernel<paddle::platform::CPUPlace, float>);
+// mul
+REGISTER_OP(mul, f::NoneOp, f::MulOpMaker, mul_grad, f::NoneOp);
+REGISTER_OP_CPU_KERNEL(mul, f::NoneKernel<paddle::platform::CPUPlace, float>);
+REGISTER_OP_CPU_KERNEL(mul_grad,
+                       f::NoneKernel<paddle::platform::CPUPlace, float>);
+// sigmoid
+REGISTER_OP(sigmoid, f::NoneOp, f::SigmoidOpMaker, sigmoid_grad, f::NoneOp);
+REGISTER_OP_CPU_KERNEL(sigmoid,
+                       f::NoneKernel<paddle::platform::CPUPlace, float>);
+REGISTER_OP_WITHOUT_GRADIENT(nograd, f::NoneOp, f::NoGradOpMaker);
+// fill_zeros_like
+REGISTER_OP_WITHOUT_GRADIENT(fill_zeros_like, f::NoneOp, f::FillZeroOpMaker);
+REGISTER_OP_CPU_KERNEL(fill_zeros_like,
+                       f::NoneKernel<paddle::platform::CPUPlace, float>);
+// sum
+REGISTER_OP(sum, f::NoneOp, f::SumOpMaker, sum_grad, f::NoneOp);
+REGISTER_OP_CPU_KERNEL(sum, f::NoneKernel<paddle::platform::CPUPlace, float>);
+REGISTER_OP_CPU_KERNEL(sum_grad,
+                       f::NoneKernel<paddle::platform::CPUPlace, float>);
+// fc
REGISTER_OP_WITHOUT_GRADIENT(fc, f::FcOp, f::FcOpMaker);
-REGISTER_OP(many_output_op, f::NOP, f::ManyOutputOpMaker, many_output_op_grad,
-            f::NOP);
-REGISTER_OP(mult_in_out, f::NOP, f::MultInOutOpMaker, mult_in_out_grad, f::NOP);
-REGISTER_OPERATOR(minus, f::NOP, f::MinusOpMaker, f::MinusGradOpDescMaker);
+// many_output_op
+REGISTER_OP(many_output_op, f::NoneOp, f::ManyOutputOpMaker,
+            many_output_op_grad, f::NoneOp);
+// mult_in_out
+REGISTER_OP(mult_in_out, f::NoneOp, f::MultInOutOpMaker, mult_in_out_grad,
+            f::NoneOp);
+REGISTER_OP_CPU_KERNEL(mult_in_out,
+                       f::NoneKernel<paddle::platform::CPUPlace, float>);
+REGISTER_OP_CPU_KERNEL(mult_in_out_grad,
+                       f::NoneKernel<paddle::platform::CPUPlace, float>);
+// minus
+REGISTER_OPERATOR(minus, f::NoneOp, f::MinusOpMaker, f::MinusGradOpDescMaker);
+REGISTER_OP_CPU_KERNEL(minus, f::NoneKernel<paddle::platform::CPUPlace, float>);
+// scale
+REGISTER_OPERATOR(scale, f::NoneOp);
+REGISTER_OP_CPU_KERNEL(scale, f::NoneKernel<paddle::platform::CPUPlace, float>);

TEST(Backward, simple_op_not_need_grad) {
  auto fwd = f::OpRegistry::CreateOp(
@@ -463,6 +509,7 @@ TEST(Backward, simple_single_op) {
  f::ProgramDesc *program_desc = GetNewProgramDesc();
  f::ProgramDescBind &program = f::ProgramDescBind::Instance(program_desc);
  f::BlockDescBind *block = program.Block(0);
+
  f::OpDescBind *op = block->AppendOp();
  op->SetType("rowwise_add");
  op->SetInput("X", {"x"});
@@ -487,7 +534,7 @@ TEST(Backward, simple_single_op) {
  EXPECT_EQ(grad_op->Output(f::GradVarName("b")),
            std::vector<std::string>({f::GradVarName("b")}));

-  EXPECT_EQ(var_to_grad.size(), 2UL);
+  EXPECT_EQ(var_to_grad.size(), 3UL);
  EXPECT_EQ(var_to_grad.at("b"), f::GradVarInfo(f::GradVarName("b"), 0, 2));
  EXPECT_EQ(var_to_grad.at("x"), f::GradVarInfo(f::GradVarName("x"), 0, 2));
@@ -588,7 +635,7 @@ TEST(Backward, simple_mult_op) {
  EXPECT_EQ(grad_op3->Output(f::GradVarName("b")),
            std::vector<std::string>({f::GradVarName("b3")}));

-  EXPECT_EQ(var_to_grad.size(), 6UL);
+  EXPECT_EQ(var_to_grad.size(), 7UL);
  EXPECT_EQ(var_to_grad.at("x1"), f::GradVarInfo(f::GradVarName("x1"), 0, 6));
  EXPECT_EQ(var_to_grad.at("b1"), f::GradVarInfo(f::GradVarName("b1"), 0, 6));
  EXPECT_EQ(var_to_grad.at("out1"),
@@ -666,7 +713,7 @@ TEST(Backward, intermedia_var_no_grad) {
            std::vector<std::string>({f::GradVarName("out1")}));
  EXPECT_EQ(grad_op4->Output(f::GradVarName("Y")), std::vector<std::string>());

-  EXPECT_EQ(var_to_grad.size(), 3UL);
+  EXPECT_EQ(var_to_grad.size(), 4UL);
  EXPECT_EQ(var_to_grad.at("x1"), f::GradVarInfo(f::GradVarName("x1"), 0, 6));
  EXPECT_EQ(var_to_grad.at("b1"), f::GradVarInfo(f::GradVarName("b1"), 0, 6));
  EXPECT_EQ(var_to_grad.at("out1"),
@@ -744,7 +791,7 @@ TEST(Backward, var_no_grad) {
  EXPECT_EQ(grad_op1->Output(f::GradVarName("H")),
            std::vector<std::string>({f::GradVarName("h1")}));

-  EXPECT_EQ(var_to_grad.size(), 3UL);
+  EXPECT_EQ(var_to_grad.size(), 4UL);
  EXPECT_EQ(var_to_grad.at("y1"), f::GradVarInfo(f::GradVarName("y1"), 0, 3));
  EXPECT_EQ(var_to_grad.at("x1"), f::GradVarInfo(f::GradVarName("x1"), 0, 5));
  EXPECT_EQ(var_to_grad.at("h1"), f::GradVarInfo(f::GradVarName("h1"), 0, 5));
@@ -830,7 +877,7 @@ TEST(Backward, shared_var) {
  EXPECT_EQ(grad_op1->Output(f::GradVarName("b")),
            std::vector<std::string>({f::GradVarName("b1")}));

-  EXPECT_EQ(var_to_grad.size(), 5UL);
+  EXPECT_EQ(var_to_grad.size(), 6UL);
  EXPECT_EQ(var_to_grad.at("b3"), f::GradVarInfo(f::GradVarName("b3"), 0, 4));
  EXPECT_EQ(var_to_grad.at("y2"), f::GradVarInfo(f::GradVarName("y2"), 0, 5));
  EXPECT_EQ(var_to_grad.at("out1"),
@@ -863,7 +910,7 @@ TEST(Backward, half_backward) {
  auto ops = block->AllOps();
  ASSERT_EQ(3UL, ops.size());

-  EXPECT_EQ(var_to_grad.size(), 1UL);
+  EXPECT_EQ(var_to_grad.size(), 2UL);
  EXPECT_EQ(var_to_grad.at("a"),
            f::GradVarInfo(f::GradVarName("a"), 0, forward_len + 1));
}
diff --git a/paddle/operators/mul_op.cc b/paddle/operators/mul_op.cc
index ec0683d8875a9..9db05a8ca0735 100644
--- a/paddle/operators/mul_op.cc
+++ b/paddle/operators/mul_op.cc
@@ -106,10 +106,10 @@ class MulOpGrad : public framework::OperatorWithKernel {
    auto y_dims = ctx->GetInputDim("Y");
    auto out_dims = ctx->GetInputDim(framework::GradVarName("Out"));

-    auto x_mat_dims =
-        framework::flatten_to_2d(x_dims, Attr<int>("x_num_col_dims"));
-    auto y_mat_dims =
-        framework::flatten_to_2d(y_dims, Attr<int>("y_num_col_dims"));
+    auto x_mat_dims = framework::flatten_to_2d(
+        x_dims, ctx->Attrs().Get<int>("x_num_col_dims"));
+    auto y_mat_dims = framework::flatten_to_2d(
+        y_dims, ctx->Attrs().Get<int>("y_num_col_dims"));

    PADDLE_ENFORCE_EQ(
        x_mat_dims[0], out_dims[0],
diff --git a/paddle/pybind/protobuf.cc b/paddle/pybind/protobuf.cc
index b360b05d16c9a..82aae72ba9378 100644
--- a/paddle/pybind/protobuf.cc
+++ b/paddle/pybind/protobuf.cc
@@ -163,6 +163,11 @@ void BindBlockDesc(py::module &m) {
             return self.Var(name);
           },
           py::return_value_policy::reference)
+      .def("has_var",
+           [](BlockDescBind &self, py::bytes byte_name) {
+             std::string name = byte_name;
+             return self.HasVar(name);
+           })
      .def("find_var",
           [](BlockDescBind &self, py::bytes byte_name) {
             std::string name = byte_name;
diff --git a/python/paddle/v2/framework/framework.py b/python/paddle/v2/framework/framework.py
index a17f988bf4330..3fb6efe42a2c6 100644
--- a/python/paddle/v2/framework/framework.py
+++ b/python/paddle/v2/framework/framework.py
@@ -306,6 +306,14 @@ def parent_idx(self):
    def idx(self):
        return self.desc.id

+    def var(self, name):
+        if name not in self.vars:
+            raise ValueError("var %s not in this block" % name)
+        return self.vars[name]
+
+    def all_parameters(self):
+        return {v for k, v in self.vars.iteritems() if isinstance(v, Parameter)}
+
    def create_var(self, *args, **kwargs):
        return Variable(self, *args, **kwargs)

@@ -314,7 +322,8 @@ def has_var(self, name):
    def create_parameter(self, *args, **kwargs):
        global_block = self.program.global_block()
-        return Parameter(global_block, *args, **kwargs)
+        param = Parameter(global_block, *args, **kwargs)
+        return param

    def append_op(self, *args, **kwargs):
        op_desc = self.desc.append_op()
@@ -392,10 +401,16 @@ def __str__(self):
    def global_block(self):
        return self.blocks[0]

+    def block(self, index):
+        return self.blocks[index]
+
    def current_block(self):
        return self.blocks[self.current_block_idx]

    def append_backward(self, target, no_grad_set):
+        """
+        Return a map of param_name -> (grad_name, block_index, op_index).
+        """
        assert isinstance(target, Variable)
        param_to_grad_info = self.desc.append_backward(target.desc, no_grad_set)
        self.sync_with_cpp()
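
Each value in the map returned by append_backward can be indexed as (grad_name, block_index, op_index), so together with the new Program.block() and Block.var() helpers a caller can resolve the actual gradient variable. A rough, illustrative sketch (not part of the patch; the parameter name "w" and the shapes are made up):

import paddle.v2.framework.framework as framework

prog = framework.g_program
block = prog.global_block()
w = block.create_parameter(dtype="float32", shape=[4, 4], lod_level=0, name="w")
x = block.create_var(dtype="float32", shape=[4, 4], lod_level=0, name="x")
out = block.create_var(dtype="float32", shape=[4, 4], lod_level=0, name="out")
block.append_op(
    type="mul", inputs={"X": x, "Y": w}, outputs={"Out": out},
    attrs={"x_num_col_dims": 1})

param_to_grad = prog.append_backward(out, set())
grad_info = param_to_grad["w"]           # (grad_name, block_index, op_index)
grad_block = prog.block(grad_info[1])    # block that holds the gradient variable
grad_var = grad_block.var(grad_info[0])  # the "w@GRAD" variable
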
diff --git a/python/paddle/v2/framework/optimizer.py b/python/paddle/v2/framework/optimizer.py
new file mode 100644
index 0000000000000..e356a7aadb8d6
--- /dev/null
+++ b/python/paddle/v2/framework/optimizer.py
@@ -0,0 +1,124 @@
+import paddle.v2.framework.framework as framework
+
+__all__ = ['SGDOptimizer']
+
+
+class Optimizer(object):
+    """Optimizer base class.
+
+    Defines the common interface of an optimizer.
+    Users should not use this class directly; use one of its subclasses instead.
+    """
+
+    def __init__(self):
+        pass
+
+    def _append_optimize_op(self, block, param_and_grad):
+        """Append the optimize operator to `block` and return the added op.
+        """
+        raise NotImplementedError()
+
+    def create_backward_pass(self, loss, parameter_list=None, no_grad_set=None):
+        """
+        Create and add gradient operators to the BlockDesc to compute the
+        gradients of `loss` for the parameters in parameter_list.
+
+        Args:
+            loss: a variable generated by the cost function.
+            no_grad_set: variables that should not have gradients created.
+            parameter_list: parameters whose gradients are computed and that
+                are updated to optimize the loss.
+
+        Returns:
+            list of (parameter, gradient) pairs.
+        """
+        assert isinstance(loss, framework.Variable)
+        param_grad_map = loss.block.program.append_backward(loss, no_grad_set or
+                                                            set())
+        if parameter_list is not None:
+            parameters = parameter_list
+        else:
+            params = loss.block.program.global_block().all_parameters()
+            parameters = [param.name for param in params]
+        params_and_grads = []
+        for param in parameters:
+            if param not in param_grad_map:
+                raise Exception("param %s is not in map" % param)
+            grad_info = param_grad_map[param]
+            grad_block = loss.block.program.block(grad_info[1])
+            if not grad_block.has_var(grad_info[0]):
+                raise Exception("grad block[%d] did not have grad var %s" %
+                                (grad_info[1], grad_info[0]))
+            param_var = loss.block.var(param)
+            grad_var = grad_block.var(grad_info[0])
+            if loss.block.has_var(grad_info[0]):
+                params_and_grads.append((param_var, grad_var))
+            else:
+                params_and_grads.append((param_var, None))
+        return params_and_grads
+
+    def create_optimization_pass(self, parameters_and_grads, loss):
+        """Add optimization operators to update gradients to variables.
+
+        Args:
+            loss: the target that this optimization is for.
+            parameters_and_grads: a list of (variable, gradient) pairs to update.
+
+        Returns:
+            optimization_op_list: a list of optimization operators that will
+                update the parameters using their gradients.
+        """
+        optimize_ops = []
+        for param_and_grad in parameters_and_grads:
+            if param_and_grad[1] is not None:
+                optimize_op = self._append_optimize_op(loss.block,
+                                                       param_and_grad)
+                optimize_ops.append(optimize_op)
+        return optimize_ops
+
+    def minimize(self, loss, parameter_list=None, no_grad_set=None):
+        """Add operations to minimize `loss` by updating `parameter_list`.
+
+        This method combines the interfaces `create_backward_pass()` and
+        `create_optimization_pass()` into one call.
+        """
+        params_grads = self.create_backward_pass(loss, parameter_list,
+                                                 no_grad_set or set())
+        optimize_ops = self.create_optimization_pass(params_grads, loss)
+        return optimize_ops
+
+
+class SGDOptimizer(Optimizer):
+    """Simple SGD optimizer without any state.
+    """
+
+    def __init__(self, learning_rate):
+        assert learning_rate is not None
+        super(SGDOptimizer, self).__init__()
+        self.type = "sgd"
+        self._learning_rate = learning_rate
+
+    def _append_optimize_op(self, block, param_and_grad):
+        assert isinstance(block, framework.Block)
+        lr_shape = [1]
+        # create a var for the learning_rate
+        lr = block.create_var(dtype="float32", shape=lr_shape, lod_level=0)
+
+        # create an op to init the learning_rate
+        init_op = block.append_op(
+            type="fill_constant",
+            outputs={"Out": lr},
+            attrs={"shape": lr_shape,
+                   "value": self._learning_rate})
+
+        # create the optimize op
+        sgd_op = block.append_op(
+            type=self.type,
+            inputs={
+                "Param": param_and_grad[0],
+                "Grad": param_and_grad[1],
+                "LearningRate": lr
+            },
+            outputs={"ParamOut": param_and_grad[0]},
+            attrs={"shape": [1],
+                   "value": self._learning_rate})
+
+        return sgd_op
diff --git a/python/paddle/v2/framework/tests/test_optimizer.py b/python/paddle/v2/framework/tests/test_optimizer.py
new file mode 100644
index 0000000000000..3d6fa70737bf3
--- /dev/null
+++ b/python/paddle/v2/framework/tests/test_optimizer.py
@@ -0,0 +1,31 @@
+import unittest
+
+import paddle.v2.framework.framework as framework
+import paddle.v2.framework.optimizer as optimizer
+
+
+class TestOptimizer(unittest.TestCase):
+    def test_sgd_optimizer(self):
+        program = framework.g_program
+        block = program.global_block()
+        mul_x = block.create_parameter(
+            dtype="float32", shape=[5, 10], lod_level=0, name="mul.x")
+        mul_y = block.create_var(
+            dtype="float32", shape=[10, 8], lod_level=0, name="mul.y")
+        mul_out = block.create_var(
+            dtype="float32", shape=[5, 8], lod_level=0, name="mul.out")
+        mul_op = block.append_op(
+            type="mul",
+            inputs={"X": mul_x,
+                    "Y": mul_y},
+            outputs={"Out": mul_out},
+            attrs={"x_num_col_dims": 1})
+        sgd_optimizer = optimizer.SGDOptimizer(learning_rate=0.01)
+        opts = sgd_optimizer.minimize(mul_out)
+        self.assertEqual(len(opts), 1)
+        sgd_op = opts[0]
+        self.assertEqual(sgd_op.type, "sgd")
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/framework/tests/test_program.py b/python/paddle/v2/framework/tests/test_program.py
index d06f86c09fe4e..c98dc3492b950 100644
--- a/python/paddle/v2/framework/tests/test_program.py
+++ b/python/paddle/v2/framework/tests/test_program.py
@@ -34,49 +34,11 @@ def test_program(self):
        self.assertEqual(1, b.idx)
        self.assertEqual(0, b.parent_idx)

-    def test_desc_append_backward(self):
-        prog = core.ProgramDesc.__create_program_desc__()
-        self.assertIsNotNone(prog)
-        block = prog.block(0)
-        self.assertIsNotNone(block)
-
-        mul_op_desc = block.append_op()
-        mul_op_desc.set_type("mul")
-        mul_op_desc.set_input("X", ["x1"])
-        mul_op_desc.set_input("Y", ["y1"])
-        mul_op_desc.set_output("Out", ["out1"])
-
-        sum_op_desc = block.append_op()
-        sum_op_desc.set_type("elementwise_add")
-        sum_op_desc.set_input("X", ["out1"])
-        sum_op_desc.set_input("Y", ["b1"])
-        sum_op_desc.set_output("Out", ["out2"])
-
-        target = block.var("out2")
-
-        expect_ops = [
-            "mul", "elementwise_add", "fill_constant", "elementwise_add_grad",
-            "mul_grad"
-        ]
-
-        def grad_name(name):
-            return name + "@GRAD"
-
-        actual_ops = []
-        param_to_grad = prog.append_backward(target, set())
-        for var_name in ("x1", "y1", "out1", "b1"):
-            self.assertEqual(param_to_grad[var_name][0], grad_name(var_name))
-            self.assertEqual(param_to_grad[var_name][1], 0)
-
-        for op in block.all_ops():
-            actual_ops.append(op.type())
-        self.assertEqual(actual_ops, expect_ops)
-
    def test_append_backward(self):
        prog = Program.instance()
        block = prog.global_block()
-        mul_x = block.create_parameter(
+        mul_x = block.create_var(
            dtype="float32", shape=[5, 10], lod_level=0, name="mul.x")
        mul_y = block.create_var(
            dtype="float32", shape=[10, 8], lod_level=0, name="mul.y")
@@ -88,7 +50,35 @@ def test_append_backward(self):
                    "Y": mul_y},
            outputs={"Out": [mul_out]},
            attrs={"x_num_col_dims": 1})
-        param_to_grad = prog.append_backward(mul_out, set())
+
+        add_y = block.create_var(
+            dtype="float32", shape=[5, 8], lod_level=0, name="add.y")
+        add_out = block.create_var(
+            dtype="float32", shape=[5, 8], lod_level=0, name="add.out")
+        add_op = block.append_op(
+            type="elementwise_add",
+            inputs={"X": mul_out,
+                    "Y": add_y},
+            outputs={"Out": add_out},
+            attrs={"x_num_col_dims": 1})
+
+        param_to_grad = prog.append_backward(add_out, set())
+
+        def grad_name(name):
+            return name + "@GRAD"
+
+        for var_name in ("mul.x", "mul.y", "mul.out", "add.y", "add.out"):
+            self.assertEqual(param_to_grad[var_name][0], grad_name(var_name))
+            self.assertEqual(param_to_grad[var_name][1], 0)
+
+        expect_ops = [
+            "mul", "elementwise_add", "fill_constant", "elementwise_add_grad",
+            "mul_grad"
+        ]
+        actual_ops = []
+        for op in block.ops:
+            actual_ops.append(op.type)
+        self.assertEqual(actual_ops, expect_ops)


if __name__ == '__main__':