diff --git a/.clang-tidy b/.clang-tidy index d5935e488ae..31aca7b30a7 100644 --- a/.clang-tidy +++ b/.clang-tidy @@ -1,4 +1,15 @@ -# maybe-* checks are only available on OneFlow custom clang-tidy and clangd -Checks: '-*, maybe-*' +# `maybe-*` checks are only available in OneFlow's custom clang-tidy and clangd +# `-allow-enabling-analyzer-alpha-checkers` should be passed to clang-tidy for CSA checkers named `clang-analyzer-alpha.*` (or `-allow-enabling-alpha-checkers` for run-clang-tidy.py) +# `aggressive-binary-operation-simplification` should be enabled (via `-Xclang -analyzer-config -Xclang aggressive-binary-operation-simplification=true` in clang) +# there are problems with `clang-analyzer-alpha.clone.*`, so it is not enabled +# `clang-analyzer-alpha.deadcode.*` is too verbose to enable +Checks: '-*, maybe-*, clang-analyzer-core.*, clang-analyzer-cplusplus.*, clang-analyzer-nullability.*, clang-analyzer-deadcode.*, clang-analyzer-security.*, clang-analyzer-optin.cplusplus.*, clang-analyzer-optin.performance.*, clang-analyzer-alpha.core.*, clang-analyzer-alpha.cplusplus.*, clang-analyzer-alpha.security.*, cppcoreguidelines-avoid-goto, cppcoreguidelines-init-variables, cppcoreguidelines-interfaces-global-init, cppcoreguidelines-no-malloc, cppcoreguidelines-prefer-member-initializer, cppcoreguidelines-pro-type-member-init, cppcoreguidelines-pro-type-static-cast-downcast, cppcoreguidelines-slicing, cppcoreguidelines-special-member-functions, performance-unnecessary-value-param, performance-unnecessary-copy-initialization, performance-noexcept-move-constructor, performance-no-automatic-move, performance-move-const-arg, performance-implicit-conversion-in-loop, performance-for-range-copy, google-default-arguments, google-global-names-in-headers, google-explicit-constructor' # TODO: treat all maybe warnings as errors when existing warnings are all fixed -WarningsAsErrors: 'maybe-unused' +WarningsAsErrors: 'maybe-unused, clang-analyzer-nullability.*, clang-analyzer-cplusplus.*, performance-implicit-conversion-in-loop, performance-move-const-arg, performance-no-automatic-move, performance-noexcept-move-constructor, google-default-arguments, google-global-names-in-headers' + +CheckOptions: + # `cppcoreguidelines-special-member-functions` is enabled, refer to https://en.cppreference.com/w/cpp/language/rule_of_three + - key: cppcoreguidelines-special-member-functions.AllowSoleDefaultDtor + value: True + - key: performance-move-const-arg.CheckTriviallyCopyableMove + value: False diff --git a/.github/workflows/simple.yml b/.github/workflows/simple.yml index 862217aa37f..b5501d4824d 100644 --- a/.github/workflows/simple.yml +++ b/.github/workflows/simple.yml @@ -50,7 +50,7 @@ jobs: -DCMAKE_BUILD_TYPE=Release \ -DBUILD_TESTING=ON cmake --build . -j$(nproc) --target of_git_version oneflow_deps generate_functional of_cfgobj generate_py_cfg - - name: Run Maybe-related checks by clang-tidy + - name: Run clang-tidy for all translation units # use clang as the compiler to get the correct compiler flags run: | cd build @@ -62,7 +62,7 @@ jobs: -DBUILD_TESTING=ON \ -DCMAKE_EXPORT_COMPILE_COMMANDS=ON cd .. - ./run-clang-tidy.py -clang-tidy-binary ./clang-tidy-489012f-x86_64.AppImage -p build -quiet + ./run-clang-tidy.py -clang-tidy-binary ./clang-tidy-489012f-x86_64.AppImage -p build -quiet -allow-enabling-alpha-checkers -extra-arg="-Xclang" -extra-arg="-analyzer-config" -extra-arg="-Xclang" -extra-arg="aggressive-binary-operation-simplification=true" '^((?!third_party_install).)+(?
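For reference, a hypothetical snippet (not part of this PR) illustrating the two `CheckOptions` entries above: `AllowSoleDefaultDtor` exempts classes whose only special member function is a defaulted destructor from `cppcoreguidelines-special-member-functions`, and `CheckTriviallyCopyableMove=False` keeps `performance-move-const-arg` quiet when `std::move` is applied to a trivially copyable type.

#include <utility>

// AllowSoleDefaultDtor=True: a class whose only special member function is a
// defaulted destructor is NOT flagged by cppcoreguidelines-special-member-functions.
class Base {
 public:
  virtual ~Base() = default;  // allowed as the sole special member
};

// Declaring some-but-not-all special members is still flagged (rule of five).
class Flagged {
 public:
  Flagged(const Flagged&) {}  // warning: copy ctor without the other four members
};

// CheckTriviallyCopyableMove=False: performance-move-const-arg stays quiet here.
int Consume(int x) { return x; }
int Example() {
  int v = 1;
  return Consume(std::move(v));  // not flagged: int is trivially copyable
}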
Backward(const one::TensorTuple& outputs, const one::Ten bool retain_graph, bool create_graph) { if (create_graph) { retain_graph = true; } std::shared_ptr gradients = JUST(CheckAndInitOutGrads(outputs, out_grads)); - JUST(one::GetThreadLocalAutogradEngine()->RunBackwardAndSaveGrads4LeafTensor( + JUST(one::GetThreadLocalAutogradEngine()->RunBackwardAndSaveGrads4LeafTensorIf( outputs, *gradients, retain_graph, create_graph)); return std::make_shared(0); } @@ -86,7 +86,7 @@ Maybe Grad(const one::TensorTuple& outputs, const one::TensorT [](const std::shared_ptr& tensor) { return tensor->requires_grad(); })) << "All input tensors `.requires_grad` should be true"; std::shared_ptr gradients = JUST(CheckAndInitOutGrads(outputs, out_grads)); - return one::GetThreadLocalAutogradEngine()->RunBackwardAndReturnInputsTensorGrad( + return one::GetThreadLocalAutogradEngine()->RunBackwardAndReturnInputsTensorGradIf( outputs, inputs, *gradients, retain_graph, create_graph); } diff --git a/oneflow/api/python/autograd/no_grad_guard.cpp b/oneflow/api/python/autograd/autograd_mode.cpp similarity index 73% rename from oneflow/api/python/autograd/no_grad_guard.cpp rename to oneflow/api/python/autograd/autograd_mode.cpp index 7d8aea56ab7..975e3f87c9c 100644 --- a/oneflow/api/python/autograd/no_grad_guard.cpp +++ b/oneflow/api/python/autograd/autograd_mode.cpp @@ -26,11 +26,12 @@ namespace oneflow { namespace autograd { ONEFLOW_API_PYBIND11_MODULE("autograd", m) { - py::class_>(m, "no_grad") - .def(py::init([]() { return std::make_shared(); })) - .def("__enter__", [](const NoGradGuard& no_grad_obj) {}) - .def("__exit__", [](const NoGradGuard& no_grad_obj, const py::object& type, + py::class_>(m, "AutoGradMode") + .def(py::init([](bool mode) { return std::make_shared(mode); })) + .def("__enter__", [](const AutoGradMode& no_grad_obj) {}) + .def("__exit__", [](const AutoGradMode& no_grad_obj, const py::object& type, const py::object& value, const py::object& traceback) {}); + m.def("is_grad_enabled", &GradMode::is_enabled); } } // namespace autograd diff --git a/oneflow/api/python/framework/tensor.cpp b/oneflow/api/python/framework/tensor.cpp index 74c7837355c..839be1cf1ad 100644 --- a/oneflow/api/python/framework/tensor.cpp +++ b/oneflow/api/python/framework/tensor.cpp @@ -30,6 +30,7 @@ limitations under the License. 
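The `AutoGradMode` binding above is a thin wrapper over an RAII guard; `__enter__`/`__exit__` are no-ops because construction and destruction of the C++ object do the work. A minimal sketch, assuming a thread-local flag behind `GradMode::is_enabled` (the guard internals here are illustrative, not OneFlow's exact implementation):

#include <cassert>

// Thread-local grad-mode flag, as exposed to Python via is_grad_enabled().
class GradMode {
 public:
  static bool is_enabled() { return *enabled(); }
  static void set_enabled(bool mode) { *enabled() = mode; }

 private:
  static bool* enabled() {
    static thread_local bool enabled = true;
    return &enabled;
  }
};

// RAII guard: flips the flag on construction, restores it on destruction,
// so `with AutoGradMode(False)` disables grad recording for the scope.
class AutoGradMode {
 public:
  explicit AutoGradMode(bool mode) : prev_(GradMode::is_enabled()) {
    GradMode::set_enabled(mode);
  }
  ~AutoGradMode() { GradMode::set_enabled(prev_); }

 private:
  bool prev_;
};

void Example() {
  assert(GradMode::is_enabled());
  {
    AutoGradMode no_grad(false);
    assert(!GradMode::is_enabled());
  }
  assert(GradMode::is_enabled());
}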
#include "oneflow/core/framework/tensor_method.h" #include "oneflow/core/framework/device.h" #include "oneflow/core/framework/stride.h" +#include "oneflow/core/framework/nd_sbp.h" #include "oneflow/core/framework/py_distribute.h" #include "oneflow/core/functional/value_types.h" #include "oneflow/core/job/placement.cfg.h" @@ -299,9 +300,8 @@ Maybe NewTensor(py::args args, py::kwargs kwargs, Symbol desired_ if (other_tensor->is_local()) { if (placement) { // LocalTensor -> ConsistentTensor - tensor = JUST(functional::ToConsistent(other_tensor, placement, sbp_tuple, - /* identity_grad */ false, - /* grad_sbp_parallels */ {})); + tensor = + JUST(functional::ToConsistent(other_tensor, placement, sbp_tuple, GetNoneSbpList())); } else { // LocalTensor -> LocalTensor if (!device) { device = JUST(Device::New("cpu")); } @@ -310,9 +310,8 @@ Maybe NewTensor(py::args args, py::kwargs kwargs, Symbol desired_ } else { if (placement) { // ConsistentTensor -> ConsistentTensor - tensor = JUST(functional::ToConsistent(other_tensor, placement, sbp_tuple, - /* identity_grad */ false, - /* grad_sbp_parallels */ {})); + tensor = + JUST(functional::ToConsistent(other_tensor, placement, sbp_tuple, GetNoneSbpList())); } else { // ConsistentTensor -> LocalTensor tensor = JUST(functional::ConsistentToLocal(other_tensor)); diff --git a/oneflow/api/python/symbol/placement_symbol.cpp b/oneflow/api/python/symbol/placement_symbol.cpp index d5e5bf5a0f2..0af7c1470c9 100644 --- a/oneflow/api/python/symbol/placement_symbol.cpp +++ b/oneflow/api/python/symbol/placement_symbol.cpp @@ -40,53 +40,6 @@ Maybe MakeShape(const py::tuple& py_shape) { return std::make_shared(shape_dims); } -std::string SerializePlacementSymbol2String(Symbol placement) { - std::string device_type = placement->device_tag() == "gpu" ? 
"\"cuda\"" : "\"cpu\""; - std::vector sorted_node_ids; - HashMap> node_id2sorted_dev_phy_ids; - for (int64_t machine_id : placement->sorted_machine_ids()) { - int64_t node_id = GlobalProcessCtx::NodeId(machine_id); - if (!std::count(sorted_node_ids.begin(), sorted_node_ids.end(), node_id)) { - sorted_node_ids.push_back(node_id); - } - for (int64_t device_id : placement->sorted_dev_phy_ids(machine_id)) { - node_id2sorted_dev_phy_ids[node_id].push_back(device_id); - } - } - std::string machine_device_ids = "{"; - int64_t node_idx = 0; - for (int64_t node_id : sorted_node_ids) { - std::string device_name = std::to_string(node_id) + " : ["; - int64_t device_idx = 0; - for (int64_t device_id : node_id2sorted_dev_phy_ids.at(node_id)) { - device_name += std::to_string(device_id); - if (++device_idx != node_id2sorted_dev_phy_ids.at(node_id).size()) { device_name += ", "; } - } - device_name += "]"; - if (++node_idx != sorted_node_ids.size()) { device_name += ", "; } - machine_device_ids += device_name; - } - machine_device_ids += "}"; - std::string hierarchy = "("; - int32_t hierarchy_dim_idx = 0; - for (int64_t dim : placement->hierarchy()->dim_vec()) { - hierarchy += std::to_string(dim); - if (++hierarchy_dim_idx != placement->hierarchy()->dim_vec().size()) { - hierarchy += ", "; - } else if (placement->hierarchy()->dim_vec().size() == 1) { - hierarchy += ","; - } - } - hierarchy += ")"; - std::string placement_str = "oneflow.placement(device_type=" + device_type - + ", machine_device_ids=" + machine_device_ids - + ", hierarchy=" + hierarchy + ")"; - return placement_str; -} - -auto* CachedSerializePlacementSymbol2String = - DECORATE(&SerializePlacementSymbol2String, ThreadLocal); - struct PlacementSymbolExportUtil { static std::shared_ptr ApiCreatePlacementSymbol( int64_t symbol_id, const std::shared_ptr& symbol_conf) { @@ -207,7 +160,7 @@ struct PlacementSymbolExportUtil { } static std::string PlacementSymbol2String(Symbol placement) { - return CachedSerializePlacementSymbol2String(placement); + return *PlacementToString(placement).GetPtrOrThrow(); } static Maybe> ReplacePlacementDeviceTag(Symbol parallel_desc, diff --git a/oneflow/api/python/symbol/sbp_symbol.cpp b/oneflow/api/python/symbol/sbp_symbol.cpp index 9ed80e214f4..a6787ac0a7d 100644 --- a/oneflow/api/python/symbol/sbp_symbol.cpp +++ b/oneflow/api/python/symbol/sbp_symbol.cpp @@ -20,6 +20,7 @@ limitations under the License. #include "oneflow/core/common/constant.h" #include "oneflow/core/common/maybe.h" #include "oneflow/core/common/symbol.h" +#include "oneflow/core/framework/nd_sbp.h" #include "oneflow/core/job/sbp_parallel.cfg.h" #include "oneflow/core/job/sbp_parallel.h" @@ -30,17 +31,7 @@ namespace oneflow { namespace { std::string SbpParallelSymbolToString(const Symbol& sbp_sym) { - std::string sbp_str = "oneflow.sbp."; - if (sbp_sym->has_broadcast_parallel()) { - sbp_str += "broadcast"; - } else if (sbp_sym->has_partial_sum_parallel()) { - sbp_str += "partial_sum"; - } else if (sbp_sym->has_split_parallel()) { - sbp_str += "split(axis=" + std::to_string(sbp_sym->split_parallel().axis()) + ")"; - } else { - UNIMPLEMENTED(); - } - return sbp_str; + return *SbpToString(sbp_sym).GetPtrOrThrow(); } Maybe>> MakeSplitSbpParallelList(int max_split_axis) { diff --git a/oneflow/core/autograd/autograd_engine.cpp b/oneflow/core/autograd/autograd_engine.cpp index d2ae9d31b7d..f1a43ca6046 100644 --- a/oneflow/core/autograd/autograd_engine.cpp +++ b/oneflow/core/autograd/autograd_engine.cpp @@ -21,6 +21,7 @@ limitations under the License. 
#include "oneflow/core/framework/tensor.h" #include "oneflow/core/framework/tensor_arg.h" #include "oneflow/core/framework/tensor_tuple.h" +#include "oneflow/core/framework/tensor_rpc_util.h" #include "oneflow/core/autograd/autograd_mode.h" #include "oneflow/core/eager/dev_vm_dep_object_consume_mode.h" #include "oneflow/core/functional/functional.h" @@ -56,20 +57,42 @@ Maybe CopyOrAccGrad(AutogradMeta* autograd_meta, bool autograd_mode) { return Maybe::Ok(); } +Maybe RawTorchConsistentTensor(const std::shared_ptr& tensor) { + // Do nothing. + return Maybe::Ok(); +} + +static constexpr auto* TorchConsistentTensor = + DECORATE(&RawTorchConsistentTensor, CheckConsistentTensorMeta); + +Maybe CheckConsistentTensorsMeta(const TensorTuple& tensor_tuple) { + for (const auto& tensor : tensor_tuple) { + if (tensor->is_consistent()) { JUST(TorchConsistentTensor(tensor)); } + } + return Maybe::Ok(); +} + } // namespace -Maybe AutogradEngine::RunBackwardAndSaveGrads4LeafTensor(const TensorTuple& outputs, - const TensorTuple& out_grads, - bool retain_graph, - bool create_graph) { - return RunBackwardAndSaveGrads4LeafTensorIf(outputs, out_grads, retain_graph, create_graph); +Maybe AutogradEngine::RunBackwardAndSaveGrads4LeafTensorIf(const TensorTuple& outputs, + const TensorTuple& out_grads, + bool retain_graph, + bool create_graph) { + JUST(CheckConsistentTensorsMeta(outputs)); + JUST(CheckConsistentTensorsMeta(out_grads)); + DisableCheckConsistentTensorMetaScope disable_meta_check; + return RunBackwardAndSaveGrads4LeafTensor(outputs, out_grads, retain_graph, create_graph); } -Maybe AutogradEngine::RunBackwardAndReturnInputsTensorGrad( +Maybe AutogradEngine::RunBackwardAndReturnInputsTensorGradIf( const TensorTuple& outputs, const TensorTuple& inputs, const TensorTuple& out_grads, bool retain_graph, bool create_graph) { - return RunBackwardAndReturnInputsTensorGradIf(outputs, inputs, out_grads, retain_graph, - create_graph); + JUST(CheckConsistentTensorsMeta(outputs)); + JUST(CheckConsistentTensorsMeta(inputs)); + JUST(CheckConsistentTensorsMeta(out_grads)); + DisableCheckConsistentTensorMetaScope disable_meta_check; + return RunBackwardAndReturnInputsTensorGrad(outputs, inputs, out_grads, retain_graph, + create_graph); } StackFunctionNode::StackFunctionNode( @@ -167,10 +190,10 @@ void StackAutogradEngine::ClearReleasedFunctionNodes() { node_list_.end()); } -Maybe StackAutogradEngine::RunBackwardAndSaveGrads4LeafTensorIf(const TensorTuple& outputs, - const TensorTuple& out_grads, - bool retain_graph, - bool create_graph) { +Maybe StackAutogradEngine::RunBackwardAndSaveGrads4LeafTensor(const TensorTuple& outputs, + const TensorTuple& out_grads, + bool retain_graph, + bool create_graph) { ClearReleasedFunctionNodes(); for (int i = 0; i < outputs.size(); ++i) { JUST(JUST(outputs.at(i)->current_grad())->PushPartialTensor(out_grads.at(i))); @@ -190,7 +213,7 @@ Maybe StackAutogradEngine::RunBackwardAndSaveGrads4LeafTensorIf(const Tens return Maybe::Ok(); } -Maybe StackAutogradEngine::RunBackwardAndReturnInputsTensorGradIf( +Maybe StackAutogradEngine::RunBackwardAndReturnInputsTensorGrad( const TensorTuple& outputs, const TensorTuple& inputs, const TensorTuple& out_grads, bool retain_graph, bool create_graph) { ClearReleasedFunctionNodes(); @@ -396,10 +419,10 @@ Maybe GraphTask::Apply(bool save_grad_for_leaf) { return Maybe::Ok(); } -Maybe GraphAutogradEngine::RunBackwardAndSaveGrads4LeafTensorIf(const TensorTuple& outputs, - const TensorTuple& out_grads, - bool retain_graph, - bool create_graph) { +Maybe 
GraphAutogradEngine::RunBackwardAndSaveGrads4LeafTensor(const TensorTuple& outputs, + const TensorTuple& out_grads, + bool retain_graph, + bool create_graph) { for (int i = 0; i < outputs.size(); ++i) { JUST(JUST(outputs.at(i)->current_grad())->PushPartialTensor(out_grads.at(i))); } @@ -409,7 +432,7 @@ Maybe GraphAutogradEngine::RunBackwardAndSaveGrads4LeafTensorIf(const Tens return Maybe::Ok(); } -Maybe GraphAutogradEngine::RunBackwardAndReturnInputsTensorGradIf( +Maybe GraphAutogradEngine::RunBackwardAndReturnInputsTensorGrad( const TensorTuple& outputs, const TensorTuple& inputs, const TensorTuple& out_grads, bool retain_graph, bool create_graph) { std::shared_ptr input_current_grad = std::make_shared(inputs.size()); diff --git a/oneflow/core/autograd/autograd_engine.h b/oneflow/core/autograd/autograd_engine.h index 5fe1230f0e7..f3f05cafec0 100644 --- a/oneflow/core/autograd/autograd_engine.h +++ b/oneflow/core/autograd/autograd_engine.h @@ -69,13 +69,13 @@ class AutogradEngine { public: virtual ~AutogradEngine() = default; - Maybe RunBackwardAndSaveGrads4LeafTensor(const TensorTuple& outputs, - const TensorTuple& out_grads, bool retain_graph, - bool create_graph); - Maybe RunBackwardAndReturnInputsTensorGrad(const TensorTuple& outputs, - const TensorTuple& inputs, - const TensorTuple& out_grads, - bool retain_graph, bool create_graph); + Maybe RunBackwardAndSaveGrads4LeafTensorIf(const TensorTuple& outputs, + const TensorTuple& out_grads, bool retain_graph, + bool create_graph); + Maybe RunBackwardAndReturnInputsTensorGradIf(const TensorTuple& outputs, + const TensorTuple& inputs, + const TensorTuple& out_grads, + bool retain_graph, bool create_graph); virtual void ClearEngine() = 0; // Builds FunctionNode, binding to all `outputs_` tensors and saving in AutogradEngine virtual Maybe AddBackwardFuncPtr( @@ -88,15 +88,14 @@ class AutogradEngine { AutogradEngine() = default; private: - virtual Maybe RunBackwardAndSaveGrads4LeafTensorIf(const TensorTuple& outputs, - const TensorTuple& out_grads, - bool retain_graph, - bool create_graph) = 0; - virtual Maybe RunBackwardAndReturnInputsTensorGradIf(const TensorTuple& outputs, - const TensorTuple& inputs, - const TensorTuple& out_grads, - bool retain_graph, - bool create_graph) = 0; + virtual Maybe RunBackwardAndSaveGrads4LeafTensor(const TensorTuple& outputs, + const TensorTuple& out_grads, + bool retain_graph, bool create_graph) = 0; + virtual Maybe RunBackwardAndReturnInputsTensorGrad(const TensorTuple& outputs, + const TensorTuple& inputs, + const TensorTuple& out_grads, + bool retain_graph, + bool create_graph) = 0; }; // Stack Autograd Node and Engine @@ -137,14 +136,14 @@ class StackAutogradEngine final : public AutogradEngine { // moment. 
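The renames above (public `...If` wrappers, private virtual workers) follow the non-virtual-interface idiom: the public entry points run the consistent-tensor meta checks exactly once, open a scope that disables further checks, then dispatch to the per-engine implementation. A condensed sketch with placeholder types (not the real signatures):

// Condensed, illustrative sketch of the NVI split used by AutogradEngine above.
class EngineSketch {
 public:
  virtual ~EngineSketch() = default;

  // Public non-virtual entry point ("...If"): validate once, open the
  // check-disabled scope, then dispatch to the virtual implementation.
  void RunBackwardIf() {
    CheckMetas();                           // stands in for CheckConsistentTensorsMeta(...)
    ScopedDisableCheck disable_meta_check;  // stands in for DisableCheckConsistentTensorMetaScope
    (void)disable_meta_check;
    RunBackward();
  }

 private:
  struct ScopedDisableCheck {};    // RAII scope placeholder
  void CheckMetas() {}             // placeholder check
  virtual void RunBackward() = 0;  // per-engine implementation (stack/graph)
};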
std::list> node_list_; void ClearReleasedFunctionNodes(); - Maybe RunBackwardAndSaveGrads4LeafTensorIf(const TensorTuple& outputs, - const TensorTuple& out_grads, bool retain_graph, - bool create_graph) override; - Maybe RunBackwardAndReturnInputsTensorGradIf(const TensorTuple& outputs, - const TensorTuple& inputs, - const TensorTuple& out_grads, - bool retain_graph, - bool create_graph) override; + Maybe RunBackwardAndSaveGrads4LeafTensor(const TensorTuple& outputs, + const TensorTuple& out_grads, bool retain_graph, + bool create_graph) override; + Maybe RunBackwardAndReturnInputsTensorGrad(const TensorTuple& outputs, + const TensorTuple& inputs, + const TensorTuple& out_grads, + bool retain_graph, + bool create_graph) override; }; // Graph Autograd Node and Engine @@ -194,14 +193,14 @@ class GraphAutogradEngine final : public AutogradEngine { const TensorTuple& inputs, TensorTuple* outputs) override; private: - Maybe RunBackwardAndSaveGrads4LeafTensorIf(const TensorTuple& outputs, - const TensorTuple& out_grads, bool retain_graph, - bool create_graph) override; - Maybe RunBackwardAndReturnInputsTensorGradIf(const TensorTuple& outputs, - const TensorTuple& inputs, - const TensorTuple& out_grads, - bool retain_graph, - bool create_graph) override; + Maybe RunBackwardAndSaveGrads4LeafTensor(const TensorTuple& outputs, + const TensorTuple& out_grads, bool retain_graph, + bool create_graph) override; + Maybe RunBackwardAndReturnInputsTensorGrad(const TensorTuple& outputs, + const TensorTuple& inputs, + const TensorTuple& out_grads, + bool retain_graph, + bool create_graph) override; }; AutogradEngine* GetThreadLocalAutogradEngine(); diff --git a/oneflow/core/autograd/gradient_funcs/consistent_to_consistent.cpp b/oneflow/core/autograd/gradient_funcs/consistent_to_consistent.cpp new file mode 100644 index 00000000000..a6e4d504f1c --- /dev/null +++ b/oneflow/core/autograd/gradient_funcs/consistent_to_consistent.cpp @@ -0,0 +1,73 @@ +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ +#include "oneflow/core/framework/id_util.h" +#include "oneflow/core/framework/op_expr_grad_function.h" +#include "oneflow/core/framework/op_builder.h" +#include "oneflow/core/framework/op_interpreter/op_interpreter_util.h" +#include "oneflow/core/framework/op_expr.h" +#include "oneflow/core/framework/nd_sbp.h" +#include "oneflow/core/functional/functional.h" +#include "oneflow/core/common/optional.h" + +namespace oneflow { +namespace one { + +struct ConsistentToConsistentState : public AutoGradCaptureState { + Symbol parallel_desc; + Symbol nd_sbp; +}; + +class ConsistentToConsistentGradFunction : public OpExprGradFunction { + public: + Maybe Init(const OpExpr& op) override { + const auto* fw_op_expr = dynamic_cast(&op); + CHECK_NOTNULL_OR_RETURN(fw_op_expr); + grad_nd_sbp_ = fw_op_expr->grad_nd_sbp(); + return Maybe::Ok(); + } + + Maybe Capture(ConsistentToConsistentState* ctx, const TensorTuple& inputs, + const TensorTuple& outputs, + const OpExprInterpContext& interp_ctx) const override { + CHECK_EQ_OR_RETURN(inputs.size(), 1); + ctx->parallel_desc = JUST(inputs.at(0)->parallel_desc()); + ctx->nd_sbp = JUST(inputs.at(0)->nd_sbp()); + return Maybe::Ok(); + } + + Maybe Apply(const ConsistentToConsistentState* ctx, const TensorTuple& out_grads, + TensorTuple* in_grads) const override { + CHECK_EQ_OR_RETURN(out_grads.size(), 1); + const auto& out_grad = out_grads.at(0); + CHECK_OR_RETURN(out_grad->is_consistent()); + in_grads->resize(1); + const auto& grad_nd_sbp = + grad_nd_sbp_.has_value() ? JUST(grad_nd_sbp_.value()) : JUST(out_grad->nd_sbp()); + const auto& grad_sbp_list = JUST(GetSbpList(grad_nd_sbp)); + const auto& grad_grad_sbp_list = JUST(GetSbpList(ctx->nd_sbp)); + in_grads->at(0) = JUST(one::functional::ToConsistent(out_grad, ctx->parallel_desc, + *grad_sbp_list, *grad_grad_sbp_list)); + return Maybe::Ok(); + } + + private: + Optional> grad_nd_sbp_; +}; + +REGISTER_OP_EXPR_GRAD_FUNCTION("consistent_to_consistent", ConsistentToConsistentGradFunction); + +} // namespace one +} // namespace oneflow diff --git a/oneflow/core/common/error.h b/oneflow/core/common/error.h index 47a55a67dcd..9785b16b3d8 100644 --- a/oneflow/core/common/error.h +++ b/oneflow/core/common/error.h @@ -112,6 +112,12 @@ Error&& operator<<(Error&& error, const T& x) { return std::move(error); } +template<> +inline Error&& operator<<(Error&& error, const std::stringstream& x) { + error << x.str(); + return std::move(error); +} + template<> inline Error&& operator<<(Error&& error, const std::ostream& x) { error << x.rdbuf(); diff --git a/oneflow/core/eager/eager_blob_object.cpp b/oneflow/core/eager/eager_blob_object.cpp index 304465fdff2..5d187a0c207 100644 --- a/oneflow/core/eager/eager_blob_object.cpp +++ b/oneflow/core/eager/eager_blob_object.cpp @@ -47,9 +47,7 @@ Maybe EagerBlobObject::InitBlob() { { header_buffer_.reset(); int64_t header_byte_size = blob_desc_.AlignedByteSizeOfBlobHeader(); - const auto& FreeHeader = [header_byte_size](char* dptr) { std::free(dptr); }; - char* ptr = reinterpret_cast(std::malloc(header_byte_size)); - header_buffer_ = std::unique_ptr>(ptr, FreeHeader); + header_buffer_ = std::make_unique(header_byte_size); } blob_.reset(new Blob(*mem_case_, &blob_desc_, header_buffer_.get(), nullptr)); return Maybe::Ok(); diff --git a/oneflow/core/eager/eager_blob_object.h b/oneflow/core/eager/eager_blob_object.h index e81ff9e792d..aa43ead930e 100644 --- a/oneflow/core/eager/eager_blob_object.h +++ b/oneflow/core/eager/eager_blob_object.h @@ -90,7 +90,7 @@ class EagerBlobObject final : 
public BlobObject { const Optional& dep_object); std::unique_ptr blob_; - std::unique_ptr> header_buffer_; + std::unique_ptr header_buffer_; std::shared_ptr tensor_buffer_; std::size_t blob_body_bytes_; std::unique_ptr non_pod_initer_; diff --git a/oneflow/core/framework/nd_sbp.cpp b/oneflow/core/framework/nd_sbp.cpp index d69ba0729a9..5640ad12910 100644 --- a/oneflow/core/framework/nd_sbp.cpp +++ b/oneflow/core/framework/nd_sbp.cpp @@ -22,20 +22,6 @@ namespace oneflow { namespace { -Maybe> FindOrCreateNdSbp(const std::vector>& sbp_list) { - static thread_local auto* sbp_list2nd_sbp = - new HashMap>, Symbol>(); - auto iter = sbp_list2nd_sbp->find(sbp_list); - if (iter == sbp_list2nd_sbp->end()) { - cfg::NdSbp nd_sbp; - for (Symbol sbp_symbol : sbp_list) { - *(nd_sbp.mutable_sbp_parallel()->Add()) = *sbp_symbol; - } - iter = sbp_list2nd_sbp->emplace(sbp_list, SymbolOf(nd_sbp)).first; - } - return iter->second; -} - Maybe> FindOrCreateNdSbpString(Symbol nd_sbp) { static thread_local auto* nd_sbp2nd_sbp_str = new HashMap, std::shared_ptr>>(); @@ -81,10 +67,6 @@ Maybe> GetDualNdSbp(Symbol nd_sbp) { return iter->second; } -Maybe> GetNdSbp(const std::vector>& sbp_list) { - return FindOrCreateNdSbp(sbp_list); -} - Maybe> GetNdSbpStrList( const std::vector>& sbp_list) { return FindOrCreateNdSbpString(JUST(GetNdSbp(sbp_list))); @@ -98,4 +80,54 @@ Maybe> GetDualNdSbpStrList(Symbol nd_sbp) { return GetNdSbpStrList(JUST(GetDualNdSbp(nd_sbp))); } +namespace private_details { + +Maybe> RawGetNdSbp(const std::vector>& sbp_list) { + CHECK_OR_RETURN(!sbp_list.empty()); + cfg::NdSbp nd_sbp; + for (const auto& sbp : sbp_list) { *(nd_sbp.mutable_sbp_parallel()->Add()) = *sbp; } + return SymbolOf(nd_sbp); +} + +Maybe>> RawGetSbpList(Symbol nd_sbp) { + const auto& vec = std::make_shared>>(); + CHECK_OR_RETURN(!nd_sbp->sbp_parallel().empty()); + for (const auto& sbp_parallel : nd_sbp->sbp_parallel()) { + vec->push_back(SymbolOf(sbp_parallel)); + } + return vec; +} + +} // namespace private_details + +const std::vector>& GetNoneSbpList() { + static thread_local std::vector> none; + return none; +} + +Maybe SbpToString(Symbol sbp_sym) { + std::string sbp_str = "oneflow.sbp."; + if (sbp_sym->has_broadcast_parallel()) { + sbp_str += "broadcast"; + } else if (sbp_sym->has_partial_sum_parallel()) { + sbp_str += "partial_sum"; + } else if (sbp_sym->has_split_parallel()) { + sbp_str += "split(axis=" + std::to_string(sbp_sym->split_parallel().axis()) + ")"; + } else { + UNIMPLEMENTED_THEN_RETURN(); + } + return sbp_str; +} + +Maybe NdSbpToString(Symbol nd_sbp) { + std::string str = "("; + for (int i = 0; i < nd_sbp->sbp_parallel_size(); ++i) { + if (i > 0) { str += ", "; } + str += *JUST(SbpToString(SymbolOf(nd_sbp->sbp_parallel(i)))); + } + if (nd_sbp->sbp_parallel_size() == 1) { str += ","; } + str += ")"; + return str; +} + } // namespace oneflow diff --git a/oneflow/core/framework/nd_sbp.h b/oneflow/core/framework/nd_sbp.h index 5e34be0ee21..445caeafdee 100644 --- a/oneflow/core/framework/nd_sbp.h +++ b/oneflow/core/framework/nd_sbp.h @@ -17,15 +17,17 @@ limitations under the License. 
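The `private_details::Raw*` functions above are intended to be wrapped by `DECORATE(..., ThreadLocal)` in nd_sbp.h (below); a minimal sketch, assuming the decorator memoizes results per argument in a thread-local cache (the helper here is hypothetical):

#include <map>
#include <string>

// Illustrative stand-in for DECORATE(&RawFn, ThreadLocal): memoize RawFn's
// result per argument in a thread-local cache, so repeated lookups for the
// same key (e.g. the same sbp list) build the symbol only once per thread.
template<typename Ret, typename Arg>
Ret ThreadLocalMemoized(Ret (*raw_fn)(const Arg&), const Arg& arg) {
  static thread_local std::map<Arg, Ret> cache;
  auto iter = cache.find(arg);
  if (iter == cache.end()) { iter = cache.emplace(arg, raw_fn(arg)).first; }
  return iter->second;
}

std::string RawRepr(const int& x) { return "value:" + std::to_string(x); }

std::string Example() {
  ThreadLocalMemoized(&RawRepr, 7);
  // Second call hits the thread-local cache instead of recomputing.
  return ThreadLocalMemoized(&RawRepr, 7);
}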
#define ONEFLOW_CORE_FRAMEWORK_ND_SBP_H_ #include +#include "oneflow/core/common/util.h" #include "oneflow/core/common/symbol.h" #include "oneflow/core/common/maybe.h" +#include "oneflow/core/common/decorator.h" #include "oneflow/core/job/sbp_parallel.cfg.h" namespace oneflow { -Maybe> GetDualNdSbp(Symbol sbp_list); +Maybe> GetDualNdSbp(Symbol nd_sbp); -Maybe> GetNdSbp(const std::vector>& sbp_list); +Maybe> GetDualNdSbp(Symbol sbp_list); Maybe> GetNdSbpStrList( const std::vector>& sbp_list); @@ -33,6 +35,23 @@ Maybe> GetNdSbpStrList( Maybe> GetNdSbpStrList(Symbol nd_sbp); Maybe> GetDualNdSbpStrList(Symbol nd_sbp); + +Maybe> GetDualNdSbpStrList(Symbol nd_sbp); + +namespace private_details { + +Maybe> RawGetNdSbp(const std::vector>& sbp_list); +Maybe>> RawGetSbpList(Symbol nd_sbp); + +} // namespace private_details + +static constexpr auto* GetNdSbp = DECORATE(&private_details::RawGetNdSbp, ThreadLocalCopiable); +static constexpr auto* GetSbpList = DECORATE(&private_details::RawGetSbpList, ThreadLocal); +const std::vector>& GetNoneSbpList(); + +Maybe SbpToString(Symbol sbp_sym); +Maybe NdSbpToString(Symbol nd_sbp); + } // namespace oneflow #endif // ONEFLOW_CORE_FRAMEWORK_ND_SBP_H_ diff --git a/oneflow/core/framework/op_expr.cpp b/oneflow/core/framework/op_expr.cpp index 5506f2494ac..42290c006b4 100644 --- a/oneflow/core/framework/op_expr.cpp +++ b/oneflow/core/framework/op_expr.cpp @@ -59,6 +59,11 @@ const std::string& BuiltinOpExprImpl::op_type_name() const { return op_proto_.op_type_name(); } +const std::string& ConsistentToConsistentOpExpr::op_type_name() const { + static const std::string kOpTypeName = "consistent_to_consistent"; + return kOpTypeName; +} + const std::string& CastToConsistentOpExpr::op_type_name() const { static const std::string kOpTypeName = "cast_to_consistent"; return kOpTypeName; @@ -69,11 +74,6 @@ const std::string& CastFromConsistentOpExpr::op_type_name() const { return kOpTypeName; } -const std::string& ConsistentToConsistentOpExpr::op_type_name() const { - static const std::string kOpTypeName = "consistent_to_consistent"; - return kOpTypeName; -} - #define DEFINE_OPEXPR_IS_GRAD_DISABLED_DEFAULT_VALUE(_T, _bool) \ template<> \ Maybe BuiltinOpExprImpl<_T>::IsGradDisabled() const { \ @@ -399,6 +399,16 @@ Maybe> UserOpExpr::InferDevices(const AttrMap& attrs, return TRY(device_infer_fn_(&device_infer_ctx)); } +ConsistentToConsistentOpExpr::ConsistentToConsistentOpExpr( + const Optional>& grad_nd_sbp) + : grad_nd_sbp_(grad_nd_sbp) {} + +/* static */ Maybe ConsistentToConsistentOpExpr::New( + const Optional>& grad_nd_sbp) { + auto* ptr = new ConsistentToConsistentOpExpr(grad_nd_sbp); + return std::shared_ptr(ptr); +} + CastConsistentOpExpr::CastConsistentOpExpr(const std::string& op_name) : op_name_(op_name) {} CastToConsistentOpExpr::CastToConsistentOpExpr(const std::string& op_name) @@ -416,14 +426,6 @@ CastFromConsistentOpExpr::CastFromConsistentOpExpr(const std::string& op_name) return std::shared_ptr(new CastFromConsistentOpExpr(op_name)); } -ConsistentToConsistentOpExpr::ConsistentToConsistentOpExpr(const std::string& op_name) - : CastConsistentOpExpr(op_name) {} - -/* static */ Maybe ConsistentToConsistentOpExpr::New( - const std::string& op_name) { - return std::shared_ptr(new ConsistentToConsistentOpExpr(op_name)); -} - template<> Maybe BuiltinOpExprImpl::BuildOpConf(OperatorConf* op_conf, const AttrMap& attrs) const { @@ -509,6 +511,15 @@ Maybe BuiltinOpExprImpl::GetOrCreateO UNIMPLEMENTED_THEN_RETURN(); } +Maybe 
ConsistentToConsistentOpExpr::GetOrCreateOpGradClosure() const { + if (!op_grad_func_.get()) { + op_grad_func_.reset(NewObj("consistent_to_consistent")); + CHECK_NOTNULL_OR_RETURN(op_grad_func_.get()); + JUST(op_grad_func_->Init(*this)); + } + return std::make_shared(op_grad_func_); +} + Maybe CastToConsistentOpExpr::GetOrCreateOpGradClosure() const { if (!op_grad_func_.get()) { op_grad_func_.reset(NewObj("cast_to_consistent")); diff --git a/oneflow/core/framework/op_expr.h b/oneflow/core/framework/op_expr.h index 8e7d139beda..7238edd303b 100644 --- a/oneflow/core/framework/op_expr.h +++ b/oneflow/core/framework/op_expr.h @@ -18,6 +18,8 @@ limitations under the License. #include "oneflow/core/common/util.h" #include "oneflow/core/common/symbol.h" +#include "oneflow/core/common/optional.h" +#include "oneflow/core/job/sbp_parallel.cfg.h" #include "oneflow/core/operator/op_conf.pb.h" #include "oneflow/core/framework/attr_map.h" #include "oneflow/core/framework/device.h" @@ -155,6 +157,27 @@ class UserOpExpr final : public BuiltinOpExprImpl { std::shared_ptr consistent_tensor_infer_cache_; }; +class ConsistentToConsistentOpExpr : public OpExpr { + public: + virtual ~ConsistentToConsistentOpExpr() = default; + + static Maybe New(const Optional>& grad_nd_sbp); + + const Optional>& grad_nd_sbp() const { return grad_nd_sbp_; } + const std::string& op_type_name() const override; + int input_size() const override { return 1; } + int output_size() const override { return 1; } + + Maybe IsGradDisabled() const override { return false; } + Maybe GetOrCreateOpGradClosure() const override; + + protected: + ConsistentToConsistentOpExpr(const Optional>& grad_nd_sbp); + + Optional> grad_nd_sbp_; // Reserved for configuring grad sbp + mutable std::shared_ptr op_grad_func_; +}; + class CastConsistentOpExpr : public OpExpr { public: virtual ~CastConsistentOpExpr() = default; @@ -198,25 +221,6 @@ class CastFromConsistentOpExpr final : public CastConsistentOpExpr { CastFromConsistentOpExpr(const std::string& op_name); }; -class ConsistentToConsistentOpExpr final : public CastConsistentOpExpr { - public: - ~ConsistentToConsistentOpExpr() = default; - - static Maybe New(const std::string& op_name); - - const std::string& op_type_name() const override; - - // Note(zwx): ConsistentToConsistentOpExpr is currently only used by lazy, - // there's no need to gen grad through autograd engine - Maybe IsGradDisabled() const override { return true; } - Maybe GetOrCreateOpGradClosure() const override { - UNIMPLEMENTED_THEN_RETURN(); - } - - private: - ConsistentToConsistentOpExpr(const std::string& op_name); -}; - // NOTE(chengcheng): For Lazy nn.Graph Feed/Fetch EagerTensor to/from LazyTensor. 
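`GetOrCreateOpGradClosure` above builds the grad function lazily on first use and caches it in a `mutable` member so the accessor can stay `const`. A stripped-down sketch of that pattern with illustrative types:

#include <memory>
#include <string>

// Illustrative lazy-init-and-cache pattern: the grad function is built on
// first request and reused afterwards, which is why op_grad_func_ is declared
// `mutable` on a const accessor path.
struct GradFuncSketch {
  std::string name;
};

class OpExprSketch {
 public:
  std::shared_ptr<GradFuncSketch> GetOrCreateGrad() const {
    if (!grad_func_) {
      // Stands in for NewObj<std::string, OpExprGradFunctionIf>(...) + Init(*this).
      grad_func_ = std::make_shared<GradFuncSketch>();
      grad_func_->name = "consistent_to_consistent";
    }
    return grad_func_;
  }

 private:
  mutable std::shared_ptr<GradFuncSketch> grad_func_;
};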
using FeedInputOpExpr = BuiltinOpExprImpl; using FeedVariableOpExpr = BuiltinOpExprImpl; diff --git a/oneflow/core/framework/op_interpreter.h b/oneflow/core/framework/op_interpreter.h index 71ea6b844d4..47a60ae1b5d 100644 --- a/oneflow/core/framework/op_interpreter.h +++ b/oneflow/core/framework/op_interpreter.h @@ -83,6 +83,7 @@ class OpExprInterpreter { _macro(VariableOp); \ _macro(CastToMirroredOp); \ _macro(CastFromMirroredOp); \ + _macro(ConsistentToConsistentOp); \ _macro(CastToConsistentOp); \ _macro(CastFromConsistentOp); \ _macro(DistributeSplitOp); \ diff --git a/oneflow/core/framework/op_interpreter/boxing/cuda_based_cpu_mpi_boxing_interpreter.cpp b/oneflow/core/framework/op_interpreter/boxing/cuda_based_cpu_mpi_boxing_interpreter.cpp new file mode 100644 index 00000000000..071bc860b51 --- /dev/null +++ b/oneflow/core/framework/op_interpreter/boxing/cuda_based_cpu_mpi_boxing_interpreter.cpp @@ -0,0 +1,63 @@ +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#include "oneflow/core/framework/op_interpreter/boxing/cuda_based_cpu_mpi_boxing_interpreter.h" +#include "oneflow/core/framework/op_interpreter/boxing/eager_boxing_interpreter_mgr.h" +#include "oneflow/core/job/parallel_desc.h" + +namespace oneflow { + +struct CudaBaseMpiEagerBoxingCall { + std::shared_ptr opt_h2d; + std::shared_ptr gpu_mpi; + std::shared_ptr opt_d2h; +}; + +Maybe RawGetCudaBaseMpiEagerBoxingCall( + Symbol in_nd_sbp, Symbol out_nd_sbp, + Symbol in_parallel_desc, Symbol out_parallel_desc) { + const auto& gpu_in_parallel_desc = JUST(ReplaceDeviceType(in_parallel_desc, DeviceType::kGPU)); + const auto& gpu_out_parallel_desc = JUST(ReplaceDeviceType(out_parallel_desc, DeviceType::kGPU)); + CHECK_OR_RETURN(gpu_in_parallel_desc == gpu_out_parallel_desc); + const auto& opt_h2d = + JUST(EagerBoxingCall::New(in_nd_sbp, in_nd_sbp, in_parallel_desc, gpu_in_parallel_desc)); + const auto& gpu_mpi = JUST( + EagerBoxingCall::New(in_nd_sbp, out_nd_sbp, gpu_in_parallel_desc, gpu_out_parallel_desc)); + const auto& opt_d2h = + JUST(EagerBoxingCall::New(out_nd_sbp, out_nd_sbp, gpu_out_parallel_desc, out_parallel_desc)); + return std::shared_ptr(new CudaBaseMpiEagerBoxingCall{ + .opt_h2d = opt_h2d, + .gpu_mpi = gpu_mpi, + .opt_d2h = opt_d2h, + }); +} + +static constexpr auto* GetCudaBaseMpiEagerBoxingCall = + DECORATE(&RawGetCudaBaseMpiEagerBoxingCall, ThreadLocal); + +Maybe CudaBasedCpuMpiBoxingInterpreter::InterpretImpl( + const std::shared_ptr& input, Symbol in_nd_sbp, + Symbol out_nd_sbp, Symbol in_parallel_desc, + Symbol out_parallel_desc) const { + const auto& call = JUST( + GetCudaBaseMpiEagerBoxingCall(in_nd_sbp, out_nd_sbp, in_parallel_desc, out_parallel_desc)); + auto tensor = input; + tensor = JUST(call->opt_h2d->Apply(tensor)); + tensor = JUST(call->gpu_mpi->Apply(tensor)); + tensor = JUST(call->opt_d2h->Apply(tensor)); + return tensor; +} + +} // namespace oneflow diff --git a/oneflow/core/framework/op_interpreter/boxing/cuda_based_cpu_mpi_boxing_interpreter.h 
b/oneflow/core/framework/op_interpreter/boxing/cuda_based_cpu_mpi_boxing_interpreter.h new file mode 100644 index 00000000000..b2bd02dc8f5 --- /dev/null +++ b/oneflow/core/framework/op_interpreter/boxing/cuda_based_cpu_mpi_boxing_interpreter.h @@ -0,0 +1,37 @@ +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#ifndef ONEFLOW_CORE_FRAMEWORK_OP_INTERPRETER_BOXING_CUDA_BASED_CPU_MPI_BOXING_INTERPRETER_H_ +#define ONEFLOW_CORE_FRAMEWORK_OP_INTERPRETER_BOXING_CUDA_BASED_CPU_MPI_BOXING_INTERPRETER_H_ + +#include "oneflow/core/framework/op_interpreter/boxing/eager_boxing_interpreter.h" + +namespace oneflow { + +class CudaBasedCpuMpiBoxingInterpreter final : public EagerBoxingInterpreter { + public: + CudaBasedCpuMpiBoxingInterpreter() = default; + ~CudaBasedCpuMpiBoxingInterpreter() override = default; + + private: + Maybe InterpretImpl(const std::shared_ptr& input, + Symbol in_nd_sbp, Symbol out_nd_sbp, + Symbol in_parallel_desc, + Symbol out_parallel_desc) const override; +}; + +} // namespace oneflow + +#endif // ONEFLOW_CORE_FRAMEWORK_OP_INTERPRETER_BOXING_CUDA_BASED_CPU_MPI_BOXING_INTERPRETER_H_ diff --git a/oneflow/core/framework/op_interpreter/boxing/cuda_copy_boxing_interpreter.cpp b/oneflow/core/framework/op_interpreter/boxing/cuda_copy_boxing_interpreter.cpp new file mode 100644 index 00000000000..67c61f95bbb --- /dev/null +++ b/oneflow/core/framework/op_interpreter/boxing/cuda_copy_boxing_interpreter.cpp @@ -0,0 +1,41 @@ +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ +#include "oneflow/core/framework/op_interpreter/boxing/cuda_copy_boxing_interpreter.h" +#include "oneflow/core/functional/functional.h" +#include "oneflow/core/framework/nd_sbp.h" +#include "oneflow/core/job/parallel_desc.h" + +namespace oneflow { + +Maybe CudaCopyBoxingInterpreter::InterpretImpl( + const std::shared_ptr& input, Symbol in_nd_sbp, + Symbol out_nd_sbp, Symbol in_parallel_desc, + Symbol out_parallel_desc) const { + CHECK_OR_RETURN(in_nd_sbp == out_nd_sbp); + const auto& new_tag_in_parallel_desc = + JUST(ReplaceDeviceType(in_parallel_desc, out_parallel_desc->device_type())); + CHECK_OR_RETURN(new_tag_in_parallel_desc == out_parallel_desc); + const auto& local_tensor = JUST(input->cur_rank_phy_tensor()); + const auto& sbp_list = JUST(GetSbpList(out_nd_sbp)); + const auto& tensor = + JUST(one::functional::ToConsistent(local_tensor, out_parallel_desc, *sbp_list, {})); + CHECK_OR_RETURN(tensor->is_consistent()); + const auto& tensor_placement = JUST(tensor->parallel_desc()); + CHECK_OR_RETURN(tensor_placement == out_parallel_desc); + return tensor; +} + +} // namespace oneflow diff --git a/oneflow/core/framework/op_interpreter/boxing/cuda_copy_boxing_interpreter.h b/oneflow/core/framework/op_interpreter/boxing/cuda_copy_boxing_interpreter.h new file mode 100644 index 00000000000..c99c4f7762d --- /dev/null +++ b/oneflow/core/framework/op_interpreter/boxing/cuda_copy_boxing_interpreter.h @@ -0,0 +1,36 @@ +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#ifndef ONEFLOW_CORE_FRAMEWORK_OP_INTERPRETER_BOXING_CUDA_COPY_BOXING_INTERPRETER_H_ +#define ONEFLOW_CORE_FRAMEWORK_OP_INTERPRETER_BOXING_CUDA_COPY_BOXING_INTERPRETER_H_ + +#include "oneflow/core/framework/op_interpreter/boxing/eager_boxing_interpreter.h" + +namespace oneflow { + +class CudaCopyBoxingInterpreter : public EagerBoxingInterpreter { + public: + CudaCopyBoxingInterpreter() = default; + ~CudaCopyBoxingInterpreter() override = default; + + Maybe InterpretImpl(const std::shared_ptr& input, + Symbol in_nd_sbp, Symbol out_nd_sbp, + Symbol in_parallel_desc, + Symbol out_parallel_desc) const override; +}; + +} // namespace oneflow + +#endif // ONEFLOW_CORE_FRAMEWORK_OP_INTERPRETER_BOXING_CUDA_COPY_BOXING_INTERPRETER_H_ diff --git a/oneflow/core/framework/op_interpreter/boxing/eager_boxing_interpreter.cpp b/oneflow/core/framework/op_interpreter/boxing/eager_boxing_interpreter.cpp new file mode 100644 index 00000000000..1b8083cfedc --- /dev/null +++ b/oneflow/core/framework/op_interpreter/boxing/eager_boxing_interpreter.cpp @@ -0,0 +1,70 @@ +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#include +#include "oneflow/core/framework/op_interpreter/boxing/eager_boxing_interpreter.h" +#include "oneflow/core/framework/op_interpreter/boxing/eager_boxing_interpreter_mgr.h" + +namespace oneflow { + +namespace { +Maybe CheckEagerBoxingDataType(DataType val) { + CHECK_OR_RETURN(val != DataType::kTensorBuffer && val != DataType::kOFRecord) + << "EagerBoxing only supports POD data types."; + return Maybe::Ok(); +} +} // namespace + +Maybe EagerBoxingInterpreter::Interpret(const std::shared_ptr& input, + Symbol in_nd_sbp, + Symbol out_nd_sbp, + Symbol in_parallel_desc, + Symbol out_parallel_desc) const { + JUST(CheckEagerBoxingDataType(input->dtype()->data_type())); + const auto& tensor = + JUST(InterpretImpl(input, in_nd_sbp, out_nd_sbp, in_parallel_desc, out_parallel_desc)); + const auto& tensor_nd_sbp = JUST(tensor->nd_sbp()); + const auto& tensor_placement = JUST(tensor->parallel_desc()); + CHECK_OR_RETURN(tensor_nd_sbp == out_nd_sbp) << typeid(*this).name(); + CHECK_OR_RETURN(tensor_placement == out_parallel_desc) << typeid(*this).name(); + return tensor; +} + +Maybe EagerBoxingCall::New(Symbol in_nd_sbp, + Symbol out_nd_sbp, + Symbol in_parallel_desc, + Symbol out_parallel_desc) { + const auto* mgr = Global::Get(); + const auto& boxing_interpreter = JUST( + mgr->GetEagerBoxingInterpreter(in_nd_sbp, out_nd_sbp, in_parallel_desc, out_parallel_desc)); + return std::shared_ptr(new EagerBoxingCall{ + .boxing_interpreter = boxing_interpreter, + .in_nd_sbp = in_nd_sbp, + .out_nd_sbp = out_nd_sbp, + .in_parallel_desc = in_parallel_desc, + .out_parallel_desc = out_parallel_desc, + }); +} + +Maybe EagerBoxingCall::Apply(const std::shared_ptr& input) const { + const auto& input_nd_sbp = JUST(input->nd_sbp()); + const auto& input_parallel_desc = JUST(input->parallel_desc()); + CHECK_OR_RETURN(input_nd_sbp == this->in_nd_sbp); + CHECK_OR_RETURN(input_parallel_desc == this->in_parallel_desc); + return this->boxing_interpreter->Interpret(input, this->in_nd_sbp, this->out_nd_sbp, + this->in_parallel_desc, this->out_parallel_desc); +} + +} // namespace oneflow diff --git a/oneflow/core/framework/op_interpreter/boxing/eager_boxing_interpreter.h b/oneflow/core/framework/op_interpreter/boxing/eager_boxing_interpreter.h index 155a3fbef3d..9044d8d45ec 100644 --- a/oneflow/core/framework/op_interpreter/boxing/eager_boxing_interpreter.h +++ b/oneflow/core/framework/op_interpreter/boxing/eager_boxing_interpreter.h @@ -24,14 -24,6 @@ limitations under the License.
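`EagerBoxingCall` packages one resolved boxing step (interpreter plus endpoints) so that multi-step boxing, like the `opt_h2d -> gpu_mpi -> opt_d2h` chain in `CudaBasedCpuMpiBoxingInterpreter` above, becomes a simple pipeline. A toy sketch of that composition (types and steps are stand-ins, not OneFlow APIs):

#include <functional>
#include <vector>

// Illustrative pipeline of boxing steps: each step is a precomputed call, and
// ApplyPipeline threads the tensor through them, like chained call->Apply(...).
struct TensorSketch {
  int stage = 0;
};

using Step = std::function<TensorSketch(const TensorSketch&)>;

TensorSketch ApplyPipeline(TensorSketch t, const std::vector<Step>& steps) {
  for (const auto& step : steps) { t = step(t); }
  return t;
}

TensorSketch Example(const TensorSketch& input) {
  Step opt_h2d = [](const TensorSketch& t) { return TensorSketch{t.stage + 1}; };
  Step gpu_mpi = [](const TensorSketch& t) { return TensorSketch{t.stage + 1}; };
  Step opt_d2h = [](const TensorSketch& t) { return TensorSketch{t.stage + 1}; };
  return ApplyPipeline(input, {opt_h2d, gpu_mpi, opt_d2h});
}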
namespace oneflow { -namespace { -inline Maybe CheckEagerBoxingDataType(DataType val) { - CHECK_OR_RETURN(val != DataType::kTensorBuffer && val != DataType::kOFRecord) - << "EagerBoxing only support POD data type."; - return Maybe::Ok(); -} -} // namespace - class EagerBoxingInterpreter { public: OF_DISALLOW_COPY_AND_MOVE(EagerBoxingInterpreter); @@ -41,10 +33,7 @@ class EagerBoxingInterpreter { Maybe Interpret(const std::shared_ptr& input, Symbol in_nd_sbp, Symbol out_nd_sbp, Symbol in_parallel_desc, - Symbol out_parallel_desc) { - JUST(CheckEagerBoxingDataType(input->dtype()->data_type())); - return InterpretImpl(input, in_nd_sbp, out_nd_sbp, in_parallel_desc, out_parallel_desc); - } + Symbol out_parallel_desc) const; protected: virtual Maybe InterpretImpl(const std::shared_ptr& input, @@ -54,6 +43,20 @@ class EagerBoxingInterpreter { Symbol out_parallel_desc) const = 0; }; +struct EagerBoxingCall { + static Maybe New(Symbol in_nd_sbp, Symbol out_nd_sbp, + Symbol in_parallel_desc, + Symbol out_parallel_desc); + + Maybe Apply(const std::shared_ptr& input) const; + + const std::shared_ptr boxing_interpreter; + const Symbol in_nd_sbp; + const Symbol out_nd_sbp; + const Symbol in_parallel_desc; + const Symbol out_parallel_desc; +}; + } // namespace oneflow #endif // ONEFLOW_CORE_FRAMEWORK_OP_INTERPRETER_BOXING_EAGER_BOXING_INTERPRETER_H_ diff --git a/oneflow/core/framework/op_interpreter/boxing/eager_boxing_interpreter_mgr.cpp b/oneflow/core/framework/op_interpreter/boxing/eager_boxing_interpreter_mgr.cpp index 8ccd7428f93..186dc40d0e4 100644 --- a/oneflow/core/framework/op_interpreter/boxing/eager_boxing_interpreter_mgr.cpp +++ b/oneflow/core/framework/op_interpreter/boxing/eager_boxing_interpreter_mgr.cpp @@ -17,13 +17,15 @@ limitations under the License. 
#include "oneflow/core/common/constant.h" #include "oneflow/core/common/decorator.h" #include "oneflow/core/common/container_util.h" -#include "oneflow/core/job/sbp_parallel.h" +#include "oneflow/core/framework/nd_sbp.h" #include "oneflow/core/framework/op_interpreter/boxing/eager_boxing_interpreter_mgr.h" #include "oneflow/core/framework/op_interpreter/boxing/eager_boxing_interpreter_util.h" #include "oneflow/core/framework/op_interpreter/boxing/collective_boxing_interpreter.h" #include "oneflow/core/framework/op_interpreter/boxing/identity_boxing_interpreter.h" #include "oneflow/core/framework/op_interpreter/boxing/naive_b2p_boxing_interpreter.h" #include "oneflow/core/framework/op_interpreter/boxing/naive_s2p_boxing_interpreter.h" +#include "oneflow/core/framework/op_interpreter/boxing/cuda_copy_boxing_interpreter.h" +#include "oneflow/core/framework/op_interpreter/boxing/cuda_based_cpu_mpi_boxing_interpreter.h" namespace oneflow { @@ -31,18 +33,6 @@ namespace { using SbpPair2EagerBoxingInterpreter = HashMap, std::shared_ptr>; -std::string GetSupportedBoxingTypeInfo() { - static std::string supported_boxing_type_info = - "============ Supported eager boxing type============\n" - "\'[S(0)] -> [B]\' on GPU\n" - "\'[S(0)] -> [P]\' on GPU\n" - "\'[P] -> [B]\' on GPU\n" - "\'[P] -> [S(0)]\' on GPU\n" - "\'[B] -> [S(0)]\' on GPU\n" - "\'[B] -> [P]\' on GPU or CPU"; - return supported_boxing_type_info; -} - Maybe GetOneDimNcclCollectiveEagerBoxingInterpreter( Symbol in_nd_sbp, Symbol out_nd_sbp) { static SbpPair2EagerBoxingInterpreter sbp_pair2eager_boxing_interpreter = { @@ -58,54 +48,83 @@ Maybe GetOneDimNcclCollectiveEagerBoxingInterpreter( std::make_shared()}, }; const auto& key = std::make_pair(in_nd_sbp->sbp_parallel(0), out_nd_sbp->sbp_parallel(0)); - CHECK_OR_RETURN(sbp_pair2eager_boxing_interpreter.find(key) - != sbp_pair2eager_boxing_interpreter.end()) - << "Eager boxing type \'" << NdSbpToString(in_nd_sbp) << " -> " << NdSbpToString(out_nd_sbp) - << "\'" - << " not support yet\n" - << GetSupportedBoxingTypeInfo(); - return JUST(MapAt(sbp_pair2eager_boxing_interpreter, key)); } +Maybe GetCudaBasedCpuMpiBoxingInterpreter( + Symbol in_nd_sbp, Symbol out_nd_sbp, + Symbol in_parallel_desc, Symbol out_parallel_desc) { + CHECK_OR_RETURN(in_nd_sbp != out_nd_sbp); + const auto& gpu_in_parallel_desc = JUST(ReplaceDeviceType(in_parallel_desc, DeviceType::kGPU)); + const auto& gpu_out_parallel_desc = JUST(ReplaceDeviceType(out_parallel_desc, DeviceType::kGPU)); + CHECK_OR_RETURN(gpu_in_parallel_desc == gpu_out_parallel_desc); + const auto& gpu_boxing_interpreter = + JUST(GetOneDimNcclCollectiveEagerBoxingInterpreter(in_nd_sbp, out_nd_sbp)); + return std::shared_ptr(new CudaBasedCpuMpiBoxingInterpreter()); +} + +Maybe IgnoringDeviceTypeEqual(Symbol lhs, Symbol rhs) { + if (lhs == rhs) { return true; } + return lhs == JUST(ReplaceDeviceType(rhs, lhs->device_type())); +} + Maybe GetBoxingInterpreter(Symbol in_nd_sbp, Symbol out_nd_sbp, Symbol in_parallel_desc, Symbol out_parallel_desc) { if (in_parallel_desc == out_parallel_desc && (in_parallel_desc->parallel_num() == 1 || in_nd_sbp == out_nd_sbp)) { - static std::shared_ptr identity_boxing_interpreter = - std::make_shared(); - return identity_boxing_interpreter; + return std::shared_ptr(new IdentityBoxingInterpreter()); + } + if (in_nd_sbp->sbp_parallel_size() == 1 && out_nd_sbp->sbp_parallel_size() == 1 + && in_parallel_desc == out_parallel_desc + && EagerBoxingInterpreterUtil::IsBoxingB2P(in_nd_sbp->sbp_parallel(0), + 
out_nd_sbp->sbp_parallel(0))) { + return std::shared_ptr(new NaiveB2PBoxingInterpreter()); + } + if (in_nd_sbp->sbp_parallel_size() == 1 && out_nd_sbp->sbp_parallel_size() == 1 + && in_parallel_desc == out_parallel_desc + && in_parallel_desc->device_type() == DeviceType::kGPU) { + const auto& gpu_boxing_interpreter = + TRY(GetOneDimNcclCollectiveEagerBoxingInterpreter(in_nd_sbp, out_nd_sbp)); + if (gpu_boxing_interpreter.IsOk()) { return JUST(gpu_boxing_interpreter); } + } + if (in_nd_sbp->sbp_parallel_size() == 1 && out_nd_sbp->sbp_parallel_size() == 1 + && in_parallel_desc == out_parallel_desc + && in_parallel_desc->device_type() == DeviceType::kCPU) { + const auto& interpreter = TRY(GetCudaBasedCpuMpiBoxingInterpreter( + in_nd_sbp, out_nd_sbp, in_parallel_desc, out_parallel_desc)); + if (interpreter.IsOk()) { return JUST(interpreter); } + } + if (in_nd_sbp->sbp_parallel_size() == 1 && out_nd_sbp->sbp_parallel_size() == 1 + && JUST(IgnoringDeviceTypeEqual(in_parallel_desc, out_parallel_desc)) + && ((in_parallel_desc->device_type() == DeviceType::kGPU + && out_parallel_desc->device_type() == DeviceType::kCPU) + || (in_parallel_desc->device_type() == DeviceType::kCPU + && out_parallel_desc->device_type() == DeviceType::kGPU)) + && in_nd_sbp == out_nd_sbp) { + return std::shared_ptr(new CudaCopyBoxingInterpreter()); } - if (in_nd_sbp->sbp_parallel_size() == 1 && out_nd_sbp->sbp_parallel_size() == 1) { - if (in_parallel_desc == out_parallel_desc) { - if (EagerBoxingInterpreterUtil::IsBoxingB2P(in_nd_sbp->sbp_parallel(0), - out_nd_sbp->sbp_parallel(0))) { - std::shared_ptr naive_bp_boxing_interpreter = - std::make_shared(); - return naive_bp_boxing_interpreter; - } else if (in_parallel_desc->device_type() == DeviceType::kGPU) { - return GetOneDimNcclCollectiveEagerBoxingInterpreter(in_nd_sbp, out_nd_sbp); - } else { - UNIMPLEMENTED_THEN_RETURN() << "Eager boxing type \'" << NdSbpToString(in_nd_sbp) << " -> " - << NdSbpToString(out_nd_sbp) << "\'" - << " not support yet\n" - << GetSupportedBoxingTypeInfo(); - } - } else { - UNIMPLEMENTED_THEN_RETURN() << "Eager boxing with different placement not support yet\n" - << GetSupportedBoxingTypeInfo(); - } - } else { - UNIMPLEMENTED_THEN_RETURN() << "N-dim eager boxing type \'" << NdSbpToString(in_nd_sbp) - << " -> " << NdSbpToString(out_nd_sbp) << "\'" - << " not support yet\n" - << GetSupportedBoxingTypeInfo(); + if (in_nd_sbp->sbp_parallel_size() == 1 && out_nd_sbp->sbp_parallel_size() == 1 + && JUST(IgnoringDeviceTypeEqual(in_parallel_desc, out_parallel_desc)) + && ((in_parallel_desc->device_type() == DeviceType::kGPU + && out_parallel_desc->device_type() == DeviceType::kCPU) + || (in_parallel_desc->device_type() == DeviceType::kCPU + && out_parallel_desc->device_type() == DeviceType::kGPU)) + && in_nd_sbp != out_nd_sbp) { + const auto& interpreter = TRY(GetCudaBasedCpuMpiBoxingInterpreter( + in_nd_sbp, out_nd_sbp, in_parallel_desc, out_parallel_desc)); + if (interpreter.IsOk()) { return JUST(interpreter); } } + UNIMPLEMENTED_THEN_RETURN() << Error::BoxingNotSupportedError() + << "consistent-to-consistent not supported" + << ". 
from_nd_sbp: " << *JUST(NdSbpToString(in_nd_sbp)) + << ", to_nd_sbp: " << *JUST(NdSbpToString(out_nd_sbp)) + << ", from_placement: " << *JUST(PlacementToString(in_parallel_desc)) + << ", to_placement: " << *JUST(PlacementToString(out_parallel_desc)); } -auto* CachedGetBoxingInterpreter = DECORATE(&GetBoxingInterpreter, ThreadLocal); +static constexpr auto* CachedGetBoxingInterpreter = DECORATE(&GetBoxingInterpreter, ThreadLocal); } // namespace diff --git a/oneflow/core/framework/op_interpreter/boxing/identity_boxing_interpreter.cpp b/oneflow/core/framework/op_interpreter/boxing/identity_boxing_interpreter.cpp index 06a32dbd0f8..13cca74c95b 100644 --- a/oneflow/core/framework/op_interpreter/boxing/identity_boxing_interpreter.cpp +++ b/oneflow/core/framework/op_interpreter/boxing/identity_boxing_interpreter.cpp @@ -13,6 +13,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include "oneflow/core/functional/functional.h" +#include "oneflow/core/framework/nd_sbp.h" #include "oneflow/core/framework/op_interpreter/boxing/identity_boxing_interpreter.h" namespace oneflow { @@ -23,7 +25,10 @@ Maybe IdentityBoxingInterpreter::InterpretImpl( Symbol out_parallel_desc) const { CHECK_OR_RETURN(in_parallel_desc == out_parallel_desc); CHECK_OR_RETURN(in_parallel_desc->parallel_num() == 1 || in_nd_sbp == out_nd_sbp); - return input; + // reset sbp if parallel_num == 1 and reset ConsistentId + std::shared_ptr tensor = JUST(input->cur_rank_phy_tensor()); + return one::functional::ToConsistent(tensor, out_parallel_desc, *JUST(GetSbpList(out_nd_sbp)), + GetNoneSbpList()); } } // namespace oneflow diff --git a/oneflow/core/framework/op_interpreter/boxing/naive_b2p_boxing_interpreter.cpp b/oneflow/core/framework/op_interpreter/boxing/naive_b2p_boxing_interpreter.cpp index e37c7875bab..1034c5eb830 100644 --- a/oneflow/core/framework/op_interpreter/boxing/naive_b2p_boxing_interpreter.cpp +++ b/oneflow/core/framework/op_interpreter/boxing/naive_b2p_boxing_interpreter.cpp @@ -15,6 +15,7 @@ limitations under the License. */ #include "oneflow/core/framework/op_interpreter/boxing/naive_b2p_boxing_interpreter.h" #include "oneflow/core/framework/device.h" +#include "oneflow/core/framework/nd_sbp.h" #include "oneflow/core/job/global_for.h" #include "oneflow/core/job/resource_desc.h" #include "oneflow/core/control/global_process_ctx.h" @@ -28,11 +29,14 @@ Maybe NaiveB2PBoxingInterpreter::InterpretImpl( Symbol out_parallel_desc) const { CHECK_OR_RETURN(in_parallel_desc == out_parallel_desc); int64_t root = JUST(in_parallel_desc->MachineId4ParallelId(0)); + std::shared_ptr tensor = JUST(input->cur_rank_phy_tensor()); if (root == GlobalProcessCtx::Rank()) { - return JUST(one::functional::Identity(input)); + // do nothing } else { - return JUST(one::functional::ZerosLike(input)); + tensor = JUST(one::functional::ZerosLike(tensor)); } + return one::functional::ToConsistent(tensor, out_parallel_desc, *JUST(GetSbpList(out_nd_sbp)), + GetNoneSbpList()); } } // namespace oneflow diff --git a/oneflow/core/framework/op_interpreter/eager_consistent_op_interpreter.cpp b/oneflow/core/framework/op_interpreter/eager_consistent_op_interpreter.cpp index d904c92cb96..93fb9479817 100644 --- a/oneflow/core/framework/op_interpreter/eager_consistent_op_interpreter.cpp +++ b/oneflow/core/framework/op_interpreter/eager_consistent_op_interpreter.cpp @@ -34,6 +34,7 @@ limitations under the License. 
#include "oneflow/user/kernels/stateful_local_opkernel.h" #include "oneflow/core/framework/tensor_rpc_util.h" #include "oneflow/core/framework/tensor_consistent_id.h" +#include "oneflow/core/framework/nd_sbp.h" #include "oneflow/core/common/decorator.h" namespace oneflow { @@ -65,13 +66,11 @@ std::string GetDynamicOpConsistentFailedDebugString(const UserOpExpr& user_op_ex } Maybe CalcBoxingOutput(const std::shared_ptr& input, Symbol out_nd_sbp, - bool current_rank_local_is_valid) { - if (!current_rank_local_is_valid) { return input; } + Symbol out_parallel_desc) { const auto* mgr = Global::Get(); // Eager boxing const auto& in_nd_sbp = JUST(input->nd_sbp()); const auto& in_parallel_desc = JUST(input->parallel_desc()); - const auto& out_parallel_desc = in_parallel_desc; const auto& boxing_interpreter = JUST( mgr->GetEagerBoxingInterpreter(in_nd_sbp, out_nd_sbp, in_parallel_desc, out_parallel_desc)); const auto& output = JUST(boxing_interpreter->Interpret(input, in_nd_sbp, out_nd_sbp, @@ -112,8 +111,11 @@ Maybe Interpret(const UserOpExpr& user_op_expr, const TensorTuple& inputs, for (int i = 0; i < inputs.size(); ++i) { std::shared_ptr input = inputs.at(i); const auto& infered_input_meta = result->input_tensor_metas().at(i); + const auto& input_parallel_desc = JUST(input->parallel_desc()); + CHECK_OR_RETURN(input_parallel_desc == infered_input_meta->parallel_desc()); if (infered_input_meta->nd_sbp() != JUST(input->nd_sbp())) { - input = JUST(GetBoxingOutput(input, infered_input_meta->nd_sbp(), parallel_id.has_value())); + input = JUST(GetBoxingOutput(input, infered_input_meta->nd_sbp(), + infered_input_meta->parallel_desc())); } const auto& local_tensor = JUST(input->cur_rank_phy_tensor()); input_eager_blob_objects->at(i) = JUST(local_tensor->eager_blob_object()); @@ -151,6 +153,57 @@ Maybe EagerConsistentInterpreter::ApplyImpl(const VariableOpExpr& op_expr, OF_UNIMPLEMENTED(); } +namespace { + +static constexpr auto* RecursiveGetBoxingOutput = + DECORATE(&CalcBoxingOutput, CheckConsistentTensorMeta); + +Maybe RawConsistentToConsistent(const ConsistentToConsistentOpExpr& op_expr, + const TensorTuple& inputs, TensorTuple* outputs, + const OpExprInterpContext& ctx) { + CHECK_EQ_OR_RETURN(inputs.size(), 1); + CHECK_EQ_OR_RETURN(outputs->size(), 1); + const auto& input = inputs.at(0); + CHECK_OR_RETURN(input->is_consistent()); + CHECK_OR_RETURN(ctx.parallel_desc.has_value()); + CHECK_OR_RETURN(ctx.nd_sbp.has_value()); + const auto& in_parallel_desc = JUST(input->parallel_desc()); + const auto& out_nd_sbp = JUST(ctx.nd_sbp.value()); + const auto& out_parallel_desc = JUST(ctx.parallel_desc.value()); + const auto& in_parallel_id = JUST(GetParallelId4CurrentProcessCtx(in_parallel_desc)); + const auto& out_parallel_id = JUST(GetParallelId4CurrentProcessCtx(out_parallel_desc)); + const auto& tensor = JUST(RecursiveGetBoxingOutput(input, out_nd_sbp, out_parallel_desc)); + CHECK_OR_RETURN(tensor); + if (out_parallel_id->has_value()) { + const auto& nd_sbp = JUST(tensor->nd_sbp()); + const auto& parallel_desc = JUST(tensor->parallel_desc()); + CHECK_OR_RETURN(nd_sbp == out_nd_sbp) << ". 
nd_sbp: " << *JUST(NdSbpToString(nd_sbp)) + << ", out_nd_sbp" << *JUST(NdSbpToString(out_nd_sbp)); + CHECK_OR_RETURN(parallel_desc == out_parallel_desc); + outputs->at(0) = tensor; + } else { + ConsistentTensorMeta tensor_meta(tensor->shape(), tensor->dtype()->data_type(), out_nd_sbp, + out_parallel_desc); + const auto& tensor_impl = + JUST(EagerConsistentTensorImpl::New(SymbolOf(tensor_meta), tensor->requires_grad(), false)); + outputs->at(0).reset(new ConsistentTensor(tensor_impl)); + } + CHECK_OR_RETURN(outputs->at(0)); + return Maybe::Ok(); +} + +static constexpr auto* ConsistentToConsistent = + DECORATE(&RawConsistentToConsistent, NonRecursiveInitConsistentId); + +} // namespace + +Maybe EagerConsistentInterpreter::ApplyImpl(const ConsistentToConsistentOpExpr& op_expr, + const TensorTuple& inputs, TensorTuple* outputs, + const OpExprInterpContext& ctx) const { + JUST(ConsistentToConsistent(op_expr, inputs, outputs, ctx)); + return Maybe::Ok(); +} + Maybe EagerConsistentInterpreter::ApplyImpl(const CastToConsistentOpExpr& op_expr, const TensorTuple& inputs, TensorTuple* outputs, const OpExprInterpContext& ctx) const { diff --git a/oneflow/core/framework/op_interpreter/eager_mirrored_op_interpreter.cpp b/oneflow/core/framework/op_interpreter/eager_mirrored_op_interpreter.cpp index 36ca96960d1..2427d4c5f35 100644 --- a/oneflow/core/framework/op_interpreter/eager_mirrored_op_interpreter.cpp +++ b/oneflow/core/framework/op_interpreter/eager_mirrored_op_interpreter.cpp @@ -14,6 +14,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "oneflow/core/common/symbol.h" +#include "oneflow/core/common/decorator.h" #include "oneflow/core/framework/device.h" #include "oneflow/core/framework/op_interpreter.h" #include "oneflow/core/framework/op_interpreter/op_interpreter_util.h" @@ -35,6 +36,7 @@ limitations under the License. #include "oneflow/core/autograd/autograd_mode.h" #include "oneflow/core/framework/placement_sbp_util.h" #include "oneflow/core/framework/tensor_rpc_util.h" +#include "oneflow/core/framework/tensor_consistent_id.h" #include "oneflow/core/framework/op_builder.h" #include "oneflow/core/framework/id_util.h" #include "oneflow/core/functional/functional.h" @@ -249,29 +251,36 @@ Maybe FindOrCreatEagerNcclBroadcastOpExpr(Symbol } return iter->second; } +} // namespace -Maybe GetSyncedTensorIfBroadcast(const std::shared_ptr& tensor, - Symbol parallel_desc, - Symbol nd_sbp) { - Optional parallel_id; - JUST(GetDevice4CurrentProcessCtx(parallel_desc, ¶llel_id)); - if (!parallel_id.has_value()) { return tensor; } - const auto& broadcast_parallel_desc = JUST(GetBroadcastSubParallelDesc(parallel_desc, nd_sbp)); - if (broadcast_parallel_desc->parallel_num() == 1 /* no broadcast */) { return tensor; } - std::shared_ptr op_expr = - JUST(FindOrCreatEagerNcclBroadcastOpExpr(broadcast_parallel_desc)); - if (JUST(broadcast_parallel_desc->MachineId4ParallelId(0)) == GlobalProcessCtx::Rank()) { +Maybe Broadcast(const std::shared_ptr& tensor, Symbol parallel_desc) { + CHECK_OR_RETURN(parallel_desc->containing_current_rank()); + if (parallel_desc->parallel_num() == 1 /* no broadcast */) { return tensor; } + std::shared_ptr op_expr = JUST(FindOrCreatEagerNcclBroadcastOpExpr(parallel_desc)); + if (JUST(parallel_desc->MachineId4ParallelId(0)) == GlobalProcessCtx::Rank()) { // inplace. 
TensorTuple outputs{tensor}; JUST(OpInterpUtil::Dispatch(*op_expr, {tensor}, &outputs, - one::OpExprInterpContext(AttrMap{}, broadcast_parallel_desc))); + one::OpExprInterpContext(AttrMap{}, parallel_desc))); return tensor; } else { return JUST(OpInterpUtil::Dispatch( - *op_expr, {tensor}, one::OpExprInterpContext(AttrMap{}, broadcast_parallel_desc))); + *op_expr, {tensor}, one::OpExprInterpContext(AttrMap{}, parallel_desc))); } } +namespace { + +Maybe GetSyncedTensorIfBroadcast(const std::shared_ptr& tensor, + Symbol parallel_desc, + Symbol nd_sbp) { + Optional parallel_id; + JUST(GetDevice4CurrentProcessCtx(parallel_desc, &parallel_id)); + if (!parallel_id.has_value()) { return tensor; } + const auto& broadcast_parallel_desc = JUST(GetBroadcastSubParallelDesc(parallel_desc, nd_sbp)); + return Broadcast(tensor, broadcast_parallel_desc); +} + Maybe CalcPhysicalShape(Symbol consistent_tensor_meta) { const auto& opt_parallel_id = JUST(GetParallelId4CurrentProcessCtx(consistent_tensor_meta->parallel_desc())); @@ -294,9 +303,16 @@ Maybe TryReshapeTensor(const std::shared_ptr& tensor, } // namespace -Maybe EagerMirroredInterpreter::ApplyImpl(const CastToConsistentOpExpr& op_expr, +Maybe EagerMirroredInterpreter::ApplyImpl(const ConsistentToConsistentOpExpr& op_expr, const TensorTuple& inputs, TensorTuple* outputs, const OpExprInterpContext& ctx) const { + OF_UNIMPLEMENTED(); +} + +namespace { + +Maybe RawLocalToConsistent(const CastToConsistentOpExpr& op_expr, const TensorTuple& inputs, + TensorTuple* outputs, const OpExprInterpContext& ctx) { std::shared_ptr input_mirrored_tensor; { CHECK_EQ_OR_RETURN(inputs.size(), 1); @@ -323,23 +339,46 @@ Maybe EagerMirroredInterpreter::ApplyImpl(const CastToConsistentOpExpr& op const auto& consistent_tensor_impl = JUST(EagerConsistentTensorImpl::New( SymbolOf(tensor_meta), device, parallel_id, input_mirrored_tensor->requires_grad(), !input_mirrored_tensor->requires_grad())); - const auto& transport_token = JUST(TransportToken::NewMetaTransportToken()); - JUST(consistent_tensor_impl->set_transport_token(transport_token)); consistent_tensor = std::make_shared(consistent_tensor_impl); - JUST(WithConsistencyChecked(consistent_tensor, [&]() -> Maybe { - if (!parallel_id.has_value()) { return Maybe::Ok(); } - const auto& reshaped_tensor = JUST(TryReshapeTensor(input_mirrored_tensor, tensor_meta)); - const auto& synced_tensor = - JUST(GetSyncedTensorIfBroadcast(reshaped_tensor, parallel_desc, nd_sbp)); + if (parallel_id.has_value()) { CHECK_EQ_OR_RETURN(dtype, input_mirrored_tensor->dtype()->data_type()); - consistent_tensor_impl->reset_cur_rank_phy_tensor(JUST(synced_tensor->AsMirroredTensor())); - return Maybe::Ok(); - })); + consistent_tensor_impl->reset_cur_rank_phy_tensor(input_mirrored_tensor); + } } outputs->at(0) = consistent_tensor; return Maybe::Ok(); } +static constexpr auto* LocalToConsistent = + DECORATE(&RawLocalToConsistent, NonRecursiveInitConsistentId); + +} // namespace + +Maybe EagerMirroredInterpreter::ApplyImpl(const CastToConsistentOpExpr& op_expr, + const TensorTuple& inputs, TensorTuple* outputs, + const OpExprInterpContext& ctx) const { + JUST(LocalToConsistent(op_expr, inputs, outputs, ctx)); + const auto& consistent_tensor = JUST(outputs->at(0)->AsConsistentTensor()); + JUST(WithConsistencyChecked(consistent_tensor, [&]() -> Maybe { + if (IsConsistentTensorMetaCheckDisabled()) { return Maybe::Ok(); } + const auto& parallel_desc = JUST(ctx.parallel_desc.value()); + const auto& parallel_id = 
JUST(GetParallelId4CurrentProcessCtx(parallel_desc)); + if (!parallel_id->has_value()) { return Maybe::Ok(); } + const auto& nd_sbp = JUST(ctx.nd_sbp.value()); + const auto& tensor_meta = JUST(consistent_tensor->consistent_tensor_meta()); + const auto& local_tensor = JUST(consistent_tensor->cur_rank_phy_tensor()); + const auto& reshaped_tensor = JUST(TryReshapeTensor(local_tensor, tensor_meta)); + const auto& synced_tensor = + JUST(GetSyncedTensorIfBroadcast(reshaped_tensor, parallel_desc, nd_sbp)); + auto* consistent_tensor_impl = + reinterpret_cast(consistent_tensor->mut_impl()); + CHECK_NOTNULL_OR_RETURN(consistent_tensor_impl); + consistent_tensor_impl->reset_cur_rank_phy_tensor(JUST(synced_tensor->AsMirroredTensor())); + return Maybe::Ok(); + })); + return Maybe::Ok(); +} + Maybe EagerMirroredInterpreter::ApplyImpl(const CastFromConsistentOpExpr& op_expr, const TensorTuple& inputs, TensorTuple* outputs, const OpExprInterpContext& ctx) const { diff --git a/oneflow/core/framework/op_interpreter/eager_mirrored_op_interpreter.h b/oneflow/core/framework/op_interpreter/eager_mirrored_op_interpreter.h index 894746df549..340e54fed9a 100644 --- a/oneflow/core/framework/op_interpreter/eager_mirrored_op_interpreter.h +++ b/oneflow/core/framework/op_interpreter/eager_mirrored_op_interpreter.h @@ -19,8 +19,14 @@ namespace oneflow { class Device; class TensorTuple; +class ParallelDesc; namespace one { + +class Tensor; + Maybe RunEmptyOp(TensorTuple* outputs); -} +Maybe Broadcast(const std::shared_ptr& tensor, Symbol parallel_desc); + +} // namespace one } // namespace oneflow diff --git a/oneflow/core/framework/op_interpreter/lazy_op_interpreter.cpp b/oneflow/core/framework/op_interpreter/lazy_op_interpreter.cpp index b6ee5267736..6f9597c483b 100644 --- a/oneflow/core/framework/op_interpreter/lazy_op_interpreter.cpp +++ b/oneflow/core/framework/op_interpreter/lazy_op_interpreter.cpp @@ -582,9 +582,6 @@ Maybe LazyInterpreter::ApplyImpl(const ConsistentToConsistentOpExpr& op_ex CHECK_OR_RETURN(input_tensor->is_lazy()); CHECK_OR_RETURN(input_tensor->is_consistent()); - bool identity_grad = JUST(ctx.attrs.GetAttr("identity_grad")); - const auto& grad_sbp_list = JUST(ctx.attrs.GetAttr>("grad_sbp")); - CHECK_OR_RETURN(ctx.parallel_desc.has_value()); const auto& parallel_desc_sym = JUST(ctx.parallel_desc.value()); CHECK_OR_RETURN(ctx.nd_sbp.has_value()); @@ -611,24 +608,22 @@ Maybe LazyInterpreter::ApplyImpl(const ConsistentToConsistentOpExpr& op_ex } // build parallel cast op expr + std::shared_ptr> sbp_list_ptr = JUST(GetNdSbpStrList(sbp_sym)); std::string grad_mode; - std::vector grad_parallel_distribution; - if (identity_grad) { - grad_mode = "identity"; - } else if (grad_sbp_list.size() > 0) { + std::vector grad_sbp_str_list; + if (op_expr.grad_nd_sbp().has_value()) { grad_mode = "manual"; - grad_parallel_distribution = grad_sbp_list; + grad_sbp_str_list = *JUST(GetNdSbpStrList(JUST(op_expr.grad_nd_sbp().value()))); } else { - grad_mode = "restore"; + grad_mode = "identity"; } - auto sbp_list_ptr = JUST(GetNdSbpStrList(sbp_sym)); std::shared_ptr parallel_cast_op_expr = JUST(OpBuilder("hierarchical_parallel_cast", "trivial_op_name") .Input("in") .Output("out") .Attr>("nd_sbp", *sbp_list_ptr) .Attr("grad_mode", grad_mode) - .Attr>("grad_nd_sbp", grad_parallel_distribution) + .Attr>("grad_nd_sbp", grad_sbp_str_list) .Build()); CHECK_EQ_OR_RETURN(op_expr.output_size(), 1); diff --git a/oneflow/core/framework/op_interpreter/op_interpreter.cpp b/oneflow/core/framework/op_interpreter/op_interpreter.cpp 
index 7481821d58f..b1eb8dbf4cb 100644 --- a/oneflow/core/framework/op_interpreter/op_interpreter.cpp +++ b/oneflow/core/framework/op_interpreter/op_interpreter.cpp @@ -57,6 +57,7 @@ Maybe EagerInterpreter::Apply(const OpExpr& op_expr, const TensorTuple& in APPLY_IF(VariableOp); APPLY_IF(CastToMirroredOp); APPLY_IF(CastFromMirroredOp); + APPLY_IF(ConsistentToConsistentOp); APPLY_IF(CastToConsistentOp); APPLY_IF(CastFromConsistentOp); APPLY_IF(DistributeSplitOp); diff --git a/oneflow/core/framework/tensor_consistent_id.cpp b/oneflow/core/framework/tensor_consistent_id.cpp index 74192f153c7..f76433f1859 100644 --- a/oneflow/core/framework/tensor_consistent_id.cpp +++ b/oneflow/core/framework/tensor_consistent_id.cpp @@ -28,10 +28,9 @@ int64_t* MutThreadLocalRecursiveDepth() { } Maybe InitConsistentId(TensorTuple* outputs) { - for (int i = 0; i < outputs->size(); ++i) { - const auto& consistent_tensor = std::dynamic_pointer_cast(outputs->at(i)); - CHECK_OR_RETURN(consistent_tensor) - << Error::Unimplemented() << "consistent tensors suppported only."; + for (const auto& output : *outputs) { + CHECK_OR_RETURN(output); + const auto& consistent_tensor = JUST(output->AsConsistentTensor()); const auto& transport_token = JUST(TransportToken::NewMetaTransportToken()); JUST(consistent_tensor->mut_impl()->set_transport_token(transport_token)); } diff --git a/oneflow/core/framework/tensor_consistent_id.h b/oneflow/core/framework/tensor_consistent_id.h index fc35a548898..19c8f0fd53b 100644 --- a/oneflow/core/framework/tensor_consistent_id.h +++ b/oneflow/core/framework/tensor_consistent_id.h @@ -30,16 +30,15 @@ Maybe InitConsistentId(TensorTuple* outputs); template struct NonRecursiveInitConsistentId; -template -struct NonRecursiveInitConsistentId { - static_assert(is_maybe::value, ""); - template - static RetT Call(Arg0 arg0, Arg1 arg1, TensorTuple* outputs, Args... args) { +template +struct NonRecursiveInitConsistentId, Arg0, Arg1, TensorTuple*, Args...> { + template (*func)(Arg0, Arg1, TensorTuple*, Args...)> + static Maybe Call(Arg0 arg0, Arg1 arg1, TensorTuple* outputs, Args... 
args) { auto* recursive_depth = MutThreadLocalRecursiveDepth(); ++*recursive_depth; - RetT ret = func(arg0, arg1, outputs, args...); + Maybe ret = func(arg0, arg1, outputs, args...); --*recursive_depth; - if (*recursive_depth == 0) { JUST(InitConsistentId(outputs)); } + if (*recursive_depth == 0 && ret.IsOk()) { JUST(InitConsistentId(outputs)); } return ret; } }; diff --git a/oneflow/core/framework/tensor_rpc_util.h b/oneflow/core/framework/tensor_rpc_util.h index f1e37246b3e..04cb9d1c5f1 100644 --- a/oneflow/core/framework/tensor_rpc_util.h +++ b/oneflow/core/framework/tensor_rpc_util.h @@ -40,6 +40,10 @@ Maybe RunCallback(const std::shared_ptr& tensor, } // namespace private_details +inline bool IsConsistentTensorMetaCheckDisabled() { + return *private_details::MutThreadLocalDepth() > 1; +} + template struct CheckConsistentTensorMeta; @@ -60,6 +64,11 @@ struct CheckConsistentTensorMeta&, Args } }; +struct DisableCheckConsistentTensorMetaScope final { + DisableCheckConsistentTensorMetaScope() { ++*private_details::MutThreadLocalDepth(); } + ~DisableCheckConsistentTensorMetaScope() { --*private_details::MutThreadLocalDepth(); } +}; + static constexpr auto* WithConsistencyChecked = DECORATE(&private_details::RunCallback, CheckConsistentTensorMeta); diff --git a/oneflow/core/functional/functional_api.yaml b/oneflow/core/functional/functional_api.yaml index 0eebf46d75e..35b00c6d708 100644 --- a/oneflow/core/functional/functional_api.yaml +++ b/oneflow/core/functional/functional_api.yaml @@ -959,7 +959,7 @@ bind_python: True - name: "to_consistent" - signature: "Tensor ToConsistent(Tensor x, *, Placement placement, SbpList sbp, Bool identity_grad=False, SbpList grad_sbp)" + signature: "Tensor ToConsistent(Tensor x, *, Placement placement, SbpList sbp, SbpList grad_sbp)" bind_python: True - name: "to_local" @@ -970,6 +970,10 @@ signature: "Tensor AllReduce(Tensor x)" bind_python: True +- name: "broadcast" + signature: "Tensor Broadcast(Tensor x)" + bind_python: True + - name: "select_first" signature: "Tensor SelectFirst(TensorTuple inputs)" bind_python: True @@ -1037,6 +1041,18 @@ SbpList sbp_tuple, DataType dtype=None, Generator generator=None)" bind_python: True +- name: "randint" + signature: "Tensor RandInt(Int64 low, Int64 high, *, Shape shape, DataType dtype=None, Device device=None, Generator generator=None)" + bind_python: True + +- name: "consistent_randint" + signature: "Tensor ConsistentRandInt(Int64 low, Int64 high, *, Shape shape, Placement placement, SbpList sbp_tuple, DataType dtype=None, Generator generator=None)" + bind_python: True + +- name: "randperm" + signature: "Tensor RandPerm(Int32 n, *, DataType dtype=kInt64, Device device=None, Generator generator=None)" + bind_python: True + - name: "scalar_fmod" signature: "Tensor ScalarFMod(Tensor in, Scalar scalar)" bind_python: False @@ -1057,17 +1073,6 @@ signature: "Tensor L2NormalizeGrad(Tensor dy, Tensor y, Tensor square_x_sum, Int32 axis, Float epsilon, *)" bind_python: False -- name: "randint" - signature: "Tensor Randint(Int64 low,Int64 high,Shape shape, *,Device device=None,Generator generator=None)" - bind_python: True - -- name: "consistent_randint" - signature: "Tensor ConsistentRandint(Int64 low,Int64 high,Shape shape, *,Placement placement, SbpList sbp_tuple, Generator generator=None)" - -- name: "randperm" - signature: "Tensor Randperm(Int32 n,*,Device device=None, Generator generator=None)" - bind_python: True - - name: "consistent_randperm" signature: "Tensor ConsistentRandperm(Int32 n,*, Placement 
placement, SbpList sbp_tuple, Generator generator=None)" bind_python: True diff --git a/oneflow/core/functional/impl/comm_functor.cpp b/oneflow/core/functional/impl/comm_functor.cpp index 46572ccd7ab..7abd3041b58 100644 --- a/oneflow/core/functional/impl/comm_functor.cpp +++ b/oneflow/core/functional/impl/comm_functor.cpp @@ -18,6 +18,7 @@ limitations under the License. #include "oneflow/core/framework/op_builder.h" #include "oneflow/core/framework/op_expr.h" #include "oneflow/core/framework/op_interpreter/op_interpreter_util.h" +#include "oneflow/core/framework/op_interpreter/eager_mirrored_op_interpreter.h" #include "oneflow/core/framework/tensor.h" #include "oneflow/core/framework/tensor_tuple.h" #include "oneflow/core/functional/functional.h" @@ -33,6 +34,19 @@ namespace one { namespace functional { namespace impl { +class BroadcastFunctor { + public: + BroadcastFunctor() = default; + Maybe operator()(const std::shared_ptr& x) const { + const auto& rank_group = JUST(RankGroupScope::CurrentRankGroup()); + std::string device_type_str = JUST(x->device())->type(); + CHECK_OR_RETURN(device_type_str == "cuda" || device_type_str == "cpu"); + DeviceType device_type = device_type_str == "cuda" ? DeviceType::kGPU : DeviceType::kCPU; + const auto& parallel_desc = JUST(RankGroup::GetDefaultParallelDesc(device_type, rank_group)); + return one::Broadcast(x, parallel_desc); + } +}; + class AllReduceFunctor { public: AllReduceFunctor() = default; @@ -75,7 +89,10 @@ class AllReduceFunctor { }; } // namespace impl -ONEFLOW_FUNCTION_LIBRARY(m) { m.add_functor("AllReduce"); }; +ONEFLOW_FUNCTION_LIBRARY(m) { + m.add_functor("AllReduce"); + m.add_functor("Broadcast"); +}; } // namespace functional } // namespace one diff --git a/oneflow/core/functional/impl/consistent_cast.cpp b/oneflow/core/functional/impl/consistent_cast.cpp index 30a0e312f98..6bd3392b4cb 100644 --- a/oneflow/core/functional/impl/consistent_cast.cpp +++ b/oneflow/core/functional/impl/consistent_cast.cpp @@ -30,6 +30,7 @@ limitations under the License. #include "oneflow/core/job/global_for.h" #include "oneflow/core/job/resource_desc.h" #include "oneflow/core/job/rank_group_scope.h" +#include "oneflow/core/job/lazy_mode.h" #include "oneflow/core/framework/transport_token.h" #include "oneflow/core/framework/transport_util.h" #include "oneflow/core/framework/placement_sbp_util.h" @@ -38,6 +39,7 @@ limitations under the License. 
#include "oneflow/core/common/container_util.h" #include "oneflow/core/common/balanced_splitter.h" #include "oneflow/core/common/decorator.h" +#include "oneflow/core/common/optional.h" #include "oneflow/core/ccl/ccl.h" namespace oneflow { @@ -201,49 +203,37 @@ Maybe GetLogicalShapeAndDataType(Shape* logical_shape, DataType* /* in and return Maybe::Ok(); } -Maybe MakeNdSbpOpExpr(const std::vector>& sbp_parallels) { - return OpBuilder("hierarchical_parallel_cast", *JUST(UniqueStr("hierarchical_parallel_cast"))) - .Input("in") - .Output("out") - .Attr>("nd_sbp", *JUST(GetNdSbpStrList(sbp_parallels))) - .Attr("grad_mode", "restore") - .Attr>("grad_nd_sbp", std::vector()) - .Build(); -} - -auto* CachedNdSbpOpExpr = DECORATE(&MakeNdSbpOpExpr, ThreadLocalCopiable); - -Maybe ConsistentToConsistent(const std::shared_ptr& x, - Symbol parallel_desc, - const std::vector>& sbp_parallels) { - const auto& consistent_tensor = JUST(x->AsConsistentTensor()); - CHECK_NOTNULL_OR_RETURN(consistent_tensor) << "consistent tensors supported only"; - CHECK_OR_RETURN(consistent_tensor->is_eager()) << "eager tensors supported only"; - const auto& nd_sbp_cast_op_expr = JUST(CachedNdSbpOpExpr(sbp_parallels)); +namespace { - const auto& ret = - JUST(OpInterpUtil::Dispatch(*nd_sbp_cast_op_expr, {consistent_tensor})); - return ret; +Maybe RawGetConsistentToConsistentOpExpr( + const std::vector>& grad_sbp_parallels) { + Optional> grad_nd_sbp; + if (!grad_sbp_parallels.empty()) { grad_nd_sbp = JUST(GetNdSbp(grad_sbp_parallels)); } + std::shared_ptr op_expr = JUST(one::ConsistentToConsistentOpExpr::New(grad_nd_sbp)); + return op_expr; } -Maybe LazyConsistentToConsistent( - const std::shared_ptr& x, Symbol parallel_desc, - const std::vector>& sbp_parallels, bool identity_grad, - const std::vector>& grad_sbp_parallels, - const std::shared_ptr& op) { - CHECK_OR_RETURN(x->is_lazy()); - CHECK_OR_RETURN(x->is_consistent()); - - Symbol parallel_distribution = JUST(GetNdSbp(sbp_parallels)); - std::vector grad_parallel_distribution = *JUST(GetNdSbpStrList(grad_sbp_parallels)); +} // namespace - MutableAttrMap attrs; - JUST(attrs.SetAttr("identity_grad", identity_grad)); - JUST(attrs.SetAttr>("grad_sbp", grad_parallel_distribution)); +static constexpr auto* GetConsistentToConsistentOpExpr = + DECORATE(&RawGetConsistentToConsistentOpExpr, ThreadLocalCopiable); - const auto& output = JUST(OpInterpUtil::Dispatch( - *op, {x}, OpExprInterpContext(attrs, parallel_desc, parallel_distribution))); - return output; +Maybe ConsistentToConsistent( + const std::shared_ptr& x, Symbol parallel_desc, + const std::vector>& sbp_parallels, + const std::vector>& grad_sbp_parallels) { + const auto& consistent_tensor = JUST(x->AsConsistentTensor()); + CHECK_NOTNULL_OR_RETURN(consistent_tensor) << "consistent tensors supported only"; + const auto& op = JUST(GetConsistentToConsistentOpExpr(grad_sbp_parallels)); + const auto& nd_sbp = JUST(GetNdSbp(sbp_parallels)); + const auto& tensor = JUST(OpInterpUtil::Dispatch( + *op, {consistent_tensor}, OpExprInterpContext(AttrMap{}, parallel_desc, nd_sbp))); + if (!LazyMode::is_enabled() && tensor != x) { + const auto& input_consistent_id = JUST(x->transport_token()); + const auto& output_consistend_id = JUST(tensor->transport_token()); + CHECK_NE_OR_RETURN(input_consistent_id, output_consistend_id); + } + return tensor; } Maybe LocalToConsistent(const std::shared_ptr& x, @@ -293,30 +283,23 @@ class ToConsistentFunctor { ToConsistentFunctor() { local_to_consistent_op_ = 
CHECK_JUST(one::CastToConsistentOpExpr::New(*CHECK_JUST(UniqueStr("cast_to_consistent")))); - consistent_to_consistent_op_ = CHECK_JUST( - one::ConsistentToConsistentOpExpr::New(*CHECK_JUST(UniqueStr("consistent_to_consistent")))); } Maybe operator()(const std::shared_ptr& x, Symbol parallel_desc, const std::vector>& sbp_parallels, - bool identity_grad, const std::vector>& grad_sbp_parallels) const { + std::shared_ptr tensor; if (x->is_consistent()) { - if (x->is_lazy()) { - return JUST(LazyConsistentToConsistent(x, parallel_desc, sbp_parallels, identity_grad, - grad_sbp_parallels, consistent_to_consistent_op_)); - } else { - return JUST(ConsistentToConsistent(x, parallel_desc, sbp_parallels)); - } + tensor = JUST(ConsistentToConsistent(x, parallel_desc, sbp_parallels, grad_sbp_parallels)); } else { - return JUST(LocalToConsistent(x, parallel_desc, sbp_parallels, local_to_consistent_op_)); + tensor = JUST(LocalToConsistent(x, parallel_desc, sbp_parallels, local_to_consistent_op_)); } + return tensor; } private: std::shared_ptr local_to_consistent_op_; - std::shared_ptr consistent_to_consistent_op_; }; class ConsistentToLocalFunctor { diff --git a/oneflow/core/functional/impl/random_functor.cpp b/oneflow/core/functional/impl/random_functor.cpp index 612467a9d3b..52d06dbbf70 100644 --- a/oneflow/core/functional/impl/random_functor.cpp +++ b/oneflow/core/functional/impl/random_functor.cpp @@ -252,17 +252,28 @@ class ConsistentRandNFunctor { private: std::shared_ptr op_; }; -class RandintFunctor { +class RandIntFunctor { public: - RandintFunctor() { randint_op_ = CHECK_JUST(one::OpBuilder("randint").Output("out").Build()); } + RandIntFunctor() { op_ = CHECK_JUST(one::OpBuilder("uniform").Output("out").Build()); } Maybe operator()(const int64_t low, const int64_t high, const Shape& shape, + const Optional>& dtype, const Optional>& device, const Optional& generator) const { + DataType dtype_val = DataType::kInt64; + if (dtype.has_value()) { + dtype_val = JUST(dtype.value())->data_type(); + + if (dtype_val != DataType::kFloat && dtype_val != DataType::kDouble) { + OF_UNIMPLEMENTED() << dtype_val << " not supported in randint"; + } + } + MutableAttrMap attrs; JUST(attrs.SetAttr("shape", shape)); - JUST(attrs.SetAttr("low", low)); - JUST(attrs.SetAttr("high", high)); + JUST(attrs.SetAttr("low", low)); + JUST(attrs.SetAttr("high", high)); + JUST(attrs.SetAttr("dtype", dtype_val)); std::shared_ptr gen; if (!generator) { @@ -270,40 +281,55 @@ class RandintFunctor { } else { gen = JUST(generator.value()); } - const auto& randint_kernel_state = std::make_shared(gen); + JUST(attrs.SetAttr("seed", gen->current_seed())); + + const auto& uniform_kernel_state = std::make_shared(gen); if (device.has_value()) { Symbol device_symbol = JUST(device.value()); return OpInterpUtil::Dispatch( - *randint_op_, {}, OpExprInterpContext(attrs, device_symbol, randint_kernel_state)); + *op_, {}, OpExprInterpContext(attrs, device_symbol, uniform_kernel_state)); } else { - return OpInterpUtil::Dispatch(*randint_op_, {}, - OpExprInterpContext(attrs, randint_kernel_state)); + return OpInterpUtil::Dispatch(*op_, {}, + OpExprInterpContext(attrs, uniform_kernel_state)); } } private: - std::shared_ptr randint_op_; + std::shared_ptr op_; }; -class ConsistentRandintFunctor { + +class ConsistentRandIntFunctor { public: - ConsistentRandintFunctor() { - randint_op_ = CHECK_JUST(one::OpBuilder("randint").Output("out").Build()); - } + ConsistentRandIntFunctor() { op_ = CHECK_JUST(one::OpBuilder("uniform").Output("out").Build()); } Maybe 
operator()(const int64_t low, const int64_t high, const Shape& shape, const Symbol& placement, const std::vector>& sbp_tuple, + const Optional>& dtype, const Optional& generator) const { + DataType dtype_val = DataType::kInt64; + if (dtype.has_value()) { + dtype_val = JUST(dtype.value())->data_type(); + + if (dtype_val != DataType::kFloat && dtype_val != DataType::kDouble) { + OF_UNIMPLEMENTED() << dtype_val << " not supported in randint"; + } + } + MutableAttrMap attrs; JUST(attrs.SetAttr("shape", shape)); - JUST(attrs.SetAttr("low", low)); - JUST(attrs.SetAttr("high", high)); + JUST(attrs.SetAttr("low", low)); + JUST(attrs.SetAttr("high", high)); + JUST(attrs.SetAttr("dtype", dtype_val)); std::shared_ptr gen; if (!generator) { gen = JUST(one::DefaultAutoGenerator()); } else { gen = JUST(generator.value()); } - const auto& randint_kernel_state = std::make_shared(gen); + + JUST(attrs.SetAttr("seed", gen->current_seed())); + + const auto& uniform_kernel_state = std::make_shared(gen); if (LazyMode::is_enabled()) { std::vector nd_sbp(sbp_tuple.size()); @@ -317,11 +343,11 @@ class ConsistentRandintFunctor { const auto& nd_sbp = JUST(GetNdSbp(sbp_tuple)); return OpInterpUtil::Dispatch( - *randint_op_, {}, OpExprInterpContext(attrs, placement, nd_sbp, randint_kernel_state)); + *op_, {}, OpExprInterpContext(attrs, placement, nd_sbp, uniform_kernel_state)); } private: - std::shared_ptr randint_op_; + std::shared_ptr op_; }; class RandPermFunctor { @@ -398,14 +424,14 @@ class ConsistentRandPermFunctor { ONEFLOW_FUNCTION_LIBRARY(m) { m.add_functor("Bernoulli"); - m.add_functor("Randperm"); - m.add_functor("ConsistentRandperm"); + m.add_functor("RandPerm"); + m.add_functor("ConsistentRandPerm"); m.add_functor("Rand"); m.add_functor("ConsistentRand"); m.add_functor("RandN"); m.add_functor("ConsistentRandN"); - m.add_functor("Randint"); - m.add_functor("ConsistentRandint"); + m.add_functor("RandInt"); + m.add_functor("ConsistentRandInt"); }; } // namespace functional diff --git a/oneflow/core/functional/tensor_index.cpp b/oneflow/core/functional/tensor_index.cpp index a7d982b2a3b..7d10d27bd63 100644 --- a/oneflow/core/functional/tensor_index.cpp +++ b/oneflow/core/functional/tensor_index.cpp @@ -18,6 +18,7 @@ limitations under the License. 
#include "oneflow/core/framework/tensor.h" #include "oneflow/core/framework/device.h" #include "oneflow/core/framework/tensor_tuple.h" +#include "oneflow/core/framework/nd_sbp.h" #include "oneflow/core/functional/functional.h" #include "oneflow/core/job/sbp_parallel.h" @@ -244,8 +245,8 @@ Maybe ApplyAdvancedIndexing(const std::shared_ptr& input, if (transposed_input->is_consistent()) { const auto& placement = JUST(transposed_input->parallel_desc()); const auto& broadcast_sbp = JUST(MakeBroadcastSbpParallel()); - packed_indices = JUST(ToConsistent(packed_indices, placement, {broadcast_sbp}, - /*identity_grad=*/false, /*grad_sbp_parallels=*/{})); + packed_indices = + JUST(ToConsistent(packed_indices, placement, {broadcast_sbp}, GetNoneSbpList())); } Symbol device = JUST(transposed_input->device()); if (JUST(packed_indices->device()) != device) { diff --git a/oneflow/core/job/parallel_desc.cpp b/oneflow/core/job/parallel_desc.cpp index 81e329c40be..3a230eabd0c 100644 --- a/oneflow/core/job/parallel_desc.cpp +++ b/oneflow/core/job/parallel_desc.cpp @@ -155,7 +155,8 @@ Maybe> ParallelDesc::GetDevice4CurrentProcessCtx( int64_t machine_id = 0; int64_t device_id = 0; GlobalProcessCtx::GetCurrentMachineIdAndDeviceId(&machine_id, &device_id); - const auto& device = JUST(Device::ThreadLocalGetOrNew(device_tag(), device_id)); + const auto& device = + JUST(Device::ThreadLocalGetOrNew(Device::Type4DeviceTag(device_tag()), device_id)); int64_t parallel_id_val = -1; if (TryGetParallelId(machine_id, device_id, ¶llel_id_val)) { *parallel_id = parallel_id_val; @@ -397,4 +398,58 @@ bool IsMirroredParallelContext(const ParallelContext& parallel_ctx) { return false; } +namespace private_details { + +Maybe> RawReplaceDeviceType(Symbol parallel_desc, + DeviceType device_type) { + ParallelConf parallel_conf(parallel_desc->parallel_conf()); + parallel_conf.set_device_tag(*JUST(DeviceTag4DeviceType(device_type))); + return SymbolOf(ParallelDesc(parallel_conf)); +} + +Maybe RawPlacementToString(Symbol placement) { + std::string device_type = placement->device_tag() == "gpu" ? 
"\"cuda\"" : "\"cpu\""; + std::vector sorted_node_ids; + HashMap> node_id2sorted_dev_phy_ids; + for (int64_t machine_id : placement->sorted_machine_ids()) { + int64_t node_id = GlobalProcessCtx::NodeId(machine_id); + if (!std::count(sorted_node_ids.begin(), sorted_node_ids.end(), node_id)) { + sorted_node_ids.push_back(node_id); + } + for (int64_t device_id : placement->sorted_dev_phy_ids(machine_id)) { + node_id2sorted_dev_phy_ids[node_id].push_back(device_id); + } + } + std::string machine_device_ids = "{"; + int64_t node_idx = 0; + for (int64_t node_id : sorted_node_ids) { + std::string device_name = std::to_string(node_id) + " : ["; + int64_t device_idx = 0; + for (int64_t device_id : node_id2sorted_dev_phy_ids.at(node_id)) { + device_name += std::to_string(device_id); + if (++device_idx != node_id2sorted_dev_phy_ids.at(node_id).size()) { device_name += ", "; } + } + device_name += "]"; + if (++node_idx != sorted_node_ids.size()) { device_name += ", "; } + machine_device_ids += device_name; + } + machine_device_ids += "}"; + std::string hierarchy = "("; + int32_t hierarchy_dim_idx = 0; + for (int64_t dim : placement->hierarchy()->dim_vec()) { + hierarchy += std::to_string(dim); + if (++hierarchy_dim_idx != placement->hierarchy()->dim_vec().size()) { + hierarchy += ", "; + } else if (placement->hierarchy()->dim_vec().size() == 1) { + hierarchy += ","; + } + } + hierarchy += ")"; + std::string placement_str = "oneflow.placement(device_type=" + device_type + + ", machine_device_ids=" + machine_device_ids + + ", hierarchy=" + hierarchy + ")"; + return placement_str; +} + +} // namespace private_details } // namespace oneflow diff --git a/oneflow/core/job/parallel_desc.h b/oneflow/core/job/parallel_desc.h index 8575bd3daa7..5756e17f55c 100644 --- a/oneflow/core/job/parallel_desc.h +++ b/oneflow/core/job/parallel_desc.h @@ -169,6 +169,19 @@ ParallelConf GenParallelConfOfCpuZeroOnAllMachines(); bool IsMirroredParallelContext(const ParallelContext& parallel_ctx); +namespace private_details { + +Maybe> RawReplaceDeviceType(Symbol, DeviceType); +Maybe RawPlacementToString(Symbol placement); + +} // namespace private_details + +static constexpr auto* ReplaceDeviceType = + DECORATE(&private_details::RawReplaceDeviceType, ThreadLocal); + +static constexpr auto* PlacementToString = + DECORATE(&private_details::RawPlacementToString, ThreadLocal); + } // namespace oneflow namespace std { diff --git a/oneflow/core/job/sbp_parallel.cpp b/oneflow/core/job/sbp_parallel.cpp index 0ab616e9cb9..97f36037320 100644 --- a/oneflow/core/job/sbp_parallel.cpp +++ b/oneflow/core/job/sbp_parallel.cpp @@ -192,24 +192,6 @@ std::string SbpParallelToString(const cfg::SbpParallel& sbp_parallel) { return sbp_str; } -std::string NdSbpToString(const Symbol nd_sbp) { - static HashMap, std::string>* nd_sbp2str = - new HashMap, std::string>(); - auto iter = nd_sbp2str->find(nd_sbp); - if (iter == nd_sbp2str->end()) { - std::stringstream nd_sbp_str; - nd_sbp_str << "["; - int32_t idx = 0; - for (const auto& sbp_parallel : nd_sbp->sbp_parallel()) { - nd_sbp_str << SbpParallelToString(sbp_parallel); - if (++idx != nd_sbp->sbp_parallel_size()) { nd_sbp_str << ", "; } - } - nd_sbp_str << "]"; - iter = nd_sbp2str->emplace(nd_sbp, nd_sbp_str.str()).first; - } - return iter->second; -} - void SbpSignatureToNdSbpSignature(const cfg::SbpSignature& sbp_signature, cfg::NdSbpSignature* nd_sbp_signature) { for (const auto& pair : sbp_signature.bn_in_op2sbp_parallel()) { diff --git a/oneflow/core/job/sbp_parallel.h 
b/oneflow/core/job/sbp_parallel.h index 7080db25fe2..ac8a909cc3c 100644 --- a/oneflow/core/job/sbp_parallel.h +++ b/oneflow/core/job/sbp_parallel.h @@ -53,7 +53,6 @@ void SortSbpSignatureListByCopyCost( bool IsValidSbpParallelString(const std::string& sbp_str); bool ParseSbpParallelFromString(const std::string& sbp_str, cfg::SbpParallel* sbp_parallel); std::string SbpParallelToString(const cfg::SbpParallel& sbp_parallel); -std::string NdSbpToString(const Symbol nd_sbp); void SbpSignatureToNdSbpSignature(const cfg::SbpSignature& sbp_signature, cfg::NdSbpSignature* nd_sbp_signature); diff --git a/oneflow/core/job_rewriter/quantization_aware_training.cpp b/oneflow/core/job_rewriter/quantization_aware_training.cpp index 97ffaa44729..be5746146f3 100644 --- a/oneflow/core/job_rewriter/quantization_aware_training.cpp +++ b/oneflow/core/job_rewriter/quantization_aware_training.cpp @@ -216,13 +216,13 @@ Maybe InsertQuantOpAfterInt8Ops4QatConfig(const QatConfig& qat_config) { user_op::UserOpConfWrapper MultiplyOp(const std::string& name, const std::string& x, const std::string& y, const int64_t scope_symbol_id, OpConfMap* inserted_ops) { - const auto op_wrapper = user_op::UserOpConfWrapperBuilder(name) - .Op("broadcast_mul") - .Input("x", x) - .Input("y", y) - .Output("z") - .ScopeSymbolId(scope_symbol_id) - .Build(); + auto op_wrapper = user_op::UserOpConfWrapperBuilder(name) + .Op("broadcast_mul") + .Input("x", x) + .Input("y", y) + .Output("z") + .ScopeSymbolId(scope_symbol_id) + .Build(); (*inserted_ops)[name] = op_wrapper.op_conf(); return op_wrapper; } diff --git a/oneflow/core/kernel/user_kernel.cpp b/oneflow/core/kernel/user_kernel.cpp index ed0803ec170..71a4d4833e7 100644 --- a/oneflow/core/kernel/user_kernel.cpp +++ b/oneflow/core/kernel/user_kernel.cpp @@ -524,7 +524,7 @@ class UserKernelComputeContext final : public user_op::KernelComputeContext { const JobDesc& job_desc) : user_op_conf_(kernel_conf.op_attribute().op_conf()), device_ctx_(device_ctx), - base_ctx_(std::move(UserKernelBaseContext(kernel_conf, job_desc))) { + base_ctx_(kernel_conf, job_desc) { auto InitInOrOut = [&](const PbMap& arg_map) { for (const auto& it : arg_map) { const std::string& arg_name = it.first; diff --git a/oneflow/core/platform/include/pthread_fork.h b/oneflow/core/platform/include/pthread_fork.h new file mode 100644 index 00000000000..bdc91ce8de9 --- /dev/null +++ b/oneflow/core/platform/include/pthread_fork.h @@ -0,0 +1,29 @@ +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ +#ifndef ONEFLOW_CORE_PLATFORM_INCLUDE_PTHREAD_FORK_H_ +#define ONEFLOW_CORE_PLATFORM_INCLUDE_PTHREAD_FORK_H_ + +namespace oneflow { + +namespace pthread_fork { + +bool IsForkedSubProcess(); + +} // namespace pthread_fork + +} // namespace oneflow + +#endif // ONEFLOW_CORE_PLATFORM_INCLUDE_PTHREAD_FORK_H_ diff --git a/oneflow/core/platform/lib/pthread_fork.cpp b/oneflow/core/platform/lib/pthread_fork.cpp new file mode 100644 index 00000000000..732383cbb8b --- /dev/null +++ b/oneflow/core/platform/lib/pthread_fork.cpp @@ -0,0 +1,33 @@ +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#include "oneflow/core/platform/include/pthread_fork.h" +#include "oneflow/core/common/util.h" + +namespace oneflow { + +namespace pthread_fork { + +static bool is_fork = false; + +bool IsForkedSubProcess() { return is_fork; } +static void SetIsForkedSubProcess() { is_fork = true; } + +void RegisterForkCallback() { pthread_atfork(nullptr, nullptr, SetIsForkedSubProcess); } +COMMAND(RegisterForkCallback()); + +} // namespace pthread_fork + +} // namespace oneflow diff --git a/oneflow/core/vm/virtual_machine.cpp b/oneflow/core/vm/virtual_machine.cpp index 4a9e7e5fbc6..ef9e5b9bc1c 100644 --- a/oneflow/core/vm/virtual_machine.cpp +++ b/oneflow/core/vm/virtual_machine.cpp @@ -24,6 +24,7 @@ limitations under the License. #include "oneflow/core/common/spin_counter.h" #include "oneflow/core/framework/device.h" #include "oneflow/core/job/parallel_desc.h" +#include "oneflow/core/platform/include/pthread_fork.h" namespace oneflow { namespace vm { @@ -579,6 +580,10 @@ void VirtualMachine::__Init__(const VmDesc& vm_desc, ObjectMsgAllocator* allocat int64_t InstructionMaxRunningSeconds() { return 60 * 5; } Maybe VirtualMachine::Receive(InstructionMsgList* compute_instr_msg_list) { + CHECK_OR_RETURN(!pthread_fork::IsForkedSubProcess()) + << "Cannot run OneFlow in forked subprocess. 
Please add " + "'multiprocessing.set_start_method(\"spawn\")' in '__main__' if you are using Python's " + "multiprocessing"; InstructionMsgList new_instr_msg_list; OBJECT_MSG_LIST_FOR_EACH_PTR(compute_instr_msg_list, compute_instr_msg) { if (!compute_instr_msg->phy_instr_operand()) { diff --git a/oneflow/user/kernels/adaptive_pool_cpu_kernel.cpp b/oneflow/user/kernels/adaptive_pool_cpu_kernel.cpp index cee413a18d1..a74f73ea0df 100644 --- a/oneflow/user/kernels/adaptive_pool_cpu_kernel.cpp +++ b/oneflow/user/kernels/adaptive_pool_cpu_kernel.cpp @@ -44,7 +44,7 @@ void AvgForwardCompute(user_op::KernelComputeContext* ctx, const int32_t& dim) { const Shape& x_shape = ctx->TensorDesc4ArgNameAndIndex("x", 0)->shape(); const Shape& y_shape = ctx->TensorDesc4ArgNameAndIndex("y", 0)->shape(); - // TODO: Support 'channels_last' + // TODO (Tianyu): Support 'channels_last' std::string data_format = "channels_first"; const Shape& in = GetShape5D(x_shape, data_format, dim); const Shape& out = GetShape5D(y_shape, data_format, dim); @@ -100,7 +100,7 @@ void AvgBackwardCompute(user_op::KernelComputeContext* ctx, const int32_t& dim) const Shape& dx_shape = ctx->TensorDesc4ArgNameAndIndex("dx", 0)->shape(); const Shape& dy_shape = ctx->TensorDesc4ArgNameAndIndex("dy", 0)->shape(); - // TODO: Support 'channels_last' + // TODO (Tianyu): Support 'channels_last' std::string data_format = "channels_first"; const Shape& in = GetShape5D(dx_shape, data_format, dim); const Shape& out = GetShape5D(dy_shape, data_format, dim); @@ -234,9 +234,7 @@ class AdaptivePool3DCpuGradKernel final : public user_op::OpKernel { #define REGISTER_ADAPTIVE_POOL_KERNEL_WITH_DEVICE(device) \ REGISTER_ADAPTIVE_POOL_KERNEL(device, float) \ REGISTER_ADAPTIVE_POOL_KERNEL(device, double) \ - REGISTER_ADAPTIVE_POOL_KERNEL(device, int8_t) \ - REGISTER_ADAPTIVE_POOL_KERNEL(device, int32_t) \ - REGISTER_ADAPTIVE_POOL_KERNEL(device, int64_t) + REGISTER_ADAPTIVE_POOL_KERNEL(device, int) REGISTER_ADAPTIVE_POOL_KERNEL_WITH_DEVICE(DeviceType::kCPU) @@ -257,9 +255,7 @@ REGISTER_ADAPTIVE_POOL_KERNEL_WITH_DEVICE(DeviceType::kCPU) #define REGISTER_ADAPTIVE_POOL_BACKWARD_KERNEL_WITH_DEVICE(device) \ REGISTER_ADAPTIVE_POOL_BACKWARD_KERNEL(device, float) \ REGISTER_ADAPTIVE_POOL_BACKWARD_KERNEL(device, double) \ - REGISTER_ADAPTIVE_POOL_BACKWARD_KERNEL(device, int8_t) \ - REGISTER_ADAPTIVE_POOL_BACKWARD_KERNEL(device, int32_t) \ - REGISTER_ADAPTIVE_POOL_BACKWARD_KERNEL(device, int64_t) + REGISTER_ADAPTIVE_POOL_BACKWARD_KERNEL(device, int) REGISTER_ADAPTIVE_POOL_BACKWARD_KERNEL_WITH_DEVICE(DeviceType::kCPU) } // namespace oneflow diff --git a/oneflow/user/kernels/adaptive_pool_gpu_kernel.cu b/oneflow/user/kernels/adaptive_pool_gpu_kernel.cu index 4beea9e79de..6d4dff01f7b 100644 --- a/oneflow/user/kernels/adaptive_pool_gpu_kernel.cu +++ b/oneflow/user/kernels/adaptive_pool_gpu_kernel.cu @@ -18,6 +18,7 @@ limitations under the License. 
#include "oneflow/core/kernel/kernel_util.cuh" #include "oneflow/core/common/data_type.h" #include "oneflow/core/kernel/util/cuda_half_util.h" +#include "oneflow/core/cuda/atomic.cuh" #include "oneflow/core/operator/operator_util.h" #include "oneflow/user/utils/pool_util.h" @@ -59,6 +60,7 @@ __global__ void AdaptiveAvgPoolCudaKernel(const T* input, T* output, int num_ele const int in_panel_size = in_d * in_h * in_w; CUDA_1D_KERNEL_LOOP(idx, num_elems) { + // TODO (Tianyu): Replace following codes with 'NdIndexOffsetHelper' int bc_idx = idx / out_panel_size; int out_d_idx = (idx % out_panel_size) / out_w / out_h; int out_h_idx = (idx % out_panel_size) % (out_h * out_w) / out_w; @@ -100,6 +102,7 @@ __global__ void AdaptiveAvgPoolGradCudaKernel(T* input, const T* output, int num const int in_panel_size = in_d * in_h * in_w; CUDA_1D_KERNEL_LOOP(idx, num_elems) { + // TODO (Tianyu): Replace following codes with 'NdIndexOffsetHelper' int bc_idx = idx / out_panel_size; int out_d_idx = (idx % out_panel_size) / out_w / out_h; int out_h_idx = (idx % out_panel_size) % (out_h * out_w) / out_w; @@ -122,7 +125,10 @@ __global__ void AdaptiveAvgPoolGradCudaKernel(T* input, const T* output, int num input + bc_idx * in_panel_size + in_start_d * in_h * in_w + in_start_h * in_w + in_start_w; for (int id = 0; id < k_d; ++id) { for (int ih = 0; ih < k_h; ++ih) { - for (int iw = 0; iw < k_w; ++iw) { *(input_ptr + ih * in_w + iw) += grad_delta; } + for (int iw = 0; iw < k_w; ++iw) { + // TODO (Tianyu): Use 'atmoic::Add' when necessary + cuda::atomic::Add(input_ptr + ih * in_w + iw, grad_delta); + } } input_ptr += in_h * in_w; // next input depth } @@ -139,7 +145,7 @@ void AvgForwardCompute(KernelComputeContext* ctx, const int32_t& dim) { const Shape& x_shape = ctx->TensorDesc4ArgNameAndIndex("x", 0)->shape(); const Shape& y_shape = ctx->TensorDesc4ArgNameAndIndex("y", 0)->shape(); - // TODO: Support 'channels_last' + // TODO (Tianyu): Support 'channels_last' std::string data_format = "channels_first"; const Shape& in = GetShape5D(x_shape, data_format, dim); const Shape& out = GetShape5D(y_shape, data_format, dim); @@ -160,7 +166,7 @@ void AvgBackwardCompute(KernelComputeContext* ctx, const int32_t& dim) { const Shape& dx_shape = ctx->TensorDesc4ArgNameAndIndex("dx", 0)->shape(); const Shape& dy_shape = ctx->TensorDesc4ArgNameAndIndex("dy", 0)->shape(); - // TODO: Support 'channels_last' + // TODO (Tianyu): Support 'channels_last' std::string data_format = "channels_first"; const Shape& in = GetShape5D(dx_shape, data_format, dim); const Shape& out = GetShape5D(dy_shape, data_format, dim); @@ -258,9 +264,7 @@ class GpuAdaptiveAvgPool3dGradKernel final : public OpKernel { REGISTER_GPU_ADAPTIVE_AVGPOOL_KERNEL(DeviceType::kGPU, float); REGISTER_GPU_ADAPTIVE_AVGPOOL_KERNEL(DeviceType::kGPU, double); -REGISTER_GPU_ADAPTIVE_AVGPOOL_KERNEL(DeviceType::kGPU, int8_t); -REGISTER_GPU_ADAPTIVE_AVGPOOL_KERNEL(DeviceType::kGPU, int32_t); -REGISTER_GPU_ADAPTIVE_AVGPOOL_KERNEL(DeviceType::kGPU, int64_t); +REGISTER_GPU_ADAPTIVE_AVGPOOL_KERNEL(DeviceType::kGPU, int); #define REGISTER_GPU_ADAPTIVE_AVGPOOL_BACKWARD_KERNEL(device, dtype) \ REGISTER_USER_KERNEL("adaptive_avg_pool1d_grad") \ @@ -278,9 +282,7 @@ REGISTER_GPU_ADAPTIVE_AVGPOOL_KERNEL(DeviceType::kGPU, int64_t); REGISTER_GPU_ADAPTIVE_AVGPOOL_BACKWARD_KERNEL(DeviceType::kGPU, float); REGISTER_GPU_ADAPTIVE_AVGPOOL_BACKWARD_KERNEL(DeviceType::kGPU, double); -REGISTER_GPU_ADAPTIVE_AVGPOOL_BACKWARD_KERNEL(DeviceType::kGPU, int8_t); 
-REGISTER_GPU_ADAPTIVE_AVGPOOL_BACKWARD_KERNEL(DeviceType::kGPU, int32_t); -REGISTER_GPU_ADAPTIVE_AVGPOOL_BACKWARD_KERNEL(DeviceType::kGPU, int64_t); +REGISTER_GPU_ADAPTIVE_AVGPOOL_BACKWARD_KERNEL(DeviceType::kGPU, int); } // namespace user_op diff --git a/oneflow/user/kernels/eager_nccl_kernels.cpp b/oneflow/user/kernels/eager_nccl_kernels.cpp index cf9b97e25a0..15307ffbebd 100644 --- a/oneflow/user/kernels/eager_nccl_kernels.cpp +++ b/oneflow/user/kernels/eager_nccl_kernels.cpp @@ -16,6 +16,7 @@ limitations under the License. #include "oneflow/core/framework/framework.h" #include "oneflow/core/ccl/ccl.h" #include "oneflow/core/job/parallel_desc.h" +#include "oneflow/core/control/global_process_ctx.h" namespace oneflow { @@ -57,11 +58,15 @@ class EagerCclBroadcastKernel final : public user_op::OpKernel { CHECK(kernel_state != nullptr); const user_op::Tensor* in = ctx->Tensor4ArgNameAndIndex("in", 0); user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0); - CHECK_EQ(in->shape(), out->shape()); - CHECK_EQ(in->data_type(), out->data_type()); int64_t root = ctx->Attr("root"); - CHECK_JUST(ccl::Broadcast(in->dptr(), out->mut_dptr(), in->shape().elem_cnt(), - in->data_type(), root, + const void* in_ptr = nullptr; + if (GlobalProcessCtx::Rank() == root) { + CHECK_EQ(in->shape(), out->shape()); + CHECK_EQ(in->data_type(), out->data_type()); + in_ptr = in->dptr(); + } + CHECK_JUST(ccl::Broadcast(in_ptr, out->mut_dptr(), out->shape().elem_cnt(), + out->data_type(), root, kernel_state->parallel_desc(), ctx->device_ctx())); }; bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } diff --git a/oneflow/user/kernels/eager_nccl_kernels.cu b/oneflow/user/kernels/eager_nccl_kernels.cu index a0af139844f..5c83fe7cb8f 100644 --- a/oneflow/user/kernels/eager_nccl_kernels.cu +++ b/oneflow/user/kernels/eager_nccl_kernels.cu @@ -14,6 +14,7 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "oneflow/core/common/container_util.h" +#include "oneflow/core/control/global_process_ctx.h" #include "oneflow/core/framework/framework.h" #include "oneflow/core/device/nccl_util.h" #include "oneflow/core/job/eager_nccl_comm_manager.h" @@ -97,11 +98,15 @@ class EagerNcclBroadcastKernel final : public user_op::OpKernel { CHECK(kernel_state != nullptr); const user_op::Tensor* in = ctx->Tensor4ArgNameAndIndex("in", 0); user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0); - CHECK_EQ(in->shape(), out->shape()); - CHECK_EQ(in->data_type(), out->data_type()); int64_t root = ctx->Attr("root"); - OF_NCCL_CHECK(ncclBroadcast(in->dptr(), out->mut_dptr(), in->shape().elem_cnt(), - GetNcclDataType(in->data_type()), root, kernel_state->comm(), + const void* in_ptr = nullptr; + if (GlobalProcessCtx::Rank() == root) { + CHECK_EQ(in->shape(), out->shape()); + CHECK_EQ(in->data_type(), out->data_type()); + in_ptr = in->dptr(); + } + OF_NCCL_CHECK(ncclBroadcast(in_ptr, out->mut_dptr(), out->shape().elem_cnt(), + GetNcclDataType(out->data_type()), root, kernel_state->comm(), ctx->device_ctx()->cuda_stream())); }; bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } @@ -157,7 +162,6 @@ class EagerNcclReduceScatterKernel final : public user_op::OpKernel { CHECK(kernel_state != nullptr); const user_op::Tensor* in = ctx->Tensor4ArgNameAndIndex("in", 0); user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0); - CHECK(!(in->shape() == out->shape())); CHECK_EQ(in->data_type(), out->data_type()); const auto& op_type = ctx->Attr("op_type"); OF_NCCL_CHECK(ncclReduceScatter(in->dptr(), out->mut_dptr(), out->shape().elem_cnt(), @@ -193,7 +197,6 @@ class EagerNcclAllGatherKernel final : public user_op::OpKernel { CHECK(kernel_state != nullptr); const user_op::Tensor* in = ctx->Tensor4ArgNameAndIndex("in", 0); user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0); - CHECK(!(in->shape() == out->shape())); CHECK_EQ(in->data_type(), out->data_type()); OF_NCCL_CHECK(ncclAllGather(in->dptr(), out->mut_dptr(), in->shape().elem_cnt(), GetNcclDataType(in->data_type()), kernel_state->comm(), diff --git a/oneflow/user/kernels/randint_kernel.cpp b/oneflow/user/kernels/randint_kernel.cpp deleted file mode 100644 index ac604ae8426..00000000000 --- a/oneflow/user/kernels/randint_kernel.cpp +++ /dev/null @@ -1,54 +0,0 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-*/ -#include "oneflow/user/kernels/distributions/uniform_kernel.h" -#include "oneflow/core/device/device_context.h" -#include "oneflow/core/ndarray/xpu_util.h" -namespace oneflow { - -class CpuRandintKernel final : public user_op::OpKernel { - public: - CpuRandintKernel() = default; - ~CpuRandintKernel() = default; - std::shared_ptr CreateOpKernelState( - user_op::KernelInitContext* ctx) const override { - const auto& generator = CHECK_JUST(one::MakeAutoGenerator()); - generator->set_current_seed(ctx->Attr("seed")); - return std::make_shared(generator); - } - - private: - void Compute(user_op::KernelComputeContext* ctx, user_op::OpKernelState* state) const override { - user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0); - int64_t* output = out->mut_dptr(); - auto* randint_kernel_state = dynamic_cast(state); - CHECK_NOTNULL(randint_kernel_state); - const auto& generator = randint_kernel_state->generator(); - const auto& cpu_generator = CHECK_JUST(generator->Get()); - CHECK_NOTNULL(generator); - const int64_t n = out->shape().elem_cnt(); - const int64_t low = ctx->Attr("low"); - const int64_t high = ctx->Attr("high"); - std::uniform_int_distribution dis(low, high - 1); - XPU_1D_KERNEL_LOOP(i, n) - output[i] = dis(cpu_generator->engine()); - } - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -REGISTER_USER_KERNEL("randint").SetCreateFn().SetIsMatchedHob( - (user_op::HobDeviceTag() == "cpu")); - -} // namespace oneflow diff --git a/oneflow/user/kernels/randint_kernel.cu b/oneflow/user/kernels/randint_kernel.cu deleted file mode 100644 index da6bc2644e7..00000000000 --- a/oneflow/user/kernels/randint_kernel.cu +++ /dev/null @@ -1,67 +0,0 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-*/ -#include "oneflow/core/device/device_context.h" -#include "oneflow/core/ndarray/xpu_util.h" -#include "oneflow/user/kernels/distributions/uniform_kernel.h" -#include -#include -namespace oneflow { -__global__ void GenValues(int64_t* a, const int64_t low, const int64_t high, int32_t n, - curandState* state) { - XPU_1D_KERNEL_LOOP(i, n) { - a[i] = curand(state + i) % (high - low) - + low; //@TODO:curandState only generates 32-bit random number - } -} - -class GpuRandintKernel final : public user_op::OpKernel { - public: - GpuRandintKernel() = default; - ~GpuRandintKernel() = default; - std::shared_ptr CreateOpKernelState( - user_op::KernelInitContext* ctx) const override { - const auto& generator = CHECK_JUST(one::MakeAutoGenerator()); - generator->set_current_seed(ctx->Attr("seed")); - return std::make_shared(generator); - } - - private: - void Compute(user_op::KernelComputeContext* ctx, user_op::OpKernelState* state) const override { - user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0); - int64_t* output = out->mut_dptr(); - auto* randint_kernel_state = dynamic_cast(state); - CHECK_NOTNULL(randint_kernel_state); - const auto& generator = randint_kernel_state->generator(); - const auto& gpu_generator = CHECK_JUST(generator->Get()); - CHECK_NOTNULL(generator); - - int32_t block_num = gpu_generator->max_block_num(); - int32_t thread_num = gpu_generator->max_thread_num(); - curandState* curand_states = gpu_generator->curand_states(); - - const int32_t n = out->shape().elem_cnt(); - const int64_t low = ctx->Attr("low"); - const int64_t high = ctx->Attr("high"); - GenValues<<device_ctx()->cuda_stream()>>>( - output, low, high, n, curand_states); - } - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -REGISTER_USER_KERNEL("randint").SetCreateFn().SetIsMatchedHob( - (user_op::HobDeviceTag() == "gpu")); - -} // namespace oneflow diff --git a/oneflow/user/ops/randint_op.cpp b/oneflow/user/ops/randint_op.cpp deleted file mode 100644 index 2c4e44ca14e..00000000000 --- a/oneflow/user/ops/randint_op.cpp +++ /dev/null @@ -1,62 +0,0 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-*/ - -#include "oneflow/core/framework/framework.h" -#include "oneflow/core/common/protobuf.h" -#include "oneflow/core/common/global.h" -#include "oneflow/core/job/global_for.h" - -namespace oneflow { - -Maybe InferRandintNdSbp(user_op::InferNdSbpFnContext* ctx); - -REGISTER_NO_GRAD_USER_OP("randint") - .Output("out") - .Attr("low") - .Attr("high") - .Attr("shape") - .Attr("nd_sbp") - .SetTensorDescInferFn([](user_op::InferContext* ctx) -> Maybe { - Shape* out_shape = ctx->OutputShape("out", 0); - const Shape& shape = ctx->Attr("shape"); - DimVector dim_vec; - if (shape.NumAxes() > 0) { - dim_vec.insert(dim_vec.end(), shape.dim_vec().cbegin(), shape.dim_vec().cend()); - } - *out_shape = Shape(dim_vec); - return Maybe::Ok(); - }) - .SetGetSbpFn([](user_op::SbpContext* ctx) -> Maybe { return Maybe::Ok(); }) - .SetDataTypeInferFn([](user_op::InferContext* ctx) -> Maybe { - *ctx->OutputDType("out", 0) = DataType::kInt64; - return Maybe::Ok(); - }) - .SetNdSbpInferFn(&InferRandintNdSbp); - -Maybe InferRandintNdSbp(user_op::InferNdSbpFnContext* ctx) { - cfg::NdSbp* out = ctx->NdSbp4ArgNameAndIndex("out", 0); - if (JUST(*Global, MultiClient>::Get())) { - const auto& pb_str = ctx->user_op_conf().attr("nd_sbp"); - NdSbp pb; - CHECK_OR_RETURN(TxtString2PbMessage(pb_str, &pb)); - out->InitFromProto(pb); - } else { - out->mutable_sbp_parallel()->Add()->mutable_broadcast_parallel(); - } - return Maybe::Ok(); -} - -} // namespace oneflow diff --git a/python/oneflow/__init__.py b/python/oneflow/__init__.py index 2b2dc766731..f3c2c529cff 100644 --- a/python/oneflow/__init__.py +++ b/python/oneflow/__init__.py @@ -22,7 +22,6 @@ Size = oneflow._oneflow_internal.Size device = oneflow._oneflow_internal.device placement = oneflow._oneflow_internal.placement -no_grad = oneflow._oneflow_internal.autograd.no_grad locals()["dtype"] = oneflow._oneflow_internal.dtype locals()["char"] = oneflow._oneflow_internal.char locals()["float16"] = oneflow._oneflow_internal.float16 @@ -114,6 +113,7 @@ def _SyncOnMasterFn(): register_docstr() del register_docstr del docstr +from oneflow.autograd import grad_enable, no_grad, inference_mode, is_grad_enabled import oneflow.nn.image import oneflow.nn.modules.acosh import oneflow.nn.modules.activation @@ -257,8 +257,8 @@ def _SyncOnMasterFn(): from oneflow.nn.modules.random_ops import bernoulli from oneflow.nn.modules.random_ops import rand_op as rand from oneflow.nn.modules.random_ops import randn_op as randn -from oneflow.nn.modules.random_ops import randint -from oneflow.nn.modules.random_ops import randperm +from oneflow.nn.modules.random_ops import randint_op as randint +from oneflow.nn.modules.random_ops import randperm_op as randperm from oneflow.nn.modules.reduce_ops import _max as max from oneflow.nn.modules.reduce_ops import _mean as mean from oneflow.nn.modules.reduce_ops import _min as min diff --git a/python/oneflow/autograd/__init__.py b/python/oneflow/autograd/__init__.py index 8452d5241fa..aaee33d6682 100644 --- a/python/oneflow/autograd/__init__.py +++ b/python/oneflow/autograd/__init__.py @@ -15,3 +15,9 @@ """ from oneflow.autograd.autograd import backward, grad +from oneflow.autograd.autograd_mode import ( + inference_mode, + grad_enable, + no_grad, + is_grad_enabled, +) diff --git a/python/oneflow/autograd/autograd_mode.py b/python/oneflow/autograd/autograd_mode.py new file mode 100644 index 00000000000..49fcbb0212a --- /dev/null +++ b/python/oneflow/autograd/autograd_mode.py @@ -0,0 +1,169 @@ +""" +Copyright 2020 The OneFlow Authors. All rights reserved. 
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import oneflow._oneflow_internal
+from oneflow._oneflow_internal.autograd import AutoGradMode
+
+
+def is_grad_enabled():
+    r"""
+    Returns True if grad mode is currently enabled.
+    """
+    return oneflow._oneflow_internal.autograd.is_grad_enabled()
+
+
+class inference_mode:
+    r"""
+    Context-manager that enables or disables inference mode.
+
+    InferenceMode is a new context manager analogous to no_grad to be used when you are certain
+    your operations will have no interactions with autograd (e.g., data processing or model
+    evaluation). Code run under this mode gets better performance by disabling view tracking
+    and version counter bumps.
+
+    This context manager is thread local; it will not affect computation in other threads.
+
+    Also functions as a decorator. (Make sure to instantiate with parentheses.)
+
+    Args:
+        mode (bool): Flag whether to enable or disable inference mode. (default: True)
+
+    .. code-block:: python
+
+        >>> import oneflow as flow
+        >>> x = flow.ones(2, 3, requires_grad=True)
+        >>> with flow.inference_mode():
+        ...     y = x * x
+        >>> y.requires_grad
+        False
+        >>> @flow.inference_mode()
+        ... def no_grad_func(x):
+        ...     return x * x
+        >>> y = no_grad_func(x)
+        >>> y.requires_grad
+        False
+    """
+
+    def __init__(self, mode=True):
+        self.infer_mode = mode
+
+    def __call__(self, func):
+        def wrapper(*args, **kwargs):
+            with AutoGradMode(not self.infer_mode):
+                return func(*args, **kwargs)
+
+        return wrapper
+
+    def __enter__(self):
+        self.grad_mode = AutoGradMode(not self.infer_mode)
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        pass
+
+
+class grad_enable:
+    r"""
+    Context-manager that enables gradient calculation.
+
+    Enables gradient calculation, if it has been disabled via no_grad.
+
+    This context manager is thread local; it will not affect computation in other threads.
+
+    Also functions as a decorator. (Make sure to instantiate with parentheses.)
+
+    .. code-block:: python
+
+        >>> import oneflow as flow
+        >>> x = flow.ones(2, 3, requires_grad=True)
+        >>> with flow.no_grad():
+        ...     with flow.grad_enable():
+        ...         y = x * x
+        >>> y.requires_grad
+        True
+        >>> @flow.grad_enable()
+        ... def no_grad_func(x):
+        ...     return x * x
+        >>> with flow.no_grad():
+        ...     y = no_grad_func(x)
+        >>> y.requires_grad
+        True
+    """
+
+    def __call__(self, func):
+        def wrapper(*args, **kwargs):
+            with AutoGradMode(True):
+                return func(*args, **kwargs)
+
+        return wrapper
+
+    def __enter__(self):
+        self.grad_mode = AutoGradMode(True)
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        pass
+
+
+class no_grad:
+    r"""
+    Context-manager that disables gradient calculation.
+
+    Disabling gradient calculation is useful for inference, when you are sure that
+    you will not call Tensor.backward(). It will reduce memory consumption for computations
+    that would otherwise have requires_grad=True.
+
+    In this mode, the result of every computation will have requires_grad=False, even when
+    the inputs have requires_grad=True.
+
+    This context manager is thread local; it will not affect computation in other threads.
+
+    Also functions as a decorator. (Make sure to instantiate with parentheses.)
+
+    .. code-block:: python
+
+        >>> import oneflow as flow
+        >>> x = flow.ones(2, 3, requires_grad=True)
+        >>> with flow.no_grad():
+        ...     y = x * x
+        >>> y.requires_grad
+        False
+        >>> @flow.no_grad()
+        ... def no_grad_func(x):
+        ...     return x * x
+        >>> y = no_grad_func(x)
+        >>> y.requires_grad
+        False
+    """
+
+    def __call__(self, func):
+        def wrapper(*args, **kwargs):
+            with AutoGradMode(False):
+                return func(*args, **kwargs)
+
+        return wrapper
+
+    def __enter__(self):
+        self.grad_mode = AutoGradMode(False)
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        pass
+
+
+if __name__ == "__main__":
+    import doctest
+
+    doctest.testmod(raise_on_error=True)
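Editorial note: a minimal sketch of how the three mode managers added above compose, using only names this file exports (`flow.no_grad`, `flow.grad_enable`, `flow.inference_mode`, `flow.is_grad_enabled`); the tensor values are illustrative, not taken from the patch:

```python
import oneflow as flow

x = flow.ones(2, 3, requires_grad=True)

with flow.no_grad():
    # gradient tracking is off inside no_grad
    assert not flow.is_grad_enabled()
    with flow.grad_enable():
        # grad_enable re-enables tracking inside an enclosing no_grad
        y = x * x
assert y.requires_grad


@flow.inference_mode()  # here this maps to AutoGradMode(False), like no_grad
def square(t):
    return t * t


assert not square(x).requires_grad
```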
diff --git a/python/oneflow/compatible/single_client/__init__.py b/python/oneflow/compatible/single_client/__init__.py
index 3a8db4bd5df..2501f0b39f1 100644
--- a/python/oneflow/compatible/single_client/__init__.py
+++ b/python/oneflow/compatible/single_client/__init__.py
@@ -19,7 +19,6 @@
 Size = oneflow._oneflow_internal.Size
 device = oneflow._oneflow_internal.device
 placement = oneflow._oneflow_internal.placement
-no_grad = oneflow._oneflow_internal.autograd.no_grad
 locals()["dtype"] = oneflow._oneflow_internal.dtype
 locals()["char"] = oneflow._oneflow_internal.char
 locals()["float16"] = oneflow._oneflow_internal.float16
@@ -115,6 +114,7 @@ def custom_exit(returncode):
 sys.exit = custom_exit
 del custom_exit
 del sys
+from oneflow.compatible.single_client.autograd import no_grad
 import oneflow.compatible.single_client.nn.modules.acosh
 import oneflow.compatible.single_client.nn.modules.activation
 import oneflow.compatible.single_client.nn.modules.argwhere
diff --git a/python/oneflow/compatible/single_client/autograd/__init__.py b/python/oneflow/compatible/single_client/autograd/__init__.py
index 2cd8029edd4..79b6e3b3bb6 100644
--- a/python/oneflow/compatible/single_client/autograd/__init__.py
+++ b/python/oneflow/compatible/single_client/autograd/__init__.py
@@ -15,3 +15,4 @@
 """
 from oneflow.compatible.single_client.autograd.autograd import backward, grad
+from oneflow.compatible.single_client.autograd.autograd_mode import no_grad
diff --git a/python/oneflow/compatible/single_client/autograd/autograd_mode.py b/python/oneflow/compatible/single_client/autograd/autograd_mode.py
new file mode 100644
index 00000000000..b9d0d464a43
--- /dev/null
+++ b/python/oneflow/compatible/single_client/autograd/autograd_mode.py
@@ -0,0 +1,46 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import oneflow._oneflow_internal
+from oneflow._oneflow_internal.autograd import AutoGradMode
+
+
+class no_grad(AutoGradMode):
+    r"""
+    Context-manager that disables gradient calculation.
+
+    Disabling gradient calculation is useful for inference, when you are sure that
+    you will not call Tensor.backward(). It will reduce memory consumption for computations
+    that would otherwise have requires_grad=True.
+
+    In this mode, the result of every computation will have requires_grad=False, even when
+    the inputs have requires_grad=True.
+
+    This context manager is thread local; it will not affect computation in other threads.
+
+    Also functions as a decorator. (Make sure to instantiate with parentheses.)
+
+    """
+
+    def __init__(self):
+        super().__init__(False)
+
+    def __call__(self, func):
+        def wrapper(*args, **kwargs):
+            with AutoGradMode(False):
+                return func(*args, **kwargs)
+
+        return wrapper
diff --git a/python/oneflow/framework/tensor_str.py b/python/oneflow/framework/tensor_str.py
index 6ea41fb4dca..10ecfbbc8c3 100644
--- a/python/oneflow/framework/tensor_str.py
+++ b/python/oneflow/framework/tensor_str.py
@@ -36,7 +36,9 @@ class __PrinterOptions(object):
 
 def _try_convert_to_local_tensor(tensor):
     if tensor.is_consistent:
-        tensor = tensor.to_consistent(sbp=flow.sbp.broadcast).to_local()
+        tensor = tensor.to_consistent(
+            placement=tensor.placement, sbp=flow.sbp.broadcast
+        ).to_local()
     return tensor
 
 
@@ -214,8 +216,9 @@ def _cannot_print(sbp):
         and sbp != flow.sbp.split(0)
     )
 
-    # TODO: delete it when s1->b is ready
+    # TODO: delete it when boxing on "CPU" and s1->b on "GPU" are ready
     if self.is_consistent:
+        self = self.to("cuda")
         if all(_cannot_print(sbp) for sbp in self.sbp):
             return "[...]"
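Editorial note on the `_try_convert_to_local_tensor` change above: passing the tensor's own placement keeps the broadcast cast on the devices that already hold the data rather than relying on a default placement. A hedged sketch (the placement value is illustrative, not from the patch):

```python
import oneflow as flow

# Hypothetical consistent tensor whose repr needs the full data on every rank.
placement = flow.placement("cuda", {0: range(2)})
x = flow.ones((4, 4)).to_consistent(placement=placement, sbp=flow.sbp.split(0))

# As in the patched helper: keep x.placement, change only sbp, then localize.
local = x.to_consistent(placement=x.placement, sbp=flow.sbp.broadcast).to_local()
```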
diff --git a/python/oneflow/nn/modules/adaptive_pool.py b/python/oneflow/nn/modules/adaptive_pool.py
index ef0a37c7557..fb6ccd18960 100644
--- a/python/oneflow/nn/modules/adaptive_pool.py
+++ b/python/oneflow/nn/modules/adaptive_pool.py
@@ -15,27 +15,23 @@
 """
 import oneflow as flow
 from oneflow.nn.module import Module
+from oneflow.nn.common_types import _size_1_t
+from oneflow.nn.modules.utils import _single, _pair, _triple
 
 
 def _generate_output_size(input_size, output_size):
     new_output_size = []
-    if isinstance(output_size, int):
-        for _ in range(len(input_size) - 2):
-            new_output_size.append(output_size)
-    elif isinstance(output_size, tuple):
-        assert len(input_size) - 2 == len(
-            output_size
-        ), f"The length of 'output_size' does not match the input size, {len(input_size) - 2} expected"
-        for i in range(len(output_size)):
-            if output_size[i] is None:
-                new_output_size.append(input_size[i + 2])
-            else:
-                assert isinstance(
-                    output_size[i], int
-                ), "numbers in 'output_size' should be integer"
-                new_output_size.append(output_size[i])
-    else:
-        raise ValueError("invalid 'output_size', 'int' or 'tuple' expected")
+    assert len(input_size) - 2 == len(
+        output_size
+    ), f"the length of 'output_size' does not match the input size, {len(input_size) - 2} expected"
+    for i in range(len(output_size)):
+        if output_size[i] is None:
+            new_output_size.append(input_size[i + 2])
+        else:
+            assert isinstance(
+                output_size[i], int
+            ), "numbers in 'output_size' should be integers"
+            new_output_size.append(output_size[i])
     return tuple(new_output_size)
@@ -55,7 +51,7 @@ class AdaptiveAvgPool1d(Module):
         >>> import numpy as np
         >>> import oneflow as flow
         >>> import oneflow.nn as nn
-        
+
         >>> m = nn.AdaptiveAvgPool1d(5)
         >>> input = flow.Tensor(np.random.randn(1, 64, 8))
         >>> output = m(input)
@@ -64,19 +60,19 @@
 
     """
 
-    def __init__(self, output_size) -> None:
+    def __init__(self, output_size: _size_1_t) -> None:
         super().__init__()
-        self.output_size = output_size
+        assert output_size is not None, "'output_size' cannot be NoneType"
+        self.output_size = _single(output_size)
 
     def forward(self, x):
-        assert len(x.shape) == 3
-        if isinstance(self.output_size, tuple):
-            new_output_size = self.output_size[0]
-        elif isinstance(self.output_size, int):
-            new_output_size = self.output_size
-        else:
-            raise ValueError("'output_size' should be integer or tuple")
-        return flow.F.adaptive_avg_pool1d(x, output_size=(new_output_size,))
+        assert (
+            len(x.shape) == 3 and len(self.output_size) == 1
+        ), "the length of 'output_size' does not match the input size, 1 expected"
+        assert isinstance(
+            self.output_size[0], int
+        ), "numbers in 'output_size' should be integers"
+        return flow.F.adaptive_avg_pool1d(x, output_size=self.output_size)
 
 
 def adaptive_avg_pool1d(input, output_size):
@@ -110,7 +106,7 @@ class AdaptiveAvgPool2d(Module):
         >>> import numpy as np
         >>> import oneflow as flow
         >>> import oneflow.nn as nn
-        
+
         >>> m = nn.AdaptiveAvgPool2d((5,7))
         >>> input = flow.Tensor(np.random.randn(1, 64, 8, 9))
         >>> output = m(input)
@@ -133,10 +129,13 @@
 
     def __init__(self, output_size) -> None:
         super().__init__()
-        self.output_size = output_size
+        assert output_size is not None, "'output_size' cannot be NoneType"
+        self.output_size = _pair(output_size)
 
     def forward(self, x):
-        assert len(x.shape) == 4
+        assert (
+            len(x.shape) == 4
+        ), f"expected 4-dimensional tensor, but got {len(x.shape)}-dimensional tensor"
         new_output_size = _generate_output_size(x.shape, self.output_size)
         return flow.F.adaptive_avg_pool2d(x, output_size=new_output_size)
@@ -172,7 +171,7 @@ class AdaptiveAvgPool3d(Module):
         >>> import numpy as np
         >>> import oneflow as flow
         >>> import oneflow.nn as nn
-        
+
         >>> m = nn.AdaptiveAvgPool3d((5,7,9))
         >>> input = flow.Tensor(np.random.randn(1, 64, 8, 9, 10))
         >>> output = m(input)
@@ -195,10 +194,13 @@
 
     def __init__(self, output_size) -> None:
         super().__init__()
-        self.output_size = output_size
+        assert output_size is not None, "'output_size' cannot be NoneType"
+        self.output_size = _triple(output_size)
 
     def forward(self, x):
-        assert len(x.shape) == 5
+        assert (
+            len(x.shape) == 5
+        ), f"expected 5-dimensional tensor, but got {len(x.shape)}-dimensional tensor"
         new_output_size = _generate_output_size(x.shape, self.output_size)
         return flow.F.adaptive_avg_pool3d(x, output_size=new_output_size)
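As the rewritten `_generate_output_size` above shows, `output_size` is now always normalized to a tuple (`_single`/`_pair`/`_triple`), and a `None` entry keeps the matching input dimension. A sketch of the resulting behavior (shapes chosen for illustration, not taken from the test suite):

```python
import numpy as np
import oneflow as flow
import oneflow.nn as nn

x = flow.Tensor(np.random.randn(1, 8, 16, 16))

# An int is expanded by _pair, so 7 means (7, 7).
print(nn.AdaptiveAvgPool2d(7)(x).shape)  # expected: (1, 8, 7, 7)

# A None entry keeps the corresponding input dimension (here the height, 16).
print(nn.AdaptiveAvgPool2d((None, 5))(x).shape)  # expected: (1, 8, 16, 5)
```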
diff --git a/python/oneflow/nn/modules/consistent_cast.py b/python/oneflow/nn/modules/consistent_cast.py
index 0000cf2dd15..e37ada9f778 100644
--- a/python/oneflow/nn/modules/consistent_cast.py
+++ b/python/oneflow/nn/modules/consistent_cast.py
@@ -35,9 +35,7 @@ def forward(self, x, sbp, placement):
 
 @register_tensor_op("to_consistent")
-def to_consistent_op(
-    input, placement=None, sbp=None, identity_grad=False, grad_sbp=None
-):
+def to_consistent_op(input, placement=None, sbp=None, grad_sbp=None):
     """Cast a local tensor to consistent tensor or cast a
     consistent tensor to another consistent tensor with
     different sbp or placement
@@ -90,9 +88,6 @@ def _check_sbp(sbp):
 
     grad_sbp = _check_sbp(grad_sbp)
 
-    if identity_grad is True and grad_sbp is not None:
-        raise ValueError("If identity_grad is True, grad_sbp should not be set.")
-
     else:
         # local tensor to consistent tensor
         if placement is None or sbp is None:
@@ -106,7 +101,7 @@ def _check_sbp(sbp):
 
         if grad_sbp is None:
             grad_sbp = tuple()
-    return flow.F.to_consistent(input, placement, sbp, identity_grad, grad_sbp)
+    return flow.F.to_consistent(input, placement, sbp, grad_sbp)
 
 
 class ToLocal(Module):
diff --git a/python/oneflow/nn/modules/random_ops.py b/python/oneflow/nn/modules/random_ops.py
index 71225a34ec6..af0f2eb96f0 100644
--- a/python/oneflow/nn/modules/random_ops.py
+++ b/python/oneflow/nn/modules/random_ops.py
@@ -264,52 +264,14 @@ def randn_op(
     )()
 
-
-class Randint(flow.nn.Module):
+class RandInt(flow.nn.Module):
     def __init__(
         self,
         low: flow.int64,
         high: flow.int64,
         size: tuple,
         generator: flow.Generator = None,
-        dtype: flow.dtype = flow.int64,
-        layout=None,
-        device=None,
-        placement=None,
-        sbp=None,
-        requires_grad=False,
-    ) -> None:
-        super().__init__()
-
-        if generator is None:
-            generator = flow.Generator()
-        assert low < high
-
-class Randperm(Module):
-    def __init__(
-        self,
-        n,
-        generator: flow.Generator = None,
-        dtype: flow.dtype = flow.int32,
-        layout=None,
-        device: Union[flow.device, str, None] = None,
-        placement: flow.placement = None,
-        sbp: flow._oneflow_internal.sbp.sbp = None,
-        requires_grad: bool = False,
-        pin_memory: bool = False,
-    ) -> None:
-        super().__init__()
-        assert n >= 0
-        self.n = n
-
-    class Randint(flow.nn.Module):
-        def __init__(
-            self,
-            low: flow.int64,
-            high: flow.int64,
-            size: tuple,
-            generator: flow.Generator = None,
-            dtype: flow.dtype = flow.int64,
+        dtype: Optional[flow.dtype] = None,
         layout=None,
         device=None,
         placement=None,
@@ -328,7 +290,6 @@ def __init__(
             self.generator,
             self.placement,
             self.sbp,
-
         ) = _rand_op_common_process(size, device, generator, placement, sbp)
         self.dtype = dtype
         self.low = low
@@ -337,93 +298,124 @@ def __init__(
     def forward(self):
         if self.placement is not None:
             res = flow.F.consistent_randint(
-                self.low, self.high, self.size, self.placement, self.sbp, self.generator
+                self.low,
+                self.high,
+                self.size,
+                self.placement,
+                self.sbp,
+                self.dtype,
+                self.generator,
             )
         else:
             res = flow.F.randint(
-                self.low, self.high, self.size, self.device, self.generator
+                self.low, self.high, self.size, self.dtype, self.device, self.generator
            )
         res.requires_grad = self.requires_grad
         return res
 
 
-def randint(
-    low: flow.int64 = 0,
-    high: Union[int, tuple] = None,
-    size: tuple = None,
-    generator: flow.Generator = None,
-    dtype: flow.dtype = flow.int64,
+def randint_op(
+    low: flow.int64,
+    high: flow.int64,
+    size: tuple,
+    out=None,
+    generator=None,
+    dtype: Optional[flow.dtype] = None,
+    layout=None,
+    device: Union[flow.device, str, None] = None,
+    placement: flow.placement = None,
+    sbp: flow._oneflow_internal.sbp.sbp = None,
+    requires_grad: bool = False,
+):
+    """
+    Returns a tensor filled with random integers generated uniformly between low (inclusive) and high (exclusive).
 
-    ) = _rand_op_common_process(1, device, generator, placement, sbp)
-    self.dtype = dtype
+    The shape of the tensor is defined by the variable argument ``size``.
+
+    Args:
+        low (int): Lowest integer to be drawn from the distribution.
+        high (int): One above the highest integer to be drawn from the distribution.
+        size (int... or flow.Size): Defining the shape of the output tensor.
+            Can be a variable number of arguments or a collection like a list or tuple or flow.Size.
+        out (optional): The output tensor.
+        dtype (flow.dtype, optional): The desired data type of returned tensor. Default: ``flow.int64``.
+        layout (optional): The desired layout of returned Tensor.
+        generator (flow.Generator, optional): A pseudorandom number generator for sampling.
+        device (flow.device, optional): The desired device of returned local tensor. If None, uses the
+            current device.
+        placement (flow.placement, optional): The desired device of returned consistent tensor. If None, will
+            construct local tensor.
+        sbp (flow.sbp, optional): The desired sbp of returned consistent tensor. It must be equal with the
+            numbers of placement.
+        requires_grad (bool, optional): If autograd should record operations on the returned tensor. Default: False.
+
+    For example:
+
+    .. code-block:: python
+
+        >>> import oneflow as flow
+        >>> generator = flow.Generator()
+        >>> generator.manual_seed(0)
+        >>> flow.randint(0, 5, (5,), generator=generator)
+        tensor([2, 4, 3, 0, 1], dtype=oneflow.int64)
+
+    """
+    assert out is None, "out not supported yet"
+    assert layout is None, "layout not supported yet"
+    if generator is None:
+        generator = flow.default_generator()
+    return RandInt(
+        low, high, size, generator, dtype, layout, device, placement, sbp, requires_grad
+    )()
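The `randint_op` wrapper above dispatches on `placement`: given a placement and sbp it routes to `flow.F.consistent_randint`, otherwise to `flow.F.randint`. A hedged usage sketch (device and placement values are illustrative; outputs depend on the seed):

```python
import oneflow as flow

gen = flow.Generator()
gen.manual_seed(0)

# Local path: routed to flow.F.randint(low, high, size, dtype, device, generator).
x = flow.randint(0, 10, (2, 3), generator=gen, device="cpu")

# Consistent path: placement + sbp route to flow.F.consistent_randint.
placement = flow.placement("cuda", {0: range(2)})
y = flow.randint(0, 10, (2, 3), placement=placement, sbp=flow.sbp.broadcast)
```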
+
+
+class RandPerm(Module):
+    def __init__(
+        self,
+        n,
+        generator: flow.Generator = None,
+        dtype: Optional[flow.dtype] = None,
+        layout=None,
+        device: Union[flow.device, str, None] = None,
+        placement: flow.placement = None,
+        sbp: flow._oneflow_internal.sbp.sbp = None,
+        requires_grad: bool = False,
+        pin_memory: bool = False,
+    ) -> None:
+        super().__init__()
+        assert n >= 0
+        self.n = n
 
     def forward(self, out=None):
         if self.placement is not None:
             res = flow.F.consistent_randperm(
-                self.n, self.placement, self.sbp, self.generator
+                self.n, self.placement, self.sbp, self.dtype, self.generator
             )
         else:
-            res = flow.F.randperm(self.n, self.device, self.generator)
+            res = flow.F.randperm(self.n, self.dtype, self.device, self.generator)
         res.requires_grad = self.requires_grad
-        return res.to(dtype=self.dtype)
+        return res
 
 
-def randperm(
+def randperm_op(
     n: flow.int32,
     generator: flow.Generator = None,
     out=None,
-    dtype: flow.dtype = flow.int32,
-
+    dtype: Optional[flow.dtype] = None,
     layout=None,
     device: Union[flow.device, str, None] = None,
     placement: flow.placement = None,
     sbp: flow._oneflow_internal.sbp.sbp = None,
     requires_grad: bool = False,
-    ) -> flow.Tensor:
-    r"""Returns a tensor filled with random integers generated uniformly from :math:`[ \text{low},\text{high} )`.
-
-
-    The shape of the tensor is defined by the variable argument size.
-
-    Args:
-        low (int, optional):Lowest integer to be drawn from the distribution. Default: 0.
-
-        high (int):One above the highest integer to be drawn from the distribution.
-
-        size (tuple):a tuple defining the shape of the output tensor.
-
-    Keyword args:
-        generator(:class:`oneflow.Generator`, optional): a pseudorandom number generator for sampling
-        dtype (:class:`oneflow.dtype`, optional): the desired data type of returned tensor.
-            Default: ``oneflow.int64``.
-        layout: layout is not supported yet.
-        device: the desired device of returned tensor. Default: cpu.
-        requires_grad(bool, optional): If autograd should record operations on the returned tensor. Default: False.
-        placement (flow.placement, optional): The desired device of returned consistent tensor. If None, will
-            construct local tensor.
-        sbp (flow.sbp, optional): The desired sbp of returned consistent tensor. It must be equal with the
-            numbers of placement.
-        requires_grad (bool, optional): If autograd should record operations on the returned tensor. Default: False.
-
-    Returns:
-        oneflow.Tensor: The result Tensor of given size.
-
-    For example:
-
     pin_memory: bool = False,
 ):
     r"""
     Returns a random permutation of integers from ``0`` to ``n - 1``.
 
     Args:
         n (int): the upper bound (exclusive)
-
+
     Keyword args:
         generator(:class:`oneflow.Generator`, optional): a pseudorandom number generator for sampling
         out (Tensor, optional): output Tensor,not supported yet.
         dtype (:class:`oneflow.dtype`, optional): the desired data type of returned tensor.
-            Default: ``oneflow.int32``.
+ Default: ``oneflow.int64``. layout: layout is not supported yet. device: the desired device of returned tensor. Default: cpu. placement:(:class:`flow.placement`, optional): The desired device of returned consistent tensor. If None, @@ -435,29 +427,9 @@ def randperm( Example: - .. code-block:: python >>> import oneflow as flow - - >>> import numpy as np - >>> generator = flow.Generator() - >>> generator.manual_seed(0) - >>> flow.randint(10,(1,10),generator=generator) - tensor([[5, 5, 7, 8, 6, 8, 5, 8, 4, 6]], dtype=oneflow.int64) - """ - assert layout is None, "layout not supported yet" - if type(high) is tuple: - size = high - low, high = 0, low - if len(size) == 0: - size = (1,) - if generator is None: - generator = flow.default_generator() - return Randint( - low, high, size, generator, dtype, layout, device, placement, sbp, requires_grad - )() - >>> generator = flow.Generator() >>> generator.manual_seed(0) >>> flow.randperm(5, generator=generator) @@ -467,12 +439,11 @@ def randperm( assert layout is None, "layout not supported yet" if generator is None: generator = flow.default_generator() - return Randperm( + return RandPerm( n, generator, dtype, layout, device, placement, sbp, requires_grad, pin_memory )(out) - if __name__ == "__main__": import doctest diff --git a/python/oneflow/nn/parallel/ddp.py b/python/oneflow/nn/parallel/ddp.py index 09cd4bc1c36..bec806dabd2 100644 --- a/python/oneflow/nn/parallel/ddp.py +++ b/python/oneflow/nn/parallel/ddp.py @@ -40,9 +40,18 @@ def allreduce(grad): return allreduce -def DistributedDataParallel(module: "flow.nn.Module"): +def DistributedDataParallel( + module: "flow.nn.Module", *, broadcast_buffers: bool = True +): world_size = flow.distributed.get_world_size() - # TODO(jianhao): broadcast parameters and buffers + with flow.no_grad(): + for x in module.parameters(): + requires_grad = x.requires_grad + x.copy_(flow.F.broadcast(x)) + # TODO: fix the bug that x's requires_grad is discarded + # after flow.F.broadcast + x.requires_grad_(requires_grad) + ddp_state_for_reversed_params = OrderedDict( reversed([(x, [False, False]) for x in module.parameters()]) ) @@ -51,7 +60,7 @@ def DistributedDataParallel(module: "flow.nn.Module"): param.register_hook(lambda grad: grad / world_size) param.register_hook(allreduce_fn(ddp_state_for_reversed_params, param)) - def hook(module, input, output): + def post_forward_hook(module, input, output): ddp_state_for_reversed_params = module._ddp_state_for_reversed_params for state in ddp_state_for_reversed_params.values(): state[0], state[1] = False, False @@ -60,5 +69,15 @@ def hook(module, input, output): ) return output - module.register_forward_hook(hook) + module.register_forward_hook(post_forward_hook) + + if broadcast_buffers: + + def pre_forward_hook(module, input): + with flow.no_grad(): + for x in module.buffers(): + x.copy_(flow.F.broadcast(x)) + + module.register_forward_pre_hook(pre_forward_hook) + return module diff --git a/python/oneflow/test/modules/test_adaptive_pool.py b/python/oneflow/test/modules/test_adaptive_pool.py index decdfab86cf..a27f131d5c3 100644 --- a/python/oneflow/test/modules/test_adaptive_pool.py +++ b/python/oneflow/test/modules/test_adaptive_pool.py @@ -13,885 +13,60 @@ See the License for the specific language governing permissions and limitations under the License. 
""" - import unittest -from collections import OrderedDict - -import numpy as np -from test_util import GenArgList import oneflow as flow import oneflow.unittest +from oneflow.nn.common_types import _size_1_t +from packaging import version +import torch as torch_original +from typing import Union, Tuple +from automated_test_util import * -def _test_adaptive_avgpool1d_forward(test_case, device): - input = flow.Tensor( - np.array( - [ - [ - [ - 0.05580734834074974, - -0.6875145435333252, - -1.654430866241455, - -0.6225992441177368, - 0.10183599591255188, - 0.05019790679216385, - -1.2537643909454346, - 0.14907236397266388, - ] - ] - ] - ), - dtype=flow.float32, - device=flow.device(device), - ) - m = flow.nn.AdaptiveAvgPool1d(4) - m.to(device) - of_out_1 = m(input) - of_out_2 = flow.adaptive_avg_pool1d(input, 4) - np_out = np.array( - [ - [ - [ - -0.3158535957336426, - -1.1385149955749512, - 0.07601694762706757, - -0.5523459911346436, - ] - ] - ] - ) - test_case.assertTrue(np.allclose(of_out_1.numpy(), np_out, 1e-05, 1e-05)) - test_case.assertTrue(np.allclose(of_out_2.numpy(), np_out, 1e-05, 1e-05)) - - -def _test_adaptive_avgpool1d_backward(test_case, device): - input = flow.Tensor( - np.array( - [ - [ - [ - 0.05580734834074974, - -0.6875145435333252, - -1.654430866241455, - -0.6225992441177368, - 0.10183599591255188, - 0.05019790679216385, - -1.2537643909454346, - 0.14907236397266388, - ] - ] - ] - ), - dtype=flow.float32, - device=flow.device(device), - requires_grad=True, - ) - m = flow.nn.AdaptiveAvgPool1d(4) - of_out = m(input) - of_out = of_out.sum() - of_out.backward() - np_grad = np.array([[[0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5]]]) - test_case.assertTrue(np.allclose(input.grad.numpy(), np_grad, 1e-05, 1e-05)) - - -@unittest.skipIf( - not flow.unittest.env.eager_execution_enabled(), - ".numpy() doesn't work in lazy mode", -) -def _test_adaptive_avgpool2d_forward(test_case, device): - input = flow.Tensor( - np.array( - [ - [ - [ - [ - 0.10039155930280685, - 0.04879157617688179, - -1.0515470504760742, - 0.9466001987457275, - ], - [ - 0.45375481247901917, - 0.23611211776733398, - 1.343685269355774, - 0.3979687988758087, - ], - [ - 0.05580734834074974, - -0.6875145435333252, - -1.654430866241455, - -0.6225992441177368, - ], - [ - 0.10183599591255188, - 0.05019790679216385, - -1.2537643909454346, - 0.14907236397266388, - ], - ] - ] - ] - ), - dtype=flow.float32, - device=flow.device(device), - ) - m = flow.nn.AdaptiveAvgPool2d((2, 2)) - m.to(device) - of_out_1 = m(input) - of_out_2 = flow.adaptive_avg_pool2d(input, (2, 2)) - np_out = np.array( - [ - [ - [ - [0.20976251363754272, 0.4091767966747284], - [-0.1199183315038681, -0.8454304933547974], - ] - ] - ] - ) - test_case.assertTrue(np.allclose(of_out_1.numpy(), np_out, 1e-05, 1e-05)) - test_case.assertTrue(np.allclose(of_out_2.numpy(), np_out, 1e-05, 1e-05)) - - -def _test_adaptive_avgpool2d_backward(test_case, device): - input = flow.Tensor( - np.array( - [ - [ - [ - [ - 0.10039155930280685, - 0.04879157617688179, - -1.0515470504760742, - 0.9466001987457275, - ], - [ - 0.45375481247901917, - 0.23611211776733398, - 1.343685269355774, - 0.3979687988758087, - ], - [ - 0.05580734834074974, - -0.6875145435333252, - -1.654430866241455, - -0.6225992441177368, - ], - [ - 0.10183599591255188, - 0.05019790679216385, - -1.2537643909454346, - 0.14907236397266388, - ], - ] - ] - ] - ), - dtype=flow.float32, - device=flow.device(device), - requires_grad=True, - ) - m = flow.nn.AdaptiveAvgPool2d((2, 2)) - of_out = m(input) - of_out = of_out.sum() - 
of_out.backward() - np_grad = np.array( - [ - [ - [ - [0.25, 0.25, 0.25, 0.25], - [0.25, 0.25, 0.25, 0.25], - [0.25, 0.25, 0.25, 0.25], - [0.25, 0.25, 0.25, 0.25], - ] - ] - ] - ) - test_case.assertTrue(np.allclose(input.grad.numpy(), np_grad, 1e-05, 1e-05)) - - -def _test_adaptive_avgpool2d_hw_forward(test_case, device): - input = flow.Tensor( - np.array( - [ - [ - [ - [0.28242185711860657, -0.7742040753364563, -0.5439430475234985], - [-0.1706847995519638, 0.0430854931473732, 0.34247592091560364], - [-1.036131501197815, -1.033642292022705, 0.3455536365509033], - ] - ] - ] - ), - dtype=flow.float32, - device=flow.device(device), - ) - m = flow.nn.AdaptiveAvgPool2d((1, 2)) - m.to(device) - of_out = m(input) - np_out = np.array([[[[-0.4481925666332245, -0.27011242508888245]]]]) - test_case.assertTrue(np.allclose(of_out.numpy(), np_out, 1e-05, 1e-05)) - - -def _test_adaptive_avgpool2d_hw_backward(test_case, device): - input = flow.Tensor( - np.array( - [ - [ - [ - [0.28242185711860657, -0.7742040753364563, -0.5439430475234985], - [-0.1706847995519638, 0.0430854931473732, 0.34247592091560364], - [-1.036131501197815, -1.033642292022705, 0.3455536365509033], - ] - ] - ] - ), - dtype=flow.float32, - device=flow.device(device), - requires_grad=True, - ) - m = flow.nn.AdaptiveAvgPool2d((1, 2)) - of_out = m(input) - of_out = of_out.sum() - of_out.backward() - np_grad = np.array( - [ - [ - [ - [0.1666666716337204, 0.3333333432674408, 0.1666666716337204], - [0.1666666716337204, 0.3333333432674408, 0.1666666716337204], - [0.1666666716337204, 0.3333333432674408, 0.1666666716337204], - ] - ] - ] - ) - test_case.assertTrue(np.allclose(input.grad.numpy(), np_grad, 1e-05, 1e-05)) - - -def _test_adaptive_avgpool3d_forward(test_case, device): - input = flow.Tensor( - np.array( - [ - [ - [ - [ - [ - -1.077571799600885, - -0.7804538890365837, - -1.2627538752119443, - 0.9993507145120477, - ], - [ - 2.0222532489157516, - 1.103451377699465, - -0.4377324754879578, - 1.890491810587517, - ], - [ - -0.5593861899064654, - -0.4949520241526519, - -0.18536721363519787, - -0.6098969866775772, - ], - [ - -1.6536215260171816, - -1.0392583540436786, - 0.3686776597613967, - -0.5356882834951805, - ], - ], - [ - [ - -1.2617900664449953, - -1.4390921091631532, - 0.20654399652431357, - 0.8186472101906713, - ], - [ - -0.3033378863400014, - -0.8173269764076293, - -0.3767515097625614, - -0.11021655039337777, - ], - [ - -0.22977043608192885, - 1.2717196366649905, - -0.4790851297878291, - -1.4495369404727856, - ], - [ - -1.2802093286977783, - -0.11184514806663474, - 1.7022167087210984, - -1.7354837287725355, - ], - ], - [ - [ - 2.4706497991773606, - -0.6549702631973298, - -0.9318107079571676, - 1.4652904271682428, - ], - [ - 1.1419864234341397, - 1.389909081086008, - 0.9657841900525568, - -0.8563114264976619, - ], - [ - 0.19515087084250754, - -0.37808457398571094, - 0.2938625398496183, - 0.9279930510353327, - ], - [ - -0.9374118277994007, - 0.3341831730452431, - -0.2792542765303833, - 0.38029090707066726, - ], - ], - [ - [ - 0.5918686659736041, - -0.7870631089938902, - -0.9534344874245392, - 0.31341612954718795, - ], - [ - 0.7509029444145228, - -0.9299288398562323, - -0.7343054052782476, - -0.8806481590696694, - ], - [ - -0.4707853016353985, - 0.12253641652645629, - 0.5088022039832846, - 0.520391789327562, - ], - [ - -0.0861300651163632, - 0.30291348404866386, - -0.6268565873680123, - -0.27469204305759976, - ], - ], - ] - ] - ] - ), - dtype=flow.float32, - device=flow.device(device), - ) - m = flow.nn.AdaptiveAvgPool3d((2, 2, 2)) - 
m.to(device) - of_out_1 = m(input) - of_out_2 = flow.adaptive_avg_pool3d(input, (2, 2, 2)) - np_out = np.array( - [ - [ - [ - [ - [-0.3192335125472539, 0.2159474151198386], - [-0.5121654212876662, -0.3655204892948264], - ], - [ - [0.4966693377547728, -0.2015024299324123], - [-0.11470347800925032, 0.18131719803880864], - ], - ] - ] - ] - ) - test_case.assertTrue(np.allclose(of_out_1.numpy(), np_out, 1e-05, 1e-05)) - test_case.assertTrue(np.allclose(of_out_2.numpy(), np_out, 1e-05, 1e-05)) - - -def _test_adaptive_avgpool3d_backward(test_case, device): - input = flow.Tensor( - np.array( - [ - [ - [ - [ - [ - -1.077571799600885, - -0.7804538890365837, - -1.2627538752119443, - 0.9993507145120477, - ], - [ - 2.0222532489157516, - 1.103451377699465, - -0.4377324754879578, - 1.890491810587517, - ], - [ - -0.5593861899064654, - -0.4949520241526519, - -0.18536721363519787, - -0.6098969866775772, - ], - [ - -1.6536215260171816, - -1.0392583540436786, - 0.3686776597613967, - -0.5356882834951805, - ], - ], - [ - [ - -1.2617900664449953, - -1.4390921091631532, - 0.20654399652431357, - 0.8186472101906713, - ], - [ - -0.3033378863400014, - -0.8173269764076293, - -0.3767515097625614, - -0.11021655039337777, - ], - [ - -0.22977043608192885, - 1.2717196366649905, - -0.4790851297878291, - -1.4495369404727856, - ], - [ - -1.2802093286977783, - -0.11184514806663474, - 1.7022167087210984, - -1.7354837287725355, - ], - ], - [ - [ - 2.4706497991773606, - -0.6549702631973298, - -0.9318107079571676, - 1.4652904271682428, - ], - [ - 1.1419864234341397, - 1.389909081086008, - 0.9657841900525568, - -0.8563114264976619, - ], - [ - 0.19515087084250754, - -0.37808457398571094, - 0.2938625398496183, - 0.9279930510353327, - ], - [ - -0.9374118277994007, - 0.3341831730452431, - -0.2792542765303833, - 0.38029090707066726, - ], - ], - [ - [ - 0.5918686659736041, - -0.7870631089938902, - -0.9534344874245392, - 0.31341612954718795, - ], - [ - 0.7509029444145228, - -0.9299288398562323, - -0.7343054052782476, - -0.8806481590696694, - ], - [ - -0.4707853016353985, - 0.12253641652645629, - 0.5088022039832846, - 0.520391789327562, - ], - [ - -0.0861300651163632, - 0.30291348404866386, - -0.6268565873680123, - -0.27469204305759976, - ], - ], - ] - ] - ] - ), - dtype=flow.float32, - device=flow.device(device), - requires_grad=True, - ) - m = flow.nn.AdaptiveAvgPool3d((2, 2, 2)) - of_out = m(input) - of_out = of_out.sum() - of_out.backward() - np_grad = np.array( - [ - [ - [ - [ - [0.125, 0.125, 0.125, 0.125], - [0.125, 0.125, 0.125, 0.125], - [0.125, 0.125, 0.125, 0.125], - [0.125, 0.125, 0.125, 0.125], - ], - [ - [0.125, 0.125, 0.125, 0.125], - [0.125, 0.125, 0.125, 0.125], - [0.125, 0.125, 0.125, 0.125], - [0.125, 0.125, 0.125, 0.125], - ], - [ - [0.125, 0.125, 0.125, 0.125], - [0.125, 0.125, 0.125, 0.125], - [0.125, 0.125, 0.125, 0.125], - [0.125, 0.125, 0.125, 0.125], - ], - [ - [0.125, 0.125, 0.125, 0.125], - [0.125, 0.125, 0.125, 0.125], - [0.125, 0.125, 0.125, 0.125], - [0.125, 0.125, 0.125, 0.125], - ], - ] - ] - ] - ) - test_case.assertTrue(np.allclose(input.grad.numpy(), np_grad, 1e-05, 1e-05)) - - -def _test_adaptive_avgpool3d_dhw_forward(test_case, device): - input = flow.Tensor( - np.array( - [ - [ - [ - [ - [ - -1.077571799600885, - -0.7804538890365837, - -1.2627538752119443, - 0.9993507145120477, - ], - [ - 2.0222532489157516, - 1.103451377699465, - -0.4377324754879578, - 1.890491810587517, - ], - [ - -0.5593861899064654, - -0.4949520241526519, - -0.18536721363519787, - -0.6098969866775772, - ], - [ - 
-1.6536215260171816, - -1.0392583540436786, - 0.3686776597613967, - -0.5356882834951805, - ], - ], - [ - [ - -1.2617900664449953, - -1.4390921091631532, - 0.20654399652431357, - 0.8186472101906713, - ], - [ - -0.3033378863400014, - -0.8173269764076293, - -0.3767515097625614, - -0.11021655039337777, - ], - [ - -0.22977043608192885, - 1.2717196366649905, - -0.4790851297878291, - -1.4495369404727856, - ], - [ - -1.2802093286977783, - -0.11184514806663474, - 1.7022167087210984, - -1.7354837287725355, - ], - ], - [ - [ - 2.4706497991773606, - -0.6549702631973298, - -0.9318107079571676, - 1.4652904271682428, - ], - [ - 1.1419864234341397, - 1.389909081086008, - 0.9657841900525568, - -0.8563114264976619, - ], - [ - 0.19515087084250754, - -0.37808457398571094, - 0.2938625398496183, - 0.9279930510353327, - ], - [ - -0.9374118277994007, - 0.3341831730452431, - -0.2792542765303833, - 0.38029090707066726, - ], - ], - [ - [ - 0.5918686659736041, - -0.7870631089938902, - -0.9534344874245392, - 0.31341612954718795, - ], - [ - 0.7509029444145228, - -0.9299288398562323, - -0.7343054052782476, - -0.8806481590696694, - ], - [ - -0.4707853016353985, - 0.12253641652645629, - 0.5088022039832846, - 0.520391789327562, - ], - [ - -0.0861300651163632, - 0.30291348404866386, - -0.6268565873680123, - -0.27469204305759976, - ], - ], - ] - ] - ] - ), - dtype=flow.float32, - device=flow.device(device), - ) - m = flow.nn.AdaptiveAvgPool3d((1, 2, 3)) - m.to(device) - of_out = m(input) - np_out = np.array( - [ - [ - [ - [0.08871791260375947, -0.4024959376509308, 0.00722249259371315], - [-0.31343444964845824, 0.08188803218941582, -0.09210164562800888], - ] - ] - ] - ) - test_case.assertTrue(np.allclose(of_out.numpy(), np_out, 1e-05, 1e-05)) - - -def _test_adaptive_avgpool3d_dhw_backward(test_case, device): - input = flow.Tensor( - np.array( - [ - [ - [ - [ - [ - -1.077571799600885, - -0.7804538890365837, - -1.2627538752119443, - 0.9993507145120477, - ], - [ - 2.0222532489157516, - 1.103451377699465, - -0.4377324754879578, - 1.890491810587517, - ], - [ - -0.5593861899064654, - -0.4949520241526519, - -0.18536721363519787, - -0.6098969866775772, - ], - [ - -1.6536215260171816, - -1.0392583540436786, - 0.3686776597613967, - -0.5356882834951805, - ], - ], - [ - [ - -1.2617900664449953, - -1.4390921091631532, - 0.20654399652431357, - 0.8186472101906713, - ], - [ - -0.3033378863400014, - -0.8173269764076293, - -0.3767515097625614, - -0.11021655039337777, - ], - [ - -0.22977043608192885, - 1.2717196366649905, - -0.4790851297878291, - -1.4495369404727856, - ], - [ - -1.2802093286977783, - -0.11184514806663474, - 1.7022167087210984, - -1.7354837287725355, - ], - ], - [ - [ - 2.4706497991773606, - -0.6549702631973298, - -0.9318107079571676, - 1.4652904271682428, - ], - [ - 1.1419864234341397, - 1.389909081086008, - 0.9657841900525568, - -0.8563114264976619, - ], - [ - 0.19515087084250754, - -0.37808457398571094, - 0.2938625398496183, - 0.9279930510353327, - ], - [ - -0.9374118277994007, - 0.3341831730452431, - -0.2792542765303833, - 0.38029090707066726, - ], - ], - [ - [ - 0.5918686659736041, - -0.7870631089938902, - -0.9534344874245392, - 0.31341612954718795, - ], - [ - 0.7509029444145228, - -0.9299288398562323, - -0.7343054052782476, - -0.8806481590696694, - ], - [ - -0.4707853016353985, - 0.12253641652645629, - 0.5088022039832846, - 0.520391789327562, - ], - [ - -0.0861300651163632, - 0.30291348404866386, - -0.6268565873680123, - -0.27469204305759976, - ], - ], - ] - ] - ] - ), - dtype=flow.float32, - device=flow.device(device), - 
requires_grad=True, - ) - m = flow.nn.AdaptiveAvgPool3d((1, 2, 3)) - of_out = m(input) - of_out = of_out.sum() - of_out.backward() - np_grad = np.array( - [ - [ - [ - [ - [0.0625, 0.125, 0.125, 0.0625], - [0.0625, 0.125, 0.125, 0.0625], - [0.0625, 0.125, 0.125, 0.0625], - [0.0625, 0.125, 0.125, 0.0625], - ], - [ - [0.0625, 0.125, 0.125, 0.0625], - [0.0625, 0.125, 0.125, 0.0625], - [0.0625, 0.125, 0.125, 0.0625], - [0.0625, 0.125, 0.125, 0.0625], - ], - [ - [0.0625, 0.125, 0.125, 0.0625], - [0.0625, 0.125, 0.125, 0.0625], - [0.0625, 0.125, 0.125, 0.0625], - [0.0625, 0.125, 0.125, 0.0625], - ], - [ - [0.0625, 0.125, 0.125, 0.0625], - [0.0625, 0.125, 0.125, 0.0625], - [0.0625, 0.125, 0.125, 0.0625], - [0.0625, 0.125, 0.125, 0.0625], - ], - ] - ] - ] - ) - test_case.assertTrue(np.allclose(input.grad.numpy(), np_grad, 1e-05, 1e-05)) +NoneType = type(None) +# Not the same as those in PyTorch because 'output_size' cannot be NoneType (even in 'torch.nn.AdaptiveAvgPoolXd') +_size_2_opt_t_not_none = Union[int, Tuple[Union[int, NoneType], Union[int, NoneType]]] +_size_3_opt_t_not_none = Union[ + int, Tuple[Union[int, NoneType], Union[int, NoneType], Union[int, NoneType]] +] @flow.unittest.skip_unless_1n1d() class TestAdaptiveAvgPool(flow.unittest.TestCase): + @autotest() def test_adaptive_avgpool1d(test_case): - arg_dict = OrderedDict() - arg_dict["test_fun"] = [ - _test_adaptive_avgpool1d_forward, - _test_adaptive_avgpool1d_backward, - ] - arg_dict["device"] = ["cpu", "cuda"] - for arg in GenArgList(arg_dict): - arg[0](test_case, *arg[1:]) - + m = torch.nn.AdaptiveAvgPool1d(output_size=random().to(_size_1_t)) + m.train(random()) + device = random_device() + m.to(device) + x = random_pytorch_tensor(ndim=3).to(device) + y = m(x) + return y + + @autotest() def test_adaptive_avgpool2d(test_case): - arg_dict = OrderedDict() - arg_dict["test_fun"] = [ - _test_adaptive_avgpool2d_forward, - _test_adaptive_avgpool2d_backward, - _test_adaptive_avgpool2d_hw_forward, - _test_adaptive_avgpool2d_hw_backward, - ] - arg_dict["device"] = ["cpu", "cuda"] - for arg in GenArgList(arg_dict): - arg[0](test_case, *arg[1:]) - + m = torch.nn.AdaptiveAvgPool2d(output_size=random().to(_size_2_opt_t_not_none)) + m.train(random()) + device = random_device() + m.to(device) + x = random_pytorch_tensor(ndim=4).to(device) + y = m(x) + return y + + @unittest.skipIf( + version.parse(torch_original.__version__) < version.parse("1.10.0"), + "GPU version 'nn.AdaptiveAvgPool3d' has a bug in PyTorch before '1.10.0'", + ) + @autotest() def test_adaptive_avgpool3d(test_case): - arg_dict = OrderedDict() - arg_dict["test_fun"] = [ - _test_adaptive_avgpool3d_forward, - _test_adaptive_avgpool3d_backward, - _test_adaptive_avgpool3d_dhw_forward, - _test_adaptive_avgpool3d_dhw_backward, - ] - arg_dict["device"] = ["cpu", "cuda"] - for arg in GenArgList(arg_dict): - arg[0](test_case, *arg[1:]) + m = torch.nn.AdaptiveAvgPool3d(output_size=random().to(_size_3_opt_t_not_none)) + m.train(random()) + device = random_device() + m.to(device) + x = random_pytorch_tensor(ndim=5).to(device) + y = m(x) + return y if __name__ == "__main__": diff --git a/python/oneflow/test/modules/test_autograd_mode.py b/python/oneflow/test/modules/test_autograd_mode.py new file mode 100644 index 00000000000..76ba59f8c14 --- /dev/null +++ b/python/oneflow/test/modules/test_autograd_mode.py @@ -0,0 +1,76 @@ +""" +Copyright 2020 The OneFlow Authors. All rights reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" + +import unittest +import oneflow as flow + +import oneflow.unittest + + +class TestAutogradMode(oneflow.unittest.TestCase): + def test_grad_mode(test_case): + test_case.assertTrue(flow.is_grad_enabled()) + + def test_inference_mode(test_case): + with flow.inference_mode(True): + test_case.assertFalse(flow.is_grad_enabled()) + test_case.assertTrue(flow.is_grad_enabled()) + + @flow.inference_mode(True) + def func(): + test_case.assertFalse(flow.is_grad_enabled()) + + func() + test_case.assertTrue(flow.is_grad_enabled()) + + with flow.inference_mode(False): + test_case.assertTrue(flow.is_grad_enabled()) + test_case.assertTrue(flow.is_grad_enabled()) + + @flow.inference_mode(False) + def func(): + test_case.assertTrue(flow.is_grad_enabled()) + + func() + test_case.assertTrue(flow.is_grad_enabled()) + + def test_grad_enable(test_case): + with flow.grad_enable(): + test_case.assertTrue(flow.is_grad_enabled()) + test_case.assertTrue(flow.is_grad_enabled()) + + @flow.grad_enable() + def func(): + test_case.assertTrue(flow.is_grad_enabled()) + + func() + test_case.assertTrue(flow.is_grad_enabled()) + + def test_no_grad(test_case): + with flow.no_grad(): + test_case.assertFalse(flow.is_grad_enabled()) + test_case.assertTrue(flow.is_grad_enabled()) + + @flow.no_grad() + def func(): + test_case.assertFalse(flow.is_grad_enabled()) + + func() + test_case.assertTrue(flow.is_grad_enabled()) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/oneflow/test/modules/test_avgpool.py b/python/oneflow/test/modules/test_avgpool.py index 61f7b0eb023..17eab8dcdb6 100644 --- a/python/oneflow/test/modules/test_avgpool.py +++ b/python/oneflow/test/modules/test_avgpool.py @@ -13,13 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
""" - -import math import unittest -from collections import OrderedDict - -import numpy as np -from test_util import GenArgList import oneflow as flow import oneflow.unittest @@ -28,7 +22,7 @@ @flow.unittest.skip_unless_1n1d() class TestAvgPoolingModule(flow.unittest.TestCase): - @autotest(n=20) + @autotest(n=100) def test_avgpool1d_with_random_data(test_case): m = torch.nn.AvgPool1d( kernel_size=random(4, 6), @@ -44,7 +38,7 @@ def test_avgpool1d_with_random_data(test_case): y = m(x) return y - @autotest(n=20) + @autotest(n=100) def test_avgpool2d_with_random_data(test_case): m = torch.nn.AvgPool2d( kernel_size=random(4, 6), @@ -63,17 +57,16 @@ def test_avgpool2d_with_random_data(test_case): y = m(x) return y - @autotest(n=20) + @autotest(n=100) def test_avgpool3d_with_random_data(test_case): m = torch.nn.AvgPool3d( - kernel_size=constant(3), - stride=constant(1), - padding=constant(1), + kernel_size=random(4, 6), + stride=random(1, 3) | nothing(), + padding=random(1, 3) | nothing(), ceil_mode=random(), count_include_pad=random(), divisor_override=random().to(int), ) - m.train(random()) device = random_device() m.to(device) diff --git a/python/oneflow/test/modules/test_consistent_cast.py b/python/oneflow/test/modules/test_consistent_cast.py index 080e93b783a..ff07ab3e360 100644 --- a/python/oneflow/test/modules/test_consistent_cast.py +++ b/python/oneflow/test/modules/test_consistent_cast.py @@ -73,6 +73,21 @@ def test_local_to_consistent_broadcast_data(test_case): np.array_equal(z.numpy(), np.ones((16, 16), dtype=np.int32)) ) + def test_cuda_consistent_to_consistent_cpu_s2b(test_case): + x = flow.ones((16, 16), device=flow.device("cpu"), dtype=flow.int32) + placement = flow.placement("cpu", {0: range(2)}) + y = x.to_consistent(placement=placement, sbp=flow.sbp.split(0)) + sbp = (flow.sbp.broadcast,) + y = y.to_consistent(sbp=sbp) + test_case.assertEqual(y.sbp, sbp) + test_case.assertEqual(y.placement, placement) + test_case.assertEqual(tuple(y.shape), (32, 16)) + test_case.assertEqual(y.dtype, flow.int32) + z = y.to_local() + test_case.assertTrue( + np.array_equal(z.numpy(), np.ones((32, 16), dtype=np.int32)) + ) + def test_cuda_consistent_to_consistent_s2b(test_case): x = flow.ones((16, 16), device=flow.device("cuda"), dtype=flow.int32) placement = flow.placement("cuda", {0: range(2)}) @@ -88,6 +103,26 @@ def test_cuda_consistent_to_consistent_s2b(test_case): np.array_equal(z.numpy(), np.ones((32, 16), dtype=np.int32)) ) + def test_cuda_consistent_to_consistent_cpu_s2p(test_case): + x = flow.ones((16, 16), device=flow.device("cpu"), dtype=flow.int32) + placement = flow.placement("cpu", {0: range(2)}) + y = x.to_consistent(placement=placement, sbp=flow.sbp.split(0)) + sbp = (flow.sbp.partial_sum,) + y = y.to_consistent(sbp=sbp) + test_case.assertEqual(y.sbp, sbp) + test_case.assertEqual(y.placement, placement) + test_case.assertEqual(tuple(y.shape), (32, 16)) + test_case.assertEqual(y.dtype, flow.int32) + z = y.to_local() + if int(os.getenv("RANK")) == 0: + test_case.assertTrue( + np.array_equal(z.numpy(), np.ones((32, 16), dtype=np.int32)) + ) + else: + test_case.assertTrue( + np.array_equal(z.numpy(), np.zeros((32, 16), dtype=np.int32)) + ) + def test_cuda_consistent_to_consistent_s2p(test_case): x = flow.ones((16, 16), device=flow.device("cuda"), dtype=flow.int32) placement = flow.placement("cuda", {0: range(2)}) @@ -143,6 +178,21 @@ def test_cuda_consistent_to_consistent_b2s(test_case): np.array_equal(z.numpy(), np.ones((8, 16), dtype=np.int32)) ) + def 
test_cuda_consistent_to_consistent_cpu_p2s(test_case):
+        x = flow.ones((16, 16), device=flow.device("cpu"), dtype=flow.int32)
+        placement = flow.placement("cpu", {0: range(2)})
+        y = x.to_consistent(placement=placement, sbp=flow.sbp.partial_sum)
+        sbp = (flow.sbp.split(0),)
+        y = y.to_consistent(sbp=sbp)
+        test_case.assertEqual(y.sbp, sbp)
+        test_case.assertEqual(y.placement, placement)
+        test_case.assertEqual(tuple(y.shape), (16, 16))
+        test_case.assertEqual(y.dtype, flow.int32)
+        z = y.to_local()
+        test_case.assertTrue(
+            np.array_equal(z.numpy(), np.ones((8, 16), dtype=np.int32) * 2)
+        )
+
     def test_cuda_consistent_to_consistent_p2s(test_case):
         x = flow.ones((16, 16), device=flow.device("cuda"), dtype=flow.int32)
         placement = flow.placement("cuda", {0: range(2)})
@@ -158,6 +208,41 @@ def test_cuda_consistent_to_consistent_p2s(test_case):
             np.array_equal(z.numpy(), np.ones((8, 16), dtype=np.int32) * 2)
         )
 
+    def test_cuda_consistent_to_consistent_cuda_h2d(test_case):
+        x = flow.ones((16, 16), device=flow.device("cpu"), dtype=flow.int32)
+        placement = flow.placement("cpu", {0: range(2)})
+        cuda_placement = flow.placement("cuda", {0: range(2)})
+        y = x.to_consistent(placement=placement, sbp=flow.sbp.partial_sum)
+        y = y.to_consistent(placement=cuda_placement, sbp=flow.sbp.partial_sum)
+        test_case.assertEqual(y.sbp, (flow.sbp.partial_sum,))
+        test_case.assertEqual(y.placement, cuda_placement)
+        test_case.assertEqual(tuple(y.shape), (16, 16))
+        test_case.assertEqual(y.dtype, flow.int32)
+        z = y.to_local()
+        test_case.assertTrue(
+            np.array_equal(z.numpy(), np.ones((16, 16), dtype=np.int32))
+        )
+
+    def test_cuda_consistent_to_consistent_cpu_p2b(test_case):
+        x = flow.ones((16, 16), device=flow.device("cpu"), dtype=flow.int32)
+        placement = flow.placement("cpu", {0: range(2)})
+        cuda_placement = flow.placement("cuda", {0: range(2)})
+        y = x.to_consistent(placement=placement, sbp=flow.sbp.partial_sum)
+        y = y.to_consistent(placement=cuda_placement, sbp=flow.sbp.partial_sum)
+        sbp = (flow.sbp.broadcast,)
+        y = y.to_consistent(placement=cuda_placement, sbp=sbp)
+        y = y.to_consistent(placement=placement, sbp=sbp)
+        test_case.assertEqual(y.sbp, sbp)
+        test_case.assertEqual(y.placement, placement)
+        test_case.assertEqual(tuple(y.shape), (16, 16))
+        test_case.assertEqual(y.dtype, flow.int32)
+        z = y.to_local()
+        test_case.assertTrue(
+            np.array_equal(z.numpy(), np.ones((16, 16), dtype=np.int32) * 2)
+        )
+
     def test_cuda_consistent_to_consistent_p2b(test_case):
         x = flow.ones((16, 16), device=flow.device("cuda"), dtype=flow.int32)
         placement = flow.placement("cuda", {0: range(2)})
diff --git a/python/oneflow/test/modules/test_ddp.py b/python/oneflow/test/modules/test_ddp.py
index 06ab7e25f6f..3bcc3c90334 100644
--- a/python/oneflow/test/modules/test_ddp.py
+++ b/python/oneflow/test/modules/test_ddp.py
@@ -129,6 +129,49 @@ def forward(self, x):
         test_case.assertTrue(np_allclose_with_shape(m.w2.grad.numpy(), np.array([4.5])))
         test_case.assertTrue(np_allclose_with_shape(m.w3.grad.numpy(), np.array([3])))
 
+    def test_broadcast_buffer(test_case):
+        rank = flow.framework.distribute.get_rank()
+
+        class CustomModule(flow.nn.Module):
+            def __init__(self):
+                super().__init__()
+                self.register_buffer("buf", flow.tensor([1, 2]) * (rank + 1))
+
+            def forward(self, x):
+                res = self.buf + x
+                self.buf.copy_(x)
+                return res
+
+        x = flow.tensor([2, 3]) * (rank + 1)
+        x = x.to("cuda")
+
+        m = CustomModule()
+        m = m.to("cuda")
+        m = ddp(m)
+
+        y1 = m(x)
+        y2 = m(x)
+
+        m = CustomModule()
+        m = m.to("cuda")
+        m =
ddp(m, broadcast_buffers=False) + + y3 = m(x) + y4 = m(x) + + if rank == 0: + test_case.assertTrue(np_allclose_with_shape(y1.numpy(), np.array([3, 5]))) + test_case.assertTrue(np_allclose_with_shape(y2.numpy(), np.array([4, 6]))) + test_case.assertTrue(np_allclose_with_shape(y3.numpy(), np.array([3, 5]))) + test_case.assertTrue(np_allclose_with_shape(y4.numpy(), np.array([4, 6]))) + elif rank == 1: + test_case.assertTrue(np_allclose_with_shape(y1.numpy(), np.array([5, 8]))) + test_case.assertTrue(np_allclose_with_shape(y2.numpy(), np.array([6, 9]))) + test_case.assertTrue(np_allclose_with_shape(y3.numpy(), np.array([6, 10]))) + test_case.assertTrue(np_allclose_with_shape(y4.numpy(), np.array([8, 12]))) + else: + raise ValueError() + if __name__ == "__main__": unittest.main() diff --git a/python/oneflow/test/modules/test_matmul.py b/python/oneflow/test/modules/test_matmul.py index 8b0fb5a471d..b10d90d1274 100644 --- a/python/oneflow/test/modules/test_matmul.py +++ b/python/oneflow/test/modules/test_matmul.py @@ -13,330 +13,15 @@ See the License for the specific language governing permissions and limitations under the License. """ - import unittest -from collections import OrderedDict - -import numpy as np -import torch -from automated_test_util import * -from test_util import GenArgList import oneflow as flow import oneflow.unittest - - -def _test_matmul(test_case, device): - input1 = flow.Tensor( - np.random.randn(2, 6), dtype=flow.float32, device=flow.device(device) - ) - input2 = flow.Tensor( - np.random.randn(6, 5), dtype=flow.float32, device=flow.device(device) - ) - of_out = flow.matmul(input1, input2) - np_out = np.matmul(input1.numpy(), input2.numpy()) - test_case.assertTrue(np.allclose(of_out.numpy(), np_out, 1e-05, 1e-05)) - - -def _test_broadcast_matmul(test_case, device): - input1 = flow.Tensor( - np.random.randn(3, 4, 5), dtype=flow.float32, device=flow.device(device) - ) - input2 = flow.Tensor( - np.random.randn(5, 6), dtype=flow.float32, device=flow.device(device) - ) - of_out = flow.matmul(input1, input2) - np_out = np.matmul(input1.numpy(), input2.numpy()) - test_case.assertTrue(np.allclose(of_out.numpy(), np_out, 1e-05, 1e-05)) - - -def _test_batch_matmul(test_case, device): - input1 = flow.Tensor( - np.random.randn(10, 3, 4), dtype=flow.float32, device=flow.device(device) - ) - input2 = flow.Tensor( - np.random.randn(10, 4, 5), dtype=flow.float32, device=flow.device(device) - ) - of_out = flow.matmul(input1, input2) - np_out = np.matmul(input1.numpy(), input2.numpy()) - test_case.assertTrue(np.allclose(of_out.numpy(), np_out, 1e-05, 1e-05)) - - -def _test_matmul_backward(test_case, device): - input1 = flow.Tensor( - [ - [ - -0.36023932695388794, - 0.5571867227554321, - -1.4987696409225464, - -0.9674592018127441, - 0.021076146513223648, - 2.9180469512939453, - ], - [ - -0.29169487953186035, - 0.2978641390800476, - 0.8198832273483276, - -0.3385652005672455, - -2.9260432720184326, - 0.22528153657913208, - ], - ], - dtype=flow.float32, - device=flow.device(device), - requires_grad=True, - ) - input2 = flow.Tensor( - [ - [ - -0.5270200371742249, - -0.4325239062309265, - -0.33396217226982117, - 1.2983192205429077, - -0.463693231344223, - ], - [ - 1.893467903137207, - -1.0874812602996826, - 0.7068315744400024, - -0.23532593250274658, - -0.011510828509926796, - ], - [ - -0.5477776527404785, - -0.0381619855761528, - 0.03451986983418465, - -0.8248650431632996, - -1.8885509967803955, - ], - [ - -1.0034432411193848, - 0.5428839921951294, - -0.7785694599151611, - -0.4489346146583557, 
- 1.780846118927002, - ], - [ - 0.9378347396850586, - -0.38816362619400024, - 0.8186876177787781, - -0.9630932807922363, - -0.11487948149442673, - ], - [ - -0.12073716521263123, - 2.181835174560547, - 0.5511962175369263, - -1.294308066368103, - -0.7765272855758667, - ], - ], - dtype=flow.float32, - device=flow.device(device), - requires_grad=True, - ) - of_out = flow.matmul(input1, input2) - of_out = of_out.sum() - of_out.backward() - np_grad = [ - [ - -0.45888009667396545, - 1.2659813165664673, - -3.264835834503174, - 0.09278273582458496, - 0.2903860807418823, - 0.5414588451385498, - ], - [ - -0.45888009667396545, - 1.2659813165664673, - -3.264835834503174, - 0.09278273582458496, - 0.2903860807418823, - 0.5414588451385498, - ], - ] - test_case.assertTrue( - np.allclose(input1.grad.numpy(), np_grad, atol=1e-05, rtol=1e-05) - ) - - -def _test_matmul_backward_x_grad(test_case, device): - input1 = flow.Tensor( - [ - [-1.8604081869125366, -2.0019688606262207], - [1.0511547327041626, -2.263841390609741], - ], - dtype=flow.float32, - device=flow.device(device), - requires_grad=True, - ) - input2 = flow.Tensor( - [ - [-0.13973912596702576, 0.8478717803955078], - [-0.2144828885793686, -1.7145386934280396], - ], - dtype=flow.float32, - device=flow.device(device), - requires_grad=False, - ) - of_out = flow.matmul(input1, input2) - of_out = of_out.sum() - of_out.backward() - np_grad = [ - [0.7081326246261597, -1.9290215969085693], - [0.7081326246261597, -1.9290215969085693], - ] - test_case.assertTrue( - np.allclose(input1.grad.numpy(), np_grad, atol=1e-05, rtol=1e-05) - ) - - -def _test_matmul_backward_y_grad(test_case, device): - input1 = flow.Tensor( - [ - [-1.8604081869125366, -2.0019688606262207], - [1.0511547327041626, -2.263841390609741], - ], - dtype=flow.float32, - device=flow.device(device), - requires_grad=False, - ) - input2 = flow.Tensor( - [ - [-0.13973912596702576, 0.8478717803955078], - [-0.2144828885793686, -1.7145386934280396], - ], - dtype=flow.float32, - device=flow.device(device), - requires_grad=True, - ) - of_out = flow.matmul(input1, input2) - of_out = of_out.sum() - of_out.backward() - print(input2.grad.numpy().tolist()) - np_grad = [ - [-0.809253454208374, -0.809253454208374], - [-4.265810012817383, -4.265810012817383], - ] - test_case.assertTrue( - np.allclose(input2.grad.numpy(), np_grad, atol=1e-05, rtol=1e-05) - ) - - -def _test_broadcast_matmul_backward(test_case, device): - input1 = flow.Tensor( - [ - [ - [0.5893293023109436, -0.0376124233007431, 0.7791574001312256], - [1.1614371538162231, 0.009700910188257694, 0.7281601428985596], - ], - [ - [-0.27213698625564575, 0.7058051824569702, -0.4643424451351166], - [2.2279646396636963, 0.05870082601904869, -0.18335142731666565], - ], - ], - dtype=flow.float32, - device=flow.device(device), - requires_grad=True, - ) - input2 = flow.Tensor( - [ - [0.25825661420822144, -0.4875393807888031], - [-0.040459781885147095, -0.3713535666465759], - [-1.633512258529663, -2.0034799575805664], - ], - dtype=flow.float32, - device=flow.device(device), - requires_grad=True, - ) - of_out = flow.matmul(input1, input2) - of_out = of_out.sum() - of_out.backward() - np_grad = [ - [ - [-0.22928276658058167, -0.411813348531723, -3.6369922161102295], - [-0.22928276658058167, -0.411813348531723, -3.6369922161102295], - ], - [ - [-0.22928276658058167, -0.411813348531723, -3.6369922161102295], - [-0.22928276658058167, -0.411813348531723, -3.6369922161102295], - ], - ] - test_case.assertTrue( - np.allclose(input1.grad.numpy(), np_grad, atol=1e-05, 
rtol=1e-05) - ) - - -def _test_batch_matmul_backward(test_case, device): - input1 = flow.Tensor( - [ - [ - [-0.0036776792258024216, 1.9946473836898804, -0.423959881067276], - [1.0892143249511719, 0.04005361348390579, -0.27883127331733704], - ], - [ - [-0.970306396484375, 0.017771577462553978, 0.019596196711063385], - [0.27402883768081665, -0.8192587494850159, -0.3135920464992523], - ], - ], - dtype=flow.float32, - device=flow.device(device), - requires_grad=True, - ) - input2 = flow.Tensor( - [ - [ - [1.118346929550171, -0.930071234703064], - [1.1238232851028442, 1.373764157295227], - [0.17178462445735931, -1.1010534763336182], - ], - [ - [0.6694859862327576, 0.9250285029411316], - [-1.0835869312286377, 0.4192655086517334], - [1.2616937160491943, 0.33809131383895874], - ], - ], - dtype=flow.float32, - device=flow.device(device), - requires_grad=True, - ) - of_out = flow.matmul(input1, input2) - of_out = of_out.sum() - of_out.backward() - np_grad = [ - [ - [0.18827569484710693, 2.4975874423980713, -0.9292688369750977], - [0.18827569484710693, 2.4975874423980713, -0.9292688369750977], - ], - [ - [1.5945144891738892, -0.6643214225769043, 1.5997850894927979], - [1.5945144891738892, -0.6643214225769043, 1.5997850894927979], - ], - ] - test_case.assertTrue( - np.allclose(input1.grad.numpy(), np_grad, atol=1e-05, rtol=1e-05) - ) +from automated_test_util import * @flow.unittest.skip_unless_1n1d() class TestModule(flow.unittest.TestCase): - def test_matmul(test_case): - arg_dict = OrderedDict() - arg_dict["test_fun"] = [ - _test_matmul, - _test_broadcast_matmul, - _test_batch_matmul, - _test_matmul_backward, - _test_matmul_backward_x_grad, - _test_matmul_backward_y_grad, - _test_batch_matmul_backward, - _test_broadcast_matmul_backward, - ] - arg_dict["device"] = ["cpu", "cuda"] - for arg in GenArgList(arg_dict): - arg[0](test_case, *arg[1:]) - @autotest() def test_flow_matmul_with_random_data(test_case): k = random(1, 6) @@ -345,6 +30,13 @@ def test_flow_matmul_with_random_data(test_case): z = torch.matmul(x, y) return z + @autotest() + def test_flow_tensor_matmul_with_random_data(test_case): + k = random(1, 6) + x = random_pytorch_tensor(ndim=2, dim1=k) + y = random_pytorch_tensor(ndim=2, dim0=k) + return x.matmul(y) + if __name__ == "__main__": unittest.main() diff --git a/python/oneflow/test/modules/test_negative.py b/python/oneflow/test/modules/test_negative.py index 4534545a224..7352a29cb72 100644 --- a/python/oneflow/test/modules/test_negative.py +++ b/python/oneflow/test/modules/test_negative.py @@ -15,69 +15,14 @@ """ import unittest -from collections import OrderedDict - -import numpy as np -from test_util import GenArgList import oneflow as flow import oneflow.unittest from automated_test_util import * -def _test_negtive(test_case, shape, device): - np_input = np.random.randn(*shape) - input = flow.Tensor(np_input, dtype=flow.float32, device=flow.device(device)) - of_out = flow.negative(input) - np_out = -input.numpy() - test_case.assertTrue(np.array_equal(of_out.numpy(), np_out)) - - -def _test_negative_neg(test_case, shape, device): - np_input = np.random.randn(*shape) - input = flow.Tensor(np_input, dtype=flow.float32, device=flow.device(device)) - of_out = flow.neg(input) - np_out = -input.numpy() - test_case.assertTrue(np.array_equal(of_out.numpy(), np_out)) - - -def _test_tensor_negative(test_case, shape, device): - np_input = np.random.randn(*shape) - input = flow.Tensor(np_input, dtype=flow.float32, device=flow.device(device)) - of_out = input.negative() - np_out = -input.numpy() 
- test_case.assertTrue(np.array_equal(of_out.numpy(), np_out)) - - -def _test_negative_backward(test_case, shape, device): - np_input = np.random.randn(*shape) - input = flow.Tensor( - np_input, dtype=flow.float32, device=flow.device(device), requires_grad=True - ) - of_out = flow.negative(input) - of_out = of_out.sum() - of_out.backward() - np_grad = -np.ones(shape) - test_case.assertTrue( - np.allclose(input.grad.numpy(), np_grad, atol=1e-05, rtol=1e-05) - ) - - @flow.unittest.skip_unless_1n1d() class TestNegativeModule(flow.unittest.TestCase): - def test_negative(test_case): - arg_dict = OrderedDict() - arg_dict["test_fun"] = [ - _test_negtive, - _test_negative_neg, - _test_tensor_negative, - _test_negative_backward, - ] - arg_dict["shape"] = [(2, 3), (2, 4, 5, 6)] - arg_dict["device"] = ["cpu", "cuda"] - for arg in GenArgList(arg_dict): - arg[0](test_case, *arg[1:]) - @autotest(auto_backward=False) def test_ne_with_0shape_data(test_case): device = random_device() @@ -87,6 +32,23 @@ def test_ne_with_0shape_data(test_case): y3 = -x return (y1, y2, y3) + @autotest() + def test_tensor_negative_with_random_data(test_case): + x = random_pytorch_tensor().to(random_device()) + return x.negative() + + @autotest() + def test_negative_with_random_data(test_case): + x = random_pytorch_tensor().to(random_device()) + z = torch.negative(x) + return z + + @autotest() + def test_neg_with_random_data(test_case): + x = random_pytorch_tensor().to(random_device()) + z = torch.neg(x) + return z + if __name__ == "__main__": unittest.main() diff --git a/python/oneflow/test/modules/test_pooling.py b/python/oneflow/test/modules/test_pooling.py index e5e36c93863..4953d5477b5 100644 --- a/python/oneflow/test/modules/test_pooling.py +++ b/python/oneflow/test/modules/test_pooling.py @@ -28,14 +28,16 @@ def unpack_indices(dual_object): @flow.unittest.skip_unless_1n1d() class TestMaxPooling(flow.unittest.TestCase): - @autotest(n=20, auto_backward=True, rtol=1e-4, atol=1e-4) + @autotest(n=100, auto_backward=False) def test_maxpool1d_with_random_data(test_case): + return_indices = random().to(bool).value() m = torch.nn.MaxPool1d( kernel_size=random(4, 6).to(_size_1_t), stride=random(1, 3).to(_size_1_t) | nothing(), padding=random(1, 3).to(_size_1_t) | nothing(), dilation=random(2, 4).to(_size_1_t) | nothing(), - ceil_mode=random() | nothing(), + ceil_mode=random(), + return_indices=return_indices, ) m.train(random()) device = random_device() @@ -43,16 +45,21 @@ def test_maxpool1d_with_random_data(test_case): x = random_pytorch_tensor(ndim=3, dim2=random(20, 22)).to(device) y = m(x) - return y + if return_indices: + return unpack_indices(y) + else: + return y, y.sum().backward() - @autotest(n=20, auto_backward=True, rtol=1e-4, atol=1e-4) + @autotest(n=100, auto_backward=False) def test_maxpool2d_with_random_data(test_case): + return_indices = random().to(bool).value() m = torch.nn.MaxPool2d( kernel_size=random(4, 6).to(_size_2_t), stride=random(1, 3).to(_size_2_t) | nothing(), padding=random(1, 3).to(_size_2_t) | nothing(), dilation=random(2, 4).to(_size_2_t) | nothing(), - ceil_mode=random() | nothing(), + ceil_mode=random(), + return_indices=return_indices, ) m.train(random()) device = random_device() @@ -62,16 +69,21 @@ def test_maxpool2d_with_random_data(test_case): ) y = m(x) - return y + if return_indices: + return unpack_indices(y) + else: + return y, y.sum().backward() - @autotest(n=20, auto_backward=True, rtol=1e-4, atol=1e-4) + @autotest(n=100, auto_backward=False) def 
test_maxpool3d_with_random_data(test_case): + return_indices = random().to(bool).value() m = torch.nn.MaxPool3d( kernel_size=random(4, 6).to(_size_3_t), stride=random(1, 3).to(_size_3_t) | nothing(), padding=random(1, 3).to(_size_3_t) | nothing(), dilation=random(2, 4).to(_size_3_t) | nothing(), - ceil_mode=random() | nothing(), + ceil_mode=random(), + return_indices=return_indices, ) m.train(random()) device = random_device() @@ -81,7 +93,10 @@ def test_maxpool3d_with_random_data(test_case): ).to(device) y = m(x) - return y + if return_indices: + return unpack_indices(y) + else: + return y, y.sum().backward() if __name__ == "__main__": diff --git a/python/oneflow/test/modules/test_randint.py b/python/oneflow/test/modules/test_randint.py index 7a668417dab..122cd9ad097 100644 --- a/python/oneflow/test/modules/test_randint.py +++ b/python/oneflow/test/modules/test_randint.py @@ -22,7 +22,6 @@ import oneflow.unittest from test_util import GenArgList -from automated_test_util import * def _test_rand(test_case, device, shape, low, high): @@ -80,16 +79,14 @@ def _test_high(test_case, device, shape, low, high): def _test_0rank(test_case, device, shape, low, high): y1 = flow.randint(low, high, shape, device=flow.device(device)) - y2 = flow.randint(low, high, shape, device=flow.device(device)) - test_case.assertTrue(not np.allclose(y1.numpy(), y2.numpy(), atol=1e-4, rtol=1e-4)) - + test_case.assertTrue(y1.shape == shape) @flow.unittest.skip_unless_1n1d() class TestRandint(flow.unittest.TestCase): def test_consistent_naive(test_case): placement = flow.placement("cpu", {0: [0]}) sbp = (flow.sbp.broadcast,) - x = flow.randint(16, (10, 1), placement=placement, sbp=sbp) + x = flow.randint(0, 16, (10, 1), placement=placement, sbp=sbp) test_case.assertEqual(x.sbp, sbp) test_case.assertEqual(x.placement, placement) diff --git a/python/oneflow/test/modules/test_repeat.py b/python/oneflow/test/modules/test_repeat.py index 1346b4444ca..042c58c9815 100644 --- a/python/oneflow/test/modules/test_repeat.py +++ b/python/oneflow/test/modules/test_repeat.py @@ -13,177 +13,21 @@ See the License for the specific language governing permissions and limitations under the License. 
""" - import unittest -from collections import OrderedDict - -import numpy as np -from test_util import GenArgList import oneflow as flow import oneflow.unittest - - -def np_repeat(x, sizes): - return np.tile(x, sizes) - - -def _test_repeat_new_dim(test_case, device): - input = flow.Tensor( - np.random.randn(2, 4, 1, 3), dtype=flow.float32, device=flow.device(device) - ) - sizes = (4, 3, 2, 3, 3) - np_out = np_repeat(input.numpy(), sizes) - of_out = input.repeat(4, 3, 2, 3, 3) - test_case.assertTrue(np.array_equal(of_out.numpy(), np_out)) - - -def _test_repeat_input_list_new_dim(test_case, device): - input = flow.Tensor( - np.random.randn(2, 4, 1, 3), dtype=flow.float32, device=flow.device(device) - ) - sizes = (4, 3, 2, 3, 3) - np_out = np_repeat(input.numpy(), sizes) - of_out = input.repeat(sizes) - test_case.assertTrue(np.array_equal(of_out.numpy(), np_out)) - - -def _test_repeat_same_dim(test_case, device): - input = flow.Tensor( - np.random.randn(1, 2, 5, 3), dtype=flow.float32, device=flow.device(device) - ) - sizes = (4, 2, 3, 19) - of_out = input.repeat(4, 2, 3, 19) - np_out = np_repeat(input.numpy(), sizes) - test_case.assertTrue(np.array_equal(of_out.numpy(), np_out)) - - -def _test_repeat_same_dim_int(test_case, device): - input = flow.Tensor( - np.random.randn(1, 2, 5, 3), dtype=flow.int32, device=flow.device(device) - ) - size_tensor = flow.Tensor(np.random.randn(4, 2, 3, 19)) - sizes = size_tensor.size() - of_out = input.repeat(size_tensor.size()) - np_out = np_repeat(input.numpy(), sizes) - test_case.assertTrue(np.array_equal(of_out.numpy(), np_out.astype(np.int32))) - - -def _test_repeat_same_dim_int8(test_case, device): - input = flow.Tensor( - np.random.randn(1, 2, 5, 3), dtype=flow.int8, device=flow.device(device) - ) - size_tensor = flow.Tensor(np.random.randn(4, 2, 3, 19)) - sizes = size_tensor.size() - of_out = input.repeat(sizes) - np_out = np_repeat(input.numpy(), sizes) - test_case.assertTrue(np.array_equal(of_out.numpy(), np_out.astype(np.int32))) - - -def _test_repeat_new_dim_backward(test_case, device): - input = flow.Tensor( - np.random.randn(2, 4, 1, 3), - dtype=flow.float32, - device=flow.device(device), - requires_grad=True, - ) - sizes = (4, 3, 2, 3, 3) - of_out = input.repeat(4, 3, 2, 3, 3) - of_out = of_out.sum() - of_out.backward() - np_grad = [ - [ - [[216.0, 216.0, 216.0]], - [[216.0, 216.0, 216.0]], - [[216.0, 216.0, 216.0]], - [[216.0, 216.0, 216.0]], - ], - [ - [[216.0, 216.0, 216.0]], - [[216.0, 216.0, 216.0]], - [[216.0, 216.0, 216.0]], - [[216.0, 216.0, 216.0]], - ], - ] - test_case.assertTrue(np.array_equal(input.grad.numpy(), np_grad)) - - -def _test_repeat_same_dim_backward(test_case, device): - input = flow.Tensor( - np.random.randn(1, 2, 5, 3), - dtype=flow.float32, - device=flow.device(device), - requires_grad=True, - ) - of_out = input.repeat(1, 2, 3, 1) - of_out = of_out.sum() - of_out.backward() - np_grad = [ - [ - [ - [6.0, 6.0, 6.0], - [6.0, 6.0, 6.0], - [6.0, 6.0, 6.0], - [6.0, 6.0, 6.0], - [6.0, 6.0, 6.0], - ], - [ - [6.0, 6.0, 6.0], - [6.0, 6.0, 6.0], - [6.0, 6.0, 6.0], - [6.0, 6.0, 6.0], - [6.0, 6.0, 6.0], - ], - ] - ] - test_case.assertTrue(np.array_equal(input.grad.numpy(), np_grad)) - - -def _test_repeat_flow_size(test_case, device): - input = flow.Tensor( - np.random.randn(2, 4, 1, 3), - dtype=flow.float32, - device=flow.device(device), - requires_grad=True, - ) - sizes = flow.Size([4, 3, 2, 3, 3]) - of_out = input.repeat(sizes) - of_out = of_out.sum() - of_out.backward() - np_grad = [ - [ - [[216.0, 216.0, 216.0]], - [[216.0, 
216.0, 216.0]], - [[216.0, 216.0, 216.0]], - [[216.0, 216.0, 216.0]], - ], - [ - [[216.0, 216.0, 216.0]], - [[216.0, 216.0, 216.0]], - [[216.0, 216.0, 216.0]], - [[216.0, 216.0, 216.0]], - ], - ] - test_case.assertTrue(np.array_equal(input.grad.numpy(), np_grad)) +from automated_test_util import * @flow.unittest.skip_unless_1n1d() class TestRepeat(flow.unittest.TestCase): - def test_repeat(test_case): - arg_dict = OrderedDict() - arg_dict["test_fun"] = [ - _test_repeat_new_dim, - _test_repeat_same_dim, - _test_repeat_same_dim_int, - _test_repeat_same_dim_int8, - _test_repeat_new_dim_backward, - _test_repeat_same_dim_backward, - _test_repeat_flow_size, - _test_repeat_input_list_new_dim, - ] - arg_dict["device"] = ["cpu", "cuda"] - for arg in GenArgList(arg_dict): - arg[0](test_case, *arg[1:]) + @autotest() + def test_flow_tensor_repeat_with_random_data(test_case): + x = random_pytorch_tensor(ndim=2, dim0=1, dim1=2) + sizes = (random(1, 5).to(int), random(1, 5).to(int), random(1, 5).to(int)) + y = x.repeat(sizes) + return y if __name__ == "__main__": diff --git a/python/oneflow/test/modules/test_tensor_str.py b/python/oneflow/test/modules/test_tensor_str.py index 5c88acb2f2a..ad409a1f228 100644 --- a/python/oneflow/test/modules/test_tensor_str.py +++ b/python/oneflow/test/modules/test_tensor_str.py @@ -14,6 +14,7 @@ limitations under the License. """ +import os import unittest from collections import OrderedDict @@ -26,7 +27,6 @@ import oneflow -@flow.unittest.skip_unless_1n1d() def _test_local_tensor_str(test_case, device): # int dtype x = flow.tensor([[1, 2, 3], [4, 5, -6]], device=flow.device(device)) @@ -92,7 +92,6 @@ def _test_local_tensor_str(test_case, device): test_case.assertTrue("..." in tensor_str) -@flow.unittest.skip_unless_1n1d() def _test_consistent_tensor_str(test_case, device): placement = flow.placement(device, {0: range(1)}) # split consistent tensor @@ -109,7 +108,6 @@ def _test_consistent_tensor_str(test_case, device): x = flow.ones((10, 10), placement=placement, sbp=[flow.sbp.partial_sum]) tensor_str = str(x) test_case.assertTrue("1." in tensor_str) - test_case.assertTrue("1." in str(x[0][0])) # summarized consistent tensor x = flow.ones((100, 100), placement=placement, sbp=[flow.sbp.split(0)]) @@ -123,7 +121,6 @@ def _test_consistent_tensor_str(test_case, device): test_case.assertTrue("[]" in tensor_str) -@flow.unittest.skip_unless_1n2d() def _test_consistent_tensor_str_2d(test_case, device): placement = flow.placement(device, {0: range(2)}) x = flow.ones((10, 10), placement=placement, sbp=[flow.sbp.split(0)]) @@ -133,6 +130,8 @@ def _test_consistent_tensor_str_2d(test_case, device): x = flow.ones((10, 10), placement=placement, sbp=[flow.sbp.broadcast]) tensor_str = str(x) test_case.assertTrue("1." in tensor_str) + # TODO: x[0][0].to("cuda") has bug + # test_case.assertTrue("1." 
in str(x[0][0])) x = flow.ones((10, 10), placement=placement, sbp=[flow.sbp.partial_sum]) tensor_str = str(x) @@ -145,14 +144,35 @@ def _test_consistent_tensor_str_2d(test_case, device): class TestTensorStrModule(flow.unittest.TestCase): - def test_tensor_str(test_case): + @flow.unittest.skip_unless_1n1d() + def test_local_tensor_str_1n1d(test_case): arg_dict = OrderedDict() arg_dict["test_fun"] = [ _test_local_tensor_str, + ] + arg_dict["device"] = ["cpu", "cuda"] + for arg in GenArgList(arg_dict): + arg[0](test_case, *arg[1:]) + + @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases") + @flow.unittest.skip_unless_1n1d() + def test_consistent_tensor_str_1n1d(test_case): + arg_dict = OrderedDict() + arg_dict["test_fun"] = [ _test_consistent_tensor_str, + ] + arg_dict["device"] = ["cuda"] + for arg in GenArgList(arg_dict): + arg[0](test_case, *arg[1:]) + + @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases") + @flow.unittest.skip_unless_1n2d() + def test_tensor_str_1n2d(test_case): + arg_dict = OrderedDict() + arg_dict["test_fun"] = [ _test_consistent_tensor_str_2d, ] - arg_dict["device"] = ["cpu", "cuda"] + arg_dict["device"] = ["cuda"] for arg in GenArgList(arg_dict): arg[0](test_case, *arg[1:]) diff --git a/python/oneflow/test/modules/test_tile.py b/python/oneflow/test/modules/test_tile.py index ae9a94ed84e..e2985d8d426 100644 --- a/python/oneflow/test/modules/test_tile.py +++ b/python/oneflow/test/modules/test_tile.py @@ -13,167 +13,28 @@ See the License for the specific language governing permissions and limitations under the License. """ - import unittest -from collections import OrderedDict - -import numpy as np -from test_util import GenArgList import oneflow as flow import oneflow.unittest - - -def np_tile(x, sizes): - return np.tile(x, sizes) - - -def np_tile_grad(x, sizes): - times = np.array(sizes).prod() - return np.ones(shape=x.shape) * times - - -def _test_tile_less_dim_a(test_case, device): - input = flow.Tensor( - np.random.randn(2, 4, 1, 3), dtype=flow.float32, device=flow.device(device) - ) - sizes = (2,) - np_out = np_tile(input.numpy(), sizes) - of_out = input.tile(reps=sizes) - test_case.assertTrue(np.array_equal(of_out.numpy(), np_out)) - - -def _test_tile_less_dim_b(test_case, device): - input = flow.Tensor( - np.random.randn(3, 2, 5), dtype=flow.float32, device=flow.device(device) - ) - sizes = (3, 4) - np_out = np_tile(input.numpy(), sizes) - of_out = input.tile(reps=sizes) - test_case.assertTrue(np.array_equal(of_out.numpy(), np_out)) - - -def _test_tile_less_dim_c(test_case, device): - input = flow.Tensor( - np.random.randn(4, 3, 2, 5, 3), dtype=flow.float32, device=flow.device(device) - ) - sizes = (2, 3, 4, 4) - np_out = np_tile(input.numpy(), sizes) - of_out = input.tile(reps=sizes) - test_case.assertTrue(np.array_equal(of_out.numpy(), np_out)) - - -def _test_tile_same_dim(test_case, device): - input = flow.Tensor( - np.random.randn(1, 2, 5, 3), dtype=flow.float32, device=flow.device(device) - ) - sizes = (4, 2, 3, 19) - of_out = input.tile(reps=sizes) - np_out = np_tile(input.numpy(), sizes) - test_case.assertTrue(np.array_equal(of_out.numpy(), np_out)) - - -def _test_tile_same_dim_int(test_case, device): - input = flow.Tensor( - np.random.randn(1, 2, 5, 3), dtype=flow.int32, device=flow.device(device) - ) - size_tensor = flow.Tensor(np.random.randn(4, 2, 3, 19)) - sizes = size_tensor.size() - of_out = input.tile(reps=sizes) - np_out = np_tile(input.numpy(), sizes) - 
test_case.assertTrue(np.array_equal(of_out.numpy(), np_out.astype(np.int32))) - - -def _test_tile_same_dim_int8(test_case, device): - input = flow.Tensor( - np.random.randn(1, 2, 5, 3), dtype=flow.int8, device=flow.device(device) - ) - size_tensor = flow.Tensor(np.random.randn(4, 2, 3, 19)) - sizes = size_tensor.size() - of_out = input.tile(reps=sizes) - np_out = np_tile(input.numpy(), sizes) - test_case.assertTrue(np.array_equal(of_out.numpy(), np_out.astype(np.int32))) - - -def _test_tile_less_dim_a_backward(test_case, device): - input = flow.Tensor( - np.random.randn(2, 4, 1, 3), - dtype=flow.float32, - device=flow.device(device), - requires_grad=True, - ) - sizes = (2,) - of_out = input.tile(reps=sizes) - of_out = of_out.sum() - of_out.backward() - np_grad = np_tile_grad(input.numpy(), sizes) - test_case.assertTrue(np.array_equal(input.grad.numpy(), np_grad)) - - -def _test_tile_less_dim_b_backward(test_case, device): - input = flow.Tensor( - np.random.randn(3, 2, 5), - dtype=flow.float32, - device=flow.device(device), - requires_grad=True, - ) - sizes = (3, 4) - of_out = input.tile(reps=sizes) - of_out = of_out.sum() - of_out.backward() - np_grad = np_tile_grad(input.numpy(), sizes) - test_case.assertTrue(np.array_equal(input.grad.numpy(), np_grad)) - - -def _test_tile_less_dim_c_backward(test_case, device): - input = flow.Tensor( - np.random.randn(4, 3, 2, 5, 3), - dtype=flow.float32, - device=flow.device(device), - requires_grad=True, - ) - sizes = (2, 3, 4, 4) - of_out = input.tile(reps=sizes) - of_out = of_out.sum() - of_out.backward() - np_grad = np_tile_grad(input.numpy(), sizes) - test_case.assertTrue(np.array_equal(input.grad.numpy(), np_grad)) - - -def _test_tile_same_dim_backward(test_case, device): - input = flow.Tensor( - np.random.randn(1, 2, 5, 3), - dtype=flow.float32, - device=flow.device(device), - requires_grad=True, - ) - sizes = (1, 2, 3, 1) - of_out = input.tile(reps=sizes) - of_out = of_out.sum() - of_out.backward() - np_grad = np_tile_grad(input.numpy(), sizes) - test_case.assertTrue(np.array_equal(input.grad.numpy(), np_grad)) +from automated_test_util import * @flow.unittest.skip_unless_1n1d() class TestTile(flow.unittest.TestCase): - def test_tile(test_case): - arg_dict = OrderedDict() - arg_dict["test_fun"] = [ - _test_tile_less_dim_a, - _test_tile_less_dim_b, - _test_tile_less_dim_c, - _test_tile_same_dim, - _test_tile_same_dim_int, - _test_tile_same_dim_int8, - _test_tile_less_dim_a_backward, - _test_tile_less_dim_b_backward, - _test_tile_less_dim_c_backward, - _test_tile_same_dim_backward, - ] - arg_dict["device"] = ["cpu", "cuda"] - for arg in GenArgList(arg_dict): - arg[0](test_case, *arg[1:]) + @autotest() + def test_flow_tile_with_random_data(test_case): + x = random_pytorch_tensor(ndim=2, dim0=1, dim1=2) + reps = (random(1, 5).to(int), random(1, 5).to(int), random(1, 5).to(int)) + z = torch.tile(x, reps) + return z + + @autotest() + def test_flow_tensor_tile_with_random_data(test_case): + x = random_pytorch_tensor(ndim=2, dim0=1, dim1=2) + reps = (random(1, 5).to(int), random(1, 5).to(int), random(1, 5).to(int)) + y = x.tile(reps) + return y if __name__ == "__main__": diff --git a/python/oneflow/test_utils/automated_test_util/generators.py b/python/oneflow/test_utils/automated_test_util/generators.py index 7970529aa6f..b6889b39088 100644 --- a/python/oneflow/test_utils/automated_test_util/generators.py +++ b/python/oneflow/test_utils/automated_test_util/generators.py @@ -26,6 +26,8 @@ import oneflow as flow py_tuple = tuple +NoneType = type(None) + 
TEST_MODULE = 0 TEST_FLOW = 1 TEST_TENSOR = 2 diff --git a/python/oneflow/test_utils/automated_test_util/torch_flow_dual_object.py b/python/oneflow/test_utils/automated_test_util/torch_flow_dual_object.py index b8ed340816c..e60b842ee1c 100644 --- a/python/oneflow/test_utils/automated_test_util/torch_flow_dual_object.py +++ b/python/oneflow/test_utils/automated_test_util/torch_flow_dual_object.py @@ -306,9 +306,9 @@ def check_equality(dual_object: DualObject, rtol=0.0001, atol=1e-05): break assert checker is not None, ( "checker not found for type " - + type(dual_object.pytorch) + + str(type(dual_object.pytorch)) + " and " - + type(dual_object.oneflow) + + str(type(dual_object.oneflow)) ) return checker(dual_object.pytorch, dual_object.oneflow, rtol, atol) @@ -324,7 +324,7 @@ def check_tensor_equality(torch_tensor, flow_tensor, rtol=0.0001, atol=1e-05): flow_grad = flow_tensor.grad.numpy() if not np.allclose(torch_grad, flow_grad, rtol=rtol, atol=atol): print( - "Grads are not equal. PyTorch grad: \n{torch_grad}\n, OneFlow grad: \n{flow_grad}" + f"Grads are not equal. PyTorch grad: \n{torch_grad}\n, OneFlow grad: \n{flow_grad}" ) return False equality_res = np.allclose( diff --git a/tools/cfg/template/template.cfg.cpp b/tools/cfg/template/template.cfg.cpp index 8d5486989d2..79598a3b824 100644 --- a/tools/cfg/template/template.cfg.cpp +++ b/tools/cfg/template/template.cfg.cpp @@ -872,6 +872,10 @@ void {{ util.class_name(cls) }}::CopyFrom(const {{ util.class_name(cls) }}& othe CopyFrom(other); return *this; } +{{ util.class_name(cls) }}& {{ util.class_name(cls) }}::operator=({{ util.class_name(cls) }}&& other) { + data_ = std::move(other.data_); + return *this; +} {% for field in util.message_type_fields(cls) %} {% if util.field_has_required_or_optional_label(field) %} diff --git a/tools/cfg/template/template.cfg.h b/tools/cfg/template/template.cfg.h index 8c43dadc48f..c9b7590ea1b 100644 --- a/tools/cfg/template/template.cfg.h +++ b/tools/cfg/template/template.cfg.h @@ -342,6 +342,7 @@ class {{ util.class_name(cls) }} final : public Const{{ util.class_name(cls) }} void Clear(); void CopyFrom(const {{ util.class_name(cls) }}& other); {{ util.class_name(cls) }}& operator=(const {{ util.class_name(cls) }}& other); + {{ util.class_name(cls) }}& operator=({{ util.class_name(cls) }}&& other); {% for field in util.message_type_fields(cls) %} {% if util.field_has_required_or_optional_label(field) %}
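
Editor's note on the `template.cfg.{h,cpp}` hunks above: the code generator's template now also emits a move-assignment operator, so `x = std::move(y)` on a generated cfg object transfers the underlying `data_` handle instead of silently falling back to the deep-copying copy assignment. Below is a minimal sketch of what the rendered code could look like for a hypothetical generated class `ExampleConf`. The class name, the `Data` struct, and the constructor are illustrative assumptions (the real name comes from `util.class_name(cls)` and the payload from the generated fields); only the bodies of the two assignment operators mirror the template verbatim.

#include <memory>
#include <utility>

class ExampleConf final {
 public:
  ExampleConf() : data_(std::make_shared<Data>()) {}

  void CopyFrom(const ExampleConf& other) {
    // Deep copy, as in the generated CopyFrom(): allocate a fresh payload
    // so the two objects do not share state afterwards.
    data_ = std::make_shared<Data>(*other.data_);
  }

  // Pre-existing generated copy assignment: deep copy via CopyFrom().
  ExampleConf& operator=(const ExampleConf& other) {
    CopyFrom(other);
    return *this;
  }

  // Newly generated move assignment: steals the payload handle instead of
  // cloning it, matching the template body `data_ = std::move(other.data_);`.
  ExampleConf& operator=(ExampleConf&& other) {
    data_ = std::move(other.data_);
    return *this;
  }

 private:
  struct Data { int field_ = 0; };  // stand-in for the generated message fields
  std::shared_ptr<Data> data_;
};

With the move overload in place, assigning from a temporary (e.g. `conf = MakeExampleConf();`, where `MakeExampleConf` is a hypothetical factory) reuses the temporary's payload rather than allocating and copying a new one; the copy-assignment path is unchanged.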