
Auto Parallel #8891

Merged: 61 commits, Sep 27, 2022
85ac376
add auto_parallel code
wyg1997 Mar 18, 2022
40e791f
Feat ap remove hierarchy cast (#7919)
wyg1997 Mar 30, 2022
1006247
Fix add conv grad cost (#7972)
wyg1997 Apr 8, 2022
b49b953
Auto parallel/fast collector (#7958)
Yipeng1994 Apr 27, 2022
d92cd6b
AutoParallel mainstem algorithm add mutable_op_ctrl_edge (#8033)
wyg1997 May 5, 2022
50f478c
fix(AutoParallel): fix pooling computation cost function bug (#8147)
wyg1997 May 6, 2022
c919dce
[WIP] Fix auto parallel dump uniform sbp bug (#8330)
wyg1997 Jun 2, 2022
f3fc750
update auto_parallel config (#8356)
wyg1997 Jun 2, 2022
5d76b85
Refactor dump nd sbp for auto parallel (#8353)
wyg1997 Jun 9, 2022
98353b7
rename Global to Singleton
wyg1997 Jul 4, 2022
f26b68e
Refactor SbpEdge (#8684)
wyg1997 Jul 20, 2022
b5cc87b
Refactor auto parallel sbp node (#8712)
Yipeng1994 Jul 21, 2022
4e8aebb
Refactor auto parallel sbp graph (#8722)
Yipeng1994 Jul 22, 2022
56d70f8
Refactor auto parallel rest (#8731)
Yipeng1994 Jul 25, 2022
a183c7c
fix merge conflict
wyg1997 Jul 28, 2022
f4093ff
Remove template for sbp signature (#8787)
Yipeng1994 Aug 1, 2022
7587e8c
Refactor auto parallel class object stuff (#8835)
Yipeng1994 Aug 4, 2022
accb933
Fix auto parallel copy cost infer2 (#8788)
Yipeng1994 Aug 5, 2022
5ddc991
Refactor prune identity as much as possible (#8849)
Yipeng1994 Aug 5, 2022
a2db39d
Fix auto parallel low throughput (#8876)
Yipeng1994 Aug 8, 2022
6642e74
Refactor auto parallel final check (#8887)
Yipeng1994 Aug 9, 2022
7a47afb
Merge branch 'master' into feat-auto_parallel
wyg1997 Aug 9, 2022
96237b3
Merge branch 'master' into feat-auto_parallel
Yipeng1994 Aug 10, 2022
afd8a96
Docs auto parallel doc (#8896)
wyg1997 Aug 10, 2022
a3e0886
Merge remote-tracking branch 'origin/master' into feat-auto_parallel
wyg1997 Aug 12, 2022
9d1105f
Merge branch 'master' into feat-auto_parallel
Yipeng1994 Aug 12, 2022
d0a834e
Merge remote-tracking branch 'origin/master' into feat-auto_parallel
wyg1997 Aug 15, 2022
e0d1770
Test alexnet for auto_parallel (#8917)
wyg1997 Aug 16, 2022
bf0da26
Fix get sbp bug (#8939)
Yipeng1994 Aug 18, 2022
929d42d
Merge branch 'master' into feat-auto_parallel
Yipeng1994 Aug 22, 2022
010de9c
Resolve confits while merging master
Yipeng1994 Aug 22, 2022
41a2835
Recompute cost with time shape (#9009)
Yipeng1994 Aug 29, 2022
4d91a0b
Address comments
Yipeng1994 Aug 29, 2022
f77441b
Merge branch 'master' into feat-auto_parallel
Yipeng1994 Aug 29, 2022
a6ba01b
fix merge conflict
wyg1997 Aug 29, 2022
480afbb
Address comments
Yipeng1994 Sep 6, 2022
512e17e
Disabled ZeRO when enabled AutoParallel (#9087)
wyg1997 Sep 14, 2022
f1d22ba
Update oneflow/core/job_rewriter/optimizer_placement_optimization_pas…
wyg1997 Sep 15, 2022
2c5b3f8
Address comments
Yipeng1994 Sep 19, 2022
99efb17
Address comment.
Yipeng1994 Sep 20, 2022
c5872f3
Merge branch 'master' into feat-auto_parallel
Yipeng1994 Sep 21, 2022
22f557f
Update oneflow/core/job_rewriter/auto_parallel.cpp
Yipeng1994 Sep 21, 2022
76dac2b
New interface for pr#9018
Yipeng1994 Sep 21, 2022
2102158
Static analysis
Yipeng1994 Sep 21, 2022
af49e8d
Merge branch 'master' into feat-auto_parallel
mergify[bot] Sep 21, 2022
3942334
Merge branch 'master' into feat-auto_parallel
mergify[bot] Sep 21, 2022
7fc2f99
Fix ones like sbp bug and fix test import error in CI (#9123)
wyg1997 Sep 21, 2022
20d7199
Merge branch 'master' into feat-auto_parallel
wyg1997 Sep 21, 2022
145049e
auto format by CI
oneflow-ci-bot Sep 21, 2022
e332dfd
test(AutoParallel): skip acc check
wyg1997 Sep 21, 2022
82c910e
Merge branch 'master' into feat-auto_parallel
wyg1997 Sep 22, 2022
0f0e25b
Address comments
Yipeng1994 Sep 26, 2022
194c79f
rename source op set nd_sbp function and add check
wyg1997 Sep 27, 2022
c6e3f91
fix typo
wyg1997 Sep 27, 2022
6052f44
Feat full auto parallel (#9140)
Yipeng1994 Sep 27, 2022
16d39c2
add debugg log for non-deleted cast ops
wyg1997 Sep 27, 2022
9eff2ca
Merge branch 'feat-auto_parallel' of github.com:oneflow-inc/oneflow i…
wyg1997 Sep 27, 2022
083a623
update prune parallel cast op log
wyg1997 Sep 27, 2022
9144a5b
rename auto_parallel_prune_parallel_cast_ops to enable_auto_parallel_…
wyg1997 Sep 27, 2022
234e988
Merge branch 'master' into feat-auto_parallel
wyg1997 Sep 27, 2022
5e2014c
Merge branch 'master' into feat-auto_parallel
mergify[bot] Sep 27, 2022
70 changes: 70 additions & 0 deletions docs/source/auto_parallel.rst
@@ -0,0 +1,70 @@
Auto Parallelism
====================================================

As the scale of deep-learning models grows larger and larger, distributed training,
or parallelism, is needed. Data parallelism and model parallelism have been designed
to speed up training and to alleviate memory pressure.

In OneFlow, the SBP signature enables users to configure a parallelism policy easily.
However, users still need to specify the SBP property for each operator, or at least most of them.
Users might spend days digging into the details of parallelism, only to get low
throughput because of a slight mistake in the configuration of the SBP signature.

.. note::

    Auto parallelism only works in :doc:`graph` mode.


Our strength
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

To get rid of all those configurations of SBP signatures, we developed auto parallelism.
Configurations of placement are still necessary, however, as auto placement is not
supported yet. If you are reading this paragraph before rushing into any SBP stuff, then
congratulations: you do not need to learn SBP. You can start writing your code just as you
would in CPU mode. Auto parallelism will generate a fast strategy customized for your
specific model, the size of its parameters, and the number of available GPUs.


How to use auto parallelism?
^^^^^^^^^^^^^^^^^^^^^^^^^^^^

You only need to enable the corresponding configuration setting in your model
of :doc:`graph` .

Example::

    import oneflow as flow

    class SubclassGraph(flow.nn.Graph):
        def __init__(self):
            super().__init__()  # MUST be called
            # auto parallelism configuration
            self.config.enable_auto_parallel(True)
            # other configurations about auto parallelism
            # ......

        def build(self):
            pass

.. warning::

    If you enable auto parallelism, OneFlow will take care of the SBP configurations
    of operators except for explicit ``to_global`` calls.


Configuration API for auto parallelism
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

.. currentmodule:: oneflow.nn.graph.graph_config.GraphConfig

.. autosummary::
    :toctree: generated
    :nosignatures:

    enable_auto_parallel
    enable_auto_parallel_prune_parallel_cast_ops
    set_auto_parallel_computation_cost_ratio
    set_auto_parallel_wait_time
    enable_auto_parallel_mainstem_algo
    enable_auto_parallel_sbp_collector
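As a sketch only, the snippet below shows where the knobs listed above would be set on a ``nn.Graph`` subclass. The class name ``TunedGraph`` and every argument value are placeholders for illustration, not recommended defaults.

```python
import oneflow as flow

class TunedGraph(flow.nn.Graph):
    def __init__(self):
        super().__init__()
        # Turn auto parallelism on; the remaining calls use placeholder values.
        self.config.enable_auto_parallel(True)
        self.config.enable_auto_parallel_mainstem_algo(True)
        self.config.enable_auto_parallel_sbp_collector(True)
        self.config.set_auto_parallel_computation_cost_ratio(0.05)
        self.config.set_auto_parallel_wait_time(1.65e4)

    def build(self):
        pass
```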

1 change: 1 addition & 0 deletions docs/source/index.rst
@@ -30,6 +30,7 @@ OneFlow upholds the core concept and architecture of static compilation and stre
nn.init
optim
graph
auto_parallel
image
utils.data
one_embedding
33 changes: 33 additions & 0 deletions oneflow/core/auto_parallel/algorithm_util.cpp
@@ -0,0 +1,33 @@
/*
Copyright 2020 The OneFlow Authors. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

#include "oneflow/core/auto_parallel/algorithm_util.h"

namespace oneflow {
namespace auto_parallel {

// Inverse function of order.
// The reason why we need the inverse_order, a.k.a. id2order, instead of id2value is to
// eliminate equality. For example, suppose v[0] < v[1] = v[2] < v[3]. We cannot tell whether
// v[1] comes before or after v[2] with comp(v[1], v[2]). But if we transfer it to the order
// order[0] < order[1] < order[2] < order[3], we know the strict order.
void InverseOrder(const std::vector<int32_t>& order, std::vector<int32_t>& inverse_order) {
  inverse_order.resize(order.size());
  for (int32_t i = 0; i < order.size(); i++) { inverse_order[order[i]] = i; }
}

} // namespace auto_parallel
} // namespace oneflow
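A minimal Python sketch (not the OneFlow code itself) of the `DecideOrder`/`InverseOrder` idea: sort indices instead of values, then invert the permutation so every element gets a strict, distinct rank even where values tie.

```python
def decide_order(v):
    """Return order such that v[order[i]] <= v[order[j]] for all i < j."""
    return sorted(range(len(v)), key=lambda i: v[i])

def inverse_order(order):
    """inverse_order[k] is the rank of element k in the sorted order."""
    inv = [0] * len(order)
    for rank, idx in enumerate(order):
        inv[idx] = rank
    return inv

v = [3, 7, 7, 9]            # v[1] == v[2]: comp() cannot separate them
order = decide_order(v)     # indices sorted by value
rank = inverse_order(order) # strict ranks; the tie is broken deterministically
```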
82 changes: 82 additions & 0 deletions oneflow/core/auto_parallel/algorithm_util.h
@@ -0,0 +1,82 @@
/*
Copyright 2020 The OneFlow Authors. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#ifndef ONEFLOW_CORE_AUTO_PARALLEL_ALGORITHM_UTIL_H_
#define ONEFLOW_CORE_AUTO_PARALLEL_ALGORITHM_UTIL_H_

#include <vector>
#include <cstdlib>
#include <algorithm>
#include <unordered_map>

namespace oneflow {
namespace auto_parallel {

// This function removes the i-th element from a vector in constant time.
// The vector should not care about ordering.
// Be careful with this function: when removing multiple elements, make sure
// that you traverse the vector from back to front.
template<class T>
void RemoveFrom(std::vector<T>& v, int32_t i) {
  v[i] = v.back();
  v.pop_back();
}

template<class T>
void CheckAndRemoveFrom(std::vector<T>& v, T& t) {
  for (int32_t i = v.size() - 1; i >= 0; i--) {
    if (v[i] == t) {
      RemoveFrom<T>(v, i);
      break;
    }
  }
}

// Inverse function, which transfers a vector to an unordered_map.
template<class T>
void InverseFunction(const std::vector<T>& v, std::unordered_map<T, int32_t>& inverse_map) {
  inverse_map.clear();
  for (int32_t i = 0; i < v.size(); i++) { inverse_map[v[i]] = i; }
}

// When you want to sort something but cannot move any elements, use order.
// DecideOrder determines the order of sorting for a list v, such that
//   v[order[i]] < v[order[j]] for all i < j.
// With a user-defined comparison, we have
//   comp(v[order[i]], v[order[j]]) == true for all i < j.
template<class T, class Compare>
void DecideOrder(const T& v, std::vector<int32_t>& order, const Compare& comp) {
  // Initialize order with the identity permutation
  order.resize(v.size());
  for (int32_t i = 0; i < v.size(); i++) { order[i] = i; }
  // Sort the indices instead of the values
  std::sort(order.begin(), order.end(), [&](int32_t i, int32_t j) { return comp(v[i], v[j]); });
}

// Inverse function of order.
// The reason why we need the inverse_order, a.k.a. id2order, instead of id2value is to
// eliminate equality. For example, suppose v[0] < v[1] = v[2] < v[3]. We cannot tell whether
// v[1] comes before or after v[2] with comp(v[1], v[2]). But if we transfer it to the order
// order[0] < order[1] < order[2] < order[3], we know the strict order.
void InverseOrder(const std::vector<int32_t>& order, std::vector<int32_t>& inverse_order);

} // namespace auto_parallel

static const double float_deviation_minus = 0.9999999;
static const double float_deviation_plus = 1.0000001;

} // namespace oneflow

#endif // ONEFLOW_CORE_AUTO_PARALLEL_ALGORITHM_UTIL_H_
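The swap-with-back trick in `RemoveFrom` can be sketched in Python (as an illustration, not the OneFlow code): the removed slot is overwritten by the last element, so removal is O(1) at the cost of ordering.

```python
def remove_from(v, i):
    """O(1) unordered removal: overwrite v[i] with the last element, then pop."""
    v[i] = v[-1]
    v.pop()

v = [10, 20, 30, 40]
remove_from(v, 1)   # 20 is replaced by 40
# v is now [10, 40, 30]: order is not preserved, which is the trade-off.
# This is also why multi-element removal must walk from back to front:
# removing index i may move an unvisited element into position i.
```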
143 changes: 143 additions & 0 deletions oneflow/core/auto_parallel/binary_set.cpp
@@ -0,0 +1,143 @@
/*
Copyright 2020 The OneFlow Authors. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#include "oneflow/core/auto_parallel/binary_set.h"

namespace oneflow {
namespace auto_parallel {

// A static function for initialization of the log_2 mapping
std::unordered_map<kBinarySetEntryType, int32_t> BinarySet::InitLog2() {
  std::unordered_map<kBinarySetEntryType, int32_t> log_2;
  for (int32_t i = 0; i < BinarySet::bit_entry_type_; i++) {
    log_2[(kBinarySetEntryType)1 << i] = i;
  }
  return log_2;
}

// Initialization of log_2 mapping
const std::unordered_map<kBinarySetEntryType, int32_t> BinarySet::log_2_ = BinarySet::InitLog2();

// Constructor
BinarySet::BinarySet(int32_t size_of_set) : size_of_set_(size_of_set) {
  int32_t k = (size_of_set - 1) / bit_entry_type_ + 1;
  binary_set_values_.resize(k, 0);
}

// Initialization if needed
void BinarySet::Initialize(int32_t size_of_set) {
  size_of_set_ = size_of_set;
  int32_t k = (size_of_set - 1) / bit_entry_type_ + 1;
  binary_set_values_.resize(k, 0);
}

// Clear all the elements in the set
void BinarySet::Clear() { binary_set_values_.assign(binary_set_values_.size(), 0); }

// Check if the i-th element is in this subset
int32_t BinarySet::CheckExistence(int32_t i) const {
  int32_t k = i / bit_entry_type_;
  int32_t j = i % bit_entry_type_;
  return (binary_set_values_[k] >> j) & 1;
}

// Add the i-th element into this subset
void BinarySet::AddEntry(int32_t i) {
  int32_t k = i / bit_entry_type_;
  int32_t j = i % bit_entry_type_;
  binary_set_values_[k] |= ((kBinarySetEntryType)1 << j);
}
// Take the i-th element out from this subset
void BinarySet::DeleteEntry(int32_t i) {
  int32_t k = i / bit_entry_type_;
  int32_t j = i % bit_entry_type_;
  binary_set_values_[k] &= ~((kBinarySetEntryType)1 << j);
}
// Get the union with another subset and store it into u
void BinarySet::UnionTo(const BinarySet& bs, BinarySet& u) {
  for (int32_t k = 0; k < binary_set_values_.size(); k++) {
    u.binary_set_values_[k] = binary_set_values_[k] | bs.binary_set_values_[k];
  }
}
// Whether this binary set intersects another one
bool BinarySet::IfIntersect(const BinarySet& bs) const {
  int32_t min_bs_size = std::min(binary_set_values_.size(), bs.binary_set_values_.size());
  for (int32_t k = 0; k < min_bs_size; k++) {
    if (binary_set_values_[k] & bs.binary_set_values_[k]) { return true; }
  }
  return false;
}
// Get the intersection with another subset and store it into i
void BinarySet::IntersectionTo(const BinarySet& bs, BinarySet& i) const {
  int32_t min_bs_size = std::min(binary_set_values_.size(), bs.binary_set_values_.size());
  if (min_bs_size > i.binary_set_values_.size()) { i.binary_set_values_.resize(min_bs_size, 0); }
  for (int32_t k = 0; k < min_bs_size; k++) {
    i.binary_set_values_[k] = binary_set_values_[k] & bs.binary_set_values_[k];
  }
}
// Count number of elements in this subset
int32_t BinarySet::Total() const {
  int32_t t = 0;
  for (int32_t k = 0; k < binary_set_values_.size(); k++) {
    kBinarySetEntryType bsv = binary_set_values_[k];
    bsv = (bsv & 0x5555555555555555) + ((bsv >> 1) & 0x5555555555555555);
    bsv = (bsv & 0x3333333333333333) + ((bsv >> 2) & 0x3333333333333333);
    bsv = (bsv & 0x0F0F0F0F0F0F0F0F) + ((bsv >> 4) & 0x0F0F0F0F0F0F0F0F);
    bsv = (bsv & 0x00FF00FF00FF00FF) + ((bsv >> 8) & 0x00FF00FF00FF00FF);
    bsv = (bsv & 0x0000FFFF0000FFFF) + ((bsv >> 16) & 0x0000FFFF0000FFFF);
    // bsv = (bsv & 0x00000000FFFFFFFF) + ((bsv >> 32) & 0x00000000FFFFFFFF);
    t += int32_t(bsv);
  }
  return t;
}

// Output all the elements in the subset
void BinarySet::OutPut(std::vector<int32_t>& out) const {
  out.clear();
  for (int32_t i = 0; i < size_of_set_; i++) {
    if (CheckExistence(i)) { out.emplace_back(i); }
  }
}

// Output all the elements in the subset, skipping over empty words
void BinarySet::QuickOutPut(std::vector<int32_t>& out) const {
  out.clear();
  for (int32_t i = 0; i < binary_set_values_.size(); i++) {
    kBinarySetEntryType x = binary_set_values_[i];
    kBinarySetEntryType y = 0;
    while (x) {
      y = x;
      x &= x - 1;  // clear the lowest set bit; y - x isolates it
      out.emplace_back(i * BinarySet::bit_entry_type_ + log_2_.find(y - x)->second);
    }
  }
}

// Add elements of input into this subset
void BinarySet::AddEntries(std::vector<int32_t>& in) {
  for (int32_t i : in) { AddEntry(i); }
}

// Whether two binary sets are equal to each other
bool BinarySet::operator==(const BinarySet& rhs) const {
  if (size_of_set_ != rhs.size_of_set_) { return false; }
  for (int32_t i = 0; i < binary_set_values_.size(); i++) {
    if (binary_set_values_[i] != rhs.binary_set_values_[i]) { return false; }
  }
  return true;
}

} // namespace auto_parallel
} // namespace oneflow
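The word-packed bitset above can be sketched in Python (illustration only, not the OneFlow code): element i lives at bit i % 64 of word i // 64, and enumeration peels the lowest set bit with `x &= x - 1`, just as `QuickOutPut` does.

```python
WORD = 64  # bits per word, matching a 64-bit entry type

def add_entry(words, i):
    """Set bit i of the packed bitset."""
    words[i // WORD] |= 1 << (i % WORD)

def check_existence(words, i):
    """Return 1 if bit i is set, else 0."""
    return (words[i // WORD] >> (i % WORD)) & 1

def quick_output(words):
    """Enumerate set bits word by word, clearing the lowest set bit each step."""
    out = []
    for k, x in enumerate(words):
        while x:
            low = x & -x              # isolate the lowest set bit
            x &= x - 1                # clear it
            out.append(k * WORD + low.bit_length() - 1)
    return out

words = [0, 0]                        # room for a 128-element set
for i in (3, 64, 70):
    add_entry(words, i)
```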