NNGraph input/output valid by register tensors #6240
@@ -36,6 +36,23 @@ limitations under the License.
 
 namespace oneflow {
 
+namespace {
+
+Maybe<bool> GetTensorValidInCurRank(const std::shared_ptr<one::Tensor>& tensor) {
+  if (tensor->is_consistent()) {
+    const auto& parallel_id = JUST(GetParallelId4CurrentProcessCtx(JUST(tensor->parallel_desc())));
+    if (parallel_id->has_value()) {
+      return true;
+    } else {
+      return false;
+    }
+  } else {
+    return true;
+  }
+}
+
+}  // namespace
+
 NNGraph::~NNGraph() {
   VLOG(2) << "graph destructor Try to close c nn graph name " << name_ << "." << std::endl;
   CHECK_JUST(Close());

Review comment (on GetTensorValidInCurRank): This checks whether the tensor has a component on the current rank, which is what "valid" means here.

Review comment (on the parallel_id check): parallel_id looks like a rank id within a placement. If the current global rank does not belong to the tensor's placement, the lookup yields an empty in-placement rank id, meaning the tensor has no component on this global rank. Could this "has a component on the current rank" check be wrapped into a dedicated interface later? That might be more direct.

Reply: Yes.
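
A sketch of the wrapper suggested in the review above, reusing only calls that already appear in this diff; the name HasComponentOnCurrentRank is hypothetical and not part of this PR:

// Hypothetical wrapper for the "has a component on the current rank" check
// suggested in the review; a sketch only, not code from this PR.
Maybe<bool> HasComponentOnCurrentRank(const std::shared_ptr<one::Tensor>& tensor) {
  // Local (mirrored) tensors always have a component on the current rank.
  if (!tensor->is_consistent()) { return true; }
  // A consistent tensor has one iff the current rank maps to a parallel id
  // inside the tensor's placement.
  const auto& parallel_id =
      JUST(GetParallelId4CurrentProcessCtx(JUST(tensor->parallel_desc())));
  return parallel_id->has_value();
}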
@@ -57,15 +74,41 @@ const std::vector<std::string>& NNGraph::inputs_op_names() const { return input_
 
 const std::vector<std::string>& NNGraph::outputs_op_names() const { return output_op_names_; }
 
+const std::vector<bool>& NNGraph::inputs_valid() const { return input_tensors_valid_; }
+
+const std::vector<bool>& NNGraph::outputs_valid() const { return output_tensors_valid_; }
+
 int64_t NNGraph::variable_op_size() const { return variable_op_name2eager_blob_.size(); }
 
-Maybe<void> NNGraph::RegisterInputOpNames(const std::vector<std::string>& input_op_names) {
+Maybe<void> NNGraph::RegisterInputOpNamesAndTensors(
+    const std::vector<std::string>& input_op_names,
+    const std::vector<std::shared_ptr<one::Tensor>>& input_tensors) {
+  CHECK_EQ_OR_RETURN(input_op_names.size(), input_tensors.size());
+  CHECK_OR_RETURN(input_op_names_.empty())
+      << " The input tensors of nn.Graph " << name_ << " are registered repeatedly.";
+  CHECK_OR_RETURN(input_tensors_valid_.empty());
   input_op_names_.assign(input_op_names.begin(), input_op_names.end());
+  input_tensors_valid_.reserve(input_tensors.size());
+  for (const auto& input_tensor : input_tensors) {
+    input_tensors_valid_.push_back(JUST(GetTensorValidInCurRank(input_tensor)));
+  }
+  CHECK_EQ_OR_RETURN(input_tensors_valid_.size(), input_tensors.size());
   return Maybe<void>::Ok();
 }
 
-Maybe<void> NNGraph::RegisterOutputOpNames(const std::vector<std::string>& output_op_names) {
+Maybe<void> NNGraph::RegisterOutputOpNamesAndTensors(
+    const std::vector<std::string>& output_op_names,
+    const std::vector<std::shared_ptr<one::Tensor>>& output_tensors) {
+  CHECK_EQ_OR_RETURN(output_op_names.size(), output_tensors.size());
+  CHECK_OR_RETURN(output_op_names_.empty())
+      << " The output tensors of nn.Graph " << name_ << " are registered repeatedly.";
+  CHECK_OR_RETURN(output_tensors_valid_.empty());
   output_op_names_.assign(output_op_names.begin(), output_op_names.end());
+  output_tensors_valid_.reserve(output_tensors.size());
+  for (const auto& output_tensor : output_tensors) {
+    output_tensors_valid_.push_back(JUST(GetTensorValidInCurRank(output_tensor)));
+  }
+  CHECK_EQ_OR_RETURN(output_tensors_valid_.size(), output_tensors.size());
   return Maybe<void>::Ok();
 }
 
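The review discussion above turns on how a global rank maps to an optional in-placement parallel id. Below is a self-contained sketch of those semantics; Placement and ParallelIdForRank are stand-ins invented for illustration, not OneFlow APIs:

#include <algorithm>
#include <cstdint>
#include <iostream>
#include <optional>
#include <vector>

// Stand-in for a placement: the global ranks participating in it, in order.
using Placement = std::vector<int64_t>;

// Maps a global rank to its id within the placement, or nullopt when the
// rank is not part of the placement (no component of the tensor lives there).
std::optional<int64_t> ParallelIdForRank(const Placement& placement, int64_t global_rank) {
  auto it = std::find(placement.begin(), placement.end(), global_rank);
  if (it == placement.end()) { return std::nullopt; }
  return it - placement.begin();
}

int main() {
  const Placement p0 = {0};     // placement over rank 0 only
  const Placement pt = {0, 1};  // placement over ranks 0 and 1
  // Rank 1 has no component under p0, so a tensor placed there is invalid on rank 1 ...
  std::cout << ParallelIdForRank(p0, 1).has_value() << "\n";  // prints 0
  // ... but it does have one under pt, so such a tensor is valid on rank 1.
  std::cout << ParallelIdForRank(pt, 1).has_value() << "\n";  // prints 1
  return 0;
}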
@@ -0,0 +1,92 @@

"""
Copyright 2020 The OneFlow Authors. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""

import os
import time
import unittest
import numpy as np

import oneflow as flow
import oneflow.unittest


def _test_graph_buffer_limit(test_case):
    class StageLayerModule(flow.nn.Module):
        def __init__(self):
            super().__init__()
            self.linear1 = flow.nn.Linear(10, 8, False)
            self.linear2 = flow.nn.Linear(8, 10, False)
            flow.nn.init.constant_(self.linear1.weight, 0.023)
            flow.nn.init.constant_(self.linear2.weight, 1.23)

        def forward(self, x):
            out0 = self.linear1(x)
            out0 = out0 + 1.0
            out0 = out0 * 2.0
            out1 = self.linear2(out0)
            return out1

    P0 = flow.placement("cuda", {0: [0]})
    P1 = flow.placement("cuda", {0: [1]})
    PT = flow.placement("cuda", {0: [0, 1]})
    B = flow.sbp.broadcast

    class PipelineModule(flow.nn.Module):
        def __init__(self):
            super().__init__()
            self.layer_0 = StageLayerModule()
            self.layer_1 = StageLayerModule()
            self.layer_0.to_consistent(P0, B)
            self.layer_1.to_consistent(P1, B)

        def forward(self, x):
            # stage 0
            in0 = x.to_consistent(P0, B)
            out0 = self.layer_0(in0)
            # stage 1
            in1 = out0.to_consistent(P1, B)
            out1 = self.layer_1(in1)
            return out1

    pp_m = PipelineModule()
    pp_m.eval()

    class PipelineGraph(flow.nn.Graph):
        def __init__(self):
            super().__init__()
            self.pp_m = pp_m

        def build(self, x):
            return self.pp_m(x)

    pp_g = PipelineGraph()

    for i in range(500):
        x = flow.randn(16, 10)
        x = x.to_consistent(P0, B)
        out = pp_g(x)
        # print(out.to_local().mean())


@unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases")
@flow.unittest.skip_unless_1n2d()
class TestGraphPipelineBufferLimit(oneflow.unittest.TestCase):
    def test_graph_buffer_limit(test_case):
        _test_graph_buffer_limit(test_case)


if __name__ == "__main__":
    unittest.main()
Review comment: This is the core logic: based on the inputs/outputs_valid interfaces provided by NNGraphIf, it decides whether to skip the Send Push/Pull callbacks.
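
A minimal self-contained sketch of that decision; NNGraphIfLike and ForEachValidPush are hypothetical stand-ins for the runtime wiring, not the actual implementation:

#include <functional>
#include <iostream>
#include <string>
#include <vector>

// Stand-in for the relevant slice of NNGraphIf: op names plus the per-op
// validity computed at registration time.
struct NNGraphIfLike {
  std::vector<std::string> input_op_names;
  std::vector<bool> inputs_valid;  // true iff the tensor has a local component
};

// Runs the push callback only for inputs that are valid on the current rank;
// invalid ones (no local component) are skipped instead of being sent.
void ForEachValidPush(const NNGraphIfLike& graph,
                      const std::function<void(const std::string&)>& push_cb) {
  for (size_t i = 0; i < graph.input_op_names.size(); ++i) {
    if (!graph.inputs_valid.at(i)) { continue; }  // skip the Push CB on this rank
    push_cb(graph.input_op_names.at(i));
  }
}

int main() {
  // Input "x" has a component on this rank, input "y" does not.
  const NNGraphIfLike graph{{"x", "y"}, {true, false}};
  ForEachValidPush(graph, [](const std::string& op) {
    std::cout << "push " << op << "\n";  // prints only: push x
  });
  return 0;
}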