Feature/insert reduce_op to parallel exe #10096
Changes from 23 commits
File: paddle/fluid/framework/details/broadcast_op_handle.cc
@@ -19,11 +19,9 @@
namespace paddle {
namespace framework {
namespace details {
BroadcastOpHandle::BroadcastOpHandle(const std::vector<Scope *> &local_scopes,
                                     const std::vector<platform::Place> &places)
    : local_scopes_(local_scopes), places_(places) {}

void BroadcastOpHandle::RunImpl() {
  if (places_.size() == 1) return;
  // the input and output may have dummy var.
  VarHandle *in_var_handle;
@@ -55,27 +53,93 @@ void BroadcastOpHandle::RunImpl() {

  Tensor &in_tensor = VariableVisitor::GetMutableTensor(in_var);

  for (auto *out : out_var_handles) {
    if (*out == *in_var_handle) {
      continue;
  if (!use_nccl_ || platform::is_cpu_place(in_tensor.place())) {
    for (auto *out : out_var_handles) {
      if (*out == *in_var_handle) {
        continue;
      }

      auto &out_p = out->place_;
      auto *out_var = var_scopes.at(out->scope_idx_)->FindVar(out->name_);
      PADDLE_ENFORCE_NOT_NULL(out_var);
      PADDLE_ENFORCE_EQ(out_p.which(), in_tensor.place().which(),
                        "Places must be all on CPU or all on CUDA.");

      VariableVisitor::ShareDimsAndLoD(*in_var, out_var);
      VariableVisitor::GetMutableTensor(out_var).mutable_data(out_p,
                                                              in_tensor.type());

      auto dev_ctx = dev_ctxes_.at(out_p);
      RunAndRecordEvent(out_p, [in_tensor, out_var, dev_ctx, out_p] {
        paddle::framework::TensorCopy(
            in_tensor, out_p, *(dev_ctx),
            &VariableVisitor::GetMutableTensor(out_var));
      });
    }
  } else {
#ifdef PADDLE_WITH_CUDA
    PADDLE_ENFORCE(platform::is_gpu_place(in_tensor.place()));
    VarHandle *out_handle;
    int root = boost::get<platform::CUDAPlace>(in_tensor.place()).device;
    std::vector<std::function<void()>> all_reduce_calls;

    for (size_t j = 0; j < out_var_handles.size(); ++j) {
      auto *out = out_var_handles[j];
      auto *out_var = var_scopes.at(out->scope_idx_)->FindVar(out->name_);

      if (*out != *in_var_handle) {
Review discussion:
- "Are you comparing VarHandle instead of VarHandle*? Is that correct?"
- "Line 90 compares the instances, not the pointers. The pointers of the input (in_var_handle) and the output (out) must be different, because the output VarHandle is always newly created."
- "Thanks, I will correct it later."
(An illustrative sketch of the two comparison styles follows this hunk.)
        PADDLE_ENFORCE_NOT_NULL(out_var);
        PADDLE_ENFORCE_EQ(out->place_.which(), in_tensor.place().which(),
                          "Places must be all on CPU or all on CUDA.");
        VariableVisitor::ShareDimsAndLoD(*in_var, out_var);
        VariableVisitor::GetMutableTensor(out_var).mutable_data(
            out->place_, in_tensor.type());
      }

      auto out_p = out->place_;
      int dev_id = boost::get<platform::CUDAPlace>(out_p).device;

      auto &nccl_ctx = nccl_ctxs_->at(dev_id);
      auto stream = nccl_ctx.stream();
      auto comm = nccl_ctx.comm_;

      void *send_recv_buffer = nullptr;
      if (root == dev_id) {
        send_recv_buffer = const_cast<void *>(in_tensor.data<void>());
        out_handle = out;
      } else {
        send_recv_buffer =
            VariableVisitor::GetMutableTensor(out_var).mutable_data(
                out->place_);
      }

      int type = platform::ToNCCLDataType(in_tensor.type());
      all_reduce_calls.emplace_back([=] {
Review discussion:
- "It is 'broadcast', not 'all_reduce'?" (the vector is named all_reduce_calls but collects ncclBcast calls)
- "Yes, thanks!"
        PADDLE_ENFORCE(platform::dynload::ncclBcast(
            send_recv_buffer, in_tensor.numel(),
            static_cast<ncclDataType_t>(type), root, comm, stream));
      });
    }
    auto &out_p = out->place_;
    auto *out_var = var_scopes.at(out->scope_idx_)->FindVar(out->name_);
    PADDLE_ENFORCE_NOT_NULL(out_var);
    PADDLE_ENFORCE_EQ(out_p.which(), in_var_handle->place_.which(),
                      "Places must be all on CPU or all on CUDA.");

    VariableVisitor::ShareDimsAndLoD(*in_var, out_var);
    VariableVisitor::GetMutableTensor(out_var).mutable_data(out_p,
                                                            in_tensor.type());

    auto dev_ctx = dev_ctxes_.at(out_p);
    RunAndRecordEvent(out_p, [in_tensor, out_var, dev_ctx, out_p] {
      paddle::framework::TensorCopy(
          in_tensor, out_p, *(dev_ctx),
          &VariableVisitor::GetMutableTensor(out_var));
    this->RunAndRecordEvent([&] {
      {
        platform::NCCLGroupGuard guard;
        for (auto &call : all_reduce_calls) {
          call();
        }
      }
      if (*out_handle != *in_var_handle) {
Review discussion:
- "What is out_handle? What is the usage of this code block? Can you add some comments to explain it?"
- "What is the root device, and what are the input and output of the root device?"
- "Sorry, the description above is a bit obscure. I regard the source device, which sends the data to the other devices, as the root node."
(A hedged sketch of the ncclBcast root semantics follows this hunk.)
        auto out_var = var_scopes.at(in_var_handle->scope_idx_)
                           ->FindVar(out_var_handles[0]->name_);
        paddle::framework::TensorCopy(
            in_tensor, in_var_handle->place_,
            *(dev_ctxes_.at(in_var_handle->place_)),
            &VariableVisitor::GetMutableTensor(out_var));
      }
    });
#else
    PADDLE_THROW("CUDA is not support.");
#endif
  }
}
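Aside on the VarHandle comparison discussed above: RunImpl deliberately compares handles by value (*out == *in_var_handle) rather than by pointer, because every output VarHandle is a freshly created object, so pointer equality could never hold. Below is a minimal, self-contained sketch of the difference; the Handle struct is a simplified stand-in, not the actual Paddle VarHandle.

#include <cassert>
#include <cstddef>
#include <string>

// Simplified stand-in for a variable handle: identified by name, scope index
// and version. The real paddle::framework::details::VarHandle carries more
// fields; this only illustrates value vs. pointer comparison.
struct Handle {
  std::string name;
  std::size_t scope_idx;
  int version;
  bool operator==(const Handle &other) const {
    return name == other.name && scope_idx == other.scope_idx &&
           version == other.version;
  }
  bool operator!=(const Handle &other) const { return !(*this == other); }
};

int main() {
  Handle *in = new Handle{"input", 0, 1};
  Handle *out = new Handle{"input", 0, 1};  // a separate, newly created object

  // Pointer comparison is always false here, because the output handle is a
  // distinct allocation; it cannot detect "same logical variable".
  assert(!(in == out));

  // Value comparison is what RunImpl relies on to skip copying a variable
  // onto the place it already lives on.
  assert(*in == *out);

  delete in;
  delete out;
  return 0;
}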
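On the root-device questions above: with ncclBcast, every participating device issues the same call; the device whose id equals root sends its buffer and all the others receive into theirs. That is why RunImpl collects one call per output place and launches them inside an NCCLGroupGuard. Below is a hedged, stand-alone sketch of that grouped-broadcast pattern using plain NCCL and the CUDA runtime; it assumes two visible GPUs and is not the Paddle op handle itself.

// Minimal sketch of the broadcast pattern used in the NCCL branch above:
// every device issues the same ncclBcast call inside one group; the device
// whose id equals `root` acts as the sender, the others receive in place.
#include <cuda_runtime.h>
#include <nccl.h>
#include <cstdio>
#include <vector>

int main() {
  const int ndev = 2;          // assumes 2 visible GPUs
  const int root = 0;          // the "source" device that sends its data
  const size_t count = 1024;

  int dev_ids[ndev] = {0, 1};
  std::vector<ncclComm_t> comms(ndev);
  ncclCommInitAll(comms.data(), ndev, dev_ids);

  std::vector<float *> bufs(ndev);
  std::vector<cudaStream_t> streams(ndev);
  for (int i = 0; i < ndev; ++i) {
    cudaSetDevice(i);
    cudaMalloc(&bufs[i], count * sizeof(float));
    cudaStreamCreate(&streams[i]);
  }

  // One ncclBcast per device, grouped so they form a single collective --
  // mirroring the all_reduce_calls / NCCLGroupGuard loop in RunImpl.
  ncclGroupStart();
  for (int i = 0; i < ndev; ++i) {
    ncclBcast(bufs[i], count, ncclFloat, root, comms[i], streams[i]);
  }
  ncclGroupEnd();

  for (int i = 0; i < ndev; ++i) {
    cudaSetDevice(i);
    cudaStreamSynchronize(streams[i]);
    cudaStreamDestroy(streams[i]);
    cudaFree(bufs[i]);
    ncclCommDestroy(comms[i]);
  }
  std::printf("broadcast from device %d finished\n", root);
  return 0;
}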
File: paddle/fluid/framework/details/broadcast_op_handle.h
@@ -24,14 +24,35 @@
#include "paddle/fluid/framework/selected_rows.h"
#include "paddle/fluid/platform/device_context.h"

#ifdef PADDLE_WITH_CUDA
#include "paddle/fluid/platform/nccl_helper.h"
#endif

namespace paddle {
namespace framework {
namespace details {

struct BroadcastOpHandle : public OpHandleBase {
 public:
#ifdef PADDLE_WITH_CUDA
  BroadcastOpHandle(const std::vector<Scope *> &local_scopes,
                    const std::vector<platform::Place> &places, bool use_nccl,
                    const platform::NCCLContextMap *nccl_ctxs)
      : local_scopes_(local_scopes),
        places_(places),
        use_nccl_(use_nccl),
        nccl_ctxs_(nccl_ctxs) {
    if (nccl_ctxs_) {
      for (auto &p_ctx : nccl_ctxs_->contexts_) {
        dev_ctxes_[platform::CUDAPlace(p_ctx.first)] = p_ctx.second.ctx_.get();
      }
    }
  }
#else
  BroadcastOpHandle(const std::vector<Scope *> &local_scopes,
                    const std::vector<platform::Place> &places);
                    const std::vector<platform::Place> &places, bool use_nccl)
Review discussion:
- "use_nccl_ should be false here, so we don't need the use_nccl argument?"
- "Right, that's why we don't need the use_nccl argument for the CPU-only case?"
- "Thanks for pointing this out. I added this parameter to compare the performance of plain memory copying against ncclBcast; ncclBcast turned out to be faster, so use_nccl can be removed."
(A minimal sketch of the constructors without the flag follows this file's diff.)
      : local_scopes_(local_scopes), places_(places), use_nccl_(use_nccl) {}
#endif

  std::string Name() const override;
@@ -44,6 +65,10 @@ struct BroadcastOpHandle : public OpHandleBase {
 private:
  const std::vector<Scope *> &local_scopes_;
  const std::vector<platform::Place> &places_;
  bool use_nccl_;
#ifdef PADDLE_WITH_CUDA
  const platform::NCCLContextMap *nccl_ctxs_;
#endif
};
}  // namespace details
}  // namespace framework
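Following up on the use_nccl thread above, here is a hedged sketch of how the constructors could look once the flag is dropped, as the review suggests. The names below (SimpleScope, SimplePlace, BroadcastHandleSketch, WITH_GPU_SKETCH) are stand-ins so the example compiles on its own; they are not Paddle types or macros.

#include <vector>

struct SimpleScope {};
struct SimplePlace { int device = -1; };

// Illustrative only: the GPU build always takes the communicator map and
// always uses NCCL; the CPU-only build needs no flag at all.
struct BroadcastHandleSketch {
#ifdef WITH_GPU_SKETCH
  BroadcastHandleSketch(const std::vector<SimpleScope *> &scopes,
                        const std::vector<SimplePlace> &places,
                        const void *nccl_ctxs)
      : scopes_(scopes), places_(places), nccl_ctxs_(nccl_ctxs) {}
#else
  // CPU-only build: broadcasting always falls back to TensorCopy-style
  // memory copies, so no use_nccl parameter is required.
  BroadcastHandleSketch(const std::vector<SimpleScope *> &scopes,
                        const std::vector<SimplePlace> &places)
      : scopes_(scopes), places_(places), nccl_ctxs_(nullptr) {}
#endif

 private:
  std::vector<SimpleScope *> scopes_;
  std::vector<SimplePlace> places_;
  const void *nccl_ctxs_;
};

int main() {
  std::vector<SimpleScope *> scopes;
  std::vector<SimplePlace> places;
  BroadcastHandleSketch handle(scopes, places);  // CPU-only form
  (void)handle;
  return 0;
}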
File: paddle/fluid/framework/details/broadcast_op_handle_test.cc
@@ -35,15 +35,25 @@ struct TestBroadcastOpHandle {
  std::unique_ptr<OpHandleBase> op_handle_;
  std::vector<std::unique_ptr<VarHandleBase>> vars_;
  std::vector<p::Place> gpu_list_;
  bool use_gpu_;
#ifdef PADDLE_WITH_CUDA
  std::unique_ptr<platform::NCCLContextMap> nccl_ctxs_;
#endif

  void WaitAll() {
    for (size_t j = 0; j < ctxs_.size(); ++j) {
      ctxs_[j]->Wait();
    }
#ifdef PADDLE_WITH_CUDA
    if (nccl_ctxs_) {
      nccl_ctxs_->WaitAll();
    }
#endif
  }

  void InitCtxOnGpu(bool use_gpu) {
    if (use_gpu) {
    use_gpu_ = use_gpu;
    if (use_gpu_) {
#ifdef PADDLE_WITH_CUDA
      int count = p::GetCUDADeviceCount();
      if (count <= 1) {

@@ -57,6 +67,7 @@ struct TestBroadcastOpHandle {
        gpu_list_.push_back(p);
        ctxs_.emplace_back(new p::CUDADeviceContext(p));
      }
      nccl_ctxs_.reset(new platform::NCCLContextMap(gpu_list_));
#else
      PADDLE_THROW("CUDA is not support.");
#endif

@@ -67,6 +78,9 @@ struct TestBroadcastOpHandle {
        gpu_list_.push_back(p);
        ctxs_.emplace_back(new p::CPUDeviceContext(p));
      }
#ifdef PADDLE_WITH_CUDA
      nccl_ctxs_.reset(nullptr);
#endif
    }
  }
@@ -82,7 +96,21 @@ struct TestBroadcastOpHandle {
    }
    param_scopes_[input_scope_idx]->Var("input");

    op_handle_.reset(new BroadcastOpHandle(local_scopes_, gpu_list_));
#ifdef PADDLE_WITH_CUDA
    op_handle_.reset(new BroadcastOpHandle(local_scopes_, gpu_list_, use_gpu_,
                                           nccl_ctxs_.get()));
#endif

    if (use_gpu_) {
#ifndef PADDLE_WITH_CUDA
Review discussion:
- "There are so many #ifdefs here; can we improve this block of code?"
(One possible consolidation is sketched after this hunk.)
      PADDLE_THROW("CUDA is not support.");
#endif
    } else {
#ifndef PADDLE_WITH_CUDA
      op_handle_.reset(
          new BroadcastOpHandle(local_scopes_, gpu_list_, use_gpu_));
#endif
    }

    auto* in_var_handle =
        new VarHandle(1, input_scope_idx, "input", gpu_list_[input_scope_idx]);
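Regarding the #ifdef question above, one possible (purely illustrative) consolidation is to branch once on PADDLE_WITH_CUDA instead of mixing #ifdef and #ifndef blocks inside the if/else. The sketch below is self-contained; MakeHandle is a stand-in, not a Paddle or test helper.

#include <cstdio>
#include <stdexcept>

// Illustrative only: a single PADDLE_WITH_CUDA switch covering both the GPU
// and CPU construction paths of the test fixture.
void MakeHandle(bool use_gpu) {
#ifdef PADDLE_WITH_CUDA
  // CUDA build: one constructor (taking the NCCL context map, omitted here)
  // handles both the GPU and the CPU test paths.
  std::printf("construct BroadcastOpHandle(..., use_gpu=%d, nccl_ctxs)\n",
              static_cast<int>(use_gpu));
#else
  // CPU-only build: requesting GPU is the only error case left.
  if (use_gpu) {
    throw std::runtime_error("CUDA is not supported in this build.");
  }
  std::printf("construct BroadcastOpHandle(...) for CPU\n");
#endif
}

int main() {
  MakeHandle(false);
  return 0;
}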
@@ -97,7 +125,9 @@ struct TestBroadcastOpHandle {
    op_handle_->AddInput(dummy_var_handle);

    for (size_t j = 0; j < gpu_list_.size(); ++j) {
      op_handle_->SetDeviceContext(gpu_list_[j], ctxs_[j].get());
      if (!use_gpu_) {
        op_handle_->SetDeviceContext(gpu_list_[j], ctxs_[j].get());
      }
      VarHandle* out_var_handle = new VarHandle(2, j, "out", gpu_list_[j]);
      vars_.emplace_back(out_var_handle);
      op_handle_->AddOutput(out_var_handle);
Review discussion:
- "No need for the parentheses around dev_ctx?"
- "Have removed them."