-
Notifications
You must be signed in to change notification settings - Fork 5.5k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Fix sendrecv port bind #9595
Merged
typhoonzero
merged 4 commits into
PaddlePaddle:develop
from
typhoonzero:fix_test_sendrecv_portbind
Apr 4, 2018
Merged
Fix sendrecv port bind #9595
Changes from 1 commit
Commits
Show all changes
4 commits
Select commit
Hold shift + click to select a range
File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -12,185 +12,145 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
See the License for the specific language governing permissions and | ||
limitations under the License. */ | ||
|
||
#include <stdint.h> | ||
#include <ostream> | ||
#include <thread> | ||
|
||
#include "paddle/fluid/framework/executor.h" | ||
#include "paddle/fluid/framework/lod_tensor.h" | ||
#include "paddle/fluid/framework/op_registry.h" | ||
#include "paddle/fluid/framework/threadpool.h" | ||
#include "paddle/fluid/operators/detail/grpc_server.h" | ||
#include "paddle/fluid/operators/listen_and_serv_op.h" | ||
|
||
namespace paddle { | ||
namespace operators { | ||
|
||
constexpr char kOptimizeBlock[] = "OptimizeBlock"; | ||
|
||
void RunServer(std::shared_ptr<detail::AsyncGRPCServer> service) { | ||
service->RunSyncUpdate(); | ||
VLOG(4) << "RunServer thread end"; | ||
} | ||
|
||
static void CreateTensorFromMessageType(framework::Variable *var, | ||
sendrecv::VarType var_type) { | ||
if (var_type == sendrecv::VarType::LOD_TENSOR) { | ||
var->GetMutable<framework::LoDTensor>(); | ||
} else if (var_type == sendrecv::VarType::SELECTED_ROWS) { | ||
var->GetMutable<framework::SelectedRows>(); | ||
} else { | ||
PADDLE_THROW( | ||
"VariableMessage type %d is not in " | ||
"[LoDTensor, SelectedRows]", | ||
var_type); | ||
} | ||
ListenAndServOp::ListenAndServOp(const std::string &type, | ||
const framework::VariableNameMap &inputs, | ||
const framework::VariableNameMap &outputs, | ||
const framework::AttributeMap &attrs) | ||
: OperatorBase(type, inputs, outputs, attrs) {} | ||
|
||
int ListenAndServOp::GetSelectedPort() { | ||
return rpc_service_->GetSelectedPort(); | ||
} | ||
|
||
static void ParallelExecuteBlocks(const std::vector<size_t> &parallel_blkids, | ||
framework::Executor *executor, | ||
framework::ProgramDesc *program, | ||
framework::Scope *scope) { | ||
std::vector<std::future<void>> fs; | ||
for (size_t idx : parallel_blkids) { | ||
fs.push_back(framework::Async([&executor, &program, &scope, idx]() { | ||
int run_block = idx; // thread local | ||
try { | ||
executor->Run(*program, scope, run_block, false, false); | ||
} catch (std::exception &e) { | ||
LOG(ERROR) << "run sub program error " << e.what(); | ||
} | ||
})); | ||
} | ||
for (size_t i = 0; i < fs.size(); ++i) fs[i].wait(); | ||
void ListenAndServOp::Stop() { | ||
rpc_service_->Push(LISTEN_TERMINATE_MESSAGE); | ||
server_thread_->join(); | ||
} | ||
|
||
class ListenAndServOp : public framework::OperatorBase { | ||
public: | ||
ListenAndServOp(const std::string &type, | ||
const framework::VariableNameMap &inputs, | ||
const framework::VariableNameMap &outputs, | ||
const framework::AttributeMap &attrs) | ||
: OperatorBase(type, inputs, outputs, attrs) { | ||
if (!rpc_service_) { | ||
std::string endpoint = Attr<std::string>("endpoint"); | ||
rpc_service_.reset(new detail::AsyncGRPCServer(endpoint)); | ||
server_thread_.reset(new std::thread(RunServer, rpc_service_)); | ||
} | ||
} | ||
void ListenAndServOp::RunImpl(const framework::Scope &scope, | ||
const platform::Place &dev_place) const { | ||
platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); | ||
auto &dev_ctx = *pool.Get(dev_place); | ||
framework::Scope &recv_scope = scope.NewScope(); | ||
LOG(INFO) << "created recv scope: " << &recv_scope; | ||
|
||
void Stop() override { | ||
rpc_service_->Push(LISTEN_TERMINATE_MESSAGE); | ||
server_thread_->join(); | ||
if (!rpc_service_) { | ||
std::string endpoint = Attr<std::string>("endpoint"); | ||
rpc_service_.reset(new detail::AsyncGRPCServer(endpoint)); | ||
} | ||
|
||
void RunImpl(const framework::Scope &scope, | ||
const platform::Place &dev_place) const override { | ||
platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); | ||
auto &dev_ctx = *pool.Get(dev_place); | ||
framework::Scope &recv_scope = scope.NewScope(); | ||
|
||
// FIXME(Yancey1989): initialize rpc server with lazy mode. | ||
rpc_service_->SetScope(&recv_scope); | ||
rpc_service_->SetDevCtx(&dev_ctx); | ||
auto ins = Inputs("X"); | ||
auto fan_in = Attr<int>("Fanin"); | ||
|
||
auto *block = Attr<framework::BlockDesc *>(kOptimizeBlock); | ||
auto *program = block->Program(); | ||
int num_blocks = program->Size(); | ||
PADDLE_ENFORCE_GE(num_blocks, 2, | ||
"server program should have at least 2 blocks"); | ||
|
||
framework::Executor executor(dev_place); | ||
|
||
// TODO(qiao) set proper fields for table lookup and update | ||
rpc_service_->SetExecutor(&executor); | ||
rpc_service_->SetPrefetchBlkdId(0); | ||
rpc_service_->SetProgram(program); | ||
|
||
// TODO(typhoonzero): change this to a while_op for every cluster-batch. | ||
bool exit_flag = false; | ||
// Record received sparse variables, so that | ||
// we can reset them after executing the optimize program | ||
std::vector<framework::Variable *> sparse_vars; | ||
while (!exit_flag) { | ||
// Get from multiple trainers, we don't care about the order in which | ||
// the gradients arrive, just add suffix 0~n and merge the gradient. | ||
rpc_service_->SetCond(0); | ||
size_t recv_var_cnt = 0; | ||
int batch_barrier = 0; | ||
while (batch_barrier != fan_in) { | ||
const detail::ReceivedMessage v = rpc_service_->Get(); | ||
auto recv_var_name = v.first; | ||
if (recv_var_name == LISTEN_TERMINATE_MESSAGE) { | ||
LOG(INFO) << "received terminate message and exit"; | ||
exit_flag = true; | ||
break; | ||
} else if (recv_var_name == BATCH_BARRIER_MESSAGE) { | ||
VLOG(3) << "recv batch barrier message"; | ||
batch_barrier++; | ||
continue; | ||
} else { | ||
VLOG(3) << "received grad: " << recv_var_name; | ||
recv_var_cnt++; | ||
auto var = v.second->GetVar(); | ||
if (var == nullptr) { | ||
LOG(ERROR) << "Can not find server side var: " << recv_var_name; | ||
PADDLE_THROW("Can not find server side var"); | ||
} | ||
if (var->IsType<framework::SelectedRows>()) { | ||
sparse_vars.push_back(var); | ||
} | ||
} | ||
} | ||
if (exit_flag) { | ||
rpc_service_->SetCond(1); | ||
rpc_service_->ShutDown(); | ||
auto ins = Inputs("X"); | ||
auto fan_in = Attr<int>("Fanin"); | ||
auto *block = Attr<framework::BlockDesc *>(kOptimizeBlock); | ||
auto *program = block->Program(); | ||
size_t num_blocks = program->Size(); | ||
PADDLE_ENFORCE_GE(num_blocks, 2, | ||
"server program should have at least 2 blocks"); | ||
|
||
framework::Executor executor(dev_place); | ||
|
||
// FIXME(Yancey1989): initialize rpc server with lazy mode. | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This line is not useful. |
||
rpc_service_->SetScope(&recv_scope); | ||
rpc_service_->SetDevCtx(&dev_ctx); | ||
// TODO(qiao) set proper fields for table lookup and update | ||
rpc_service_->SetExecutor(&executor); | ||
rpc_service_->SetPrefetchBlkdId(0); | ||
rpc_service_->SetProgram(program); | ||
// start the server listening after all members are initialized. | ||
server_thread_.reset(new std::thread(RunServer, rpc_service_)); | ||
// FIXME(typhoonzero): do we need to wait until the server port is ready? | ||
sleep(5); | ||
|
||
// TODO(typhoonzero): change this to a while_op for every cluster-batch. | ||
bool exit_flag = false; | ||
// Record received sparse variables, so that | ||
// we can reset them after executing the optimize program | ||
std::vector<framework::Variable *> sparse_vars; | ||
while (!exit_flag) { | ||
// Get from multiple trainers, we don't care about the order in which | ||
// the gradients arrive, just add suffix 0~n and merge the gradient. | ||
rpc_service_->SetCond(0); | ||
size_t recv_var_cnt = 0; | ||
int batch_barrier = 0; | ||
while (batch_barrier != fan_in) { | ||
const detail::ReceivedMessage v = rpc_service_->Get(); | ||
auto recv_var_name = v.first; | ||
if (recv_var_name == LISTEN_TERMINATE_MESSAGE) { | ||
LOG(INFO) << "received terminate message and exit"; | ||
exit_flag = true; | ||
break; | ||
} | ||
|
||
// NOTE: if is_gpu_place, CUDA kernels are launched by multiple threads | ||
// and this will still work. | ||
|
||
// The optimize blocks which have the same parent ID would run parallel | ||
// TODO(Yancey1989): need to use ParallelExecutor in the future | ||
size_t last_parent_blkid = program->Block(1).Parent(); | ||
std::vector<size_t> parallel_blkids; | ||
parallel_blkids.push_back(1); | ||
double ts = detail::GetTimestamp(); | ||
for (size_t blkid = 2; blkid < num_blocks; ++blkid) { | ||
if (program->Block(blkid).Parent() != last_parent_blkid) { | ||
for (size_t idx : parallel_blkids) VLOG(3) << idx; | ||
ParallelExecuteBlocks(parallel_blkids, &executor, program, | ||
&recv_scope); | ||
parallel_blkids.clear(); | ||
last_parent_blkid = program->Block(blkid).Parent(); | ||
} else if (recv_var_name == BATCH_BARRIER_MESSAGE) { | ||
VLOG(3) << "recv batch barrier message"; | ||
batch_barrier++; | ||
continue; | ||
} else { | ||
VLOG(3) << "received grad: " << recv_var_name; | ||
recv_var_cnt++; | ||
auto var = v.second->GetVar(); | ||
if (var == nullptr) { | ||
LOG(ERROR) << "Can not find server side var: " << recv_var_name; | ||
PADDLE_THROW("Can not find server side var"); | ||
} | ||
if (var->IsType<framework::SelectedRows>()) { | ||
sparse_vars.push_back(var); | ||
} | ||
parallel_blkids.push_back(blkid); | ||
} | ||
ParallelExecuteBlocks(parallel_blkids, &executor, program, &recv_scope); | ||
|
||
VLOG(3) << "run all blocks spent " << detail::GetTimestamp() - ts | ||
<< "(ms)"; | ||
|
||
// Reset the received sparse variables; the sum operator would not | ||
// sum the input sparse variables whose rows are empty at the next | ||
// mini-batch. | ||
// TODO(Yancey1989): move the reset action into an operator, we shouldn't | ||
// have any hidden logic in the operator. | ||
for (auto &var : sparse_vars) { | ||
var->GetMutable<framework::SelectedRows>()->mutable_rows()->clear(); | ||
} | ||
} | ||
if (exit_flag) { | ||
rpc_service_->SetCond(1); | ||
// FIXME(typhoonzero): use another condition to sync wait clients get. | ||
rpc_service_->WaitClientGet(fan_in); | ||
sparse_vars.clear(); | ||
} // while(true) | ||
} | ||
rpc_service_->ShutDown(); | ||
break; | ||
} | ||
|
||
protected: | ||
std::shared_ptr<detail::AsyncGRPCServer> rpc_service_; | ||
std::shared_ptr<std::thread> server_thread_; | ||
}; | ||
// NOTE: if is_gpu_place, CUDA kernels are launched by multiple threads | ||
// and this will still work. | ||
|
||
// The optimize blocks which have the same parent ID would run parallel | ||
// TODO(Yancey1989): need to use ParallelExecutor in the future | ||
int32_t last_parent_blkid = program->Block(1).Parent(); | ||
std::vector<size_t> parallel_blkids; | ||
parallel_blkids.push_back(1); | ||
double ts = detail::GetTimestamp(); | ||
for (size_t blkid = 2; blkid < num_blocks; ++blkid) { | ||
if (program->Block(blkid).Parent() != last_parent_blkid) { | ||
for (size_t idx : parallel_blkids) VLOG(3) << idx; | ||
ParallelExecuteBlocks(parallel_blkids, &executor, program, &recv_scope); | ||
parallel_blkids.clear(); | ||
last_parent_blkid = program->Block(blkid).Parent(); | ||
} | ||
parallel_blkids.push_back(blkid); | ||
} | ||
ParallelExecuteBlocks(parallel_blkids, &executor, program, &recv_scope); | ||
|
||
VLOG(3) << "run all blocks spent " << detail::GetTimestamp() - ts << "(ms)"; | ||
|
||
// Reset the received sparse variables; the sum operator would not | ||
// sum the input sparse variables whose rows are empty at the next | ||
// mini-batch. | ||
// TODO(Yancey1989): move the reset action into an operator, we shouldn't | ||
// have any hidden logic in the operator. | ||
for (auto &var : sparse_vars) { | ||
var->GetMutable<framework::SelectedRows>()->mutable_rows()->clear(); | ||
} | ||
rpc_service_->SetCond(1); | ||
// FIXME(typhoonzero): use another condition to sync wait clients get. | ||
rpc_service_->WaitClientGet(fan_in); | ||
sparse_vars.clear(); | ||
} // while(true) | ||
} | ||
|
||
class ListenAndServOpMaker : public framework::OpProtoAndCheckerMaker { | ||
public: | ||
|
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
These should be deleted.