From 66a038c76597d469ab3fa6238ae4c555a20bf3d0 Mon Sep 17 00:00:00 2001 From: chengtbf <472491134@qq.com> Date: Wed, 7 Jul 2021 23:17:52 +0800 Subject: [PATCH 1/4] MultiClientSessionContext --- .../multi_client_session_context.cpp | 107 ++++++++++++++++++ .../framework/multi_client_session_context.h | 42 +++++++ 2 files changed, 149 insertions(+) create mode 100644 oneflow/core/framework/multi_client_session_context.cpp create mode 100644 oneflow/core/framework/multi_client_session_context.h diff --git a/oneflow/core/framework/multi_client_session_context.cpp b/oneflow/core/framework/multi_client_session_context.cpp new file mode 100644 index 00000000000..0930f961f96 --- /dev/null +++ b/oneflow/core/framework/multi_client_session_context.cpp @@ -0,0 +1,107 @@ +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +#include "oneflow/core/framework/multi_client_session_context.h" +#include "oneflow/core/framework/load_library.h" +#include "oneflow/core/job/version.h" +#include "oneflow/core/job/global_for.h" +#include "oneflow/core/job/id_manager.h" +#include "oneflow/core/job/job_instance.h" +#include "oneflow/core/job/job_build_and_infer_ctx_mgr.h" +#include "oneflow/core/common/buffer_manager.h" +#include "oneflow/core/rpc/include/global_process_ctx.h" +#ifdef WITH_CUDA +#include +#endif // WITH_CUDA + +namespace oneflow { + +namespace { + +int32_t GetGpuDeviceNum() { +#ifndef WITH_CUDA + return 0; +#else + int device_count = 0; + cudaGetDeviceCount(&device_count); + return device_count; +#endif +} + +} // namespace + +MultiClientSessionContext::~MultiClientSessionContext() { + if (is_inited_) { + { + // NOTE(chengcheng): delete runtime global objects + Global>>::Delete(); + } + + Global::Delete(); + Global::Delete(); + + // TODO(chengcheng): remove ForEnv + Global::Delete(); + Global::New(Global::Get()->resource(), + GlobalProcessCtx::NumOfProcessPerNode()); + } +} + +Maybe MultiClientSessionContext::LazyInitOnlyOnce(const ConfigProto& config_proto) { + if (!is_inited_) { + CHECK_OR_RETURN(GlobalProcessCtx::IsMultiClient()); + DumpVersionInfo(); + + Resource resource = config_proto.resource(); + + { + // TODO(chengcheng): remove this hack + // env config for multi-client + resource.set_machine_num(GlobalProcessCtx::NodeSize()); + const int32_t gpu_device_num = GetGpuDeviceNum(); + resource.set_gpu_device_num(gpu_device_num); + if (gpu_device_num == 0) { + resource.set_cpu_device_num(GlobalProcessCtx::NumOfProcessPerNode()); + } else { + resource.set_cpu_device_num(gpu_device_num); + } + } + + Global::Delete(); + Global::New(resource, GlobalProcessCtx::NumOfProcessPerNode()); + Global::New(); + // TODO(chengcheng): refactor JobBuildAndInferCtxMgr + Global::New(); + + for (const std::string& lib_path : config_proto.load_lib_path()) { + JUST(LoadLibrary(lib_path)); + } 
+ + { + // NOTE(chengcheng): init runtime global objects + Global>>::New(); + } + + is_inited_ = true; + } + return Maybe::Ok(); +} + +Maybe MultiClientSessionContext::GetJobNameId(const std::string& job_class_name) { + return job_class_name2id_[job_class_name]++; +} + +} // namespace oneflow diff --git a/oneflow/core/framework/multi_client_session_context.h b/oneflow/core/framework/multi_client_session_context.h new file mode 100644 index 00000000000..e8100e8ecda --- /dev/null +++ b/oneflow/core/framework/multi_client_session_context.h @@ -0,0 +1,42 @@ +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ +#ifndef ONEFLOW_CORE_FRAMEWORK_MULTI_CLIENT_SESSION_CONTEXT_H_ +#define ONEFLOW_CORE_FRAMEWORK_MULTI_CLIENT_SESSION_CONTEXT_H_ + +#include "oneflow/core/common/util.h" +#include "oneflow/core/job/job_set.pb.h" +#include "oneflow/core/common/maybe.h" + +namespace oneflow { + +class MultiClientSessionContext { + public: + OF_DISALLOW_COPY_AND_MOVE(MultiClientSessionContext); + MultiClientSessionContext() : is_inited_(false) {} + ~MultiClientSessionContext(); + + Maybe LazyInitOnlyOnce(const ConfigProto& config_proto); + + Maybe GetJobNameId(const std::string& job_class_name); + + private: + bool is_inited_; + HashMap job_class_name2id_; +}; + +} // namespace oneflow + +#endif // ONEFLOW_CORE_FRAMEWORK_MULTI_CLIENT_SESSION_CONTEXT_H_ From d9febb7c0b50b830a106533b68d16680933537b5 Mon Sep 17 00:00:00 2001 From: chengtbf <472491134@qq.com> Date: Thu, 8 Jul 2021 16:07:43 +0800 Subject: [PATCH 2/4] export multi-client session context --- oneflow/api/python/session/session.cpp | 5 ++++ oneflow/api/python/session/session.h | 25 +++++++++++++++++++ oneflow/api/python/session/session_api.h | 12 +++++++++ .../multi_client_session_context.cpp | 6 +---- .../framework/multi_client_session_context.h | 5 +--- 5 files changed, 44 insertions(+), 9 deletions(-) diff --git a/oneflow/api/python/session/session.cpp b/oneflow/api/python/session/session.cpp index d561074bd15..eca8ba9d34d 100644 --- a/oneflow/api/python/session/session.cpp +++ b/oneflow/api/python/session/session.cpp @@ -30,6 +30,11 @@ ONEFLOW_API_PYBIND11_MODULE("", m) { m.def("StartLazyGlobalSession", &StartLazyGlobalSession); m.def("StopLazyGlobalSession", &StopLazyGlobalSession); + // multi-client lazy global session context + m.def("CreateMultiClientSessionContext", &CreateMultiClientSessionContext); + m.def("InitMultiClientSessionContext", &InitMultiClientSessionContext); + m.def("DestroyMultiClientSessionContext", &DestroyMultiClientSessionContext); + using namespace oneflow; m.def("NewSessionId", &NewSessionId); 
py::class_(m, "LogicalConfigProtoContext") diff --git a/oneflow/api/python/session/session.h b/oneflow/api/python/session/session.h index 9a7d676bd82..b5be7f52f2f 100644 --- a/oneflow/api/python/session/session.h +++ b/oneflow/api/python/session/session.h @@ -22,11 +22,13 @@ limitations under the License. #include "oneflow/core/control/ctrl_client.h" #include "oneflow/core/control/global_process_ctx.h" #include "oneflow/core/job/global_for.h" +#include "oneflow/core/job/env_global_objects_scope.h" #include "oneflow/core/job/session_global_objects_scope.h" #include "oneflow/core/job/cluster_instruction.h" #include "oneflow/core/job/oneflow.h" #include "oneflow/core/job/job_build_and_infer_ctx_mgr.h" #include "oneflow/core/framework/config_def.h" +#include "oneflow/core/framework/multi_client_session_context.h" #include "oneflow/core/persistence/tee_persistent_log_stream.h" namespace oneflow { @@ -106,6 +108,29 @@ inline Maybe StopLazyGlobalSession() { return Maybe::Ok(); } +inline Maybe CreateMultiClientSessionContext() { + CHECK_ISNULL_OR_RETURN(Global::Get()); + Global::New(); + return Maybe::Ok(); +} + +inline Maybe InitMultiClientSessionContext(const std::string& config_proto_str) { + CHECK_NOTNULL_OR_RETURN(Global::Get()); + CHECK_NOTNULL_OR_RETURN(Global::Get()); + CHECK_NOTNULL_OR_RETURN(Global::Get()) << "env not found"; + + ConfigProto config_proto; + CHECK_OR_RETURN(TxtString2PbMessage(config_proto_str, &config_proto)) + << "failed to parse config_proto: " << config_proto_str; + JUST(Global::Get()->TryInit(config_proto)); + return Maybe::Ok(); +} + +inline Maybe DestroyMultiClientSessionContext() { + Global::Delete(); + return Maybe::Ok(); +} + } // namespace oneflow #endif // ONEFLOW_API_PYTHON_SESSION_SESSION_H_ diff --git a/oneflow/api/python/session/session_api.h b/oneflow/api/python/session/session_api.h index 4092d6ab528..1dc0b015697 100644 --- a/oneflow/api/python/session/session_api.h +++ b/oneflow/api/python/session/session_api.h @@ -34,4 +34,16 
@@ inline void StartLazyGlobalSession() { return oneflow::StartLazyGlobalSession(). inline void StopLazyGlobalSession() { return oneflow::StopLazyGlobalSession().GetOrThrow(); } +inline void CreateMultiClientSessionContext() { + return oneflow::CreateMultiClientSessionContext().GetOrThrow(); +} + +inline void InitMultiClientSessionContext(const std::string& config_proto_str) { + return oneflow::InitMultiClientSessionContext(config_proto_str).GetOrThrow(); +} + +inline void DestroyMultiClientSessionContext() { + return oneflow::DestroyMultiClientSessionContext().GetOrThrow(); +} + #endif // ONEFLOW_API_PYTHON_SESSION_SESSION_API_H_ diff --git a/oneflow/core/framework/multi_client_session_context.cpp b/oneflow/core/framework/multi_client_session_context.cpp index 0930f961f96..8478e8f6f25 100644 --- a/oneflow/core/framework/multi_client_session_context.cpp +++ b/oneflow/core/framework/multi_client_session_context.cpp @@ -60,7 +60,7 @@ MultiClientSessionContext::~MultiClientSessionContext() { } } -Maybe MultiClientSessionContext::LazyInitOnlyOnce(const ConfigProto& config_proto) { +Maybe MultiClientSessionContext::TryInit(const ConfigProto& config_proto) { if (!is_inited_) { CHECK_OR_RETURN(GlobalProcessCtx::IsMultiClient()); DumpVersionInfo(); @@ -100,8 +100,4 @@ Maybe MultiClientSessionContext::LazyInitOnlyOnce(const ConfigProto& confi return Maybe::Ok(); } -Maybe MultiClientSessionContext::GetJobNameId(const std::string& job_class_name) { - return job_class_name2id_[job_class_name]++; -} - } // namespace oneflow diff --git a/oneflow/core/framework/multi_client_session_context.h b/oneflow/core/framework/multi_client_session_context.h index e8100e8ecda..564cb709807 100644 --- a/oneflow/core/framework/multi_client_session_context.h +++ b/oneflow/core/framework/multi_client_session_context.h @@ -28,13 +28,10 @@ class MultiClientSessionContext { MultiClientSessionContext() : is_inited_(false) {} ~MultiClientSessionContext(); - Maybe LazyInitOnlyOnce(const ConfigProto& 
config_proto); - - Maybe GetJobNameId(const std::string& job_class_name); + Maybe TryInit(const ConfigProto& config_proto); private: bool is_inited_; - HashMap job_class_name2id_; }; } // namespace oneflow From 92fcf5e0cd5f938fa2a0f3b0c4e7c0582c53eaa3 Mon Sep 17 00:00:00 2001 From: chengtbf <472491134@qq.com> Date: Fri, 9 Jul 2021 17:29:26 +0800 Subject: [PATCH 3/4] add NOTE --- .../multi_client_session_context.cpp | 21 +++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) diff --git a/oneflow/core/framework/multi_client_session_context.cpp b/oneflow/core/framework/multi_client_session_context.cpp index 8478e8f6f25..67313b49c92 100644 --- a/oneflow/core/framework/multi_client_session_context.cpp +++ b/oneflow/core/framework/multi_client_session_context.cpp @@ -53,8 +53,9 @@ MultiClientSessionContext::~MultiClientSessionContext() { Global::Delete(); Global::Delete(); - // TODO(chengcheng): remove ForEnv + // TODO(chengcheng): remove template ForEnv and ForSession Global::Delete(); + // NOTE(chengcheng): New after delete because EnvGlobalObjectScope has already created ResourceDesc. Global::New(Global::Get()->resource(), GlobalProcessCtx::NumOfProcessPerNode()); } @@ -68,8 +69,15 @@ Maybe MultiClientSessionContext::TryInit(const ConfigProto& config_proto) Resource resource = config_proto.resource(); { - // TODO(chengcheng): remove this hack - // env config for multi-client + // NOTE(chengcheng): + // In multi-client, user can NOT config gpu_device_num and cpu_device_num. + // + // cpu_device_num is a confusing name, it should be explained as: + // the host memory and compute stream corresponding to a gpu_device. + // When gpu_device_num == 0 (cpu only), cpu device num should be process num. + // + // gpu_device_num is the number of visible GPUs on the current machine. + // NOTE: gpu_device_num is NOT necessarily equal to the number of processes on this machine. 
resource.set_machine_num(GlobalProcessCtx::NodeSize()); const int32_t gpu_device_num = GetGpuDeviceNum(); resource.set_gpu_device_num(gpu_device_num); @@ -80,13 +88,18 @@ Maybe MultiClientSessionContext::TryInit(const ConfigProto& config_proto) } } - Global::Delete(); + // NOTE(chengcheng): delete first because EnvGlobalObjectScope has already created ResourceDesc. + if (Global::Get() != nullptr) { + // TODO(chengcheng): reorganize dependency of all Global objects. + Global::Delete(); + } Global::New(resource, GlobalProcessCtx::NumOfProcessPerNode()); Global::New(); // TODO(chengcheng): refactor JobBuildAndInferCtxMgr Global::New(); for (const std::string& lib_path : config_proto.load_lib_path()) { + // TODO(chengcheng): remove load_lib_path in config proto. using LoadLibraryNow JUST(LoadLibrary(lib_path)); } From 192afb842783b8c90ba5b6506a714db372ae0933 Mon Sep 17 00:00:00 2001 From: chengtbf <472491134@qq.com> Date: Fri, 9 Jul 2021 18:31:44 +0800 Subject: [PATCH 4/4] fix for new IsMultiClient --- oneflow/core/framework/multi_client_session_context.cpp | 3 ++- oneflow/core/rpc/include/global_process_ctx.h | 1 - 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/oneflow/core/framework/multi_client_session_context.cpp b/oneflow/core/framework/multi_client_session_context.cpp index 67313b49c92..8ebca1ebfdc 100644 --- a/oneflow/core/framework/multi_client_session_context.cpp +++ b/oneflow/core/framework/multi_client_session_context.cpp @@ -23,6 +23,7 @@ limitations under the License. 
#include "oneflow/core/job/job_build_and_infer_ctx_mgr.h" #include "oneflow/core/common/buffer_manager.h" #include "oneflow/core/rpc/include/global_process_ctx.h" +#include "oneflow/api/python/env/env.h" #ifdef WITH_CUDA #include #endif // WITH_CUDA @@ -63,7 +64,7 @@ MultiClientSessionContext::~MultiClientSessionContext() { Maybe MultiClientSessionContext::TryInit(const ConfigProto& config_proto) { if (!is_inited_) { - CHECK_OR_RETURN(GlobalProcessCtx::IsMultiClient()); + CHECK_OR_RETURN(JUST(IsMultiClient())); DumpVersionInfo(); Resource resource = config_proto.resource(); diff --git a/oneflow/core/rpc/include/global_process_ctx.h b/oneflow/core/rpc/include/global_process_ctx.h index c9f3d6e7be8..7a6388310a7 100644 --- a/oneflow/core/rpc/include/global_process_ctx.h +++ b/oneflow/core/rpc/include/global_process_ctx.h @@ -28,7 +28,6 @@ struct GlobalProcessCtx { static int64_t ThisNodeId(); static int64_t NumOfProcessPerNode(); static bool IsThisProcessMaster(); - static bool IsMultiClient(); static size_t WorldSize(); static std::string LogDirEntry(); };