diff --git a/oneflow/api/python/session/session.cpp b/oneflow/api/python/session/session.cpp index d561074bd15..eca8ba9d34d 100644 --- a/oneflow/api/python/session/session.cpp +++ b/oneflow/api/python/session/session.cpp @@ -30,6 +30,11 @@ ONEFLOW_API_PYBIND11_MODULE("", m) { m.def("StartLazyGlobalSession", &StartLazyGlobalSession); m.def("StopLazyGlobalSession", &StopLazyGlobalSession); + // multi-client lazy global session context: create, init (takes the config as a string), destroy + m.def("CreateMultiClientSessionContext", &CreateMultiClientSessionContext); + m.def("InitMultiClientSessionContext", &InitMultiClientSessionContext); + m.def("DestroyMultiClientSessionContext", &DestroyMultiClientSessionContext); + using namespace oneflow; m.def("NewSessionId", &NewSessionId); py::class_(m, "LogicalConfigProtoContext") diff --git a/oneflow/api/python/session/session.h b/oneflow/api/python/session/session.h index 9a7d676bd82..b5be7f52f2f 100644 --- a/oneflow/api/python/session/session.h +++ b/oneflow/api/python/session/session.h @@ -22,11 +22,13 @@ limitations under the License. 
#include "oneflow/core/control/ctrl_client.h" #include "oneflow/core/control/global_process_ctx.h" #include "oneflow/core/job/global_for.h" +#include "oneflow/core/job/env_global_objects_scope.h" #include "oneflow/core/job/session_global_objects_scope.h" #include "oneflow/core/job/cluster_instruction.h" #include "oneflow/core/job/oneflow.h" #include "oneflow/core/job/job_build_and_infer_ctx_mgr.h" #include "oneflow/core/framework/config_def.h" +#include "oneflow/core/framework/multi_client_session_context.h" #include "oneflow/core/persistence/tee_persistent_log_stream.h" namespace oneflow { @@ -106,6 +108,29 @@ inline Maybe StopLazyGlobalSession() { return Maybe::Ok(); } +/* Creates the session-context global: returns an error through CHECK_ISNULL_OR_RETURN when Global::Get() is already non-null, otherwise calls Global::New() and returns Ok. NOTE(review): the Global template argument is not visible in this view; presumably MultiClientSessionContext - confirm. */ inline Maybe CreateMultiClientSessionContext() { + CHECK_ISNULL_OR_RETURN(Global::Get()); + Global::New(); + return Maybe::Ok(); +} + +/* Parses config_proto_str into a ConfigProto with TxtString2PbMessage and passes it to TryInit on the global context. Returns an error if any of the three Global::Get() checks sees null, if parsing fails, or if TryInit fails. NOTE(review): the parse-failure message streams the entire config_proto_str - confirm that is acceptable for large configs. NOTE(review): which global each of the three checks inspects is not visible in this view. */ inline Maybe InitMultiClientSessionContext(const std::string& config_proto_str) { + CHECK_NOTNULL_OR_RETURN(Global::Get()); + CHECK_NOTNULL_OR_RETURN(Global::Get()); + CHECK_NOTNULL_OR_RETURN(Global::Get()) << "env not found"; + + ConfigProto config_proto; + CHECK_OR_RETURN(TxtString2PbMessage(config_proto_str, &config_proto)) + << "failed to parse config_proto: " << config_proto_str; + JUST(Global::Get()->TryInit(config_proto)); + return Maybe::Ok(); +} + +/* Calls Global::Delete() without first checking that the global exists, then returns Ok. NOTE(review): presumably safe when nothing was created - confirm against Global::Delete. */ inline Maybe DestroyMultiClientSessionContext() { + Global::Delete(); + return Maybe::Ok(); +} + } // namespace oneflow #endif // ONEFLOW_API_PYTHON_SESSION_SESSION_H_ diff --git a/oneflow/api/python/session/session_api.h b/oneflow/api/python/session/session_api.h index 4092d6ab528..1dc0b015697 100644 --- a/oneflow/api/python/session/session_api.h +++ b/oneflow/api/python/session/session_api.h @@ -34,4 +34,16 @@ inline void StartLazyGlobalSession() { return oneflow::StartLazyGlobalSession(). 
inline void StopLazyGlobalSession() { return oneflow::StopLazyGlobalSession().GetOrThrow(); } +/* Global-namespace wrapper: calls oneflow::CreateMultiClientSessionContext() and GetOrThrow() on its result. NOTE(review): presumably GetOrThrow turns a failed Maybe into an exception - confirm. */ inline void CreateMultiClientSessionContext() { + return oneflow::CreateMultiClientSessionContext().GetOrThrow(); +} + +/* Global-namespace wrapper: forwards config_proto_str unchanged to oneflow::InitMultiClientSessionContext and calls GetOrThrow() on its result. */ inline void InitMultiClientSessionContext(const std::string& config_proto_str) { + return oneflow::InitMultiClientSessionContext(config_proto_str).GetOrThrow(); +} + +/* Global-namespace wrapper: calls oneflow::DestroyMultiClientSessionContext() and GetOrThrow() on its result. */ inline void DestroyMultiClientSessionContext() { + return oneflow::DestroyMultiClientSessionContext().GetOrThrow(); +} + #endif // ONEFLOW_API_PYTHON_SESSION_SESSION_API_H_ diff --git a/oneflow/core/framework/multi_client_session_context.cpp b/oneflow/core/framework/multi_client_session_context.cpp new file mode 100644 index 00000000000..8ebca1ebfdc --- /dev/null +++ b/oneflow/core/framework/multi_client_session_context.cpp @@ -0,0 +1,117 @@ +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +#include "oneflow/core/framework/multi_client_session_context.h" +#include "oneflow/core/framework/load_library.h" +#include "oneflow/core/job/version.h" +#include "oneflow/core/job/global_for.h" +#include "oneflow/core/job/id_manager.h" +#include "oneflow/core/job/job_instance.h" +#include "oneflow/core/job/job_build_and_infer_ctx_mgr.h" +#include "oneflow/core/common/buffer_manager.h" +#include "oneflow/core/rpc/include/global_process_ctx.h" +#include "oneflow/api/python/env/env.h" /* NOTE(review): a core framework source including an api-python header; presumably needed for IsMultiClient() used in TryInit - confirm, and consider moving that helper into core. */ +#ifdef WITH_CUDA +#include /* NOTE(review): the header name is missing in this view; presumably the CUDA runtime header that declares cudaGetDeviceCount - confirm. */ +#endif // WITH_CUDA + +namespace oneflow { + +namespace { + +/* Returns 0 when WITH_CUDA is not defined; otherwise returns the value cudaGetDeviceCount writes into device_count, which is pre-set to 0. NOTE(review): the status returned by cudaGetDeviceCount is discarded, so a failed query is not reported to the caller - confirm this is intended. */ int32_t GetGpuDeviceNum() { +#ifndef WITH_CUDA + return 0; +#else + int device_count = 0; + cudaGetDeviceCount(&device_count); + return device_count; +#endif +} + +} // namespace + +/* Acts only when is_inited_ is true: deletes the globals in the order written below, then re-creates one Global from Global::Get()->resource() and GlobalProcessCtx::NumOfProcessPerNode(). When TryInit never completed, the destructor does nothing. */ MultiClientSessionContext::~MultiClientSessionContext() { + if (is_inited_) { + { + // NOTE(chengcheng): delete runtime global objects + Global>>::Delete(); + } + + Global::Delete(); + Global::Delete(); + + // TODO(chengcheng): remove template ForEnv and ForSession + Global::Delete(); + // NOTE(chengcheng): re-create after delete because EnvGlobalObjectScope originally created a ResourceDesc. + Global::New(Global::Get()->resource(), + GlobalProcessCtx::NumOfProcessPerNode()); + } +} + +/* Performs the session initialization once: when is_inited_ is already true it skips all work and returns Ok. Otherwise it requires IsMultiClient(), calls DumpVersionInfo(), copies config_proto.resource() and overwrites machine_num, gpu_device_num and cpu_device_num from GlobalProcessCtx and GetGpuDeviceNum(), deletes an existing global if one is present, creates the globals, loads every config_proto.load_lib_path() entry, and finally sets is_inited_. NOTE(review): if JUST(LoadLibrary(...)) propagates an error (presumed JUST semantics), the globals created before the loop stay alive while is_inited_ remains false, so the destructor skips their cleanup and a retry would call New() again - confirm whether rollback or reordering is needed. */ Maybe MultiClientSessionContext::TryInit(const ConfigProto& config_proto) { + if (!is_inited_) { + CHECK_OR_RETURN(JUST(IsMultiClient())); + DumpVersionInfo(); + + Resource resource = config_proto.resource(); + + { + // NOTE(chengcheng): + // In multi-client, user can NOT config gpu_device_num and cpu_device_num. + // + // cpu_device_num is a confusing name, it should be explained as: + // gpu_device corresponding host memory and compute stream. + // When gpu_device_num == 0 (cpu only), cpu device num should be process num. + // + // gpu_device_num is the number of visible GPUs on the current machine. + // NOTE: gpu_device_num is NOT necessarily equal to the number of processes on this machine. 
+ resource.set_machine_num(GlobalProcessCtx::NodeSize()); + const int32_t gpu_device_num = GetGpuDeviceNum(); + resource.set_gpu_device_num(gpu_device_num); + if (gpu_device_num == 0) { + resource.set_cpu_device_num(GlobalProcessCtx::NumOfProcessPerNode()); + } else { + resource.set_cpu_device_num(gpu_device_num); + } + } + + // NOTE(chengcheng): delete first because EnvGlobalObjectScope has already created a ResourceDesc. + if (Global::Get() != nullptr) { + // TODO(chengcheng): reorganize dependency of all Global objects. + Global::Delete(); + } + Global::New(resource, GlobalProcessCtx::NumOfProcessPerNode()); + Global::New(); + // TODO(chengcheng): refactor JobBuildAndInferCtxMgr + Global::New(); + + for (const std::string& lib_path : config_proto.load_lib_path()) { + // TODO(chengcheng): remove load_lib_path in config proto. using LoadLibraryNow + JUST(LoadLibrary(lib_path)); + } + + { + // NOTE(chengcheng): init runtime global objects + Global>>::New(); + } + + is_inited_ = true; + } + return Maybe::Ok(); +} + +} // namespace oneflow diff --git a/oneflow/core/framework/multi_client_session_context.h b/oneflow/core/framework/multi_client_session_context.h new file mode 100644 index 00000000000..564cb709807 --- /dev/null +++ b/oneflow/core/framework/multi_client_session_context.h @@ -0,0 +1,39 @@ +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ +#ifndef ONEFLOW_CORE_FRAMEWORK_MULTI_CLIENT_SESSION_CONTEXT_H_ +#define ONEFLOW_CORE_FRAMEWORK_MULTI_CLIENT_SESSION_CONTEXT_H_ + +#include "oneflow/core/common/util.h" +#include "oneflow/core/job/job_set.pb.h" +#include "oneflow/core/common/maybe.h" + +namespace oneflow { + +/* Session context for multi-client mode (per the class name). Its only data member is the flag is_inited_, which the constructor sets to false; the destructor and TryInit are declared here and defined out of line. Copy and move are presumably disabled by OF_DISALLOW_COPY_AND_MOVE, going by the macro name - confirm. */ class MultiClientSessionContext { + public: + OF_DISALLOW_COPY_AND_MOVE(MultiClientSessionContext); + MultiClientSessionContext() : is_inited_(false) {} + ~MultiClientSessionContext(); + + /* Takes the session ConfigProto by const reference and returns a Maybe; the Maybe template argument is not visible in this view. */ Maybe TryInit(const ConfigProto& config_proto); + + private: + /* False after construction, per the constructor initializer above. */ bool is_inited_; +}; + +} // namespace oneflow + +#endif // ONEFLOW_CORE_FRAMEWORK_MULTI_CLIENT_SESSION_CONTEXT_H_ diff --git a/oneflow/core/rpc/include/global_process_ctx.h b/oneflow/core/rpc/include/global_process_ctx.h index c9f3d6e7be8..7a6388310a7 100644 --- a/oneflow/core/rpc/include/global_process_ctx.h +++ b/oneflow/core/rpc/include/global_process_ctx.h @@ -28,7 +28,6 @@ struct GlobalProcessCtx { static int64_t ThisNodeId(); static int64_t NumOfProcessPerNode(); static bool IsThisProcessMaster(); - static bool IsMultiClient(); static size_t WorldSize(); static std::string LogDirEntry(); };