Remove GpuDeviceNum #8166

Merged (12 commits, May 10, 2022)
3 changes: 2 additions & 1 deletion oneflow/api/python/env/env.h
@@ -27,6 +27,7 @@ limitations under the License.
#include "oneflow/core/job/graph_scope_vars.h"
#include "oneflow/core/control/global_process_ctx.h"
#include "oneflow/core/rpc/include/base.h"
#include "oneflow/core/ep/include/device_manager_registry.h"

namespace oneflow {

@@ -53,7 +54,7 @@ inline Maybe<size_t> GetWorldSize() { return GlobalProcessCtx::WorldSize(); }
inline Maybe<size_t> GetNodeSize() { return GlobalProcessCtx::NodeSize(); }
inline Maybe<size_t> GetLocalRank() { return GlobalProcessCtx::LocalRank(); }
inline Maybe<size_t> CudaGetDeviceCount() {
return Global<ResourceDesc, ForSession>::Get()->GpuDeviceNum();
return Global<ep::DeviceManagerRegistry>::Get()->GetDeviceCount(DeviceType::kCUDA);
}
inline Maybe<void> SetFLAGS_alsologtostderr(bool flag) {
FLAGS_alsologtostderr = flag;
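Note: a minimal sketch of the new query path (the wrapper function is illustrative, not part of the PR). Because the EP registry reports a count of 0 when no CUDA device manager is registered, the call site no longer needs a WITH_CUDA compile-time guard:

#include "oneflow/core/ep/include/device_manager_registry.h"

namespace oneflow {

// Illustrative helper: device counts now come from the EP device manager
// registry instead of the session-scoped ResourceDesc, so the result reflects
// the devices actually visible to the process (0 in a CPU-only build).
inline size_t QueryCudaDeviceCount() {
  return Global<ep::DeviceManagerRegistry>::Get()->GetDeviceCount(DeviceType::kCUDA);
}

}  // namespace oneflow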
25 changes: 8 additions & 17 deletions oneflow/api/python/symbol/placement_symbol.cpp
@@ -28,25 +28,16 @@ limitations under the License.
#include "oneflow/core/job/parallel_desc.h"
#include "oneflow/core/job/global_for.h"
#include "oneflow/core/job/resource_desc.h"

#ifdef WITH_CUDA
#include <cuda_runtime_api.h>
#endif // WITH_CUDA
#include "oneflow/core/ep/include/device_manager_registry.h"

namespace py = pybind11;

namespace oneflow {

namespace {

int64_t GetGpuDeviceNum() {
#ifndef WITH_CUDA
return 0;
#else
int device_count = 0;
cudaGetDeviceCount(&device_count);
return device_count;
#endif
int64_t GetDeviceCount(const std::string& device_name) {
return Global<ep::DeviceManagerRegistry>::Get()->GetDeviceCount(device_name);
}

struct PlacementSymbolExportUtil {
@@ -158,11 +149,11 @@ struct PlacementSymbolExportUtil {
if (it == device_tag2placement.end()) {
int64_t node_size = GlobalProcessCtx::NodeSize();
int64_t device_num = GlobalProcessCtx::NumOfProcessPerNode();
if (type == "cuda") {
const int64_t gpu_device_num = GetGpuDeviceNum();
CHECK_NE_OR_RETURN(gpu_device_num, 0)
<< "Can\'t construct placement with \"cuda\" type because there is no CUDA device!";
device_num = std::min(device_num, gpu_device_num);
if (type != "cpu") {
const int64_t device_count = GetDeviceCount(type);
CHECK_NE_OR_RETURN(device_count, 0) << "Can\'t construct placement with \"" << type
<< "\" type because there is no device!";
device_num = std::min(device_num, device_count);
}
std::vector<std::string> machine_device_ids;
for (int64_t node_id = 0; node_id < node_size; ++node_id) {
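The hunk above generalizes the CUDA-only branch: any non-CPU device type is now clamped against the registry's count. A condensed view of the resulting logic (variable names taken from the diff; a restatement, not new behavior):

// For any non-CPU placement type, validate against the EP registry's count.
if (type != "cpu") {
  const int64_t device_count = GetDeviceCount(type);  // 0 if the type is unavailable
  CHECK_NE(device_count, 0);                          // fail fast: no such device
  device_num = std::min(device_num, device_count);    // clamp to what the node has
}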
4 changes: 2 additions & 2 deletions oneflow/core/autograd/gradient_funcs/cublas_fused_mlp.cpp
@@ -135,10 +135,10 @@ Maybe<void> CublasFusedMLP::Apply(const CublasFusedMLPCaptureState* ctx,
const auto& last_layer_wgrad_bgrad =
JUST(functional::CublasMatmulBiasAddGrad(last_bias_dy, last_layer_x));
if (last_layer_weight_requires_grad) {
*JUST(VectorAt(in_grads, weight_num)) = *JUST(VectorAt(last_layer_wgrad_bgrad, 0));
*JUST(VectorAt(in_grads, weight_num)) = JUST(VectorAt(*last_layer_wgrad_bgrad, 0));
}
if (last_layer_bias_requires_grad) {
*JUST(VectorAt(in_grads, 2 * weight_num)) = *JUST(VectorAt(last_layer_wgrad_bgrad, 1));
*JUST(VectorAt(in_grads, 2 * weight_num)) = JUST(VectorAt(*last_layer_wgrad_bgrad, 1));
}
}

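The two changed lines move the dereference: CublasMatmulBiasAddGrad yields its result through JUST as a pointer to a tensor tuple, and it is that tuple which must be dereferenced and then indexed with VectorAt, rather than dereferencing VectorAt's Maybe result. A sketch of the corrected pattern (types and signatures assumed, shown out of context):

// Dereference the tuple first, then index it; the left-hand side still
// dereferences VectorAt's result because in_grads itself is the container.
const auto& wgrad_bgrad = JUST(functional::CublasMatmulBiasAddGrad(last_bias_dy, last_layer_x));
*JUST(VectorAt(in_grads, weight_num)) = JUST(VectorAt(*wgrad_bgrad, 0));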
47 changes: 44 additions & 3 deletions oneflow/core/ep/common/device_manager_registry.cpp
@@ -29,21 +29,41 @@ class DeviceManagerRegistry::Impl {
}
~Impl() = default;

DeviceManager* GetDeviceManager(DeviceType device_type) {
DeviceManager* GetDeviceManagerOrNull(DeviceType device_type) {
std::lock_guard<std::mutex> lock(mutex_);
if (!managers_.at(device_type)) {
std::lock_guard<std::mutex> factories_lock(factories_mutex_);
auto& factory = factories_.at(device_type);
CHECK(factory);
managers_.at(device_type) = factory->NewDeviceManager(registry_);
if (factory) {
managers_.at(device_type) = factory->NewDeviceManager(registry_);
} else {
return nullptr;
}
}
return managers_.at(device_type).get();
}

DeviceManager* GetDeviceManager(DeviceType device_type) {
return CHECK_NOTNULL(GetDeviceManagerOrNull(device_type));
}

std::shared_ptr<Device> GetDevice(DeviceType device_type, size_t device_index) {
return GetDeviceManager(device_type)->GetDevice(device_index);
}

size_t GetDeviceCount(DeviceType device_type) {
DeviceManager* manager = GetDeviceManagerOrNull(device_type);
if (manager == nullptr) {
return 0;
} else {
return manager->GetDeviceCount();
}
}

size_t GetDeviceCount(const std::string& device_type_name) {
return GetDeviceCount(GetDeviceTypeByDeviceTypeName(device_type_name));
}

static void DumpVersionInfo() {
std::lock_guard<std::mutex> factories_lock(factories_mutex_);
for (auto& factory : factories_) {
@@ -93,6 +113,11 @@ class DeviceManagerRegistry::Impl {
return types;
}

static bool IsDeviceTypeRegistered(DeviceType device_type) {
std::lock_guard<std::mutex> lock(factories_mutex_);
return factories_.at(device_type).operator bool();
}

private:
std::mutex mutex_;
std::vector<std::unique_ptr<DeviceManager>> managers_;
@@ -114,11 +139,23 @@ DeviceManager* DeviceManagerRegistry::GetDeviceManager(DeviceType device_type) {
return impl_->GetDeviceManager(device_type);
}

DeviceManager* DeviceManagerRegistry::GetDeviceManagerOrNull(DeviceType device_type) {
return impl_->GetDeviceManagerOrNull(device_type);
}

std::shared_ptr<Device> DeviceManagerRegistry::GetDevice(DeviceType device_type,
size_t device_index) {
return impl_->GetDevice(device_type, device_index);
}

size_t DeviceManagerRegistry::GetDeviceCount(DeviceType device_type) {
return impl_->GetDeviceCount(device_type);
}

size_t DeviceManagerRegistry::GetDeviceCount(const std::string& device_type_name) {
return impl_->GetDeviceCount(device_type_name);
}

/*static*/ void DeviceManagerRegistry::RegisterDeviceManagerFactory(
std::unique_ptr<DeviceManagerFactory>&& factory) {
Impl::RegisterDeviceManagerFactory(std::move(factory));
@@ -140,6 +177,10 @@ std::shared_ptr<Device> DeviceManagerRegistry::GetDevice(DeviceType device_type,
return Impl::GetRegisteredDeviceTypes();
}

/*static*/ bool DeviceManagerRegistry::IsDeviceTypeRegistered(DeviceType device_type) {
return Impl::IsDeviceTypeRegistered(device_type);
}

} // namespace ep

} // namespace oneflow
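Usage sketch for the new query paths (the call-site code is illustrative; the methods are the ones added above). GetDeviceCount routes through GetDeviceManagerOrNull, so asking about an unregistered device type returns 0 instead of tripping a CHECK, and the string overload forwards through the existing name-to-enum mapping:

auto* registry = Global<ep::DeviceManagerRegistry>::Get();

// 0 in a build where no CUDA device manager factory was registered.
size_t cuda_count = registry->GetDeviceCount(DeviceType::kCUDA);

// These two calls are equivalent:
size_t by_name = registry->GetDeviceCount("cuda");
size_t by_type = registry->GetDeviceCount(
    ep::DeviceManagerRegistry::GetDeviceTypeByDeviceTypeName("cuda"));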
4 changes: 4 additions & 0 deletions oneflow/core/ep/include/device_manager_registry.h
@@ -32,13 +32,17 @@ class DeviceManagerRegistry {
~DeviceManagerRegistry();

DeviceManager* GetDeviceManager(DeviceType device_type);
DeviceManager* GetDeviceManagerOrNull(DeviceType device_type);
std::shared_ptr<Device> GetDevice(DeviceType device_type, size_t device_index);
size_t GetDeviceCount(DeviceType device_type);
size_t GetDeviceCount(const std::string& device_type_name);

static void RegisterDeviceManagerFactory(std::unique_ptr<DeviceManagerFactory>&& factory);
static void DumpVersionInfo();
static std::string GetDeviceTypeNameByDeviceType(DeviceType device_type);
static DeviceType GetDeviceTypeByDeviceTypeName(const std::string& device_type_name);
static std::set<DeviceType> GetRegisteredDeviceTypes();
static bool IsDeviceTypeRegistered(DeviceType device_type);

private:
class Impl;
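A short sketch of how the new IsDeviceTypeRegistered predicate can gate manager access (the branch shown is illustrative). It is static and consults only the factory table, so it is safe to call before any DeviceManager has been created:

if (ep::DeviceManagerRegistry::IsDeviceTypeRegistered(DeviceType::kCUDA)) {
  // GetDeviceManager CHECK-fails only when the factory is absent, which the
  // guard above has already ruled out.
  ep::DeviceManager* manager =
      Global<ep::DeviceManagerRegistry>::Get()->GetDeviceManager(DeviceType::kCUDA);
}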
5 changes: 0 additions & 5 deletions oneflow/core/job/id_manager.cpp
@@ -18,11 +18,6 @@ limitations under the License.
namespace oneflow {

IDMgr::IDMgr() {
CHECK_LT((Global<ResourceDesc, ForSession>::Get()->process_ranks().size()),
static_cast<int64_t>(1) << machine_id_bit_num_);
gpu_device_num_ = Global<ResourceDesc, ForSession>::Get()->GpuDeviceNum();
cpu_device_num_ = Global<ResourceDesc, ForSession>::Get()->CpuDeviceNum();
CHECK_LT(gpu_device_num_ + cpu_device_num_, (static_cast<int64_t>(1) << thread_id_bit_num_) - 3);
regst_desc_id_count_ = 0;
mem_block_id_count_ = 0;
chunk_id_count_ = 0;
10 changes: 0 additions & 10 deletions oneflow/core/job/id_manager.h
@@ -39,20 +39,10 @@ class IDMgr final {
friend class Global<IDMgr>;
IDMgr();

int64_t gpu_device_num_;
int64_t cpu_device_num_;
int64_t regst_desc_id_count_;
int64_t mem_block_id_count_;
int64_t chunk_id_count_;
TaskIdGenerator task_id_gen_;

// 64 bit id design:
// sign | machine | thread | local_work_stream | task
// 1 | 10 | 11 | 21 | 21
static const int64_t machine_id_bit_num_ = 10;
static const int64_t thread_id_bit_num_ = 11;
static const int64_t local_work_stream_id_bit_num_ = 21;
static const int64_t task_id_bit_num_ = 21;
};

} // namespace oneflow
4 changes: 0 additions & 4 deletions oneflow/core/job/job_desc.cpp
@@ -56,10 +56,6 @@ Maybe<JobDesc> JobDesc::New(int64_t symbol_id, const JobConfigProto& job_conf) {

Maybe<void> JobDesc::Init() {
cfg_job_conf_.reset(new cfg::JobConfigProto(job_conf_));

#ifndef WITH_CUDA
CHECK_EQ_OR_RETURN((Global<ResourceDesc, ForSession>::Get()->GpuDeviceNum()), 0);
#endif
CheckFunctionConfig(job_conf_);
return Maybe<void>::Ok();
}
48 changes: 12 additions & 36 deletions oneflow/core/job/parallel_desc.cpp
@@ -26,22 +26,14 @@ limitations under the License.
#include "oneflow/core/framework/instructions_builder.h"
#include "oneflow/core/framework/device.h"
#include "oneflow/core/vm/vm_util.h"
#ifdef WITH_CUDA
#include <cuda_runtime_api.h>
#endif // WITH_CUDA
#include "oneflow/core/ep/include/device_manager_registry.h"

namespace oneflow {

namespace {

int64_t GetGpuDeviceNum() {
#ifndef WITH_CUDA
return 0;
#else
int device_count = 0;
cudaGetDeviceCount(&device_count);
return device_count;
#endif
int64_t GetDeviceCount(DeviceType device_type) {
return Global<ep::DeviceManagerRegistry>::Get()->GetDeviceCount(device_type);
}

using MachineId2DeviceIdList =
@@ -88,7 +80,6 @@ Maybe<OFRecord> ParseMachineAndDeviceIdList(const ParallelConf& parallel_conf) {

ParallelDesc::ParallelDesc(const ParallelConf& user_conf) : symbol_id_(NullOpt) { // NOLINT
CHECK_JUST(MaybeInit(user_conf));
CHECK_JUST(CheckWithResourceDesc(*(Global<ResourceDesc, ForSession>::Get())));
}

Maybe<ParallelDesc> ParallelDesc::New(int64_t symbol_id, const ParallelConf& parallel_conf) {
@@ -305,17 +296,6 @@ Maybe<void> ParallelDesc::SanityCheck() {
return Maybe<void>::Ok();
}

Maybe<void> ParallelDesc::CheckWithResourceDesc(const ResourceDesc& resource_desc) {
if (device_type_ == DeviceType::kCUDA) {
for (auto& pair : *machine_id2sorted_dev_phy_ids_) {
for (int64_t dev_phy_id : *pair.second) {
CHECK_LT_OR_RETURN(dev_phy_id, resource_desc.GpuDeviceNum());
}
}
}
return Maybe<void>::Ok();
}

Maybe<void> ParallelDesc::CheckDeviceIdsIsValid() const {
const auto& sorted_dev_phy_ids_iter =
machine_id2sorted_dev_phy_ids_->find(GlobalProcessCtx::Rank());
@@ -326,23 +306,19 @@ Maybe<void> ParallelDesc::CheckDeviceIdsIsValid() const {
}
if (sorted_dev_phy_ids_iter != machine_id2sorted_dev_phy_ids_->end()) {
for (int64_t dev_phy_id : *sorted_dev_phy_ids_iter->second) {
if (device_type_ == DeviceType::kCUDA) {
const int64_t gpu_device_num = GetGpuDeviceNum();
CHECK_NE_OR_RETURN(gpu_device_num, 0)
<< Error::RuntimeError()
<< "Placement with \"cuda\" type is invalid because there is no CUDA device!";
int64_t device_num = std::min(GlobalProcessCtx::NumOfProcessPerNode(), gpu_device_num);
CHECK_LT_OR_RETURN(dev_phy_id, device_num)
<< Error::RuntimeError() << "Placement is invalid because device id must be less than "
<< (gpu_device_num < GlobalProcessCtx::NumOfProcessPerNode()
? "num of CUDA devices on node"
: "num of process per node");
} else if (device_type_ == DeviceType::kCPU) {
if (device_type_ == DeviceType::kCPU) {
CHECK_LT_OR_RETURN(dev_phy_id, GlobalProcessCtx::NumOfProcessPerNode())
<< Error::RuntimeError()
<< "Placement is invalid because device id must be less than num of process per node";
} else {
OF_UNIMPLEMENTED();
const int64_t device_count = GetDeviceCount(device_type_);
CHECK_NE_OR_RETURN(device_count, 0)
<< Error::RuntimeError() << "Placement is invalid because there is no device!";
int64_t device_num = std::min(GlobalProcessCtx::NumOfProcessPerNode(), device_count);
CHECK_LT_OR_RETURN(dev_phy_id, device_num)
<< Error::RuntimeError() << "Placement is invalid because device id must be less than "
<< (device_count < GlobalProcessCtx::NumOfProcessPerNode() ? "num devices on node"
: "num of process per node");
}
}
}
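Net effect of this hunk: the CPU branch keeps its per-process-per-node bound, and every other device type shares one registry-backed path. A condensed sketch of the non-CPU check (assembled from the context above, not new code):

// Valid device ids are bounded by the smaller of the per-node process count
// and the number of physical devices the EP registry reports for the node.
const int64_t device_count = GetDeviceCount(device_type_);  // 0 if unavailable
CHECK_NE(device_count, 0);
const int64_t device_num =
    std::min(GlobalProcessCtx::NumOfProcessPerNode(), device_count);
CHECK_LT(dev_phy_id, device_num);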
4 changes: 3 additions & 1 deletion oneflow/core/job/plan_util.cpp
@@ -23,6 +23,7 @@ limitations under the License.
#include "oneflow/core/memory/memory_case_util.h"
#include "oneflow/core/register/runtime_register_desc.h"
#include "oneflow/core/persistence/tee_persistent_log_stream.h"
#include "oneflow/core/ep/include/device_manager_registry.h"

namespace oneflow {

@@ -386,7 +387,8 @@ void PlanUtil::CleanUselessMemBlockAndCheckValid(Plan* plan) {

void PlanUtil::ToDotFile(const Plan& plan, const std::string& filepath) {
const auto& process_ranks = Global<ResourceDesc, ForSession>::Get()->process_ranks();
size_t gpu_device_num = Global<ResourceDesc, ForSession>::Get()->GpuDeviceNum();
size_t gpu_device_num =
Global<ep::DeviceManagerRegistry>::Get()->GetDeviceCount(DeviceType::kCUDA);
std::map<int64_t, std::map<int64_t, std::vector<std::vector<std::string>>>>
machine_id2job_id_device_id2node_list;
for (size_t i : process_ranks) {
2 changes: 0 additions & 2 deletions oneflow/core/job/resource_desc.h
@@ -38,8 +38,6 @@ class ResourceDesc final {
__attribute__((deprecated)) Machine machine(int32_t idx) const;
size_t CommNetWorkerNum() const { return resource_.comm_net_worker_num(); }
int32_t CpuDeviceNum() const { return resource_.cpu_device_num(); }
int32_t GpuDeviceNum() const { return resource_.gpu_device_num(); }
int32_t MemZoneNum() const { return GpuDeviceNum() + 1; }
int32_t MaxMdSaveWorkerNum() const { return resource_.max_mdsave_worker_num(); }
size_t reserved_host_mem_byte() const { return resource_.reserved_host_mem_mbyte() * kMB; }
size_t reserved_device_mem_byte() const { return resource_.reserved_device_mem_mbyte() * kMB; }