Skip to content

Commit

Permalink
Add ref counting for rsmi init and shutdown
Browse files Browse the repository at this point in the history
Also, clean lint from kfd_ioctl.h file.

Change-Id: I5a2ae127ab6ab6676a1b075ed10858d0ebfe13c1
  • Loading branch information
Chris Freehill authored and Chris Freehill committed May 11, 2020
1 parent e1f0d7e commit 8e03d10
Show file tree
Hide file tree
Showing 9 changed files with 837 additions and 378 deletions.
711 changes: 354 additions & 357 deletions include/rocm_smi/kfd_ioctl.h

Large diffs are not rendered by default.

2 changes: 2 additions & 0 deletions include/rocm_smi/rocm_smi.h
Original file line number Diff line number Diff line change
Expand Up @@ -123,6 +123,8 @@ typedef enum {
RSMI_STATUS_BUSY, //!< A resource or mutex could not be
//!< acquired because it is already
//!< being used
RSMI_STATUS_REFCOUNT_OVERFLOW, //!< An internal reference counter
//!< exceeded INT32_MAX

RSMI_STATUS_UNKNOWN_ERROR = 0xFFFFFFFF, //!< An unknown error occurred
} rsmi_status_t;
Expand Down
20 changes: 17 additions & 3 deletions include/rocm_smi/rocm_smi_main.h
Original file line number Diff line number Diff line change
Expand Up @@ -94,8 +94,18 @@ class RocmSMI {
int kfd_notif_evt_fh(void) const {return kfd_notif_evt_fh_;}
void set_kfd_notif_evt_fh(int fd) {kfd_notif_evt_fh_ = fd;}
std::mutex *kfd_notif_evt_fh_mutex(void) {return &kfd_notif_evt_fh_mutex_;}
int kfd_notif_evt_fh_refcnt_inc() {return ++kfd_notif_evt_fh_refcnt_;}
int kfd_notif_evt_fh_refcnt_dec() {return --kfd_notif_evt_fh_refcnt_;}
std::mutex *bootstrap_mutex(void) {return &bootstrap_mutex_;}

// Accessors for the library-wide init/shutdown reference count.
// None of these take a lock themselves; ref_count_ is documented as being
// protected by bootstrap_mutex_, so callers are expected to hold that mutex
// around any call below.
// The counter is unsigned: ref_count_dec() on a count of 0 wraps around to
// UINT32_MAX rather than going negative, so callers must check for 0 first.

// Returns the current reference count without modifying it.
uint32_t ref_count(void) const {return ref_count_;}
// Increments the reference count and returns the new value.
uint32_t ref_count_inc(void) {return ++ref_count_;}
// Decrements the reference count and returns the new value.
uint32_t ref_count_dec(void) {return --ref_count_;}

// Accessors for the reference count on the shared KFD event-notification
// file handle. As above, no locking is done here; the member is documented
// as being protected by kfd_notif_evt_fh_mutex_.

// Returns the current handle reference count without modifying it.
uint32_t kfd_notif_evt_fh_refcnt(void) const {
return kfd_notif_evt_fh_refcnt_;}
// Increments the handle reference count and returns the new value.
uint32_t kfd_notif_evt_fh_refcnt_inc(void) {
return ++kfd_notif_evt_fh_refcnt_;}
// Decrements the handle reference count and returns the new value
// (wraps to UINT32_MAX if the count was already 0).
uint32_t kfd_notif_evt_fh_refcnt_dec(void) {
return --kfd_notif_evt_fh_refcnt_;}

private:
std::vector<std::shared_ptr<Device>> devices_;
Expand All @@ -114,8 +124,12 @@ class RocmSMI {
uint32_t euid_;

int kfd_notif_evt_fh_;
int kfd_notif_evt_fh_refcnt_;
std::mutex kfd_notif_evt_fh_mutex_;
uint32_t kfd_notif_evt_fh_refcnt_; // Access to this should be protected
// by kfd_notif_evt_fh_mutex_
std::mutex bootstrap_mutex_;
uint32_t ref_count_; // Access to this should be protected
// by bootstrap_mutex_
};

} // namespace smi
Expand Down
79 changes: 79 additions & 0 deletions include/rocm_smi/rocm_smi_utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,85 @@ struct ScopedPthread {
pthread_wrap& pthrd_ref_;
bool mutex_not_acquired_; // Use for AcquireNB (not for Aquire())
};


// Token-pasting helpers.
// PASTE2 concatenates its two arguments exactly as written. PASTE adds one
// level of indirection so that macro arguments (for example __COUNTER__) are
// expanded to their values *before* being concatenated.
#define PASTE2(x, y) x##y
#define PASTE(x, y) PASTE2(x, y)

#define __forceinline __inline__ __attribute__((always_inline))

/// @brief Runs a callable when the guard goes out of scope, unless the guard
/// has been dismissed first.
///
/// Typical use is "undo on failure": create the guard right after taking an
/// action that must be rolled back on error, and call Dismiss() once the
/// operation as a whole has succeeded.
///
/// Copying a guard *transfers* responsibility: the new guard takes over the
/// pending action and the source is dismissed, so the action runs at most
/// once no matter how many copies were made.
///
/// @tparam lambda Type of the callable; invoked with no arguments.
template <typename lambda>
class ScopeGuard {
 public:
  /// @brief Arms the guard with @p release as the action to run on
  /// destruction.
  explicit __forceinline ScopeGuard(const lambda& release)
      : release_(release), dismiss_(false) {}

  /// @brief Ownership-transferring copy.
  ///
  /// The callable is copy-constructed in the member initializer list
  /// (closure types are neither default-constructible nor assignable, so it
  /// cannot be default-built and assigned afterwards). @p rhs is dismissed
  /// so only one of the two guards will ever fire.
  ScopeGuard(const ScopeGuard& rhs)
      : release_(rhs.release_), dismiss_(rhs.dismiss_) {
    rhs.dismiss_ = true;
  }

  /// @brief Invokes the stored callable unless the guard was dismissed.
  __forceinline ~ScopeGuard() {
    if (!dismiss_) release_();
  }

  /// @brief Ownership-transferring assignment.
  ///
  /// Only usable when the callable type itself is copy-assignable (e.g. a
  /// function pointer); it will not compile for a capturing closure type.
  /// Any action previously pending on *this is dropped without being run.
  /// Self-assignment is a no-op.
  ///
  /// @return *this
  __forceinline ScopeGuard& operator=(const ScopeGuard& rhs) {
    if (this != &rhs) {
      dismiss_ = rhs.dismiss_;
      release_ = rhs.release_;
      rhs.dismiss_ = true;
    }
    return *this;
  }

  /// @brief Disarms the guard; the callable will not run on destruction.
  __forceinline void Dismiss() { dismiss_ = true; }

 private:
  lambda release_;
  // mutable: the transferring copy operations must be able to dismiss a
  // source guard they only see through a const reference.
  mutable bool dismiss_;
};

/// @brief Convenience factory that deduces the callable type.
/// @param rel Callable to run when the returned guard is destroyed.
/// @return An armed ScopeGuard wrapping a copy of @p rel.
template <typename lambda>
static __forceinline ScopeGuard<lambda> MakeScopeGuard(lambda rel) {
  return ScopeGuard<lambda>(rel);
}

// Declares two variables in the current scope: `lname`, holding the callable
// given as the trailing argument(s), and `sname`, an amd::smi::ScopeGuard
// that will invoke that callable at scope exit unless dismissed.
// The expansion is two statements, so it must not be used as the unbraced
// body of an if/else/loop.
#define MAKE_SCOPE_GUARD_HELPER(lname, sname, ...) \
auto lname = __VA_ARGS__; \
amd::smi::ScopeGuard<decltype(lname)> sname(lname);
// Creates an anonymous scope guard. Both variable names are made unique with
// __COUNTER__ (a compiler extension), so the guard cannot be referred to
// afterwards and therefore cannot be dismissed.
#define MAKE_SCOPE_GUARD(...) \
MAKE_SCOPE_GUARD_HELPER(PASTE(scopeGuardLambda, __COUNTER__), \
PASTE(scopeGuard, __COUNTER__), __VA_ARGS__)
// Creates a scope guard whose variable is called `name`, so that the caller
// can later write `name.Dismiss()` to cancel the pending action.
#define MAKE_NAMED_SCOPE_GUARD(name, ...) \
MAKE_SCOPE_GUARD_HELPER(PASTE(scopeGuardLambda, __COUNTER__), name, \
__VA_ARGS__)


// Declares the copy/move constructors and copy/move assignment operators of
// TypeName as deleted. Place it inside the class definition.
#define DISALLOW_COPY_AND_ASSIGN(TypeName)  \
  TypeName(const TypeName&) = delete;       \
  TypeName(TypeName&&) = delete;            \
  void operator=(const TypeName&) = delete; \
  void operator=(TypeName&&) = delete;

/// @brief Scope-bound lock holder: acquires on construction and releases on
/// destruction unless the lock was already handed back via Release().
/// @tparam LockType Any type exposing Acquire() and Release().
template <class LockType>
class ScopedAcquire {
 public:
  /// @brief Takes the lock immediately.
  /// @param lock Pointer to an existing lock; it is dereferenced without a
  /// null check.
  explicit ScopedAcquire(LockType* lock) : lock_(lock) {
    lock_->Acquire();
  }

  /// @brief Gives the lock back if this object still holds it.
  ~ScopedAcquire() {
    if (!release_on_exit_) {
      return;
    }
    lock_->Release();
  }

  /// @brief Gives the lock back before the end of the scope; the destructor
  /// will then do nothing. Prefer letting the scope end where possible.
  void Release() {
    lock_->Release();
    release_on_exit_ = false;
  }

 private:
  /// This holder is neither copyable nor movable.
  DISALLOW_COPY_AND_ASSIGN(ScopedAcquire);

  LockType* lock_;
  bool release_on_exit_ = true;  // cleared once Release() has been called
};

} // namespace smi
} // namespace amd

Expand Down
71 changes: 61 additions & 10 deletions src/rocm_smi.cc
Original file line number Diff line number Diff line change
Expand Up @@ -165,20 +165,20 @@ static rsmi_status_t handleException() {
return RSMI_STATUS_NOT_SUPPORTED; \
} \
return RSMI_STATUS_INVALID_ARGS; \
} \
}

#define CHK_SUPPORT(RT_PTR, VR, SUB_VR) \
GET_DEV_FROM_INDX \
CHK_API_SUPPORT_ONLY((RT_PTR), (VR), (SUB_VR))

#define CHK_SUPPORT_NAME_ONLY(RT_PTR) \
CHK_SUPPORT((RT_PTR), RSMI_DEFAULT_VARIANT, RSMI_DEFAULT_VARIANT) \
CHK_SUPPORT((RT_PTR), RSMI_DEFAULT_VARIANT, RSMI_DEFAULT_VARIANT)

#define CHK_SUPPORT_VAR(RT_PTR, VR) \
CHK_SUPPORT((RT_PTR), (VR), RSMI_DEFAULT_VARIANT) \
CHK_SUPPORT((RT_PTR), (VR), RSMI_DEFAULT_VARIANT)

#define CHK_SUPPORT_SUBVAR_ONLY(RT_PTR, SUB_VR) \
CHK_SUPPORT((RT_PTR), RSMI_DEFAULT_VARIANT, (SUB_VR)) \
CHK_SUPPORT((RT_PTR), RSMI_DEFAULT_VARIANT, (SUB_VR))

static pthread_mutex_t *get_mutex(uint32_t dv_ind) {
amd::smi::RocmSMI& smi = amd::smi::RocmSMI::getInstance();
Expand Down Expand Up @@ -540,9 +540,29 @@ static bool is_power_of_2(uint64_t n) {
rsmi_status_t
rsmi_init(uint64_t flags) {
TRY

amd::smi::RocmSMI& smi = amd::smi::RocmSMI::getInstance();
smi.Initialize(flags);
std::lock_guard<std::mutex> guard(*smi.bootstrap_mutex());

if (smi.ref_count() == INT32_MAX) {
return RSMI_STATUS_REFCOUNT_OVERFLOW;
}

(void)smi.ref_count_inc();

// If smi.Initialize() throws, we should clean up and dec. ref_count_.
// Otherwise, if no issues, the Dismiss() will prevent the ref_count_
// decrement.
MAKE_NAMED_SCOPE_GUARD(refGuard, [&]() { (void)smi.ref_count_dec(); });

if (smi.ref_count() == 1) {
try {
smi.Initialize(flags);
} catch(...) {
smi.Cleanup();
throw;
}
}
refGuard.Dismiss();

return RSMI_STATUS_SUCCESS;
CATCH
Expand All @@ -555,9 +575,17 @@ rsmi_shut_down(void) {
TRY

amd::smi::RocmSMI& smi = amd::smi::RocmSMI::getInstance();
std::lock_guard<std::mutex> guard(*smi.bootstrap_mutex());

if (smi.ref_count() == 0) {
return RSMI_STATUS_INIT_ERROR;
}

smi.Cleanup();
(void)smi.ref_count_dec();

if (smi.ref_count() == 0) {
smi.Cleanup();
}
return RSMI_STATUS_SUCCESS;
CATCH
}
Expand Down Expand Up @@ -2371,6 +2399,15 @@ rsmi_status_string(rsmi_status_t status, const char **status_string) {
"type that was expected";
break;

case RSMI_STATUS_BUSY:
*status_string = "A resource or mutex could not be acquired "
"because it is already being used";
break;

case RSMI_STATUS_REFCOUNT_OVERFLOW:
*status_string = "An internal reference counter exceeded INT32_MAX";
break;

case RSMI_STATUS_UNKNOWN_ERROR:
*status_string = "An unknown error prevented the call from completing"
" successfully";
Expand Down Expand Up @@ -3186,6 +3223,7 @@ rsmi_event_notification_init(uint32_t dv_ind) {

std::lock_guard<std::mutex> guard(*smi.kfd_notif_evt_fh_mutex());
if (smi.kfd_notif_evt_fh() == -1) {
assert(smi.kfd_notif_evt_fh_refcnt() == 0);
int kfd_fd = open(kPathKFDIoctl, O_RDWR | O_CLOEXEC);

if (kfd_fd <= 0) {
Expand All @@ -3199,8 +3237,7 @@ rsmi_event_notification_init(uint32_t dv_ind) {

smi.set_kfd_notif_evt_fh(kfd_fd);
}
smi.kfd_notif_evt_fh_refcnt_inc();

(void)smi.kfd_notif_evt_fh_refcnt_inc();
struct kfd_ioctl_smi_events_args args;

assert(dev->kfd_gpu_id() <= UINT32_MAX);
Expand Down Expand Up @@ -3354,7 +3391,7 @@ rsmi_status_t rsmi_event_notification_stop(uint32_t dv_ind) {
dev->set_evt_notif_anon_file_ptr(nullptr);
dev->set_evt_notif_anon_fd(-1);

if (!smi.kfd_notif_evt_fh_refcnt_dec()) {
if (smi.kfd_notif_evt_fh_refcnt_dec() == 0) {
int ret = close(smi.kfd_notif_evt_fh());
smi.set_kfd_notif_evt_fh(-1);
if (ret < 0) {
Expand Down Expand Up @@ -3385,3 +3422,17 @@ rsmi_test_sleep(uint32_t dv_ind, uint32_t seconds) {
sleep(seconds);
return RSMI_STATUS_SUCCESS;
}

// Test hook: reports the library's init/shutdown reference count.
//
// refcnt_type is accepted but currently ignored.
//
// Returns -1 when the count is zero yet monitor devices are still present;
// otherwise returns the current count converted to int32_t.
int32_t
rsmi_test_refcount(uint64_t refcnt_type) {
  (void)refcnt_type;

  amd::smi::RocmSMI& smi = amd::smi::RocmSMI::getInstance();

  // Hold the bootstrap mutex for the whole check so the count and the device
  // list are observed together.
  std::lock_guard<std::mutex> guard(*smi.bootstrap_mutex());

  const uint32_t current = smi.ref_count();
  const bool devices_remain = smi.monitor_devices().size() != 0;

  if (current == 0 && devices_remain) {
    return -1;
  }
  return static_cast<int32_t>(current);
}
13 changes: 10 additions & 3 deletions src/rocm_smi_main.cc
Original file line number Diff line number Diff line change
Expand Up @@ -244,6 +244,12 @@ RocmSMI::Initialize(uint64_t flags) {
auto i = 0;
uint32_t ret;

assert(ref_count_ == 1);
if (ref_count_ != 1) {
throw amd::smi::rsmi_exception(RSMI_INITIALIZATION_ERROR,
"Unexpected: RocmSMI ref_count_ != 1");
}

init_options_ = flags;

euid_ = geteuid();
Expand Down Expand Up @@ -299,16 +305,17 @@ RocmSMI::Initialize(uint64_t flags) {

void
RocmSMI::Cleanup() {
s_monitor_devices.clear();
devices_.clear();
monitors_.clear();

if (kfd_notif_evt_fh() >= 0) {
int ret = close(kfd_notif_evt_fh());
if (ret < 0) {
throw amd::smi::rsmi_exception(RSMI_STATUS_FILE_ERROR,
"Failed to close kfd file handle on shutdown.");
}
}
s_monitor_devices.clear();
devices_.clear();
monitors_.clear();
}

RocmSMI::RocmSMI(uint64_t flags) : init_options_(flags),
Expand Down
Loading

0 comments on commit 8e03d10

Please sign in to comment.