From 8e03d100354a90a9121cb16c6ad6a40bdd81533d Mon Sep 17 00:00:00 2001 From: Chris Freehill Date: Mon, 11 May 2020 15:24:47 -0500 Subject: [PATCH] Add ref counting for rsmi init and shutdown Also, clean lint from kfd_ioctl.h file. Change-Id: I5a2ae127ab6ab6676a1b075ed10858d0ebfe13c1 --- include/rocm_smi/kfd_ioctl.h | 711 +++++++++--------- include/rocm_smi/rocm_smi.h | 2 + include/rocm_smi/rocm_smi_main.h | 20 +- include/rocm_smi/rocm_smi_utils.h | 79 ++ src/rocm_smi.cc | 71 +- src/rocm_smi_main.cc | 13 +- .../functional/init_shutdown_refcount.cc | 226 ++++++ .../functional/init_shutdown_refcount.h | 74 ++ tests/rocm_smi_test/main.cc | 19 +- 9 files changed, 837 insertions(+), 378 deletions(-) create mode 100755 tests/rocm_smi_test/functional/init_shutdown_refcount.cc create mode 100755 tests/rocm_smi_test/functional/init_shutdown_refcount.h diff --git a/include/rocm_smi/kfd_ioctl.h b/include/rocm_smi/kfd_ioctl.h index 7eb58269..9356cd16 100755 --- a/include/rocm_smi/kfd_ioctl.h +++ b/include/rocm_smi/kfd_ioctl.h @@ -28,82 +28,82 @@ #define KFD_IOCTL_MAJOR_VERSION 1 #define KFD_IOCTL_MINOR_VERSION 2 -#define KFD_IOCTL_DBG_MAJOR_VERSION 1 -#define KFD_IOCTL_DBG_MINOR_VERSION 0 +#define KFD_IOCTL_DBG_MAJOR_VERSION 1 +#define KFD_IOCTL_DBG_MINOR_VERSION 0 struct kfd_ioctl_get_version_args { - __u32 major_version; /* from KFD */ - __u32 minor_version; /* from KFD */ + __u32 major_version; /* from KFD */ + __u32 minor_version; /* from KFD */ }; /* For kfd_ioctl_create_queue_args.queue_type. */ -#define KFD_IOC_QUEUE_TYPE_COMPUTE 0x0 -#define KFD_IOC_QUEUE_TYPE_SDMA 0x1 -#define KFD_IOC_QUEUE_TYPE_COMPUTE_AQL 0x2 -#define KFD_IOC_QUEUE_TYPE_SDMA_XGMI 0x3 +#define KFD_IOC_QUEUE_TYPE_COMPUTE 0x0 +#define KFD_IOC_QUEUE_TYPE_SDMA 0x1 +#define KFD_IOC_QUEUE_TYPE_COMPUTE_AQL 0x2 +#define KFD_IOC_QUEUE_TYPE_SDMA_XGMI 0x3 -#define KFD_MAX_QUEUE_PERCENTAGE 100 -#define KFD_MAX_QUEUE_PRIORITY 15 +#define KFD_MAX_QUEUE_PERCENTAGE 100 +#define KFD_MAX_QUEUE_PRIORITY 15 struct kfd_ioctl_create_queue_args { - __u64 ring_base_address; /* to KFD */ - __u64 write_pointer_address; /* from KFD */ - __u64 read_pointer_address; /* from KFD */ - __u64 doorbell_offset; /* from KFD */ + __u64 ring_base_address; /* to KFD */ + __u64 write_pointer_address; /* from KFD */ + __u64 read_pointer_address; /* from KFD */ + __u64 doorbell_offset; /* from KFD */ - __u32 ring_size; /* to KFD */ - __u32 gpu_id; /* to KFD */ - __u32 queue_type; /* to KFD */ - __u32 queue_percentage; /* to KFD */ - __u32 queue_priority; /* to KFD */ - __u32 queue_id; /* from KFD */ + __u32 ring_size; /* to KFD */ + __u32 gpu_id; /* to KFD */ + __u32 queue_type; /* to KFD */ + __u32 queue_percentage; /* to KFD */ + __u32 queue_priority; /* to KFD */ + __u32 queue_id; /* from KFD */ - __u64 eop_buffer_address; /* to KFD */ - __u64 eop_buffer_size; /* to KFD */ - __u64 ctx_save_restore_address; /* to KFD */ - __u32 ctx_save_restore_size; /* to KFD */ - __u32 ctl_stack_size; /* to KFD */ + __u64 eop_buffer_address; /* to KFD */ + __u64 eop_buffer_size; /* to KFD */ + __u64 ctx_save_restore_address; /* to KFD */ + __u32 ctx_save_restore_size; /* to KFD */ + __u32 ctl_stack_size; /* to KFD */ }; struct kfd_ioctl_destroy_queue_args { - __u32 queue_id; /* to KFD */ - __u32 pad; + __u32 queue_id; /* to KFD */ + __u32 pad; }; struct kfd_ioctl_update_queue_args { - __u64 ring_base_address; /* to KFD */ + __u64 ring_base_address; /* to KFD */ - __u32 queue_id; /* to KFD */ - __u32 ring_size; /* to KFD */ - __u32 queue_percentage; /* to KFD */ - __u32 
queue_priority; /* to KFD */ + __u32 queue_id; /* to KFD */ + __u32 ring_size; /* to KFD */ + __u32 queue_percentage; /* to KFD */ + __u32 queue_priority; /* to KFD */ }; struct kfd_ioctl_set_cu_mask_args { - __u32 queue_id; /* to KFD */ - __u32 num_cu_mask; /* to KFD */ - __u64 cu_mask_ptr; /* to KFD */ + __u32 queue_id; /* to KFD */ + __u32 num_cu_mask; /* to KFD */ + __u64 cu_mask_ptr; /* to KFD */ }; struct kfd_ioctl_get_queue_wave_state_args { - __u64 ctl_stack_address; /* to KFD */ - __u32 ctl_stack_used_size; /* from KFD */ - __u32 save_area_used_size; /* from KFD */ - __u32 queue_id; /* to KFD */ - __u32 pad; + __u64 ctl_stack_address; /* to KFD */ + __u32 ctl_stack_used_size; /* from KFD */ + __u32 save_area_used_size; /* from KFD */ + __u32 queue_id; /* to KFD */ + __u32 pad; }; struct kfd_queue_snapshot_entry { - __u64 ring_base_address; - __u64 write_pointer_address; - __u64 read_pointer_address; - __u64 ctx_save_restore_address; - __u32 queue_id; - __u32 gpu_id; - __u32 ring_size; - __u32 queue_type; - __u32 queue_status; - __u32 reserved[19]; + __u64 ring_base_address; + __u64 write_pointer_address; + __u64 read_pointer_address; + __u64 ctx_save_restore_address; + __u32 queue_id; + __u32 gpu_id; + __u32 ring_size; + __u32 queue_type; + __u32 queue_status; + __u32 reserved[19]; }; /* For kfd_ioctl_set_memory_policy_args.default_policy and alternate_policy */ @@ -111,13 +111,13 @@ struct kfd_queue_snapshot_entry { #define KFD_IOC_CACHE_POLICY_NONCOHERENT 1 struct kfd_ioctl_set_memory_policy_args { - __u64 alternate_aperture_base; /* to KFD */ - __u64 alternate_aperture_size; /* to KFD */ + __u64 alternate_aperture_base; /* to KFD */ + __u64 alternate_aperture_size; /* to KFD */ - __u32 gpu_id; /* to KFD */ - __u32 default_policy; /* to KFD */ - __u32 alternate_policy; /* to KFD */ - __u32 pad; + __u32 gpu_id; /* to KFD */ + __u32 default_policy; /* to KFD */ + __u32 alternate_policy; /* to KFD */ + __u32 pad; }; /* @@ -128,24 +128,24 @@ struct kfd_ioctl_set_memory_policy_args { */ struct kfd_ioctl_get_clock_counters_args { - __u64 gpu_clock_counter; /* from KFD */ - __u64 cpu_clock_counter; /* from KFD */ - __u64 system_clock_counter; /* from KFD */ - __u64 system_clock_freq; /* from KFD */ + __u64 gpu_clock_counter; /* from KFD */ + __u64 cpu_clock_counter; /* from KFD */ + __u64 system_clock_counter; /* from KFD */ + __u64 system_clock_freq; /* from KFD */ - __u32 gpu_id; /* to KFD */ - __u32 pad; + __u32 gpu_id; /* to KFD */ + __u32 pad; }; struct kfd_process_device_apertures { - __u64 lds_base; /* from KFD */ - __u64 lds_limit; /* from KFD */ - __u64 scratch_base; /* from KFD */ - __u64 scratch_limit; /* from KFD */ - __u64 gpuvm_base; /* from KFD */ - __u64 gpuvm_limit; /* from KFD */ - __u32 gpu_id; /* from KFD */ - __u32 pad; + __u64 lds_base; /* from KFD */ + __u64 lds_limit; /* from KFD */ + __u64 scratch_base; /* from KFD */ + __u64 scratch_limit; /* from KFD */ + __u64 gpuvm_base; /* from KFD */ + __u64 gpuvm_limit; /* from KFD */ + __u32 gpu_id; /* from KFD */ + __u32 pad; }; /* @@ -155,25 +155,25 @@ struct kfd_process_device_apertures { */ #define NUM_OF_SUPPORTED_GPUS 7 struct kfd_ioctl_get_process_apertures_args { - struct kfd_process_device_apertures - process_apertures[NUM_OF_SUPPORTED_GPUS];/* from KFD */ + struct kfd_process_device_apertures + process_apertures[NUM_OF_SUPPORTED_GPUS];/* from KFD */ - /* from KFD, should be in the range [1 - NUM_OF_SUPPORTED_GPUS] */ - __u32 num_of_nodes; - __u32 pad; + /* from KFD, should be in the range [1 - 
NUM_OF_SUPPORTED_GPUS] */ + __u32 num_of_nodes; + __u32 pad; }; struct kfd_ioctl_get_process_apertures_new_args { - /* User allocated. Pointer to struct kfd_process_device_apertures - * filled in by Kernel - */ - __u64 kfd_process_device_apertures_ptr; - /* to KFD - indicates amount of memory present in - * kfd_process_device_apertures_ptr - * from KFD - Number of entries filled by KFD. - */ - __u32 num_of_nodes; - __u32 pad; + /* User allocated. Pointer to struct kfd_process_device_apertures + * filled in by Kernel + */ + __u64 kfd_process_device_apertures_ptr; + /* to KFD - indicates amount of memory present in + * kfd_process_device_apertures_ptr + * from KFD - Number of entries filled by KFD. + */ + __u32 num_of_nodes; + __u32 pad; }; #define MAX_ALLOWED_NUM_POINTS 100 @@ -181,35 +181,35 @@ struct kfd_ioctl_get_process_apertures_new_args { #define MAX_ALLOWED_WAC_BUFF_SIZE 128 struct kfd_ioctl_dbg_register_args { - __u32 gpu_id; /* to KFD */ - __u32 pad; + __u32 gpu_id; /* to KFD */ + __u32 pad; }; struct kfd_ioctl_dbg_unregister_args { - __u32 gpu_id; /* to KFD */ - __u32 pad; + __u32 gpu_id; /* to KFD */ + __u32 pad; }; struct kfd_ioctl_dbg_address_watch_args { - __u64 content_ptr; /* a pointer to the actual content */ - __u32 gpu_id; /* to KFD */ - __u32 buf_size_in_bytes; /*including gpu_id and buf_size */ + __u64 content_ptr; /* a pointer to the actual content */ + __u32 gpu_id; /* to KFD */ + __u32 buf_size_in_bytes; /*including gpu_id and buf_size */ }; struct kfd_ioctl_dbg_wave_control_args { - __u64 content_ptr; /* a pointer to the actual content */ - __u32 gpu_id; /* to KFD */ - __u32 buf_size_in_bytes; /*including gpu_id and buf_size */ + __u64 content_ptr; /* a pointer to the actual content */ + __u32 gpu_id; /* to KFD */ + __u32 buf_size_in_bytes; /*including gpu_id and buf_size */ }; /* mapping event types to API spec */ -#define KFD_DBG_EV_STATUS_TRAP 1 -#define KFD_DBG_EV_STATUS_VMFAULT 2 -#define KFD_DBG_EV_STATUS_SUSPENDED 4 -#define KFD_DBG_EV_STATUS_NEW_QUEUE 8 -#define KFD_DBG_EV_FLAG_CLEAR_STATUS 1 +#define KFD_DBG_EV_STATUS_TRAP 1 +#define KFD_DBG_EV_STATUS_VMFAULT 2 +#define KFD_DBG_EV_STATUS_SUSPENDED 4 +#define KFD_DBG_EV_STATUS_NEW_QUEUE 8 +#define KFD_DBG_EV_FLAG_CLEAR_STATUS 1 -#define KFD_INVALID_QUEUEID 0xffffffff +#define KFD_INVALID_QUEUEID 0xffffffff /* KFD_IOC_DBG_TRAP_ENABLE: * ptr: unused @@ -273,179 +273,177 @@ struct kfd_ioctl_dbg_wave_control_args { * data2: minor version (OUT) * data3: unused */ -#define KFD_IOC_DBG_TRAP_GET_VERSION 7 +#define KFD_IOC_DBG_TRAP_GET_VERSION 7 struct kfd_ioctl_dbg_trap_args { - __u64 ptr; /* to KFD -- used for pointer arguments: queue arrays */ - __u32 pid; /* to KFD */ - __u32 gpu_id; /* to KFD */ - __u32 op; /* to KFD */ - __u32 data1; /* to KFD */ - __u32 data2; /* to KFD */ - __u32 data3; /* to KFD */ + __u64 ptr; /* to KFD -- used for pointer arguments: queue arrays */ + __u32 pid; /* to KFD */ + __u32 gpu_id; /* to KFD */ + __u32 op; /* to KFD */ + __u32 data1; /* to KFD */ + __u32 data2; /* to KFD */ + __u32 data3; /* to KFD */ }; /* Matching HSA_EVENTTYPE */ -#define KFD_IOC_EVENT_SIGNAL 0 -#define KFD_IOC_EVENT_NODECHANGE 1 -#define KFD_IOC_EVENT_DEVICESTATECHANGE 2 -#define KFD_IOC_EVENT_HW_EXCEPTION 3 -#define KFD_IOC_EVENT_SYSTEM_EVENT 4 -#define KFD_IOC_EVENT_DEBUG_EVENT 5 -#define KFD_IOC_EVENT_PROFILE_EVENT 6 -#define KFD_IOC_EVENT_QUEUE_EVENT 7 -#define KFD_IOC_EVENT_MEMORY 8 - -#define KFD_IOC_WAIT_RESULT_COMPLETE 0 -#define KFD_IOC_WAIT_RESULT_TIMEOUT 1 -#define KFD_IOC_WAIT_RESULT_FAIL 2 - 
-#define KFD_SIGNAL_EVENT_LIMIT 4096 +#define KFD_IOC_EVENT_SIGNAL 0 +#define KFD_IOC_EVENT_NODECHANGE 1 +#define KFD_IOC_EVENT_DEVICESTATECHANGE 2 +#define KFD_IOC_EVENT_HW_EXCEPTION 3 +#define KFD_IOC_EVENT_SYSTEM_EVENT 4 +#define KFD_IOC_EVENT_DEBUG_EVENT 5 +#define KFD_IOC_EVENT_PROFILE_EVENT 6 +#define KFD_IOC_EVENT_QUEUE_EVENT 7 +#define KFD_IOC_EVENT_MEMORY 8 + +#define KFD_IOC_WAIT_RESULT_COMPLETE 0 +#define KFD_IOC_WAIT_RESULT_TIMEOUT 1 +#define KFD_IOC_WAIT_RESULT_FAIL 2 + +#define KFD_SIGNAL_EVENT_LIMIT 4096 /* For kfd_event_data.hw_exception_data.reset_type. */ -#define KFD_HW_EXCEPTION_WHOLE_GPU_RESET 0 -#define KFD_HW_EXCEPTION_PER_ENGINE_RESET 1 +#define KFD_HW_EXCEPTION_WHOLE_GPU_RESET 0 +#define KFD_HW_EXCEPTION_PER_ENGINE_RESET 1 /* For kfd_event_data.hw_exception_data.reset_cause. */ -#define KFD_HW_EXCEPTION_GPU_HANG 0 -#define KFD_HW_EXCEPTION_ECC 1 +#define KFD_HW_EXCEPTION_GPU_HANG 0 +#define KFD_HW_EXCEPTION_ECC 1 /* For kfd_hsa_memory_exception_data.ErrorType */ -#define KFD_MEM_ERR_NO_RAS 0 -#define KFD_MEM_ERR_SRAM_ECC 1 -#define KFD_MEM_ERR_POISON_CONSUMED 2 -#define KFD_MEM_ERR_GPU_HANG 3 +#define KFD_MEM_ERR_NO_RAS 0 +#define KFD_MEM_ERR_SRAM_ECC 1 +#define KFD_MEM_ERR_POISON_CONSUMED 2 +#define KFD_MEM_ERR_GPU_HANG 3 struct kfd_ioctl_create_event_args { - __u64 event_page_offset; /* from KFD */ - __u32 event_trigger_data; /* from KFD - signal events only */ - __u32 event_type; /* to KFD */ - __u32 auto_reset; /* to KFD */ - __u32 node_id; /* to KFD - only valid for certain - event types */ - __u32 event_id; /* from KFD */ - __u32 event_slot_index; /* from KFD */ + __u64 event_page_offset; /* from KFD */ + __u32 event_trigger_data; /* from KFD - signal events only */ + __u32 event_type; /* to KFD */ + __u32 auto_reset; /* to KFD */ + __u32 node_id; /* to KFD - only valid for certain event types */ + __u32 event_id; /* from KFD */ + __u32 event_slot_index; /* from KFD */ }; struct kfd_ioctl_destroy_event_args { - __u32 event_id; /* to KFD */ - __u32 pad; + __u32 event_id; /* to KFD */ + __u32 pad; }; struct kfd_ioctl_set_event_args { - __u32 event_id; /* to KFD */ - __u32 pad; + __u32 event_id; /* to KFD */ + __u32 pad; }; struct kfd_ioctl_reset_event_args { - __u32 event_id; /* to KFD */ - __u32 pad; + __u32 event_id; /* to KFD */ + __u32 pad; }; struct kfd_memory_exception_failure { - __u32 NotPresent; /* Page not present or supervisor privilege */ - __u32 ReadOnly; /* Write access to a read-only page */ - __u32 NoExecute; /* Execute access to a page marked NX */ - __u32 imprecise; /* Can't determine the exact fault address */ + __u32 NotPresent; /* Page not present or supervisor privilege */ + __u32 ReadOnly; /* Write access to a read-only page */ + __u32 NoExecute; /* Execute access to a page marked NX */ + __u32 imprecise; /* Can't determine the exact fault address */ }; /* memory exception data */ struct kfd_hsa_memory_exception_data { - struct kfd_memory_exception_failure failure; - __u64 va; - __u32 gpu_id; - __u32 ErrorType; /* 0 = no RAS error, - * 1 = ECC_SRAM, - * 2 = Link_SYNFLOOD (poison), - * 3 = GPU hang (not attributable to a specific cause), - * other values reserved - */ + struct kfd_memory_exception_failure failure; + __u64 va; + __u32 gpu_id; + __u32 ErrorType; // 0 = no RAS error, + // 1 = ECC_SRAM, + // 2 = Link_SYNFLOOD (poison), + // 3 = GPU hang (not attributable to a specific cause), + // other values reserved }; /* hw exception data */ struct kfd_hsa_hw_exception_data { - __u32 reset_type; - __u32 reset_cause; - __u32 memory_lost; - 
__u32 gpu_id; + __u32 reset_type; + __u32 reset_cause; + __u32 memory_lost; + __u32 gpu_id; }; /* Event data */ struct kfd_event_data { - union { - struct kfd_hsa_memory_exception_data memory_exception_data; - struct kfd_hsa_hw_exception_data hw_exception_data; - }; /* From KFD */ - __u64 kfd_event_data_ext; /* pointer to an extension structure - for future exception types */ - __u32 event_id; /* to KFD */ - __u32 pad; + union { + struct kfd_hsa_memory_exception_data memory_exception_data; + struct kfd_hsa_hw_exception_data hw_exception_data; + }; /* From KFD */ + __u64 kfd_event_data_ext; // pointer to an extension structure + // for future exception types + __u32 event_id; /* to KFD */ + __u32 pad; }; struct kfd_ioctl_wait_events_args { - __u64 events_ptr; /* pointed to struct - kfd_event_data array, to KFD */ - __u32 num_events; /* to KFD */ - __u32 wait_for_all; /* to KFD */ - __u32 timeout; /* to KFD */ - __u32 wait_result; /* from KFD */ + __u64 events_ptr; // pointed to struct + // kfd_event_data array, to KFD + __u32 num_events; /* to KFD */ + __u32 wait_for_all; /* to KFD */ + __u32 timeout; /* to KFD */ + __u32 wait_result; /* from KFD */ }; struct kfd_ioctl_set_scratch_backing_va_args { - __u64 va_addr; /* to KFD */ - __u32 gpu_id; /* to KFD */ - __u32 pad; + __u64 va_addr; /* to KFD */ + __u32 gpu_id; /* to KFD */ + __u32 pad; }; struct kfd_ioctl_get_tile_config_args { - /* to KFD: pointer to tile array */ - __u64 tile_config_ptr; - /* to KFD: pointer to macro tile array */ - __u64 macro_tile_config_ptr; - /* to KFD: array size allocated by user mode - * from KFD: array size filled by kernel - */ - __u32 num_tile_configs; - /* to KFD: array size allocated by user mode - * from KFD: array size filled by kernel - */ - __u32 num_macro_tile_configs; - - __u32 gpu_id; /* to KFD */ - __u32 gb_addr_config; /* from KFD */ - __u32 num_banks; /* from KFD */ - __u32 num_ranks; /* from KFD */ - /* struct size can be extended later if needed - * without breaking ABI compatibility - */ + /* to KFD: pointer to tile array */ + __u64 tile_config_ptr; + /* to KFD: pointer to macro tile array */ + __u64 macro_tile_config_ptr; + /* to KFD: array size allocated by user mode + * from KFD: array size filled by kernel + */ + __u32 num_tile_configs; + /* to KFD: array size allocated by user mode + * from KFD: array size filled by kernel + */ + __u32 num_macro_tile_configs; + + __u32 gpu_id; /* to KFD */ + __u32 gb_addr_config; /* from KFD */ + __u32 num_banks; /* from KFD */ + __u32 num_ranks; /* from KFD */ + /* struct size can be extended later if needed + * without breaking ABI compatibility + */ }; struct kfd_ioctl_set_trap_handler_args { - __u64 tba_addr; /* to KFD */ - __u64 tma_addr; /* to KFD */ - __u32 gpu_id; /* to KFD */ - __u32 pad; + __u64 tba_addr; /* to KFD */ + __u64 tma_addr; /* to KFD */ + __u32 gpu_id; /* to KFD */ + __u32 pad; }; struct kfd_ioctl_acquire_vm_args { - __u32 drm_fd; /* to KFD */ - __u32 gpu_id; /* to KFD */ + __u32 drm_fd; /* to KFD */ + __u32 gpu_id; /* to KFD */ }; /* Allocation flags: memory types */ -#define KFD_IOC_ALLOC_MEM_FLAGS_VRAM (1 << 0) -#define KFD_IOC_ALLOC_MEM_FLAGS_GTT (1 << 1) -#define KFD_IOC_ALLOC_MEM_FLAGS_USERPTR (1 << 2) -#define KFD_IOC_ALLOC_MEM_FLAGS_DOORBELL (1 << 3) -#define KFD_IOC_ALLOC_MEM_FLAGS_MMIO_REMAP (1 << 4) +#define KFD_IOC_ALLOC_MEM_FLAGS_VRAM (1 << 0) +#define KFD_IOC_ALLOC_MEM_FLAGS_GTT (1 << 1) +#define KFD_IOC_ALLOC_MEM_FLAGS_USERPTR (1 << 2) +#define KFD_IOC_ALLOC_MEM_FLAGS_DOORBELL (1 << 3) +#define 
KFD_IOC_ALLOC_MEM_FLAGS_MMIO_REMAP (1 << 4) /* Allocation flags: attributes/access options */ -#define KFD_IOC_ALLOC_MEM_FLAGS_WRITABLE (1 << 31) -#define KFD_IOC_ALLOC_MEM_FLAGS_EXECUTABLE (1 << 30) -#define KFD_IOC_ALLOC_MEM_FLAGS_PUBLIC (1 << 29) -#define KFD_IOC_ALLOC_MEM_FLAGS_NO_SUBSTITUTE (1 << 28) -#define KFD_IOC_ALLOC_MEM_FLAGS_AQL_QUEUE_MEM (1 << 27) -#define KFD_IOC_ALLOC_MEM_FLAGS_COHERENT (1 << 26) +#define KFD_IOC_ALLOC_MEM_FLAGS_WRITABLE (1 << 31) +#define KFD_IOC_ALLOC_MEM_FLAGS_EXECUTABLE (1 << 30) +#define KFD_IOC_ALLOC_MEM_FLAGS_PUBLIC (1 << 29) +#define KFD_IOC_ALLOC_MEM_FLAGS_NO_SUBSTITUTE (1 << 28) +#define KFD_IOC_ALLOC_MEM_FLAGS_AQL_QUEUE_MEM (1 << 27) +#define KFD_IOC_ALLOC_MEM_FLAGS_COHERENT (1 << 26) /* Allocate memory for later SVM (shared virtual memory) mapping. * @@ -460,12 +458,12 @@ struct kfd_ioctl_acquire_vm_args { * @flags: memory type and attributes. See KFD_IOC_ALLOC_MEM_FLAGS above */ struct kfd_ioctl_alloc_memory_of_gpu_args { - __u64 va_addr; /* to KFD */ - __u64 size; /* to KFD */ - __u64 handle; /* from KFD */ - __u64 mmap_offset; /* to KFD (userptr), from KFD (mmap offset) */ - __u32 gpu_id; /* to KFD */ - __u32 flags; + __u64 va_addr; /* to KFD */ + __u64 size; /* to KFD */ + __u64 handle; /* from KFD */ + __u64 mmap_offset; /* to KFD (userptr), from KFD (mmap offset) */ + __u32 gpu_id; /* to KFD */ + __u32 flags; }; /* Free memory allocated with kfd_ioctl_alloc_memory_of_gpu @@ -473,7 +471,7 @@ struct kfd_ioctl_alloc_memory_of_gpu_args { * @handle: memory handle returned by alloc */ struct kfd_ioctl_free_memory_of_gpu_args { - __u64 handle; /* to KFD */ + __u64 handle; /* to KFD */ }; /* Map memory to one or more GPUs @@ -492,10 +490,10 @@ struct kfd_ioctl_free_memory_of_gpu_args { * n_devices. */ struct kfd_ioctl_map_memory_to_gpu_args { - __u64 handle; /* to KFD */ - __u64 device_ids_array_ptr; /* to KFD */ - __u32 n_devices; /* to KFD */ - __u32 n_success; /* to/from KFD */ + __u64 handle; /* to KFD */ + __u64 device_ids_array_ptr; /* to KFD */ + __u32 n_devices; /* to KFD */ + __u32 n_success; /* to/from KFD */ }; /* Unmap memory from one or more GPUs @@ -503,10 +501,10 @@ struct kfd_ioctl_map_memory_to_gpu_args { * same arguments as for mapping */ struct kfd_ioctl_unmap_memory_from_gpu_args { - __u64 handle; /* to KFD */ - __u64 device_ids_array_ptr; /* to KFD */ - __u32 n_devices; /* to KFD */ - __u32 n_success; /* to/from KFD */ + __u64 handle; /* to KFD */ + __u64 device_ids_array_ptr; /* to KFD */ + __u32 n_devices; /* to KFD */ + __u32 n_success; /* to/from KFD */ }; /* Allocate GWS for specific queue @@ -517,28 +515,27 @@ struct kfd_ioctl_unmap_memory_from_gpu_args { * only support contiguous GWS allocation */ struct kfd_ioctl_alloc_queue_gws_args { - __u32 queue_id; /* to KFD */ - __u32 num_gws; /* to KFD */ - __u32 first_gws; /* from KFD */ - __u32 pad; /* to KFD */ + __u32 queue_id; /* to KFD */ + __u32 num_gws; /* to KFD */ + __u32 first_gws; /* from KFD */ + __u32 pad; /* to KFD */ }; struct kfd_ioctl_get_dmabuf_info_args { - __u64 size; /* from KFD */ - __u64 metadata_ptr; /* to KFD */ - __u32 metadata_size; /* to KFD (space allocated by user) - * from KFD (actual metadata size) - */ - __u32 gpu_id; /* from KFD */ - __u32 flags; /* from KFD (KFD_IOC_ALLOC_MEM_FLAGS) */ - __u32 dmabuf_fd; /* to KFD */ + __u64 size; /* from KFD */ + __u64 metadata_ptr; /* to KFD */ + __u32 metadata_size; // to KFD (space allocated by user) + // from KFD (actual metadata size) + __u32 gpu_id; /* from KFD */ + __u32 flags; /* from KFD 
(KFD_IOC_ALLOC_MEM_FLAGS) */ + __u32 dmabuf_fd; /* to KFD */ }; struct kfd_ioctl_import_dmabuf_args { - __u64 va_addr; /* to KFD */ - __u64 handle; /* from KFD */ - __u32 gpu_id; /* to KFD */ - __u32 dmabuf_fd; /* to KFD */ + __u64 va_addr; /* to KFD */ + __u64 handle; /* from KFD */ + __u32 gpu_id; /* to KFD */ + __u32 dmabuf_fd; /* to KFD */ }; /* @@ -548,36 +545,36 @@ struct kfd_ioctl_import_dmabuf_args { #define KFD_SMI_EVENT_VMFAULT 0x0000000000000001 struct kfd_ioctl_smi_events_args { - __u32 gpuid; /* to KFD */ - __u32 anon_fd; /* from KFD */ + __u32 gpuid; /* to KFD */ + __u32 anon_fd; /* from KFD */ }; /* Register offset inside the remapped mmio page */ enum kfd_mmio_remap { - KFD_MMIO_REMAP_HDP_MEM_FLUSH_CNTL = 0, - KFD_MMIO_REMAP_HDP_REG_FLUSH_CNTL = 4, + KFD_MMIO_REMAP_HDP_MEM_FLUSH_CNTL = 0, + KFD_MMIO_REMAP_HDP_REG_FLUSH_CNTL = 4, }; struct kfd_ioctl_ipc_export_handle_args { - __u64 handle; /* to KFD */ - __u32 share_handle[4]; /* from KFD */ - __u32 gpu_id; /* to KFD */ - __u32 pad; + __u64 handle; /* to KFD */ + __u32 share_handle[4]; /* from KFD */ + __u32 gpu_id; /* to KFD */ + __u32 pad; }; struct kfd_ioctl_ipc_import_handle_args { - __u64 handle; /* from KFD */ - __u64 va_addr; /* to KFD */ - __u64 mmap_offset; /* from KFD */ - __u32 share_handle[4]; /* to KFD */ - __u32 gpu_id; /* to KFD */ - __u32 pad; + __u64 handle; /* from KFD */ + __u64 va_addr; /* to KFD */ + __u64 mmap_offset; /* from KFD */ + __u32 share_handle[4]; /* to KFD */ + __u32 gpu_id; /* to KFD */ + __u32 pad; }; struct kfd_memory_range { - __u64 va_addr; - __u64 size; + __u64 va_addr; + __u64 size; }; /* flags definitions @@ -587,143 +584,143 @@ struct kfd_memory_range { #define KFD_CROSS_MEMORY_RW_BIT (1 << 0) #define KFD_SET_CROSS_MEMORY_READ(flags) (flags &= ~KFD_CROSS_MEMORY_RW_BIT) #define KFD_SET_CROSS_MEMORY_WRITE(flags) (flags |= KFD_CROSS_MEMORY_RW_BIT) -#define KFD_IS_CROSS_MEMORY_WRITE(flags) (flags & KFD_CROSS_MEMORY_RW_BIT) +#define KFD_IS_CROSS_MEMORY_WRITE(flags) (flags & KFD_CROSS_MEMORY_RW_BIT) // NOLINT struct kfd_ioctl_cross_memory_copy_args { - /* to KFD: Process ID of the remote process */ - __u32 pid; - /* to KFD: See above definition */ - __u32 flags; - /* to KFD: Source GPU VM range */ - __u64 src_mem_range_array; - /* to KFD: Size of above array */ - __u64 src_mem_array_size; - /* to KFD: Destination GPU VM range */ - __u64 dst_mem_range_array; - /* to KFD: Size of above array */ - __u64 dst_mem_array_size; - /* from KFD: Total amount of bytes copied */ - __u64 bytes_copied; + /* to KFD: Process ID of the remote process */ + __u32 pid; + /* to KFD: See above definition */ + __u32 flags; + /* to KFD: Source GPU VM range */ + __u64 src_mem_range_array; + /* to KFD: Size of above array */ + __u64 src_mem_array_size; + /* to KFD: Destination GPU VM range */ + __u64 dst_mem_range_array; + /* to KFD: Size of above array */ + __u64 dst_mem_array_size; + /* from KFD: Total amount of bytes copied */ + __u64 bytes_copied; }; #define AMDKFD_IOCTL_BASE 'K' -#define AMDKFD_IO(nr) _IO(AMDKFD_IOCTL_BASE, nr) -#define AMDKFD_IOR(nr, type) _IOR(AMDKFD_IOCTL_BASE, nr, type) -#define AMDKFD_IOW(nr, type) _IOW(AMDKFD_IOCTL_BASE, nr, type) -#define AMDKFD_IOWR(nr, type) _IOWR(AMDKFD_IOCTL_BASE, nr, type) +#define AMDKFD_IO(nr) _IO(AMDKFD_IOCTL_BASE, nr) +#define AMDKFD_IOR(nr, type) _IOR(AMDKFD_IOCTL_BASE, nr, type) +#define AMDKFD_IOW(nr, type) _IOW(AMDKFD_IOCTL_BASE, nr, type) +#define AMDKFD_IOWR(nr, type) _IOWR(AMDKFD_IOCTL_BASE, nr, type) -#define AMDKFD_IOC_GET_VERSION \ - AMDKFD_IOR(0x01, 
struct kfd_ioctl_get_version_args) +#define AMDKFD_IOC_GET_VERSION \ + AMDKFD_IOR(0x01, struct kfd_ioctl_get_version_args) -#define AMDKFD_IOC_CREATE_QUEUE \ - AMDKFD_IOWR(0x02, struct kfd_ioctl_create_queue_args) +#define AMDKFD_IOC_CREATE_QUEUE \ + AMDKFD_IOWR(0x02, struct kfd_ioctl_create_queue_args) -#define AMDKFD_IOC_DESTROY_QUEUE \ - AMDKFD_IOWR(0x03, struct kfd_ioctl_destroy_queue_args) +#define AMDKFD_IOC_DESTROY_QUEUE \ + AMDKFD_IOWR(0x03, struct kfd_ioctl_destroy_queue_args) -#define AMDKFD_IOC_SET_MEMORY_POLICY \ - AMDKFD_IOW(0x04, struct kfd_ioctl_set_memory_policy_args) +#define AMDKFD_IOC_SET_MEMORY_POLICY \ + AMDKFD_IOW(0x04, struct kfd_ioctl_set_memory_policy_args) -#define AMDKFD_IOC_GET_CLOCK_COUNTERS \ - AMDKFD_IOWR(0x05, struct kfd_ioctl_get_clock_counters_args) +#define AMDKFD_IOC_GET_CLOCK_COUNTERS \ + AMDKFD_IOWR(0x05, struct kfd_ioctl_get_clock_counters_args) -#define AMDKFD_IOC_GET_PROCESS_APERTURES \ - AMDKFD_IOR(0x06, struct kfd_ioctl_get_process_apertures_args) +#define AMDKFD_IOC_GET_PROCESS_APERTURES \ + AMDKFD_IOR(0x06, struct kfd_ioctl_get_process_apertures_args) -#define AMDKFD_IOC_UPDATE_QUEUE \ - AMDKFD_IOW(0x07, struct kfd_ioctl_update_queue_args) +#define AMDKFD_IOC_UPDATE_QUEUE \ + AMDKFD_IOW(0x07, struct kfd_ioctl_update_queue_args) -#define AMDKFD_IOC_CREATE_EVENT \ - AMDKFD_IOWR(0x08, struct kfd_ioctl_create_event_args) +#define AMDKFD_IOC_CREATE_EVENT \ + AMDKFD_IOWR(0x08, struct kfd_ioctl_create_event_args) -#define AMDKFD_IOC_DESTROY_EVENT \ - AMDKFD_IOW(0x09, struct kfd_ioctl_destroy_event_args) +#define AMDKFD_IOC_DESTROY_EVENT \ + AMDKFD_IOW(0x09, struct kfd_ioctl_destroy_event_args) -#define AMDKFD_IOC_SET_EVENT \ - AMDKFD_IOW(0x0A, struct kfd_ioctl_set_event_args) +#define AMDKFD_IOC_SET_EVENT \ + AMDKFD_IOW(0x0A, struct kfd_ioctl_set_event_args) -#define AMDKFD_IOC_RESET_EVENT \ - AMDKFD_IOW(0x0B, struct kfd_ioctl_reset_event_args) +#define AMDKFD_IOC_RESET_EVENT \ + AMDKFD_IOW(0x0B, struct kfd_ioctl_reset_event_args) -#define AMDKFD_IOC_WAIT_EVENTS \ - AMDKFD_IOWR(0x0C, struct kfd_ioctl_wait_events_args) +#define AMDKFD_IOC_WAIT_EVENTS \ + AMDKFD_IOWR(0x0C, struct kfd_ioctl_wait_events_args) -#define AMDKFD_IOC_DBG_REGISTER \ - AMDKFD_IOW(0x0D, struct kfd_ioctl_dbg_register_args) +#define AMDKFD_IOC_DBG_REGISTER \ + AMDKFD_IOW(0x0D, struct kfd_ioctl_dbg_register_args) -#define AMDKFD_IOC_DBG_UNREGISTER \ - AMDKFD_IOW(0x0E, struct kfd_ioctl_dbg_unregister_args) +#define AMDKFD_IOC_DBG_UNREGISTER \ + AMDKFD_IOW(0x0E, struct kfd_ioctl_dbg_unregister_args) -#define AMDKFD_IOC_DBG_ADDRESS_WATCH \ - AMDKFD_IOW(0x0F, struct kfd_ioctl_dbg_address_watch_args) +#define AMDKFD_IOC_DBG_ADDRESS_WATCH \ + AMDKFD_IOW(0x0F, struct kfd_ioctl_dbg_address_watch_args) -#define AMDKFD_IOC_DBG_WAVE_CONTROL \ - AMDKFD_IOW(0x10, struct kfd_ioctl_dbg_wave_control_args) +#define AMDKFD_IOC_DBG_WAVE_CONTROL \ + AMDKFD_IOW(0x10, struct kfd_ioctl_dbg_wave_control_args) -#define AMDKFD_IOC_SET_SCRATCH_BACKING_VA \ - AMDKFD_IOWR(0x11, struct kfd_ioctl_set_scratch_backing_va_args) +#define AMDKFD_IOC_SET_SCRATCH_BACKING_VA \ + AMDKFD_IOWR(0x11, struct kfd_ioctl_set_scratch_backing_va_args) -#define AMDKFD_IOC_GET_TILE_CONFIG \ - AMDKFD_IOWR(0x12, struct kfd_ioctl_get_tile_config_args) +#define AMDKFD_IOC_GET_TILE_CONFIG \ + AMDKFD_IOWR(0x12, struct kfd_ioctl_get_tile_config_args) -#define AMDKFD_IOC_SET_TRAP_HANDLER \ - AMDKFD_IOW(0x13, struct kfd_ioctl_set_trap_handler_args) +#define AMDKFD_IOC_SET_TRAP_HANDLER \ + AMDKFD_IOW(0x13, struct 
kfd_ioctl_set_trap_handler_args) -#define AMDKFD_IOC_GET_PROCESS_APERTURES_NEW \ - AMDKFD_IOWR(0x14, \ - struct kfd_ioctl_get_process_apertures_new_args) +#define AMDKFD_IOC_GET_PROCESS_APERTURES_NEW \ + AMDKFD_IOWR(0x14, \ + struct kfd_ioctl_get_process_apertures_new_args) -#define AMDKFD_IOC_ACQUIRE_VM \ - AMDKFD_IOW(0x15, struct kfd_ioctl_acquire_vm_args) +#define AMDKFD_IOC_ACQUIRE_VM \ + AMDKFD_IOW(0x15, struct kfd_ioctl_acquire_vm_args) -#define AMDKFD_IOC_ALLOC_MEMORY_OF_GPU \ - AMDKFD_IOWR(0x16, struct kfd_ioctl_alloc_memory_of_gpu_args) +#define AMDKFD_IOC_ALLOC_MEMORY_OF_GPU \ + AMDKFD_IOWR(0x16, struct kfd_ioctl_alloc_memory_of_gpu_args) -#define AMDKFD_IOC_FREE_MEMORY_OF_GPU \ - AMDKFD_IOW(0x17, struct kfd_ioctl_free_memory_of_gpu_args) +#define AMDKFD_IOC_FREE_MEMORY_OF_GPU \ + AMDKFD_IOW(0x17, struct kfd_ioctl_free_memory_of_gpu_args) -#define AMDKFD_IOC_MAP_MEMORY_TO_GPU \ - AMDKFD_IOWR(0x18, struct kfd_ioctl_map_memory_to_gpu_args) +#define AMDKFD_IOC_MAP_MEMORY_TO_GPU \ + AMDKFD_IOWR(0x18, struct kfd_ioctl_map_memory_to_gpu_args) -#define AMDKFD_IOC_UNMAP_MEMORY_FROM_GPU \ - AMDKFD_IOWR(0x19, struct kfd_ioctl_unmap_memory_from_gpu_args) +#define AMDKFD_IOC_UNMAP_MEMORY_FROM_GPU \ + AMDKFD_IOWR(0x19, struct kfd_ioctl_unmap_memory_from_gpu_args) -#define AMDKFD_IOC_SET_CU_MASK \ - AMDKFD_IOW(0x1A, struct kfd_ioctl_set_cu_mask_args) +#define AMDKFD_IOC_SET_CU_MASK \ + AMDKFD_IOW(0x1A, struct kfd_ioctl_set_cu_mask_args) -#define AMDKFD_IOC_GET_QUEUE_WAVE_STATE \ - AMDKFD_IOWR(0x1B, struct kfd_ioctl_get_queue_wave_state_args) +#define AMDKFD_IOC_GET_QUEUE_WAVE_STATE \ + AMDKFD_IOWR(0x1B, struct kfd_ioctl_get_queue_wave_state_args) -#define AMDKFD_IOC_GET_DMABUF_INFO \ - AMDKFD_IOWR(0x1C, struct kfd_ioctl_get_dmabuf_info_args) +#define AMDKFD_IOC_GET_DMABUF_INFO \ + AMDKFD_IOWR(0x1C, struct kfd_ioctl_get_dmabuf_info_args) -#define AMDKFD_IOC_IMPORT_DMABUF \ - AMDKFD_IOWR(0x1D, struct kfd_ioctl_import_dmabuf_args) +#define AMDKFD_IOC_IMPORT_DMABUF \ + AMDKFD_IOWR(0x1D, struct kfd_ioctl_import_dmabuf_args) -#define AMDKFD_IOC_ALLOC_QUEUE_GWS \ - AMDKFD_IOWR(0x1E, struct kfd_ioctl_alloc_queue_gws_args) +#define AMDKFD_IOC_ALLOC_QUEUE_GWS \ + AMDKFD_IOWR(0x1E, struct kfd_ioctl_alloc_queue_gws_args) -#define AMDKFD_IOC_SMI_EVENTS \ - AMDKFD_IOWR(0x1F, struct kfd_ioctl_smi_events_args) +#define AMDKFD_IOC_SMI_EVENTS \ + AMDKFD_IOWR(0x1F, struct kfd_ioctl_smi_events_args) -#define AMDKFD_COMMAND_START 0x01 -#define AMDKFD_COMMAND_END 0x20 +#define AMDKFD_COMMAND_START 0x01 +#define AMDKFD_COMMAND_END 0x20 /* non-upstream ioctls */ #define AMDKFD_IOC_IPC_IMPORT_HANDLE \ - AMDKFD_IOWR(0x1F, struct kfd_ioctl_ipc_import_handle_args) + AMDKFD_IOWR(0x1F, struct kfd_ioctl_ipc_import_handle_args) -#define AMDKFD_IOC_IPC_EXPORT_HANDLE \ - AMDKFD_IOWR(0x20, struct kfd_ioctl_ipc_export_handle_args) +#define AMDKFD_IOC_IPC_EXPORT_HANDLE \ + AMDKFD_IOWR(0x20, struct kfd_ioctl_ipc_export_handle_args) -#define AMDKFD_IOC_DBG_TRAP \ - AMDKFD_IOWR(0x21, struct kfd_ioctl_dbg_trap_args) +#define AMDKFD_IOC_DBG_TRAP \ + AMDKFD_IOWR(0x21, struct kfd_ioctl_dbg_trap_args) -#define AMDKFD_IOC_CROSS_MEMORY_COPY \ - AMDKFD_IOWR(0x22, struct kfd_ioctl_cross_memory_copy_args) +#define AMDKFD_IOC_CROSS_MEMORY_COPY \ + AMDKFD_IOWR(0x22, struct kfd_ioctl_cross_memory_copy_args) -#define AMDKFD_COMMAND_START 0x01 +#define AMDKFD_COMMAND_START 0x01 #undef AMDKFD_COMMAND_END -#define AMDKFD_COMMAND_END 0x22 +#define AMDKFD_COMMAND_END 0x22 #endif // INCLUDE_ROCM_SMI_KFD_IOCTL_H_ diff --git a/include/rocm_smi/rocm_smi.h 
b/include/rocm_smi/rocm_smi.h index 5cf748a6..a6b66b30 100755 --- a/include/rocm_smi/rocm_smi.h +++ b/include/rocm_smi/rocm_smi.h @@ -123,6 +123,8 @@ typedef enum { RSMI_STATUS_BUSY, //!< A resource or mutex could not be //!< acquired because it is already //!< being used + RSMI_STATUS_REFCOUNT_OVERFLOW, //!< An internal reference counter + //!< exceeded INT32_MAX RSMI_STATUS_UNKNOWN_ERROR = 0xFFFFFFFF, //!< An unknown error occurred } rsmi_status_t; diff --git a/include/rocm_smi/rocm_smi_main.h b/include/rocm_smi/rocm_smi_main.h index bb8b326d..3503c483 100755 --- a/include/rocm_smi/rocm_smi_main.h +++ b/include/rocm_smi/rocm_smi_main.h @@ -94,8 +94,18 @@ class RocmSMI { int kfd_notif_evt_fh(void) const {return kfd_notif_evt_fh_;} void set_kfd_notif_evt_fh(int fd) {kfd_notif_evt_fh_ = fd;} std::mutex *kfd_notif_evt_fh_mutex(void) {return &kfd_notif_evt_fh_mutex_;} - int kfd_notif_evt_fh_refcnt_inc() {return ++kfd_notif_evt_fh_refcnt_;} - int kfd_notif_evt_fh_refcnt_dec() {return --kfd_notif_evt_fh_refcnt_;} + std::mutex *bootstrap_mutex(void) {return &bootstrap_mutex_;} + + uint32_t ref_count(void) const {return ref_count_;} + uint32_t ref_count_inc(void) {return ++ref_count_;} + uint32_t ref_count_dec(void) {return --ref_count_;} + + uint32_t kfd_notif_evt_fh_refcnt(void) const { + return kfd_notif_evt_fh_refcnt_;} + uint32_t kfd_notif_evt_fh_refcnt_inc(void) { + return ++kfd_notif_evt_fh_refcnt_;} + uint32_t kfd_notif_evt_fh_refcnt_dec(void) { + return --kfd_notif_evt_fh_refcnt_;} private: std::vector<std::shared_ptr<Device>> devices_; @@ -114,8 +124,12 @@ class RocmSMI { uint32_t euid_; int kfd_notif_evt_fh_; - int kfd_notif_evt_fh_refcnt_; std::mutex kfd_notif_evt_fh_mutex_; + uint32_t kfd_notif_evt_fh_refcnt_; // Access to this should be protected + // by kfd_notif_evt_fh_mutex_ + std::mutex bootstrap_mutex_; + uint32_t ref_count_; // Access to this should be protected + // by bootstrap_mutex_ }; } // namespace smi diff --git a/include/rocm_smi/rocm_smi_utils.h b/include/rocm_smi/rocm_smi_utils.h index be5b75c2..4c042fd1 100755 --- a/include/rocm_smi/rocm_smi_utils.h +++ b/include/rocm_smi/rocm_smi_utils.h @@ -105,6 +105,85 @@ struct ScopedPthread { pthread_wrap& pthrd_ref_; bool mutex_not_acquired_; // Use for AcquireNB (not for Acquire()) }; + + +#define PASTE2(x, y) x##y +#define PASTE(x, y) PASTE2(x, y) + +#define __forceinline __inline__ __attribute__((always_inline)) + +template <typename lambda> +class ScopeGuard { + public: + explicit __forceinline ScopeGuard(const lambda& release) + : release_(release), dismiss_(false) {} + + ScopeGuard(ScopeGuard& rhs) {*this = rhs; } + + __forceinline ~ScopeGuard() { + if (!dismiss_) release_(); + } + __forceinline ScopeGuard& operator=(ScopeGuard& rhs) { + dismiss_ = rhs.dismiss_; + release_ = rhs.release_; + rhs.dismiss_ = true; + return *this; + } + __forceinline void Dismiss() { dismiss_ = true; } + + private: + lambda release_; + bool dismiss_; +}; + +template <typename lambda> +static __forceinline ScopeGuard<lambda> MakeScopeGuard(lambda rel) { + return ScopeGuard<lambda>(rel); +} + +#define MAKE_SCOPE_GUARD_HELPER(lname, sname, ...) \ + auto lname = __VA_ARGS__; \ + amd::smi::ScopeGuard<decltype(lname)> sname(lname); +#define MAKE_SCOPE_GUARD(...) \ + MAKE_SCOPE_GUARD_HELPER(PASTE(scopeGuardLambda, __COUNTER__), \ + PASTE(scopeGuard, __COUNTER__), __VA_ARGS__) +#define MAKE_NAMED_SCOPE_GUARD(name, ...)
\ + MAKE_SCOPE_GUARD_HELPER(PASTE(scopeGuardLambda, __COUNTER__), name, \ + __VA_ARGS__) + + +// A macro to disallow the copy and move constructor and operator= functions +#define DISALLOW_COPY_AND_ASSIGN(TypeName) \ + TypeName(const TypeName&) = delete; \ + TypeName(TypeName&&) = delete; \ + void operator=(const TypeName&) = delete; \ + void operator=(TypeName&&) = delete; + +template <class LockType> +class ScopedAcquire { + public: + /// @brief: When constructing, acquire the lock. + /// @param: lock(Input), pointer to an existing lock. + explicit ScopedAcquire(LockType* lock) : lock_(lock), doRelease(true) { + lock_->Acquire();} + + /// @brief: when destructing, release the lock. + ~ScopedAcquire() { + if (doRelease) lock_->Release(); + } + + /// @brief: Release the lock early. Avoid using when possible. + void Release() { + lock_->Release(); + doRelease = false; + } + + private: + LockType* lock_; + bool doRelease; + /// @brief: Disable copy and assignment. + DISALLOW_COPY_AND_ASSIGN(ScopedAcquire); +}; + } // namespace smi } // namespace amd diff --git a/src/rocm_smi.cc b/src/rocm_smi.cc index 1cb73a6f..cb0df0e0 100755 --- a/src/rocm_smi.cc +++ b/src/rocm_smi.cc @@ -165,20 +165,20 @@ static rsmi_status_t handleException() { return RSMI_STATUS_NOT_SUPPORTED; \ } \ return RSMI_STATUS_INVALID_ARGS; \ - } \ + } #define CHK_SUPPORT(RT_PTR, VR, SUB_VR) \ GET_DEV_FROM_INDX \ CHK_API_SUPPORT_ONLY((RT_PTR), (VR), (SUB_VR)) #define CHK_SUPPORT_NAME_ONLY(RT_PTR) \ - CHK_SUPPORT((RT_PTR), RSMI_DEFAULT_VARIANT, RSMI_DEFAULT_VARIANT) \ + CHK_SUPPORT((RT_PTR), RSMI_DEFAULT_VARIANT, RSMI_DEFAULT_VARIANT) #define CHK_SUPPORT_VAR(RT_PTR, VR) \ - CHK_SUPPORT((RT_PTR), (VR), RSMI_DEFAULT_VARIANT) \ + CHK_SUPPORT((RT_PTR), (VR), RSMI_DEFAULT_VARIANT) #define CHK_SUPPORT_SUBVAR_ONLY(RT_PTR, SUB_VR) \ - CHK_SUPPORT((RT_PTR), RSMI_DEFAULT_VARIANT, (SUB_VR)) \ + CHK_SUPPORT((RT_PTR), RSMI_DEFAULT_VARIANT, (SUB_VR)) static pthread_mutex_t *get_mutex(uint32_t dv_ind) { amd::smi::RocmSMI& smi = amd::smi::RocmSMI::getInstance(); @@ -540,9 +540,29 @@ static bool is_power_of_2(uint64_t n) { rsmi_status_t rsmi_init(uint64_t flags) { TRY - amd::smi::RocmSMI& smi = amd::smi::RocmSMI::getInstance(); - smi.Initialize(flags); + std::lock_guard<std::mutex> guard(*smi.bootstrap_mutex()); + + if (smi.ref_count() == INT32_MAX) { + return RSMI_STATUS_REFCOUNT_OVERFLOW; + } + + (void)smi.ref_count_inc(); + + // If smi.Initialize() throws, we should clean up and dec. ref_count_. + // Otherwise, if no issues, the Dismiss() will prevent the ref_count_ + // decrement. + MAKE_NAMED_SCOPE_GUARD(refGuard, [&]() { (void)smi.ref_count_dec(); }); + + if (smi.ref_count() == 1) { + try { + smi.Initialize(flags); + } catch(...)
{ + smi.Cleanup(); + throw; + } + } + refGuard.Dismiss(); return RSMI_STATUS_SUCCESS; CATCH @@ -555,9 +575,17 @@ rsmi_shut_down(void) { TRY amd::smi::RocmSMI& smi = amd::smi::RocmSMI::getInstance(); + std::lock_guard<std::mutex> guard(*smi.bootstrap_mutex()); + + if (smi.ref_count() == 0) { + return RSMI_STATUS_INIT_ERROR; + } - smi.Cleanup(); + (void)smi.ref_count_dec(); + if (smi.ref_count() == 0) { + smi.Cleanup(); + } return RSMI_STATUS_SUCCESS; CATCH } @@ -2371,6 +2399,15 @@ rsmi_status_string(rsmi_status_t status, const char **status_string) { "type that was expected"; break; + case RSMI_STATUS_BUSY: + *status_string = "A resource or mutex could not be acquired " + "because it is already being used"; + break; + + case RSMI_STATUS_REFCOUNT_OVERFLOW: + *status_string = "An internal reference counter exceeded INT32_MAX"; + break; + case RSMI_STATUS_UNKNOWN_ERROR: *status_string = "An unknown error prevented the call from completing" " successfully"; @@ -3186,6 +3223,7 @@ rsmi_event_notification_init(uint32_t dv_ind) { std::lock_guard<std::mutex> guard(*smi.kfd_notif_evt_fh_mutex()); if (smi.kfd_notif_evt_fh() == -1) { + assert(smi.kfd_notif_evt_fh_refcnt() == 0); int kfd_fd = open(kPathKFDIoctl, O_RDWR | O_CLOEXEC); if (kfd_fd <= 0) { @@ -3199,8 +3237,7 @@ rsmi_event_notification_init(uint32_t dv_ind) { smi.set_kfd_notif_evt_fh(kfd_fd); } - smi.kfd_notif_evt_fh_refcnt_inc(); - + (void)smi.kfd_notif_evt_fh_refcnt_inc(); struct kfd_ioctl_smi_events_args args; assert(dev->kfd_gpu_id() <= UINT32_MAX); @@ -3354,7 +3391,7 @@ rsmi_status_t rsmi_event_notification_stop(uint32_t dv_ind) { dev->set_evt_notif_anon_file_ptr(nullptr); dev->set_evt_notif_anon_fd(-1); - if (!smi.kfd_notif_evt_fh_refcnt_dec()) { + if (smi.kfd_notif_evt_fh_refcnt_dec() == 0) { int ret = close(smi.kfd_notif_evt_fh()); smi.set_kfd_notif_evt_fh(-1); if (ret < 0) { @@ -3385,3 +3422,17 @@ rsmi_test_sleep(uint32_t dv_ind, uint32_t seconds) { sleep(seconds); return RSMI_STATUS_SUCCESS; } + +int32_t +rsmi_test_refcount(uint64_t refcnt_type) { + (void)refcnt_type; + + amd::smi::RocmSMI& smi = amd::smi::RocmSMI::getInstance(); + std::lock_guard<std::mutex> guard(*smi.bootstrap_mutex()); + + if (smi.ref_count() == 0 && smi.monitor_devices().size() != 0) { + return -1; + } + + return smi.ref_count(); +} diff --git a/src/rocm_smi_main.cc b/src/rocm_smi_main.cc index 9cd02244..75d98ce9 100755 --- a/src/rocm_smi_main.cc +++ b/src/rocm_smi_main.cc @@ -244,6 +244,12 @@ RocmSMI::Initialize(uint64_t flags) { auto i = 0; uint32_t ret; + assert(ref_count_ == 1); + if (ref_count_ != 1) { + throw amd::smi::rsmi_exception(RSMI_INITIALIZATION_ERROR, + "Unexpected: RocmSMI ref_count_ != 1"); + } + init_options_ = flags; euid_ = geteuid(); @@ -299,6 +305,10 @@ RocmSMI::Initialize(uint64_t flags) { void RocmSMI::Cleanup() { + s_monitor_devices.clear(); + devices_.clear(); + monitors_.clear(); + if (kfd_notif_evt_fh() >= 0) { int ret = close(kfd_notif_evt_fh()); if (ret < 0) { @@ -306,9 +316,6 @@ RocmSMI::Cleanup() { "Failed to close kfd file handle on shutdown."); } } - s_monitor_devices.clear(); - devices_.clear(); - monitors_.clear(); } RocmSMI::RocmSMI(uint64_t flags) : init_options_(flags), diff --git a/tests/rocm_smi_test/functional/init_shutdown_refcount.cc b/tests/rocm_smi_test/functional/init_shutdown_refcount.cc new file mode 100755 index 00000000..ac0ff97d --- /dev/null +++ b/tests/rocm_smi_test/functional/init_shutdown_refcount.cc @@ -0,0 +1,226 @@ +/* + * ============================================================================= + * ROC Runtime Conformance Release
License + * ============================================================================= + * The University of Illinois/NCSA + * Open Source License (NCSA) + * + * Copyright (c) 2020, Advanced Micro Devices, Inc. + * All rights reserved. + * + * Developed by: + * + * AMD Research and AMD ROC Software Development + * + * Advanced Micro Devices, Inc. + * + * www.amd.com + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal with the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * - Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimers. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimers in + * the documentation and/or other materials provided with the distribution. + * - Neither the names of <Name of Development Group, Name of Institution>, + * nor the names of its contributors may be used to endorse or promote + * products derived from this Software without specific prior written + * permission. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS WITH THE SOFTWARE.
+ * + */ + +#include <pthread.h> + +#include <iostream> +#include <thread> // NOLINT +#include <random> +#include <chrono> // NOLINT + +#include "rocm_smi_test/functional/init_shutdown_refcount.h" +#include "gtest/gtest.h" +#include "rocm_smi/rocm_smi.h" +#include "rocm_smi_test/test_common.h" + +extern int32_t +rsmi_test_refcount(uint64_t refcnt_type); + +static void rand_sleep_mod(int msec) { + unsigned int seed = time(NULL); + std::mt19937_64 eng{seed}; + std::uniform_int_distribution<> dist{10, msec}; + std::this_thread::sleep_for(std::chrono::milliseconds{dist(eng)}); +} + +static void* RSMIInitFunction(void* args) { + rsmi_status_t status; + + (void)args; + rand_sleep_mod(100); + status = rsmi_init(0); + EXPECT_EQ(RSMI_STATUS_SUCCESS, status); + pthread_exit(nullptr); + return nullptr; +} + +static void* RSMIShutDownFunction(void* args) { + rsmi_status_t status; + + (void)args; + rand_sleep_mod(100); + status = rsmi_shut_down(); + EXPECT_EQ(RSMI_STATUS_SUCCESS, status); + pthread_exit(nullptr); + return nullptr; +} + +static void *RSMIInitShutDownFunction(void* args) { + rsmi_status_t status; + + (void)args; + rand_sleep_mod(100); + status = rsmi_init(0); + EXPECT_EQ(RSMI_STATUS_SUCCESS, status); + + rand_sleep_mod(100); + + status = rsmi_shut_down(); + EXPECT_EQ(RSMI_STATUS_SUCCESS, status); + pthread_exit(nullptr); + return nullptr; +} + +static const int NumOfThreads = 100; + +TestConcurrentInit::TestConcurrentInit(void) : TestBase() { + set_title("RSMI Concurrent Init Test"); + set_description("This test initializes RSMI concurrently to verify " + "reference counting functionality."); +} + +TestConcurrentInit::~TestConcurrentInit(void) { +} + +void TestConcurrentInit::SetUp(void) { + // TestBase::SetUp(); // Skip usual SetUp to avoid doing the usual rsmi_init + return; +} + +// Compare required profile for this test case with what we're actually +// running on +void TestConcurrentInit::DisplayTestInfo(void) { + TestBase::DisplayTestInfo(); +} + +void TestConcurrentInit::DisplayResults(void) const { + TestBase::DisplayResults(); + return; +} + +void TestConcurrentInit::Close() { + // This will close handles opened within rsmitst utility calls and call + // rsmi_shut_down(), so it should be done after other rsmi cleanup + TestBase::Close(); +} + +// Compare required profile for this test case with what we're actually +// running on +void TestConcurrentInit::Run(void) { + if (setup_failed_) { + std::cout << "** SetUp Failed for this test. Skipping.**" << std::endl; + return; + } + + pthread_t ThreadId[NumOfThreads]; + pthread_attr_t attr; + pthread_attr_init(&attr); + pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE); + + std::cout << "Testing concurrent rsmi_init()..." << std::endl; + for (int Id = 0; Id < NumOfThreads; ++Id) { + int ThreadStatus = pthread_create(&ThreadId[Id], &attr, + RSMIInitFunction, nullptr); + ASSERT_EQ(0, ThreadStatus) << "pthread_create failed."; + } + + for (int Id = 0; Id < NumOfThreads; ++Id) { + int err = pthread_join(ThreadId[Id], nullptr); + ASSERT_EQ(0, err) << "pthread_join failed."; + } + + // Invoke rsmi_shut_down() and verify that all the rsmi_init() calls were + // counted. RSMI should be fully shut down after exactly NumOfThreads calls.
+ for (int Id = 0; Id < NumOfThreads; ++Id) { + rsmi_status_t err = rsmi_shut_down(); + ASSERT_EQ(RSMI_STATUS_SUCCESS, err) << "An rsmi_init was missed."; + } + + rsmi_status_t err = rsmi_shut_down(); + ASSERT_EQ(RSMI_INITIALIZATION_ERROR, err) << + "rsmi_init reference count was too high."; + + int32_t refcnt = rsmi_test_refcount(0); + ASSERT_EQ(0, refcnt); + + std::cout << "Concurrent rsmi_init() test passed." << std::endl << std::endl; + std::cout << "Testing concurrent rsmi_shut_down()..." << std::endl; + + // Call rsmi_init() NumOfThreads times so that each of the concurrent + // rsmi_shut_down() calls below has an rsmi_init() to balance. + for (int Id = 0; Id < NumOfThreads; ++Id) { + rsmi_status_t err = rsmi_init(0); + ASSERT_EQ(RSMI_STATUS_SUCCESS, err); + } + + for (int Id = 0; Id < NumOfThreads; ++Id) { + int ThreadStatus = + pthread_create(&ThreadId[Id], &attr, RSMIShutDownFunction, nullptr); + ASSERT_EQ(0, ThreadStatus) << "pthread_create failed."; + } + + for (int Id = 0; Id < NumOfThreads; ++Id) { + int err = pthread_join(ThreadId[Id], nullptr); + ASSERT_EQ(0, err) << "pthread_join failed."; + } + + refcnt = rsmi_test_refcount(0); + ASSERT_EQ(0, refcnt); + + std::cout << "Concurrent rsmi_shut_down() passed." << std::endl; + + std::cout << + "Testing concurrent rsmi_init() followed by rsmi_shut_down()..." << + std::endl; + + for (int Id = 0; Id < NumOfThreads; ++Id) { + int ThreadStatus = + pthread_create(&ThreadId[Id], &attr, RSMIInitShutDownFunction, nullptr); + ASSERT_EQ(0, ThreadStatus) << "pthread_create failed."; + } + + for (int Id = 0; Id < NumOfThreads; ++Id) { + int err = pthread_join(ThreadId[Id], nullptr); + ASSERT_EQ(0, err) << "pthread_join failed."; + } + + refcnt = rsmi_test_refcount(0); + ASSERT_EQ(0, refcnt); + + std::cout << + "Concurrent rsmi_init() followed by rsmi_shut_down() passed." << + std::endl; +} diff --git a/tests/rocm_smi_test/functional/init_shutdown_refcount.h b/tests/rocm_smi_test/functional/init_shutdown_refcount.h new file mode 100755 index 00000000..9d7c3212 --- /dev/null +++ b/tests/rocm_smi_test/functional/init_shutdown_refcount.h @@ -0,0 +1,74 @@ +/* + * ============================================================================= + * ROC Runtime Conformance Release License + * ============================================================================= + * The University of Illinois/NCSA + * Open Source License (NCSA) + * + * Copyright (c) 2020, Advanced Micro Devices, Inc. + * All rights reserved. + * + * Developed by: + * + * AMD Research and AMD ROC Software Development + * + * Advanced Micro Devices, Inc. + * + * www.amd.com + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal with the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * - Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimers. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimers in + * the documentation and/or other materials provided with the distribution.
+ * - Neither the names of <Name of Development Group, Name of Institution>, + * nor the names of its contributors may be used to endorse or promote + * products derived from this Software without specific prior written + * permission. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS WITH THE SOFTWARE. + * + */ + +#ifndef TESTS_ROCM_SMI_TEST_FUNCTIONAL_INIT_SHUTDOWN_REFCOUNT_H_ +#define TESTS_ROCM_SMI_TEST_FUNCTIONAL_INIT_SHUTDOWN_REFCOUNT_H_ + +#include "rocm_smi_test/test_base.h" + +class TestConcurrentInit : public TestBase { + public: + TestConcurrentInit(); + + // @Brief: Destructor for the TestConcurrentInit class + virtual ~TestConcurrentInit(); + + // @Brief: Setup the environment for measurement + virtual void SetUp(); + + // @Brief: Core measurement execution + virtual void Run(); + + // @Brief: Clean up and retrieve the resource + virtual void Close(); + + // @Brief: Display results + virtual void DisplayResults() const; + + // @Brief: Display information about what this test does + virtual void DisplayTestInfo(void); +}; + +#endif // TESTS_ROCM_SMI_TEST_FUNCTIONAL_INIT_SHUTDOWN_REFCOUNT_H_ diff --git a/tests/rocm_smi_test/main.cc b/tests/rocm_smi_test/main.cc index 4ad0fe8f..f8c50d54 100755 --- a/tests/rocm_smi_test/main.cc +++ b/tests/rocm_smi_test/main.cc @@ -79,6 +79,7 @@ #include "functional/api_support_read.h" #include "functional/mutual_exclusion.h" #include "functional/evt_notif_read_write.h" +#include "functional/init_shutdown_refcount.h" static RSMITstGlobals *sRSMIGlvalues = nullptr; @@ -226,17 +227,25 @@ TEST(rsmitstReadOnly, TestAPISupportRead) { RunGenericTest(&tst); } TEST(rsmitstReadOnly, TestMutualExclusion) { - TestMutualExclusion test; + TestMutualExclusion tst; - test.DisplayTestInfo(); - test.SetUp(); - test.Run(); - RunCustomTestEpilog(&test); + tst.DisplayTestInfo(); + tst.SetUp(); + tst.Run(); + RunCustomTestEpilog(&tst); } TEST(rsmitstReadWrite, TestEvtNotifReadWrite) { TestEvtNotifReadWrite tst; RunGenericTest(&tst); } +TEST(rsmitstReadOnly, TestConcurrentInit) { + TestConcurrentInit tst; + tst.DisplayTestInfo(); + // tst.SetUp(); // Avoid extra rsmi_init + tst.Run(); + // RunCustomTestEpilog(&tst); // Avoid extra rsmi_shut_down + tst.DisplayResults(); +} int main(int argc, char** argv) { ::testing::InitGoogleTest(&argc, argv);
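Usage sketch (illustrative, not part of the patch): with this change, rsmi_init() and rsmi_shut_down() are reference counted. Only the first rsmi_init() actually runs RocmSMI::Initialize(), and only the rsmi_shut_down() that drops the count back to zero runs RocmSMI::Cleanup(); both entry points serialize on bootstrap_mutex_. The following is a minimal sketch of the resulting behavior, assuming only the public API touched by this patch:

    #include <cassert>
    #include "rocm_smi/rocm_smi.h"

    int main() {
      // First init: ref count 0 -> 1, RocmSMI::Initialize() runs.
      assert(rsmi_init(0) == RSMI_STATUS_SUCCESS);
      // Nested init: ref count 1 -> 2, no re-initialization occurs.
      assert(rsmi_init(0) == RSMI_STATUS_SUCCESS);
      // Ref count 2 -> 1: the library stays initialized.
      assert(rsmi_shut_down() == RSMI_STATUS_SUCCESS);
      // Ref count 1 -> 0: RocmSMI::Cleanup() runs here.
      assert(rsmi_shut_down() == RSMI_STATUS_SUCCESS);
      // Unbalanced shutdown: count is already 0, so an error is returned.
      assert(rsmi_shut_down() == RSMI_STATUS_INIT_ERROR);
      return 0;
    }

rsmi_init() additionally returns the new RSMI_STATUS_REFCOUNT_OVERFLOW once the counter reaches INT32_MAX. The TestConcurrentInit threads above exercise exactly these paths from many threads at once.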