fix(api/device): use recent timestamp for GPU process utilization query (#85)
XuehaiPan · Aug 4, 2023 · ef77b8b · ef77b8b
1 parent ec53de7
commit ef77b8b
Show file tree

Hide file tree

Showing 3 changed files with 20 additions and 7 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -17,7 +17,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ### Changed
 
--
+- Use recent timestamp for GPU process utilization query for more accurate per-process GPU usage by [@XuehaiPan](https://github.com/XuehaiPan) in [#85](https://github.com/XuehaiPan/nvitop/pull/85). We extend our heartfelt gratitude to [@2581543189](https://github.com/2581543189) for their invaluable assistance. Their timely comments and comprehensive feedback have greatly contributed to the improvement of this project.
 
 ### Fixed
 

diff --git a/nvitop/api/device.py b/nvitop/api/device.py
@@ -113,6 +113,7 @@
 import sys
 import textwrap
 import threading
+import time
 from collections import OrderedDict
 from typing import (
     TYPE_CHECKING,
@@ -683,7 +684,6 @@ def __init__(
                 self._nvml_index = libnvml.nvmlQuery('nvmlDeviceGetIndex', self._handle)
 
         self._max_clock_infos: ClockInfos = ClockInfos(graphics=NA, sm=NA, memory=NA, video=NA)
-        self._timestamp: int = 0
         self._lock: threading.RLock = threading.RLock()
 
         self._ident: tuple[Hashable, str] = (self.index, self.uuid())
@@ -1700,11 +1700,13 @@ def processes(self) -> dict[int, GpuProcess]:
             samples = libnvml.nvmlQuery(
                 'nvmlDeviceGetProcessUtilization',
                 self.handle,
-                self._timestamp,
+                # Only utilization samples that were recorded after this timestamp will be returned.
+                # The CPU timestamp, i.e. absolute Unix epoch timestamp (in microseconds), is used.
+                # Here we use a timestamp from 1/4 second ago to ensure the record buffer is not empty.
+                time.time_ns() // 1000 - 250_000,
                 default=(),
             )
-            self._timestamp = max(min((s.timeStamp for s in samples), default=0) - 2_000_000, 0)
-            for s in samples:
+            for s in sorted(samples, key=lambda s: s.timeStamp):
                 try:
                     processes[s.pid].set_gpu_utilization(s.smUtil, s.memUtil, s.encUtil, s.decUtil)
                 except KeyError:
@@ -2019,7 +2021,6 @@ def __init__(
                 raise libnvml.NVMLError_NotFound
 
         self._max_clock_infos = ClockInfos(graphics=NA, sm=NA, memory=NA, video=NA)
-        self._timestamp = 0
         self._lock = threading.RLock()
 
         self._ident = (self.index, self.uuid())

diff --git a/nvitop/api/process.py b/nvitop/api/process.py
@@ -694,7 +694,19 @@ def update_gpu_status(self) -> int | NaType:
         self.set_gpu_memory(NA)
         self.set_gpu_cc_protected_memory(NA)
         self.set_gpu_utilization(NA, NA, NA, NA)
-        self.device.processes()
+        processes = self.device.processes()
+        process = processes.get(self.pid, self)
+        if process is not self:
+            # The current process is gone and the instance has been removed from the cache.
+            # Update GPU status from the new instance.
+            self.set_gpu_memory(process.gpu_memory())
+            self.set_gpu_cc_protected_memory(process.gpu_cc_protected_memory())
+            self.set_gpu_utilization(
+                process.gpu_sm_utilization(),
+                process.gpu_memory_utilization(),
+                process.gpu_encoder_utilization(),
+                process.gpu_decoder_utilization(),
+            )
         return self.gpu_memory()
 
     @property