From 1a194afcb5e55d7d58db8d23495b71d3a14a5363 Mon Sep 17 00:00:00 2001
From: Xuehai Pan
Date: Mon, 17 Oct 2022 17:46:52 +0800
Subject: [PATCH] feat(core/libnvml): add compatibility layers for NVML Python
bindings (#30)
---
.github/ISSUE_TEMPLATE.md | 3 +-
README.md | 15 +--
nvitop/cli.py | 25 +++-
nvitop/core/libnvml.py | 259 +++++++++++++++++++++++++++++++++++++-
nvitop/version.py | 3 +
pyproject.toml | 2 +-
requirements.txt | 2 +-
7 files changed, 286 insertions(+), 23 deletions(-)
diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md
index b66bd5f8..8cfaf0c1 100644
--- a/.github/ISSUE_TEMPLATE.md
+++ b/.github/ISSUE_TEMPLATE.md
@@ -4,7 +4,8 @@
1. Was this issue already reported? Please do a quick search.
2. Maybe the problem is solved in the current master branch already?
- Simply clone nvitop's git repository and run ./nvitop.py to find out.
+ Simply clone nvitop's git repository and run `LOGLEVEL=DEBUG ./nvitop.py`
+ to find out.
3. Provide all the relevant information, as outlined in this template.
Feel free to remove any sections you don't need.
-->
diff --git a/README.md b/README.md
index 5184f6b2..dee765b3 100644
--- a/README.md
+++ b/README.md
@@ -166,20 +166,6 @@ pip3 install .
-**IMPORTANT:** `pip` will install `nvidia-ml-py>=11.450.51,<=11.495.46` as a dependency for `nvitop`. Please verify whether the `nvidia-ml-py` package is compatible with your NVIDIA driver version. You can check the release history of `nvidia-ml-py` at [nvidia-ml-py's Release History](https://pypi.org/project/nvidia-ml-py/11.495.46/#history), and install the compatible version manually by:
-
-```bash
-pip3 install --no-dependencies 'nvidia-ml-py==xx.yyy.zzz'
-```
-
-Since `nvidia-ml-py>=11.450.129`, the definition of `nvmlProcessInfo_t` has introduced two new fields `gpuInstanceId` and `computeInstanceId` (`GI ID` and `CI ID` in newer `nvidia-smi`) which are incompatible with some old NVIDIA drivers. `nvitop` may not display the processes correctly due to this incompatibility.
-
-You can specified the version of `nvidia-ml-py` while installing `nvitop` as:
-
-```bash
-pip3 install 'nvitop[pynvml-11.450.51]' # or 'nvitop[cuda10]'
-```
-
------
## Usage
@@ -359,6 +345,7 @@ process filtering:
| `NVITOP_MONITOR_MODE` | The default display mode (a comma-separated string) | `auto` / `full` / `compact`
`plain` / `colorful`
`dark` / `light` | `auto,plain,dark` |
| `NVITOP_GPU_UTILIZATION_THRESHOLDS` | Thresholds of GPU utilization | `10,75` , `1,99`, ... | `10,75` |
| `NVITOP_MEMORY_UTILIZATION_THRESHOLDS` | Thresholds of GPU memory percent | `10,80` , `1,99`, ... | `10,80` |
+| `LOGLEVEL` | Log level for log messages | `DEBUG` , `INFO`, `WARNING`, ... | `WARNING` |
For example:
diff --git a/nvitop/cli.py b/nvitop/cli.py
index e62306d5..68646000 100644
--- a/nvitop/cli.py
+++ b/nvitop/cli.py
@@ -353,9 +353,8 @@ def main(): # pylint: disable=too-many-branches,too-many-statements,too-many-lo
parent = HostProcess().parent()
grandparent = parent.parent() if parent is not None else None
if grandparent is not None and parent.name() == 'sh' and grandparent.name() == 'watch':
- print(
- 'HINT: You are running `nvitop` under `watch` command. Please try `nvitop -m` directly.',
- file=sys.stderr,
+ messages.append(
+ 'HINT: You are running `nvitop` under `watch` command. Please try `nvitop -m` directly.'
)
ui.print()
@@ -397,12 +396,32 @@ def main(): # pylint: disable=too-many-branches,too-many-statements,too-many-lo
).replace('@VERSION@', Device.driver_version())
messages.append(message)
+ # pylint: disable-next=protected-access
+ if libnvml._driver_get_memory_info_v2_available and not libnvml._pynvml_memory_v2_available:
+ messages.append(
+ (
+ 'WARNING: The `{0}` package does not support the NVML memory info version 2 APIs, which would\n'
+ 'get inaccurate results. Please upgrade `{0}` via `{1}`.'
+ ).format(
+ colored('nvidia-ml-py', attrs=('bold',)),
+ colored('pip3 install --upgrade nvitop nvidia-ml-py', attrs=('bold',)),
+ )
+ )
+
if len(messages) > 0:
for message in messages:
if message.startswith('ERROR:'):
message = message.replace(
'ERROR:', colored('ERROR:', color='red', attrs=('bold',)), 1
)
+ elif message.startswith('WARNING:'):
+ message = message.replace(
+ 'WARNING:', colored('WARNING:', color='yellow', attrs=('bold',)), 1
+ )
+ elif message.startswith('HINT:'):
+ message = message.replace(
+ 'HINT:', colored('HINT:', color='green', attrs=('bold',)), 1
+ )
print(message, file=sys.stderr)
return 1
return 0
diff --git a/nvitop/core/libnvml.py b/nvitop/core/libnvml.py
index ec006842..a2ca1f12 100644
--- a/nvitop/core/libnvml.py
+++ b/nvitop/core/libnvml.py
@@ -5,8 +5,11 @@
# pylint: disable=invalid-name
+import ctypes as _ctypes
+import functools as _functools
import inspect as _inspect
import logging as _logging
+import os as _os
import re as _re
import sys as _sys
import threading as _threading
@@ -49,7 +52,6 @@
# Load members from module `pynvml` and register them in `__all__` and globals.
_vars_pynvml = vars(_pynvml)
-_vars = _OrderedDict()
_name = _attr = None
_errcode_to_name = {}
_const_names = []
@@ -158,6 +160,22 @@
__lock = _threading.Lock()
LOGGER = _logging.getLogger(__name__)
+try:
+ LOGGER.setLevel(_os.getenv('LOGLEVEL', default='WARNING').upper())
+except (ValueError, TypeError):
+ pass
+if not LOGGER.hasHandlers() and LOGGER.isEnabledFor(_logging.DEBUG):
+ _formatter = _logging.Formatter(
+ '[%(levelname)s] %(asctime)s %(name)s::%(funcName)s: %(message)s'
+ )
+ _stream_handler = _logging.StreamHandler()
+ _stream_handler.setFormatter(_formatter)
+ _file_handler = _logging.FileHandler('nvitop.log')
+ _file_handler.setFormatter(_formatter)
+ LOGGER.addHandler(_stream_handler)
+ LOGGER.addHandler(_file_handler)
+ del _formatter, _stream_handler, _file_handler
+
UNKNOWN_FUNCTIONS = {}
UNKNOWN_FUNCTIONS_CACHE_SIZE = 1024
VERSIONED_PATTERN = _re.compile(r'^(?P\w+)(?P_v(\d)+)$')
@@ -397,7 +415,241 @@ def nvmlCheckReturn(
return retval != NA and isinstance(retval, types)
-# Add support for lookup fallback and context manager.
+# Patch layers for backward compatibility ##########################################################
+def __patch_backward_compatibility_layers() -> None:
+ function_name_mapping_lock = _threading.Lock()
+ function_name_mapping = {}
+
+ def function_mapping_update(mapping):
+ with function_name_mapping_lock:
+ mapping = dict(mapping)
+ for name, mapped_name in function_name_mapping.items():
+ if mapped_name in mapping:
+ mapping[name] = mapping[mapped_name]
+ function_name_mapping.update(mapping)
+ return mapping
+
+ def with_mapped_function_name():
+ def wrapper(nvmlGetFunctionPointer):
+ @_functools.wraps(nvmlGetFunctionPointer)
+ def wrapped(name):
+ mapped_name = function_name_mapping.get(name, name)
+ return nvmlGetFunctionPointer(mapped_name)
+
+ return wrapped
+
+ _pynvml.__dict__.update( # need to use module.__dict__.__setitem__ because module.__setattr__ will not work
+ _nvmlGetFunctionPointer=wrapper(
+ _pynvml._nvmlGetFunctionPointer # pylint: disable=protected-access
+ )
+ )
+
+ def patch_function_pointers_when_fail(names, callback):
+ """Patches the function pointers of the NVML library."""
+
+ def wrapper(nvmlGetFunctionPointer):
+ @_functools.wraps(nvmlGetFunctionPointer)
+ def wrapped(name):
+ try:
+ return nvmlGetFunctionPointer(name)
+ except NVMLError_FunctionNotFound as ex:
+ if name in names:
+ new_name = callback(name, names, ex, _pynvml, __modself)
+ return nvmlGetFunctionPointer(new_name)
+ raise
+
+ return wrapped
+
+ return wrapper
+
+ def patch_process_info():
+ PrintableStructure = _pynvml._PrintableStructure # pylint: disable=protected-access
+
+ # pylint: disable-next=missing-class-docstring,too-few-public-methods
+ class c_nvmlProcessInfo_v1_t(PrintableStructure):
+ _fields_ = [
+ ('pid', _ctypes.c_uint),
+ ('usedGpuMemory', _ctypes.c_ulonglong),
+ ]
+ _fmt_ = {
+ 'usedGpuMemory': '%d B',
+ }
+
+ # pylint: disable-next=missing-class-docstring,too-few-public-methods
+ class c_nvmlProcessInfo_v2_t(PrintableStructure):
+ _fields_ = [
+ ('pid', _ctypes.c_uint),
+ ('usedGpuMemory', _ctypes.c_ulonglong),
+ ('gpuInstanceId', _ctypes.c_uint),
+ ('computeInstanceId', _ctypes.c_uint),
+ ]
+ _fmt_ = {
+ 'usedGpuMemory': '%d B',
+ }
+
+ nvmlDeviceGetRunningProcesses_v3_v2 = {
+ 'nvmlDeviceGetComputeRunningProcesses_v3': 'nvmlDeviceGetComputeRunningProcesses_v2',
+ 'nvmlDeviceGetGraphicsRunningProcesses_v3': 'nvmlDeviceGetGraphicsRunningProcesses_v2',
+ 'nvmlDeviceGetMPSComputeRunningProcesses_v3': 'nvmlDeviceGetMPSComputeRunningProcesses_v2',
+ }
+ nvmlDeviceGetRunningProcesses_v2_v1 = {
+ 'nvmlDeviceGetComputeRunningProcesses_v2': 'nvmlDeviceGetComputeRunningProcesses',
+ 'nvmlDeviceGetGraphicsRunningProcesses_v2': 'nvmlDeviceGetGraphicsRunningProcesses',
+ 'nvmlDeviceGetMPSComputeRunningProcesses_v2': 'nvmlDeviceGetMPSComputeRunningProcesses',
+ }
+
+ def patch_process_info_callback(
+ name, names, exception, pynvml, modself
+ ): # pylint: disable=unused-argument
+ if name in nvmlDeviceGetRunningProcesses_v3_v2:
+ mapping = nvmlDeviceGetRunningProcesses_v3_v2
+ struct_type = c_nvmlProcessInfo_v2_t
+ elif name in nvmlDeviceGetRunningProcesses_v2_v1:
+ mapping = nvmlDeviceGetRunningProcesses_v2_v1
+ struct_type = c_nvmlProcessInfo_v1_t
+ else:
+ raise exception # no fallbacks for v1 APIs
+
+ LOGGER.debug('Patching NVML function pointer `%s`', name)
+ mapping = function_mapping_update(mapping)
+ pynvml.__dict__.update(c_nvmlProcessInfo_t=struct_type)
+ modself.__dict__.update(c_nvmlProcessInfo_t=struct_type)
+
+ for old_name, mapped_name in mapping.items():
+ LOGGER.debug(' Map NVML function `%s` to `%s`', old_name, mapped_name)
+ LOGGER.debug(
+ ' Patch NVML struct `c_nvmlProcessInfo_t` to `%s`', struct_type.__name__
+ )
+ return mapping[name]
+
+ _pynvml.__dict__.update( # need to use module.__dict__.__setitem__ because module.__setattr__ will not work
+ # The patching ordering is important
+ _nvmlGetFunctionPointer=patch_function_pointers_when_fail(
+ names=set(nvmlDeviceGetRunningProcesses_v3_v2), callback=patch_process_info_callback
+ )(
+ patch_function_pointers_when_fail(
+ names=set(nvmlDeviceGetRunningProcesses_v2_v1),
+ callback=patch_process_info_callback,
+ )(
+ _pynvml._nvmlGetFunctionPointer # pylint: disable=protected-access
+ )
+ )
+ )
+
+ with_mapped_function_name() # must be patched first, and only once
+ patch_process_info()
+
+
+__patch_backward_compatibility_layers()
+del __patch_backward_compatibility_layers
+
+
+_pynvml_memory_v2_available = hasattr(_pynvml, 'nvmlMemory_v2')
+_pynvml_get_memory_info_v2_available = _pynvml_memory_v2_available
+_driver_get_memory_info_v2_available = None
+
+
+def nvmlDeviceGetMemoryInfo(handle): # pylint: disable=function-redefined,too-many-branches
+ """Retrieves the amount of used, free, reserved and total memory available on the device, in bytes.
+
+ Note:
+ - The version 2 API adds additional memory information. The reserved amount is supported on
+ version 2 only.
+ - In MIG mode, if device handle is provided, the API returns aggregate information, only if
+ the caller has appropriate privileges. Per-instance information can be queried by using
+ specific MIG device handles.
+
+ Raises:
+ NVMLError_Uninitialized:
+ If the library has not been successfully initialized.
+ NVMLError_NoPermission:
+ If the user doesn't have permission to perform this operation.
+ NVMLError_InvalidArgument:
+ If device is invalid or memory is NULL.
+ NVMLError_GpuIsLost:
+ If the target GPU has fallen off the bus or is otherwise inaccessible.
+ NVMLError_Unknown:
+ On any unexpected error.
+ """
+
+ global _pynvml_get_memory_info_v2_available, _driver_get_memory_info_v2_available # pylint: disable=global-statement
+
+ _lazy_init()
+
+ if _driver_get_memory_info_v2_available is None:
+ try:
+ # pylint: disable-next=protected-access
+ _pynvml._nvmlGetFunctionPointer('nvmlDeviceGetMemoryInfo_v2')
+ except NVMLError_FunctionNotFound:
+ with __lock:
+ _driver_get_memory_info_v2_available = False
+ _pynvml_get_memory_info_v2_available = False
+ else:
+ with __lock:
+ _driver_get_memory_info_v2_available = True
+
+ if _driver_get_memory_info_v2_available:
+ if _pynvml_memory_v2_available:
+ # driver ✔ pynvml ?
+ try:
+ # pylint: disable-next=unexpected-keyword-arg,no-member
+ retval = _pynvml.nvmlDeviceGetMemoryInfo(handle, version=_pynvml.nvmlMemory_v2)
+ except TypeError as ex:
+ if 'unexpected keyword argument' in str(ex).lower():
+ # driver ✔ pynvml ✘
+ with __lock:
+ _pynvml_get_memory_info_v2_available = False
+ LOGGER.debug(
+ 'NVML memory info version 2 is not available due to incompatible `nvidia-ml-py` package.'
+ )
+ else:
+ # driver ✔ pynvml ? user ✘
+ with __lock:
+ _driver_get_memory_info_v2_available = (
+ None # unset the flag for user exceptions
+ )
+ raise
+ except (NVMLError_FunctionNotFound, NVMLError_Unknown):
+ # driver ✔ pynvml ✘
+ with __lock:
+ _pynvml_get_memory_info_v2_available = False
+ LOGGER.debug(
+ 'NVML memory info version 2 is not available due to incompatible NVIDIA driver.'
+ )
+ else:
+ # driver ✔ pynvml ✔
+ LOGGER.debug('NVML memory info version 2 is available.')
+ return retval
+ else:
+ # driver ✔ pynvml ✘
+ LOGGER.debug(
+ 'NVML constant `nvmlMemory_v2` not found in package `nvidia-ml-py`, but '
+ 'your NVIDIA driver does support the NVML memory info version 2 APIs. NVML '
+ 'memory info version 2 is not available due to the legacy dependencies. '
+ 'Please consider upgrading your `nvidia-ml-py` package by running '
+ '`pip3 install --upgrade nvitop nvidia-ml-py`.'
+ )
+ elif _pynvml_memory_v2_available:
+ # driver ✘ pynvml ?
+ LOGGER.debug(
+ 'NVML memory info version 2 is not available due to incompatible NVIDIA driver.'
+ )
+ else:
+ # driver ✘ pynvml ✘
+ LOGGER.debug(
+ 'NVML constant `nvmlMemory_v2` not found in package `nvidia-ml-py`, and '
+ 'your NVIDIA driver does not support the NVML memory info version 2 APIs. '
+ 'NVML memory info version 2 is not available.'
+ )
+
+ elif _pynvml_get_memory_info_v2_available:
+ # pylint: disable-next=unexpected-keyword-arg
+ return _pynvml.nvmlDeviceGetMemoryInfo(handle, version=_pynvml.nvmlMemory_v2)
+
+ return _pynvml.nvmlDeviceGetMemoryInfo(handle)
+
+
+# Add support for lookup fallback and context manager ##############################################
class _CustomModule(_ModuleType):
"""Modified module type to support lookup fallback and context manager.
@@ -445,6 +697,7 @@ def __del__(self) -> None:
__modself.__class__ = _CustomModule
del _CustomModule
-del _inspect, _logging, _re, _sys, _threading
+# Delete imported references
+del _inspect, _logging, _os, _re, _sys, _threading
del _OrderedDict, _FunctionType, _ModuleType
del _Tuple, _Callable, _Type, _Union, _Optional, _Any
diff --git a/nvitop/version.py b/nvitop/version.py
index 8c2cdf15..f0caface 100644
--- a/nvitop/version.py
+++ b/nvitop/version.py
@@ -49,6 +49,9 @@
'11.460.79',
'11.470.66',
'11.495.46',
+ '11.510.69', # the first version that supports the `nvmlMemory_v2` API
+ '11.515.48',
+ '11.515.75',
]
"""The list of supported ``nvidia-ml-py`` versions.
See also: `nvidia-ml-py's Release History `_.
diff --git a/pyproject.toml b/pyproject.toml
index c38bf36a..4f4b6422 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -44,7 +44,7 @@ classifiers = [
"Topic :: Utilities",
]
dependencies = [
- "nvidia-ml-py >= 11.450.51, < 11.500.0a0",
+ "nvidia-ml-py >= 11.450.51, < 11.516.0a0",
"psutil >= 5.6.6",
"cachetools >= 1.0.1",
"termcolor >= 1.0.0",
diff --git a/requirements.txt b/requirements.txt
index 0878e2c9..8060624c 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,4 +1,4 @@
-nvidia-ml-py >= 11.450.51, < 11.500.0a0
+nvidia-ml-py >= 11.450.51, < 11.516.0a0
psutil >= 5.6.6
cachetools >= 1.0.1
termcolor >= 1.0.0