From 1a194afcb5e55d7d58db8d23495b71d3a14a5363 Mon Sep 17 00:00:00 2001 From: Xuehai Pan Date: Mon, 17 Oct 2022 17:46:52 +0800 Subject: [PATCH] feat(core/libnvml): add compatibility layers for NVML Python bindings (#30) --- .github/ISSUE_TEMPLATE.md | 3 +- README.md | 15 +-- nvitop/cli.py | 25 +++- nvitop/core/libnvml.py | 259 +++++++++++++++++++++++++++++++++++++- nvitop/version.py | 3 + pyproject.toml | 2 +- requirements.txt | 2 +- 7 files changed, 286 insertions(+), 23 deletions(-) diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md index b66bd5f8..8cfaf0c1 100644 --- a/.github/ISSUE_TEMPLATE.md +++ b/.github/ISSUE_TEMPLATE.md @@ -4,7 +4,8 @@ 1. Was this issue already reported? Please do a quick search. 2. Maybe the problem is solved in the current master branch already? - Simply clone nvitop's git repository and run ./nvitop.py to find out. + Simply clone nvitop's git repository and run `LOGLEVEL=DEBUG ./nvitop.py` + to find out. 3. Provide all the relevant information, as outlined in this template. Feel free to remove any sections you don't need. --> diff --git a/README.md b/README.md index 5184f6b2..dee765b3 100644 --- a/README.md +++ b/README.md @@ -166,20 +166,6 @@ pip3 install .

-**IMPORTANT:** `pip` will install `nvidia-ml-py>=11.450.51,<=11.495.46` as a dependency for `nvitop`. Please verify whether the `nvidia-ml-py` package is compatible with your NVIDIA driver version. You can check the release history of `nvidia-ml-py` at [nvidia-ml-py's Release History](https://pypi.org/project/nvidia-ml-py/11.495.46/#history), and install the compatible version manually by: - -```bash -pip3 install --no-dependencies 'nvidia-ml-py==xx.yyy.zzz' -``` - -Since `nvidia-ml-py>=11.450.129`, the definition of `nvmlProcessInfo_t` has introduced two new fields `gpuInstanceId` and `computeInstanceId` (`GI ID` and `CI ID` in newer `nvidia-smi`) which are incompatible with some old NVIDIA drivers. `nvitop` may not display the processes correctly due to this incompatibility. - -You can specified the version of `nvidia-ml-py` while installing `nvitop` as: - -```bash -pip3 install 'nvitop[pynvml-11.450.51]' # or 'nvitop[cuda10]' -``` - ------ ## Usage @@ -359,6 +345,7 @@ process filtering: | `NVITOP_MONITOR_MODE` | The default display mode (a comma-separated string) | `auto` / `full` / `compact`
`plain` / `colorful`
`dark` / `light` | `auto,plain,dark` | | `NVITOP_GPU_UTILIZATION_THRESHOLDS` | Thresholds of GPU utilization | `10,75` , `1,99`, ... | `10,75` | | `NVITOP_MEMORY_UTILIZATION_THRESHOLDS` | Thresholds of GPU memory percent | `10,80` , `1,99`, ... | `10,80` | +| `LOGLEVEL` | Log level for log messages | `DEBUG` , `INFO`, `WARNING`, ... | `WARNING` | For example: diff --git a/nvitop/cli.py b/nvitop/cli.py index e62306d5..68646000 100644 --- a/nvitop/cli.py +++ b/nvitop/cli.py @@ -353,9 +353,8 @@ def main(): # pylint: disable=too-many-branches,too-many-statements,too-many-lo parent = HostProcess().parent() grandparent = parent.parent() if parent is not None else None if grandparent is not None and parent.name() == 'sh' and grandparent.name() == 'watch': - print( - 'HINT: You are running `nvitop` under `watch` command. Please try `nvitop -m` directly.', - file=sys.stderr, + messages.append( + 'HINT: You are running `nvitop` under `watch` command. Please try `nvitop -m` directly.' ) ui.print() @@ -397,12 +396,32 @@ def main(): # pylint: disable=too-many-branches,too-many-statements,too-many-lo ).replace('@VERSION@', Device.driver_version()) messages.append(message) + # pylint: disable-next=protected-access + if libnvml._driver_get_memory_info_v2_available and not libnvml._pynvml_memory_v2_available: + messages.append( + ( + 'WARNING: The `{0}` package does not support the NVML memory info version 2 APIs, which would\n' + 'get inaccurate results. Please upgrade `{0}` via `{1}`.' + ).format( + colored('nvidia-ml-py', attrs=('bold',)), + colored('pip3 install --upgrade nvitop nvidia-ml-py', attrs=('bold',)), + ) + ) + if len(messages) > 0: for message in messages: if message.startswith('ERROR:'): message = message.replace( 'ERROR:', colored('ERROR:', color='red', attrs=('bold',)), 1 ) + elif message.startswith('WARNING:'): + message = message.replace( + 'WARNING:', colored('WARNING:', color='yellow', attrs=('bold',)), 1 + ) + elif message.startswith('HINT:'): + message = message.replace( + 'HINT:', colored('HINT:', color='green', attrs=('bold',)), 1 + ) print(message, file=sys.stderr) return 1 return 0 diff --git a/nvitop/core/libnvml.py b/nvitop/core/libnvml.py index ec006842..a2ca1f12 100644 --- a/nvitop/core/libnvml.py +++ b/nvitop/core/libnvml.py @@ -5,8 +5,11 @@ # pylint: disable=invalid-name +import ctypes as _ctypes +import functools as _functools import inspect as _inspect import logging as _logging +import os as _os import re as _re import sys as _sys import threading as _threading @@ -49,7 +52,6 @@ # Load members from module `pynvml` and register them in `__all__` and globals. _vars_pynvml = vars(_pynvml) -_vars = _OrderedDict() _name = _attr = None _errcode_to_name = {} _const_names = [] @@ -158,6 +160,22 @@ __lock = _threading.Lock() LOGGER = _logging.getLogger(__name__) +try: + LOGGER.setLevel(_os.getenv('LOGLEVEL', default='WARNING').upper()) +except (ValueError, TypeError): + pass +if not LOGGER.hasHandlers() and LOGGER.isEnabledFor(_logging.DEBUG): + _formatter = _logging.Formatter( + '[%(levelname)s] %(asctime)s %(name)s::%(funcName)s: %(message)s' + ) + _stream_handler = _logging.StreamHandler() + _stream_handler.setFormatter(_formatter) + _file_handler = _logging.FileHandler('nvitop.log') + _file_handler.setFormatter(_formatter) + LOGGER.addHandler(_stream_handler) + LOGGER.addHandler(_file_handler) + del _formatter, _stream_handler, _file_handler + UNKNOWN_FUNCTIONS = {} UNKNOWN_FUNCTIONS_CACHE_SIZE = 1024 VERSIONED_PATTERN = _re.compile(r'^(?P\w+)(?P_v(\d)+)$') @@ -397,7 +415,241 @@ def nvmlCheckReturn( return retval != NA and isinstance(retval, types) -# Add support for lookup fallback and context manager. +# Patch layers for backward compatibility ########################################################## +def __patch_backward_compatibility_layers() -> None: + function_name_mapping_lock = _threading.Lock() + function_name_mapping = {} + + def function_mapping_update(mapping): + with function_name_mapping_lock: + mapping = dict(mapping) + for name, mapped_name in function_name_mapping.items(): + if mapped_name in mapping: + mapping[name] = mapping[mapped_name] + function_name_mapping.update(mapping) + return mapping + + def with_mapped_function_name(): + def wrapper(nvmlGetFunctionPointer): + @_functools.wraps(nvmlGetFunctionPointer) + def wrapped(name): + mapped_name = function_name_mapping.get(name, name) + return nvmlGetFunctionPointer(mapped_name) + + return wrapped + + _pynvml.__dict__.update( # need to use module.__dict__.__setitem__ because module.__setattr__ will not work + _nvmlGetFunctionPointer=wrapper( + _pynvml._nvmlGetFunctionPointer # pylint: disable=protected-access + ) + ) + + def patch_function_pointers_when_fail(names, callback): + """Patches the function pointers of the NVML library.""" + + def wrapper(nvmlGetFunctionPointer): + @_functools.wraps(nvmlGetFunctionPointer) + def wrapped(name): + try: + return nvmlGetFunctionPointer(name) + except NVMLError_FunctionNotFound as ex: + if name in names: + new_name = callback(name, names, ex, _pynvml, __modself) + return nvmlGetFunctionPointer(new_name) + raise + + return wrapped + + return wrapper + + def patch_process_info(): + PrintableStructure = _pynvml._PrintableStructure # pylint: disable=protected-access + + # pylint: disable-next=missing-class-docstring,too-few-public-methods + class c_nvmlProcessInfo_v1_t(PrintableStructure): + _fields_ = [ + ('pid', _ctypes.c_uint), + ('usedGpuMemory', _ctypes.c_ulonglong), + ] + _fmt_ = { + 'usedGpuMemory': '%d B', + } + + # pylint: disable-next=missing-class-docstring,too-few-public-methods + class c_nvmlProcessInfo_v2_t(PrintableStructure): + _fields_ = [ + ('pid', _ctypes.c_uint), + ('usedGpuMemory', _ctypes.c_ulonglong), + ('gpuInstanceId', _ctypes.c_uint), + ('computeInstanceId', _ctypes.c_uint), + ] + _fmt_ = { + 'usedGpuMemory': '%d B', + } + + nvmlDeviceGetRunningProcesses_v3_v2 = { + 'nvmlDeviceGetComputeRunningProcesses_v3': 'nvmlDeviceGetComputeRunningProcesses_v2', + 'nvmlDeviceGetGraphicsRunningProcesses_v3': 'nvmlDeviceGetGraphicsRunningProcesses_v2', + 'nvmlDeviceGetMPSComputeRunningProcesses_v3': 'nvmlDeviceGetMPSComputeRunningProcesses_v2', + } + nvmlDeviceGetRunningProcesses_v2_v1 = { + 'nvmlDeviceGetComputeRunningProcesses_v2': 'nvmlDeviceGetComputeRunningProcesses', + 'nvmlDeviceGetGraphicsRunningProcesses_v2': 'nvmlDeviceGetGraphicsRunningProcesses', + 'nvmlDeviceGetMPSComputeRunningProcesses_v2': 'nvmlDeviceGetMPSComputeRunningProcesses', + } + + def patch_process_info_callback( + name, names, exception, pynvml, modself + ): # pylint: disable=unused-argument + if name in nvmlDeviceGetRunningProcesses_v3_v2: + mapping = nvmlDeviceGetRunningProcesses_v3_v2 + struct_type = c_nvmlProcessInfo_v2_t + elif name in nvmlDeviceGetRunningProcesses_v2_v1: + mapping = nvmlDeviceGetRunningProcesses_v2_v1 + struct_type = c_nvmlProcessInfo_v1_t + else: + raise exception # no fallbacks for v1 APIs + + LOGGER.debug('Patching NVML function pointer `%s`', name) + mapping = function_mapping_update(mapping) + pynvml.__dict__.update(c_nvmlProcessInfo_t=struct_type) + modself.__dict__.update(c_nvmlProcessInfo_t=struct_type) + + for old_name, mapped_name in mapping.items(): + LOGGER.debug(' Map NVML function `%s` to `%s`', old_name, mapped_name) + LOGGER.debug( + ' Patch NVML struct `c_nvmlProcessInfo_t` to `%s`', struct_type.__name__ + ) + return mapping[name] + + _pynvml.__dict__.update( # need to use module.__dict__.__setitem__ because module.__setattr__ will not work + # The patching ordering is important + _nvmlGetFunctionPointer=patch_function_pointers_when_fail( + names=set(nvmlDeviceGetRunningProcesses_v3_v2), callback=patch_process_info_callback + )( + patch_function_pointers_when_fail( + names=set(nvmlDeviceGetRunningProcesses_v2_v1), + callback=patch_process_info_callback, + )( + _pynvml._nvmlGetFunctionPointer # pylint: disable=protected-access + ) + ) + ) + + with_mapped_function_name() # patch first and only for once + patch_process_info() + + +__patch_backward_compatibility_layers() +del __patch_backward_compatibility_layers + + +_pynvml_memory_v2_available = hasattr(_pynvml, 'nvmlMemory_v2') +_pynvml_get_memory_info_v2_available = _pynvml_memory_v2_available +_driver_get_memory_info_v2_available = None + + +def nvmlDeviceGetMemoryInfo(handle): # pylint: disable=function-redefined,too-many-branches + """Retrieves the amount of used, free, reserved and total memory available on the device, in bytes. + + Note: + - The version 2 API adds additional memory information. The reserved amount is supported on + version 2 only. + - In MIG mode, if device handle is provided, the API returns aggregate information, only if + the caller has appropriate privileges. Per-instance information can be queried by using + specific MIG device handles. + + Raises: + NVMLError_InvalidArgument: + If the library has not been successfully initialized. + NVMLError_NoPermission: + If the user doesn't have permission to perform this operation. + NVMLError_InvalidArgument: + If device is invalid or memory is NULL. + NVMLError_GpuIsLost: + If the target GPU has fallen off the bus or is otherwise inaccessible. + NVMLError_Unknown: + On any unexpected error. + """ + + global _pynvml_get_memory_info_v2_available, _driver_get_memory_info_v2_available # pylint: disable=global-statement + + _lazy_init() + + if _driver_get_memory_info_v2_available is None: + try: + # pylint: disable-next=protected-access + _pynvml._nvmlGetFunctionPointer('nvmlDeviceGetMemoryInfo_v2') + except NVMLError_FunctionNotFound: + with __lock: + _driver_get_memory_info_v2_available = False + _pynvml_get_memory_info_v2_available = False + else: + with __lock: + _driver_get_memory_info_v2_available = True + + if _driver_get_memory_info_v2_available: + if _pynvml_memory_v2_available: + # driver ✔ pynvml ? + try: + # pylint: disable-next=unexpected-keyword-arg,no-member + retval = _pynvml.nvmlDeviceGetMemoryInfo(handle, version=_pynvml.nvmlMemory_v2) + except TypeError as ex: + if 'unexpected keyword argument' in str(ex).lower(): + # driver ✔ pynvml ✘ + with __lock: + _pynvml_get_memory_info_v2_available = False + LOGGER.debug( + 'NVML memory info version 2 is not available due to incompatible `nvidia-ml-py` package.' + ) + else: + # driver ✔ pynvml ? user ✘ + with __lock: + _driver_get_memory_info_v2_available = ( + None # unset the flag for user exceptions + ) + raise + except (NVMLError_FunctionNotFound, NVMLError_Unknown): + # driver ✔ pynvml ✘ + with __lock: + _pynvml_get_memory_info_v2_available = False + LOGGER.debug( + 'NVML memory info version 2 is not available due to incompatible NVIDIA driver.' + ) + else: + # driver ✔ pynvml ✔ + LOGGER.debug('NVML memory info version 2 is available.') + return retval + else: + # driver ✔ pynvml ✘ + LOGGER.debug( + 'NVML constant `nvmlMemory_v2` not found in package `nvidia-ml-py`, but ' + 'your NVIDIA driver does support the NVML memory info version 2 APIs. NVML ' + 'memory info version 2 is not available due to the legacy dependencies. ' + 'Please consider upgrading your `nvidia-ml-py` package by running ' + '`pip3 install --upgrade nvitop nvidia-ml-py`.' + ) + elif _pynvml_memory_v2_available: + # driver ✘ pynvml ? + LOGGER.debug( + 'NVML memory info version 2 is not available due to incompatible NVIDIA driver.' + ) + else: + # driver ✘ pynvml ✘ + LOGGER.debug( + 'NVML constant `nvmlMemory_v2` not found in package `nvidia-ml-py`, and ' + 'your NVIDIA driver does not support the NVML memory info version 2 APIs. ' + 'NVML memory info version 2 is not available.' + ) + + elif _pynvml_get_memory_info_v2_available: + # pylint: disable-next=unexpected-keyword-arg + return _pynvml.nvmlDeviceGetMemoryInfo(handle, version=_pynvml.nvmlMemory_v2) + + return _pynvml.nvmlDeviceGetMemoryInfo(handle) + + +# Add support for lookup fallback and context manager ############################################## class _CustomModule(_ModuleType): """Modified module type to support lookup fallback and context manager. @@ -445,6 +697,7 @@ def __del__(self) -> None: __modself.__class__ = _CustomModule del _CustomModule -del _inspect, _logging, _re, _sys, _threading +# Delete imported references +del _inspect, _logging, _os, _re, _sys, _threading del _OrderedDict, _FunctionType, _ModuleType del _Tuple, _Callable, _Type, _Union, _Optional, _Any diff --git a/nvitop/version.py b/nvitop/version.py index 8c2cdf15..f0caface 100644 --- a/nvitop/version.py +++ b/nvitop/version.py @@ -49,6 +49,9 @@ '11.460.79', '11.470.66', '11.495.46', + '11.510.69', # the first version supports the `nvmlMemory_v2` API + '11.515.48', + '11.515.75', ] """The list of supported ``nvidia-ml-py`` versions. See also: `nvidia-ml-py's Release History `_. diff --git a/pyproject.toml b/pyproject.toml index c38bf36a..4f4b6422 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -44,7 +44,7 @@ classifiers = [ "Topic :: Utilities", ] dependencies = [ - "nvidia-ml-py >= 11.450.51, < 11.500.0a0", + "nvidia-ml-py >= 11.450.51, < 11.516.0a0", "psutil >= 5.6.6", "cachetools >= 1.0.1", "termcolor >= 1.0.0", diff --git a/requirements.txt b/requirements.txt index 0878e2c9..8060624c 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,4 @@ -nvidia-ml-py >= 11.450.51, < 11.500.0a0 +nvidia-ml-py >= 11.450.51, < 11.516.0a0 psutil >= 5.6.6 cachetools >= 1.0.1 termcolor >= 1.0.0