feat(core/libnvml): add compatibility layers for NVML Python bindings (…

…#30)
XuehaiPan · Oct 17, 2022 · 1a194af · 1a194af
1 parent 72f31e5
commit 1a194af
Show file tree

Hide file tree

Showing 7 changed files with 286 additions and 23 deletions.
diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md
@@ -4,7 +4,8 @@
 
   1. Was this issue already reported? Please do a quick search.
   2. Maybe the problem is solved in the current master branch already?
-     Simply clone nvitop's git repository and run ./nvitop.py to find out.
+     Simply clone nvitop's git repository and run `LOGLEVEL=DEBUG ./nvitop.py`
+     to find out.
   3. Provide all the relevant information, as outlined in this template.
      Feel free to remove any sections you don't need.
 -->

diff --git a/README.md b/README.md
@@ -166,20 +166,6 @@ pip3 install .
   </br>
 </p>
 
-**IMPORTANT:** `pip` will install `nvidia-ml-py>=11.450.51,<=11.495.46` as a dependency for `nvitop`. Please verify whether the `nvidia-ml-py` package is compatible with your NVIDIA driver version. You can check the release history of `nvidia-ml-py` at [nvidia-ml-py's Release History](https://pypi.org/project/nvidia-ml-py/11.495.46/#history), and install the compatible version manually by:
-
-```bash
-pip3 install --no-dependencies 'nvidia-ml-py==xx.yyy.zzz'
-```
-
-Since `nvidia-ml-py>=11.450.129`, the definition of `nvmlProcessInfo_t` has introduced two new fields `gpuInstanceId` and `computeInstanceId` (`GI ID` and `CI ID` in newer `nvidia-smi`) which are incompatible with some old NVIDIA drivers. `nvitop` may not display the processes correctly due to this incompatibility.
-
-You can specified the version of `nvidia-ml-py` while installing `nvitop` as:
-
-```bash
-pip3 install 'nvitop[pynvml-11.450.51]'  # or 'nvitop[cuda10]'
-```
-
 ------
 
 ## Usage
@@ -359,6 +345,7 @@ process filtering:
 | `NVITOP_MONITOR_MODE`                  | The default display mode (a comma-separated string) | `auto` / `full` / `compact`<br>`plain` / `colorful`<br>`dark` / `light` | `auto,plain,dark` |
 | `NVITOP_GPU_UTILIZATION_THRESHOLDS`    | Thresholds of GPU utilization                       | `10,75` , `1,99`, ...                                                   | `10,75`           |
 | `NVITOP_MEMORY_UTILIZATION_THRESHOLDS` | Thresholds of GPU memory percent                    | `10,80` , `1,99`, ...                                                   | `10,80`           |
+| `LOGLEVEL`                             | Log level for log messages                          | `DEBUG` , `INFO`, `WARNING`, ...                                        | `WARNING`         |
 
 For example:
 

diff --git a/nvitop/cli.py b/nvitop/cli.py
@@ -353,9 +353,8 @@ def main():  # pylint: disable=too-many-branches,too-many-statements,too-many-lo
             parent = HostProcess().parent()
             grandparent = parent.parent() if parent is not None else None
             if grandparent is not None and parent.name() == 'sh' and grandparent.name() == 'watch':
-                print(
-                    'HINT: You are running `nvitop` under `watch` command. Please try `nvitop -m` directly.',
-                    file=sys.stderr,
+                messages.append(
+                    'HINT: You are running `nvitop` under `watch` command. Please try `nvitop -m` directly.'
                 )
 
     ui.print()
@@ -397,12 +396,32 @@ def main():  # pylint: disable=too-many-branches,too-many-statements,too-many-lo
             ).replace('@VERSION@', Device.driver_version())
         messages.append(message)
 
+    # pylint: disable-next=protected-access
+    if libnvml._driver_get_memory_info_v2_available and not libnvml._pynvml_memory_v2_available:
+        messages.append(
+            (
+                'WARNING: The `{0}` package does not support the NVML memory info version 2 APIs, which would\n'
+                'get inaccurate results. Please upgrade `{0}` via `{1}`.'
+            ).format(
+                colored('nvidia-ml-py', attrs=('bold',)),
+                colored('pip3 install --upgrade nvitop nvidia-ml-py', attrs=('bold',)),
+            )
+        )
+
     if len(messages) > 0:
         for message in messages:
             if message.startswith('ERROR:'):
                 message = message.replace(
                     'ERROR:', colored('ERROR:', color='red', attrs=('bold',)), 1
                 )
+            elif message.startswith('WARNING:'):
+                message = message.replace(
+                    'WARNING:', colored('WARNING:', color='yellow', attrs=('bold',)), 1
+                )
+            elif message.startswith('HINT:'):
+                message = message.replace(
+                    'HINT:', colored('HINT:', color='green', attrs=('bold',)), 1
+                )
             print(message, file=sys.stderr)
         return 1
     return 0

diff --git a/nvitop/core/libnvml.py b/nvitop/core/libnvml.py
@@ -5,8 +5,11 @@
 
 # pylint: disable=invalid-name
 
+import ctypes as _ctypes
+import functools as _functools
 import inspect as _inspect
 import logging as _logging
+import os as _os
 import re as _re
 import sys as _sys
 import threading as _threading
@@ -49,7 +52,6 @@
 
 # Load members from module `pynvml` and register them in `__all__` and globals.
 _vars_pynvml = vars(_pynvml)
-_vars = _OrderedDict()
 _name = _attr = None
 _errcode_to_name = {}
 _const_names = []
@@ -158,6 +160,22 @@
 __lock = _threading.Lock()
 
 LOGGER = _logging.getLogger(__name__)
+try:
+    LOGGER.setLevel(_os.getenv('LOGLEVEL', default='WARNING').upper())
+except (ValueError, TypeError):
+    pass
+if not LOGGER.hasHandlers() and LOGGER.isEnabledFor(_logging.DEBUG):
+    _formatter = _logging.Formatter(
+        '[%(levelname)s] %(asctime)s %(name)s::%(funcName)s: %(message)s'
+    )
+    _stream_handler = _logging.StreamHandler()
+    _stream_handler.setFormatter(_formatter)
+    _file_handler = _logging.FileHandler('nvitop.log')
+    _file_handler.setFormatter(_formatter)
+    LOGGER.addHandler(_stream_handler)
+    LOGGER.addHandler(_file_handler)
+    del _formatter, _stream_handler, _file_handler
+
 UNKNOWN_FUNCTIONS = {}
 UNKNOWN_FUNCTIONS_CACHE_SIZE = 1024
 VERSIONED_PATTERN = _re.compile(r'^(?P<name>\w+)(?P<suffix>_v(\d)+)$')
@@ -397,7 +415,241 @@ def nvmlCheckReturn(
     return retval != NA and isinstance(retval, types)
 
 
-# Add support for lookup fallback and context manager.
+# Patch layers for backward compatibility ##########################################################
+def __patch_backward_compatibility_layers() -> None:
+    function_name_mapping_lock = _threading.Lock()
+    function_name_mapping = {}
+
+    def function_mapping_update(mapping):
+        with function_name_mapping_lock:
+            mapping = dict(mapping)
+            for name, mapped_name in function_name_mapping.items():
+                if mapped_name in mapping:
+                    mapping[name] = mapping[mapped_name]
+            function_name_mapping.update(mapping)
+        return mapping
+
+    def with_mapped_function_name():
+        def wrapper(nvmlGetFunctionPointer):
+            @_functools.wraps(nvmlGetFunctionPointer)
+            def wrapped(name):
+                mapped_name = function_name_mapping.get(name, name)
+                return nvmlGetFunctionPointer(mapped_name)
+
+            return wrapped
+
+        _pynvml.__dict__.update(  # need to use module.__dict__.__setitem__ because module.__setattr__ will not work
+            _nvmlGetFunctionPointer=wrapper(
+                _pynvml._nvmlGetFunctionPointer  # pylint: disable=protected-access
+            )
+        )
+
+    def patch_function_pointers_when_fail(names, callback):
+        """Patches the function pointers of the NVML library."""
+
+        def wrapper(nvmlGetFunctionPointer):
+            @_functools.wraps(nvmlGetFunctionPointer)
+            def wrapped(name):
+                try:
+                    return nvmlGetFunctionPointer(name)
+                except NVMLError_FunctionNotFound as ex:
+                    if name in names:
+                        new_name = callback(name, names, ex, _pynvml, __modself)
+                        return nvmlGetFunctionPointer(new_name)
+                    raise
+
+            return wrapped
+
+        return wrapper
+
+    def patch_process_info():
+        PrintableStructure = _pynvml._PrintableStructure  # pylint: disable=protected-access
+
+        # pylint: disable-next=missing-class-docstring,too-few-public-methods
+        class c_nvmlProcessInfo_v1_t(PrintableStructure):
+            _fields_ = [
+                ('pid', _ctypes.c_uint),
+                ('usedGpuMemory', _ctypes.c_ulonglong),
+            ]
+            _fmt_ = {
+                'usedGpuMemory': '%d B',
+            }
+
+        # pylint: disable-next=missing-class-docstring,too-few-public-methods
+        class c_nvmlProcessInfo_v2_t(PrintableStructure):
+            _fields_ = [
+                ('pid', _ctypes.c_uint),
+                ('usedGpuMemory', _ctypes.c_ulonglong),
+                ('gpuInstanceId', _ctypes.c_uint),
+                ('computeInstanceId', _ctypes.c_uint),
+            ]
+            _fmt_ = {
+                'usedGpuMemory': '%d B',
+            }
+
+        nvmlDeviceGetRunningProcesses_v3_v2 = {
+            'nvmlDeviceGetComputeRunningProcesses_v3': 'nvmlDeviceGetComputeRunningProcesses_v2',
+            'nvmlDeviceGetGraphicsRunningProcesses_v3': 'nvmlDeviceGetGraphicsRunningProcesses_v2',
+            'nvmlDeviceGetMPSComputeRunningProcesses_v3': 'nvmlDeviceGetMPSComputeRunningProcesses_v2',
+        }
+        nvmlDeviceGetRunningProcesses_v2_v1 = {
+            'nvmlDeviceGetComputeRunningProcesses_v2': 'nvmlDeviceGetComputeRunningProcesses',
+            'nvmlDeviceGetGraphicsRunningProcesses_v2': 'nvmlDeviceGetGraphicsRunningProcesses',
+            'nvmlDeviceGetMPSComputeRunningProcesses_v2': 'nvmlDeviceGetMPSComputeRunningProcesses',
+        }
+
+        def patch_process_info_callback(
+            name, names, exception, pynvml, modself
+        ):  # pylint: disable=unused-argument
+            if name in nvmlDeviceGetRunningProcesses_v3_v2:
+                mapping = nvmlDeviceGetRunningProcesses_v3_v2
+                struct_type = c_nvmlProcessInfo_v2_t
+            elif name in nvmlDeviceGetRunningProcesses_v2_v1:
+                mapping = nvmlDeviceGetRunningProcesses_v2_v1
+                struct_type = c_nvmlProcessInfo_v1_t
+            else:
+                raise exception  # no fallbacks for v1 APIs
+
+            LOGGER.debug('Patching NVML function pointer `%s`', name)
+            mapping = function_mapping_update(mapping)
+            pynvml.__dict__.update(c_nvmlProcessInfo_t=struct_type)
+            modself.__dict__.update(c_nvmlProcessInfo_t=struct_type)
+
+            for old_name, mapped_name in mapping.items():
+                LOGGER.debug('    Map NVML function `%s` to `%s`', old_name, mapped_name)
+            LOGGER.debug(
+                '    Patch NVML struct `c_nvmlProcessInfo_t` to `%s`', struct_type.__name__
+            )
+            return mapping[name]
+
+        _pynvml.__dict__.update(  # need to use module.__dict__.__setitem__ because module.__setattr__ will not work
+            # The patching ordering is important
+            _nvmlGetFunctionPointer=patch_function_pointers_when_fail(
+                names=set(nvmlDeviceGetRunningProcesses_v3_v2), callback=patch_process_info_callback
+            )(
+                patch_function_pointers_when_fail(
+                    names=set(nvmlDeviceGetRunningProcesses_v2_v1),
+                    callback=patch_process_info_callback,
+                )(
+                    _pynvml._nvmlGetFunctionPointer  # pylint: disable=protected-access
+                )
+            )
+        )
+
+    with_mapped_function_name()  # patch first and only for once
+    patch_process_info()
+
+
+__patch_backward_compatibility_layers()
+del __patch_backward_compatibility_layers
+
+
+_pynvml_memory_v2_available = hasattr(_pynvml, 'nvmlMemory_v2')
+_pynvml_get_memory_info_v2_available = _pynvml_memory_v2_available
+_driver_get_memory_info_v2_available = None
+
+
+def nvmlDeviceGetMemoryInfo(handle):  # pylint: disable=function-redefined,too-many-branches
+    """Retrieves the amount of used, free, reserved and total memory available on the device, in bytes.
+
+    Note:
+        - The version 2 API adds additional memory information. The reserved amount is supported on
+          version 2 only.
+        - In MIG mode, if device handle is provided, the API returns aggregate information, only if
+          the caller has appropriate privileges. Per-instance information can be queried by using
+          specific MIG device handles.
+
+    Raises:
+        NVMLError_InvalidArgument:
+            If the library has not been successfully initialized.
+        NVMLError_NoPermission:
+            If the user doesn't have permission to perform this operation.
+        NVMLError_InvalidArgument:
+            If device is invalid or memory is NULL.
+        NVMLError_GpuIsLost:
+            If the target GPU has fallen off the bus or is otherwise inaccessible.
+        NVMLError_Unknown:
+            On any unexpected error.
+    """
+
+    global _pynvml_get_memory_info_v2_available, _driver_get_memory_info_v2_available  # pylint: disable=global-statement
+
+    _lazy_init()
+
+    if _driver_get_memory_info_v2_available is None:
+        try:
+            # pylint: disable-next=protected-access
+            _pynvml._nvmlGetFunctionPointer('nvmlDeviceGetMemoryInfo_v2')
+        except NVMLError_FunctionNotFound:
+            with __lock:
+                _driver_get_memory_info_v2_available = False
+                _pynvml_get_memory_info_v2_available = False
+        else:
+            with __lock:
+                _driver_get_memory_info_v2_available = True
+
+        if _driver_get_memory_info_v2_available:
+            if _pynvml_memory_v2_available:
+                # driver ✔ pynvml ?
+                try:
+                    # pylint: disable-next=unexpected-keyword-arg,no-member
+                    retval = _pynvml.nvmlDeviceGetMemoryInfo(handle, version=_pynvml.nvmlMemory_v2)
+                except TypeError as ex:
+                    if 'unexpected keyword argument' in str(ex).lower():
+                        # driver ✔ pynvml ✘
+                        with __lock:
+                            _pynvml_get_memory_info_v2_available = False
+                        LOGGER.debug(
+                            'NVML memory info version 2 is not available due to incompatible `nvidia-ml-py` package.'
+                        )
+                    else:
+                        # driver ✔ pynvml ? user ✘
+                        with __lock:
+                            _driver_get_memory_info_v2_available = (
+                                None  # unset the flag for user exceptions
+                            )
+                        raise
+                except (NVMLError_FunctionNotFound, NVMLError_Unknown):
+                    # driver ✔ pynvml ✘
+                    with __lock:
+                        _pynvml_get_memory_info_v2_available = False
+                    LOGGER.debug(
+                        'NVML memory info version 2 is not available due to incompatible NVIDIA driver.'
+                    )
+                else:
+                    # driver ✔ pynvml ✔
+                    LOGGER.debug('NVML memory info version 2 is available.')
+                    return retval
+            else:
+                # driver ✔ pynvml ✘
+                LOGGER.debug(
+                    'NVML constant `nvmlMemory_v2` not found in package `nvidia-ml-py`, but '
+                    'your NVIDIA driver does support the NVML memory info version 2 APIs. NVML '
+                    'memory info version 2 is not available due to the legacy dependencies. '
+                    'Please consider upgrading your `nvidia-ml-py` package by running '
+                    '`pip3 install --upgrade nvitop nvidia-ml-py`.'
+                )
+        elif _pynvml_memory_v2_available:
+            # driver ✘ pynvml ?
+            LOGGER.debug(
+                'NVML memory info version 2 is not available due to incompatible NVIDIA driver.'
+            )
+        else:
+            # driver ✘ pynvml ✘
+            LOGGER.debug(
+                'NVML constant `nvmlMemory_v2` not found in package `nvidia-ml-py`, and '
+                'your NVIDIA driver does not support the NVML memory info version 2 APIs. '
+                'NVML memory info version 2 is not available.'
+            )
+
+    elif _pynvml_get_memory_info_v2_available:
+        # pylint: disable-next=unexpected-keyword-arg
+        return _pynvml.nvmlDeviceGetMemoryInfo(handle, version=_pynvml.nvmlMemory_v2)
+
+    return _pynvml.nvmlDeviceGetMemoryInfo(handle)
+
+
+# Add support for lookup fallback and context manager ##############################################
 class _CustomModule(_ModuleType):
     """Modified module type to support lookup fallback and context manager.
 
@@ -445,6 +697,7 @@ def __del__(self) -> None:
 __modself.__class__ = _CustomModule
 del _CustomModule
 
-del _inspect, _logging, _re, _sys, _threading
+# Delete imported references
+del _inspect, _logging, _os, _re, _sys, _threading
 del _OrderedDict, _FunctionType, _ModuleType
 del _Tuple, _Callable, _Type, _Union, _Optional, _Any
diff --git a/nvitop/version.py b/nvitop/version.py
@@ -49,6 +49,9 @@
     '11.460.79',
     '11.470.66',
     '11.495.46',
+    '11.510.69',  # the first version supports the `nvmlMemory_v2` API
+    '11.515.48',
+    '11.515.75',
 ]
 """The list of supported ``nvidia-ml-py`` versions.
 See also: `nvidia-ml-py's Release History <https://pypi.org/project/nvidia-ml-py/#history>`_.

diff --git a/pyproject.toml b/pyproject.toml
@@ -44,7 +44,7 @@ classifiers = [
     "Topic :: Utilities",
 ]
 dependencies = [
-    "nvidia-ml-py >= 11.450.51, < 11.500.0a0",
+    "nvidia-ml-py >= 11.450.51, < 11.516.0a0",
     "psutil >= 5.6.6",
     "cachetools >= 1.0.1",
     "termcolor >= 1.0.0",