Docker: support new NVidia runtime (#653)
* Update GPU run script

* Dockerfile bugfix: have USE_GPU be in scope, create same code dir as mounted

* Make setup.py respect USE_GPU environment variable

This is necessary since during Docker build the NVidia runtime is always
disabled, so (a) the TensorFlow import will fail if we preinstall
tensorflow-gpu (no CUDA libraries) and (b) nvidia-smi will not be
present, so autodetection will fail.

* Refactor setup.py to appease Codacy

* Update changelog

* Remove debugging code
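The reasoning in the third bullet can be sketched as follows. This is a hypothetical, simplified stand-in for the detection logic — `pick_tf_package` and its argument are illustrative names, not code from this commit:

```python
import os

def pick_tf_package(gpu_autodetected):
    """Choose which TensorFlow pip package to request.

    During `docker build` the NVidia runtime is disabled, so CUDA
    libraries are absent and nvidia-smi is missing: `gpu_autodetected`
    is always False.  The USE_GPU build argument, exported as an
    environment variable, is then the only way to ask for the GPU build.
    """
    use_gpu = gpu_autodetected or os.environ.get("USE_GPU") == "True"
    return "tensorflow-gpu" if use_gpu else "tensorflow"

# Simulating the GPU image build: autodetection fails, but the
# USE_GPU build argument forces the GPU package.
os.environ["USE_GPU"] = "True"
print(pick_tf_package(gpu_autodetected=False))  # tensorflow-gpu
```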
AdamGleave authored and araffin committed Jan 15, 2020
1 parent 483960a commit cc20b83
Showing 4 changed files with 43 additions and 36 deletions.
14 changes: 3 additions & 11 deletions Dockerfile
@@ -1,6 +1,6 @@
 ARG PARENT_IMAGE
-ARG USE_GPU
 FROM $PARENT_IMAGE
+ARG USE_GPU

RUN apt-get -y update \
&& apt-get -y install \
@@ -25,22 +25,14 @@ RUN apt-get -y update \
 ENV CODE_DIR /root/code
 ENV VENV /root/venv
 
-COPY ./setup.py /root/code/setup.py
+COPY ./setup.py ${CODE_DIR}/stable-baselines/setup.py
 RUN \
-    mkdir -p ${CODE_DIR}/stable_baselines && \
     pip install virtualenv && \
     virtualenv $VENV --python=python3 && \
     . $VENV/bin/activate && \
-    cd $CODE_DIR && \
-    pip install --upgrade pip && \
-    if [ "$USE_GPU" = "True" ]; then \
-        TENSORFLOW_PACKAGE="tensorflow-gpu==1.8.0"; \
-    else \
-        TENSORFLOW_PACKAGE="tensorflow==1.8.0"; \
-    fi; \
-    pip install ${TENSORFLOW_PACKAGE} && \
+    cd ${CODE_DIR}/stable-baselines && \
     pip install -e .[mpi,tests] && \
     pip install codacy-coverage && \
     rm -rf $HOME/.cache/pip
 
 ENV PATH=$VENV/bin:$PATH
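The `ARG USE_GPU` move in the first hunk matters because of Docker's scoping rules: an `ARG` declared before `FROM` is visible only to the `FROM` line itself and must be re-declared afterwards to be usable in later instructions (re-declaring without a value inherits the earlier default). A minimal sketch with hypothetical names:

```dockerfile
ARG BASE=ubuntu:18.04
ARG USE_GPU=False
FROM ${BASE}
# USE_GPU is out of scope here until re-declared:
ARG USE_GPU
RUN echo "USE_GPU=${USE_GPU}"
```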
4 changes: 3 additions & 1 deletion docs/misc/changelog.rst
@@ -21,7 +21,9 @@ New Features:
 Bug Fixes:
 ^^^^^^^^^^
 
-- Fixed Docker build script, `scripts/build_docker.sh`, to pass `USE_GPU` build argument.
+- Fixed Docker images via `scripts/build_docker.sh` and `Dockerfile`: GPU image now contains `tensorflow-gpu`,
+  and both images have `stable_baselines` installed in developer mode at correct directory for mounting.
+- Fixed Docker GPU run script, `scripts/run_docker_gpu.sh`, to work with new NVidia Container Toolkit.
 - Repeated calls to `RLModel.learn()` now preserve internal counters for some episode
   logging statistics that used to be zeroed at the start of every call.
 
9 changes: 8 additions & 1 deletion scripts/run_docker_gpu.sh
@@ -6,7 +6,14 @@ cmd_line="$@"
 echo "Executing in the docker (gpu image):"
 echo $cmd_line
 
+# TODO: always use new-style once sufficiently widely used (probably 2021 onwards)
+if [ -x "$(which nvidia-docker)" ]; then
+  # old-style nvidia-docker2
+  NVIDIA_ARG="--runtime=nvidia"
+else
+  NVIDIA_ARG="--gpus all"
+fi
 
-docker run -it --runtime=nvidia --rm --network host --ipc=host \
+docker run -it ${NVIDIA_ARG} --rm --network host --ipc=host \
   --mount src=$(pwd),target=/root/code/stable-baselines,type=bind stablebaselines/stable-baselines:v2.9.0 \
   bash -c "cd /root/code/stable-baselines/ && $cmd_line"
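For illustration, the script's runtime selection can be expressed in Python (a hypothetical helper, not part of the repository; like the script's `which nvidia-docker` check, it only inspects PATH):

```python
import shutil

def nvidia_docker_arg():
    """Pick the GPU flag for `docker run`.

    Old-style nvidia-docker2 installs an `nvidia-docker` wrapper and
    needs `--runtime=nvidia`; the newer NVidia Container Toolkit
    replaces it with Docker's built-in `--gpus` flag.
    """
    if shutil.which("nvidia-docker"):
        return "--runtime=nvidia"  # old-style nvidia-docker2
    return "--gpus all"            # new NVidia Container Toolkit

print(nvidia_docker_arg())
```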
52 changes: 29 additions & 23 deletions setup.py
@@ -1,3 +1,4 @@
+import os
 import sys
 import subprocess
 from setuptools import setup, find_packages
@@ -9,29 +10,34 @@

 # Check tensorflow installation to avoid
 # breaking pre-installed tf gpu
-install_tf, tf_gpu = False, False
-try:
-    import tensorflow as tf
-    if tf.__version__ < LooseVersion('1.8.0'):
-        install_tf = True
-    # check if a gpu version is needed
-    tf_gpu = tf.test.is_gpu_available()
-except ImportError:
-    install_tf = True
-    # Check if a nvidia gpu is present
-    for command in ['nvidia-smi', '/usr/bin/nvidia-smi', 'nvidia-smi.exe']:
-        try:
-            if subprocess.call([command]) == 0:
-                tf_gpu = True
-                break
-        except IOError:  # command does not exist / is not executable
-            pass
-
-tf_dependency = []
-if install_tf:
-    tf_dependency = ['tensorflow-gpu>=1.8.0,<2.0.0'] if tf_gpu else ['tensorflow>=1.8.0,<2.0.0']
-    if tf_gpu:
-        print("A GPU was detected, tensorflow-gpu will be installed")
+def find_tf_dependency():
+    install_tf, tf_gpu = False, False
+    try:
+        import tensorflow as tf
+        if tf.__version__ < LooseVersion('1.8.0'):
+            install_tf = True
+        # check if a gpu version is needed
+        tf_gpu = tf.test.is_gpu_available()
+    except ImportError:
+        install_tf = True
+        # Check if a nvidia gpu is present
+        for command in ['nvidia-smi', '/usr/bin/nvidia-smi', 'nvidia-smi.exe']:
+            try:
+                if subprocess.call([command]) == 0:
+                    tf_gpu = True
+                    break
+            except IOError:  # command does not exist / is not executable
+                pass
+        if os.environ.get('USE_GPU') == 'True':  # force GPU even if not auto-detected
+            tf_gpu = True
+
+    tf_dependency = []
+    if install_tf:
+        tf_dependency = ['tensorflow-gpu>=1.8.0,<2.0.0'] if tf_gpu else ['tensorflow>=1.8.0,<2.0.0']
+        if tf_gpu:
+            print("A GPU was detected, tensorflow-gpu will be installed")
+
+    return tf_dependency


long_description = """
@@ -119,7 +125,7 @@
         'numpy',
         'pandas',
         'matplotlib'
-    ] + tf_dependency,
+    ] + find_tf_dependency(),
     extras_require={
         'mpi': [
             'mpi4py',
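The `nvidia-smi` probe loop from the refactored `setup.py` can be exercised standalone. This sketch reuses the loop from the diff; the wrapper function and the stdout/stderr suppression are our additions (and `OSError` stands in for the diff's `IOError`, which is the same class in Python 3):

```python
import subprocess

def probe_gpu(commands=("nvidia-smi", "/usr/bin/nvidia-smi", "nvidia-smi.exe")):
    """Return True if any nvidia-smi variant exits with status 0."""
    for command in commands:
        try:
            if subprocess.call([command],
                               stdout=subprocess.DEVNULL,
                               stderr=subprocess.DEVNULL) == 0:
                return True
        except OSError:  # command does not exist / is not executable
            pass
    return False

print(probe_gpu())  # False unless an NVidia driver is present
```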
