Skip to content

Commit bd9a394

Browse files
author
Chris Elion
authored
Fix timers when using multithreading. (#3901)
1 parent 9ab9203 commit bd9a394

File tree

5 files changed

+101
-20
lines changed

5 files changed

+101
-20
lines changed

.pylintrc

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,3 +46,6 @@ disable =
4646

4747
# Using the global statement
4848
W0603,
49+
50+
# "Access to a protected member _foo of a client class (protected-access)"
51+
W0212

docs/Profiling-Python.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,9 +43,14 @@ By default, at the end of training, timers are collected and written in json for
4343
is optional and defaults to false.
4444

4545
### Parallel execution
46+
#### Subprocesses
4647
For code that executes in multiple processes (for example, SubprocessEnvManager), we periodically send the timer
4748
information back to the "main" process, aggregate the timers there, and flush them in the subprocess. Note that
4849
(depending on the number of processes) this can result in timers where the total time may exceed the parent's total
4950
time. This is analogous to the difference between "real" and "user" values reported from the Unix `time` command. In the
5051
timer output, blocks that were run in parallel are indicated by the `is_parallel` flag.
5152

53+
#### Threads
54+
Timers currently use `time.perf_counter()` to track time spent, which may not give accurate results for multiple
55+
threads. If this is problematic, set `threaded: false` in your trainer configuration.
56+

ml-agents-envs/mlagents_envs/tests/test_timers.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -10,9 +10,8 @@ def decorated_func(x: int = 0, y: float = 1.0) -> str:
1010

1111

1212
def test_timers() -> None:
13-
with mock.patch(
14-
"mlagents_envs.timers._global_timer_stack", new_callable=timers.TimerStack
15-
) as test_timer:
13+
test_timer = timers.TimerStack()
14+
with mock.patch("mlagents_envs.timers._get_thread_timer", return_value=test_timer):
1615
# First, run some simple code
1716
with timers.hierarchical_timer("top_level"):
1817
for i in range(3):

ml-agents-envs/mlagents_envs/timers.py

Lines changed: 59 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -31,9 +31,10 @@ def main():
3131
import math
3232
import sys
3333
import time
34+
import threading
3435

3536
from contextlib import contextmanager
36-
from typing import Any, Callable, Dict, Generator, TypeVar
37+
from typing import Any, Callable, Dict, Generator, Optional, TypeVar
3738

3839
TIMER_FORMAT_VERSION = "0.1.0"
3940

@@ -97,19 +98,31 @@ class GaugeNode:
9798
Tracks the most recent value of a metric. This is analogous to gauges in statsd.
9899
"""
99100

100-
__slots__ = ["value", "min_value", "max_value", "count"]
101+
__slots__ = ["value", "min_value", "max_value", "count", "_timestamp"]
101102

102103
def __init__(self, value: float):
103104
self.value = value
104105
self.min_value = value
105106
self.max_value = value
106107
self.count = 1
108+
# Internal timestamp so we can determine priority.
109+
self._timestamp = time.time()
107110

108111
def update(self, new_value: float) -> None:
109112
self.min_value = min(self.min_value, new_value)
110113
self.max_value = max(self.max_value, new_value)
111114
self.value = new_value
112115
self.count += 1
116+
self._timestamp = time.time()
117+
118+
def merge(self, other: "GaugeNode") -> None:
119+
if self._timestamp < other._timestamp:
120+
# Keep the "later" value
121+
self.value = other.value
122+
self._timestamp = other._timestamp
123+
self.min_value = min(self.min_value, other.min_value)
124+
self.max_value = max(self.max_value, other.max_value)
125+
self.count += other.count
113126

114127
def as_dict(self) -> Dict[str, float]:
115128
return {
@@ -232,9 +245,23 @@ def _add_default_metadata(self):
232245
self.metadata["command_line_arguments"] = " ".join(sys.argv)
233246

234247

235-
# Global instance of a TimerStack. This is generally all that we need for profiling, but you can potentially
236-
# create multiple instances and pass them to the contextmanager
237-
_global_timer_stack = TimerStack()
248+
# Maintain a separate "global" timer per thread, so that they don't accidentally conflict with each other.
249+
_thread_timer_stacks: Dict[int, TimerStack] = {}
250+
251+
252+
def _get_thread_timer() -> TimerStack:
253+
ident = threading.get_ident()
254+
if ident not in _thread_timer_stacks:
255+
timer_stack = TimerStack()
256+
_thread_timer_stacks[ident] = timer_stack
257+
return _thread_timer_stacks[ident]
258+
259+
260+
def get_timer_stack_for_thread(t: threading.Thread) -> Optional[TimerStack]:
261+
if t.ident is None:
262+
# Thread hasn't started; this shouldn't ever happen.
263+
return None
264+
return _thread_timer_stacks.get(t.ident)
238265

239266

240267
@contextmanager
@@ -243,7 +270,7 @@ def hierarchical_timer(name: str, timer_stack: TimerStack = None) -> Generator:
243270
Creates a scoped timer around a block of code. This time spent will automatically be incremented when
244271
the context manager exits.
245272
"""
246-
timer_stack = timer_stack or _global_timer_stack
273+
timer_stack = timer_stack or _get_thread_timer()
247274
timer_node = timer_stack.push(name)
248275
start_time = time.perf_counter()
249276

@@ -284,34 +311,52 @@ def set_gauge(name: str, value: float, timer_stack: TimerStack = None) -> None:
284311
"""
285312
Updates the value of the gauge (or creates it if it hasn't been set before).
286313
"""
287-
timer_stack = timer_stack or _global_timer_stack
314+
timer_stack = timer_stack or _get_thread_timer()
288315
timer_stack.set_gauge(name, value)
289316

290317

318+
def merge_gauges(gauges: Dict[str, GaugeNode], timer_stack: TimerStack = None) -> None:
319+
"""
320+
Merge the gauges from another TimerStack with the provided one (or the
321+
current thread's stack if none is provided).
322+
:param gauges:
323+
:param timer_stack:
324+
:return:
325+
"""
326+
timer_stack = timer_stack or _get_thread_timer()
327+
for n, g in gauges.items():
328+
if n in timer_stack.gauges:
329+
timer_stack.gauges[n].merge(g)
330+
else:
331+
timer_stack.gauges[n] = g
332+
333+
291334
def add_metadata(key: str, value: str, timer_stack: TimerStack = None) -> None:
292-
timer_stack = timer_stack or _global_timer_stack
335+
timer_stack = timer_stack or _get_thread_timer()
293336
timer_stack.add_metadata(key, value)
294337

295338

296339
def get_timer_tree(timer_stack: TimerStack = None) -> Dict[str, Any]:
297340
"""
298-
Return the tree of timings from the TimerStack as a dictionary (or the global stack if none is provided)
341+
Return the tree of timings from the TimerStack as a dictionary (or the
342+
current thread's stack if none is provided)
299343
"""
300-
timer_stack = timer_stack or _global_timer_stack
344+
timer_stack = timer_stack or _get_thread_timer()
301345
return timer_stack.get_timing_tree()
302346

303347

304348
def get_timer_root(timer_stack: TimerStack = None) -> TimerNode:
305349
"""
306-
Get the root TimerNode of the timer_stack (or the global TimerStack if not specified)
350+
Get the root TimerNode of the timer_stack (or the current thread's
351+
TimerStack if not specified)
307352
"""
308-
timer_stack = timer_stack or _global_timer_stack
353+
timer_stack = timer_stack or _get_thread_timer()
309354
return timer_stack.get_root()
310355

311356

312357
def reset_timers(timer_stack: TimerStack = None) -> None:
313358
"""
314-
Reset the timer_stack (or the global TimerStack if not specified)
359+
Reset the timer_stack (or the current thread's TimerStack if not specified)
315360
"""
316-
timer_stack = timer_stack or _global_timer_stack
361+
timer_stack = timer_stack or _get_thread_timer()
317362
timer_stack.reset()

ml-agents/mlagents/trainers/trainer_controller.py

Lines changed: 32 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,12 @@
1919
UnityCommunicatorStoppedException,
2020
)
2121
from mlagents.trainers.sampler_class import SamplerManager
22-
from mlagents_envs.timers import hierarchical_timer, timed
22+
from mlagents_envs.timers import (
23+
hierarchical_timer,
24+
timed,
25+
get_timer_stack_for_thread,
26+
merge_gauges,
27+
)
2328
from mlagents.trainers.trainer import Trainer
2429
from mlagents.trainers.meta_curriculum import MetaCurriculum
2530
from mlagents.trainers.trainer_util import TrainerFactory
@@ -228,7 +233,7 @@ def start_learning(self, env_manager: EnvManager) -> None:
228233
if self._should_save_model(global_step):
229234
self._save_model()
230235
# Stop advancing trainers
231-
self.kill_trainers = True
236+
self.join_threads()
232237
# Final save Tensorflow model
233238
if global_step != 0 and self.train_model:
234239
self._save_model()
@@ -238,7 +243,7 @@ def start_learning(self, env_manager: EnvManager) -> None:
238243
UnityEnvironmentException,
239244
UnityCommunicatorStoppedException,
240245
) as ex:
241-
self.kill_trainers = True
246+
self.join_threads()
242247
if self.train_model:
243248
self._save_model_when_interrupted()
244249

@@ -315,6 +320,30 @@ def advance(self, env: EnvManager) -> int:
315320

316321
return num_steps
317322

323+
def join_threads(self, timeout_seconds: float = 1.0) -> None:
324+
"""
325+
Wait for threads to finish, and merge their timer information into the main thread's timer stack.
326+
:param timeout_seconds:
327+
:return:
328+
"""
329+
self.kill_trainers = True
330+
for t in self.trainer_threads:
331+
try:
332+
t.join(timeout_seconds)
333+
except Exception:
334+
pass
335+
336+
with hierarchical_timer("trainer_threads") as main_timer_node:
337+
for trainer_thread in self.trainer_threads:
338+
thread_timer_stack = get_timer_stack_for_thread(trainer_thread)
339+
if thread_timer_stack:
340+
main_timer_node.merge(
341+
thread_timer_stack.root,
342+
root_name="thread_root",
343+
is_parallel=True,
344+
)
345+
merge_gauges(thread_timer_stack.gauges)
346+
318347
def trainer_update_func(self, trainer: Trainer) -> None:
319348
while not self.kill_trainers:
320349
with hierarchical_timer("trainer_advance"):

0 commit comments

Comments
 (0)