Merge pull request #4 from SwamyDev/p3

Integrate final multi-agent project
SwamyDev · Mar 14, 2020 · 18bbb35 · 18bbb35
2 parents 4facb11 + 3c6089c
commit 18bbb35
Show file tree

Hide file tree

Showing 40 changed files with 771 additions and 138 deletions.
diff --git a/.coveragerc b/.coveragerc
@@ -0,0 +1,14 @@
+[report]
+exclude_lines =
+    pragma: no cover
+    def __repr__
+    except ImportError
+    raise ImportError
+    if __name__ == .__main__.:
+    raise NotImplementedError
+    @abc.abstractmethod
+    @abstractmethod
+
+[run]
+omit =
+    udacity_rl/main.py
diff --git a/.idea/dataSources.xml b/.idea/dataSources.xml
diff --git a/Makefile b/Makefile
@@ -60,3 +60,4 @@ test: venv/test_done
 
 coverage: venv/test_done
 	. venv/bin/activate; pytest --cov=udacity_rl --cov-report term-missing tests
+	. venv/bin/activate; pytest --cov=udacity_rl --cov-report term-missing --cov-append --run-reacher tests/test_gym_adapter.py
diff --git a/README.md b/README.md
@@ -1,10 +1,11 @@
 [![Build Status](https://travis-ci.com/SwamyDev/udacity-deep-rl-navigation.svg?branch=master)](https://travis-ci.com/SwamyDev/udacity-deep-rl-navigation) [![Coverage Status](https://coveralls.io/repos/github/SwamyDev/udacity-deep-rl-navigation/badge.svg?branch=master)](https://coveralls.io/github/SwamyDev/udacity-deep-rl-navigation?branch=master)
 # Udacity Projects
 
-This repository is part of the Udacity Reinforcement Learning Nanodegree. It contains solutions to the courses class projects `navigation` and `continuous control`. You can find more detailed explanations for each project and their environments in their dedicated README or Report files:
+This repository is part of the Udacity Reinforcement Learning Nanodegree. It contains solutions to the course's class projects `navigation`, `continuous control` and `multi-agent`. You can find more detailed explanations for each project and their environments in their dedicated README or Report files:
 
 - [Project Navigation](doc/README_p1_navigation.md)
 - [Project Continuous Control](doc/README_p2_continuous.md) 
+- [Project Multi-Agent](doc/README_p3_multiagent.md) 
 
 ## Installation
 To run the code of the projects you need to install the repository's virtual environment. To make this as easy as possible I provide a `Makefile` using `GNU Make` to set up virtual environments and download dependencies. It requires a Linux environment. Under Ubuntu make is part of the `build-essential` package (`apt install build-essential`). Other dependencies are python3 virtualenv (`apt install python3-venv`) and pip (`apt install python3-pip`).

diff --git a/Report.md b/Report.md
@@ -4,3 +4,4 @@ Each Udacity project is accompanied by a report. Each report follows a similar s
 
 - [Project: Navigation](doc/Report_p1_navigation.md)
 - [Project: Continuous Control](doc/Report_p2_continuous.md)
+- [Project: Multi-Agent](doc/Report_p3_multiagent.md)
diff --git a/configs/multi_ddpg_ann_a_2x256_c_2x256_1x128-2020-03-07.json b/configs/multi_ddpg_ann_a_2x256_c_2x256_1x128-2020-03-07.json
@@ -0,0 +1,40 @@
+{
+  "act_noise_std": 0.1,
+  "batch_size": 128,
+  "record_size": 1000000,
+  "actor": {
+    "layers": [
+      {
+        "activation": "relu",
+        "size": 256
+      },
+      {
+        "activation": "relu",
+        "size": 256
+      }
+    ],
+    "device": "cuda:0",
+    "lr": 1e-4
+  },
+  "critic": {
+    "layers": [
+      {
+        "activation": "leaky_relu",
+        "size": 256
+      },
+      {
+        "activation": "leaky_relu",
+        "size": 256
+      },
+      {
+        "activation": "leaky_relu",
+        "size": 128
+      }
+    ],
+    "device": "cuda:0",
+    "lr": 3e-4
+  },
+  "gamma": 0.99,
+  "tau": 1e-3
+}
+
diff --git a/doc/README_p3_multiagent.md b/doc/README_p3_multiagent.md
@@ -0,0 +1,31 @@
+# Project: Multi-Agent
+
+This project is part of the Udacity Reinforcement Learning Nanodegree. In this project, multiple `DDPG` agents are trained to solve a multi-agent environment. Specifically, each agent needs to control a tennis racket to pass a ball back and forth, keeping it in play as long as possible. Each agent receives a reward each time it hits the ball over the net and is penalized when the ball hits the ground or goes out of bounds. Hence, it is in the interest of both agents to keep the ball in play, making this a cooperative environment. The environment is considered solved when the maximum score of the agents, averaged over 100 episodes, exceeds 0.5 points.
+
+## Environment Setup
+### Reward Signal
+Each agent receives a reward of `0.1` when it hits the ball over the net, but gets a penalty of `-0.01` each time the ball hits the ground or goes out of bounds. The goal for both agents is, therefore, to keep the ball in play as long as possible. 
+
+### Observation
+An observation state for each agent individually consists of the agent's current position and velocity and the position and velocity of the ball. The total observation of both agents is encoded in a 2x24 tensor (stacking the observations of both agents). 
+
+### Actions
+The action each agent can take consists of a 1x2 tensor corresponding to 2 continuous actions: Moving towards or away from the net, and jumping. The action values are normalized to a range between `-1` and `1`.
+
+## Exploring
+To explore the `Tennis_Linux` environment for 100 episodes run the following command from the root of the repository:
+```bash
+udacity-rl -e resources/environments/Tennis_Linux/Tennis.x86_64 explore -n 100
+```
+
+## Training
+To train the `multi-agent` agents run the following command from the root of the repository:
+```bash
+udacity-rl -e resources/environments/Tennis_Linux/Tennis.x86_64 train NDDPG 5000 -c configs/multi_ddpg_ann_a_2x256_c_2x256_1x128-2020-03-07.json 
+```
+
+## Running
+To observe the stored final agent run the following command from the root of the repository:
+```bash
+udacity-rl -e resources/environments/Tennis_Linux/Tennis.x86_64 run resources/models/p3_tennis_final/ 1
+```
diff --git a/doc/Report_p3_multiagent.md b/doc/Report_p3_multiagent.md
@@ -0,0 +1,80 @@
+# Project: Multi-Agent
+
+This report details how multiple agents using the Deep Deterministic Policy Gradient (`DDPG`) algorithm solve a cooperative task. One of the main issues with multi-agent (`MA`) environments is that the actions of each agent influence the observations, rewards and the set of valid actions of the other agents. This makes the problem non-stationary. However, in the case of the `Tennis` environment, this issue seems to be not that pronounced. In this environment, two agents each control a tennis racket and their goal is to keep the ball in play without hitting it out of bounds or dropping it to the ground (more details in [README_p3_multiagent.md](README_p3_multiagent.md)). Each agent can act relatively independently of the actions of the other agent (agents can't interfere with each other, for instance) and just focus on hitting the ball over the net. This makes this environment easier to solve than other `MA` settings and is possibly the reason why the naive approach of training two independent `DDPG` agents was sufficient. As with the last project, I was able to implement the agent relatively quickly, and without running into too many defects, by using software engineering best practices such as Continuous Integration (`CI`) and Test Driven Development (`TDD`). Additionally, I've learned that it is a good idea to study the performance of a random agent within the environment before diving into it. In this particular case, it happened quite often that I thought the agent was doing well when it was just acting randomly. Often this was due to some bug in the code or an oversight in the configuration. What made this phenomenon more pronounced was the fact that the learning agents would initially crash to the absolute minimum of rewards, hence the random agent actually looked better in comparison. 
+
+## N Deep Deterministic Policy Gradient
+The N Deep Deterministic Policy Gradient simply instantiates multiple independent agents using the `DDPG` algorithm. For this environment, it wasn't necessary to do much tweaking of the original algorithm. In this implementation, I simply reused the memory replay buffer, but without sharing experiences between agents. 
+
+The main "trick" was just to train both agents long enough to get them over the initial crashing phase. After about 1000 - 2000 episodes they would get out of their initial rut and quickly reach peak performance of around `~4` points averaged over 100 episodes. Once the algorithm reached peak performance it would still fluctuate a lot, but not crash down to the minimum again. In fact, in the run I submitted, it consistently stayed above `0.5` points, even though it got close once. I suspect these fluctuations are due to the agents learning to reach a delicate balance where they would bounce the ball back and forth at the exact same position for very long periods. This results in accumulating a very high number of similar experiences in the replay buffer. In turn, this leads to the oversampling of these states. Intuitively this would mean that the agents do not learn from a diverse set of situations where the ball would come from different angles at different positions. Hence agents are easily "surprised" when getting into an unusual situation. This might cause agents to underperform until some balance is found again.
+
+## The DDPG Algorithm
+The algorithm is the same as described in [Report_p2_continuous.md](Report_p2_continuous.md). However, there are some minor tweaks to how exploration is done. I implemented a similar setup as found in [Spinning Up RL from OpenAI](https://spinningup.openai.com/en/latest/algorithms/ddpg.html). Now the agents start with completely random "preheating" steps before they are allowed to act. After that, some Gaussian noise with a fixed standard deviation is added to ensure continued exploration. 
+
+## Notes on Development
+First I implemented a single `DDPG` agent which would just control both rackets and receive both observations, which I called the "one-mind" agent. With this much easier approach to solving the environment, I could explore the various pitfalls of it. It turned out in the end that one of the major challenges wasn't agent coordination or the environment being non-stationary, but rather the exploration/exploitation trade-off and the reward signal. Even the easier "one-mind" agent would crash early during training and not recover for a long time.
+
+I've spent a lot of time investigating the exploration/exploitation trade-off because of this initial hard-crash of the agent (constant `0` average reward over 100 episodes). The exploration setup described in the previous section turned out to be the best; however, it still didn't improve performance that much. I tried tuning various other hyperparameters like learning rate, model architecture, gamma and tau. However, none of these improved the performance much. 
+
+Following that, I considered implementing prioritized experience replay as I suspected that accumulating lots of similar low reward experiences would lead to oversampling them and stall learning. However, once I trained the "one-mind" agent for longer episodes I noticed that it got out of its rut after a while. Hence, I decided to postpone the implementation of prioritized replay and try the environment with multiple agents. I started with a naive approach by just training two `DDPG` agents simultaneously on the environment. It turned out that these performed just as well as the "one-mind" agent, solving it after `~4500` episodes.
+
+Comparing poorly performing agents with well-performing ones, I have the suspicion that various factors contribute to the initial crash of performance. One is that there might be a better way to initialize model parameters. For now, I use the default initialization of PyTorch. I also suspect that the reward signal is difficult. When agents constantly drop the ball they have no indication which of their actions actually got them closer to their desired goal. For instance, one agent might have hit the ball and got it closer to the net but still dropped it. However, the reward signal would still be just `-0.01` no matter how close the ball got to the net. This means that agents have to rely on randomly hitting the ball across the net. I'm thinking that prioritized replay might help with that. Of course, reshaping the reward function might as well.
+
+While investigating these issues, I've extended the command line interface with some convenience functionality. For instance, it is now possible to create snapshots once the agent reaches a certain performance level. This feature was used during training to save the agent model parameters when it actually achieved its highest performance. This peak performance model is also the model reported in the repository (`resources/models/p3_tennis_final`). Additionally, I now properly take care of keyboard interrupts which allows me to stop training in-between, save the agent and display the training graph. This helped a lot in investigating issues with training performance. 
+
+## Results
+Using a neural network architecture similar to the one used in the [Report_p2_continuous.md](Report_p2_continuous.md), my agent solved the environment after about 4500 episodes. 
+
+![Graph of Training Run](../resources/images/nddpg_training.png)
+
+Both agents used the `multi_ddpg_ann_a_2x256_c_2x256_1x128-2020-03-07` agent configuration:
+```json
+{
+  "act_noise_std": 0.1,
+  "batch_size": 128,
+  "record_size": 1000000,
+  "actor": {
+    "layers": [
+      {
+        "activation": "relu",
+        "size": 256
+      },
+      {
+        "activation": "relu",
+        "size": 256
+      }
+    ],
+    "device": "cuda:0",
+    "lr": 1e-4
+  },
+  "critic": {
+    "layers": [
+      {
+        "activation": "leaky_relu",
+        "size": 256
+      },
+      {
+        "activation": "leaky_relu",
+        "size": 256
+      },
+      {
+        "activation": "leaky_relu",
+        "size": 128
+      }
+    ],
+    "device": "cuda:0",
+    "lr": 3e-4
+  },
+  "gamma": 0.99,
+  "tau": 1e-3
+}
+```
+(tau specifies the linear interpolation value used for the "soft update" of target and local models)
+
+The final trained model is stored under `resources/models/p3_tennis_final`.
+
+## Future Work
+Considering the challenges I faced in this environment, I think the most fruitful approach to improve performance is to focus on improving the replayed experience and the exploration/exploitation trade-off. What points me to this conclusion is the fact that the "one-mind" agent exhibits similar performance to the naive multi-agent approach. An easy improvement could be to find a better way to initialize the model weights, so the agents do better exploring initially. 
+
+Additionally, prioritized replay could help the agent to learn more from unusual experiences and reduce the oversampling of what I'd call "states in delicate balance". The agent might then be not that "surprised" by unexpected ball trajectories and perform more robustly overall. 
+
+Of course, one could also improve the `multi-agent` aspect of it as well and implement the [MADDPG](https://arxiv.org/abs/1706.02275) algorithm (which I intended initially, but it turned out not to be necessary). 
diff --git a/resources/fetch-unity-environments.sh b/resources/fetch-unity-environments.sh
@@ -1,10 +1,16 @@
 #!/bin/bash
 
-TMP_DIR=`mktemp -d`
+TMP_DIR=$(mktemp -d)
 mkdir -p resources/environments
+
 wget https://s3-us-west-1.amazonaws.com/udacity-drlnd/P1/Banana/Banana_Linux.zip -P $TMP_DIR/
 unzip -d resources/environments $TMP_DIR/Banana_Linux.zip
+
 wget https://s3-us-west-1.amazonaws.com/udacity-drlnd/P2/Reacher/one_agent/Reacher_Linux.zip -P $TMP_DIR/
 unzip -d resources/environments $TMP_DIR/Reacher_Linux.zip
+
+wget https://s3-us-west-1.amazonaws.com/udacity-drlnd/P3/Tennis/Tennis_Linux.zip -P $TMP_DIR/
+unzip -d resources/environments $TMP_DIR/Tennis_Linux.zip
+
 rm -r $TMP_DIR
 
diff --git a/resources/images/nddpg_training.png b/resources/images/nddpg_training.png
diff --git a/resources/models/p3_tennis_final/action_space b/resources/models/p3_tennis_final/action_space
diff --git a/resources/models/p3_tennis_final/config.json b/resources/models/p3_tennis_final/config.json
diff --git a/resources/models/p3_tennis_final/mind_0/actor_local.pth b/resources/models/p3_tennis_final/mind_0/actor_local.pth
diff --git a/resources/models/p3_tennis_final/mind_0/actor_target.pth b/resources/models/p3_tennis_final/mind_0/actor_target.pth
diff --git a/resources/models/p3_tennis_final/mind_0/critic_local.pth b/resources/models/p3_tennis_final/mind_0/critic_local.pth
diff --git a/resources/models/p3_tennis_final/mind_0/critic_target.pth b/resources/models/p3_tennis_final/mind_0/critic_target.pth
diff --git a/resources/models/p3_tennis_final/mind_1/actor_local.pth b/resources/models/p3_tennis_final/mind_1/actor_local.pth
diff --git a/resources/models/p3_tennis_final/mind_1/actor_target.pth b/resources/models/p3_tennis_final/mind_1/actor_target.pth
diff --git a/resources/models/p3_tennis_final/mind_1/critic_local.pth b/resources/models/p3_tennis_final/mind_1/critic_local.pth
diff --git a/resources/models/p3_tennis_final/mind_1/critic_target.pth b/resources/models/p3_tennis_final/mind_1/critic_target.pth
diff --git a/resources/models/p3_tennis_final/observation_space b/resources/models/p3_tennis_final/observation_space
diff --git a/resources/models/p3_tennis_final/type.meta b/resources/models/p3_tennis_final/type.meta
diff --git a/tests/auxiliary/__init__.py b/tests/auxiliary/__init__.py
@@ -52,8 +52,8 @@ def follows_contract(interface=None, properties=None):
 
 
 class GymSession(gym.Wrapper):
-    def __init__(self, gym_id, eps_calc):
-        super().__init__(gym.make(gym_id))
+    def __init__(self, gym_instance, eps_calc):
+        super().__init__(gym_instance)
         self.eps_calc = eps_calc
 
     def test(self, agent, num_episodes=100):

diff --git a/tests/conftest.py b/tests/conftest.py
@@ -9,6 +9,9 @@ def pytest_addoption(parser):
     parser.addoption(
         "--run-reacher", action="store_true", default=False, help="run tests for reacher unity environment"
     )
+    parser.addoption(
+        "--run-multiagent", action="store_true", default=False, help="run tests for multi agent unity environment"
+    )
 
 
 def pytest_configure(config):
@@ -20,15 +23,21 @@ def pytest_configure(config):
                    "to the specified `max_samples`. If it is smaller than `max_samples` it is capped to `sample_size`")
     config.addinivalue_line(
         "markers", "reacher: mark test to use be run only when the reacher unity environment is configured")
+    config.addinivalue_line(
+        "markers", "multiagent: mark test to use be run only when the multi agent unity environment is configured")
 
 
 def pytest_collection_modifyitems(config, items):
-    if config.getoption("--run-reacher"):
-        return
-    skip_reacher = pytest.mark.skip(reason="need --run-reacher option to run")
-    for item in items:
-        if "reacher" in item.keywords:
-            item.add_marker(skip_reacher)
+    skip_marker(config, items, "reacher")  # skip `reacher` tests unless --run-reacher was passed
+    skip_marker(config, items, "multiagent")  # skip `multiagent` tests unless --run-multiagent was passed
+
+
+def skip_marker(config, items, marker):  # mark tests tagged `marker` as skipped unless `--run-<marker>` is set
+    if not config.getoption(f"--run-{marker}"):  # the option name is derived from the marker name
+        skip = pytest.mark.skip(reason=f"need --run-{marker} option to run")  # one skip mark shared by all matches
+        for item in items:
+            if marker in item.keywords:  # only tests carrying this marker are affected
+                item.add_marker(skip)
 
 
 class StochasticRunRecorder:
@@ -108,6 +117,11 @@ def use_reacher(request):
     return request.config.getoption("--run-reacher")
 
 
+@pytest.fixture(scope='session')
+def use_multi_agent(request):  # session-scoped fixture exposing the value of the `--run-multiagent` option
+    return request.config.getoption("--run-multiagent")
+
+
 @pytest.fixture(autouse=True)
 def set_log_level(caplog):
     caplog.set_level(logging.WARNING)