diff --git a/.circleci/config.yml b/.circleci/config.yml index 26366aa392..710264f72d 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -24,7 +24,7 @@ jobs: pip install --upgrade setuptools cd ml-agents-envs && pip install -e . cd ../ml-agents && pip install -e . - pip install black pytest-cov==2.6.1 codacy-coverage==1.3.11 + pip install pre-commit pytest-cov==2.6.1 cd ../gym-unity && pip install -e . - save_cache: @@ -38,15 +38,12 @@ jobs: . venv/bin/activate mkdir test-reports pytest --cov=mlagents --cov-report xml --junitxml=test-reports/junit.xml -p no:warnings - python-codacy-coverage -r coverage.xml - run: - name: Check Code Style for ml-agents and gym_unity using black + name: Check Code Style using pre-commit command: | . venv/bin/activate - black --check ml-agents - black --check ml-agents-envs - black --check gym-unity + pre-commit run --show-diff-on-failure --all-files - run: name: Verify there are no hidden/missing metafiles. diff --git a/.gitignore b/.gitignore index 75de23b60a..a127b1bbd4 100644 --- a/.gitignore +++ b/.gitignore @@ -86,13 +86,11 @@ .DS_Store .ipynb_checkpoints -# pytest cache +# pytest cache *.pytest_cache/ # Ignore compiled protobuf files. -ml-agents-protobuf/cs -ml-agents-protobuf/python -ml-agents-protobuf/Grpc* +*Grpc.Tools* # Ignore PyPi build files. dist/ @@ -100,3 +98,7 @@ build/ # Python virtual environment venv/ +.mypy_cache/ + +# Code coverage report +.coverage diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 0000000000..a71c9c0815 --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,39 @@ +repos: +- repo: https://github.com/python/black + rev: 19.3b0 + hooks: + - id: black +- repo: https://github.com/pre-commit/mirrors-mypy + rev: v0.720 + hooks: + - id: mypy + name: mypy-ml-agents + files: "ml-agents/.*" + args: [--ignore-missing-imports, --disallow-incomplete-defs] + - id: mypy + name: mypy-ml-agents-envs + files: "ml-agents-envs/.*" + # Exclude protobuf files and don't follow them when imported + exclude: ".*_pb2.py" + # TODO get disallow-incomplete-defs working + args: [--ignore-missing-imports, --follow-imports=silent] + - id: mypy + name: mypy-gym-unity + files: "gym-unity/.*" + args: [--ignore-missing-imports, --disallow-incomplete-defs] +- repo: https://github.com/pre-commit/pre-commit-hooks + rev: v2.2.3 + hooks: + - id: mixed-line-ending + exclude: > + (?x)^( + .*cs.meta| + .*.css + )$ + args: [--fix=lf] + - id: flake8 + exclude: > + (?x)^( + .*_pb2.py| + .*_pb2_grpc.py + )$ diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index c5e6f81a50..ef58c1b215 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -48,6 +48,22 @@ the platform, and provide a unique non-trivial challenge to modern machine learning algorithms. Feel free to submit these environments with a PR explaining the nature of the environment and task. -## Style Guide +## Continuous Integration (CI) -When performing changes to the codebase, please ensure that all python code is reformatted using the [black](https://github.com/ambv/black) formatter. For C#, we will soon be requirements for style and formatting. +We run CircleCI on all PRs; all tests must be passing before the PR is merged. + +Several static checks are run on the codebase using the [pre-commit framework](https://pre-commit.com/) during CI. To execute the same checks locally, install `pre-commit` and run `pre-commit run --all-files`. 
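+As a concrete illustration, the mypy hooks in `.pre-commit-config.yaml` pass `--disallow-incomplete-defs` for the `ml-agents` and `gym-unity` packages, which rejects functions that are only partially annotated. The sketch below is illustrative only; the function and argument names are hypothetical and do not come from this repository.
+
+```python
+from typing import List
+
+
+def mean_reward(rewards: List[float], scale) -> float:
+    # Rejected by --disallow-incomplete-defs: `scale` has no annotation.
+    return scale * sum(rewards) / len(rewards)
+
+
+def scaled_mean_reward(rewards: List[float], scale: float) -> float:
+    # Fully annotated, so the hook accepts it.
+    return scale * sum(rewards) / len(rewards)
+```
+
+Running `pre-commit run --all-files` locally should surface the same error before CI does.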
Some hooks (for example, `black`) will output the corrected version of the code; others (like `mypy`) may require more effort to fix. + +### Code style +All Python code should be formatted with [`black`](https://github.com/ambv/black). Style and formatting for C# may be enforced later. + +### Python type annotations +We use [`mypy`](http://mypy-lang.org/) to perform static type checking on Python code. Currently, not all code is annotated, but we will increase coverage over time. If you are adding or refactoring code, please: +1. Add type annotations to the new or refactored code. +2. Make sure that code calling or called by the modified code also has type annotations. + +The [type hint cheat sheet](https://mypy.readthedocs.io/en/stable/cheat_sheet_py3.html) provides a good introduction to adding type hints. + +## Contributor License Agreements + +When you open a pull request, you will be asked to acknowledge our Contributor License Agreement. We allow both individual contributions and contributions made on behalf of companies. We use an open source tool called CLA assistant. If you have any questions on our CLA, please [submit an issue](https://github.com/Unity-Technologies/ml-agents/issues) or email us at ml-agents@unity3d.com. diff --git a/README.md b/README.md index e6f46ddfee..ddb167655b 100644 --- a/README.md +++ b/README.md @@ -27,7 +27,7 @@ developer communities. * 10+ sample Unity environments * Support for multiple environment configurations and training scenarios * Train memory-enhanced agents using deep reinforcement learning -* Easily definable Curriculum Learning scenarios +* Easily definable Curriculum Learning and Generalization scenarios * Broadcasting of agent behavior for supervised learning * Built-in support for Imitation Learning * Flexible agent control with On Demand Decision Making @@ -77,11 +77,11 @@ If you run into any problems using the ML-Agents toolkit, [submit an issue](https://github.com/Unity-Technologies/ml-agents/issues) and make sure to include as much detail as possible. -Your opinion matters a great deal to us. Only by hearing your thoughts on the Unity ML-Agents Toolkit can we continue to improve and grow. Please take a few minutes to [let us know about it](https://github.com/Unity-Technologies/ml-agents/issues/1454). +Your opinion matters a great deal to us. Only by hearing your thoughts on the Unity ML-Agents Toolkit can we continue to improve and grow. Please take a few minutes to [let us know about it](https://github.com/Unity-Technologies/ml-agents/issues/1454). For any other questions or feedback, connect directly with the ML-Agents -team at ml-agents@unity3d.com. +team at ml-agents@unity3d.com. ## Translations @@ -93,6 +93,7 @@ translating more pages and to other languages. Consequently, we welcome any enhancements and improvements from the community. 
* [Chinese](docs/localized/zh-CN/) +* [Korean](docs/localized/KR/) ## License diff --git a/UnitySDK/Assets/ML-Agents/Editor/Tests/EditModeTestInternalBrainTensorApplier.cs b/UnitySDK/Assets/ML-Agents/Editor/Tests/EditModeTestInternalBrainTensorApplier.cs index f9774882e3..cddebf13c7 100644 --- a/UnitySDK/Assets/ML-Agents/Editor/Tests/EditModeTestInternalBrainTensorApplier.cs +++ b/UnitySDK/Assets/ML-Agents/Editor/Tests/EditModeTestInternalBrainTensorApplier.cs @@ -3,6 +3,7 @@ using NUnit.Framework; using UnityEngine; using System.Reflection; +using Barracuda; using MLAgents.InferenceBrain; namespace MLAgents.Tests @@ -35,17 +36,20 @@ public AgentAction GetAction() public void Contruction() { var bp = new BrainParameters(); - var tensorGenerator = new TensorApplier(bp, 0); + var alloc = new TensorCachingAllocator(); + var tensorGenerator = new TensorApplier(bp, 0, alloc); Assert.IsNotNull(tensorGenerator); + alloc.Dispose(); } [Test] public void ApplyContinuousActionOutput() { - var inputTensor = new Tensor() + var inputTensor = new TensorProxy() { Shape = new long[] {2, 3}, - Data = new float[,] {{1, 2, 3}, {4, 5, 6}} + Data = new Tensor (2, 3, new float[] {1, 2, 3, + 4, 5, 6}) }; var agentInfos = GetFakeAgentInfos(); @@ -67,15 +71,15 @@ public void ApplyContinuousActionOutput() [Test] public void ApplyDiscreteActionOutput() { - var inputTensor = new Tensor() + var inputTensor = new TensorProxy() { Shape = new long[] {2, 5}, - Data = new float[,] {{0.5f, 22.5f, 0.1f, 5f, 1f}, - {4f, 5f, 6f, 7f, 8f}} + Data = new Tensor (2, 5, new[] {0.5f, 22.5f, 0.1f, 5f, 1f, + 4f, 5f, 6f, 7f, 8f}) }; var agentInfos = GetFakeAgentInfos(); - - var applier = new DiscreteActionOutputApplier(new int[]{2, 3}, 0); + var alloc = new TensorCachingAllocator(); + var applier = new DiscreteActionOutputApplier(new int[]{2, 3}, 0, alloc); applier.Apply(inputTensor, agentInfos); var agents = agentInfos.Keys.ToList(); var agent = agents[0] as TestAgent; @@ -86,16 +90,17 @@ public void ApplyDiscreteActionOutput() action = agent.GetAction(); Assert.AreEqual(action.vectorActions[0], 1); Assert.AreEqual(action.vectorActions[1], 2); + alloc.Dispose(); } [Test] public void ApplyMemoryOutput() { - var inputTensor = new Tensor() + var inputTensor = new TensorProxy() { Shape = new long[] {2, 5}, - Data = new float[,] {{0.5f, 22.5f, 0.1f, 5f, 1f}, - {4f, 5f, 6f, 7f, 8f}} + Data = new Tensor (2, 5, new[] {0.5f, 22.5f, 0.1f, 5f, 1f, + 4f, 5f, 6f, 7f, 8f}) }; var agentInfos = GetFakeAgentInfos(); @@ -115,10 +120,10 @@ public void ApplyMemoryOutput() [Test] public void ApplyValueEstimate() { - var inputTensor = new Tensor() + var inputTensor = new TensorProxy() { Shape = new long[] {2, 1}, - Data = new float[,] {{0.5f}, {8f}} + Data = new Tensor (2, 1, new[]{0.5f, 8f}) }; var agentInfos = GetFakeAgentInfos(); diff --git a/UnitySDK/Assets/ML-Agents/Editor/Tests/EditModeTestInternalBrainTensorGenerator.cs b/UnitySDK/Assets/ML-Agents/Editor/Tests/EditModeTestInternalBrainTensorGenerator.cs index 1d986f2c87..6ffcd72ed2 100644 --- a/UnitySDK/Assets/ML-Agents/Editor/Tests/EditModeTestInternalBrainTensorGenerator.cs +++ b/UnitySDK/Assets/ML-Agents/Editor/Tests/EditModeTestInternalBrainTensorGenerator.cs @@ -1,6 +1,7 @@ using System; using System.Collections.Generic; using System.Linq; +using Barracuda; using NUnit.Framework; using UnityEngine; using MLAgents.InferenceBrain; @@ -43,111 +44,121 @@ private class TestAgent : Agent public void Contruction() { var bp = new BrainParameters(); - var tensorGenerator = new TensorGenerator(bp, 0); + 
var alloc = new TensorCachingAllocator(); + var tensorGenerator = new TensorGenerator(bp, 0, alloc); Assert.IsNotNull(tensorGenerator); + alloc.Dispose(); } [Test] public void GenerateBatchSize() { - var inputTensor = new Tensor(); + var inputTensor = new TensorProxy(); + var alloc = new TensorCachingAllocator(); var batchSize = 4; - var generator = new BatchSizeGenerator(); + var generator = new BatchSizeGenerator(alloc); generator.Generate(inputTensor, batchSize, null); - Assert.IsNotNull(inputTensor.Data as int[]); - Assert.AreEqual((inputTensor.Data as int[])[0], batchSize); + Assert.IsNotNull(inputTensor.Data); + Assert.AreEqual(inputTensor.Data[0], batchSize); + alloc.Dispose(); } [Test] public void GenerateSequenceLength() { - var inputTensor = new Tensor(); + var inputTensor = new TensorProxy(); + var alloc = new TensorCachingAllocator(); var batchSize = 4; - var generator = new SequenceLengthGenerator(); + var generator = new SequenceLengthGenerator(alloc); generator.Generate(inputTensor, batchSize, null); - Assert.IsNotNull(inputTensor.Data as int[]); - Assert.AreEqual((inputTensor.Data as int[])[0], 1); + Assert.IsNotNull(inputTensor.Data); + Assert.AreEqual(inputTensor.Data[0], 1); + alloc.Dispose(); } [Test] public void GenerateVectorObservation() { - var inputTensor = new Tensor() + var inputTensor = new TensorProxy() { Shape = new long[] {2, 3} }; var batchSize = 4; var agentInfos = GetFakeAgentInfos(); - - var generator = new VectorObservationGenerator(); + var alloc = new TensorCachingAllocator(); + var generator = new VectorObservationGenerator(alloc); generator.Generate(inputTensor, batchSize, agentInfos); - Assert.IsNotNull(inputTensor.Data as float[,]); - Assert.AreEqual((inputTensor.Data as float[,])[0, 0], 1); - Assert.AreEqual((inputTensor.Data as float[,])[0, 2], 3); - Assert.AreEqual((inputTensor.Data as float[,])[1, 0], 4); - Assert.AreEqual((inputTensor.Data as float[,])[1, 2], 6); + Assert.IsNotNull(inputTensor.Data); + Assert.AreEqual(inputTensor.Data[0, 0], 1); + Assert.AreEqual(inputTensor.Data[0, 2], 3); + Assert.AreEqual(inputTensor.Data[1, 0], 4); + Assert.AreEqual(inputTensor.Data[1, 2], 6); + alloc.Dispose(); } [Test] public void GenerateRecurrentInput() { - var inputTensor = new Tensor() + var inputTensor = new TensorProxy() { Shape = new long[] {2, 5} }; var batchSize = 4; var agentInfos = GetFakeAgentInfos(); - - var generator = new RecurrentInputGenerator(); + var alloc = new TensorCachingAllocator(); + var generator = new RecurrentInputGenerator(alloc); generator.Generate(inputTensor, batchSize, agentInfos); - Assert.IsNotNull(inputTensor.Data as float[,]); - Assert.AreEqual((inputTensor.Data as float[,])[0, 0], 0); - Assert.AreEqual((inputTensor.Data as float[,])[0, 4], 0); - Assert.AreEqual((inputTensor.Data as float[,])[1, 0], 1); - Assert.AreEqual((inputTensor.Data as float[,])[1, 4], 0); + Assert.IsNotNull(inputTensor.Data); + Assert.AreEqual(inputTensor.Data[0, 0], 0); + Assert.AreEqual(inputTensor.Data[0, 4], 0); + Assert.AreEqual(inputTensor.Data[1, 0], 1); + Assert.AreEqual(inputTensor.Data[1, 4], 0); + alloc.Dispose(); } [Test] public void GeneratePreviousActionInput() { - var inputTensor = new Tensor() + var inputTensor = new TensorProxy() { Shape = new long[] {2, 2}, - ValueType = Tensor.TensorType.Integer + ValueType = TensorProxy.TensorType.Integer }; var batchSize = 4; var agentInfos = GetFakeAgentInfos(); - - var generator = new PreviousActionInputGenerator(); + var alloc = new TensorCachingAllocator(); + var generator = new 
PreviousActionInputGenerator(alloc); generator.Generate(inputTensor, batchSize, agentInfos); - Assert.IsNotNull(inputTensor.Data as int[,]); - Assert.AreEqual((inputTensor.Data as int[,])[0, 0], 1); - Assert.AreEqual((inputTensor.Data as int[,])[0, 1], 2); - Assert.AreEqual((inputTensor.Data as int[,])[1, 0], 3); - Assert.AreEqual((inputTensor.Data as int[,])[1, 1], 4); + Assert.IsNotNull(inputTensor.Data); + Assert.AreEqual(inputTensor.Data[0, 0], 1); + Assert.AreEqual(inputTensor.Data[0, 1], 2); + Assert.AreEqual(inputTensor.Data[1, 0], 3); + Assert.AreEqual(inputTensor.Data[1, 1], 4); + alloc.Dispose(); } [Test] public void GenerateActionMaskInput() { - var inputTensor = new Tensor() + var inputTensor = new TensorProxy() { Shape = new long[] {2, 5}, - ValueType = Tensor.TensorType.FloatingPoint + ValueType = TensorProxy.TensorType.FloatingPoint }; var batchSize = 4; var agentInfos = GetFakeAgentInfos(); - - var generator = new ActionMaskInputGenerator(); + var alloc = new TensorCachingAllocator(); + var generator = new ActionMaskInputGenerator(alloc); generator.Generate(inputTensor, batchSize, agentInfos); - Assert.IsNotNull(inputTensor.Data as float[,]); - Assert.AreEqual((inputTensor.Data as float[,])[0, 0], 1); - Assert.AreEqual((inputTensor.Data as float[,])[0, 4], 1); - Assert.AreEqual((inputTensor.Data as float[,])[1, 0], 0); - Assert.AreEqual((inputTensor.Data as float[,])[1, 4], 1); + Assert.IsNotNull(inputTensor.Data); + Assert.AreEqual(inputTensor.Data[0, 0], 1); + Assert.AreEqual(inputTensor.Data[0, 4], 1); + Assert.AreEqual(inputTensor.Data[1, 0], 0); + Assert.AreEqual(inputTensor.Data[1, 4], 1); + alloc.Dispose(); } } } diff --git a/UnitySDK/Assets/ML-Agents/Editor/Tests/MultinomialTest.cs b/UnitySDK/Assets/ML-Agents/Editor/Tests/MultinomialTest.cs index c812f3f230..7e7b62e605 100644 --- a/UnitySDK/Assets/ML-Agents/Editor/Tests/MultinomialTest.cs +++ b/UnitySDK/Assets/ML-Agents/Editor/Tests/MultinomialTest.cs @@ -1,4 +1,5 @@ using System; +using Barracuda; using NUnit.Framework; using UnityEngine; using MLAgents.InferenceBrain; @@ -13,25 +14,24 @@ public void TestEvalP() { Multinomial m = new Multinomial(2018); - Tensor src = new Tensor + TensorProxy src = new TensorProxy { - Data = new float[1, 3] {{0.1f, 0.2f, 0.7f}}, - ValueType = Tensor.TensorType.FloatingPoint + Data = new Tensor(1, 3, new[] {0.1f, 0.2f, 0.7f}), + ValueType = TensorProxy.TensorType.FloatingPoint }; - Tensor dst = new Tensor + TensorProxy dst = new TensorProxy { - Data = new float[1, 3], - ValueType = Tensor.TensorType.FloatingPoint + Data = new Tensor(1, 3), + ValueType = TensorProxy.TensorType.FloatingPoint }; m.Eval(src, dst); float[] reference = {2, 2, 1}; - int i = 0; - foreach (var f in dst.Data) + for (var i = 0; i < dst.Data.length; i++) { - Assert.AreEqual(reference[i], f); + Assert.AreEqual(reference[i], dst.Data[i]); ++i; } } @@ -41,25 +41,24 @@ public void TestEvalLogits() { Multinomial m = new Multinomial(2018); - Tensor src = new Tensor + TensorProxy src = new TensorProxy { - Data = new float[1, 3] {{Mathf.Log(0.1f) - 50, Mathf.Log(0.2f) - 50, Mathf.Log(0.7f) - 50}}, - ValueType = Tensor.TensorType.FloatingPoint + Data = new Tensor(1, 3, new[] {Mathf.Log(0.1f) - 50, Mathf.Log(0.2f) - 50, Mathf.Log(0.7f) - 50}), + ValueType = TensorProxy.TensorType.FloatingPoint }; - Tensor dst = new Tensor + TensorProxy dst = new TensorProxy { - Data = new float[1, 3], - ValueType = Tensor.TensorType.FloatingPoint + Data = new Tensor(1, 3), + ValueType = TensorProxy.TensorType.FloatingPoint }; m.Eval(src, 
dst); float[] reference = {2, 2, 2}; - int i = 0; - foreach (var f in dst.Data) + for (var i = 0; i < dst.Data.length; i++) { - Assert.AreEqual(reference[i], f); + Assert.AreEqual(reference[i], dst.Data[i]); ++i; } } @@ -69,30 +68,29 @@ public void TestEvalBatching() { Multinomial m = new Multinomial(2018); - Tensor src = new Tensor + TensorProxy src = new TensorProxy { - Data = new float[2, 3] + Data = new Tensor(2, 3, new [] { - {Mathf.Log(0.1f) - 50, Mathf.Log(0.2f) - 50, Mathf.Log(0.7f) - 50}, - {Mathf.Log(0.3f) - 25, Mathf.Log(0.4f) - 25, Mathf.Log(0.3f) - 25}, + Mathf.Log(0.1f) - 50, Mathf.Log(0.2f) - 50, Mathf.Log(0.7f) - 50, + Mathf.Log(0.3f) - 25, Mathf.Log(0.4f) - 25, Mathf.Log(0.3f) - 25 - }, - ValueType = Tensor.TensorType.FloatingPoint + }), + ValueType = TensorProxy.TensorType.FloatingPoint }; - Tensor dst = new Tensor + TensorProxy dst = new TensorProxy { - Data = new float[2, 3], - ValueType = Tensor.TensorType.FloatingPoint + Data = new Tensor(2, 3), + ValueType = TensorProxy.TensorType.FloatingPoint }; m.Eval(src, dst); float[] reference = {2, 2, 2, 0, 1, 0}; - int i = 0; - foreach (var f in dst.Data) + for (var i = 0; i < dst.Data.length; i++) { - Assert.AreEqual(reference[i], f); + Assert.AreEqual(reference[i], dst.Data[i]); ++i; } } @@ -102,9 +100,9 @@ public void TestSrcInt() { Multinomial m = new Multinomial(2018); - Tensor src = new Tensor + TensorProxy src = new TensorProxy { - ValueType = Tensor.TensorType.Integer + ValueType = TensorProxy.TensorType.Integer }; Assert.Throws(() => m.Eval(src, null)); @@ -115,13 +113,13 @@ public void TestDstInt() { Multinomial m = new Multinomial(2018); - Tensor src = new Tensor + TensorProxy src = new TensorProxy { - ValueType = Tensor.TensorType.FloatingPoint + ValueType = TensorProxy.TensorType.FloatingPoint }; - Tensor dst = new Tensor + TensorProxy dst = new TensorProxy { - ValueType = Tensor.TensorType.Integer + ValueType = TensorProxy.TensorType.Integer }; Assert.Throws(() => m.Eval(src, dst)); @@ -132,13 +130,13 @@ public void TestSrcDataNull() { Multinomial m = new Multinomial(2018); - Tensor src = new Tensor + TensorProxy src = new TensorProxy { - ValueType = Tensor.TensorType.FloatingPoint + ValueType = TensorProxy.TensorType.FloatingPoint }; - Tensor dst = new Tensor + TensorProxy dst = new TensorProxy { - ValueType = Tensor.TensorType.FloatingPoint + ValueType = TensorProxy.TensorType.FloatingPoint }; Assert.Throws(() => m.Eval(src, dst)); @@ -149,71 +147,33 @@ public void TestDstDataNull() { Multinomial m = new Multinomial(2018); - Tensor src = new Tensor + TensorProxy src = new TensorProxy { - ValueType = Tensor.TensorType.FloatingPoint, - Data = new float[1] + ValueType = TensorProxy.TensorType.FloatingPoint, + Data = new Tensor(0,1) }; - Tensor dst = new Tensor + TensorProxy dst = new TensorProxy { - ValueType = Tensor.TensorType.FloatingPoint + ValueType = TensorProxy.TensorType.FloatingPoint }; Assert.Throws(() => m.Eval(src, dst)); } - [Test] - public void TestSrcWrongShape() - { - Multinomial m = new Multinomial(2018); - - Tensor src = new Tensor - { - ValueType = Tensor.TensorType.FloatingPoint, - Data = new float[1] - }; - Tensor dst = new Tensor - { - ValueType = Tensor.TensorType.FloatingPoint, - Data = new float[1] - }; - - Assert.Throws(() => m.Eval(src, dst)); - } - - [Test] - public void TestDstWrongShape() - { - Multinomial m = new Multinomial(2018); - - Tensor src = new Tensor - { - ValueType = Tensor.TensorType.FloatingPoint, - Data = new float[1, 1] - }; - Tensor dst = new Tensor - { - ValueType = 
Tensor.TensorType.FloatingPoint, - Data = new float[1] - }; - - Assert.Throws(() => m.Eval(src, dst)); - } - [Test] public void TestUnequalBatchSize() { Multinomial m = new Multinomial(2018); - Tensor src = new Tensor + TensorProxy src = new TensorProxy { - ValueType = Tensor.TensorType.FloatingPoint, - Data = new float[1, 1] + ValueType = TensorProxy.TensorType.FloatingPoint, + Data = new Tensor(1, 1) }; - Tensor dst = new Tensor + TensorProxy dst = new TensorProxy { - ValueType = Tensor.TensorType.FloatingPoint, - Data = new float[2, 1] + ValueType = TensorProxy.TensorType.FloatingPoint, + Data = new Tensor(2, 1) }; Assert.Throws(() => m.Eval(src, dst)); diff --git a/UnitySDK/Assets/ML-Agents/Editor/Tests/RandomNormalTest.cs b/UnitySDK/Assets/ML-Agents/Editor/Tests/RandomNormalTest.cs index 7c33500ac3..8e2dbefcd5 100644 --- a/UnitySDK/Assets/ML-Agents/Editor/Tests/RandomNormalTest.cs +++ b/UnitySDK/Assets/ML-Agents/Editor/Tests/RandomNormalTest.cs @@ -1,122 +1,167 @@ -using System; +using System; +using Barracuda; using NUnit.Framework; using MLAgents.InferenceBrain; using MLAgents.InferenceBrain.Utils; + namespace MLAgents.Tests { - public class RandomNormalTest - { - - [Test] - public void RandomNormalTestTwoDouble() - { - RandomNormal rn = new RandomNormal(2018); - - Assert.AreEqual(-0.46666, rn.NextDouble(), 0.0001); - Assert.AreEqual(-0.37989, rn.NextDouble(), 0.0001); - } - - [Test] - public void RandomNormalTestWithMean() - { - RandomNormal rn = new RandomNormal(2018, 5.0f); - - Assert.AreEqual(4.53333, rn.NextDouble(), 0.0001); - Assert.AreEqual(4.6201, rn.NextDouble(), 0.0001); - } - - [Test] - public void RandomNormalTestWithStddev() - { - RandomNormal rn = new RandomNormal(2018, 1.0f, 4.2f); - - Assert.AreEqual(-0.9599, rn.NextDouble(), 0.0001); - Assert.AreEqual(-0.5955, rn.NextDouble(), 0.0001); - } - - [Test] - public void RandomNormalTestWithMeanStddev() - { - RandomNormal rn = new RandomNormal(2018, -3.2f, 2.2f); - - Assert.AreEqual(-4.2266, rn.NextDouble(), 0.0001); - Assert.AreEqual(-4.0357, rn.NextDouble(), 0.0001); - } - - [Test] - public void RandomNormalTestTensorInt() - { - RandomNormal rn = new RandomNormal(1982); - Tensor t = new Tensor - { - ValueType = Tensor.TensorType.Integer - }; - - Assert.Throws(() => rn.FillTensor(t)); - } - - [Test] - public void RandomNormalTestDataNull() - { - RandomNormal rn = new RandomNormal(1982); - Tensor t = new Tensor - { - ValueType = Tensor.TensorType.FloatingPoint - }; - - Assert.Throws(() => rn.FillTensor(t)); - } - - [Test] - public void RandomNormalTestTensor() - { - RandomNormal rn = new RandomNormal(1982); - Tensor t = new Tensor - { - ValueType = Tensor.TensorType.FloatingPoint, - Data = Array.CreateInstance(typeof(float), new long[3] {3, 4, 2}) - }; - - rn.FillTensor(t); - - float[] reference = new float[] - { - -0.2139822f, - 0.5051259f, - -0.5640336f, - -0.3357787f, - -0.2055894f, - -0.09432302f, - -0.01419199f, - 0.53621f, - -0.5507085f, - -0.2651141f, - 0.09315512f, - -0.04918706f, - -0.179625f, - 0.2280539f, - 0.1883962f, - 0.4047216f, - 0.1704049f, - 0.5050544f, - -0.3365685f, - 0.3542781f, - 0.5951571f, - 0.03460682f, - -0.5537263f, - -0.4378373f, - }; - - int i = 0; - foreach (float f in t.Data) - { - Assert.AreEqual(f, reference[i], 0.0001); - ++i; - } - - - } - } + public class RandomNormalTest + { + + private const float firstValue = -1.19580f; + private const float secondValue = -0.97345f; + private const double epsilon = 0.0001; + + [Test] + public void RandomNormalTestTwoDouble() + { + RandomNormal rn = 
new RandomNormal(2018); + + Assert.AreEqual(firstValue, rn.NextDouble(), epsilon); + Assert.AreEqual(secondValue, rn.NextDouble(), epsilon); + } + + [Test] + public void RandomNormalTestWithMean() + { + RandomNormal rn = new RandomNormal(2018, 5.0f); + + Assert.AreEqual(firstValue + 5.0, rn.NextDouble(), epsilon); + Assert.AreEqual(secondValue + 5.0, rn.NextDouble(), epsilon); + } + + [Test] + public void RandomNormalTestWithStddev() + { + RandomNormal rn = new RandomNormal(2018, 0.0f, 4.2f); + + Assert.AreEqual(firstValue * 4.2, rn.NextDouble(), epsilon); + Assert.AreEqual(secondValue * 4.2, rn.NextDouble(), epsilon); + } + + [Test] + public void RandomNormalTestWithMeanStddev() + { + float mean = -3.2f; + float stddev = 2.2f; + RandomNormal rn = new RandomNormal(2018, mean, stddev); + + Assert.AreEqual(firstValue * stddev + mean, rn.NextDouble(), epsilon); + Assert.AreEqual(secondValue * stddev + mean, rn.NextDouble(), epsilon); + } + + [Test] + public void RandomNormalTestTensorInt() + { + RandomNormal rn = new RandomNormal(1982); + TensorProxy t = new TensorProxy + { + ValueType = TensorProxy.TensorType.Integer + }; + + Assert.Throws(() => rn.FillTensor(t)); + } + + [Test] + public void RandomNormalTestDataNull() + { + RandomNormal rn = new RandomNormal(1982); + TensorProxy t = new TensorProxy + { + ValueType = TensorProxy.TensorType.FloatingPoint + }; + + Assert.Throws(() => rn.FillTensor(t)); + } + + [Test] + public void RandomNormalTestDistribution() + { + float mean = -3.2f; + float stddev = 2.2f; + RandomNormal rn = new RandomNormal(2018, mean, stddev); + + int numSamples = 100000; + // Adapted from https://www.johndcook.com/blog/standard_deviation/ + // Computes stddev and mean without losing precision + double oldM = 0.0, newM = 0.0, oldS = 0.0, newS = 0.0; + + for (int i = 0; i < numSamples; i++) + { + double x = rn.NextDouble(); + if (i == 0) + { + oldM = newM = x; + oldS = 0.0; + } + else + { + newM = oldM + (x - oldM) / i; + newS = oldS + (x - oldM) * (x - newM); + + // set up for next iteration + oldM = newM; + oldS = newS; + } + } + + double sampleMean = newM; + double sampleVariance = newS / (numSamples - 1); + double sampleStddev = Math.Sqrt(sampleVariance); + + // Note a larger epsilon here. We could get closer to the true values with more samples. 
+ Assert.AreEqual(mean, sampleMean, 0.01); + Assert.AreEqual(stddev, sampleStddev, 0.01); + + } + + [Test] + public void RandomNormalTestTensor() + { + RandomNormal rn = new RandomNormal(1982); + TensorProxy t = new TensorProxy + { + ValueType = TensorProxy.TensorType.FloatingPoint, + Data = new Tensor(1, 3, 4, 2) + }; + + rn.FillTensor(t); + + float[] reference = new float[] + { + -0.4315872f, + -1.11074f, + 0.3414804f, + -1.130287f, + 0.1413168f, + -0.5105762f, + -0.3027347f, + -0.2645015f, + 1.225356f, + -0.02921959f, + 0.3716498f, + -1.092338f, + 0.9561074f, + -0.5018106f, + 1.167787f, + -0.7763879f, + -0.07491868f, + 0.5396146f, + -0.1377991f, + 0.3331701f, + 0.06144788f, + 0.9520947f, + 1.088157f, + -1.177194f, + }; + + for (var i = 0; i < t.Data.length; i++) + { + Assert.AreEqual(t.Data[i], reference[i], 0.0001); + } + } + } } diff --git a/UnitySDK/Assets/ML-Agents/Examples/3DBall/Scenes/3DBall.unity b/UnitySDK/Assets/ML-Agents/Examples/3DBall/Scenes/3DBall.unity index 2b07e75fbb..249502ccec 100644 --- a/UnitySDK/Assets/ML-Agents/Examples/3DBall/Scenes/3DBall.unity +++ b/UnitySDK/Assets/ML-Agents/Examples/3DBall/Scenes/3DBall.unity @@ -38,7 +38,7 @@ RenderSettings: m_ReflectionIntensity: 1 m_CustomReflection: {fileID: 0} m_Sun: {fileID: 0} - m_IndirectSpecularColor: {r: 0.44824862, g: 0.49827534, b: 0.57558274, a: 1} + m_IndirectSpecularColor: {r: 0.44824898, g: 0.49827564, b: 0.5755826, a: 1} --- !u!157 &3 LightmapSettings: m_ObjectHideFlags: 0 @@ -783,7 +783,13 @@ MonoBehaviour: timeScale: 1 targetFrameRate: -1 resetParameters: - resetParameters: [] + resetParameters: + - key: mass + value: 1 + - key: gravity + value: 9.81 + - key: scale + value: 1 --- !u!1 &1746325439 GameObject: m_ObjectHideFlags: 0 diff --git a/UnitySDK/Assets/ML-Agents/Examples/3DBall/Scenes/3DBallHard.unity b/UnitySDK/Assets/ML-Agents/Examples/3DBall/Scenes/3DBallHard.unity index 117ed483a6..f94605845b 100644 --- a/UnitySDK/Assets/ML-Agents/Examples/3DBall/Scenes/3DBallHard.unity +++ b/UnitySDK/Assets/ML-Agents/Examples/3DBall/Scenes/3DBallHard.unity @@ -38,7 +38,7 @@ RenderSettings: m_ReflectionIntensity: 1 m_CustomReflection: {fileID: 0} m_Sun: {fileID: 0} - m_IndirectSpecularColor: {r: 0.45096254, g: 0.5008292, b: 0.5744089, a: 1} + m_IndirectSpecularColor: {r: 0.45096314, g: 0.50082976, b: 0.57440954, a: 1} --- !u!157 &3 LightmapSettings: m_ObjectHideFlags: 0 @@ -711,7 +711,13 @@ MonoBehaviour: timeScale: 1 targetFrameRate: -1 resetParameters: - resetParameters: [] + resetParameters: + - key: mass + value: 1 + - key: gravity + value: 9.81 + - key: scale + value: 1 --- !u!1001 &1591880668 Prefab: m_ObjectHideFlags: 0 diff --git a/UnitySDK/Assets/ML-Agents/Examples/3DBall/Scripts/Ball3DAcademy.cs b/UnitySDK/Assets/ML-Agents/Examples/3DBall/Scripts/Ball3DAcademy.cs old mode 100755 new mode 100644 index 328db90cc7..1a789d4528 --- a/UnitySDK/Assets/ML-Agents/Examples/3DBall/Scripts/Ball3DAcademy.cs +++ b/UnitySDK/Assets/ML-Agents/Examples/3DBall/Scripts/Ball3DAcademy.cs @@ -7,7 +7,7 @@ public class Ball3DAcademy : Academy { public override void AcademyReset() { - + Physics.gravity = new Vector3(0, -resetParameters["gravity"], 0); } public override void AcademyStep() diff --git a/UnitySDK/Assets/ML-Agents/Examples/3DBall/Scripts/Ball3DAgent.cs b/UnitySDK/Assets/ML-Agents/Examples/3DBall/Scripts/Ball3DAgent.cs old mode 100755 new mode 100644 index ca05b99df9..753dcc2dd8 --- a/UnitySDK/Assets/ML-Agents/Examples/3DBall/Scripts/Ball3DAgent.cs +++ b/UnitySDK/Assets/ML-Agents/Examples/3DBall/Scripts/Ball3DAgent.cs @@ 
-8,10 +8,14 @@ public class Ball3DAgent : Agent [Header("Specific to Ball3D")] public GameObject ball; private Rigidbody ballRb; + private ResetParameters resetParams; public override void InitializeAgent() { ballRb = ball.GetComponent(); + var academy = Object.FindObjectOfType() as Academy; + resetParams = academy.resetParameters; + SetResetParameters(); } public override void CollectObservations() @@ -24,7 +28,7 @@ public override void CollectObservations() public override void AgentAction(float[] vectorAction, string textAction) { - + if (brain.brainParameters.vectorActionSpaceType == SpaceType.continuous) { var actionZ = 2f * Mathf.Clamp(vectorAction[0], -1f, 1f); @@ -63,7 +67,20 @@ public override void AgentReset() ballRb.velocity = new Vector3(0f, 0f, 0f); ball.transform.position = new Vector3(Random.Range(-1.5f, 1.5f), 4f, Random.Range(-1.5f, 1.5f)) + gameObject.transform.position; + //Reset the parameters when the Agent is reset. + SetResetParameters(); + } + public void SetBall() + { + //Set the attributes of the ball by fetching the information from the academy + ballRb.mass = resetParams["mass"]; + var scale = resetParams["scale"]; + ball.transform.localScale = new Vector3(scale, scale, scale); } + public void SetResetParameters() + { + SetBall(); + } } diff --git a/UnitySDK/Assets/ML-Agents/Examples/3DBall/Scripts/Ball3DHardAgent.cs b/UnitySDK/Assets/ML-Agents/Examples/3DBall/Scripts/Ball3DHardAgent.cs old mode 100755 new mode 100644 index 5b2e742b7d..7f02a51e31 --- a/UnitySDK/Assets/ML-Agents/Examples/3DBall/Scripts/Ball3DHardAgent.cs +++ b/UnitySDK/Assets/ML-Agents/Examples/3DBall/Scripts/Ball3DHardAgent.cs @@ -8,10 +8,14 @@ public class Ball3DHardAgent : Agent [Header("Specific to Ball3DHard")] public GameObject ball; private Rigidbody ballRb; + private ResetParameters resetParams; public override void InitializeAgent() { ballRb = ball.GetComponent(); + var academy = Object.FindObjectOfType() as Academy; + resetParams = academy.resetParameters; + SetResetParameters(); } public override void CollectObservations() @@ -23,7 +27,7 @@ public override void CollectObservations() public override void AgentAction(float[] vectorAction, string textAction) { - + if (brain.brainParameters.vectorActionSpaceType == SpaceType.continuous) { var actionZ = 2f * Mathf.Clamp(vectorAction[0], -1f, 1f); @@ -65,4 +69,17 @@ public override void AgentReset() } + + public void SetBall() + { + //Set the attributes of the ball by fetching the information from the academy + ballRb.mass = resetParams["mass"]; + var scale = resetParams["scale"]; + ball.transform.localScale = new Vector3(scale, scale, scale); + } + + public void SetResetParameters() + { + SetBall(); + } } diff --git a/UnitySDK/Assets/ML-Agents/Examples/3DBall/TFModels/3DBallHardLearning.nn b/UnitySDK/Assets/ML-Agents/Examples/3DBall/TFModels/3DBallHardLearning.nn index f5c74dc2a9..38ca710c2e 100644 Binary files a/UnitySDK/Assets/ML-Agents/Examples/3DBall/TFModels/3DBallHardLearning.nn and b/UnitySDK/Assets/ML-Agents/Examples/3DBall/TFModels/3DBallHardLearning.nn differ diff --git a/UnitySDK/Assets/ML-Agents/Examples/3DBall/TFModels/3DBallLearning.nn b/UnitySDK/Assets/ML-Agents/Examples/3DBall/TFModels/3DBallLearning.nn index dbe8ca2626..16c4b0baea 100644 Binary files a/UnitySDK/Assets/ML-Agents/Examples/3DBall/TFModels/3DBallLearning.nn and b/UnitySDK/Assets/ML-Agents/Examples/3DBall/TFModels/3DBallLearning.nn differ diff --git a/UnitySDK/Assets/ML-Agents/Examples/BananaCollectors/Scenes/Banana.unity 
b/UnitySDK/Assets/ML-Agents/Examples/BananaCollectors/Scenes/Banana.unity index 6b14c6feaa..90bf5661be 100644 --- a/UnitySDK/Assets/ML-Agents/Examples/BananaCollectors/Scenes/Banana.unity +++ b/UnitySDK/Assets/ML-Agents/Examples/BananaCollectors/Scenes/Banana.unity @@ -38,7 +38,7 @@ RenderSettings: m_ReflectionIntensity: 1 m_CustomReflection: {fileID: 0} m_Sun: {fileID: 0} - m_IndirectSpecularColor: {r: 0.44824862, g: 0.49827534, b: 0.57558274, a: 1} + m_IndirectSpecularColor: {r: 0.44824898, g: 0.49827564, b: 0.5755826, a: 1} --- !u!157 &3 LightmapSettings: m_ObjectHideFlags: 0 @@ -683,7 +683,11 @@ MonoBehaviour: timeScale: 1 targetFrameRate: 60 resetParameters: - resetParameters: [] + resetParameters: + - key: laser_length + value: 1 + - key: agent_scale + value: 1 agents: [] listArea: [] totalScore: 0 diff --git a/UnitySDK/Assets/ML-Agents/Examples/BananaCollectors/Scripts/BananaAgent.cs b/UnitySDK/Assets/ML-Agents/Examples/BananaCollectors/Scripts/BananaAgent.cs index 85f983dcf7..cf9533bdcf 100644 --- a/UnitySDK/Assets/ML-Agents/Examples/BananaCollectors/Scripts/BananaAgent.cs +++ b/UnitySDK/Assets/ML-Agents/Examples/BananaCollectors/Scripts/BananaAgent.cs @@ -16,7 +16,7 @@ public class BananaAgent : Agent float effectTime; Rigidbody agentRb; private int bananas; - + private float laser_length; // Speed of agent rotation. public float turnSpeed = 300; @@ -30,6 +30,7 @@ public class BananaAgent : Agent public bool contribute; private RayPerception3D rayPer; public bool useVectorObs; + public override void InitializeAgent() { @@ -39,6 +40,8 @@ public override void InitializeAgent() myArea = area.GetComponent(); rayPer = GetComponent(); myAcademy = FindObjectOfType(); + + SetResetParameters(); } public override void CollectObservations() @@ -157,7 +160,7 @@ public void MoveAgent(float[] act) if (shoot) { - myLaser.transform.localScale = new Vector3(1f, 1f, 1f); + myLaser.transform.localScale = new Vector3(1f, 1f, laser_length); Vector3 position = transform.TransformDirection(RayPerception3D.PolarToCartesian(25f, 90f)); Debug.DrawRay(transform.position, position, Color.red, 0f, true); RaycastHit hit; @@ -239,6 +242,8 @@ public override void AgentReset() 2f, Random.Range(-myArea.range, myArea.range)) + area.transform.position; transform.rotation = Quaternion.Euler(new Vector3(0f, Random.Range(0, 360))); + + SetResetParameters(); } void OnCollisionEnter(Collision collision) @@ -271,4 +276,21 @@ public override void AgentOnDone() { } + + public void SetLaserLengths() + { + laser_length = myAcademy.resetParameters["laser_length"]; + } + + public void SetAgentScale() + { + var agent_scale = myAcademy.resetParameters["agent_scale"]; + gameObject.transform.localScale = new Vector3(agent_scale, agent_scale, agent_scale); + } + + public void SetResetParameters() + { + SetLaserLengths(); + SetAgentScale(); + } } diff --git a/UnitySDK/Assets/ML-Agents/Examples/BananaCollectors/Scripts/BananaLogic.cs b/UnitySDK/Assets/ML-Agents/Examples/BananaCollectors/Scripts/BananaLogic.cs index 25501ed7a9..7a18b45591 100644 --- a/UnitySDK/Assets/ML-Agents/Examples/BananaCollectors/Scripts/BananaLogic.cs +++ b/UnitySDK/Assets/ML-Agents/Examples/BananaCollectors/Scripts/BananaLogic.cs @@ -21,8 +21,8 @@ public class BananaLogic : MonoBehaviour { if (respawn) { transform.position = new Vector3(Random.Range(-myArea.range, myArea.range), - transform.position.y + 3f, - Random.Range(-myArea.range, myArea.range)); + 3f, + Random.Range(-myArea.range, myArea.range)) + myArea.transform.position; } else { diff --git 
a/UnitySDK/Assets/ML-Agents/Examples/BananaCollectors/TFModels/BananaLearning.nn b/UnitySDK/Assets/ML-Agents/Examples/BananaCollectors/TFModels/BananaLearning.nn index ac8a519cec..f83aa78394 100644 Binary files a/UnitySDK/Assets/ML-Agents/Examples/BananaCollectors/TFModels/BananaLearning.nn and b/UnitySDK/Assets/ML-Agents/Examples/BananaCollectors/TFModels/BananaLearning.nn differ diff --git a/UnitySDK/Assets/ML-Agents/Examples/Basic/TFModels/BasicLearning.nn b/UnitySDK/Assets/ML-Agents/Examples/Basic/TFModels/BasicLearning.nn index 9d94677cf5..7ded379ad9 100644 Binary files a/UnitySDK/Assets/ML-Agents/Examples/Basic/TFModels/BasicLearning.nn and b/UnitySDK/Assets/ML-Agents/Examples/Basic/TFModels/BasicLearning.nn differ diff --git a/UnitySDK/Assets/ML-Agents/Examples/Bouncer/Scenes/Bouncer.unity b/UnitySDK/Assets/ML-Agents/Examples/Bouncer/Scenes/Bouncer.unity index ab4b1bbb0d..9e31bfdfb0 100644 --- a/UnitySDK/Assets/ML-Agents/Examples/Bouncer/Scenes/Bouncer.unity +++ b/UnitySDK/Assets/ML-Agents/Examples/Bouncer/Scenes/Bouncer.unity @@ -38,7 +38,7 @@ RenderSettings: m_ReflectionIntensity: 1 m_CustomReflection: {fileID: 0} m_Sun: {fileID: 0} - m_IndirectSpecularColor: {r: 0.44824862, g: 0.49827534, b: 0.57558274, a: 1} + m_IndirectSpecularColor: {r: 0.44824898, g: 0.49827564, b: 0.5755826, a: 1} --- !u!157 &3 LightmapSettings: m_ObjectHideFlags: 0 @@ -872,7 +872,9 @@ MonoBehaviour: timeScale: 1 targetFrameRate: 60 resetParameters: - resetParameters: [] + resetParameters: + - key: banana_scale + value: 150 gravityMultiplier: 2 --- !u!4 &1453982295 Transform: diff --git a/UnitySDK/Assets/ML-Agents/Examples/Bouncer/Scenes/BouncerIL.unity b/UnitySDK/Assets/ML-Agents/Examples/Bouncer/Scenes/BouncerIL.unity index 382f982c78..70e54dd53a 100644 --- a/UnitySDK/Assets/ML-Agents/Examples/Bouncer/Scenes/BouncerIL.unity +++ b/UnitySDK/Assets/ML-Agents/Examples/Bouncer/Scenes/BouncerIL.unity @@ -38,7 +38,7 @@ RenderSettings: m_ReflectionIntensity: 1 m_CustomReflection: {fileID: 0} m_Sun: {fileID: 0} - m_IndirectSpecularColor: {r: 0.4482636, g: 0.49828887, b: 0.5755903, a: 1} + m_IndirectSpecularColor: {r: 0.44824898, g: 0.49827564, b: 0.5755826, a: 1} --- !u!157 &3 LightmapSettings: m_ObjectHideFlags: 0 @@ -1002,6 +1002,7 @@ MonoBehaviour: brain: {fileID: 11400000, guid: 5527511df7b944e8e9177dd69db5a9c1, type: 2} agentParameters: agentCameras: [] + agentRenderTextures: [] maxStep: 0 resetOnDone: 1 onDemandDecision: 1 @@ -1547,6 +1548,7 @@ MonoBehaviour: brain: {fileID: 11400000, guid: 573920e3a672d40038169c7ffdbdca05, type: 2} agentParameters: agentCameras: [] + agentRenderTextures: [] maxStep: 0 resetOnDone: 1 onDemandDecision: 1 @@ -1638,7 +1640,9 @@ MonoBehaviour: timeScale: 1 targetFrameRate: 60 resetParameters: - resetParameters: [] + resetParameters: + - key: banana_scale + value: 150 gravityMultiplier: 2 --- !u!4 &1453982295 Transform: diff --git a/UnitySDK/Assets/ML-Agents/Examples/Bouncer/Scripts/BouncerAgent.cs b/UnitySDK/Assets/ML-Agents/Examples/Bouncer/Scripts/BouncerAgent.cs old mode 100755 new mode 100644 index 0a188bbd3f..c272c17038 --- a/UnitySDK/Assets/ML-Agents/Examples/Bouncer/Scripts/BouncerAgent.cs +++ b/UnitySDK/Assets/ML-Agents/Examples/Bouncer/Scripts/BouncerAgent.cs @@ -1,4 +1,4 @@ -using System.Collections; +using System.Collections; using System.Collections.Generic; using UnityEngine; using MLAgents; @@ -15,10 +15,17 @@ public class BouncerAgent : Agent { int numberJumps = 20; int jumpLeft = 20; + ResetParameters resetParams; + public override void InitializeAgent() { rb = 
gameObject.GetComponent(); lookDir = Vector3.zero; + + var academy = FindObjectOfType() as Academy; + resetParams = academy.resetParameters; + + SetResetParameters(); } public override void CollectObservations() @@ -29,7 +36,7 @@ public override void CollectObservations() public override void AgentAction(float[] vectorAction, string textAction) { - for (int i = 0; i < vectorAction.Length; i++) + for (int i = 0; i < vectorAction.Length; i++) { vectorAction[i] = Mathf.Clamp(vectorAction[i], -1f, 1f); } @@ -60,6 +67,8 @@ public override void AgentReset() bb.Respawn(); } jumpLeft = numberJumps; + + SetResetParameters(); } public override void AgentOnDone() @@ -109,4 +118,15 @@ private void Update() Time.deltaTime * 10f); } } + + public void SetBananaScale() + { + var banana_scale = resetParams["banana_scale"]; + banana.transform.localScale = new Vector3(banana_scale, banana_scale, banana_scale); + } + + public void SetResetParameters() + { + SetBananaScale(); + } } diff --git a/UnitySDK/Assets/ML-Agents/Examples/Bouncer/TFModels/BouncerLearning.nn b/UnitySDK/Assets/ML-Agents/Examples/Bouncer/TFModels/BouncerLearning.nn index 4ca329a0d4..f647e15ca4 100644 Binary files a/UnitySDK/Assets/ML-Agents/Examples/Bouncer/TFModels/BouncerLearning.nn and b/UnitySDK/Assets/ML-Agents/Examples/Bouncer/TFModels/BouncerLearning.nn differ diff --git a/UnitySDK/Assets/ML-Agents/Examples/Crawler/TFModels/CrawlerDynamicLearning.nn b/UnitySDK/Assets/ML-Agents/Examples/Crawler/TFModels/CrawlerDynamicLearning.nn index 6562f3bc85..87403f0d24 100644 Binary files a/UnitySDK/Assets/ML-Agents/Examples/Crawler/TFModels/CrawlerDynamicLearning.nn and b/UnitySDK/Assets/ML-Agents/Examples/Crawler/TFModels/CrawlerDynamicLearning.nn differ diff --git a/UnitySDK/Assets/ML-Agents/Examples/Crawler/TFModels/CrawlerStaticLearning.nn b/UnitySDK/Assets/ML-Agents/Examples/Crawler/TFModels/CrawlerStaticLearning.nn index 8734082436..bd37244fc0 100644 Binary files a/UnitySDK/Assets/ML-Agents/Examples/Crawler/TFModels/CrawlerStaticLearning.nn and b/UnitySDK/Assets/ML-Agents/Examples/Crawler/TFModels/CrawlerStaticLearning.nn differ diff --git a/UnitySDK/Assets/ML-Agents/Examples/GridWorld/TFModels/GridWorldLearning.nn b/UnitySDK/Assets/ML-Agents/Examples/GridWorld/TFModels/GridWorldLearning.nn index 1cfd3cc679..c5fc081004 100644 Binary files a/UnitySDK/Assets/ML-Agents/Examples/GridWorld/TFModels/GridWorldLearning.nn and b/UnitySDK/Assets/ML-Agents/Examples/GridWorld/TFModels/GridWorldLearning.nn differ diff --git a/UnitySDK/Assets/ML-Agents/Examples/Hallway/TFModels/HallwayLearning.nn b/UnitySDK/Assets/ML-Agents/Examples/Hallway/TFModels/HallwayLearning.nn index 960ed82e2d..9ec4299254 100644 Binary files a/UnitySDK/Assets/ML-Agents/Examples/Hallway/TFModels/HallwayLearning.nn and b/UnitySDK/Assets/ML-Agents/Examples/Hallway/TFModels/HallwayLearning.nn differ diff --git a/UnitySDK/Assets/ML-Agents/Examples/PushBlock/Prefabs/PushBlockArea.prefab b/UnitySDK/Assets/ML-Agents/Examples/PushBlock/Prefabs/PushBlockArea.prefab index 0357bfa2ee..4246a53459 100644 --- a/UnitySDK/Assets/ML-Agents/Examples/PushBlock/Prefabs/PushBlockArea.prefab +++ b/UnitySDK/Assets/ML-Agents/Examples/PushBlock/Prefabs/PushBlockArea.prefab @@ -838,7 +838,7 @@ BoxCollider: m_PrefabParentObject: {fileID: 0} m_PrefabInternal: {fileID: 100100000} m_GameObject: {fileID: 1500989011945850} - m_Material: {fileID: 0} + m_Material: {fileID: 13400000, guid: f440cd475293044139739aff331224fb, type: 2} m_IsTrigger: 0 m_Enabled: 1 serializedVersion: 2 @@ -905,6 +905,7 @@ 
MonoBehaviour: brain: {fileID: 11400000, guid: e8b2d719f6a324b1abb68d8cf2859f5c, type: 2} agentParameters: agentCameras: [] + agentRenderTextures: [] maxStep: 5000 resetOnDone: 1 onDemandDecision: 0 diff --git a/UnitySDK/Assets/ML-Agents/Examples/PushBlock/Prefabs/StudentArea.prefab b/UnitySDK/Assets/ML-Agents/Examples/PushBlock/Prefabs/StudentArea.prefab index 1d518d4234..d8ffc62dc9 100644 --- a/UnitySDK/Assets/ML-Agents/Examples/PushBlock/Prefabs/StudentArea.prefab +++ b/UnitySDK/Assets/ML-Agents/Examples/PushBlock/Prefabs/StudentArea.prefab @@ -790,7 +790,7 @@ BoxCollider: m_PrefabParentObject: {fileID: 0} m_PrefabInternal: {fileID: 100100000} m_GameObject: {fileID: 1132847330634378} - m_Material: {fileID: 0} + m_Material: {fileID: 13400000, guid: f440cd475293044139739aff331224fb, type: 2} m_IsTrigger: 0 m_Enabled: 1 serializedVersion: 2 @@ -905,6 +905,7 @@ MonoBehaviour: brain: {fileID: 11400000, guid: e8b2d719f6a324b1abb68d8cf2859f5c, type: 2} agentParameters: agentCameras: [] + agentRenderTextures: [] maxStep: 5000 resetOnDone: 1 onDemandDecision: 0 diff --git a/UnitySDK/Assets/ML-Agents/Examples/PushBlock/Prefabs/TeacherArea.prefab b/UnitySDK/Assets/ML-Agents/Examples/PushBlock/Prefabs/TeacherArea.prefab index 6b9bfe7ed6..94a0da4238 100644 --- a/UnitySDK/Assets/ML-Agents/Examples/PushBlock/Prefabs/TeacherArea.prefab +++ b/UnitySDK/Assets/ML-Agents/Examples/PushBlock/Prefabs/TeacherArea.prefab @@ -791,7 +791,7 @@ BoxCollider: m_PrefabParentObject: {fileID: 0} m_PrefabInternal: {fileID: 100100000} m_GameObject: {fileID: 1492788814869846} - m_Material: {fileID: 0} + m_Material: {fileID: 13400000, guid: f440cd475293044139739aff331224fb, type: 2} m_IsTrigger: 0 m_Enabled: 1 serializedVersion: 2 @@ -919,6 +919,7 @@ MonoBehaviour: brain: {fileID: 11400000, guid: dd07b1953eac4411b81fba032f394726, type: 2} agentParameters: agentCameras: [] + agentRenderTextures: [] maxStep: 5000 resetOnDone: 1 onDemandDecision: 0 diff --git a/UnitySDK/Assets/ML-Agents/Examples/PushBlock/Prefabs/VisualArea.prefab b/UnitySDK/Assets/ML-Agents/Examples/PushBlock/Prefabs/VisualArea.prefab index 7f35e22ef2..355c957683 100644 --- a/UnitySDK/Assets/ML-Agents/Examples/PushBlock/Prefabs/VisualArea.prefab +++ b/UnitySDK/Assets/ML-Agents/Examples/PushBlock/Prefabs/VisualArea.prefab @@ -870,7 +870,7 @@ BoxCollider: m_PrefabParentObject: {fileID: 0} m_PrefabInternal: {fileID: 100100000} m_GameObject: {fileID: 1553741088268304} - m_Material: {fileID: 0} + m_Material: {fileID: 13400000, guid: f440cd475293044139739aff331224fb, type: 2} m_IsTrigger: 0 m_Enabled: 1 serializedVersion: 2 @@ -970,6 +970,7 @@ MonoBehaviour: agentParameters: agentCameras: - {fileID: 20223756300728806} + agentRenderTextures: [] maxStep: 5000 resetOnDone: 1 onDemandDecision: 0 diff --git a/UnitySDK/Assets/ML-Agents/Examples/PushBlock/Scenes/PushBlock.unity b/UnitySDK/Assets/ML-Agents/Examples/PushBlock/Scenes/PushBlock.unity index 7b5c639ab6..66dce0ad69 100644 --- a/UnitySDK/Assets/ML-Agents/Examples/PushBlock/Scenes/PushBlock.unity +++ b/UnitySDK/Assets/ML-Agents/Examples/PushBlock/Scenes/PushBlock.unity @@ -1570,7 +1570,15 @@ MonoBehaviour: timeScale: 1 targetFrameRate: 60 resetParameters: - resetParameters: [] + resetParameters: + - key: dynamic_friction + value: 0 + - key: static_friction + value: 0 + - key: block_drag + value: 0.5 + - key: block_scale + value: 2 agentRunSpeed: 2 agentRotationSpeed: 15 spawnAreaMarginMultiplier: 0.5 diff --git a/UnitySDK/Assets/ML-Agents/Examples/PushBlock/Scenes/PushBlockIL.unity 
b/UnitySDK/Assets/ML-Agents/Examples/PushBlock/Scenes/PushBlockIL.unity index 4b732c14ba..99b839a12f 100644 --- a/UnitySDK/Assets/ML-Agents/Examples/PushBlock/Scenes/PushBlockIL.unity +++ b/UnitySDK/Assets/ML-Agents/Examples/PushBlock/Scenes/PushBlockIL.unity @@ -339,6 +339,11 @@ Prefab: propertyPath: m_RootOrder value: 4 objectReference: {fileID: 0} + - target: {fileID: 65880592586321730, guid: bed6005cc2a1a47edafba27cde6b5538, + type: 2} + propertyPath: m_Material + value: + objectReference: {fileID: 0} m_RemovedComponents: [] m_ParentPrefab: {fileID: 100100000, guid: bed6005cc2a1a47edafba27cde6b5538, type: 2} m_IsPrefabParent: 0 @@ -552,7 +557,15 @@ MonoBehaviour: timeScale: 1 targetFrameRate: 60 resetParameters: - resetParameters: [] + resetParameters: + - key: dynamic_friction + value: 0 + - key: static_friction + value: 0 + - key: block_drag + value: 0.5 + - key: block_scale + value: 2 agentRunSpeed: 2 agentRotationSpeed: 15 spawnAreaMarginMultiplier: 0.5 diff --git a/UnitySDK/Assets/ML-Agents/Examples/PushBlock/Scenes/VisualPushBlock.unity b/UnitySDK/Assets/ML-Agents/Examples/PushBlock/Scenes/VisualPushBlock.unity index e573b4c77c..9757c2f31e 100644 --- a/UnitySDK/Assets/ML-Agents/Examples/PushBlock/Scenes/VisualPushBlock.unity +++ b/UnitySDK/Assets/ML-Agents/Examples/PushBlock/Scenes/VisualPushBlock.unity @@ -658,7 +658,15 @@ MonoBehaviour: timeScale: 1 targetFrameRate: 60 resetParameters: - resetParameters: [] + resetParameters: + - key: dynamic_friction + value: 0 + - key: static_friction + value: 0 + - key: block_drag + value: 0.5 + - key: block_scale + value: 2 agentRunSpeed: 2 agentRotationSpeed: 15 spawnAreaMarginMultiplier: 0.5 diff --git a/UnitySDK/Assets/ML-Agents/Examples/PushBlock/Scripts/PushAgentBasic.cs b/UnitySDK/Assets/ML-Agents/Examples/PushBlock/Scripts/PushAgentBasic.cs old mode 100755 new mode 100644 index 1001bbe8e0..89b26954b9 --- a/UnitySDK/Assets/ML-Agents/Examples/PushBlock/Scripts/PushAgentBasic.cs +++ b/UnitySDK/Assets/ML-Agents/Examples/PushBlock/Scripts/PushAgentBasic.cs @@ -44,6 +44,9 @@ public class PushAgentBasic : Agent Rigidbody agentRB; //cached on initialization Material groundMaterial; //cached on Awake() RayPerception rayPer; + + float[] rayAngles = { 0f, 45f, 90f, 135f, 180f, 110f, 70f }; + string[] detectableObjects = { "block", "goal", "wall" }; /// /// We will be changing the ground material based on success/failue @@ -72,6 +75,8 @@ public override void InitializeAgent() groundRenderer = ground.GetComponent(); // Starting material groundMaterial = groundRenderer.material; + + SetResetParameters(); } public override void CollectObservations() @@ -79,8 +84,7 @@ public override void CollectObservations() if (useVectorObs) { var rayDistance = 12f; - float[] rayAngles = { 0f, 45f, 90f, 135f, 180f, 110f, 70f }; - var detectableObjects = new[] { "block", "goal", "wall" }; + AddVectorObs(rayPer.Perceive(rayDistance, rayAngles, detectableObjects, 0f, 0f)); AddVectorObs(rayPer.Perceive(rayDistance, rayAngles, detectableObjects, 1.5f, 0f)); } @@ -215,5 +219,34 @@ public override void AgentReset() transform.position = GetRandomSpawnPos(); agentRB.velocity = Vector3.zero; agentRB.angularVelocity = Vector3.zero; + + SetResetParameters(); + } + + public void SetGroundMaterialFriction() + { + var resetParams = academy.resetParameters; + + var groundCollider = ground.GetComponent() as Collider; + + groundCollider.material.dynamicFriction = resetParams["dynamic_friction"]; + groundCollider.material.staticFriction = resetParams["static_friction"]; + } + + 
public void SetBlockProperties() + { + var resetParams = academy.resetParameters; + + //Set the scale of the block + blockRB.transform.localScale = new Vector3(resetParams["block_scale"], 0.75f, resetParams["block_scale"]); + + // Set the drag of the block + blockRB.drag = resetParams["block_drag"]; + } + + public void SetResetParameters() + { + SetGroundMaterialFriction(); + SetBlockProperties(); } } diff --git a/UnitySDK/Assets/ML-Agents/Examples/PushBlock/TFModels/PushBlockLearning.nn b/UnitySDK/Assets/ML-Agents/Examples/PushBlock/TFModels/PushBlockLearning.nn index 6c2f960098..b80b164554 100644 Binary files a/UnitySDK/Assets/ML-Agents/Examples/PushBlock/TFModels/PushBlockLearning.nn and b/UnitySDK/Assets/ML-Agents/Examples/PushBlock/TFModels/PushBlockLearning.nn differ diff --git a/UnitySDK/Assets/ML-Agents/Examples/Pyramids/TFModels/PyramidsLearning.nn b/UnitySDK/Assets/ML-Agents/Examples/Pyramids/TFModels/PyramidsLearning.nn index 72def7accf..c6da2cfc8e 100644 Binary files a/UnitySDK/Assets/ML-Agents/Examples/Pyramids/TFModels/PyramidsLearning.nn and b/UnitySDK/Assets/ML-Agents/Examples/Pyramids/TFModels/PyramidsLearning.nn differ diff --git a/UnitySDK/Assets/ML-Agents/Examples/Reacher/Scenes/Reacher.unity b/UnitySDK/Assets/ML-Agents/Examples/Reacher/Scenes/Reacher.unity index 5f65e9ed51..04382108ef 100644 --- a/UnitySDK/Assets/ML-Agents/Examples/Reacher/Scenes/Reacher.unity +++ b/UnitySDK/Assets/ML-Agents/Examples/Reacher/Scenes/Reacher.unity @@ -38,7 +38,7 @@ RenderSettings: m_ReflectionIntensity: 1 m_CustomReflection: {fileID: 0} m_Sun: {fileID: 762086411} - m_IndirectSpecularColor: {r: 0.4465934, g: 0.49642956, b: 0.5748249, a: 1} + m_IndirectSpecularColor: {r: 0.44657898, g: 0.49641287, b: 0.5748173, a: 1} --- !u!157 &3 LightmapSettings: m_ObjectHideFlags: 0 @@ -1122,8 +1122,12 @@ MonoBehaviour: value: 5 - key: goal_speed value: 1 - goalSize: 5 - goalSpeed: 1 + - key: gravity + value: 9.81 + - key: deviation + value: 0 + - key: deviation_freq + value: 0 --- !u!4 &1574236049 Transform: m_ObjectHideFlags: 0 diff --git a/UnitySDK/Assets/ML-Agents/Examples/Reacher/Scripts/ReacherAcademy.cs b/UnitySDK/Assets/ML-Agents/Examples/Reacher/Scripts/ReacherAcademy.cs old mode 100755 new mode 100644 index 39f3e36283..32a5c0490d --- a/UnitySDK/Assets/ML-Agents/Examples/Reacher/Scripts/ReacherAcademy.cs +++ b/UnitySDK/Assets/ML-Agents/Examples/Reacher/Scripts/ReacherAcademy.cs @@ -3,16 +3,13 @@ using UnityEngine; using MLAgents; -public class ReacherAcademy : Academy { - - public float goalSize; - public float goalSpeed; +public class ReacherAcademy : Academy +{ public override void AcademyReset() { - goalSize = (float)resetParameters["goal_size"]; - goalSpeed = (float)resetParameters["goal_speed"]; + Physics.gravity = new Vector3(0, -resetParameters["gravity"], 0); } public override void AcademyStep() diff --git a/UnitySDK/Assets/ML-Agents/Examples/Reacher/Scripts/ReacherAgent.cs b/UnitySDK/Assets/ML-Agents/Examples/Reacher/Scripts/ReacherAgent.cs old mode 100755 new mode 100644 index 7715eade38..85e1f5cecc --- a/UnitySDK/Assets/ML-Agents/Examples/Reacher/Scripts/ReacherAgent.cs +++ b/UnitySDK/Assets/ML-Agents/Examples/Reacher/Scripts/ReacherAgent.cs @@ -1,7 +1,8 @@ using UnityEngine; using MLAgents; -public class ReacherAgent : Agent { +public class ReacherAgent : Agent +{ public GameObject pendulumA; public GameObject pendulumB; @@ -11,11 +12,17 @@ public class ReacherAgent : Agent { float goalDegree; private Rigidbody rbA; private Rigidbody rbB; + // speed of the goal zone around the arm 
(in radians) private float goalSpeed; + // radius of the goal zone private float goalSize; + // Magnitude of sinusoidal (cosine) deviation of the goal along the vertical dimension + private float deviation; + // Frequency of the cosine deviation of the goal along the vertical dimension + private float deviationFreq; /// - /// Collect the rigidbodies of the reacher in order to resue them for + /// Collect the rigidbodies of the reacher in order to resue them for /// observations and actions. /// public override void InitializeAgent() @@ -23,6 +30,8 @@ public override void InitializeAgent() rbA = pendulumA.GetComponent(); rbB = pendulumB.GetComponent(); myAcademy = GameObject.Find("Academy").GetComponent(); + + SetResetParameters(); } /// @@ -43,15 +52,15 @@ public override void CollectObservations() AddVectorObs(goal.transform.localPosition); AddVectorObs(hand.transform.localPosition); - + AddVectorObs(goalSpeed); - } + } /// /// The agent's four actions correspond to torques on each of the two joints. /// public override void AgentAction(float[] vectorAction, string textAction) - { + { goalDegree += goalSpeed; UpdateGoalPosition(); @@ -62,17 +71,18 @@ public override void AgentAction(float[] vectorAction, string textAction) torqueX = Mathf.Clamp(vectorAction[2], -1f, 1f) * 150f; torqueZ = Mathf.Clamp(vectorAction[3], -1f, 1f) * 150f; rbB.AddTorque(new Vector3(torqueX, 0f, torqueZ)); - } + } /// /// Used to move the position of the target goal around the agent. /// - void UpdateGoalPosition() + void UpdateGoalPosition() { var radians = goalDegree * Mathf.PI / 180f; var goalX = 8f * Mathf.Cos(radians); var goalY = 8f * Mathf.Sin(radians); - goal.transform.position = new Vector3(goalY, -1f, goalX) + transform.position; + var goalZ = deviation * Mathf.Cos(deviationFreq * radians); + goal.transform.position = new Vector3(goalY, goalZ, goalX) + transform.position; } /// @@ -93,9 +103,19 @@ public override void AgentReset() goalDegree = Random.Range(0, 360); UpdateGoalPosition(); - goalSize = myAcademy.goalSize; - goalSpeed = Random.Range(-1f, 1f) * myAcademy.goalSpeed; + SetResetParameters(); + goal.transform.localScale = new Vector3(goalSize, goalSize, goalSize); } + + + public void SetResetParameters() + { + goalSize = myAcademy.resetParameters["goal_size"]; + goalSpeed = Random.Range(-1f, 1f) * myAcademy.resetParameters["goal_speed"]; + deviation = myAcademy.resetParameters["deviation"]; + deviationFreq = myAcademy.resetParameters["deviation_freq"]; + + } } diff --git a/UnitySDK/Assets/ML-Agents/Examples/Reacher/TFModels/ReacherLearning.nn b/UnitySDK/Assets/ML-Agents/Examples/Reacher/TFModels/ReacherLearning.nn index 6d93c6afd7..a1967b4b96 100644 Binary files a/UnitySDK/Assets/ML-Agents/Examples/Reacher/TFModels/ReacherLearning.nn and b/UnitySDK/Assets/ML-Agents/Examples/Reacher/TFModels/ReacherLearning.nn differ diff --git a/UnitySDK/Assets/ML-Agents/Examples/SharedAssets/Scripts/RayPerception2D.cs b/UnitySDK/Assets/ML-Agents/Examples/SharedAssets/Scripts/RayPerception2D.cs index 827a8ab109..45cd0fda59 100644 --- a/UnitySDK/Assets/ML-Agents/Examples/SharedAssets/Scripts/RayPerception2D.cs +++ b/UnitySDK/Assets/ML-Agents/Examples/SharedAssets/Scripts/RayPerception2D.cs @@ -15,6 +15,15 @@ public class RayPerception2D : RayPerception /// /// Creates perception vector to be used as part of an observation of an agent. + /// Each ray in the rayAngles array adds a sublist of data to the observation. + /// The sublist contains the observation data for a single ray. 
The list is composed of the following: + /// 1. A one-hot encoding for detectable objects. For example, if detectableObjects.Length = n, the + /// first n elements of the sublist will be a one-hot encoding of the detectableObject that was hit, or + /// all zeroes otherwise. + /// 2. The 'length' element of the sublist will be 1 if the ray missed everything, or 0 if it hit + /// something (detectable or not). + /// 3. The 'length+1' element of the sublist will contain the normalised distance to the object hit. + /// NOTE: Only objects with tags in the detectableObjects array will have a distance set. /// /// The partial vector observation corresponding to the set of rays /// Radius of rays diff --git a/UnitySDK/Assets/ML-Agents/Examples/SharedAssets/Scripts/RayPerception3D.cs b/UnitySDK/Assets/ML-Agents/Examples/SharedAssets/Scripts/RayPerception3D.cs index 7c7179de8c..3467b7fffd 100644 --- a/UnitySDK/Assets/ML-Agents/Examples/SharedAssets/Scripts/RayPerception3D.cs +++ b/UnitySDK/Assets/ML-Agents/Examples/SharedAssets/Scripts/RayPerception3D.cs @@ -1,4 +1,5 @@ -using System.Collections.Generic; +using System; +using System.Collections.Generic; using UnityEngine; namespace MLAgents @@ -12,9 +13,19 @@ public class RayPerception3D : RayPerception { Vector3 endPosition; RaycastHit hit; + private float[] subList; /// /// Creates perception vector to be used as part of an observation of an agent. + /// Each ray in the rayAngles array adds a sublist of data to the observation. + /// The sublist contains the observation data for a single ray. The list is composed of the following: + /// 1. A one-hot encoding for detectable objects. For example, if detectableObjects.Length = n, the + /// first n elements of the sublist will be a one-hot encoding of the detectableObject that was hit, or + /// all zeroes otherwise. + /// 2. The 'length' element of the sublist will be 1 if the ray missed everything, or 0 if it hit + /// something (detectable or not). + /// 3. The 'length+1' element of the sublist will contain the normalised distance to the object hit. + /// NOTE: Only objects with tags in the detectableObjects array will have a distance set. /// /// The partial vector observation corresponding to the set of rays /// Radius of rays @@ -26,7 +37,12 @@ public class RayPerception3D : RayPerception float[] rayAngles, string[] detectableObjects, float startOffset, float endOffset) { + if (subList == null || subList.Length != detectableObjects.Length + 2) + subList = new float[detectableObjects.Length + 2]; + perceptionBuffer.Clear(); + perceptionBuffer.Capacity = subList.Length * rayAngles.Length; + // For each ray sublist stores categorical information on detected object // along with object distance. 
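As a rough sketch of how the flat buffer described by the RayPerception doc comments above can be sliced back into per-ray records (the decoder below is illustrative and not part of the ML-Agents API; each ray occupies detectableObjects.Length + 2 floats):

using System.Collections.Generic;

// Illustrative decoder for the output of Perceive(): each ray contributes a
// one-hot over detectable tags, a "missed everything" flag, and a normalised distance.
public static class RayPerceptionDecoder
{
    // Returns the index of the detectable tag hit by the given ray, or -1 if none.
    public static int DecodeRay(IList<float> perception, int numDetectable, int ray, out float normalisedDistance)
    {
        int stride = numDetectable + 2;                       // one-hot + miss flag + distance
        int offset = ray * stride;
        normalisedDistance = perception[offset + numDetectable + 1];
        for (int tag = 0; tag < numDetectable; tag++)
        {
            if (perception[offset + tag] > 0f)                // one-hot entry for the tag that was hit
                return tag;
        }
        return -1;                                            // ray missed, or hit a non-detectable object
    }
}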
foreach (float angle in rayAngles) @@ -40,7 +56,8 @@ public class RayPerception3D : RayPerception endPosition, Color.black, 0.01f, true); } - float[] subList = new float[detectableObjects.Length + 2]; + Array.Clear(subList, 0, subList.Length); + if (Physics.SphereCast(transform.position + new Vector3(0f, startOffset, 0f), 0.5f, endPosition, out hit, rayDistance)) @@ -60,7 +77,7 @@ public class RayPerception3D : RayPerception subList[detectableObjects.Length] = 1f; } - perceptionBuffer.AddRange(subList); + Utilities.AddRangeNoAlloc(perceptionBuffer, subList); } return perceptionBuffer; diff --git a/UnitySDK/Assets/ML-Agents/Examples/Soccer/Scenes/SoccerTwos.unity b/UnitySDK/Assets/ML-Agents/Examples/Soccer/Scenes/SoccerTwos.unity index 658b4b10b8..1d43d118cb 100644 --- a/UnitySDK/Assets/ML-Agents/Examples/Soccer/Scenes/SoccerTwos.unity +++ b/UnitySDK/Assets/ML-Agents/Examples/Soccer/Scenes/SoccerTwos.unity @@ -647,7 +647,11 @@ MonoBehaviour: timeScale: 2 targetFrameRate: 60 resetParameters: - resetParameters: [] + resetParameters: + - key: ball_scale + value: 7.5 + - key: gravity + value: 9.81 brainStriker: {fileID: 11400000, guid: 29ed78b3e8fef4340b3a1f6954b88f18, type: 2} brainGoalie: {fileID: 11400000, guid: 090fa5a8588f5433bb7f878e6f5ac954, type: 2} redMaterial: {fileID: 2100000, guid: 776dd8b57653342839c3fb5f46ce664e, type: 2} diff --git a/UnitySDK/Assets/ML-Agents/Examples/Soccer/Scripts/AgentSoccer.cs b/UnitySDK/Assets/ML-Agents/Examples/Soccer/Scripts/AgentSoccer.cs old mode 100755 new mode 100644 index fcc4b5ee02..bf2049974f --- a/UnitySDK/Assets/ML-Agents/Examples/Soccer/Scripts/AgentSoccer.cs +++ b/UnitySDK/Assets/ML-Agents/Examples/Soccer/Scripts/AgentSoccer.cs @@ -26,6 +26,12 @@ public enum AgentRole SoccerAcademy academy; Renderer agentRenderer; RayPerception rayPer; + + float[] rayAngles = { 0f, 45f, 90f, 135f, 180f, 110f, 70f }; + string[] detectableObjectsRed = { "ball", "redGoal", "blueGoal", + "wall", "redAgent", "blueAgent" }; + string[] detectableObjectsBlue = { "ball", "blueGoal", "redGoal", + "wall", "blueAgent", "redAgent" }; public void ChooseRandomTeam() { @@ -79,17 +85,14 @@ public override void InitializeAgent() public override void CollectObservations() { float rayDistance = 20f; - float[] rayAngles = { 0f, 45f, 90f, 135f, 180f, 110f, 70f }; string[] detectableObjects; if (team == Team.Red) { - detectableObjects = new[] { "ball", "redGoal", "blueGoal", - "wall", "redAgent", "blueAgent" }; + detectableObjects = detectableObjectsRed; } else { - detectableObjects = new[] { "ball", "blueGoal", "redGoal", - "wall", "blueAgent", "redAgent" }; + detectableObjects = detectableObjectsBlue; } AddVectorObs(rayPer.Perceive(rayDistance, rayAngles, detectableObjects, 0f, 0f)); AddVectorObs(rayPer.Perceive(rayDistance, rayAngles, detectableObjects, 1f, 0f)); @@ -206,6 +209,11 @@ public override void AgentReset() transform.position = area.GetRandomSpawnPos(agentRole, team); agentRb.velocity = Vector3.zero; agentRb.angularVelocity = Vector3.zero; + SetResetParameters(); + } + + public void SetResetParameters() + { area.ResetBall(); } } diff --git a/UnitySDK/Assets/ML-Agents/Examples/Soccer/Scripts/SoccerAcademy.cs b/UnitySDK/Assets/ML-Agents/Examples/Soccer/Scripts/SoccerAcademy.cs index 6102c81def..94002669ce 100644 --- a/UnitySDK/Assets/ML-Agents/Examples/Soccer/Scripts/SoccerAcademy.cs +++ b/UnitySDK/Assets/ML-Agents/Examples/Soccer/Scripts/SoccerAcademy.cs @@ -29,7 +29,7 @@ void Start() } public override void AcademyReset() { - + Physics.gravity = new Vector3(0, 
-resetParameters["gravity"], 0); } public override void AcademyStep() diff --git a/UnitySDK/Assets/ML-Agents/Examples/Soccer/Scripts/SoccerFieldArea.cs b/UnitySDK/Assets/ML-Agents/Examples/Soccer/Scripts/SoccerFieldArea.cs index 5e9de7581c..469fbbee27 100644 --- a/UnitySDK/Assets/ML-Agents/Examples/Soccer/Scripts/SoccerFieldArea.cs +++ b/UnitySDK/Assets/ML-Agents/Examples/Soccer/Scripts/SoccerFieldArea.cs @@ -6,10 +6,10 @@ [System.Serializable] public class PlayerState { - public int playerIndex; - public Rigidbody agentRB; - public Vector3 startingPos; - public AgentSoccer agentScript; + public int playerIndex; + public Rigidbody agentRB; + public Vector3 startingPos; + public AgentSoccer agentScript; public float ballPosReward; } @@ -24,7 +24,7 @@ public class SoccerFieldArea : MonoBehaviour public GameObject ball; [HideInInspector] public Rigidbody ballRB; - public GameObject ground; + public GameObject ground; public GameObject centerPitch; SoccerBallController ballController; public List playerStates = new List(); @@ -49,7 +49,7 @@ public class SoccerFieldArea : MonoBehaviour public IEnumerator GoalScoredSwapGroundMaterial(Material mat, float time) { groundRenderer.material = mat; - yield return new WaitForSeconds(time); + yield return new WaitForSeconds(time); groundRenderer.material = groundMaterial; } @@ -57,7 +57,7 @@ public IEnumerator GoalScoredSwapGroundMaterial(Material mat, float time) void Awake() { academy = FindObjectOfType(); - groundRenderer = centerPitch.GetComponent(); + groundRenderer = centerPitch.GetComponent(); groundMaterial = groundRenderer.material; canResetBall = true; if (goalTextUI) { goalTextUI.SetActive(false); } @@ -152,8 +152,8 @@ public Vector3 GetRandomSpawnPos(AgentSoccer.AgentRole role, AgentSoccer.Team te { xOffset = xOffset * -1f; } - var randomSpawnPos = ground.transform.position + - new Vector3(xOffset, 0f, 0f) + var randomSpawnPos = ground.transform.position + + new Vector3(xOffset, 0f, 0f) + (Random.insideUnitSphere * 2); randomSpawnPos.y = ground.transform.position.y + 2; return randomSpawnPos; @@ -161,8 +161,8 @@ public Vector3 GetRandomSpawnPos(AgentSoccer.AgentRole role, AgentSoccer.Team te public Vector3 GetBallSpawnPosition() { - var randomSpawnPos = ground.transform.position + - new Vector3(0f, 0f, 0f) + var randomSpawnPos = ground.transform.position + + new Vector3(0f, 0f, 0f) + (Random.insideUnitSphere * 2); randomSpawnPos.y = ground.transform.position.y + 2; return randomSpawnPos; @@ -178,5 +178,8 @@ public void ResetBall() ball.transform.position = GetBallSpawnPosition(); ballRB.velocity = Vector3.zero; ballRB.angularVelocity = Vector3.zero; + + var ballScale = academy.resetParameters["ball_scale"]; + ballRB.transform.localScale = new Vector3(ballScale, ballScale, ballScale); } } diff --git a/UnitySDK/Assets/ML-Agents/Examples/Soccer/TFModels/GoalieLearning.nn b/UnitySDK/Assets/ML-Agents/Examples/Soccer/TFModels/GoalieLearning.nn index f4700c929e..3fa9a8fccb 100644 Binary files a/UnitySDK/Assets/ML-Agents/Examples/Soccer/TFModels/GoalieLearning.nn and b/UnitySDK/Assets/ML-Agents/Examples/Soccer/TFModels/GoalieLearning.nn differ diff --git a/UnitySDK/Assets/ML-Agents/Examples/Soccer/TFModels/StrikerLearning.nn b/UnitySDK/Assets/ML-Agents/Examples/Soccer/TFModels/StrikerLearning.nn index 51d9a092d1..36946f2cfc 100644 Binary files a/UnitySDK/Assets/ML-Agents/Examples/Soccer/TFModels/StrikerLearning.nn and b/UnitySDK/Assets/ML-Agents/Examples/Soccer/TFModels/StrikerLearning.nn differ diff --git 
a/UnitySDK/Assets/ML-Agents/Examples/Tennis/Scenes/Tennis.unity b/UnitySDK/Assets/ML-Agents/Examples/Tennis/Scenes/Tennis.unity index 8974fa69d0..6b5be54353 100644 --- a/UnitySDK/Assets/ML-Agents/Examples/Tennis/Scenes/Tennis.unity +++ b/UnitySDK/Assets/ML-Agents/Examples/Tennis/Scenes/Tennis.unity @@ -712,7 +712,13 @@ MonoBehaviour: timeScale: 1 targetFrameRate: 60 resetParameters: - resetParameters: [] + resetParameters: + - key: gravity + value: 9.81 + - key: angle + value: 55 + - key: scale + value: 1 --- !u!1001 &1065879750 Prefab: m_ObjectHideFlags: 0 diff --git a/UnitySDK/Assets/ML-Agents/Examples/Tennis/Scenes/TennisIL.unity b/UnitySDK/Assets/ML-Agents/Examples/Tennis/Scenes/TennisIL.unity index 9c9884b4bb..6e9a2f252f 100644 --- a/UnitySDK/Assets/ML-Agents/Examples/Tennis/Scenes/TennisIL.unity +++ b/UnitySDK/Assets/ML-Agents/Examples/Tennis/Scenes/TennisIL.unity @@ -367,6 +367,7 @@ MonoBehaviour: brain: {fileID: 11400000, guid: 6bf6a586a645b471bb9bd1194ae0e229, type: 2} agentParameters: agentCameras: [] + agentRenderTextures: [] maxStep: 5000 resetOnDone: 1 onDemandDecision: 0 @@ -374,9 +375,9 @@ MonoBehaviour: ball: {fileID: 1114726487} invertX: 0 score: 0 - scoreText: {fileID: 2073469450} myArea: {fileID: 74527003} - opponent: {fileID: 1894084401} + angle: 0 + scale: 0 --- !u!65 &348265184 BoxCollider: m_ObjectHideFlags: 0 @@ -892,7 +893,13 @@ MonoBehaviour: timeScale: 1 targetFrameRate: 60 resetParameters: - resetParameters: [] + resetParameters: + - key: angle + value: 55 + - key: scale + value: 1 + - key: gravity + value: 9.81 --- !u!1 &1114726487 GameObject: m_ObjectHideFlags: 0 @@ -1672,6 +1679,7 @@ MonoBehaviour: brain: {fileID: 11400000, guid: 1674996276be448c2ad51fb139e21e05, type: 2} agentParameters: agentCameras: [] + agentRenderTextures: [] maxStep: 5000 resetOnDone: 1 onDemandDecision: 0 @@ -1679,9 +1687,9 @@ MonoBehaviour: ball: {fileID: 1114726487} invertX: 1 score: 0 - scoreText: {fileID: 1871669621} myArea: {fileID: 74527003} - opponent: {fileID: 348265181} + angle: 0 + scale: 0 --- !u!65 &1894084404 BoxCollider: m_ObjectHideFlags: 0 diff --git a/UnitySDK/Assets/ML-Agents/Examples/Tennis/Scripts/HitWall.cs b/UnitySDK/Assets/ML-Agents/Examples/Tennis/Scripts/HitWall.cs old mode 100755 new mode 100644 index 3da8fd7e1a..adf245b6b9 --- a/UnitySDK/Assets/ML-Agents/Examples/Tennis/Scripts/HitWall.cs +++ b/UnitySDK/Assets/ML-Agents/Examples/Tennis/Scripts/HitWall.cs @@ -126,4 +126,4 @@ private void OnCollisionEnter(Collision collision) lastAgentHit = collision.gameObject.name == "AgentA" ? 
0 : 1; } } -} \ No newline at end of file +} diff --git a/UnitySDK/Assets/ML-Agents/Examples/Tennis/Scripts/TennisAcademy.cs b/UnitySDK/Assets/ML-Agents/Examples/Tennis/Scripts/TennisAcademy.cs old mode 100755 new mode 100644 index 2b68e7dc07..f51de78e10 --- a/UnitySDK/Assets/ML-Agents/Examples/Tennis/Scripts/TennisAcademy.cs +++ b/UnitySDK/Assets/ML-Agents/Examples/Tennis/Scripts/TennisAcademy.cs @@ -9,6 +9,7 @@ public class TennisAcademy : Academy public override void AcademyReset() { + Physics.gravity = new Vector3(0, -resetParameters["gravity"], 0); } public override void AcademyStep() diff --git a/UnitySDK/Assets/ML-Agents/Examples/Tennis/Scripts/TennisAgent.cs b/UnitySDK/Assets/ML-Agents/Examples/Tennis/Scripts/TennisAgent.cs old mode 100755 new mode 100644 index b511630062..183d255be1 --- a/UnitySDK/Assets/ML-Agents/Examples/Tennis/Scripts/TennisAgent.cs +++ b/UnitySDK/Assets/ML-Agents/Examples/Tennis/Scripts/TennisAgent.cs @@ -11,11 +11,14 @@ public class TennisAgent : Agent public bool invertX; public int score; public GameObject myArea; + public float angle; + public float scale; private Text textComponent; private Rigidbody agentRb; private Rigidbody ballRb; private float invertMult; + private ResetParameters resetParams; // Looks for the scoreboard based on the name of the gameObjects. // Do not modify the names of the Score GameObjects @@ -29,6 +32,8 @@ public override void InitializeAgent() ballRb = ball.GetComponent(); var canvas = GameObject.Find(CanvasName); GameObject scoreBoard; + var academy = Object.FindObjectOfType() as Academy; + resetParams = academy.resetParameters; if (invertX) { scoreBoard = canvas.transform.Find(ScoreBoardBName).gameObject; @@ -38,6 +43,7 @@ public override void InitializeAgent() scoreBoard = canvas.transform.Find(ScoreBoardAName).gameObject; } textComponent = scoreBoard.GetComponent(); + SetResetParameters(); } public override void CollectObservations() @@ -58,7 +64,7 @@ public override void AgentAction(float[] vectorAction, string textAction) { var moveX = Mathf.Clamp(vectorAction[0], -1f, 1f) * invertMult; var moveY = Mathf.Clamp(vectorAction[1], -1f, 1f); - + if (moveY > 0.5 && transform.position.y - transform.parent.transform.position.y < -1.5f) { agentRb.velocity = new Vector3(agentRb.velocity.x, 7f, 0f); @@ -66,12 +72,12 @@ public override void AgentAction(float[] vectorAction, string textAction) agentRb.velocity = new Vector3(moveX * 30f, agentRb.velocity.y, 0f); - if (invertX && transform.position.x - transform.parent.transform.position.x < -invertMult || + if (invertX && transform.position.x - transform.parent.transform.position.x < -invertMult || !invertX && transform.position.x - transform.parent.transform.position.x > -invertMult) { - transform.position = new Vector3(-invertMult + transform.parent.transform.position.x, - transform.position.y, - transform.position.z); + transform.position = new Vector3(-invertMult + transform.parent.transform.position.x, + transform.position.y, + transform.position.z); } textComponent.text = score.ToString(); @@ -83,5 +89,29 @@ public override void AgentReset() transform.position = new Vector3(-invertMult * Random.Range(6f, 8f), -1.5f, 0f) + transform.parent.transform.position; agentRb.velocity = new Vector3(0f, 0f, 0f); + + SetResetParameters(); + } + + public void SetRacket() + { + angle = resetParams["angle"]; + gameObject.transform.eulerAngles = new Vector3( + gameObject.transform.eulerAngles.x, + gameObject.transform.eulerAngles.y, + invertMult * angle + ); + } + + public void SetBall() + { + 
scale = resetParams["scale"]; + ball.transform.localScale = new Vector3(scale, scale, scale); + } + + public void SetResetParameters() + { + SetRacket(); + SetBall(); } } diff --git a/UnitySDK/Assets/ML-Agents/Examples/Tennis/TFModels/TennisLearning.nn b/UnitySDK/Assets/ML-Agents/Examples/Tennis/TFModels/TennisLearning.nn index 7ed0efa125..6a8287c873 100644 Binary files a/UnitySDK/Assets/ML-Agents/Examples/Tennis/TFModels/TennisLearning.nn and b/UnitySDK/Assets/ML-Agents/Examples/Tennis/TFModels/TennisLearning.nn differ diff --git a/UnitySDK/Assets/ML-Agents/Examples/Walker/Scenes/Walker.unity b/UnitySDK/Assets/ML-Agents/Examples/Walker/Scenes/Walker.unity index 048e9fe59d..1738095010 100644 --- a/UnitySDK/Assets/ML-Agents/Examples/Walker/Scenes/Walker.unity +++ b/UnitySDK/Assets/ML-Agents/Examples/Walker/Scenes/Walker.unity @@ -715,7 +715,15 @@ MonoBehaviour: timeScale: 1 targetFrameRate: 60 resetParameters: - resetParameters: [] + resetParameters: + - key: gravity + value: 9.81 + - key: hip_mass + value: 15 + - key: chest_mass + value: 8 + - key: spine_mass + value: 10 --- !u!4 &1409355322 Transform: m_ObjectHideFlags: 0 diff --git a/UnitySDK/Assets/ML-Agents/Examples/Walker/Scripts/WalkerAcademy.cs b/UnitySDK/Assets/ML-Agents/Examples/Walker/Scripts/WalkerAcademy.cs old mode 100755 new mode 100644 index d00b98365a..5d4c5f2f04 --- a/UnitySDK/Assets/ML-Agents/Examples/Walker/Scripts/WalkerAcademy.cs +++ b/UnitySDK/Assets/ML-Agents/Examples/Walker/Scripts/WalkerAcademy.cs @@ -19,6 +19,7 @@ public override void InitializeAcademy() public override void AcademyReset() { + Physics.gravity = new Vector3(0, -resetParameters["gravity"], 0); } public override void AcademyStep() diff --git a/UnitySDK/Assets/ML-Agents/Examples/Walker/Scripts/WalkerAgent.cs b/UnitySDK/Assets/ML-Agents/Examples/Walker/Scripts/WalkerAgent.cs old mode 100755 new mode 100644 index 5becda615a..56fb6a3246 --- a/UnitySDK/Assets/ML-Agents/Examples/Walker/Scripts/WalkerAgent.cs +++ b/UnitySDK/Assets/ML-Agents/Examples/Walker/Scripts/WalkerAgent.cs @@ -29,6 +29,12 @@ public class WalkerAgent : Agent bool isNewDecisionStep; int currentDecisionStep; + private Rigidbody hipsRb; + private Rigidbody chestRb; + private Rigidbody spineRb; + + private ResetParameters resetParams; + public override void InitializeAgent() { jdController = GetComponent(); @@ -48,6 +54,15 @@ public override void InitializeAgent() jdController.SetupBodyPart(armR); jdController.SetupBodyPart(forearmR); jdController.SetupBodyPart(handR); + + hipsRb = hips.GetComponent(); + chestRb = chest.GetComponent(); + spineRb = spine.GetComponent(); + + var academy = FindObjectOfType() as WalkerAcademy; + resetParams = academy.resetParameters; + + SetResetParameters(); } /// @@ -184,5 +199,18 @@ public override void AgentReset() isNewDecisionStep = true; currentDecisionStep = 1; + SetResetParameters(); + } + + public void SetTorsoMass() + { + chestRb.mass = resetParams["chest_mass"]; + spineRb.mass = resetParams["spine_mass"]; + hipsRb.mass = resetParams["hip_mass"]; + } + + public void SetResetParameters() + { + SetTorsoMass(); } } diff --git a/UnitySDK/Assets/ML-Agents/Examples/Walker/TFModels/WalkerLearning.nn b/UnitySDK/Assets/ML-Agents/Examples/Walker/TFModels/WalkerLearning.nn index 55899f37a7..759cbb4b1d 100644 Binary files a/UnitySDK/Assets/ML-Agents/Examples/Walker/TFModels/WalkerLearning.nn and b/UnitySDK/Assets/ML-Agents/Examples/Walker/TFModels/WalkerLearning.nn differ diff --git a/UnitySDK/Assets/ML-Agents/Examples/WallJump/TFModels/BigWallJumpLearning.nn 
b/UnitySDK/Assets/ML-Agents/Examples/WallJump/TFModels/BigWallJumpLearning.nn index 2d97135c03..3dbb8babae 100644 Binary files a/UnitySDK/Assets/ML-Agents/Examples/WallJump/TFModels/BigWallJumpLearning.nn and b/UnitySDK/Assets/ML-Agents/Examples/WallJump/TFModels/BigWallJumpLearning.nn differ diff --git a/UnitySDK/Assets/ML-Agents/Examples/WallJump/TFModels/SmallWallJumpLearning.nn b/UnitySDK/Assets/ML-Agents/Examples/WallJump/TFModels/SmallWallJumpLearning.nn index 084c4097f6..630aed3361 100644 Binary files a/UnitySDK/Assets/ML-Agents/Examples/WallJump/TFModels/SmallWallJumpLearning.nn and b/UnitySDK/Assets/ML-Agents/Examples/WallJump/TFModels/SmallWallJumpLearning.nn differ diff --git a/UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/Barracuda/Barracuda.dll b/UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/Barracuda/Barracuda.dll index 0e9660dfb5..a9f15d0a01 100644 Binary files a/UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/Barracuda/Barracuda.dll and b/UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/Barracuda/Barracuda.dll differ diff --git a/UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/Barracuda/Resources/Conv.compute b/UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/Barracuda/Resources/Conv.compute index 84c400f999..89ba4ed07a 100644 --- a/UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/Barracuda/Resources/Conv.compute +++ b/UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/Barracuda/Resources/Conv.compute @@ -1,7 +1,10 @@ #pragma kernel Conv2D #pragma kernel Conv2D_RegisterBlock4x2 -#pragma kernel Conv2D_L1Cached64_RegisterBlock4x4 -#pragma kernel Conv2D_L1Cached32_RegisterBlock4x4 +//#pragma kernel Conv2D_L1Cached64_RegisterBlock4x4 +//#pragma kernel Conv2D_L1Cached32_RegisterBlock4x4 +#pragma kernel Conv2DKernelKxK_T16x16_R4x4 BLOCK_SIZE=4 SUFFIX=KernelKxK_T16x16_R +#pragma kernel Conv2DKernelKxK_StrictC16K64_T16x16_R4x4 BLOCK_SIZE=4 STRICT_CHANNELS=1 SUFFIX=KernelKxK_StrictC16K64_T16x16_R +#pragma kernel Conv2DKernel1x1_StrictC16K64_T16x16_R4x4 BLOCK_SIZE=4 KERNEL_1x1=1 STRICT_CHANNELS=1 SUFFIX=Kernel1x1_StrictC16K64_T16x16_R #pragma kernel DepthwiseConv2D @@ -19,6 +22,249 @@ TENSOR_DECL_RW(O) uint4 _Pad; uint4 _Stride; +#define DEBUG_CHECK_BOUNDS 0 + +// Conv2DBlock64x64_4x4 + index optimizations +// T +// -1|0 -1|0 +// 16: 142|142ms 144|155ms + +float ffma(float a, float b, float c) { return dot(float2(a,c), float2(b,1)); } +#define FUNC_NAME(KERNEL, SUFFIX, SIZE) KERNEL##SUFFIX##SIZE##x##SIZE +#define CACHE_NAME(KERNEL, SUFFIX, SIZE, TENSOR) KERNEL##SUFFIX##SIZE##x##SIZE##_Cache_##TENSOR + +#define KERNEL_NAME Conv2D + +#if BLOCK_SIZE == 4 +#define TRANSPOSED_X 0 +#define BUF_OFFSET 0 +#define CACHE_DEPTH 16 +groupshared float CACHE_NAME(KERNEL_NAME, SUFFIX, BLOCK_SIZE, X)[CACHE_DEPTH*16*BLOCK_SIZE+(1-TRANSPOSED_X)*CACHE_DEPTH]; +groupshared float CACHE_NAME(KERNEL_NAME, SUFFIX, BLOCK_SIZE, W)[CACHE_DEPTH*16*BLOCK_SIZE]; +[numthreads(16,16,1)] +void FUNC_NAME(KERNEL_NAME, SUFFIX, BLOCK_SIZE)(uint3 dispatchThreadID : SV_DispatchThreadID, uint3 groupThreadID : SV_GroupThreadID, uint threadIndex : SV_GroupIndex) +{ + DISPATCH_ARGS(K.kernelCount, O.width * O.height * O.batch, 1); + TENSOR_SHARED2_ARGS4(X, K, B, WBK, O); + + // [W*H, Ky*Kx*In] * [Ky*Kx*In, Out] => [W*H, Out] + + #define X_ CACHE_NAME(KERNEL_NAME, SUFFIX, BLOCK_SIZE, X) + #define W_ CACHE_NAME(KERNEL_NAME, SUFFIX, BLOCK_SIZE, W) + + int x = (int)dispatchThreadID.x * BLOCK_SIZE; // output_channels + int y = (int)dispatchThreadID.y * BLOCK_SIZE; // batch*width*height + int tx = (int)groupThreadID.x; + int 
ty = (int)groupThreadID.y; + int bx = ((int)dispatchThreadID.x - (int)groupThreadID.x) * BLOCK_SIZE; + int by = ((int)dispatchThreadID.y - (int)groupThreadID.y) * BLOCK_SIZE; + int ti = (int)threadIndex; + uint w = O.width; + uint h = O.height; + int channels = X.channels; + int widthX = X.width; + int heightX = X.height; + int strideX = X.channels; + int strideK = K.channels; + int strideO = O.channels; + int offsetX = BUF_OFFSET; + int offsetK = BUF_OFFSET; + int offsetO = BUF_OFFSET; + + float4 dstA[4]; + dstA[0].x = B.Get(x+0); dstA[0].y = B.Get(x+1); dstA[0].z = B.Get(x+2); dstA[0].w = B.Get(x+3); + dstA[1].x = B.Get(x+0); dstA[1].y = B.Get(x+1); dstA[1].z = B.Get(x+2); dstA[1].w = B.Get(x+3); + dstA[2].x = B.Get(x+0); dstA[2].y = B.Get(x+1); dstA[2].z = B.Get(x+2); dstA[2].w = B.Get(x+3); + dstA[3].x = B.Get(x+0); dstA[3].y = B.Get(x+1); dstA[3].z = B.Get(x+2); dstA[3].w = B.Get(x+3); + + int readK = strideK * (ti>>6) + bx + (ti&63) + offsetK; + #if STRICT_CHANNELS == 1 + #else + bool maskK = (bx + (ti&63)) < strideK; + #endif + +#if TRANSPOSED_X == 1 + uint centroidId = by + (ti&63); + #if KERNEL_1x1 == 1 + int readX = strideX * (ti>>6) + centroidId; + #else + int batch = centroidId / w / h; + int topY = (centroidId / w % h) * _Stride.y - _Pad.y; + int leftX = (centroidId % w) * _Stride.x - _Pad.x; + int cornerId = batch * heightX * widthX + topY * widthX + leftX; + int readX = strideX * (ti>>6) + cornerId; + bool mask; + #endif +#else + uint4 centroidId = uint4( + (by + (ti>>4) + 0), + (by + (ti>>4) + 16), + (by + (ti>>4) + 32), + (by + (ti>>4) + 48)); + #if KERNEL_1x1 == 1 + int4 readX = strideX * centroidId + (ti&15); + #else + int4 batch = centroidId / w / h; + int4 topY = (centroidId / w % h) * _Stride.y - _Pad.y; + int4 leftX = (centroidId % w) * _Stride.x - _Pad.x; + int4 cornerId = batch * heightX * widthX + topY * widthX + leftX; + int4 readX = strideX * cornerId + (ti&15); + bool4 mask; + #endif +#endif + +#if KERNEL_1x1 == 1 + { + { +#else + for (int dy = 0; dy < (int)K.GetKernelHeight(); dy++) + { + for (int dx = 0; dx < (int)K.GetKernelWidth(); dx++) + { + int kernelOffsetX = (dy * widthX + dx) * strideX; + mask = + topY + dy >= 0 && + topY + dy < heightX && + leftX + dx >= 0 && + leftX + dx < widthX; +#endif // KERNEL_1x1 + for (int i = 0; i < channels; i += CACHE_DEPTH) + { + #if STRICT_CHANNELS == 1 + #else + if (i + CACHE_DEPTH > channels) + { + int channelRemainder = channels - i; + [unroll] for (int j = 0; j < 4; ++j) + { + bool maskChannelsK = ti < 64 * (channelRemainder - j * 4); + bool maskChannelsX = + #if TRANSPOSED_X == 1 + maskChannelsK; + #else + (ti&15) < channelRemainder; + #endif + + W_[((ti>>6)<<6) + ((ti&3)<<4) + ((ti&63)>>2) + 256*j] = + (maskK & maskChannelsK) ? K.data[readK] : 0; + readK += strideK * max(0, min(channelRemainder - j * 4, 4)); + + #if TRANSPOSED_X == 1 + X_[ti + 256*j] = + #if KERNEL_1x1 == 1 + maskChannelsX ? X.data[readX + strideX * (i + j * 4) + offsetX]: 0; + #else + (mask && maskChannelsX) ? X.data[readX + strideX * (i + j * 4) + kernelOffsetX + offsetX]: 0; + #endif + #else + X_[(ti>>4) + 65*(ti&15) + 16*j] = + #if KERNEL_1x1 == 1 + maskChannelsX ? X.data[readX[j] + i + offsetX]: 0; + #else + (mask[j] && maskChannelsX) ? X.data[readX[j] + i + kernelOffsetX + offsetX]: 0; + #endif + #endif + } + } + else + #endif + [unroll] for (int j = 0; j < 4; ++j) + { + W_[((ti>>6)<<6) + ((ti&3)<<4) + ((ti&63)>>2) + 256*j] = + #if STRICT_CHANNELS == 1 + K.data[readK]; + #else + maskK ? 
K.data[readK]: 0; + #endif + readK += strideK * 4; + + #if TRANSPOSED_X == 1 + X_[ti + 256*j] = + #if KERNEL_1x1 == 1 + X.data[readX + strideX * (i + j * 4) + offsetX]; + #else + mask ? X.data[readX + strideX * (i + j * 4) + kernelOffsetX + offsetX]: 0; + #endif + #else + X_[(ti>>4) + 65*(ti&15) + 16*j] = + #if KERNEL_1x1 == 1 + X.data[readX[j] + i + offsetX]; + #else + mask[j] ? X.data[readX[j] + i + kernelOffsetX + offsetX]: 0; + #endif + #endif + + #if DEBUG_CHECK_BOUNDS && KERNEL_1x1 == 0 + if (mask[j] && readX[j] + i + kernelOffsetX < 0) + X_[(ti>>4) + 65*(ti&15) + 16*j] = -1; + if (mask[j] && readX[j] + i + kernelOffsetX >= X.GetLength()) + X_[(ti>>4) + 65*(ti&15) + 16*j] = -1; + #endif + } + + GroupMemoryBarrierWithGroupSync(); + + int4 idX = int4(0,1,2,3); + int4 idW = int4(0,16,32,48); + int incX = 64 + (1-TRANSPOSED_X); + int incW = 64; + + for (int di = 0; di < CACHE_DEPTH; di++) + { + float4 srcX = float4( + X_[idX.x + ty*4], + X_[idX.y + ty*4], + X_[idX.z + ty*4], + X_[idX.w + ty*4]); + float4 srcW = float4( + W_[idW.x + tx], + W_[idW.y + tx], + W_[idW.z + tx], + W_[idW.w + tx] + ); + idX += incX; + idW += incW; + + dstA[0].x = ffma(srcX.x, srcW.x, dstA[0].x); + dstA[0].y = ffma(srcX.x, srcW.y, dstA[0].y); + dstA[0].z = ffma(srcX.x, srcW.z, dstA[0].z); + dstA[0].w = ffma(srcX.x, srcW.w, dstA[0].w); + + dstA[1].x = ffma(srcX.y, srcW.x, dstA[1].x); + dstA[1].y = ffma(srcX.y, srcW.y, dstA[1].y); + dstA[1].z = ffma(srcX.y, srcW.z, dstA[1].z); + dstA[1].w = ffma(srcX.y, srcW.w, dstA[1].w); + + dstA[2].x = ffma(srcX.z, srcW.x, dstA[2].x); + dstA[2].y = ffma(srcX.z, srcW.y, dstA[2].y); + dstA[2].z = ffma(srcX.z, srcW.z, dstA[2].z); + dstA[2].w = ffma(srcX.z, srcW.w, dstA[2].w); + + dstA[3].x = ffma(srcX.w, srcW.x, dstA[3].x); + dstA[3].y = ffma(srcX.w, srcW.y, dstA[3].y); + dstA[3].z = ffma(srcX.w, srcW.z, dstA[3].z); + dstA[3].w = ffma(srcX.w, srcW.w, dstA[3].w); + } + + GroupMemoryBarrierWithGroupSync(); + } + } + } + + [unroll] for (int sy = 0; sy < 4 && y+sy < (int)w * (int)h * (int)O.batch; ++sy) + [unroll] for (int sx = 0; sx < 4 && x+sx < strideO; ++sx) + O.data[strideO * (y+sy) + x+sx + offsetO] = dstA[sy][sx]; + + #undef X_ + #undef W_ +} +#else +#endif +#undef TRANSPOSED_X +#undef CACHE_DEPTH +#undef BUF_OFFSET +#undef KERNEL_NAME + NUMTHREADS((16,4,4), (8,4,4), (4,4,4)) void Conv2D(uint3 dispatchThreadID : SV_DispatchThreadID) { @@ -178,7 +424,7 @@ CONV2D_L1CACHED(64,4, fastfma) CONV2D_L1CACHED(32,4, fastfma) - +// IDEA: iterate over channels in the inner loop - needs channels first layout NUMTHREADS((16,4,4), (8,4,4), (4,4,4)) void DepthwiseConv2D(uint3 dispatchThreadID : SV_DispatchThreadID) { diff --git a/UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/Barracuda/Resources/Dense.compute b/UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/Barracuda/Resources/Dense.compute index f39086dd02..bc5f328db9 100644 --- a/UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/Barracuda/Resources/Dense.compute +++ b/UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/Barracuda/Resources/Dense.compute @@ -3,6 +3,10 @@ #pragma kernel DenseTiled32x32 #pragma kernel DenseTiled64x64 +//#pragma kernel Dense_T8x8_R8x8 DENSE=1 BLOCK_SIZE=8 +#pragma kernel Dense_T8x8_R4x4 DENSE=1 BLOCK_SIZE=4 +#pragma kernel Dense_T16x16_R4x4 DENSE=1 BLOCK_SIZE=4 + #include "Tensor.cginc" TENSOR_DECL(X) @@ -11,6 +15,1369 @@ TENSOR_DECL(B) TENSOR_DECL(WBK) TENSOR_DECL_RW(O) +#if DENSE +float ffma(float a, float b, float c) { return dot(float2(a,c), float2(b,1)); } //return a*b+c;} //fastfma(a,b,c); } +#define 
FUNC_NAME(KERNEL, SIZE) KERNEL##SIZE##x##SIZE +#define CACHE_NAME(KERNEL, SIZE, TENSOR) KERNEL##SIZE##x##SIZE##_Cache_##TENSOR + + +//CACHE_DEPTH +// T >>X +//16: 178ms 272ms 181ms +// 8: 173ms 395ms 205ms +// 4: 176ms 630ms 260ms +// 2: 205ms 495ms 420ms +// 1: 209ms 980ms -- + + +//@HARDCODED_DIMS + BUF_OFFSET + lds read index alu opt +//CACHE_DEPTH +// T >>X +//16: 169ms 241ms 173ms +// 8: 169ms 356ms 178ms +// 4: 170ms 612ms 209ms +// 2: 178ms 900ms 380ms +// 1: 250ms 875ms -- + +//@BLOCKED_W + HARDCODED_DIMS + BUF_OFFSET + lds read index alu opt +//!INCLUDING ValidateData by mistake! +//CACHE_DEPTH +// T >>X +//16: 144ms 241ms 155ms +// 8: 158ms 357ms 164ms +// 4: 151ms 630ms 202ms +// 2: 180ms 815ms 350ms +// 1: 258ms 883ms -- +// @TODO: try 32 + + +//============================================ +//@BLOCKED_W + BUF_OFFSET + lds read index alu opt +//CACHE_DEPTH +// T T >>X +// hard_dims +//32: 167ms +//16: 122ms 141ms 140ms +// 8: 136ms 147ms 154ms +// 4: 130ms 141ms 189ms +// 2: 159ms ***ms ***ms +// 1: 220ms ***ms ***ms +// +//Vega +//32: 172ms +//16: 154ms +// 8: 156ms +// 4: 161ms +// 2: 162ms +// 1: 245ms +//iOS(8layers) +//32: 28ms + + +//@BLOCKED_W + lds read index alu opt +//16: 134ms 142ms 146ms + + +//@BLOCKED_W + BUF_OFFSET + optimized read indices +//CACHE_DEPTH +//16: 123ms 131ms 135ms + + +#define KERNEL_NAME Dense_T16x16_R +#if BLOCK_SIZE == 4 +#define TRANSPOSED_X 0 +#define SHIFTED_X 1 +#define BLOCKED_W 1 +#define HARDCODED_DIMS 0 +#define BUF_OFFSET 0 +#define DOUBLE_BUFFER_LDS_READS 0 +#define CACHE_DEPTH 16 +groupshared float CACHE_NAME(KERNEL_NAME, BLOCK_SIZE, X)[CACHE_DEPTH*16*BLOCK_SIZE+SHIFTED_X*CACHE_DEPTH]; +groupshared float CACHE_NAME(KERNEL_NAME, BLOCK_SIZE, W)[CACHE_DEPTH*16*BLOCK_SIZE]; +[numthreads(16,16,1)] +void FUNC_NAME(KERNEL_NAME, BLOCK_SIZE)(uint3 dispatchThreadID : SV_DispatchThreadID, uint3 groupThreadID : SV_GroupThreadID, uint threadIndex : SV_GroupIndex) +{ + DISPATCH_ARGS(O.flatWidth, O.flatHeight, 1); + TENSOR_SHARED2_ARGS4(X, W, B, WBK, O); + + int x = (int)dispatchThreadID.x * BLOCK_SIZE; + int y = (int)dispatchThreadID.y * BLOCK_SIZE; + int tx = (int)groupThreadID.x; + int ty = (int)groupThreadID.y; + int bx = ((int)dispatchThreadID.x - (int)groupThreadID.x) * BLOCK_SIZE; + int by = ((int)dispatchThreadID.y - (int)groupThreadID.y) * BLOCK_SIZE; + int ti = (int)threadIndex; + int n = (int)X.GetFlatWidth(); + int strideX = (int)X.GetFlatWidth(); + int strideW = (int)W.GetFlatWidth(); + int strideO = (int)O.GetFlatWidth(); + int offsetX = BUF_OFFSET; + int offsetW = BUF_OFFSET; + int offsetO = BUF_OFFSET; +#if HARDCODED_DIMS == 1 + n = 1024; + strideX = 1024; + strideW = 1024; + strideO = 1024; +#endif + + #define X_ CACHE_NAME(KERNEL_NAME, BLOCK_SIZE, X) + #define W_ CACHE_NAME(KERNEL_NAME, BLOCK_SIZE, W) + + //if (x >= (int)O.GetFlatWidth()) return; + //if (y >= (int)O.GetFlatHeight()) return; + + float4 dstA_0, dstA_1, dstA_2, dstA_3; + + dstA_0.x = B.Get(x+0); + dstA_1.x = B.Get(x+0); + dstA_2.x = B.Get(x+0); + dstA_3.x = B.Get(x+0); + dstA_0.y = B.Get(x+1); + dstA_1.y = B.Get(x+1); + dstA_2.y = B.Get(x+1); + dstA_3.y = B.Get(x+1); + dstA_0.z = B.Get(x+2); + dstA_1.z = B.Get(x+2); + dstA_2.z = B.Get(x+2); + dstA_3.z = B.Get(x+2); + dstA_0.w = B.Get(x+3); + dstA_1.w = B.Get(x+3); + dstA_2.w = B.Get(x+3); + dstA_3.w = B.Get(x+3); + + int j; + int readW = strideW * (ti>>6) + bx + (ti&63) + offsetW; + #if TRANSPOSED_X == 1 + int readX = strideX * (ti>>6) + by + (ti&63) + offsetX; + #elif SHIFTED_X == 1 + int4 readX = int4( + strideX * 
(by + (ti>>4) + 0) + (ti&15) + offsetX, + strideX * (by + (ti>>4) +16) + (ti&15) + offsetX, + strideX * (by + (ti>>4) +32) + (ti&15) + offsetX, + strideX * (by + (ti>>4) +48) + (ti&15) + offsetX); + #endif + + for (int i = 0; i < n; i += CACHE_DEPTH) + { + + #if CACHE_DEPTH == 32 + #if BLOCKED_W == 1 + W_[((ti>>6)<<6) + ((ti&3)<<4) + ((ti&63)>>2) ] = W.data[strideW * (i + (ti>>6) + 0) + bx + (ti&63) + offsetW]; + W_[((ti>>6)<<6) + ((ti&3)<<4) + ((ti&63)>>2)+256] = W.data[strideW * (i + (ti>>6) + 4) + bx + (ti&63) + offsetW]; + W_[((ti>>6)<<6) + ((ti&3)<<4) + ((ti&63)>>2)+512] = W.data[strideW * (i + (ti>>6) + 8) + bx + (ti&63) + offsetW]; + W_[((ti>>6)<<6) + ((ti&3)<<4) + ((ti&63)>>2)+768] = W.data[strideW * (i + (ti>>6) +12) + bx + (ti&63) + offsetW]; + W_[((ti>>6)<<6) + ((ti&3)<<4) + ((ti&63)>>2)+1024]= W.data[strideW * (i + (ti>>6) +16) + bx + (ti&63) + offsetW]; + W_[((ti>>6)<<6) + ((ti&3)<<4) + ((ti&63)>>2)+1280]= W.data[strideW * (i + (ti>>6) +20) + bx + (ti&63) + offsetW]; + W_[((ti>>6)<<6) + ((ti&3)<<4) + ((ti&63)>>2)+1536]= W.data[strideW * (i + (ti>>6) +24) + bx + (ti&63) + offsetW]; + W_[((ti>>6)<<6) + ((ti&3)<<4) + ((ti&63)>>2)+1792]= W.data[strideW * (i + (ti>>6) +28) + bx + (ti&63) + offsetW]; + #else + #endif + + #if TRANSPOSED_X == 1 + X_[ti ] = X.data[strideX * (i + (ti>>6) + 0) + by + (ti&63) + offsetX]; + X_[ti+256] = X.data[strideX * (i + (ti>>6) + 4) + by + (ti&63) + offsetX]; + X_[ti+512] = X.data[strideX * (i + (ti>>6) + 8) + by + (ti&63) + offsetX]; + X_[ti+768] = X.data[strideX * (i + (ti>>6) +12) + by + (ti&63) + offsetX]; + X_[ti+1024]= X.data[strideX * (i + (ti>>6) +16) + by + (ti&63) + offsetX]; + X_[ti+1280]= X.data[strideX * (i + (ti>>6) +20) + by + (ti&63) + offsetX]; + X_[ti+1536]= X.data[strideX * (i + (ti>>6) +24) + by + (ti&63) + offsetX]; + X_[ti+1792]= X.data[strideX * (i + (ti>>6) +28) + by + (ti&63) + offsetX]; + #elif SHIFTED_X == 1 + // 16x64 => 64x16 + X_[(ti>>5) + 65*(ti&31) + 0] = X.data[strideX * (by + (ti>>5) + 0) + i + (ti&31) + offsetX]; + X_[(ti>>5) + 65*(ti&31) + 8] = X.data[strideX * (by + (ti>>5) + 8) + i + (ti&31) + offsetX]; + X_[(ti>>5) + 65*(ti&31) +16] = X.data[strideX * (by + (ti>>5) +16) + i + (ti&31) + offsetX]; + X_[(ti>>5) + 65*(ti&31) +24] = X.data[strideX * (by + (ti>>5) +24) + i + (ti&31) + offsetX]; + X_[(ti>>5) + 65*(ti&31) +32] = X.data[strideX * (by + (ti>>5) +32) + i + (ti&31) + offsetX]; + X_[(ti>>5) + 65*(ti&31) +40] = X.data[strideX * (by + (ti>>5) +40) + i + (ti&31) + offsetX]; + X_[(ti>>5) + 65*(ti&31) +48] = X.data[strideX * (by + (ti>>5) +48) + i + (ti&31) + offsetX]; + X_[(ti>>5) + 65*(ti&31) +56] = X.data[strideX * (by + (ti>>5) +56) + i + (ti&31) + offsetX]; + #else + // 16x64 => 64x16 + #endif + + + #elif CACHE_DEPTH == 16 + #if BLOCKED_W == 1 + #if HARDCODED_DIMS + W_[((ti>>6)<<6) + ((ti&3)<<4) + ((ti&63)>>2) ] = W.data[strideW * (i + (ti>>6) + 0) + bx + (ti&63) + offsetW]; + W_[((ti>>6)<<6) + ((ti&3)<<4) + ((ti&63)>>2)+256] = W.data[strideW * (i + (ti>>6) + 4) + bx + (ti&63) + offsetW]; + W_[((ti>>6)<<6) + ((ti&3)<<4) + ((ti&63)>>2)+512] = W.data[strideW * (i + (ti>>6) + 8) + bx + (ti&63) + offsetW]; + W_[((ti>>6)<<6) + ((ti&3)<<4) + ((ti&63)>>2)+768] = W.data[strideW * (i + (ti>>6) +12) + bx + (ti&63) + offsetW]; + #else + [unroll] for (j = 0; j < 4; ++j, readW += strideW * 4) + W_[((ti>>6)<<6) + ((ti&3)<<4) + ((ti&63)>>2) + 256*j] = W.data[readW]; + #endif + #else + W_[ti ] = W.data[strideW * (i + (ti>>6) + 0) + bx + (ti&63) + offsetW]; + W_[ti+256] = W.data[strideW * (i + (ti>>6) + 4) + bx + (ti&63) + 
offsetW]; + W_[ti+512] = W.data[strideW * (i + (ti>>6) + 8) + bx + (ti&63) + offsetW]; + W_[ti+768] = W.data[strideW * (i + (ti>>6) +12) + bx + (ti&63) + offsetW]; + #endif + + #if TRANSPOSED_X == 1 + #if HARDCODED_DIMS + X_[ti ] = X.data[strideX * (i + (ti>>6) + 0) + by + (ti&63) + offsetX]; + X_[ti+256] = X.data[strideX * (i + (ti>>6) + 4) + by + (ti&63) + offsetX]; + X_[ti+512] = X.data[strideX * (i + (ti>>6) + 8) + by + (ti&63) + offsetX]; + X_[ti+768] = X.data[strideX * (i + (ti>>6) +12) + by + (ti&63) + offsetX]; + #else + [unroll] for (j = 0; j < 4; ++j, readX += strideX * 4) + X_[ti + 256*j] = X.data[readX]; + #endif + + #elif SHIFTED_X == 1 + // 16x64 => 64x16 + #if HARDCODED_DIMS + X_[(ti>>4) + 65*(ti&15) + 0] = X.data[strideX * (by + (ti>>4) + 0) + i + (ti&15) + offsetX]; + X_[(ti>>4) + 65*(ti&15) +16] = X.data[strideX * (by + (ti>>4) +16) + i + (ti&15) + offsetX]; + X_[(ti>>4) + 65*(ti&15) +32] = X.data[strideX * (by + (ti>>4) +32) + i + (ti&15) + offsetX]; + X_[(ti>>4) + 65*(ti&15) +48] = X.data[strideX * (by + (ti>>4) +48) + i + (ti&15) + offsetX]; + #else + [unroll] for (j = 0; j < 4; ++j) + X_[(ti>>4) + 65*(ti&15) + 16*j] = X.data[readX[j]]; + readX += CACHE_DEPTH; + #endif + #else + // 16x64 => 64x16 + X_[ti ] = X.data[strideX * (by + (ti&63)) + i + (ti>>6) + 0 + offsetX]; + X_[ti+256] = X.data[strideX * (by + (ti&63)) + i + (ti>>6) + 4 + offsetX]; + X_[ti+512] = X.data[strideX * (by + (ti&63)) + i + (ti>>6) + 8 + offsetX]; + X_[ti+768] = X.data[strideX * (by + (ti&63)) + i + (ti>>6) +12 + offsetX]; + #endif + + #elif CACHE_DEPTH == 8 + #if BLOCKED_W == 1 + W_[((ti>>6)<<6) + ((ti&3)<<4) + ((ti&63)>>2) ] = W.data[strideW * (i + (ti>>6) + 0) + bx + (ti&63) + offsetW]; + W_[((ti>>6)<<6) + ((ti&3)<<4) + ((ti&63)>>2)+256] = W.data[strideW * (i + (ti>>6) + 4) + bx + (ti&63) + offsetW]; + #else + W_[ti ] = W.data[strideW * (i + (ti>>6) + 0) + bx + (ti&63) + offsetW]; + W_[ti+256] = W.data[strideW * (i + (ti>>6) + 4) + bx + (ti&63) + offsetW]; + #endif + + #if TRANSPOSED_X == 1 + X_[ti ] = X.data[strideX * (i + (ti>>6) + 0) + by + (ti&63) + offsetX]; + X_[ti+256] = X.data[strideX * (i + (ti>>6) + 4) + by + (ti&63) + offsetX]; + #elif SHIFTED_X == 1 + // 8x64 => 64x8 + X_[(ti>>3) + 65*(ti&7) + 0] = X.data[strideX * (by + (ti>>3) + 0) + i + (ti&7) + offsetX]; + X_[(ti>>3) + 65*(ti&7) +32] = X.data[strideX * (by + (ti>>3) +32) + i + (ti&7) + offsetX]; + #else + // 8x64 => 64x8 + X_[ti ] = X.data[strideX * (by + (ti&63)) + i + (ti>>6) + 0 + offsetX]; + X_[ti+256] = X.data[strideX * (by + (ti&63)) + i + (ti>>6) + 4 + offsetX]; + #endif + + #elif CACHE_DEPTH == 4 + #if BLOCKED_W == 1 + W_[((ti>>6)<<6) + ((ti&3)<<4) + ((ti&63)>>2) ] = W.data[strideW * (i + (ti>>6) + 0) + bx + (ti&63) + offsetW]; + #else + W_[ti ] = W.data[strideW * (i + (ti>>6) + 0) + bx + (ti&63) + offsetW]; + #endif + #if TRANSPOSED_X == 1 + X_[ti ] = X.data[strideX * (i + (ti>>6) + 0) + by + (ti&63) + offsetX]; + #elif SHIFTED_X == 1 + // 4x64 => 64x4 + X_[(ti>>2) + 65*(ti&3) + 0] = X.data[strideX * (by + (ti>>2) + 0) + i + (ti&3) + offsetX]; + #else + // 4x64 => 64x4 + X_[ti ] = X.data[strideX * (by + (ti&63)) + i + (ti>>6) + 0 + offsetX]; + #endif + + #elif CACHE_DEPTH == 2 + if (ti < 128) + { + #if BLOCKED_W == 1 + W_[((ti>>6)<<6) + ((ti&3)<<4) + ((ti&63)>>2) ] = W.data[strideW * (i + (ti>>6) + 0) + bx + (ti&63) + offsetW]; + #else + W_[ti ] = W.data[strideW * (i + (ti>>6) + 0) + bx + (ti&63) + offsetW]; + #endif + #if TRANSPOSED_X == 1 + X_[ti ] = X.data[strideX * (i + (ti>>6) + 0) + by + (ti&63) + offsetX]; + 
#elif SHIFTED_X == 1 + X_[(ti>>1) + 65*(ti&1) + 0] = X.data[strideX * (by + (ti>>1) + 0) + i + (ti&1) + offsetX]; + #else + X_[ti ] = X.data[strideX * (by + (ti&63)) + i + (ti>>6) + 0 + offsetX]; + #endif + } + + #elif CACHE_DEPTH == 1 + if (ti < 64) + { + #if BLOCKED_W == 1 + W_[((ti&3)<<4) + ((ti&63)>>2) ] = W.data[strideW * i + bx + ti + offsetW]; + #else + W_[ti] = W.data[strideW * i + bx + ti + offsetW]; + #endif + #if TRANSPOSED_X == 1 + X_[ti] = X.data[strideX * i + by + ti + offsetX]; + #else + //X_[ti] = X.Get(by+ti, i); + X_[ti] = X.data[strideX * (by + ti) + i + offsetX]; + #endif + } + #endif + + GroupMemoryBarrierWithGroupSync(); + + int4 idX = int4(0,1,2,3); + int4 idW = int4(0,1,2,3); + #if BLOCKED_W == 1 + idW = int4(0,16,32,48); + #endif + int incX = 64 + (SHIFTED_X & ~TRANSPOSED_X); + int incW = 64; +#if 0 //DOUBLE_BUFFER_LDS_READS == 1 + float4 srcW_ = float4( + #if BLOCKED_W == 1 + W_[idW.x + tx], + W_[idW.y + tx], + W_[idW.z + tx], + W_[idW.w + tx] + #else + W_[idW.x + tx*4], + W_[idW.y + tx*4], + W_[idW.z + tx*4], + W_[idW.w + tx*4] + #endif + ); + idW += incW; + + //int lastX = idX.x + (CACHE_DEPTH - 2) * incX.x; + //while (idX.x < lastX.x) + for (int di = 0; di < CACHE_DEPTH - 2; di+=2) + { + float4 srcX, srcW; + srcX = float4( + X_[idX.x + ty*4], + X_[idX.y + ty*4], + X_[idX.z + ty*4], + X_[idX.w + ty*4]); + srcW = float4( + #if BLOCKED_W == 1 + W_[idW.x + tx], + W_[idW.y + tx], + W_[idW.z + tx], + W_[idW.w + tx] + #else + W_[idW.x + tx*4], + W_[idW.y + tx*4], + W_[idW.z + tx*4], + W_[idW.w + tx*4] + #endif + ); + idX += incX; + idW += incW; + + dstA_0.x = ffma(srcX.x, srcW_.x, dstA_0.x); + dstA_0.y = ffma(srcX.x, srcW_.y, dstA_0.y); + dstA_0.z = ffma(srcX.x, srcW_.z, dstA_0.z); + dstA_0.w = ffma(srcX.x, srcW_.w, dstA_0.w); + + dstA_1.x = ffma(srcX.y, srcW_.x, dstA_1.x); + dstA_1.y = ffma(srcX.y, srcW_.y, dstA_1.y); + dstA_1.z = ffma(srcX.y, srcW_.z, dstA_1.z); + dstA_1.w = ffma(srcX.y, srcW_.w, dstA_1.w); + + dstA_2.x = ffma(srcX.z, srcW_.x, dstA_2.x); + dstA_2.y = ffma(srcX.z, srcW_.y, dstA_2.y); + dstA_2.z = ffma(srcX.z, srcW_.z, dstA_2.z); + dstA_2.w = ffma(srcX.z, srcW_.w, dstA_2.w); + + dstA_3.x = ffma(srcX.w, srcW_.x, dstA_3.x); + dstA_3.y = ffma(srcX.w, srcW_.y, dstA_3.y); + dstA_3.z = ffma(srcX.w, srcW_.z, dstA_3.z); + dstA_3.w = ffma(srcX.w, srcW_.w, dstA_3.w); + + srcX = float4( + X_[idX.x + ty*4], + X_[idX.y + ty*4], + X_[idX.z + ty*4], + X_[idX.w + ty*4]); + srcW_ = float4( + #if BLOCKED_W == 1 + W_[idW.x + tx], + W_[idW.y + tx], + W_[idW.z + tx], + W_[idW.w + tx] + #else + W_[idW.x + tx*4], + W_[idW.y + tx*4], + W_[idW.z + tx*4], + W_[idW.w + tx*4] + #endif + ); + idX += incX; + idW += incW; + + dstA_0.x = ffma(srcX.x, srcW.x, dstA_0.x); + dstA_0.y = ffma(srcX.x, srcW.y, dstA_0.y); + dstA_0.z = ffma(srcX.x, srcW.z, dstA_0.z); + dstA_0.w = ffma(srcX.x, srcW.w, dstA_0.w); + + dstA_1.x = ffma(srcX.y, srcW.x, dstA_1.x); + dstA_1.y = ffma(srcX.y, srcW.y, dstA_1.y); + dstA_1.z = ffma(srcX.y, srcW.z, dstA_1.z); + dstA_1.w = ffma(srcX.y, srcW.w, dstA_1.w); + + dstA_2.x = ffma(srcX.z, srcW.x, dstA_2.x); + dstA_2.y = ffma(srcX.z, srcW.y, dstA_2.y); + dstA_2.z = ffma(srcX.z, srcW.z, dstA_2.z); + dstA_2.w = ffma(srcX.z, srcW.w, dstA_2.w); + + dstA_3.x = ffma(srcX.w, srcW.x, dstA_3.x); + dstA_3.y = ffma(srcX.w, srcW.y, dstA_3.y); + dstA_3.z = ffma(srcX.w, srcW.z, dstA_3.z); + dstA_3.w = ffma(srcX.w, srcW.w, dstA_3.w); + } + + float4 srcX = float4( + X_[idX.x + ty*4], + X_[idX.y + ty*4], + X_[idX.z + ty*4], + X_[idX.w + ty*4]); + float4 srcW = float4( + #if 
BLOCKED_W == 1 + W_[idW.x + tx], + W_[idW.y + tx], + W_[idW.z + tx], + W_[idW.w + tx] + #else + W_[idW.x + tx*4], + W_[idW.y + tx*4], + W_[idW.z + tx*4], + W_[idW.w + tx*4] + #endif + ); + + dstA_0.x = ffma(srcX.x, srcW_.x, dstA_0.x); + dstA_0.y = ffma(srcX.x, srcW_.y, dstA_0.y); + dstA_0.z = ffma(srcX.x, srcW_.z, dstA_0.z); + dstA_0.w = ffma(srcX.x, srcW_.w, dstA_0.w); + + dstA_1.x = ffma(srcX.y, srcW_.x, dstA_1.x); + dstA_1.y = ffma(srcX.y, srcW_.y, dstA_1.y); + dstA_1.z = ffma(srcX.y, srcW_.z, dstA_1.z); + dstA_1.w = ffma(srcX.y, srcW_.w, dstA_1.w); + + dstA_2.x = ffma(srcX.z, srcW_.x, dstA_2.x); + dstA_2.y = ffma(srcX.z, srcW_.y, dstA_2.y); + dstA_2.z = ffma(srcX.z, srcW_.z, dstA_2.z); + dstA_2.w = ffma(srcX.z, srcW_.w, dstA_2.w); + + dstA_3.x = ffma(srcX.w, srcW_.x, dstA_3.x); + dstA_3.y = ffma(srcX.w, srcW_.y, dstA_3.y); + dstA_3.z = ffma(srcX.w, srcW_.z, dstA_3.z); + dstA_3.w = ffma(srcX.w, srcW_.w, dstA_3.w); + + srcX = float4( + X_[idX.x + ty*4], + X_[idX.y + ty*4], + X_[idX.z + ty*4], + X_[idX.w + ty*4]); + idX += incX; + + dstA_0.x = ffma(srcX.x, srcW.x, dstA_0.x); + dstA_0.y = ffma(srcX.x, srcW.y, dstA_0.y); + dstA_0.z = ffma(srcX.x, srcW.z, dstA_0.z); + dstA_0.w = ffma(srcX.x, srcW.w, dstA_0.w); + + dstA_1.x = ffma(srcX.y, srcW.x, dstA_1.x); + dstA_1.y = ffma(srcX.y, srcW.y, dstA_1.y); + dstA_1.z = ffma(srcX.y, srcW.z, dstA_1.z); + dstA_1.w = ffma(srcX.y, srcW.w, dstA_1.w); + + dstA_2.x = ffma(srcX.z, srcW.x, dstA_2.x); + dstA_2.y = ffma(srcX.z, srcW.y, dstA_2.y); + dstA_2.z = ffma(srcX.z, srcW.z, dstA_2.z); + dstA_2.w = ffma(srcX.z, srcW.w, dstA_2.w); + + dstA_3.x = ffma(srcX.w, srcW.x, dstA_3.x); + dstA_3.y = ffma(srcX.w, srcW.y, dstA_3.y); + dstA_3.z = ffma(srcX.w, srcW.z, dstA_3.z); + dstA_3.w = ffma(srcX.w, srcW.w, dstA_3.w); + + + GroupMemoryBarrierWithGroupSync(); + } +#else // DOUBLE_BUFFER_LDS_READS + +#define CACHE_UNROLL 1 + for (int di = 0; di < CACHE_DEPTH; di+=CACHE_UNROLL) + { + float4 srcX = float4( + X_[idX.x + /*ti+0**/ ty*4], + X_[idX.y + /*ti+0**/ ty*4], + X_[idX.z + /*ti+0**/ ty*4], + X_[idX.w + /*ti+0**/ ty*4]); + //X_[di*_64 + ty*4 + 0], + //X_[di*_64 + ty*4 + 1], + //X_[di*_64 + ty*4 + 2], + //X_[di*_64 + ty*4 + 3]); + //X.Get(y+0, i+di), + //X.Get(y+1, i+di), + //X.Get(y+2, i+di), + //X.Get(y+3, i+di)); + float4 srcW = float4( + #if BLOCKED_W == 1 + W_[idW.x + tx], + W_[idW.y + tx], + W_[idW.z + tx], + W_[idW.w + tx] + #else + W_[idW.x + tx*4], + W_[idW.y + tx*4], + W_[idW.z + tx*4], + W_[idW.w + tx*4] + #endif + //W_[di*64 + tx*4 + 0], + //W_[di*64 + tx*4 + 1], + //W_[di*64 + tx*4 + 2], + //W_[di*64 + tx*4 + 3] + //W.Get(i+di, x+0), + //W.Get(i+di, x+1), + //W.Get(i+di, x+2), + //W.Get(i+di, x+3) + ); + idX += incX; + idW += incW; + + dstA_0.x = ffma(srcX.x, srcW.x, dstA_0.x); + dstA_0.y = ffma(srcX.x, srcW.y, dstA_0.y); + dstA_0.z = ffma(srcX.x, srcW.z, dstA_0.z); + dstA_0.w = ffma(srcX.x, srcW.w, dstA_0.w); + + dstA_1.x = ffma(srcX.y, srcW.x, dstA_1.x); + dstA_1.y = ffma(srcX.y, srcW.y, dstA_1.y); + dstA_1.z = ffma(srcX.y, srcW.z, dstA_1.z); + dstA_1.w = ffma(srcX.y, srcW.w, dstA_1.w); + + dstA_2.x = ffma(srcX.z, srcW.x, dstA_2.x); + dstA_2.y = ffma(srcX.z, srcW.y, dstA_2.y); + dstA_2.z = ffma(srcX.z, srcW.z, dstA_2.z); + dstA_2.w = ffma(srcX.z, srcW.w, dstA_2.w); + + dstA_3.x = ffma(srcX.w, srcW.x, dstA_3.x); + dstA_3.y = ffma(srcX.w, srcW.y, dstA_3.y); + dstA_3.z = ffma(srcX.w, srcW.z, dstA_3.z); + dstA_3.w = ffma(srcX.w, srcW.w, dstA_3.w); + +#if CACHE_UNROLL>=2 +#endif +#if CACHE_UNROLL>=3 +#endif +#if CACHE_UNROLL>=4 +#endif + } + + 
GroupMemoryBarrierWithGroupSync(); + } +#undef CACHE_UNROLL +#endif //DOUBLE_BUFFER_LDS_READS + + O.data[strideO * (y+0) + x+0 + offsetO] = dstA_0.x; + O.data[strideO * (y+0) + x+1 + offsetO] = dstA_0.y; + O.data[strideO * (y+0) + x+2 + offsetO] = dstA_0.z; + O.data[strideO * (y+0) + x+3 + offsetO] = dstA_0.w; + O.data[strideO * (y+1) + x+0 + offsetO] = dstA_1.x; + O.data[strideO * (y+1) + x+1 + offsetO] = dstA_1.y; + O.data[strideO * (y+1) + x+2 + offsetO] = dstA_1.z; + O.data[strideO * (y+1) + x+3 + offsetO] = dstA_1.w; + O.data[strideO * (y+2) + x+0 + offsetO] = dstA_2.x; + O.data[strideO * (y+2) + x+1 + offsetO] = dstA_2.y; + O.data[strideO * (y+2) + x+2 + offsetO] = dstA_2.z; + O.data[strideO * (y+2) + x+3 + offsetO] = dstA_2.w; + O.data[strideO * (y+3) + x+0 + offsetO] = dstA_3.x; + O.data[strideO * (y+3) + x+1 + offsetO] = dstA_3.y; + O.data[strideO * (y+3) + x+2 + offsetO] = dstA_3.z; + O.data[strideO * (y+3) + x+3 + offsetO] = dstA_3.w; + + #undef X_ + #undef W_ +} +#undef TRANSPOSED_X +#undef SHIFTED_X +#undef BLOCKED_W +#undef HARDCODED_DIMS +#undef BUF_OFFSET +#undef DOUBLE_BUFFER_LDS_READS +#undef CACHE_DEPTH +#else +[numthreads(16,16,1)] +void FUNC_NAME(KERNEL_NAME, BLOCK_SIZE)(uint3 dispatchThreadID : SV_DispatchThreadID) +{ + DISPATCH_ARGS(O.flatWidth, O.flatHeight, 1); + TENSOR_SHARED2_ARGS4(X, W, B, WBK, O); + + int x = (int)dispatchThreadID.x * BLOCK_SIZE; + int y = (int)dispatchThreadID.y * BLOCK_SIZE; + int n = (int)X.GetFlatWidth(); + + if (x >= (int)O.GetFlatWidth()) return; + if (y >= (int)O.GetFlatHeight()) return; + + float dstA[BLOCK_SIZE][BLOCK_SIZE]; + float srcX[BLOCK_SIZE]; + + int dy, dx; + for (dx = 0; dx < BLOCK_SIZE; ++dx) + for (dy = 0; dy < BLOCK_SIZE; ++dy) + dstA[dy][dx] = B.data[x+dx+B.offset];//B.Get(x+dx); + + for (int i = 0; i < n; ++i) + { + for (dy = 0; dy < BLOCK_SIZE; ++dy) + srcX[dy] = X.data[(y+dy)*X.channels+i];//X.Get(y+dy, i); + + for (dx = 0; dx < BLOCK_SIZE; ++dx) + { + float srcW = W.data[i*W.channels+x+dx];//W.Get(i, x+dx); + for (dy = 0; dy < BLOCK_SIZE; ++dy) + dstA[dy][dx] += srcX[dy] * srcW; + } + } + + for (dx = 0; dx < BLOCK_SIZE; ++dx) + for (dy = 0; dy < BLOCK_SIZE; ++dy) + O.Set(y+dy, x+dx, dstA[dy][dx]); +} +#endif +#undef KERNEL_NAME + + +//CACHE_DEPTH +// T >>X +//16: 183ms 207ms +// 8: 158ms 202ms +// 4: 162ms 334ms +// 2: 159ms ***ms +// 1: 173ms -- + +#define KERNEL_NAME Dense_T8x8_R +#if BLOCK_SIZE == 8 +#define UNROLL_INNER_LOOP 0 +#define TRANSPOSED_X 0 +#define HARDCODED_DIMS 0 +#define BUF_OFFSET 0 +#define CACHE_DEPTH 8 +groupshared float CACHE_NAME(KERNEL_NAME, BLOCK_SIZE, X)[CACHE_DEPTH*8*BLOCK_SIZE+(1-TRANSPOSED_X)*CACHE_DEPTH]; +groupshared float CACHE_NAME(KERNEL_NAME, BLOCK_SIZE, W)[CACHE_DEPTH*8*BLOCK_SIZE]; +[numthreads(8,8,1)] +void FUNC_NAME(KERNEL_NAME, BLOCK_SIZE)(uint3 dispatchThreadID : SV_DispatchThreadID, uint3 groupThreadID : SV_GroupThreadID, uint threadIndex : SV_GroupIndex) +{ + DISPATCH_ARGS(O.flatWidth, O.flatHeight, 1); + TENSOR_SHARED2_ARGS4(X, W, B, WBK, O); + + int x = (int)dispatchThreadID.x * BLOCK_SIZE; + int y = (int)dispatchThreadID.y * BLOCK_SIZE; + int tx = (int)groupThreadID.x; + int ty = (int)groupThreadID.y; + int bx = ((int)dispatchThreadID.x - (int)groupThreadID.x) * BLOCK_SIZE; + int by = ((int)dispatchThreadID.y - (int)groupThreadID.y) * BLOCK_SIZE; + int ti = (int)threadIndex; + int n = (int)X.GetFlatWidth(); + int strideX = (int)X.GetFlatWidth(); + int strideW = (int)W.GetFlatWidth(); + int strideO = (int)O.GetFlatWidth(); + int offsetX = BUF_OFFSET; + int offsetW = 
BUF_OFFSET; + int offsetO = BUF_OFFSET; +#if HARDCODED_DIMS == 1 + n = 1024; + strideX = 1024; + strideW = 1024; + strideO = 1024; +#endif + + #define X_ CACHE_NAME(KERNEL_NAME, BLOCK_SIZE, X) + #define W_ CACHE_NAME(KERNEL_NAME, BLOCK_SIZE, W) + +#if UNROLL_INNER_LOOP + float4 dstA_0, dstA_1, dstA_2, dstA_3; + float4 dstB_0, dstB_1, dstB_2, dstB_3; + float4 dstC_0, dstC_1, dstC_2, dstC_3; + float4 dstD_0, dstD_1, dstD_2, dstD_3; + + dstA_0.x = dstC_0.x = B.Get(x+0); + dstA_1.x = dstC_1.x = B.Get(x+0); + dstA_2.x = dstC_2.x = B.Get(x+0); + dstA_3.x = dstC_3.x = B.Get(x+0); + dstA_0.y = dstC_0.y = B.Get(x+1); + dstA_1.y = dstC_1.y = B.Get(x+1); + dstA_2.y = dstC_2.y = B.Get(x+1); + dstA_3.y = dstC_3.y = B.Get(x+1); + dstA_0.z = dstC_0.z = B.Get(x+2); + dstA_1.z = dstC_1.z = B.Get(x+2); + dstA_2.z = dstC_2.z = B.Get(x+2); + dstA_3.z = dstC_3.z = B.Get(x+2); + dstA_0.w = dstC_0.w = B.Get(x+3); + dstA_1.w = dstC_1.w = B.Get(x+3); + dstA_2.w = dstC_2.w = B.Get(x+3); + dstA_3.w = dstC_3.w = B.Get(x+3); + + dstB_0.x = dstD_0.x = B.Get(x+4); + dstB_1.x = dstD_1.x = B.Get(x+4); + dstB_2.x = dstD_2.x = B.Get(x+4); + dstB_3.x = dstD_3.x = B.Get(x+4); + dstB_0.y = dstD_0.y = B.Get(x+5); + dstB_1.y = dstD_1.y = B.Get(x+5); + dstB_2.y = dstD_2.y = B.Get(x+5); + dstB_3.y = dstD_3.y = B.Get(x+5); + dstB_0.z = dstD_0.z = B.Get(x+6); + dstB_1.z = dstD_1.z = B.Get(x+6); + dstB_2.z = dstD_2.z = B.Get(x+6); + dstB_3.z = dstD_3.z = B.Get(x+6); + dstB_0.w = dstD_0.w = B.Get(x+7); + dstB_1.w = dstD_1.w = B.Get(x+7); + dstB_2.w = dstD_2.w = B.Get(x+7); + dstB_3.w = dstD_3.w = B.Get(x+7); +#else + float4 dstA_0[4], dstA_1[4], dstA_2[4], dstA_3[4]; + dstA_0[0].x = dstA_0[2].x = B.Get(x+0); + dstA_1[0].x = dstA_1[2].x = B.Get(x+0); + dstA_2[0].x = dstA_2[2].x = B.Get(x+0); + dstA_3[0].x = dstA_3[2].x = B.Get(x+0); + dstA_0[0].y = dstA_0[2].y = B.Get(x+1); + dstA_1[0].y = dstA_1[2].y = B.Get(x+1); + dstA_2[0].y = dstA_2[2].y = B.Get(x+1); + dstA_3[0].y = dstA_3[2].y = B.Get(x+1); + dstA_0[0].z = dstA_0[2].z = B.Get(x+2); + dstA_1[0].z = dstA_1[2].z = B.Get(x+2); + dstA_2[0].z = dstA_2[2].z = B.Get(x+2); + dstA_3[0].z = dstA_3[2].z = B.Get(x+2); + dstA_0[0].w = dstA_0[2].w = B.Get(x+3); + dstA_1[0].w = dstA_1[2].w = B.Get(x+3); + dstA_2[0].w = dstA_2[2].w = B.Get(x+3); + dstA_3[0].w = dstA_3[2].w = B.Get(x+3); + + dstA_0[1].x = dstA_0[3].x = B.Get(x+4); + dstA_1[1].x = dstA_1[3].x = B.Get(x+4); + dstA_2[1].x = dstA_2[3].x = B.Get(x+4); + dstA_3[1].x = dstA_3[3].x = B.Get(x+4); + dstA_0[1].y = dstA_0[3].y = B.Get(x+5); + dstA_1[1].y = dstA_1[3].y = B.Get(x+5); + dstA_2[1].y = dstA_2[3].y = B.Get(x+5); + dstA_3[1].y = dstA_3[3].y = B.Get(x+5); + dstA_0[1].z = dstA_0[3].z = B.Get(x+6); + dstA_1[1].z = dstA_1[3].z = B.Get(x+6); + dstA_2[1].z = dstA_2[3].z = B.Get(x+6); + dstA_3[1].z = dstA_3[3].z = B.Get(x+6); + dstA_0[1].w = dstA_0[3].w = B.Get(x+7); + dstA_1[1].w = dstA_1[3].w = B.Get(x+7); + dstA_2[1].w = dstA_2[3].w = B.Get(x+7); + dstA_3[1].w = dstA_3[3].w = B.Get(x+7); + +#endif + + for (int i = 0; i < n; i += CACHE_DEPTH) + { + #if TRANSPOSED_X == 1 + [unroll] + for (int j = 0; j < CACHE_DEPTH; ++j) + { + X_[ti + j*64] = X.data[strideX * (i + j) + by + ti + offsetX]; + + // split 64 into 8 blocks and interleave them + // 000000001111111122222222... => 012345678012345678... 
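The "split 64 into 8 blocks and interleave them" comment above describes the store into W_ that follows; the index transform ((ti & 7) << 3) + (ti >> 3) is an 8x8 transpose of the 64 thread indices, which is easy to confirm on the CPU. The throwaway check below (C#, not part of the shader) records which source block each interleaved slot comes from:

using System;

// Throwaway check of the LDS interleaving used for W_ above: treats the 64
// thread indices as 8 blocks of 8 and applies the same index transform.
class InterleaveCheck
{
    static void Main()
    {
        var dst = new int[64];
        for (int i = 0; i < 64; i++)
        {
            int interleaved = ((i & 7) << 3) + (i >> 3);  // same expression as the shader
            dst[interleaved] = i >> 3;                    // record the source block id
        }
        // Prints "0 1 2 3 4 5 6 7" repeated eight times, i.e. one element per block, round-robin.
        Console.WriteLine(string.Join(" ", dst));
    }
}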
+ W_[((ti&7)<<3) + (ti>>3) + j*64] = W.data[strideW * (i + j) + bx + ti + offsetW]; + } + #else + int tiDiv = (uint)ti/CACHE_DEPTH; + int tiMod = ti&(CACHE_DEPTH-1); + int jStride = 64/CACHE_DEPTH; + + [unroll] + for (int j = 0; j < CACHE_DEPTH; ++j) + { + // CACHE_DEPTHx64 => 64xCACHE_DEPTH + X_[tiDiv + 65*tiMod + j*jStride] = X.data[strideX * (by + tiDiv + j*jStride) + i + tiMod]; + + // split 64 into 8 blocks and interleave them + // 000000001111111122222222... => 012345678012345678... + W_[((ti&7)<<3) + (ti>>3) + j*64] = W.data[strideW * (i + j) + bx + ti + offsetW]; + } + #endif + + GroupMemoryBarrierWithGroupSync(); + +#if UNROLL_INNER_LOOP + int4 idX0 = int4(0,1,2,3); int4 idX1 = int4(4,5,6,7); + int4 idW0 = int4(0,8,16,24); int4 idW1 = int4(32,40,48,56); +#else + int4 idX[2], idW[2]; + idX[0] = int4(0,1,2,3); idX[1] = int4(4,5,6,7); + idW[0] = int4(0,8,16,24); idW[1] = int4(32,40,48,56); +#endif + int incX = 64 + (TRANSPOSED_X?0:1); + int incW = 64; + for (int di = 0; di < CACHE_DEPTH; di++) + { +#if UNROLL_INNER_LOOP + float4 srcX0 = float4( + X_[idX0.x + ty*8], + X_[idX0.y + ty*8], + X_[idX0.z + ty*8], + X_[idX0.w + ty*8]); + float4 srcX1 = float4( + X_[idX1.x + ty*8], + X_[idX1.y + ty*8], + X_[idX1.z + ty*8], + X_[idX1.w + ty*8]); + float4 srcW0 = float4( + W_[idW0.x + tx], + W_[idW0.y + tx], + W_[idW0.z + tx], + W_[idW0.w + tx]); + float4 srcW1 = float4( + W_[idW1.x + tx], + W_[idW1.y + tx], + W_[idW1.z + tx], + W_[idW1.w + tx]); + idX0 += incX; idX1 += incX; + idW0 += incW; idW1 += incW; + + dstA_0.x = ffma(srcX0.x, srcW0.x, dstA_0.x); + dstA_0.y = ffma(srcX0.x, srcW0.y, dstA_0.y); + dstA_0.z = ffma(srcX0.x, srcW0.z, dstA_0.z); + dstA_0.w = ffma(srcX0.x, srcW0.w, dstA_0.w); + dstA_1.x = ffma(srcX0.y, srcW0.x, dstA_1.x); + dstA_1.y = ffma(srcX0.y, srcW0.y, dstA_1.y); + dstA_1.z = ffma(srcX0.y, srcW0.z, dstA_1.z); + dstA_1.w = ffma(srcX0.y, srcW0.w, dstA_1.w); + dstA_2.x = ffma(srcX0.z, srcW0.x, dstA_2.x); + dstA_2.y = ffma(srcX0.z, srcW0.y, dstA_2.y); + dstA_2.z = ffma(srcX0.z, srcW0.z, dstA_2.z); + dstA_2.w = ffma(srcX0.z, srcW0.w, dstA_2.w); + dstA_3.x = ffma(srcX0.w, srcW0.x, dstA_3.x); + dstA_3.y = ffma(srcX0.w, srcW0.y, dstA_3.y); + dstA_3.z = ffma(srcX0.w, srcW0.z, dstA_3.z); + dstA_3.w = ffma(srcX0.w, srcW0.w, dstA_3.w); + + // + dstB_0.x = ffma(srcX0.x, srcW1.x, dstB_0.x); + dstB_0.y = ffma(srcX0.x, srcW1.y, dstB_0.y); + dstB_0.z = ffma(srcX0.x, srcW1.z, dstB_0.z); + dstB_0.w = ffma(srcX0.x, srcW1.w, dstB_0.w); + dstB_1.x = ffma(srcX0.y, srcW1.x, dstB_1.x); + dstB_1.y = ffma(srcX0.y, srcW1.y, dstB_1.y); + dstB_1.z = ffma(srcX0.y, srcW1.z, dstB_1.z); + dstB_1.w = ffma(srcX0.y, srcW1.w, dstB_1.w); + dstB_2.x = ffma(srcX0.z, srcW1.x, dstB_2.x); + dstB_2.y = ffma(srcX0.z, srcW1.y, dstB_2.y); + dstB_2.z = ffma(srcX0.z, srcW1.z, dstB_2.z); + dstB_2.w = ffma(srcX0.z, srcW1.w, dstB_2.w); + dstB_3.x = ffma(srcX0.w, srcW1.x, dstB_3.x); + dstB_3.y = ffma(srcX0.w, srcW1.y, dstB_3.y); + dstB_3.z = ffma(srcX0.w, srcW1.z, dstB_3.z); + dstB_3.w = ffma(srcX0.w, srcW1.w, dstB_3.w); + + // + dstC_0.x = ffma(srcX1.x, srcW0.x, dstC_0.x); + dstC_0.y = ffma(srcX1.x, srcW0.y, dstC_0.y); + dstC_0.z = ffma(srcX1.x, srcW0.z, dstC_0.z); + dstC_0.w = ffma(srcX1.x, srcW0.w, dstC_0.w); + dstC_1.x = ffma(srcX1.y, srcW0.x, dstC_1.x); + dstC_1.y = ffma(srcX1.y, srcW0.y, dstC_1.y); + dstC_1.z = ffma(srcX1.y, srcW0.z, dstC_1.z); + dstC_1.w = ffma(srcX1.y, srcW0.w, dstC_1.w); + dstC_2.x = ffma(srcX1.z, srcW0.x, dstC_2.x); + dstC_2.y = ffma(srcX1.z, srcW0.y, dstC_2.y); + dstC_2.z = ffma(srcX1.z, srcW0.z, 
dstC_2.z); + dstC_2.w = ffma(srcX1.z, srcW0.w, dstC_2.w); + dstC_3.x = ffma(srcX1.w, srcW0.x, dstC_3.x); + dstC_3.y = ffma(srcX1.w, srcW0.y, dstC_3.y); + dstC_3.z = ffma(srcX1.w, srcW0.z, dstC_3.z); + dstC_3.w = ffma(srcX1.w, srcW0.w, dstC_3.w); + + // + dstD_0.x = ffma(srcX1.x, srcW1.x, dstD_0.x); + dstD_0.y = ffma(srcX1.x, srcW1.y, dstD_0.y); + dstD_0.z = ffma(srcX1.x, srcW1.z, dstD_0.z); + dstD_0.w = ffma(srcX1.x, srcW1.w, dstD_0.w); + dstD_1.x = ffma(srcX1.y, srcW1.x, dstD_1.x); + dstD_1.y = ffma(srcX1.y, srcW1.y, dstD_1.y); + dstD_1.z = ffma(srcX1.y, srcW1.z, dstD_1.z); + dstD_1.w = ffma(srcX1.y, srcW1.w, dstD_1.w); + dstD_2.x = ffma(srcX1.z, srcW1.x, dstD_2.x); + dstD_2.y = ffma(srcX1.z, srcW1.y, dstD_2.y); + dstD_2.z = ffma(srcX1.z, srcW1.z, dstD_2.z); + dstD_2.w = ffma(srcX1.z, srcW1.w, dstD_2.w); + dstD_3.x = ffma(srcX1.w, srcW1.x, dstD_3.x); + dstD_3.y = ffma(srcX1.w, srcW1.y, dstD_3.y); + dstD_3.z = ffma(srcX1.w, srcW1.z, dstD_3.z); + dstD_3.w = ffma(srcX1.w, srcW1.w, dstD_3.w); + +#else + float4 srcX[2], srcW[2]; + srcX[0] = float4( + X_[idX[0].x + ty*8], + X_[idX[0].y + ty*8], + X_[idX[0].z + ty*8], + X_[idX[0].w + ty*8]); + srcX[1] = float4( + X_[idX[1].x + ty*8], + X_[idX[1].y + ty*8], + X_[idX[1].z + ty*8], + X_[idX[1].w + ty*8]); + srcW[0] = float4( + W_[idW[0].x + tx], + W_[idW[0].y + tx], + W_[idW[0].z + tx], + W_[idW[0].w + tx]); + srcW[1] = float4( + W_[idW[1].x + tx], + W_[idW[1].y + tx], + W_[idW[1].z + tx], + W_[idW[1].w + tx]); + idX[0] += incX; idX[1] += incX; + idW[0] += incW; idW[1] += incW; + + + [loop] + for (uint qw = 0; qw < 4; ++qw) + { + uint q = qw >> 1; + uint w = qw & 1; + dstA_0[qw].x = ffma(srcX[q].x, srcW[w].x, dstA_0[qw].x); + dstA_0[qw].y = ffma(srcX[q].x, srcW[w].y, dstA_0[qw].y); + dstA_0[qw].z = ffma(srcX[q].x, srcW[w].z, dstA_0[qw].z); + dstA_0[qw].w = ffma(srcX[q].x, srcW[w].w, dstA_0[qw].w); + dstA_1[qw].x = ffma(srcX[q].y, srcW[w].x, dstA_1[qw].x); + dstA_1[qw].y = ffma(srcX[q].y, srcW[w].y, dstA_1[qw].y); + dstA_1[qw].z = ffma(srcX[q].y, srcW[w].z, dstA_1[qw].z); + dstA_1[qw].w = ffma(srcX[q].y, srcW[w].w, dstA_1[qw].w); + dstA_2[qw].x = ffma(srcX[q].z, srcW[w].x, dstA_2[qw].x); + dstA_2[qw].y = ffma(srcX[q].z, srcW[w].y, dstA_2[qw].y); + dstA_2[qw].z = ffma(srcX[q].z, srcW[w].z, dstA_2[qw].z); + dstA_2[qw].w = ffma(srcX[q].z, srcW[w].w, dstA_2[qw].w); + dstA_3[qw].x = ffma(srcX[q].w, srcW[w].x, dstA_3[qw].x); + dstA_3[qw].y = ffma(srcX[q].w, srcW[w].y, dstA_3[qw].y); + dstA_3[qw].z = ffma(srcX[q].w, srcW[w].z, dstA_3[qw].z); + dstA_3[qw].w = ffma(srcX[q].w, srcW[w].w, dstA_3[qw].w); + } +#endif + } + + GroupMemoryBarrierWithGroupSync(); + } +#if UNROLL_INNER_LOOP + O.data[strideO * (y+0) + x+0 + offsetO] = dstA_0.x; + O.data[strideO * (y+0) + x+1 + offsetO] = dstA_0.y; + O.data[strideO * (y+0) + x+2 + offsetO] = dstA_0.z; + O.data[strideO * (y+0) + x+3 + offsetO] = dstA_0.w; + O.data[strideO * (y+0) + x+4 + offsetO] = dstB_0.x; + O.data[strideO * (y+0) + x+5 + offsetO] = dstB_0.y; + O.data[strideO * (y+0) + x+6 + offsetO] = dstB_0.z; + O.data[strideO * (y+0) + x+7 + offsetO] = dstB_0.w; + O.data[strideO * (y+1) + x+0 + offsetO] = dstA_1.x; + O.data[strideO * (y+1) + x+1 + offsetO] = dstA_1.y; + O.data[strideO * (y+1) + x+2 + offsetO] = dstA_1.z; + O.data[strideO * (y+1) + x+3 + offsetO] = dstA_1.w; + O.data[strideO * (y+1) + x+4 + offsetO] = dstB_1.x; + O.data[strideO * (y+1) + x+5 + offsetO] = dstB_1.y; + O.data[strideO * (y+1) + x+6 + offsetO] = dstB_1.z; + O.data[strideO * (y+1) + x+7 + offsetO] = dstB_1.w; + O.data[strideO * (y+2) + 
x+0 + offsetO] = dstA_2.x; + O.data[strideO * (y+2) + x+1 + offsetO] = dstA_2.y; + O.data[strideO * (y+2) + x+2 + offsetO] = dstA_2.z; + O.data[strideO * (y+2) + x+3 + offsetO] = dstA_2.w; + O.data[strideO * (y+2) + x+4 + offsetO] = dstB_2.x; + O.data[strideO * (y+2) + x+5 + offsetO] = dstB_2.y; + O.data[strideO * (y+2) + x+6 + offsetO] = dstB_2.z; + O.data[strideO * (y+2) + x+7 + offsetO] = dstB_2.w; + O.data[strideO * (y+3) + x+0 + offsetO] = dstA_3.x; + O.data[strideO * (y+3) + x+1 + offsetO] = dstA_3.y; + O.data[strideO * (y+3) + x+2 + offsetO] = dstA_3.z; + O.data[strideO * (y+3) + x+3 + offsetO] = dstA_3.w; + O.data[strideO * (y+3) + x+4 + offsetO] = dstB_3.x; + O.data[strideO * (y+3) + x+5 + offsetO] = dstB_3.y; + O.data[strideO * (y+3) + x+6 + offsetO] = dstB_3.z; + O.data[strideO * (y+3) + x+7 + offsetO] = dstB_3.w; + + O.data[strideO * (y+4) + x+0 + offsetO] = dstC_0.x; + O.data[strideO * (y+4) + x+1 + offsetO] = dstC_0.y; + O.data[strideO * (y+4) + x+2 + offsetO] = dstC_0.z; + O.data[strideO * (y+4) + x+3 + offsetO] = dstC_0.w; + O.data[strideO * (y+4) + x+4 + offsetO] = dstD_0.x; + O.data[strideO * (y+4) + x+5 + offsetO] = dstD_0.y; + O.data[strideO * (y+4) + x+6 + offsetO] = dstD_0.z; + O.data[strideO * (y+4) + x+7 + offsetO] = dstD_0.w; + O.data[strideO * (y+5) + x+0 + offsetO] = dstC_1.x; + O.data[strideO * (y+5) + x+1 + offsetO] = dstC_1.y; + O.data[strideO * (y+5) + x+2 + offsetO] = dstC_1.z; + O.data[strideO * (y+5) + x+3 + offsetO] = dstC_1.w; + O.data[strideO * (y+5) + x+4 + offsetO] = dstD_1.x; + O.data[strideO * (y+5) + x+5 + offsetO] = dstD_1.y; + O.data[strideO * (y+5) + x+6 + offsetO] = dstD_1.z; + O.data[strideO * (y+5) + x+7 + offsetO] = dstD_1.w; + O.data[strideO * (y+6) + x+0 + offsetO] = dstC_2.x; + O.data[strideO * (y+6) + x+1 + offsetO] = dstC_2.y; + O.data[strideO * (y+6) + x+2 + offsetO] = dstC_2.z; + O.data[strideO * (y+6) + x+3 + offsetO] = dstC_2.w; + O.data[strideO * (y+6) + x+4 + offsetO] = dstD_2.x; + O.data[strideO * (y+6) + x+5 + offsetO] = dstD_2.y; + O.data[strideO * (y+6) + x+6 + offsetO] = dstD_2.z; + O.data[strideO * (y+6) + x+7 + offsetO] = dstD_2.w; + O.data[strideO * (y+7) + x+0 + offsetO] = dstC_3.x; + O.data[strideO * (y+7) + x+1 + offsetO] = dstC_3.y; + O.data[strideO * (y+7) + x+2 + offsetO] = dstC_3.z; + O.data[strideO * (y+7) + x+3 + offsetO] = dstC_3.w; + O.data[strideO * (y+7) + x+4 + offsetO] = dstD_3.x; + O.data[strideO * (y+7) + x+5 + offsetO] = dstD_3.y; + O.data[strideO * (y+7) + x+6 + offsetO] = dstD_3.z; + O.data[strideO * (y+7) + x+7 + offsetO] = dstD_3.w; +#else + O.data[strideO * (y+0) + x+0 + offsetO] = dstA_0[0].x; + O.data[strideO * (y+0) + x+1 + offsetO] = dstA_0[0].y; + O.data[strideO * (y+0) + x+2 + offsetO] = dstA_0[0].z; + O.data[strideO * (y+0) + x+3 + offsetO] = dstA_0[0].w; + O.data[strideO * (y+0) + x+4 + offsetO] = dstA_0[1].x; + O.data[strideO * (y+0) + x+5 + offsetO] = dstA_0[1].y; + O.data[strideO * (y+0) + x+6 + offsetO] = dstA_0[1].z; + O.data[strideO * (y+0) + x+7 + offsetO] = dstA_0[1].w; + O.data[strideO * (y+1) + x+0 + offsetO] = dstA_1[0].x; + O.data[strideO * (y+1) + x+1 + offsetO] = dstA_1[0].y; + O.data[strideO * (y+1) + x+2 + offsetO] = dstA_1[0].z; + O.data[strideO * (y+1) + x+3 + offsetO] = dstA_1[0].w; + O.data[strideO * (y+1) + x+4 + offsetO] = dstA_1[1].x; + O.data[strideO * (y+1) + x+5 + offsetO] = dstA_1[1].y; + O.data[strideO * (y+1) + x+6 + offsetO] = dstA_1[1].z; + O.data[strideO * (y+1) + x+7 + offsetO] = dstA_1[1].w; + O.data[strideO * (y+2) + x+0 + offsetO] = dstA_2[0].x; + 
O.data[strideO * (y+2) + x+1 + offsetO] = dstA_2[0].y; + O.data[strideO * (y+2) + x+2 + offsetO] = dstA_2[0].z; + O.data[strideO * (y+2) + x+3 + offsetO] = dstA_2[0].w; + O.data[strideO * (y+2) + x+4 + offsetO] = dstA_2[1].x; + O.data[strideO * (y+2) + x+5 + offsetO] = dstA_2[1].y; + O.data[strideO * (y+2) + x+6 + offsetO] = dstA_2[1].z; + O.data[strideO * (y+2) + x+7 + offsetO] = dstA_2[1].w; + O.data[strideO * (y+3) + x+0 + offsetO] = dstA_3[0].x; + O.data[strideO * (y+3) + x+1 + offsetO] = dstA_3[0].y; + O.data[strideO * (y+3) + x+2 + offsetO] = dstA_3[0].z; + O.data[strideO * (y+3) + x+3 + offsetO] = dstA_3[0].w; + O.data[strideO * (y+3) + x+4 + offsetO] = dstA_3[1].x; + O.data[strideO * (y+3) + x+5 + offsetO] = dstA_3[1].y; + O.data[strideO * (y+3) + x+6 + offsetO] = dstA_3[1].z; + O.data[strideO * (y+3) + x+7 + offsetO] = dstA_3[1].w; + + O.data[strideO * (y+4) + x+0 + offsetO] = dstA_0[2].x; + O.data[strideO * (y+4) + x+1 + offsetO] = dstA_0[2].y; + O.data[strideO * (y+4) + x+2 + offsetO] = dstA_0[2].z; + O.data[strideO * (y+4) + x+3 + offsetO] = dstA_0[2].w; + O.data[strideO * (y+4) + x+4 + offsetO] = dstA_0[3].x; + O.data[strideO * (y+4) + x+5 + offsetO] = dstA_0[3].y; + O.data[strideO * (y+4) + x+6 + offsetO] = dstA_0[3].z; + O.data[strideO * (y+4) + x+7 + offsetO] = dstA_0[3].w; + O.data[strideO * (y+5) + x+0 + offsetO] = dstA_1[2].x; + O.data[strideO * (y+5) + x+1 + offsetO] = dstA_1[2].y; + O.data[strideO * (y+5) + x+2 + offsetO] = dstA_1[2].z; + O.data[strideO * (y+5) + x+3 + offsetO] = dstA_1[2].w; + O.data[strideO * (y+5) + x+4 + offsetO] = dstA_1[3].x; + O.data[strideO * (y+5) + x+5 + offsetO] = dstA_1[3].y; + O.data[strideO * (y+5) + x+6 + offsetO] = dstA_1[3].z; + O.data[strideO * (y+5) + x+7 + offsetO] = dstA_1[3].w; + O.data[strideO * (y+6) + x+0 + offsetO] = dstA_2[2].x; + O.data[strideO * (y+6) + x+1 + offsetO] = dstA_2[2].y; + O.data[strideO * (y+6) + x+2 + offsetO] = dstA_2[2].z; + O.data[strideO * (y+6) + x+3 + offsetO] = dstA_2[2].w; + O.data[strideO * (y+6) + x+4 + offsetO] = dstA_2[3].x; + O.data[strideO * (y+6) + x+5 + offsetO] = dstA_2[3].y; + O.data[strideO * (y+6) + x+6 + offsetO] = dstA_2[3].z; + O.data[strideO * (y+6) + x+7 + offsetO] = dstA_2[3].w; + O.data[strideO * (y+7) + x+0 + offsetO] = dstA_3[2].x; + O.data[strideO * (y+7) + x+1 + offsetO] = dstA_3[2].y; + O.data[strideO * (y+7) + x+2 + offsetO] = dstA_3[2].z; + O.data[strideO * (y+7) + x+3 + offsetO] = dstA_3[2].w; + O.data[strideO * (y+7) + x+4 + offsetO] = dstA_3[3].x; + O.data[strideO * (y+7) + x+5 + offsetO] = dstA_3[3].y; + O.data[strideO * (y+7) + x+6 + offsetO] = dstA_3[3].z; + O.data[strideO * (y+7) + x+7 + offsetO] = dstA_3[3].w; +#endif + + #undef X_ + #undef W_ +} +#undef TRANSPOSED_X +#undef BLOCKED_W +#undef HARDCODED_DIMS +#undef BUF_OFFSET +#undef CACHE_DEPTH +#elif BLOCK_SIZE == 4 +#define TRANSPOSED_X 0 +#define SHIFTED_X 0 +#define CACHE_DEPTH 4 +groupshared float CACHE_NAME(KERNEL_NAME, BLOCK_SIZE, X)[CACHE_DEPTH*8*BLOCK_SIZE+SHIFTED_X*CACHE_DEPTH]; +groupshared float CACHE_NAME(KERNEL_NAME, BLOCK_SIZE, W)[CACHE_DEPTH*8*BLOCK_SIZE]; +[numthreads(8,8,1)] +void FUNC_NAME(KERNEL_NAME, BLOCK_SIZE)(uint3 dispatchThreadID : SV_DispatchThreadID, uint3 groupThreadID : SV_GroupThreadID, uint threadIndex : SV_GroupIndex) +{ + DISPATCH_ARGS(O.flatWidth, O.flatHeight, 1); + TENSOR_SHARED2_ARGS4(X, W, B, WBK, O); + + int x = (int)dispatchThreadID.x * BLOCK_SIZE; + int y = (int)dispatchThreadID.y * BLOCK_SIZE; + int tx = (int)groupThreadID.x; + int ty = (int)groupThreadID.y; + int bx = 
((int)dispatchThreadID.x - (int)groupThreadID.x) * BLOCK_SIZE; + int by = ((int)dispatchThreadID.y - (int)groupThreadID.y) * BLOCK_SIZE; + int ti = (int)threadIndex; + int n = (int)X.GetFlatWidth(); + int strideX = (int)X.GetFlatWidth(); + int strideW = (int)W.GetFlatWidth(); + + #define X_ CACHE_NAME(KERNEL_NAME, BLOCK_SIZE, X) + #define W_ CACHE_NAME(KERNEL_NAME, BLOCK_SIZE, W) + + //if (x >= (int)O.GetFlatWidth()) return; + //if (y >= (int)O.GetFlatHeight()) return; + + float4 dstA_0, dstA_1, dstA_2, dstA_3; + + dstA_0.x = B.Get(x+0); + dstA_1.x = B.Get(x+0); + dstA_2.x = B.Get(x+0); + dstA_3.x = B.Get(x+0); + dstA_0.y = B.Get(x+1); + dstA_1.y = B.Get(x+1); + dstA_2.y = B.Get(x+1); + dstA_3.y = B.Get(x+1); + dstA_0.z = B.Get(x+2); + dstA_1.z = B.Get(x+2); + dstA_2.z = B.Get(x+2); + dstA_3.z = B.Get(x+2); + dstA_0.w = B.Get(x+3); + dstA_1.w = B.Get(x+3); + dstA_2.w = B.Get(x+3); + dstA_3.w = B.Get(x+3); + + for (int i = 0; i < n; i += CACHE_DEPTH) + { + #if CACHE_DEPTH == 16 + W_[ti ] = W.data[strideW * (i + (ti>>5) + 0) + bx + (ti&31)]; + W_[ti+ 64] = W.data[strideW * (i + (ti>>5) + 2) + bx + (ti&31)]; + W_[ti+128] = W.data[strideW * (i + (ti>>5) + 4) + bx + (ti&31)]; + W_[ti+192] = W.data[strideW * (i + (ti>>5) + 6) + bx + (ti&31)]; + W_[ti+256] = W.data[strideW * (i + (ti>>5) + 8) + bx + (ti&31)]; + W_[ti+320] = W.data[strideW * (i + (ti>>5) +10) + bx + (ti&31)]; + W_[ti+384] = W.data[strideW * (i + (ti>>5) +12) + bx + (ti&31)]; + W_[ti+448] = W.data[strideW * (i + (ti>>5) +14) + bx + (ti&31)]; + #if TRANSPOSED_X == 1 + X_[ti ] = X.data[strideX * (i + (ti>>5) + 0) + by + (ti&31)]; + X_[ti+ 64] = X.data[strideX * (i + (ti>>5) + 2) + by + (ti&31)]; + X_[ti+128] = X.data[strideX * (i + (ti>>5) + 4) + by + (ti&31)]; + X_[ti+192] = X.data[strideX * (i + (ti>>5) + 6) + by + (ti&31)]; + X_[ti+256] = X.data[strideX * (i + (ti>>5) + 8) + by + (ti&31)]; + X_[ti+320] = X.data[strideX * (i + (ti>>5) +10) + by + (ti&31)]; + X_[ti+384] = X.data[strideX * (i + (ti>>5) +12) + by + (ti&31)]; + X_[ti+448] = X.data[strideX * (i + (ti>>5) +14) + by + (ti&31)]; + #elif SHIFTED_X == 1 + /* + g=ti/16 + j=ti&15 + + g0 j0123456789ABCDEF + g1 j0123456789ABCDEF + g2 j0123456789ABCDEF + g3 j0123456789ABCDEF + g0.j0 g1.j0 g2.j0 g3.j0 g0.j1 g1.j1 g2.j1 g3.j1 + + 16x32 => 32x16 + */ + X_[(ti>>4) + 33*(ti&15) + 0] = X.data[strideX * (by + (ti>>4) + 0) + i + (ti&15) ]; + X_[(ti>>4) + 33*(ti&15) + 4] = X.data[strideX * (by + (ti>>4) + 4) + i + (ti&15) ]; + X_[(ti>>4) + 33*(ti&15) + 8] = X.data[strideX * (by + (ti>>4) + 8) + i + (ti&15) ]; + X_[(ti>>4) + 33*(ti&15) +12] = X.data[strideX * (by + (ti>>4) +12) + i + (ti&15) ]; + X_[(ti>>4) + 33*(ti&15) +16] = X.data[strideX * (by + (ti>>4) +16) + i + (ti&15) ]; + X_[(ti>>4) + 33*(ti&15) +20] = X.data[strideX * (by + (ti>>4) +20) + i + (ti&15) ]; + X_[(ti>>4) + 33*(ti&15) +24] = X.data[strideX * (by + (ti>>4) +24) + i + (ti&15) ]; + X_[(ti>>4) + 33*(ti&15) +28] = X.data[strideX * (by + (ti>>4) +28) + i + (ti&15) ]; + #else + //X_[ti] = X.Get(by + (ti/16), i + (ti&15)); + X_[ti ] = X.data[strideX * (by + (ti&31)) + i + (ti>>5) + 0]; + X_[ti+ 64] = X.data[strideX * (by + (ti&31)) + i + (ti>>5) + 2]; + X_[ti+128] = X.data[strideX * (by + (ti&31)) + i + (ti>>5) + 4]; + X_[ti+192] = X.data[strideX * (by + (ti&31)) + i + (ti>>5) + 6]; + X_[ti+256] = X.data[strideX * (by + (ti&31)) + i + (ti>>5) + 8]; + X_[ti+320] = X.data[strideX * (by + (ti&31)) + i + (ti>>5) +10]; + X_[ti+384] = X.data[strideX * (by + (ti&31)) + i + (ti>>5) +12]; + X_[ti+448] = X.data[strideX * (by + (ti&31)) 
+ i + (ti>>5) +14]; + #endif + + #elif CACHE_DEPTH == 8 + W_[ti ] = W.data[strideW * (i + (ti>>5) + 0) + bx + (ti&31)]; + W_[ti+ 64] = W.data[strideW * (i + (ti>>5) + 2) + bx + (ti&31)]; + W_[ti+128] = W.data[strideW * (i + (ti>>5) + 4) + bx + (ti&31)]; + W_[ti+192] = W.data[strideW * (i + (ti>>5) + 6) + bx + (ti&31)]; + #if TRANSPOSED_X == 1 + X_[ti ] = X.data[strideX * (i + (ti>>5) + 0) + by + (ti&31)]; + X_[ti+ 64] = X.data[strideX * (i + (ti>>5) + 2) + by + (ti&31)]; + X_[ti+128] = X.data[strideX * (i + (ti>>5) + 4) + by + (ti&31)]; + X_[ti+192] = X.data[strideX * (i + (ti>>5) + 6) + by + (ti&31)]; + #elif SHIFTED_X == 1 + // 8x32 => 32x8 + X_[(ti>>3) + 33*(ti&7) + 0] = X.data[strideX * (by + (ti>>3) + 0) + i + (ti&7) ]; + X_[(ti>>3) + 33*(ti&7) + 8] = X.data[strideX * (by + (ti>>3) + 8) + i + (ti&7) ]; + X_[(ti>>3) + 33*(ti&7) +16] = X.data[strideX * (by + (ti>>3) +16) + i + (ti&7) ]; + X_[(ti>>3) + 33*(ti&7) +24] = X.data[strideX * (by + (ti>>3) +24) + i + (ti&7) ]; + #else + // 8x32 => 32x8 + X_[ti ] = X.data[strideX * (by + (ti&31)) + i + (ti>>5) + 0]; + X_[ti+ 64] = X.data[strideX * (by + (ti&31)) + i + (ti>>5) + 2]; + X_[ti+128] = X.data[strideX * (by + (ti&31)) + i + (ti>>5) + 4]; + X_[ti+192] = X.data[strideX * (by + (ti&31)) + i + (ti>>5) + 6]; + #endif + + #elif CACHE_DEPTH == 4 + W_[ti ] = W.data[strideW * (i + (ti>>5) + 0) + bx + (ti&31)]; + W_[ti+ 64] = W.data[strideW * (i + (ti>>5) + 2) + bx + (ti&31)]; + #if TRANSPOSED_X == 1 + X_[ti ] = X.data[strideX * (i + (ti>>5) + 0) + by + (ti&31)]; + X_[ti+ 64] = X.data[strideX * (i + (ti>>5) + 2) + by + (ti&31)]; + #elif SHIFTED_X == 1 + // 4x32 => 32x4 + X_[(ti>>2) + 33*(ti&3) + 0] = X.data[strideX * (by + (ti>>2) + 0) + i + (ti&3) ]; + X_[(ti>>2) + 33*(ti&3) +16] = X.data[strideX * (by + (ti>>2) + 16) + i + (ti&3) ]; + #else + // 4x32 => 32x4 + X_[ti ] = X.data[strideX * (by + (ti&31)) + i + (ti>>5) + 0]; + X_[ti+ 64] = X.data[strideX * (by + (ti&31)) + i + (ti>>5) + 2]; + #endif + + #elif CACHE_DEPTH == 2 + W_[ti ] = W.data[strideW * (i + (ti>>5) + 0) + bx + (ti&31)]; + #if TRANSPOSED_X == 1 + X_[ti ] = X.data[strideX * (i + (ti>>5) + 0) + by + (ti&31)]; + #elif SHIFTED_X == 1 + // 2x32 => 32x2 + X_[(ti>>1) + 33*(ti&1) + 0] = X.data[strideX * (by + (ti>>1) + 0) + i + (ti&1) ]; + #else + X_[ti ] = X.data[strideX * (by + (ti&31)) + i + (ti>>5) + 0]; + #endif + + #elif CACHE_DEPTH == 1 + if (ti < 32) + { + W_[ti] = W.data[strideW * i + bx + ti]; + #if TRANSPOSED_X == 1 + X_[ti] = X.data[strideX * i + by + ti]; + #else + //X_[ti] = X.Get(by+ti, i); + X_[ti] = X.data[strideX * (by + ti) + i]; + #endif + } + #endif + + GroupMemoryBarrierWithGroupSync(); + + for (int di = 0; di < CACHE_DEPTH; di++) + { + int _32 = 32 + SHIFTED_X; + float4 srcX = float4( + X_[di*_32 + ty*4 + 0], + X_[di*_32 + ty*4 + 1], + X_[di*_32 + ty*4 + 2], + X_[di*_32 + ty*4 + 3]); + float4 srcW = float4( + W_[di*32 + tx*4 + 0], + W_[di*32 + tx*4 + 1], + W_[di*32 + tx*4 + 2], + W_[di*32 + tx*4 + 3]); + + dstA_0.x = ffma(srcX.x, srcW.x, dstA_0.x); + dstA_0.y = ffma(srcX.x, srcW.y, dstA_0.y); + dstA_0.z = ffma(srcX.x, srcW.z, dstA_0.z); + dstA_0.w = ffma(srcX.x, srcW.w, dstA_0.w); + + dstA_1.x = ffma(srcX.y, srcW.x, dstA_1.x); + dstA_1.y = ffma(srcX.y, srcW.y, dstA_1.y); + dstA_1.z = ffma(srcX.y, srcW.z, dstA_1.z); + dstA_1.w = ffma(srcX.y, srcW.w, dstA_1.w); + + dstA_2.x = ffma(srcX.z, srcW.x, dstA_2.x); + dstA_2.y = ffma(srcX.z, srcW.y, dstA_2.y); + dstA_2.z = ffma(srcX.z, srcW.z, dstA_2.z); + dstA_2.w = ffma(srcX.z, srcW.w, dstA_2.w); + + dstA_3.x = ffma(srcX.w, 
srcW.x, dstA_3.x); + dstA_3.y = ffma(srcX.w, srcW.y, dstA_3.y); + dstA_3.z = ffma(srcX.w, srcW.z, dstA_3.z); + dstA_3.w = ffma(srcX.w, srcW.w, dstA_3.w); + } + + GroupMemoryBarrierWithGroupSync(); + } + + O.Set(y+0, x+0, dstA_0.x); + O.Set(y+0, x+1, dstA_0.y); + O.Set(y+0, x+2, dstA_0.z); + O.Set(y+0, x+3, dstA_0.w); + O.Set(y+1, x+0, dstA_1.x); + O.Set(y+1, x+1, dstA_1.y); + O.Set(y+1, x+2, dstA_1.z); + O.Set(y+1, x+3, dstA_1.w); + O.Set(y+2, x+0, dstA_2.x); + O.Set(y+2, x+1, dstA_2.y); + O.Set(y+2, x+2, dstA_2.z); + O.Set(y+2, x+3, dstA_2.w); + O.Set(y+3, x+0, dstA_3.x); + O.Set(y+3, x+1, dstA_3.y); + O.Set(y+3, x+2, dstA_3.z); + O.Set(y+3, x+3, dstA_3.w); + /*for (dx = 0; dx < BLOCK_SIZE; ++dx) + for (dy = 0; dy < BLOCK_SIZE; ++dy) + O.Set(y+dy, x+dx, dstA[dy][dx]); + */ + #undef X_ + #undef W_ +} +#undef TRANSPOSED_X +#undef SHIFTED_X +#undef CACHE_DEPTH +#else +[numthreads(8,8,1)] +void FUNC_NAME(KERNEL_NAME, BLOCK_SIZE)(uint3 dispatchThreadID : SV_DispatchThreadID) +{ + DISPATCH_ARGS(O.flatWidth, O.flatHeight, 1); + TENSOR_SHARED2_ARGS4(X, W, B, WBK, O); + + int x = (int)dispatchThreadID.x * BLOCK_SIZE; + int y = (int)dispatchThreadID.y * BLOCK_SIZE; + int n = (int)X.GetFlatWidth(); + + if (x >= (int)O.GetFlatWidth()) return; + if (y >= (int)O.GetFlatHeight()) return; + + float dstA[BLOCK_SIZE][BLOCK_SIZE]; + float srcX[BLOCK_SIZE]; + + int dy, dx; + for (dx = 0; dx < BLOCK_SIZE; ++dx) + for (dy = 0; dy < BLOCK_SIZE; ++dy) + dstA[dy][dx] = B.data[x+dx+B.offset];//B.Get(x+dx); + + for (int i = 0; i < n; ++i) + { + for (dy = 0; dy < BLOCK_SIZE; ++dy) + srcX[dy] = X.data[(y+dy)*X.channels+i];//X.Get(y+dy, i); + + for (dx = 0; dx < BLOCK_SIZE; ++dx) + { + float srcW = W.data[i*W.channels+x+dx];//W.Get(i, x+dx); + for (dy = 0; dy < BLOCK_SIZE; ++dy) + dstA[dy][dx] += srcX[dy] * srcW; + } + } + + for (dx = 0; dx < BLOCK_SIZE; ++dx) + for (dy = 0; dy < BLOCK_SIZE; ++dy) + O.Set(y+dy, x+dx, dstA[dy][dx]); +} +#endif +#undef KERNEL_NAME + +#endif // DENSE + // NOTE: usually this path is used for <16 batches #undef CACHESIZE #define CACHESIZE 64 diff --git a/UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/Barracuda/Resources/Generic.compute b/UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/Barracuda/Resources/Generic.compute index f9c29abda4..fc3dc82793 100644 --- a/UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/Barracuda/Resources/Generic.compute +++ b/UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/Barracuda/Resources/Generic.compute @@ -2,6 +2,7 @@ #pragma kernel ScaleBias_CNyx #pragma kernel ScaleBias_CNyx2 #pragma kernel ScaleBias_Flat +#pragma kernel ScaleBias_Loop #pragma kernel Upsample2D #pragma kernel AvgPool2D #pragma kernel MaxPool2D @@ -10,6 +11,8 @@ //#pragma kernel MaxPool2D_Pool2x2_NoPads #pragma kernel GlobalAvgPool2D #pragma kernel InstanceNorm +#pragma kernel InstanceNormTail_CNyx2 +#pragma kernel InstanceNormTail_Flat #pragma kernel Copy /* @@ -33,6 +36,7 @@ uint4 _Pool; uint4 _Stride; uint4 _Pad; float _Alpha; +uint _LoopStride; NUMTHREADS((4,8,8), (4,8,4), (4,4,4)) void ScaleBias(uint3 dispatchThreadID : SV_DispatchThreadID) @@ -102,6 +106,29 @@ void ScaleBias_Flat(uint3 dispatchThreadID : SV_DispatchThreadID) O.Set(i, v); } +NUMTHREADS((256,1,1), (128,1,1), (64,1,1)) +void ScaleBias_Loop(uint3 dispatchThreadID : SV_DispatchThreadID) +{ + DISPATCH_ARGS(O.length, 1, 1); + TENSOR_SHARED2_ARGS4(X, W, B, WBK, O); + + uint i = dispatchThreadID.x; + uint len = O.GetLength(); + + while (i < len) + { + uint c = i % X.channels; + float bias = B.Get(c); + float scale = 
W.Get(c);
+
+        float v = X.Get(i);
+        v = v * scale + bias;
+        O.Set(i, v);
+
+        i += _LoopStride;
+    }
+}
+
 NUMTHREADS((32,4,1), (32,2,1), (16,2,1))
 void ScaleBias_CNyx2(uint3 dispatchThreadID : SV_DispatchThreadID)
 {
@@ -371,6 +398,53 @@ void InstanceNorm(uint3 dispatchThreadID : SV_DispatchThreadID)
     }
 }
 
+NUMTHREADS((256,1,1), (128,1,1), (64,1,1))
+void InstanceNormTail_Flat(uint3 dispatchThreadID : SV_DispatchThreadID)
+{
+    DISPATCH_ARGS(O.length, 1, 1);
+    TENSOR_ARGS4(X, W, B, O);
+
+    uint i = dispatchThreadID.x;
+    if (i > O.GetLength()) return;
+
+    uint c = i % X.channels;
+
+    float variance = W.Get(c);
+    float mean = B.Get(c);
+    // normalization factor
+    float invNormFactor = 1 / sqrt(variance + FLT_EPSILON);
+
+    float v = X.Get(i);
+    //v = gamma * (v * invNormFactor - mean * invNormFactor) + beta
+    v = v * invNormFactor - mean * invNormFactor;
+
+    O.Set(i, v);
+}
+
+NUMTHREADS((32,4,1), (32,2,1), (16,2,1))
+void InstanceNormTail_CNyx2(uint3 dispatchThreadID : SV_DispatchThreadID)
+{
+    DISPATCH_ARGS(O.channels, O.batch * O.height * O.width, 1);
+    TENSOR_ARGS4(X, W, B, O);
+
+    uint c = dispatchThreadID.x;
+    uint i = dispatchThreadID.y * X.channels + c;
+
+    if (c >= X.channels) return;
+    if (i >= X.GetLength()) return;
+
+    float variance = W.Get(c);
+    float mean = B.Get(c);
+    // normalization factor
+    float invNormFactor = 1 / sqrt(variance + FLT_EPSILON);
+
+    float v = X.Get(i);
+    //v = gamma * (v * invNormFactor - mean * invNormFactor) + beta
+    v = v * invNormFactor - mean * invNormFactor;
+
+    O.Set(i, v);
+}
+
 NUMTHREADS((4,8,8), (4,8,4), (4,4,4))
 void Copy(uint3 dispatchThreadID : SV_DispatchThreadID)
 {
diff --git a/UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/ReleaseNotes.md b/UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/ReleaseNotes.md
index 47f570e6cf..46495fb6fd 100644
--- a/UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/ReleaseNotes.md
+++ b/UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/ReleaseNotes.md
@@ -1,5 +1,19 @@
 # Release notes
 
+## 0.2.4
+- Switched to 2018.4.3f1 as primary Unity version for testing.
+- Fixed ScaleBias scheduling issue with large amounts of data (reproduced with MobileNet @ 16 batch)
+- Fixed buffer overrun in ThreadGroup SharedMemory when TRANSPOSE_X and/or SHIFTED_X paths are enabled. This should fix GPU worker issues on Windows.
+- Added string cache to minimise string concat generated GC pressure.
+- Added small fixes for temp memory allocations, saves ~200B per layer.
+- Refactored inner loop workings, to avoid GC allocations for delegates.
+- Fixed input handling for layers, now inputs are not regenerated with every execution. Static model tensors are to stay forever until worker is disposed.
+- Bumped Burst version to 1.1.1.
+
+## 0.2.3
+- Rewritten Dense, Conv and some other ops on GPU. Speedup of 33% in most models with batch=1 and over 100% for batch=16.
+- Optimizations: reimplemented InstanceNormalization using pyramid approach for calculating mean and variance.
+
 ## 0.2.2
 - Added support for --print-supported-ops flag for model converters, now it will print approximate list of supported operations. List of supported ops depends on converter.
 - Added Keras converter as part of distribution.
@@ -130,6 +144,7 @@
 - Renaldas (ReJ) Zioma
 - Mantas Puida
 - Vladimir Oster
+- Aurimas Petrovas
 - Martin Sternevald
 - Valdemar Bučilko
 - Kuba Cupisz
diff --git a/UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/package.json b/UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/package.json
index e4dad8dab7..4d09c393a7 100644
--- a/UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/package.json
+++ b/UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/package.json
@@ -1,7 +1,7 @@
 {
   "name": "com.unity.barracuda",
   "displayName": "Barracuda",
-  "version": "0.2.2-preview",
+  "version": "0.2.4-preview",
   "unity": "2017.4",
   "description": "Barracuda is lightweight and cross-platform Neural Net inference library. Barracuda supports inference both on GPU and CPU.",
   "dependencies": {}
diff --git a/UnitySDK/Assets/ML-Agents/Scripts/Academy.cs b/UnitySDK/Assets/ML-Agents/Scripts/Academy.cs
index d6cecb4acf..ae66f328cf 100755
--- a/UnitySDK/Assets/ML-Agents/Scripts/Academy.cs
+++ b/UnitySDK/Assets/ML-Agents/Scripts/Academy.cs
@@ -95,7 +95,7 @@ public abstract class Academy : MonoBehaviour
     [SerializeField]
     public BroadcastHub broadcastHub = new BroadcastHub();
 
-    private const string kApiVersion = "API-8";
+    private const string kApiVersion = "API-9";
 
     /// Temporary storage for global gravity value
     /// Used to restore oringal value when deriving Academy modifies it
diff --git a/UnitySDK/Assets/ML-Agents/Scripts/Agent.cs b/UnitySDK/Assets/ML-Agents/Scripts/Agent.cs
index e28b72d763..85f74747ea 100755
--- a/UnitySDK/Assets/ML-Agents/Scripts/Agent.cs
+++ b/UnitySDK/Assets/ML-Agents/Scripts/Agent.cs
@@ -1,4 +1,4 @@
-using System.Collections.Generic;
+using System.Collections.Generic;
 using System.Linq;
 using Google.Protobuf;
 using MLAgents.CommunicatorObjects;
@@ -120,8 +120,9 @@ public CommunicatorObjects.AgentInfoProto ToProto()
                 agentInfoProto.VisualObservations.Add(
                     ByteString.CopyFrom(obs.EncodeToPNG())
                 );
+                Object.Destroy(obs);
             }
-
+            visualObservations.Clear();
             return agentInfoProto;
         }
     }
@@ -314,10 +315,6 @@ public abstract class Agent : MonoBehaviour
     /// Keeps track of the actions that are masked at each step.
     private ActionMasker actionMasker;
 
-    /// Array of Texture2D used to render to from render buffer before
-    /// transforming into float tensor.
-    Texture2D[] textureArray;
-
     ///
     /// Demonstration recorder.
     ///
@@ -327,13 +324,6 @@ public abstract class Agent : MonoBehaviour
     /// becomes enabled or active.
void OnEnable() { - var textureCount = agentParameters.agentCameras.Count+agentParameters.agentRenderTextures.Count; - textureArray = new Texture2D[textureCount]; - for (int i = 0; i < textureCount; i++) - { - textureArray[i] = new Texture2D(1, 1, TextureFormat.RGB24, false); - } - id = gameObject.GetInstanceID(); Academy academy = Object.FindObjectOfType() as Academy; OnEnableHelper(academy); @@ -608,10 +598,10 @@ void SendInfoToBrain() brain.brainParameters.vectorObservationSize, info.vectorObservation.Count)); } - - info.stackedVectorObservation.RemoveRange( - 0, param.vectorObservationSize); - info.stackedVectorObservation.AddRange(info.vectorObservation); + + Utilities.ShiftLeft(info.stackedVectorObservation, param.vectorObservationSize); + Utilities.ReplaceRange(info.stackedVectorObservation, info.vectorObservation, + info.stackedVectorObservation.Count - info.vectorObservation.Count); info.visualObservations.Clear(); var visualObservationCount = agentParameters.agentCameras.Count+agentParameters.agentRenderTextures.Count; @@ -628,24 +618,22 @@ void SendInfoToBrain() //First add all cameras for (int i = 0; i < agentParameters.agentCameras.Count; i++) { - ObservationToTexture( + var obsTexture = ObservationToTexture( agentParameters.agentCameras[i], param.cameraResolutions[i].width, - param.cameraResolutions[i].height, - ref textureArray[i]); - info.visualObservations.Add(textureArray[i]); + param.cameraResolutions[i].height); + info.visualObservations.Add(obsTexture); } //Then add all renderTextures var camCount = agentParameters.agentCameras.Count; for (int i = 0; i < agentParameters.agentRenderTextures.Count; i++) { - ObservationToTexture( + var obsTexture = ObservationToTexture( agentParameters.agentRenderTextures[i], param.cameraResolutions[camCount+i].width, - param.cameraResolutions[camCount+i].height, - ref textureArray[i]); - info.visualObservations.Add(textureArray[i]); + param.cameraResolutions[camCount+i].height); + info.visualObservations.Add(obsTexture); } info.reward = reward; @@ -923,7 +911,7 @@ public void AppendMemoriesAction(List memories) { action.memories.AddRange(memories); } - + public List GetMemoriesAction() { return action.memories; @@ -1116,8 +1104,9 @@ void MakeRequests(int academyStepCounter) /// Width of resulting 2D texture. /// Height of resulting 2D texture. /// Texture2D to render to. 
- public static void ObservationToTexture(Camera obsCamera, int width, int height, ref Texture2D texture2D) + public static Texture2D ObservationToTexture(Camera obsCamera, int width, int height) { + var texture2D = new Texture2D(width, height, TextureFormat.RGB24, false); Rect oldRec = obsCamera.rect; obsCamera.rect = new Rect(0f, 0f, 1f, 1f); var depth = 24; @@ -1127,11 +1116,6 @@ public static void ObservationToTexture(Camera obsCamera, int width, int height, var tempRT = RenderTexture.GetTemporary(width, height, depth, format, readWrite); - if (width != texture2D.width || height != texture2D.height) - { - texture2D.Resize(width, height); - } - var prevActiveRT = RenderTexture.active; var prevCameraRT = obsCamera.targetTexture; @@ -1142,11 +1126,12 @@ public static void ObservationToTexture(Camera obsCamera, int width, int height, obsCamera.Render(); texture2D.ReadPixels(new Rect(0, 0, texture2D.width, texture2D.height), 0, 0); - texture2D.Apply(); + obsCamera.targetTexture = prevCameraRT; obsCamera.rect = oldRec; RenderTexture.active = prevActiveRT; RenderTexture.ReleaseTemporary(tempRT); + return texture2D; } /// @@ -1157,8 +1142,10 @@ public static void ObservationToTexture(Camera obsCamera, int width, int height, /// Width of resulting 2D texture. /// Height of resulting 2D texture. /// Texture2D to render to. - public static void ObservationToTexture(RenderTexture obsTexture, int width, int height, ref Texture2D texture2D) + public static Texture2D ObservationToTexture(RenderTexture obsTexture, int width, int height) { + var texture2D = new Texture2D(width, height, TextureFormat.RGB24, false); + if (width != texture2D.width || height != texture2D.height) { texture2D.Resize(width, height); @@ -1177,6 +1164,7 @@ public static void ObservationToTexture(RenderTexture obsTexture, int width, int texture2D.ReadPixels(new Rect(0, 0, texture2D.width, texture2D.height), 0, 0); texture2D.Apply(); RenderTexture.active = prevActiveRT; + return texture2D; } /// diff --git a/UnitySDK/Assets/ML-Agents/Scripts/InferenceBrain/ApplierImpl.cs b/UnitySDK/Assets/ML-Agents/Scripts/InferenceBrain/ApplierImpl.cs index d92b1c7052..511dcb0dfe 100644 --- a/UnitySDK/Assets/ML-Agents/Scripts/InferenceBrain/ApplierImpl.cs +++ b/UnitySDK/Assets/ML-Agents/Scripts/InferenceBrain/ApplierImpl.cs @@ -1,5 +1,6 @@ using System.Collections.Generic; using System.Linq; +using Barracuda; using MLAgents.InferenceBrain.Utils; using UnityEngine; @@ -11,17 +12,16 @@ namespace MLAgents.InferenceBrain /// public class ContinuousActionOutputApplier : TensorApplier.Applier { - public void Apply(Tensor tensor, Dictionary agentInfo) + public void Apply(TensorProxy tensorProxy, Dictionary agentInfo) { - var tensorDataAction = tensor.Data as float[,]; - var actionSize = tensor.Shape[tensor.Shape.Length - 1]; + var actionSize = tensorProxy.Shape[tensorProxy.Shape.Length - 1]; var agentIndex = 0; foreach (var agent in agentInfo.Keys) { var action = new float[actionSize]; for (var j = 0; j < actionSize; j++) { - action[j] = tensorDataAction[agentIndex, j]; + action[j] = tensorProxy.Data[agentIndex, j]; } agent.UpdateVectorAction(action); agentIndex++; @@ -37,51 +37,54 @@ public class DiscreteActionOutputApplier : TensorApplier.Applier { private int[] _actionSize; private Multinomial _multinomial; + private ITensorAllocator _allocator; - public DiscreteActionOutputApplier(int[] actionSize, int seed) + public DiscreteActionOutputApplier(int[] actionSize, int seed, ITensorAllocator allocator) { _actionSize = actionSize; _multinomial = new 
Multinomial(seed); + _allocator = allocator; } - public void Apply(Tensor tensor, Dictionary agentInfo) + public void Apply(TensorProxy tensorProxy, Dictionary agentInfo) { - var tensorDataProbabilities = tensor.Data as float[,]; + //var tensorDataProbabilities = tensorProxy.Data as float[,]; var batchSize = agentInfo.Keys.Count; var actions = new float[batchSize, _actionSize.Length]; var startActionIndices = Utilities.CumSum(_actionSize); for (var actionIndex=0; actionIndex < _actionSize.Length; actionIndex++) { var nBranchAction = _actionSize[actionIndex]; - var actionProbs = new float[batchSize, nBranchAction]; + var actionProbs = new TensorProxy() + { + ValueType = TensorProxy.TensorType.FloatingPoint, + Shape = new long[]{batchSize, nBranchAction}, + Data = _allocator.Alloc(new TensorShape(batchSize, nBranchAction)) + }; + for (var batchIndex = 0; batchIndex < batchSize; batchIndex++) { for (var branchActionIndex = 0; branchActionIndex < nBranchAction; branchActionIndex++) { - actionProbs[batchIndex, branchActionIndex] = - tensorDataProbabilities[ - batchIndex, startActionIndices[actionIndex] + branchActionIndex]; + actionProbs.Data[batchIndex, branchActionIndex] = + tensorProxy.Data[batchIndex, startActionIndices[actionIndex] + branchActionIndex]; } } - var inputTensor = new Tensor() - { - ValueType = Tensor.TensorType.FloatingPoint, - Shape = new long[]{batchSize, _actionSize[actionIndex]}, - Data = actionProbs - }; - var outputTensor = new Tensor() + + var outputTensor = new TensorProxy() { - ValueType = Tensor.TensorType.FloatingPoint, + ValueType = TensorProxy.TensorType.FloatingPoint, Shape = new long[]{batchSize, 1}, - Data = new float[batchSize, 1] + Data = _allocator.Alloc(new TensorShape(batchSize, 1)) }; - _multinomial.Eval(inputTensor, outputTensor); - var outTensor = outputTensor.Data as float[,]; + + _multinomial.Eval(actionProbs, outputTensor); + for (var ii = 0; ii < batchSize; ii++) { - actions[ii, actionIndex] = outTensor[ii, 0]; + actions[ii, actionIndex] = outputTensor.Data[ii, 0]; } } var agentIndex = 0; @@ -109,11 +112,10 @@ public BarracudaMemoryOutputApplier(int memoriesCount, int memoryIndex) this.memoryIndex = memoryIndex; } - public void Apply(Tensor tensor, Dictionary agentInfo) + public void Apply(TensorProxy tensorProxy, Dictionary agentInfo) { - var tensorDataMemory = tensor.Data as float[,]; var agentIndex = 0; - var memorySize = (int)tensor.Shape[tensor.Shape.Length - 1]; + var memorySize = (int)tensorProxy.Shape[tensorProxy.Shape.Length - 1]; foreach (var agent in agentInfo.Keys) { @@ -127,7 +129,7 @@ public void Apply(Tensor tensor, Dictionary agentInfo) for (var j = 0; j < memorySize; j++) { - memory[memorySize * memoryIndex + j] = tensorDataMemory[agentIndex, j]; + memory[memorySize * memoryIndex + j] = tensorProxy.Data[agentIndex, j]; } agent.UpdateMemoriesAction(memory); @@ -143,17 +145,16 @@ public void Apply(Tensor tensor, Dictionary agentInfo) /// public class MemoryOutputApplier : TensorApplier.Applier { - public void Apply(Tensor tensor, Dictionary agentInfo) + public void Apply(TensorProxy tensorProxy, Dictionary agentInfo) { - var tensorDataMemory = tensor.Data as float[,]; var agentIndex = 0; - var memorySize = tensor.Shape[tensor.Shape.Length - 1]; + var memorySize = tensorProxy.Shape[tensorProxy.Shape.Length - 1]; foreach (var agent in agentInfo.Keys) { var memory = new List(); for (var j = 0; j < memorySize; j++) { - memory.Add(tensorDataMemory[agentIndex, j]); + memory.Add(tensorProxy.Data[agentIndex, j]); } 
agent.UpdateMemoriesAction(memory); @@ -168,13 +169,12 @@ public void Apply(Tensor tensor, Dictionary agentInfo) /// public class ValueEstimateApplier : TensorApplier.Applier { - public void Apply(Tensor tensor, Dictionary agentInfo) + public void Apply(TensorProxy tensorProxy, Dictionary agentInfo) { - var tensorDataValue = tensor.Data as float[,]; var agentIndex = 0; foreach (var agent in agentInfo.Keys) { - agent.UpdateValueAction(tensorDataValue[agentIndex, 0]); + agent.UpdateValueAction(tensorProxy.Data[agentIndex, 0]); agentIndex++; } } diff --git a/UnitySDK/Assets/ML-Agents/Scripts/InferenceBrain/BarracudaModelParamLoader.cs b/UnitySDK/Assets/ML-Agents/Scripts/InferenceBrain/BarracudaModelParamLoader.cs index 96c0f894e5..3541673a85 100644 --- a/UnitySDK/Assets/ML-Agents/Scripts/InferenceBrain/BarracudaModelParamLoader.cs +++ b/UnitySDK/Assets/ML-Agents/Scripts/InferenceBrain/BarracudaModelParamLoader.cs @@ -1,12 +1,9 @@ -#define ENABLE_BARRACUDA -#if ENABLE_BARRACUDA -using System; +using System; using System.Collections.Generic; using System.Linq; using System.Runtime.InteropServices; using Barracuda; -using UnityEngine; -using Tensor = MLAgents.InferenceBrain.Tensor; +using MLAgents.InferenceBrain; namespace MLAgents.InferenceBrain { @@ -57,20 +54,20 @@ private BarracudaModelParamLoader(IWorker engine, Model model, BrainParameters b /// /// Generates the Tensor inputs that are expected to be present in the Model. /// - /// Tensor IEnumerable with the expected Tensor inputs - public IReadOnlyList GetInputTensors() + /// TensorProxy IEnumerable with the expected Tensor inputs + public IReadOnlyList GetInputTensors() { - List tensors = new List(); + List tensors = new List(); if (_model == null) return tensors; foreach (var input in _model.inputs) { - tensors.Add(new Tensor + tensors.Add(new TensorProxy { Name = input.name, - ValueType = Tensor.TensorType.FloatingPoint, + ValueType = TensorProxy.TensorType.FloatingPoint, Data = null, Shape = input.shape.Select(i => (long)i).ToArray() }); @@ -78,13 +75,13 @@ public IReadOnlyList GetInputTensors() foreach (var mem in _model.memories) { - //Debug.Log($"{mem.input}: {mem.shape} -> {BarracudaUtils.FromBarracuda(mem.shape).Length}"); - tensors.Add(new Tensor + //Debug.Log($"{mem.input}: {mem.shape} -> {BarracudaUtils.TensorShapeFromBarracuda(mem.shape).Length}"); + tensors.Add(new TensorProxy { Name = mem.input, - ValueType = Tensor.TensorType.FloatingPoint, + ValueType = TensorProxy.TensorType.FloatingPoint, Data = null, - Shape = BarracudaUtils.FromBarracuda(mem.shape) + Shape = TensorUtils.TensorShapeFromBarracuda(mem.shape) }); } @@ -96,7 +93,7 @@ public IReadOnlyList GetInputTensors() /// /// Generates the Tensor outputs that are expected to be present in the Model. /// - /// Tensor IEnumerable with the expected Tensor outputs + /// TensorProxy IEnumerable with the expected Tensor outputs public string[] GetOutputNames() { var names = new List(); @@ -320,7 +317,7 @@ private void CheckOutputTensorPresence(int memory) private void CheckInputTensorShape() { var tensorTester = - new Dictionary>() + new Dictionary>() { {TensorNames.VectorObservationPlacholder, CheckVectorObsShape}, {TensorNames.PreviousActionPlaceholder, CheckPreviousActionShape}, @@ -363,14 +360,14 @@ private void CheckInputTensorShape() /// Checks that the shape of the Vector Observation input placeholder is the same in the /// model and in the Brain Parameters. 
/// - /// The tensor that is expected by the model + /// The tensor that is expected by the model /// If the Check failed, returns a string containing information about why the /// check failed. If the check passed, returns null. - private string CheckVectorObsShape(Tensor tensor) + private string CheckVectorObsShape(TensorProxy tensorProxy) { var vecObsSizeBp = _brainParameters.vectorObservationSize; var numStackedVector = _brainParameters.numStackedVectorObservations; - var totalVecObsSizeT = tensor.Shape[tensor.Shape.Length - 1]; + var totalVecObsSizeT = tensorProxy.Shape[tensorProxy.Shape.Length - 1]; if (vecObsSizeBp * numStackedVector != totalVecObsSizeT) { return string.Format( @@ -385,13 +382,13 @@ private string CheckVectorObsShape(Tensor tensor) /// Checks that the shape of the Previous Vector Action input placeholder is the same in the /// model and in the Brain Parameters. /// - /// The tensor that is expected by the model + /// The tensor that is expected by the model /// If the Check failed, returns a string containing information about why the /// check failed. If the check passed, returns null. - private string CheckPreviousActionShape(Tensor tensor) + private string CheckPreviousActionShape(TensorProxy tensorProxy) { var numberActionsBp = _brainParameters.vectorActionSize.Length; - var numberActionsT = tensor.Shape[tensor.Shape.Length - 1]; + var numberActionsT = tensorProxy.Shape[tensorProxy.Shape.Length - 1]; if (numberActionsBp != numberActionsT) { return string.Format( @@ -406,24 +403,24 @@ private string CheckPreviousActionShape(Tensor tensor) /// Checks that the shape of the visual observation input placeholder is the same in the /// model and in the Brain Parameters. /// - /// The tensor that is expected by the model + /// The tensor that is expected by the model /// The index of the visual observation. /// If the Check failed, returns a string containing information about why the /// check failed. If the check passed, returns null. - private string CheckVisualObsShape(Tensor tensor, int visObsIndex) + private string CheckVisualObsShape(TensorProxy tensorProxy, int visObsIndex) { var resolutionBp = _brainParameters.cameraResolutions[visObsIndex]; var widthBp = resolutionBp.width; var heightBp = resolutionBp.height; var pixelBp = resolutionBp.blackAndWhite ? 1 : 3; - var heightT = tensor.Shape[1]; - var widthT = tensor.Shape[2]; - var pixelT = tensor.Shape[3]; + var heightT = tensorProxy.Shape[1]; + var widthT = tensorProxy.Shape[2]; + var pixelT = tensorProxy.Shape[3]; if ((widthBp != widthT) || (heightBp != heightT) || (pixelBp != pixelT)) { return string.Format( "The visual Observation {0} of the model does not match. 
" + - "Received Tensor of shape [?x{1}x{2}x{3}] but was expecting [?x{4}x{5}x{6}].", + "Received TensorProxy of shape [?x{1}x{2}x{3}] but was expecting [?x{4}x{5}x{6}].", visObsIndex, widthBp, heightBp, pixelBp, widthT, heightT, pixelT); } return null; @@ -531,89 +528,4 @@ private string CheckContinuousActionOutputShape(TensorShape shape, int modelActi return null; } } -} - -public class BarracudaUtils -{ - private static Array LinearizeArray(Array src) - { - var elementType = src.GetType().GetElementType(); - var elementSize = Marshal.SizeOf(elementType); - var dest = Array.CreateInstance(elementType, src.Length); - Buffer.BlockCopy(src, 0, dest, 0, src.Length * elementSize); - return dest; - } - - protected static Barracuda.TensorShape ToBarracuda(long[] src) - { - if (src.Length > 4) - throw new NotImplementedException("Barracuda does not support Tensor shapes with rank higher than 4"); - - var shape = new int[4]; - - if (src.Length == 2) - { - shape[0] = (int)src[0]; - shape[1] = 1; - shape[2] = 1; - shape[3] = (int)src[1]; - } - else - { - for (var axis = 0; axis < src.Length; ++axis) - shape[shape.Length-axis-1] = (int)src[src.Length-axis-1]; - } - - return new Barracuda.TensorShape(shape); - } - - private static float[] IntArrayToFloatArray(int[] src) - { - var dest = new float[src.Length]; - for (var i = 0; i < src.Length; i++) - dest[i] = (float) src[i]; - - return dest; - } - - public static Barracuda.Tensor ToBarracuda(MLAgents.InferenceBrain.Tensor src) - { - Array linearArray = LinearizeArray(src.Data); - - if (linearArray.GetType().GetElementType() == typeof(int)) - linearArray = IntArrayToFloatArray(linearArray as int[]); - - var shape = ToBarracuda(src.Shape); - return new Barracuda.Tensor(shape, linearArray as float[], src.Name); - } - - internal static long[] FromBarracuda(Barracuda.TensorShape src) - { - if (src.height == 1 && src.width == 1) - return new long[2] {src.batch, src.channels}; - - return new long[4] {src.batch, src.height, src.width, src.channels}; - } - - private static Array ReshapeArray(Array src, long[] shape) - { - var elementType = src.GetType().GetElementType(); - var elementSize = Marshal.SizeOf(elementType); - var dest = Array.CreateInstance(elementType, shape); - Buffer.BlockCopy(src, 0, dest, 0, dest.Length * elementSize); - return dest; - } - - public static Tensor FromBarracuda(Barracuda.Tensor src, string nameOverride = null) - { - var shape = FromBarracuda(src.shape); - return new Tensor - { - Name = nameOverride ?? 
src.name, - ValueType = Tensor.TensorType.FloatingPoint, - Shape = shape, - Data = ReshapeArray(src.data.Download(src.length), shape) - }; - } -} -#endif +} \ No newline at end of file diff --git a/UnitySDK/Assets/ML-Agents/Scripts/InferenceBrain/GeneratorImpl.cs b/UnitySDK/Assets/ML-Agents/Scripts/InferenceBrain/GeneratorImpl.cs index e5cf4bfd88..742c51b663 100644 --- a/UnitySDK/Assets/ML-Agents/Scripts/InferenceBrain/GeneratorImpl.cs +++ b/UnitySDK/Assets/ML-Agents/Scripts/InferenceBrain/GeneratorImpl.cs @@ -13,18 +13,16 @@ namespace MLAgents.InferenceBrain /// public class BiDimensionalOutputGenerator : TensorGenerator.Generator { - public void Generate(Tensor tensor, int batchSize, Dictionary agentInfo) + private ITensorAllocator _allocator; + + public BiDimensionalOutputGenerator(ITensorAllocator allocator) { - var shapeSecondAxis = tensor.Shape[tensor.Shape.Length - 1]; - tensor.Shape[0] = batchSize; - if (tensor.ValueType == Tensor.TensorType.FloatingPoint) - { - tensor.Data = new float[batchSize, shapeSecondAxis]; - } - else - { - tensor.Data = new int[batchSize, shapeSecondAxis]; - } + _allocator = allocator; + } + + public void Generate(TensorProxy tensorProxy, int batchSize, Dictionary agentInfo) + { + TensorUtils.ResizeTensor(tensorProxy, batchSize, _allocator); } } @@ -34,9 +32,17 @@ public void Generate(Tensor tensor, int batchSize, Dictionary /// public class BatchSizeGenerator : TensorGenerator.Generator { - public void Generate(Tensor tensor, int batchSize, Dictionary agentInfo) + private ITensorAllocator _allocator; + + public BatchSizeGenerator(ITensorAllocator allocator) + { + _allocator = allocator; + } + + public void Generate(TensorProxy tensorProxy, int batchSize, Dictionary agentInfo) { - tensor.Data = new int[] {batchSize}; + tensorProxy.Data = _allocator.Alloc(new TensorShape(1,1)); + tensorProxy.Data[0] = batchSize; } } @@ -48,10 +54,19 @@ public void Generate(Tensor tensor, int batchSize, Dictionary /// public class SequenceLengthGenerator : TensorGenerator.Generator { - public void Generate(Tensor tensor, int batchSize, Dictionary agentInfo) + private ITensorAllocator _allocator; + + public SequenceLengthGenerator(ITensorAllocator allocator) + { + _allocator = allocator; + } + + public void Generate(TensorProxy tensorProxy, int batchSize, Dictionary agentInfo) { - tensor.Shape = new long[0]; - tensor.Data = new int[] {1}; + tensorProxy.Shape = new long[0]; + tensorProxy.Data = _allocator.Alloc(new TensorShape(1,1)); + + tensorProxy.Data[0] = 1; } } @@ -63,19 +78,24 @@ public void Generate(Tensor tensor, int batchSize, Dictionary /// public class VectorObservationGenerator : TensorGenerator.Generator { - public void Generate(Tensor tensor, int batchSize, Dictionary agentInfo) + private ITensorAllocator _allocator; + public VectorObservationGenerator(ITensorAllocator allocator) + { + _allocator = allocator; + } + + public void Generate(TensorProxy tensorProxy, int batchSize, Dictionary agentInfo) { - tensor.Shape[0] = batchSize; - var vecObsSizeT = tensor.Shape[tensor.Shape.Length - 1]; - var floatArray = new float[batchSize, vecObsSizeT]; - tensor.Data = floatArray; + TensorUtils.ResizeTensor(tensorProxy, batchSize, _allocator); + var vecObsSizeT = tensorProxy.Shape[tensorProxy.Shape.Length - 1]; + var agentIndex = 0; foreach (var agent in agentInfo.Keys) { var vectorObs = agentInfo[agent].stackedVectorObservation; for (var j = 0; j < vecObsSizeT; j++) { - floatArray[agentIndex, j] = vectorObs[j]; + tensorProxy.Data[agentIndex, j] = vectorObs[j]; } 
agentIndex++; } @@ -90,12 +110,18 @@ public void Generate(Tensor tensor, int batchSize, Dictionary /// public class RecurrentInputGenerator : TensorGenerator.Generator { - public void Generate(Tensor tensor, int batchSize, Dictionary agentInfo) + private ITensorAllocator _allocator; + + public RecurrentInputGenerator(ITensorAllocator allocator) + { + _allocator = allocator; + } + + public void Generate(TensorProxy tensorProxy, int batchSize, Dictionary agentInfo) { - tensor.Shape[0] = batchSize; - var memorySize = tensor.Shape[tensor.Shape.Length - 1]; - var floatArray = new float[batchSize, memorySize]; - tensor.Data = floatArray; + TensorUtils.ResizeTensor(tensorProxy, batchSize, _allocator); + + var memorySize = tensorProxy.Shape[tensorProxy.Shape.Length - 1]; var agentIndex = 0; foreach (var agent in agentInfo.Keys) { @@ -111,7 +137,7 @@ public void Generate(Tensor tensor, int batchSize, Dictionary { break; } - floatArray[agentIndex, j] = memory[j]; + tensorProxy.Data[agentIndex, j] = memory[j]; } agentIndex++; } @@ -122,19 +148,19 @@ public class BarracudaRecurrentInputGenerator : TensorGenerator.Generator { private int memoriesCount; private int memoryIndex; + private ITensorAllocator _allocator; - public BarracudaRecurrentInputGenerator(int memoryIndex) + public BarracudaRecurrentInputGenerator(int memoryIndex, ITensorAllocator allocator) { this.memoryIndex = memoryIndex; + _allocator = allocator; } - public void Generate(Tensor tensor, int batchSize, Dictionary agentInfo) + public void Generate(TensorProxy tensorProxy, int batchSize, Dictionary agentInfo) { - tensor.Shape[0] = batchSize; - - var memorySize = (int)tensor.Shape[tensor.Shape.Length - 1]; + TensorUtils.ResizeTensor(tensorProxy, batchSize, _allocator); - tensor.Data = new float[batchSize, memorySize]; + var memorySize = (int)tensorProxy.Shape[tensorProxy.Shape.Length - 1]; var agentIndex = 0; foreach (var agent in agentInfo.Keys) { @@ -153,7 +179,7 @@ public void Generate(Tensor tensor, int batchSize, Dictionary { break; } - tensor.Data.SetValue(memory[j + offset], new int[2] {agentIndex, j}); + tensorProxy.Data[agentIndex, j] = memory[j + offset]; } agentIndex++; } @@ -168,19 +194,25 @@ public void Generate(Tensor tensor, int batchSize, Dictionary /// public class PreviousActionInputGenerator : TensorGenerator.Generator { - public void Generate(Tensor tensor, int batchSize, Dictionary agentInfo) + private ITensorAllocator _allocator; + + public PreviousActionInputGenerator(ITensorAllocator allocator) { - tensor.Shape[0] = batchSize; - var actionSize = tensor.Shape[tensor.Shape.Length - 1]; - var intArray = new int[batchSize, actionSize]; - tensor.Data = intArray; + _allocator = allocator; + } + + public void Generate(TensorProxy tensorProxy, int batchSize, Dictionary agentInfo) + { + TensorUtils.ResizeTensor(tensorProxy, batchSize, _allocator); + + var actionSize = tensorProxy.Shape[tensorProxy.Shape.Length - 1]; var agentIndex = 0; foreach (var agent in agentInfo.Keys) { var pastAction = agentInfo[agent].storedVectorActions; for (var j = 0; j < actionSize; j++) { - intArray[agentIndex, j] = (int) pastAction[j]; + tensorProxy.Data[agentIndex, j] = pastAction[j]; } agentIndex++; @@ -196,12 +228,18 @@ public void Generate(Tensor tensor, int batchSize, Dictionary /// public class ActionMaskInputGenerator : TensorGenerator.Generator { - public void Generate(Tensor tensor, int batchSize, Dictionary agentInfo) + private ITensorAllocator _allocator; + + public ActionMaskInputGenerator(ITensorAllocator allocator) { - 
tensor.Shape[0] = batchSize; - var maskSize = tensor.Shape[tensor.Shape.Length - 1]; - var floatArray = new float[batchSize, maskSize]; - tensor.Data = floatArray; + _allocator = allocator; + } + + public void Generate(TensorProxy tensorProxy, int batchSize, Dictionary agentInfo) + { + TensorUtils.ResizeTensor(tensorProxy, batchSize, _allocator); + + var maskSize = tensorProxy.Shape[tensorProxy.Shape.Length - 1]; var agentIndex = 0; foreach (var agent in agentInfo.Keys) { @@ -209,7 +247,7 @@ public void Generate(Tensor tensor, int batchSize, Dictionary for (var j = 0; j < maskSize; j++) { var isUnmasked = (maskList != null && maskList[j]) ? 0.0f : 1.0f; - floatArray[agentIndex, j] = isUnmasked; + tensorProxy.Data[agentIndex, j] = isUnmasked; } agentIndex++; } @@ -224,18 +262,18 @@ public void Generate(Tensor tensor, int batchSize, Dictionary public class RandomNormalInputGenerator : TensorGenerator.Generator { private RandomNormal _randomNormal; + private ITensorAllocator _allocator; - public RandomNormalInputGenerator(int seed) + public RandomNormalInputGenerator(int seed, ITensorAllocator allocator) { _randomNormal = new RandomNormal(seed); + _allocator = allocator; } - public void Generate(Tensor tensor, int batchSize, Dictionary agentInfo) + public void Generate(TensorProxy tensorProxy, int batchSize, Dictionary agentInfo) { - tensor.Shape[0] = batchSize; - var actionSize = tensor.Shape[tensor.Shape.Length - 1]; - tensor.Data = new float[batchSize, actionSize]; - _randomNormal.FillTensor(tensor); + TensorUtils.ResizeTensor(tensorProxy, batchSize, _allocator); + _randomNormal.FillTensor(tensorProxy); } } @@ -249,18 +287,22 @@ public class VisualObservationInputGenerator : TensorGenerator.Generator { private int _index; private bool _grayScale; - public VisualObservationInputGenerator(int index, bool grayScale) + private ITensorAllocator _allocator; + + public VisualObservationInputGenerator(int index, bool grayScale, ITensorAllocator allocator) { _index = index; _grayScale = grayScale; + _allocator = allocator; } - public void Generate(Tensor tensor, int batchSize, Dictionary agentInfo) + public void Generate(TensorProxy tensorProxy, int batchSize, Dictionary agentInfo) { var textures = agentInfo.Keys.Select( agent => agentInfo[agent].visualObservations[_index]).ToList(); - tensor.Data = Utilities.TextureToFloatArray(textures, _grayScale); - tensor.Shape[0] = textures.Count; + + TensorUtils.ResizeTensor(tensorProxy, batchSize, _allocator); + Utilities.TextureToTensorProxy(tensorProxy, textures, _grayScale, _allocator); } } } diff --git a/UnitySDK/Assets/ML-Agents/Scripts/InferenceBrain/ModelParamLoader.cs b/UnitySDK/Assets/ML-Agents/Scripts/InferenceBrain/ModelParamLoader.cs index 25554a8972..ab13401c16 100644 --- a/UnitySDK/Assets/ML-Agents/Scripts/InferenceBrain/ModelParamLoader.cs +++ b/UnitySDK/Assets/ML-Agents/Scripts/InferenceBrain/ModelParamLoader.cs @@ -1,7 +1,9 @@ #if ENABLE_TENSORFLOW using System; using System.Collections.Generic; +using System.Diagnostics; using System.Linq; +using Barracuda; namespace MLAgents.InferenceBrain { @@ -48,8 +50,8 @@ private ModelParamLoader(TFSharpInferenceEngine engine, BrainParameters brainPar /// /// Generates the Tensor inputs that are expected to be present in the Model. 
/// - /// Tensor IEnumerable with the expected Tensor inputs - public IReadOnlyList GetInputTensors() + /// TensorProxy IEnumerable with the expected Tensor inputs + public IReadOnlyList GetInputTensors() { return _engine?.InputFeatures(); } @@ -57,48 +59,48 @@ public IReadOnlyList GetInputTensors() /// /// Generates the Tensor outputs that are expected to be present in the Model. /// - /// Tensor IEnumerable with the expected Tensor outputs - public IReadOnlyList GetOutputTensors() + /// TensorProxy IEnumerable with the expected Tensor outputs + public IReadOnlyList GetOutputTensors() { - var tensorList = new List(); + var tensorList = new List(); if (_brainParameters.vectorActionSpaceType == SpaceType.continuous) { - tensorList.Add(new Tensor() + tensorList.Add(new TensorProxy() { Name = TensorNames.ActionOutput, Shape = new long[] { -1, _brainParameters.vectorActionSize[0] }, - ValueType = Tensor.TensorType.FloatingPoint, + ValueType = TensorProxy.TensorType.FloatingPoint, Data = null }); } else { tensorList.Add( - new Tensor() + new TensorProxy() { Name = TensorNames.ActionOutput, Shape = new long[] { -1, _brainParameters.vectorActionSize.Sum() }, - ValueType = Tensor.TensorType.FloatingPoint, + ValueType = TensorProxy.TensorType.FloatingPoint, Data = null }); } var memory = GetIntScalar(TensorNames.MemorySize); if (memory > 0) { - tensorList.Add(new Tensor() + tensorList.Add(new TensorProxy() { Name = TensorNames.RecurrentOutput, Shape = new long[2] { -1, memory }, - ValueType = Tensor.TensorType.FloatingPoint, + ValueType = TensorProxy.TensorType.FloatingPoint, Data = null }); } @@ -114,25 +116,26 @@ public IReadOnlyList GetOutputTensors() /// The value of the scalar variable in the model. (-1 if not found) private int GetIntScalar(string name) { - var outputs = new Tensor[] + var outputs = new TensorProxy[] { - new Tensor() + new TensorProxy() { Name = name, - ValueType = Tensor.TensorType.Integer, + ValueType = TensorProxy.TensorType.Integer, Shape = new long[] { }, - Data = new long[1] + Data = new Tensor(1,1) }, }; try { - _engine.ExecuteGraph(new Tensor[0], outputs); + _engine.ExecuteGraph(new TensorProxy[0], outputs); } - catch + catch (Exception ex) { + UnityEngine.Debug.LogError($"Failed to execute GetIntScalar()\n{ex}"); return -1; } - return (outputs[0].Data as int[])[0]; + return (int)outputs[0].Data[0]; } /// @@ -319,7 +322,7 @@ private void CheckOutputTensorPresence(int memory) private void CheckInputTensorShape() { var tensorTester = - new Dictionary>() + new Dictionary>() { {TensorNames.VectorObservationPlacholder, CheckVectorObsShape}, {TensorNames.PreviousActionPlaceholder, CheckPreviousActionShape}, @@ -361,7 +364,7 @@ private void CheckInputTensorShape() /// The tensor that is expected by the model /// If the Check failed, returns a string containing information about why the /// check failed. If the check passed, returns null. - private string CheckVectorObsShape(Tensor tensor) + private string CheckVectorObsShape(TensorProxy tensor) { var vecObsSizeBp = _brainParameters.vectorObservationSize; var numStackedVector = _brainParameters.numStackedVectorObservations; @@ -383,7 +386,7 @@ private string CheckVectorObsShape(Tensor tensor) /// The tensor that is expected by the model /// If the Check failed, returns a string containing information about why the /// check failed. If the check passed, returns null. 
- private string CheckPreviousActionShape(Tensor tensor) + private string CheckPreviousActionShape(TensorProxy tensor) { var numberActionsBp = _brainParameters.vectorActionSize.Length; var numberActionsT = tensor.Shape[1]; @@ -405,7 +408,7 @@ private string CheckPreviousActionShape(Tensor tensor) /// The index of the visual observation. /// If the Check failed, returns a string containing information about why the /// check failed. If the check passed, returns null. - private string CheckVisualObsShape(Tensor tensor, int visObsIndex) + private string CheckVisualObsShape(TensorProxy tensor, int visObsIndex) { var resolutionBp = _brainParameters.cameraResolutions[visObsIndex]; var widthBp = resolutionBp.width; @@ -418,7 +421,7 @@ private string CheckVisualObsShape(Tensor tensor, int visObsIndex) { return string.Format( "The visual Observation {0} of the model does not match. " + - "Received Tensor of shape [?x{1}x{2}x{3}] but was expecting [?x{4}x{5}x{6}].", + "Received TensorProxy of shape [?x{1}x{2}x{3}] but was expecting [?x{4}x{5}x{6}].", visObsIndex, widthBp, heightBp, pixelBp, widthT, heightT, pixelT); } return null; @@ -458,7 +461,7 @@ private void CheckOutputTensorShape(ModelActionType isContinuous, int modelActio "suggest Continuous Control."); return; } - var tensorTester = new Dictionary>(); + var tensorTester = new Dictionary>(); if (_brainParameters.vectorActionSpaceType == SpaceType.continuous) { tensorTester[TensorNames.ActionOutput] = CheckContinuousActionOutputShape; @@ -491,7 +494,7 @@ private void CheckOutputTensorShape(ModelActionType isContinuous, int modelActio /// by the model. /// If the Check failed, returns a string containing information about why the /// check failed. If the check passed, returns null. - private string CheckDiscreteActionOutputShape(Tensor tensor, int modelActionSize) + private string CheckDiscreteActionOutputShape(TensorProxy tensor, int modelActionSize) { var bpActionSize = _brainParameters.vectorActionSize.Sum(); if (modelActionSize != bpActionSize) @@ -513,7 +516,7 @@ private string CheckDiscreteActionOutputShape(Tensor tensor, int modelActionSize /// by the model. /// If the Check failed, returns a string containing information about why the /// check failed. If the check passed, returns null. 
- private string CheckContinuousActionOutputShape(Tensor tensor, int modelActionSize) + private string CheckContinuousActionOutputShape(TensorProxy tensor, int modelActionSize) { var bpActionSize = _brainParameters.vectorActionSize[0]; if (modelActionSize != bpActionSize) diff --git a/UnitySDK/Assets/ML-Agents/Scripts/InferenceBrain/TFSharpInferenceEngine.cs b/UnitySDK/Assets/ML-Agents/Scripts/InferenceBrain/TFSharpInferenceEngine.cs index 6e30c8b30c..8b5d93295f 100644 --- a/UnitySDK/Assets/ML-Agents/Scripts/InferenceBrain/TFSharpInferenceEngine.cs +++ b/UnitySDK/Assets/ML-Agents/Scripts/InferenceBrain/TFSharpInferenceEngine.cs @@ -5,6 +5,7 @@ using System; using UnityEngine.Profiling; using System.Runtime.InteropServices; +using Barracuda; using UnityEngine; namespace MLAgents.InferenceBrain @@ -37,20 +38,20 @@ public void PrepareModel(byte[] model) Profiler.EndSample(); } - public int ExecuteGraph(IEnumerable inputs_it, IEnumerable outputs_it) + public int ExecuteGraph(IEnumerable inputs_it, IEnumerable outputs_it) { Profiler.BeginSample("TFSharpInferenceComponent.ExecuteGraph"); - Tensor[] inputs = inputs_it.ToArray(); - Tensor[] outputs = outputs_it.ToArray(); + TensorProxy[] inputs = inputs_it.ToArray(); + TensorProxy[] outputs = outputs_it.ToArray(); // TODO: Can/should we pre-allocate that? TFSession.Runner runner = m_session.GetRunner(); - inputs.ToList().ForEach((Tensor input) => - { + inputs.ToList().ForEach((TensorProxy input) => + { if (input.Shape.Length == 0) { - var data = input.Data.GetValue(0); + var data = input.Data[0]; if (input.DataType == typeof(int)) { runner.AddInput(m_graph[input.Name][0], (int)data); @@ -62,7 +63,9 @@ public int ExecuteGraph(IEnumerable inputs_it, IEnumerable outpu } else { - runner.AddInput(m_graph[input.Name][0], input.Data); + runner.AddInput(m_graph[input.Name][0], input.DataType == typeof(int) ? + TensorUtils.BarracudaToIntArray(input.Data) : + TensorUtils.BarracudaToFloatArray(input.Data)); } }); @@ -87,12 +90,12 @@ public int ExecuteGraph(IEnumerable inputs_it, IEnumerable outpu if (outputs[i].Shape.Length == 0) { // Handle scalars - outputs[i].Data = Array.CreateInstance(outputs[i].DataType, new long[1] {1}); - outputs[i].Data.SetValue(out_tensors[i].GetValue(), 0); + outputs[i].Data = new Tensor(1,1); + outputs[i].Data[0] = (float)(int)out_tensors[i].GetValue(); } else { - outputs[i].Data = out_tensors[i].GetValue() as Array; + outputs[i].Data = TensorUtils.ArrayToBarracuda(out_tensors[i].GetValue() as Array); } } @@ -109,7 +112,7 @@ public int ExecuteGraph(IEnumerable inputs_it, IEnumerable outpu private static extern unsafe void TF_OperationGetAttrShape(IntPtr oper, string attr_name, long[] value, int num_dims, IntPtr status); - private Tensor GetOpMetadata(TFOperation op) + private TensorProxy GetOpMetadata(TFOperation op) { TFStatus status = new TFStatus(); @@ -118,8 +121,8 @@ private Tensor GetOpMetadata(TFOperation op) var shape_attr = op.GetAttributeMetadata("shape", status); if (!status.Ok || shape_attr.TotalSize <= 0) { - Debug.LogWarning("Operation " + op.Name + " does not contain shape attribute or it" + - " doesn't contain valid shape data!"); + Debug.LogWarning($"Operation {op.Name} does not contain shape attribute or it" + + $" doesn't contain valid shape data! 
Status: {status.StatusMessage}"); } else { @@ -170,14 +173,14 @@ private Tensor GetOpMetadata(TFOperation op) } } - Tensor.TensorType placeholder_type = Tensor.TensorType.FloatingPoint; + TensorProxy.TensorType placeholder_type = TensorProxy.TensorType.FloatingPoint; switch (type_value) { case TFDataType.Float: - placeholder_type = Tensor.TensorType.FloatingPoint; + placeholder_type = TensorProxy.TensorType.FloatingPoint; break; case TFDataType.Int32: - placeholder_type = Tensor.TensorType.Integer; + placeholder_type = TensorProxy.TensorType.Integer; break; default: Debug.LogWarning("Operation " + op.Name + @@ -185,7 +188,7 @@ private Tensor GetOpMetadata(TFOperation op) break; } - Tensor t = new Tensor + TensorProxy t = new TensorProxy { Data = null, Name = op.Name, @@ -195,9 +198,9 @@ private Tensor GetOpMetadata(TFOperation op) return t; } - public IReadOnlyList InputFeatures() + public IReadOnlyList InputFeatures() { - List inputs = new List(); + List inputs = new List(); foreach (var op in m_graph.GetEnumerator()) { if (op.OpType == "Placeholder") diff --git a/UnitySDK/Assets/ML-Agents/Scripts/InferenceBrain/Tensor.cs b/UnitySDK/Assets/ML-Agents/Scripts/InferenceBrain/Tensor.cs deleted file mode 100644 index 54dd42aedf..0000000000 --- a/UnitySDK/Assets/ML-Agents/Scripts/InferenceBrain/Tensor.cs +++ /dev/null @@ -1,40 +0,0 @@ -using System; -using System.Collections.Generic; - -namespace MLAgents.InferenceBrain -{ - - /// - /// Tensor - A class to encapsulate a Tensor used for inference. - /// - /// This class contains the Array that holds the data array, the shapes, type and the placeholder in the - /// execution graph. All the fields are editable in the inspector, allowing the user to specify everything - /// but the data in a graphical way. - /// - [System.Serializable] - public class Tensor - { - public enum TensorType - { - Integer, - FloatingPoint - }; - - private static Dictionary m_typeMap = new Dictionary() - { - { TensorType.FloatingPoint, typeof(float)}, - {TensorType.Integer, typeof(int)} - }; - - public string Name; - public TensorType ValueType; - // Since Type is not serializable, we use the DisplayType for the Inspector - public Type DataType - { - get { return m_typeMap[ValueType]; } - } - public long[] Shape; - public Array Data; - } - -} diff --git a/UnitySDK/Assets/ML-Agents/Scripts/InferenceBrain/TensorApplier.cs b/UnitySDK/Assets/ML-Agents/Scripts/InferenceBrain/TensorApplier.cs index 20ef8b6889..6657c7c5b1 100644 --- a/UnitySDK/Assets/ML-Agents/Scripts/InferenceBrain/TensorApplier.cs +++ b/UnitySDK/Assets/ML-Agents/Scripts/InferenceBrain/TensorApplier.cs @@ -1,33 +1,33 @@ -#define ENABLE_BARRACUDA -using System.Collections.Generic; +using System.Collections.Generic; +using Barracuda; namespace MLAgents.InferenceBrain { /// - /// Mapping between the output Tensor names and the method that will use the + /// Mapping between the output tensor names and the method that will use the /// output tensors and the Agents present in the batch to update their action, memories and /// value estimates. /// A TensorApplier implements a Dictionary of strings (node names) to an Action. - /// This action takes as input the Tensor and the Dictionary of Agent to AgentInfo for + /// This action takes as input the tensor and the Dictionary of Agent to AgentInfo for /// the current batch. /// public class TensorApplier { /// - /// A tensor Applier's Execute method takes a Tensor and a Dictionary of Agent to AgentInfo. 
- /// Uses the data contained inside the Tensor to modify the state of the Agent. The Tensors + /// A tensor Applier's Execute method takes a tensor and a Dictionary of Agent to AgentInfo. + /// Uses the data contained inside the tensor to modify the state of the Agent. The Tensors /// are assumed to have the batch size on the first dimension and the agents to be ordered - /// the same way in the dictionary and in the Tensor. + /// the same way in the dictionary and in the tensor. /// public interface Applier { /// /// Applies the values in the Tensor to the Agents present in the agentInfos /// - /// The Tensor containing the data to be applied to the Agents + /// The Tensor containing the data to be applied to the Agents /// Dictionary of Agents to AgentInfo that will reveive /// the values of the Tensor. - void Apply(Tensor tensor, Dictionary agentInfo); + void Apply(TensorProxy tensorProxy, Dictionary agentInfo); } Dictionary _dict = new Dictionary(); @@ -38,7 +38,8 @@ public interface Applier /// The BrainParameters used to determine what Appliers will be /// used /// The seed the Appliers will be initialized with. - public TensorApplier(BrainParameters bp, int seed, object barracudaModel = null) + /// Tensor allocator + public TensorApplier(BrainParameters bp, int seed, ITensorAllocator allocator, object barracudaModel = null) { _dict[TensorNames.ValueEstimateOutput] = new ValueEstimateApplier(); if (bp.vectorActionSpaceType == SpaceType.continuous) @@ -47,19 +48,19 @@ public TensorApplier(BrainParameters bp, int seed, object barracudaModel = null) } else { - _dict[TensorNames.ActionOutput] = new DiscreteActionOutputApplier( - bp.vectorActionSize, seed); + _dict[TensorNames.ActionOutput] = new DiscreteActionOutputApplier(bp.vectorActionSize, seed, allocator); } _dict[TensorNames.RecurrentOutput] = new MemoryOutputApplier(); - - #if ENABLE_BARRACUDA - Barracuda.Model model = (Barracuda.Model) barracudaModel; - for (var i = 0; i < model?.memories.Length; i++) + if (barracudaModel != null) { - _dict[model.memories[i].output] = new BarracudaMemoryOutputApplier(model.memories.Length, i); + Model model = (Model) barracudaModel; + + for (var i = 0; i < model?.memories.Length; i++) + { + _dict[model.memories[i].output] = new BarracudaMemoryOutputApplier(model.memories.Length, i); + } } - #endif } /// @@ -71,14 +72,14 @@ public TensorApplier(BrainParameters bp, int seed, object barracudaModel = null) /// One of the tensor does not have an /// associated applier. public void ApplyTensors( - IEnumerable tensors, Dictionary agentInfos) + IEnumerable tensors, Dictionary agentInfos) { foreach (var tensor in tensors) { if (!_dict.ContainsKey(tensor.Name)) { throw new UnityAgentsException( - "Unknow tensor expected as output : "+tensor.Name); + "Unknow tensorProxy expected as output : "+tensor.Name); } _dict[tensor.Name].Apply(tensor, agentInfos); } diff --git a/UnitySDK/Assets/ML-Agents/Scripts/InferenceBrain/TensorGenerator.cs b/UnitySDK/Assets/ML-Agents/Scripts/InferenceBrain/TensorGenerator.cs index 46d3f760e7..0bbf629b11 100644 --- a/UnitySDK/Assets/ML-Agents/Scripts/InferenceBrain/TensorGenerator.cs +++ b/UnitySDK/Assets/ML-Agents/Scripts/InferenceBrain/TensorGenerator.cs @@ -1,106 +1,111 @@ -#define ENABLE_BARRACUDA -using System.Collections.Generic; -using Barracuda; - -namespace MLAgents.InferenceBrain -{ - /// - /// Mapping between Tensor names and generators. - /// A TensorGenerator implements a Dictionary of strings (node names) to an Action. 
- /// The Action take as argument the tensor, the current batch size and a Dictionary of - /// Agent to AgentInfo corresponding to the current batch. - /// Each Generator reshapes and fills the data of the tensor based of the data of the batch. - /// When the Tensor is an Input to the model, the shape of the Tensor will be modified - /// depending on the current batch size and the data of the Tensor will be filled using the - /// Dictionary of Agent to AgentInfo. - /// When the Tensor is an Output of the model, only the shape of the Tensor will be modified - /// using the current batch size. The data will be prefilled with zeros. - /// - public class TensorGenerator - { - public interface Generator - { - /// - /// Modifies the data inside a Tensor according to the information contained in the - /// AgentInfos contained in the current batch. - /// - /// The tensor the data and shape will be modified - /// The number of agents present in the current batch - /// Dictionary of Agent to AgentInfo containing the - /// information that will be used to populate the tensor's data - void Generate(Tensor tensor, int batchSize, Dictionary agentInfo); - } - - Dictionary _dict = new Dictionary(); - - /// - /// Returns a new TensorGenerators object. - /// - /// The BrainParameters used to determine what Generators will be - /// used - /// The seed the Generators will be initialized with. - public TensorGenerator(BrainParameters bp, int seed, object barracudaModel = null) - { - // Generator for Inputs - _dict[TensorNames.BatchSizePlaceholder] = new BatchSizeGenerator(); - _dict[TensorNames.SequenceLengthPlaceholder] = new SequenceLengthGenerator(); - _dict[TensorNames.VectorObservationPlacholder] = new VectorObservationGenerator(); - _dict[TensorNames.RecurrentInPlaceholder] = new RecurrentInputGenerator(); - - #if ENABLE_BARRACUDA - Barracuda.Model model = (Barracuda.Model) barracudaModel; - for (var i = 0; i < model?.memories.Length; i++) - { - _dict[model.memories[i].input] = new BarracudaRecurrentInputGenerator(i); - } - #endif - - _dict[TensorNames.PreviousActionPlaceholder] = new PreviousActionInputGenerator(); - _dict[TensorNames.ActionMaskPlaceholder] = new ActionMaskInputGenerator(); - _dict[TensorNames.RandomNormalEpsilonPlaceholder] = new RandomNormalInputGenerator(seed); - if (bp.cameraResolutions != null) - { - for (var visIndex = 0; - visIndex < bp.cameraResolutions.Length; - visIndex++) - { - var index = visIndex; - var bw = bp.cameraResolutions[visIndex].blackAndWhite; - _dict[TensorNames.VisualObservationPlaceholderPrefix + visIndex] = new - VisualObservationInputGenerator(index, bw); - } - } - - // Generators for Outputs - _dict[TensorNames.ActionOutput] = new BiDimensionalOutputGenerator(); - _dict[TensorNames.RecurrentOutput] = new BiDimensionalOutputGenerator(); - _dict[TensorNames.ValueEstimateOutput] = new BiDimensionalOutputGenerator(); - } - - /// - /// Populates the data of the tensor inputs given the data contained in the current batch - /// of agents. - /// - /// Enumerable of tensors that will be modified. - /// The number of agents present in the current batch - /// - /// Dictionary of Agent to AgentInfo that contains the - /// data that will be used to modify the tensors - /// One of the tensor does not have an - /// associated generator. 
- public void GenerateTensors(IEnumerable tensors, - int currentBatchSize, - Dictionary agentInfos) - { - foreach (var tensor in tensors) - { - if (!_dict.ContainsKey(tensor.Name)) - { - throw new UnityAgentsException( - "Unknow tensor expected as input : " + tensor.Name); - } - _dict[tensor.Name].Generate(tensor, currentBatchSize, agentInfos); - } - } - } -} +using System.Collections.Generic; +using System.Runtime.InteropServices.ComTypes; +using Barracuda; + +namespace MLAgents.InferenceBrain +{ + /// + /// Mapping between Tensor names and generators. + /// A TensorGenerator implements a Dictionary of strings (node names) to an Action. + /// The Action take as argument the tensor, the current batch size and a Dictionary of + /// Agent to AgentInfo corresponding to the current batch. + /// Each Generator reshapes and fills the data of the tensor based of the data of the batch. + /// When the TensorProxy is an Input to the model, the shape of the Tensor will be modified + /// depending on the current batch size and the data of the Tensor will be filled using the + /// Dictionary of Agent to AgentInfo. + /// When the TensorProxy is an Output of the model, only the shape of the Tensor will be modified + /// using the current batch size. The data will be prefilled with zeros. + /// + public class TensorGenerator + { + public interface Generator + { + /// + /// Modifies the data inside a Tensor according to the information contained in the + /// AgentInfos contained in the current batch. + /// + /// The tensor the data and shape will be modified + /// The number of agents present in the current batch + /// Dictionary of Agent to AgentInfo containing the + /// information that will be used to populate the tensor's data + void Generate(TensorProxy tensorProxy, int batchSize, Dictionary agentInfo); + } + + Dictionary _dict = new Dictionary(); + ITensorAllocator _allocator; + + /// + /// Returns a new TensorGenerators object. + /// + /// The BrainParameters used to determine what Generators will be + /// used + /// The seed the Generators will be initialized with. 
+ /// Tensor allocator + public TensorGenerator(BrainParameters bp, int seed, ITensorAllocator allocator, object barracudaModel = null) + { + _allocator = allocator; + + // Generator for Inputs + _dict[TensorNames.BatchSizePlaceholder] = new BatchSizeGenerator(_allocator); + _dict[TensorNames.SequenceLengthPlaceholder] = new SequenceLengthGenerator(_allocator); + _dict[TensorNames.VectorObservationPlacholder] = new VectorObservationGenerator(_allocator); + _dict[TensorNames.RecurrentInPlaceholder] = new RecurrentInputGenerator(_allocator); + + if (barracudaModel != null) + { + Model model = (Model) barracudaModel; + for (var i = 0; i < model?.memories.Length; i++) + { + _dict[model.memories[i].input] = new BarracudaRecurrentInputGenerator(i, _allocator); + } + } + + _dict[TensorNames.PreviousActionPlaceholder] = new PreviousActionInputGenerator(_allocator); + _dict[TensorNames.ActionMaskPlaceholder] = new ActionMaskInputGenerator(_allocator); + _dict[TensorNames.RandomNormalEpsilonPlaceholder] = new RandomNormalInputGenerator(seed, _allocator); + if (bp.cameraResolutions != null) + { + for (var visIndex = 0; + visIndex < bp.cameraResolutions.Length; + visIndex++) + { + var index = visIndex; + var bw = bp.cameraResolutions[visIndex].blackAndWhite; + _dict[TensorNames.VisualObservationPlaceholderPrefix + visIndex] = new + VisualObservationInputGenerator(index, bw, _allocator); + } + } + + // Generators for Outputs + _dict[TensorNames.ActionOutput] = new BiDimensionalOutputGenerator(_allocator); + _dict[TensorNames.RecurrentOutput] = new BiDimensionalOutputGenerator(_allocator); + _dict[TensorNames.ValueEstimateOutput] = new BiDimensionalOutputGenerator(_allocator); + } + + /// + /// Populates the data of the tensor inputs given the data contained in the current batch + /// of agents. + /// + /// Enumerable of tensors that will be modified. + /// The number of agents present in the current batch + /// + /// Dictionary of Agent to AgentInfo that contains the + /// data that will be used to modify the tensors + /// One of the tensor does not have an + /// associated generator. + public void GenerateTensors(IEnumerable tensors, + int currentBatchSize, + Dictionary agentInfos) + { + foreach (var tensor in tensors) + { + if (!_dict.ContainsKey(tensor.Name)) + { + throw new UnityAgentsException( + "Unknow tensorProxy expected as input : " + tensor.Name); + } + _dict[tensor.Name].Generate(tensor, currentBatchSize, agentInfos); + } + } + } +} diff --git a/UnitySDK/Assets/ML-Agents/Scripts/InferenceBrain/TensorNames.cs b/UnitySDK/Assets/ML-Agents/Scripts/InferenceBrain/TensorNames.cs index b9910adbb9..90e85d5210 100644 --- a/UnitySDK/Assets/ML-Agents/Scripts/InferenceBrain/TensorNames.cs +++ b/UnitySDK/Assets/ML-Agents/Scripts/InferenceBrain/TensorNames.cs @@ -1,7 +1,7 @@ namespace MLAgents.InferenceBrain { /// - /// Contains the names of the input and output Tensor for the Inference Brain. + /// Contains the names of the input and output tensors for the Inference Brain. 
/// public static class TensorNames { diff --git a/UnitySDK/Assets/ML-Agents/Scripts/InferenceBrain/TensorProxy.cs b/UnitySDK/Assets/ML-Agents/Scripts/InferenceBrain/TensorProxy.cs new file mode 100644 index 0000000000..c559e8b9dc --- /dev/null +++ b/UnitySDK/Assets/ML-Agents/Scripts/InferenceBrain/TensorProxy.cs @@ -0,0 +1,143 @@ +using System; +using System.Collections.Generic; +using System.Runtime.InteropServices; +using Barracuda; +using UnityEngine; + +namespace MLAgents.InferenceBrain +{ + + /// + /// Tensor - A class to encapsulate a Tensor used for inference. + /// + /// This class contains the Array that holds the data array, the shapes, type and the placeholder in the + /// execution graph. All the fields are editable in the inspector, allowing the user to specify everything + /// but the data in a graphical way. + /// + [System.Serializable] + public class TensorProxy + { + public enum TensorType + { + Integer, + FloatingPoint + }; + + private static Dictionary m_typeMap = new Dictionary() + { + { TensorType.FloatingPoint, typeof(float)}, + {TensorType.Integer, typeof(int)} + }; + + public string Name; + public TensorType ValueType; + // Since Type is not serializable, we use the DisplayType for the Inspector + public Type DataType + { + get { return m_typeMap[ValueType]; } + } + public long[] Shape; + + public Tensor Data; + } + + public class TensorUtils + { + public static void ResizeTensor(TensorProxy tensor, int batch, ITensorAllocator allocator) + { + if (tensor.Shape[0] == batch && + tensor.Data != null && tensor.Data.batch == batch) + return; + + tensor.Data?.Dispose(); + tensor.Shape[0] = batch; + + if (tensor.Shape.Length == 4) + tensor.Data = allocator.Alloc(new TensorShape(batch, (int)tensor.Shape[1], (int)tensor.Shape[2], (int)tensor.Shape[3])); + else + tensor.Data = allocator.Alloc(new TensorShape(batch, (int)tensor.Shape[tensor.Shape.Length - 1])); + } + + public static Array BarracudaToFloatArray(Tensor tensor) + { + Array res; + + if (tensor.height == 1 && tensor.width == 1) + res = new float[tensor.batch, tensor.channels]; + else + res = new float[tensor.batch, tensor.height, tensor.width, tensor.channels]; + + Buffer.BlockCopy(tensor.readonlyArray, 0, res, 0, tensor.length * Marshal.SizeOf()); + + return res; + } + + public static Array BarracudaToIntArray(Tensor tensor) + { + + if (tensor.height == 1 && tensor.width == 1) + { + var res = new int[tensor.batch, tensor.channels]; + + for (int b = 0; b < tensor.batch; b++) + for (int c = 0; c < tensor.channels; c++) + { + res[b, c] = (int)tensor[b, c]; + } + + return res; + } + else + { + var res = new int[tensor.batch, tensor.height, tensor.width, tensor.channels]; + for (int b = 0; b < tensor.batch; b++) + for (int y = 0; y < tensor.height; y++) + for (int x = 0; x < tensor.width; x++) + for (int c = 0; c < tensor.channels; c++) + { + res[b, y, x, c] = (int)tensor[b, y, x, c]; + } + + return res; + } + } + + public static Tensor ArrayToBarracuda(Array array) + { + Tensor res; + + if (array.Rank == 2) + res = new Tensor(array.GetLength(0), array.GetLength(1)); + else + res = new Tensor(array.GetLength(0), array.GetLength(1), array.GetLength(2), array.GetLength(3)); + + int offset = 0; + var barracudaArray = res.data != null ? 
res.tensorOnDevice.SharedAccess(out offset) : null; + + Buffer.BlockCopy(array, 0, barracudaArray, offset, res.length * Marshal.SizeOf()); + + return res; + } + + internal static long[] TensorShapeFromBarracuda(TensorShape src) + { + if (src.height == 1 && src.width == 1) + return new long[2] {src.batch, src.channels}; + + return new long[4] {src.batch, src.height, src.width, src.channels}; + } + + public static TensorProxy TensorProxyFromBarracuda(Tensor src, string nameOverride = null) + { + var shape = TensorShapeFromBarracuda(src.shape); + return new TensorProxy + { + Name = nameOverride ?? src.name, + ValueType = TensorProxy.TensorType.FloatingPoint, + Shape = shape, + Data = src + }; + } + } + +} diff --git a/UnitySDK/Assets/ML-Agents/Scripts/InferenceBrain/Tensor.cs.meta b/UnitySDK/Assets/ML-Agents/Scripts/InferenceBrain/TensorProxy.cs.meta similarity index 100% rename from UnitySDK/Assets/ML-Agents/Scripts/InferenceBrain/Tensor.cs.meta rename to UnitySDK/Assets/ML-Agents/Scripts/InferenceBrain/TensorProxy.cs.meta diff --git a/UnitySDK/Assets/ML-Agents/Scripts/InferenceBrain/Utils/Multinomial.cs b/UnitySDK/Assets/ML-Agents/Scripts/InferenceBrain/Utils/Multinomial.cs index 7431863381..4dc8bd252a 100644 --- a/UnitySDK/Assets/ML-Agents/Scripts/InferenceBrain/Utils/Multinomial.cs +++ b/UnitySDK/Assets/ML-Agents/Scripts/InferenceBrain/Utils/Multinomial.cs @@ -26,7 +26,7 @@ public Multinomial(int seed) /// Multinomial doesn't support integer tensors /// Issue with tensor shape or type /// At least one of the tensors is not allocated - public void Eval(Tensor src, Tensor dst) + public void Eval(TensorProxy src, TensorProxy dst) { if (src.DataType != typeof(float)) { @@ -43,44 +43,33 @@ public void Eval(Tensor src, Tensor dst) throw new ArgumentNullException(); } - float[,] input_data = src.Data as float[,]; - if (input_data == null) - { - throw new ArgumentException("Input data is not of the correct shape! Required batch x logits"); - } - - float[,] output_data = dst.Data as float[,]; - if (output_data == null) - { - throw new ArgumentException("Output data is not of the correct shape! 
Required batch x samples"); - } - if (input_data.GetLength(0) != output_data.GetLength(0)) + if (src.Data.batch != dst.Data.batch) { throw new ArgumentException("Batch size for input and output data is different!"); } - float[] cdf = new float[input_data.GetLength(1)]; + float[] cdf = new float[src.Data.channels]; - for (int batch = 0; batch < input_data.GetLength(0); ++batch) + for (int batch = 0; batch < src.Data.batch; ++batch) { // Find the class maximum float maxProb = float.NegativeInfinity; - for (int cls = 0; cls < input_data.GetLength(1); ++cls) + for (int cls = 0; cls < src.Data.channels; ++cls) { - maxProb = Mathf.Max(input_data[batch, cls], maxProb); + maxProb = Mathf.Max(src.Data[batch, cls], maxProb); } // Sum the log probabilities and compute CDF float sumProb = 0.0f; - for (int cls = 0; cls < input_data.GetLength(1); ++cls) + for (int cls = 0; cls < src.Data.channels; ++cls) { - sumProb += Mathf.Exp(input_data[batch, cls] - maxProb); + sumProb += Mathf.Exp(src.Data[batch, cls] - maxProb); cdf[cls] = sumProb; } // Generate the samples - for (int sample = 0; sample < output_data.GetLength(1); ++sample) + for (int sample = 0; sample < dst.Data.channels; ++sample) { float p = (float)m_random.NextDouble() * sumProb; int cls = 0; @@ -89,7 +78,7 @@ public void Eval(Tensor src, Tensor dst) ++cls; } - output_data[batch, sample] = cls; + dst.Data[batch, sample] = cls; } } diff --git a/UnitySDK/Assets/ML-Agents/Scripts/InferenceBrain/Utils/RandomNormal.cs b/UnitySDK/Assets/ML-Agents/Scripts/InferenceBrain/Utils/RandomNormal.cs index 2684819c06..e68a558aec 100644 --- a/UnitySDK/Assets/ML-Agents/Scripts/InferenceBrain/Utils/RandomNormal.cs +++ b/UnitySDK/Assets/ML-Agents/Scripts/InferenceBrain/Utils/RandomNormal.cs @@ -1,4 +1,5 @@ -using System; +using System; +using UnityEngine; namespace MLAgents.InferenceBrain.Utils { @@ -12,7 +13,7 @@ public class RandomNormal private readonly double m_mean; private readonly double m_stddev; private readonly System.Random m_random; - + public RandomNormal(int seed, float mean = 0.0f, float stddev = 1.0f) { m_mean = mean; @@ -23,7 +24,7 @@ public RandomNormal(int seed, float mean = 0.0f, float stddev = 1.0f) // Each iteration produces two numbers. 
Hold one here for next call private bool m_hasSpare = false; private double m_spare = 0.0f; - + /// /// Return the next random double number /// @@ -44,36 +45,20 @@ public double NextDouble() s = u * u + v * v; } while (s >= 1.0 || s == 0.0); - s = Math.Sqrt(-2.0 * Math.Log(s) / 2); + s = Math.Sqrt(-2.0 * Math.Log(s) / s); m_spare = u * s; m_hasSpare = true; return v * s * m_stddev + m_mean; } - private void IncreaseNextDim(Array arr, long[] indices) - { - for (int i = 1; i < arr.Rank; ++i) - { - ++indices[i]; - if (i == arr.Rank - 1 || indices[i] < arr.GetLength(i)) - { - break; - } - else - { - indices[i] = 0; - } - } - } - /// - /// Fill a pre-allocated Tensor with random numbers + /// Fill a pre-allocated Tensor with random numbers /// /// The pre-allocated Tensor to fill /// Throws when trying to fill a Tensor of type other than float /// Throws when the Tensor is not allocated - public void FillTensor(Tensor t) + public void FillTensor(TensorProxy t) { if (t.DataType != typeof(float)) { @@ -85,21 +70,8 @@ public void FillTensor(Tensor t) throw new ArgumentNullException(); } - long[] indices = new long[t.Data.Rank]; - - // Since IEnumerable is const, and we don't know the dimentions of the Array - // we need to traverse all the dimentions - // TODO: this seems like a nice general operation for the Tensor, consider moving it there - do - { - t.Data.SetValue((float) NextDouble(), indices); - ++indices[0]; - if (indices[0] == t.Data.GetLength(0)) - { - indices[0] = 0; - IncreaseNextDim(t.Data, indices); - } - } while (indices[t.Data.Rank - 1] < t.Data.GetLength(t.Data.Rank - 1)); + for (int i = 0; i < t.Data.length; i++) + t.Data[i] = (float)NextDouble(); } } } diff --git a/UnitySDK/Assets/ML-Agents/Scripts/LearningBrain.cs b/UnitySDK/Assets/ML-Agents/Scripts/LearningBrain.cs index b557354606..0900bda935 100644 --- a/UnitySDK/Assets/ML-Agents/Scripts/LearningBrain.cs +++ b/UnitySDK/Assets/ML-Agents/Scripts/LearningBrain.cs @@ -1,13 +1,10 @@ -#define ENABLE_BARRACUDA - -using System; +using System; using System.Collections.Generic; using UnityEngine; using System.Linq; using Barracuda; using MLAgents.InferenceBrain; using UnityEngine.Profiling; -using Tensor = MLAgents.InferenceBrain.Tensor; namespace MLAgents { @@ -32,13 +29,14 @@ public enum InferenceDevice [CreateAssetMenu(fileName = "NewLearningBrain", menuName = "ML-Agents/Learning Brain")] public class LearningBrain : Brain { + private ITensorAllocator _tensorAllocator; private TensorGenerator _tensorGenerator; private TensorApplier _tensorApplier; #if ENABLE_TENSORFLOW public TextAsset model; private ModelParamLoader _modelParamLoader; private TFSharpInferenceEngine _engine; -#elif ENABLE_BARRACUDA +#else public NNModel model; private Model _barracudaModel; private IWorker _engine; @@ -47,12 +45,13 @@ public class LearningBrain : Brain private BarracudaModelParamLoader _modelParamLoader; private string[] _outputNames; #endif + [Tooltip("Inference execution device. CPU is the fastest option for most of ML Agents models. 
" + "(This field is not applicable for training).")] public InferenceDevice inferenceDevice = InferenceDevice.CPU; - private IReadOnlyList _inferenceInputs; - private IReadOnlyList _inferenceOutputs; + private IReadOnlyList _inferenceInputs; + private IReadOnlyList _inferenceOutputs; [NonSerialized] private bool _isControlled; @@ -82,6 +81,9 @@ protected override void Initialize() /// public void ReloadModel(int seed = 0) { + if (_tensorAllocator == null) + _tensorAllocator = new TensorCachingAllocator(); + #if ENABLE_TENSORFLOW if (model != null) { @@ -95,9 +97,9 @@ public void ReloadModel(int seed = 0) _modelParamLoader = ModelParamLoader.GetLoaderAndCheck(_engine, brainParameters); _inferenceInputs = _modelParamLoader.GetInputTensors(); _inferenceOutputs = _modelParamLoader.GetOutputTensors(); - _tensorGenerator = new TensorGenerator(brainParameters, seed); - _tensorApplier = new TensorApplier(brainParameters, seed); -#elif ENABLE_BARRACUDA + _tensorGenerator = new TensorGenerator(brainParameters, seed, _tensorAllocator); + _tensorApplier = new TensorApplier(brainParameters, seed, _tensorAllocator); +#else if (model != null) { #if BARRACUDA_VERBOSE @@ -126,8 +128,8 @@ public void ReloadModel(int seed = 0) _modelParamLoader = BarracudaModelParamLoader.GetLoaderAndCheck(_engine, _barracudaModel, brainParameters); _inferenceInputs = _modelParamLoader.GetInputTensors(); _outputNames = _modelParamLoader.GetOutputNames(); - _tensorGenerator = new TensorGenerator(brainParameters, seed, _barracudaModel); - _tensorApplier = new TensorApplier(brainParameters, seed, _barracudaModel); + _tensorGenerator = new TensorGenerator(brainParameters, seed, _tensorAllocator, _barracudaModel); + _tensorApplier = new TensorApplier(brainParameters, seed, _tensorAllocator, _barracudaModel); #endif } @@ -144,12 +146,8 @@ public IEnumerable GetModelFailedChecks() #if ENABLE_TENSORFLOW return (_modelParamLoader != null) ? _modelParamLoader.GetChecks() : new List(); -#elif ENABLE_BARRACUDA - return (_modelParamLoader != null) ? _modelParamLoader.GetChecks() : new List(); #else - return new List(){ - "You need to install the TensorflowSharp plugin and add the ENABLE_TENSORFLOW " + - "flag in your Player Settings in order to use inference. "}; + return (_modelParamLoader != null) ? 
_modelParamLoader.GetChecks() : new List(); #endif } @@ -166,6 +164,9 @@ protected override void DecideAction() { return; } + + Profiler.BeginSample("LearningBrain.DecideAction"); + #if ENABLE_TENSORFLOW if (_engine == null) { @@ -185,77 +186,72 @@ protected override void DecideAction() // Update the outputs _tensorApplier.ApplyTensors(_inferenceOutputs, agentInfos); -#elif ENABLE_BARRACUDA +#else if (_engine == null) { Debug.LogError($"No model was present for the Brain {name}."); return; } + Profiler.BeginSample($"MLAgents.{name}.GenerateTensors"); // Prepare the input tensors to be feed into the engine _tensorGenerator.GenerateTensors(_inferenceInputs, currentBatchSize, agentInfos); + Profiler.EndSample(); + Profiler.BeginSample($"MLAgents.{name}.PrepareBarracudaInputs"); var inputs = PrepareBarracudaInputs(_inferenceInputs); + Profiler.EndSample(); // Execute the Model Profiler.BeginSample($"MLAgents.{name}.ExecuteGraph"); _engine.Execute(inputs); Profiler.EndSample(); + Profiler.BeginSample($"MLAgents.{name}.FetchBarracudaOutputs"); _inferenceOutputs = FetchBarracudaOutputs(_outputNames); - CleanupBarracudaState(inputs); + Profiler.EndSample(); + Profiler.BeginSample($"MLAgents.{name}.ApplyTensors"); // Update the outputs _tensorApplier.ApplyTensors(_inferenceOutputs, agentInfos); -#else - if (agentInfos.Count > 0) - { - Debug.LogError(string.Format( - "The brain {0} was set to inference mode but the Tensorflow library is not " + - "present in the Unity project.", - name)); - } + Profiler.EndSample(); #endif agentInfos.Clear(); + Profiler.EndSample(); } -#if ENABLE_BARRACUDA && !ENABLE_TENSORFLOW - protected Dictionary PrepareBarracudaInputs(IEnumerable infInputs) +#if !ENABLE_TENSORFLOW + protected Dictionary PrepareBarracudaInputs(IEnumerable infInputs) { - var inputs = new Dictionary(); + var inputs = new Dictionary(); foreach (var inp in _inferenceInputs) { - inputs[inp.Name] = BarracudaUtils.ToBarracuda(inp); + inputs[inp.Name] = inp.Data; } return inputs; } - protected List FetchBarracudaOutputs(string[] names) + protected List FetchBarracudaOutputs(string[] names) { - var outputs = new List(); + var outputs = new List(); foreach (var name in names) { var outp = _engine.Peek(name); - outputs.Add(BarracudaUtils.FromBarracuda(outp, name)); + outputs.Add(TensorUtils.TensorProxyFromBarracuda(outp, name)); } return outputs; } - - protected void CleanupBarracudaState(Dictionary inputs) - { - foreach (var key in inputs.Keys) - { - inputs[key].Dispose(); - } - inputs.Clear(); - } - +#endif + public void OnDisable() { +#if !ENABLE_TENSORFLOW _engine?.Dispose(); - } #endif + _tensorAllocator?.Reset(false); + } + } } diff --git a/UnitySDK/Assets/ML-Agents/Scripts/Utilities.cs b/UnitySDK/Assets/ML-Agents/Scripts/Utilities.cs index 3c1530c564..8ac668a7e7 100644 --- a/UnitySDK/Assets/ML-Agents/Scripts/Utilities.cs +++ b/UnitySDK/Assets/ML-Agents/Scripts/Utilities.cs @@ -1,6 +1,8 @@ using UnityEngine; using System.Collections; using System.Collections.Generic; +using Barracuda; +using MLAgents.InferenceBrain; namespace MLAgents { @@ -9,17 +11,9 @@ public class Utilities /// /// Converts a list of Texture2D into a Tensor. /// - /// - /// A 4 dimensional float Tensor of dimension - /// [batch_size, height, width, channel]. 
- /// Where batch_size is the number of input textures, - /// height corresponds to the height of the texture, - /// width corresponds to the width of the texture, - /// channel corresponds to the number of channels extracted from the - /// input textures (based on the input blackAndWhite flag - /// (3 if the flag is false, 1 otherwise). - /// The values of the Tensor are between 0 and 1. - /// + /// + /// Tensor proxy to fill with Texture data. + /// /// /// The list of textures to be put into the tensor. /// Note that the textures must have same width and height. @@ -28,13 +22,14 @@ public class Utilities /// If set to true the textures /// will be converted to grayscale before being stored in the tensor. /// - public static float[,,,] TextureToFloatArray(List textures, bool blackAndWhite) + /// Tensor allocator + public static void TextureToTensorProxy(TensorProxy tensorProxy, List textures, bool blackAndWhite, + ITensorAllocator allocator) { var batchSize = textures.Count; var width = textures[0].width; var height = textures[0].height; - var pixels = blackAndWhite ? 1 : 3; - var result = new float[batchSize, height, width, pixels]; + var data = tensorProxy.Data; for (var b = 0; b < batchSize; b++) { @@ -48,19 +43,18 @@ public class Utilities { // For Color32, the r, g and b values are between // 0 and 255. - result[b, h, w, 0] = currentPixel.r / 255.0f; - result[b, h, w, 1] = currentPixel.g / 255.0f; - result[b, h, w,2] = currentPixel.b / 255.0f; + data[b, h, w, 0] = currentPixel.r / 255.0f; + data[b, h, w, 1] = currentPixel.g / 255.0f; + data[b, h, w,2] = currentPixel.b / 255.0f; } else { - result[b, h, w, 0] = (currentPixel.r + currentPixel.g + currentPixel.b) + data[b, h, w, 0] = (currentPixel.r + currentPixel.g + currentPixel.b) / 3f / 255.0f; } } } } - return result; } @@ -81,5 +75,65 @@ public static int[] CumSum(int [] array) } return result; } + + /// + /// Shifts list elements to the left by the specified amount. + /// + /// Target list + /// + /// + /// Shift amount + /// + /// + public static void ShiftLeft(List list, int amount) + { + for (var i = amount; i < list.Count; i++) + { + list[i - amount] = list[i]; + } + } + + /// + /// Replaces target list elements with source list elements starting at specified position in target list. + /// + /// Target list + /// + /// + /// Source list + /// + /// + /// Offset in target list + /// + /// + public static void ReplaceRange(List dst, List src, int start) + { + for (var i = 0; i < src.Count; i++) + { + dst[i + start] = src[i]; + } + } + + + /// + /// Adds elements to list without extra temp allocations (assuming it fits pre-allocated capacity of the list). + /// Regular List.AddRange() unfortunately allocates temp list to add items. + /// https://stackoverflow.com/questions/2123161/listt-addrange-implementation-suboptimal + /// Note: this implementation might be slow with large numbers of elements in the source array. 
+ /// + /// Target list + /// + /// + /// Source array + /// + /// + public static void AddRangeNoAlloc(List dst, T[] src) + { + var offset = dst.Count; + + for (var i = 0; i < src.Length; i++) + { + dst.Add(src[i]); + } + } } } diff --git a/config/3dball_generalize.yaml b/config/3dball_generalize.yaml new file mode 100644 index 0000000000..b57e2686a0 --- /dev/null +++ b/config/3dball_generalize.yaml @@ -0,0 +1,16 @@ +resampling-interval: 5000 + +mass: + sampler-type: "uniform" + min_value: 0.5 + max_value: 10 + +gravity: + sampler-type: "uniform" + min_value: 7 + max_value: 12 + +scale: + sampler-type: "uniform" + min_value: 0.75 + max_value: 3 diff --git a/config/gail_config.yaml b/config/gail_config.yaml new file mode 100644 index 0000000000..82fde73c49 --- /dev/null +++ b/config/gail_config.yaml @@ -0,0 +1,106 @@ +default: + trainer: ppo + batch_size: 1024 + beta: 5.0e-3 + buffer_size: 10240 + epsilon: 0.2 + hidden_units: 128 + lambd: 0.95 + learning_rate: 3.0e-4 + max_steps: 5.0e4 + memory_size: 256 + normalize: false + num_epoch: 3 + num_layers: 2 + time_horizon: 64 + sequence_length: 64 + summary_freq: 1000 + use_recurrent: false + reward_signals: + extrinsic: + strength: 1.0 + gamma: 0.99 + +PyramidsLearning: + summary_freq: 2000 + time_horizon: 128 + batch_size: 128 + buffer_size: 2048 + hidden_units: 512 + num_layers: 2 + beta: 1.0e-2 + max_steps: 5.0e5 + num_epoch: 3 + pretraining: + demo_path: ./demos/ExpertPyramid.demo + strength: 0.5 + steps: 10000 + reward_signals: + extrinsic: + strength: 1.0 + gamma: 0.99 + curiosity: + strength: 0.02 + gamma: 0.99 + encoding_size: 256 + gail: + strength: 0.01 + gamma: 0.99 + encoding_size: 128 + demo_path: demos/ExpertPyramid.demo + +CrawlerStaticLearning: + normalize: true + num_epoch: 3 + time_horizon: 1000 + batch_size: 2024 + buffer_size: 20240 + max_steps: 1e6 + summary_freq: 3000 + num_layers: 3 + hidden_units: 512 + reward_signals: + gail: + strength: 1.0 + gamma: 0.99 + encoding_size: 128 + demo_path: demos/ExpertCrawlerSta.demo + +PushBlockLearning: + max_steps: 5.0e4 + batch_size: 128 + buffer_size: 2048 + beta: 1.0e-2 + hidden_units: 256 + summary_freq: 2000 + time_horizon: 64 + num_layers: 2 + reward_signals: + gail: + strength: 1.0 + gamma: 0.99 + encoding_size: 128 + demo_path: demos/ExpertPush.demo + +HallwayLearning: + use_recurrent: true + sequence_length: 64 + num_layers: 2 + hidden_units: 128 + memory_size: 256 + beta: 1.0e-2 + num_epoch: 3 + buffer_size: 1024 + batch_size: 128 + max_steps: 5.0e5 + summary_freq: 1000 + time_horizon: 64 + reward_signals: + extrinsic: + strength: 1.0 + gamma: 0.99 + gail: + strength: 0.1 + gamma: 0.99 + encoding_size: 128 + demo_path: demos/ExpertHallway.demo diff --git a/config/trainer_config.yaml b/config/trainer_config.yaml index 66a2a945c5..9a60aefcf4 100644 --- a/config/trainer_config.yaml +++ b/config/trainer_config.yaml @@ -4,7 +4,6 @@ default: beta: 5.0e-3 buffer_size: 10240 epsilon: 0.2 - gamma: 0.99 hidden_units: 128 lambd: 0.95 learning_rate: 3.0e-4 @@ -17,14 +16,16 @@ default: sequence_length: 64 summary_freq: 1000 use_recurrent: false - use_curiosity: false - curiosity_strength: 0.01 - curiosity_enc_size: 128 + vis_encode_type: simple + reward_signals: + extrinsic: + strength: 1.0 + gamma: 0.99 BananaLearning: normalize: false - batch_size: 1024 beta: 5.0e-3 + batch_size: 1024 buffer_size: 10240 max_steps: 1.0e5 @@ -44,7 +45,7 @@ PushBlockLearning: time_horizon: 64 num_layers: 2 -SmallWallJumpLearning: +SmallWallJumpLearning: max_steps: 1.0e6 batch_size: 128 buffer_size: 2048 
@@ -55,7 +56,7 @@ SmallWallJumpLearning: num_layers: 2 normalize: false -BigWallJumpLearning: +BigWallJumpLearning: max_steps: 1.0e6 batch_size: 128 buffer_size: 2048 @@ -93,10 +94,7 @@ GoalieLearning: normalize: false PyramidsLearning: - use_curiosity: true summary_freq: 2000 - curiosity_strength: 0.01 - curiosity_enc_size: 256 time_horizon: 128 batch_size: 128 buffer_size: 2048 @@ -105,11 +103,16 @@ PyramidsLearning: beta: 1.0e-2 max_steps: 5.0e5 num_epoch: 3 + reward_signals: + extrinsic: + strength: 1.0 + gamma: 0.99 + curiosity: + strength: 0.02 + gamma: 0.99 + encoding_size: 256 VisualPyramidsLearning: - use_curiosity: true - curiosity_strength: 0.01 - curiosity_enc_size: 256 time_horizon: 128 batch_size: 64 buffer_size: 2024 @@ -118,6 +121,14 @@ VisualPyramidsLearning: beta: 1.0e-2 max_steps: 5.0e5 num_epoch: 3 + reward_signals: + extrinsic: + strength: 1.0 + gamma: 0.99 + curiosity: + strength: 0.01 + gamma: 0.99 + encoding_size: 256 3DBallLearning: normalize: true @@ -126,7 +137,6 @@ VisualPyramidsLearning: summary_freq: 1000 time_horizon: 1000 lambd: 0.99 - gamma: 0.995 beta: 0.001 3DBallHardLearning: @@ -136,8 +146,11 @@ VisualPyramidsLearning: summary_freq: 1000 time_horizon: 1000 max_steps: 5.0e5 - gamma: 0.995 beta: 0.001 + reward_signals: + extrinsic: + strength: 1.0 + gamma: 0.995 TennisLearning: normalize: true @@ -149,11 +162,14 @@ CrawlerStaticLearning: time_horizon: 1000 batch_size: 2024 buffer_size: 20240 - gamma: 0.995 max_steps: 1e6 summary_freq: 3000 num_layers: 3 hidden_units: 512 + reward_signals: + extrinsic: + strength: 1.0 + gamma: 0.995 CrawlerDynamicLearning: normalize: true @@ -161,11 +177,14 @@ CrawlerDynamicLearning: time_horizon: 1000 batch_size: 2024 buffer_size: 20240 - gamma: 0.995 max_steps: 1e6 summary_freq: 3000 num_layers: 3 hidden_units: 512 + reward_signals: + extrinsic: + strength: 1.0 + gamma: 0.995 WalkerLearning: normalize: true @@ -173,11 +192,14 @@ WalkerLearning: time_horizon: 1000 batch_size: 2048 buffer_size: 20480 - gamma: 0.995 max_steps: 2e6 summary_freq: 3000 num_layers: 3 hidden_units: 512 + reward_signals: + extrinsic: + strength: 1.0 + gamma: 0.995 ReacherLearning: normalize: true @@ -185,9 +207,12 @@ ReacherLearning: time_horizon: 1000 batch_size: 2024 buffer_size: 20240 - gamma: 0.995 max_steps: 1e6 summary_freq: 3000 + reward_signals: + extrinsic: + strength: 1.0 + gamma: 0.995 HallwayLearning: use_recurrent: true @@ -196,7 +221,6 @@ HallwayLearning: hidden_units: 128 memory_size: 256 beta: 1.0e-2 - gamma: 0.99 num_epoch: 3 buffer_size: 1024 batch_size: 128 @@ -211,7 +235,6 @@ VisualHallwayLearning: hidden_units: 128 memory_size: 256 beta: 1.0e-2 - gamma: 0.99 num_epoch: 3 buffer_size: 1024 batch_size: 64 @@ -226,7 +249,6 @@ VisualPushBlockLearning: hidden_units: 128 memory_size: 256 beta: 1.0e-2 - gamma: 0.99 num_epoch: 3 buffer_size: 1024 batch_size: 64 @@ -240,11 +262,14 @@ GridWorldLearning: num_layers: 1 hidden_units: 256 beta: 5.0e-3 - gamma: 0.9 buffer_size: 256 max_steps: 5.0e5 summary_freq: 2000 time_horizon: 5 + reward_signals: + extrinsic: + strength: 1.0 + gamma: 0.9 BasicLearning: batch_size: 32 @@ -252,8 +277,11 @@ BasicLearning: num_layers: 1 hidden_units: 20 beta: 5.0e-3 - gamma: 0.9 buffer_size: 256 max_steps: 5.0e5 summary_freq: 2000 time_horizon: 3 + reward_signals: + extrinsic: + strength: 1.0 + gamma: 0.9 diff --git a/demos/Expert3DBall.demo b/demos/Expert3DBall.demo new file mode 100644 index 0000000000..873e1770a8 Binary files /dev/null and b/demos/Expert3DBall.demo differ diff --git 
a/demos/Expert3DBallHard.demo b/demos/Expert3DBallHard.demo new file mode 100644 index 0000000000..3130d251ca Binary files /dev/null and b/demos/Expert3DBallHard.demo differ diff --git a/demos/ExpertBanana.demo b/demos/ExpertBanana.demo new file mode 100644 index 0000000000..33c86abd14 Binary files /dev/null and b/demos/ExpertBanana.demo differ diff --git a/demos/ExpertBasic.demo b/demos/ExpertBasic.demo new file mode 100644 index 0000000000..6c1c962f98 Binary files /dev/null and b/demos/ExpertBasic.demo differ diff --git a/demos/ExpertBouncer.demo b/demos/ExpertBouncer.demo new file mode 100644 index 0000000000..2ab16a9666 Binary files /dev/null and b/demos/ExpertBouncer.demo differ diff --git a/demos/ExpertCrawlerDyn.demo b/demos/ExpertCrawlerDyn.demo new file mode 100644 index 0000000000..04736d4312 Binary files /dev/null and b/demos/ExpertCrawlerDyn.demo differ diff --git a/demos/ExpertCrawlerSta.demo b/demos/ExpertCrawlerSta.demo new file mode 100644 index 0000000000..9001d074ef Binary files /dev/null and b/demos/ExpertCrawlerSta.demo differ diff --git a/demos/ExpertGrid.demo b/demos/ExpertGrid.demo new file mode 100644 index 0000000000..65f37610fd Binary files /dev/null and b/demos/ExpertGrid.demo differ diff --git a/demos/ExpertHallway.demo b/demos/ExpertHallway.demo new file mode 100644 index 0000000000..ee6de388cb Binary files /dev/null and b/demos/ExpertHallway.demo differ diff --git a/demos/ExpertPush.demo b/demos/ExpertPush.demo new file mode 100644 index 0000000000..4184685ff4 Binary files /dev/null and b/demos/ExpertPush.demo differ diff --git a/demos/ExpertPyramid.demo b/demos/ExpertPyramid.demo new file mode 100644 index 0000000000..c34c60c2ee Binary files /dev/null and b/demos/ExpertPyramid.demo differ diff --git a/demos/ExpertReacher.demo b/demos/ExpertReacher.demo new file mode 100644 index 0000000000..c32c6c7f02 Binary files /dev/null and b/demos/ExpertReacher.demo differ diff --git a/demos/ExpertSoccerGoal.demo b/demos/ExpertSoccerGoal.demo new file mode 100644 index 0000000000..eaad1ec561 Binary files /dev/null and b/demos/ExpertSoccerGoal.demo differ diff --git a/demos/ExpertSoccerStri.demo b/demos/ExpertSoccerStri.demo new file mode 100644 index 0000000000..ca7a3afeb3 Binary files /dev/null and b/demos/ExpertSoccerStri.demo differ diff --git a/demos/ExpertTennis.demo b/demos/ExpertTennis.demo new file mode 100644 index 0000000000..66658cebf8 Binary files /dev/null and b/demos/ExpertTennis.demo differ diff --git a/demos/ExpertWalker.demo b/demos/ExpertWalker.demo new file mode 100644 index 0000000000..875608f046 Binary files /dev/null and b/demos/ExpertWalker.demo differ diff --git a/docs/Background-Machine-Learning.md b/docs/Background-Machine-Learning.md index 128117dfa3..a6f61c4123 100644 --- a/docs/Background-Machine-Learning.md +++ b/docs/Background-Machine-Learning.md @@ -118,7 +118,7 @@ good policies can be difficult (and/or time-consuming) for complex environments. [Learning a policy](https://blogs.unity3d.com/2017/08/22/unity-ai-reinforcement-learning-with-q-learning/) usually requires many trials and iterative policy updates. More specifically, the robot is placed in several fire situations and over time learns an optimal -policy which allows it to put our fires more effectively. Obviously, we cannot +policy which allows it to put out fires more effectively. Obviously, we cannot expect to train a robot repeatedly in the real world, particularly when fires are involved. 
This is precisely why the use of [Unity as a simulator](https://blogs.unity3d.com/2018/01/23/designing-safer-cities-through-simulations/) diff --git a/docs/Creating-Custom-Protobuf-Messages.md b/docs/Creating-Custom-Protobuf-Messages.md index 08569ab361..512a9247ac 100644 --- a/docs/Creating-Custom-Protobuf-Messages.md +++ b/docs/Creating-Custom-Protobuf-Messages.md @@ -4,7 +4,7 @@ Unity and Python communicate by sending protobuf messages to and from each other ## Implementing a Custom Message -Assume the ml-agents repository is checked out to a folder named $MLAGENTS_ROOT. Whenever you change the fields of a custom message, you must run `$MLAGENTS_ROOT/protobuf-definitions/make.bat` to create C# and Python files corresponding to the new message. Follow the directions in [this file](../protobuf-definitions/README.md) for guidance. After running `$MLAGENTS_ROOT/protobuf-definitions/make.bat`, reinstall the Python package by running `pip install $MLAGENTS_ROOT/ml-agents` and make sure your Unity project is using the newly-generated version of `$MLAGENTS_ROOT/UnitySDK`. +Whenever you change the fields of a custom message, you must follow the directions in [this file](../protobuf-definitions/README.md) to create C# and Python files corresponding to the new message and re-install the mlagents Python package. ## Custom Message Types diff --git a/docs/Installation.md b/docs/Installation.md index fae9ae760b..bb87d88b17 100644 --- a/docs/Installation.md +++ b/docs/Installation.md @@ -45,21 +45,14 @@ The `gym-unity` subdirectory contains a package to interface with OpenAI Gym. ### Install Python and mlagents Package -In order to use ML-Agents toolkit, you need Python 3.6 along with the -dependencies listed in the [setup.py file](../ml-agents/setup.py). -Some of the primary dependencies include: - -- [TensorFlow](Background-TensorFlow.md) (Requires a CPU w/ AVX support) -- [Jupyter](Background-Jupyter.md) - -[Download](https://www.python.org/downloads/) and install Python 3.6 if you do not -already have it. +In order to use ML-Agents toolkit, you need Python 3.6. +[Download](https://www.python.org/downloads/) and install Python 3.6 if you do not already have it. If your Python environment doesn't include `pip3`, see these [instructions](https://packaging.python.org/guides/installing-using-linux-tools/#installing-pip-setuptools-wheel-with-linux-package-managers) on installing it. -To install the dependencies and `mlagents` Python package, run from the command line: +To install the `mlagents` Python package, run from the command line: ```sh pip3 install mlagents @@ -70,6 +63,12 @@ If you installed this correctly, you should be able to run `mlagents-learn --help`, after which you will see the Unity logo and the command line parameters you can use with `mlagents-learn`. +By installing the `mlagents` package, the dependencies listed in the [setup.py file](../ml-agents/setup.py) are also installed. +Some of the primary dependencies include: + +- [TensorFlow](Background-TensorFlow.md) (Requires a CPU w/ AVX support) +- [Jupyter](Background-Jupyter.md) + **Notes:** - We do not currently support Python 3.7 or Python 3.5. 
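As a quick sanity check that the `mlagents` package and its Python dependencies installed correctly, the API can also be imported directly. A minimal sketch, assuming the package layout of this release in which the environment API is importable as `mlagents.envs`:

```python
# Minimal import check after `pip3 install mlagents`.
# Assumes the release's package layout (environment API under mlagents.envs).
from mlagents.envs import UnityEnvironment

print("mlagents Python API is importable:", UnityEnvironment.__name__)
```

If the import fails, the package (or one of its dependencies, such as TensorFlow) did not install correctly into the active Python 3.6 environment.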
diff --git a/docs/Learning-Environment-Examples.md b/docs/Learning-Environment-Examples.md index 189ab03232..4930ecf291 100644 --- a/docs/Learning-Environment-Examples.md +++ b/docs/Learning-Environment-Examples.md @@ -1,4 +1,4 @@ -# Example Learning Environments +# Example Learning Environments The Unity ML-Agents toolkit contains an expanding set of example environments which demonstrate various features of the platform. Environments are located in @@ -32,7 +32,7 @@ If you would like to contribute environments, please see our * Vector Observation space: One variable corresponding to current state. * Vector Action space: (Discrete) Two possible actions (Move left, move right). - * Visual Observations: None. + * Visual Observations: None * Reset Parameters: None * Benchmark Mean Reward: 0.94 @@ -56,7 +56,19 @@ If you would like to contribute environments, please see our * Vector Action space: (Continuous) Size of 2, with one value corresponding to X-rotation, and the other to Z-rotation. * Visual Observations: None. -* Reset Parameters: None +* Reset Parameters: Three + * scale: Specifies the scale of the ball in the 3 dimensions (equal across the three dimensions) + * Default: 1 + * Recommended Minimum: 0.2 + * Recommended Maximum: 5 + * gravity: Magnitude of gravity + * Default: 9.81 + * Recommended Minimum: 4 + * Recommended Maximum: 105 + * mass: Specifies mass of the ball + * Default: 1 + * Recommended Minimum: 0.1 + * Recommended Maximum: 20 * Benchmark Mean Reward: 100 ## [GridWorld](https://youtu.be/gu8HE9WKEVI) @@ -104,8 +116,20 @@ If you would like to contribute environments, please see our of ball and racket. * Vector Action space: (Continuous) Size of 2, corresponding to movement toward net or away from net, and jumping. - * Visual Observations: None. -* Reset Parameters: One, corresponding to size of ball. + * Visual Observations: None +* Reset Parameters: Three + * angle: Angle of the racket from the vertical (Y) axis. + * Default: 55 + * Recommended Minimum: 35 + * Recommended Maximum: 65 + * gravity: Magnitude of gravity + * Default: 9.81 + * Recommended Minimum: 6 + * Recommended Maximum: 20 + * scale: Specifies the scale of the ball in the 3 dimensions (equal across the three dimensions) + * Default: 1 + * Recommended Minimum: 0.2 + * Recommended Maximum: 5 * Benchmark Mean Reward: 2.5 * Optional Imitation Learning scene: `TennisIL`. @@ -129,7 +153,23 @@ If you would like to contribute environments, please see our `VisualPushBlock` scene. __The visual observation version of this environment does not train with the provided default training parameters.__ -* Reset Parameters: None. +* Reset Parameters: Four + * block_scale: Scale of the block along the x and z dimensions + * Default: 2 + * Recommended Minimum: 0.5 + * Recommended Maximum: 4 + * dynamic_friction: Coefficient of friction for the ground material acting on moving objects + * Default: 0 + * Recommended Minimum: 0 + * Recommended Maximum: 1 + * static_friction: Coefficient of friction for the ground material acting on stationary objects + * Default: 0 + * Recommended Minimum: 0 + * Recommended Maximum: 1 + * block_drag: Effect of air resistance on block + * Default: 0.5 + * Recommended Minimum: 0 + * Recommended Maximum: 2000 * Benchmark Mean Reward: 4.5 * Optional Imitation Learning scene: `PushBlockIL`. 
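The Reset Parameters listed above (for example `mass`, `gravity`, and `scale` on 3D Balance Ball) can also be overridden when resetting an environment from Python. A minimal sketch, assuming this release's `UnityEnvironment.reset(train_mode, config)` signature, where `config` is a flat dictionary of reset parameter names to values, and a hypothetical `3DBall` build path:

```python
from mlagents.envs import UnityEnvironment

# Hypothetical path to a built 3DBall environment; adjust for your platform.
env = UnityEnvironment(file_name="3DBall")

# Reset Parameters are passed as a flat name -> value dictionary at reset time;
# values outside the recommended ranges above may make the task unsolvable.
env.reset(train_mode=True, config={"mass": 1.0, "gravity": 9.81, "scale": 1.0})
env.close()
```

These are the same parameter names that the generalization sampler configuration (for example, the `config/3dball_generalize.yaml` file added in this change) resamples periodically during training.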
@@ -154,8 +194,8 @@ If you would like to contribute environments, please see our * Rotation (3 possible actions: Rotate Left, Rotate Right, No Action) * Side Motion (3 possible actions: Left, Right, No Action) * Jump (2 possible actions: Jump, No Action) - * Visual Observations: None. -* Reset Parameters: 4, corresponding to the height of the possible walls. + * Visual Observations: None +* Reset Parameters: Four * Benchmark Mean Reward (Big & Small Wall Brain): 0.8 ## [Reacher](https://youtu.be/2N9EoF6pQyE) @@ -173,7 +213,27 @@ If you would like to contribute environments, please see our * Vector Action space: (Continuous) Size of 4, corresponding to torque applicable to two joints. * Visual Observations: None. -* Reset Parameters: Two, corresponding to goal size, and goal movement speed. +* Reset Parameters: Five + * goal_size: radius of the goal zone + * Default: 5 + * Recommended Minimum: 1 + * Recommended Maximum: 10 + * goal_speed: speed of the goal zone around the arm (in radians) + * Default: 1 + * Recommended Minimum: 0.2 + * Recommended Maximum: 4 + * gravity + * Default: 9.81 + * Recommended Minimum: 4 + * Recommended Maximum: 20 + * deviation: Magnitude of sinusoidal (cosine) deviation of the goal along the vertical dimension + * Default: 0 + * Recommended Minimum: 0 + * Recommended Maximum: 5 + * deviation_freq: Frequency of the cosine deviation of the goal along the vertical dimension + * Default: 0 + * Recommended Minimum: 0 + * Recommended Maximum: 3 * Benchmark Mean Reward: 30 ## [Crawler](https://youtu.be/ftLliaeooYI) @@ -194,7 +254,7 @@ If you would like to contribute environments, please see our angular acceleration of the body. * Vector Action space: (Continuous) Size of 20, corresponding to target rotations for joints. - * Visual Observations: None. + * Visual Observations: None * Reset Parameters: None * Benchmark Mean Reward for `CrawlerStaticTarget`: 2000 * Benchmark Mean Reward for `CrawlerDynamicTarget`: 400 @@ -224,7 +284,15 @@ If you would like to contribute environments, please see our `VisualBanana` scene. __The visual observation version of this environment does not train with the provided default training parameters.__ -* Reset Parameters: None. +* Reset Parameters: Two + * laser_length: Length of the laser used by the agent + * Default: 1 + * Recommended Minimum: 0.2 + * Recommended Maximum: 7 + * agent_scale: Specifies the scale of the agent in the 3 dimensions (equal across the three dimensions) + * Default: 1 + * Recommended Minimum: 0.5 + * Recommended Maximum: 5 * Benchmark Mean Reward: 10 * Optional Imitation Learning scene: `BananaIL`. @@ -250,7 +318,7 @@ If you would like to contribute environments, please see our `VisualHallway` scene. __The visual observation version of this environment does not train with the provided default training parameters.__ -* Reset Parameters: None. +* Reset Parameters: None * Benchmark Mean Reward: 0.7 * To speed up training, you can enable curiosity by adding `use_curiosity: true` in `config/trainer_config.yaml` * Optional Imitation Learning scene: `HallwayIL`. @@ -272,8 +340,12 @@ If you would like to contribute environments, please see our banana. * Vector Action space: (Continuous) 3 corresponding to agent force applied for the jump. - * Visual Observations: None. -* Reset Parameters: None. 
+ * Visual Observations: None +* Reset Parameters: Two + * banana_scale: The scale of the banana in the 3 dimensions + * Default: 150 + * Recommended Minimum: 50 + * Recommended Maximum: 250 * Benchmark Mean Reward: 10 ## [Soccer Twos](https://youtu.be/Hg3nmYD3DjQ) @@ -303,8 +375,16 @@ If you would like to contribute environments, please see our * Striker: 6 actions corresponding to forward, backward, sideways movement, as well as rotation. * Goalie: 4 actions corresponding to forward, backward, sideways movement. - * Visual Observations: None. -* Reset Parameters: None + * Visual Observations: None +* Reset Parameters: Two + * ball_scale: Specifies the scale of the ball in the 3 dimensions (equal across the three dimensions) + * Default: 7.5 + * Recommended minimum: 4 + * Recommended maximum: 10 + * gravity: Magnitude of the gravity + * Default: 9.81 + * Recommended minimum: 6 + * Recommended maximum: 20 * Benchmark Mean Reward (Striker & Goalie Brain): 0 (the means will be inverse of each other and criss crosses during training) __Note that our trainer is currently unable to consistently train this environment__ @@ -329,8 +409,24 @@ If you would like to contribute environments, please see our velocity, and angular velocities of each limb, along with goal direction. * Vector Action space: (Continuous) Size of 39, corresponding to target rotations applicable to the joints. - * Visual Observations: None. -* Reset Parameters: None. + * Visual Observations: None +* Reset Parameters: Four + * gravity: Magnitude of gravity + * Default: 9.81 + * Recommended Minimum: + * Recommended Maximum: + * hip_mass: Mass of the hip component of the walker + * Default: 15 + * Recommended Minimum: 7 + * Recommended Maximum: 28 + * chest_mass: Mass of the chest component of the walker + * Default: 8 + * Recommended Minimum: 3 + * Recommended Maximum: 20 + * spine_mass: Mass of the spine component of the walker + * Default: 10 + * Recommended Minimum: 3 + * Recommended Maximum: 20 * Benchmark Mean Reward: 1000 ## Pyramids @@ -354,6 +450,6 @@ If you would like to contribute environments, please see our `VisualPyramids` scene. __The visual observation version of this environment does not train with the provided default training parameters.__ -* Reset Parameters: None. +* Reset Parameters: None * Optional Imitation Learning scene: `PyramidsIL`. * Benchmark Mean Reward: 1.75 diff --git a/docs/ML-Agents-Overview.md b/docs/ML-Agents-Overview.md index 5d727ca6ad..f194c64ef0 100644 --- a/docs/ML-Agents-Overview.md +++ b/docs/ML-Agents-Overview.md @@ -185,8 +185,8 @@ range of training and inference scenarios: - **Learning** - where decisions are made using an embedded [TensorFlow](Background-TensorFlow.md) model. The embedded TensorFlow model represents a learned policy and the Brain directly uses this model to - determine the action for each Agent. You can train a **Learning Brain** - by dragging it into the Academy's `Broadcast Hub` with the `Control` + determine the action for each Agent. You can train a **Learning Brain** + by dragging it into the Academy's `Broadcast Hub` with the `Control` checkbox checked. - **Player** - where decisions are made using real input from a keyboard or controller. Here, a human player is controlling the Agent and the observations @@ -224,7 +224,7 @@ inference can proceed. As mentioned previously, the ML-Agents toolkit ships with several implementations of state-of-the-art algorithms for training intelligent agents. -In this mode, the only Brain used is a **Learning Brain**. 
More +specifically, during training, all the medics in the scene send their observations to the Python API through the External Communicator (this is the behavior with an External Brain). The Python API @@ -244,7 +244,7 @@ time. To summarize: our built-in implementations are based on TensorFlow, thus, during training the Python API uses the observations it receives to learn a TensorFlow model. This model is then embedded within the Learning Brain during inference to -generate the optimal actions for all Agents linked to that Brain. +generate the optimal actions for all Agents linked to that Brain. The [Getting Started with the 3D Balance Ball Example](Getting-Started-with-Balance-Ball.md) @@ -255,7 +255,7 @@ tutorial covers this training mode with the **3D Balance Ball** sample environme In the previous mode, the Learning Brain was used for training to generate a TensorFlow model that the Learning Brain can later use. However, any user of the ML-Agents toolkit can leverage their own algorithms for -training. In this case, the Brain type would be set to Learning and be linked +training. In this case, the Brain type would be set to Learning and be linked to the BroadcastHub (with checked `Control` checkbox) and the behaviors of all the Agents in the scene will be controlled within Python. You can even turn your environment into a [gym.](../gym-unity/README.md) @@ -319,8 +319,11 @@ imitation learning algorithm will then use these pairs of observations and actions from the human player to learn a policy. [Video Link](https://youtu.be/kpb8ZkMBFYs). -The [Training with Imitation Learning](Training-Imitation-Learning.md) tutorial -covers this training mode with the **Banana Collector** sample environment. +The toolkit provides a way to learn directly from demonstrations, as well as use them +to help speed up reward-based training (RL). We include two algorithms called +Behavioral Cloning (BC) and Generative Adversarial Imitation Learning (GAIL). The +[Training with Imitation Learning](Training-Imitation-Learning.md) tutorial covers these +features in more depth. ## Flexible Training Scenarios @@ -405,10 +408,18 @@ training process. learn more about adding visual observations to an agent [here](Learning-Environment-Design-Agents.md#multiple-visual-observations). +- **Training with Reset Parameter Sampling** - To train agents to adapt + to changes in their environment (i.e., generalization), the agent should be exposed + to several variations of the environment. Similar to Curriculum Learning, + where environments become more difficult as the agent learns, the toolkit provides + a way to randomly sample Reset Parameters of the environment during training. See + [Training Generalized Reinforcement Learning Agents](Training-Generalized-Reinforcement-Learning-Agents.md) + to learn more about this feature. + - **Broadcasting** - As discussed earlier, a Learning Brain sends the observations for all its Agents to the Python API when dragged into the Academy's `Broadcast Hub` with the `Control` checkbox checked. This is helpful - for training and later inference. Broadcasting is a feature which can be + for training and later inference. Broadcasting is a feature which can be enabled all types of Brains (Player, Learning, Heuristic) where the Agent observations and actions are also sent to the Python API (despite the fact that the Agent is **not** controlled by the Python API).
This feature is diff --git a/docs/Migrating.md b/docs/Migrating.md index 7c99341e12..9ab3dafce6 100644 --- a/docs/Migrating.md +++ b/docs/Migrating.md @@ -1,5 +1,26 @@ # Migrating +## Migrating from ML-Agents toolkit v0.8 to v0.9 + +### Important Changes +* We have changed the way reward signals (including Curiosity) are defined in the +`trainer_config.yaml`. +* When using multiple environments, every "step" is recorded in TensorBoard. +* The steps in the command line console correspond to a single step of a single environment. +Previously, each step corresponded to one step for all environments (i.e., `num_envs` steps). + +#### Steps to Migrate +* If you were overriding any of the following parameters in your config file, remove them +from the top-level config and follow the steps below: + * `gamma`: Define a new `extrinsic` reward signal and set its `gamma` to your new gamma. + * `use_curiosity`, `curiosity_strength`, `curiosity_enc_size`: Define a `curiosity` reward signal + and set its `strength` to `curiosity_strength`, and `encoding_size` to `curiosity_enc_size`. Give it + the same `gamma` as your `extrinsic` signal to mimic previous behavior. +See [Reward Signals](Reward-Signals.md) for more information on defining reward signals. +* TensorBoards generated when running multiple environments in v0.8 are not comparable to those generated in +v0.9 in terms of step count. Multiply your v0.8 step count by `num_envs` for an approximate comparison. +You may need to change `max_steps` in your config as appropriate as well. + ## Migrating from ML-Agents toolkit v0.7 to v0.8 ### Important Changes diff --git a/docs/Profiling-Python.md b/docs/Profiling-Python.md new file mode 100644 index 0000000000..45904b883e --- /dev/null +++ b/docs/Profiling-Python.md @@ -0,0 +1,52 @@ +# Profiling in Python + +As part of the ML-Agents toolkit, we provide a lightweight profiling system, +in order to identify hotspots in the training process and help spot regressions from changes. + +Timers are hierarchical, meaning that the time tracked in a block of code can be further split into other blocks if +desired. This also means that a function that is called from multiple places in the code will appear in multiple +places in the timing output. + +All timers operate using a "global" instance by default, but this can be overridden if necessary (mainly for testing). + +## Adding Profiling + +There are two ways to indicate code should be included in profiling. The simplest way is to add the `@timed` +decorator to a function or method of interest. + +```python +class TrainerController: + # .... + @timed + def advance(self, env: EnvManager) -> int: + # do stuff +``` + +You can also use the `hierarchical_timer` context manager. + +``` python +with hierarchical_timer("communicator.exchange"): + outputs = self.communicator.exchange(step_input) +``` + +The context manager may be easier than the `@timed` decorator for profiling different parts of a large function, or +profiling calls to abstract methods that might not use the decorator. + +## Output +By default, at the end of training, timers are collected and written in JSON format to +`{summaries_dir}/{run_id}_timers.json`. The output consists of node objects with the following keys: + * name (string): The name of the block of code. + * total (float): The total time in seconds spent in the block, including child calls. + * count (int): The number of times the block was called. + * self (float): The total time in seconds spent in the block, excluding child calls.
+ * children (list): A list of child nodes. + * is_parallel (bool): Indicates that the block of code was executed in multiple threads or processes (see below). This + is optional and defaults to false. + +### Parallel execution +For code that executes in multiple processes (for example, SubprocessEnvManager), we periodically send the timer +information back to the "main" process, aggregate the timers there, and flush them in the subprocess. Note that +(depending on the number of processes) this can result in timers where the total time may exceed the parent's total +time. This is analogous to the difference between "real" and "user" values reported from the unix `time` command. In the +timer output, blocks that were run in parallel are indicated by the `is_parallel` flag. + diff --git a/docs/Readme.md b/docs/Readme.md index fdad80e4f5..f85ae59d80 100644 --- a/docs/Readme.md +++ b/docs/Readme.md @@ -39,6 +39,7 @@ * [Training with Curriculum Learning](Training-Curriculum-Learning.md) * [Training with Imitation Learning](Training-Imitation-Learning.md) * [Training with LSTM](Feature-Memory.md) +* [Training Generalized Reinforcement Learning Agents](Training-Generalized-Reinforcement-Learning-Agents.md) * [Training on the Cloud with Amazon Web Services](Training-on-Amazon-Web-Service.md) * [Training on the Cloud with Microsoft Azure](Training-on-Microsoft-Azure.md) * [Training Using Concurrent Unity Instances](Training-Using-Concurrent-Unity-Instances.md) diff --git a/docs/Reward-Signals.md b/docs/Reward-Signals.md new file mode 100644 index 0000000000..0b44185766 --- /dev/null +++ b/docs/Reward-Signals.md @@ -0,0 +1,236 @@ +# Reward Signals + +In reinforcement learning, the end goal for the Agent is to discover a behavior (a Policy) +that maximizes a reward. Typically, a reward is defined by your environment, and corresponds +to reaching some goal. These are what we refer to as "extrinsic" rewards, as they are defined +external to the learning algorithm. + +Rewards, however, can be defined outside of the environment as well, to encourage the agent to +behave in certain ways, or to aid the learning of the true extrinsic reward. We refer to these +rewards as "intrinsic" reward signals. The total reward that the agent will learn to maximize can +be a mix of extrinsic and intrinsic reward signals. + +ML-Agents allows reward signals to be defined in a modular way, and we provide three reward +signals that can be mixed and matched to help shape your agent's behavior. The `extrinsic` Reward +Signal represents the rewards defined in your environment, and is enabled by default. +The `curiosity` reward signal helps your agent explore when extrinsic rewards are sparse. + +## Enabling Reward Signals + +Reward signals, like other hyperparameters, are defined in the trainer config `.yaml` file. Examples +are provided in `config/trainer_config.yaml` and `config/gail_config.yaml`. To enable a reward signal, add it to the +`reward_signals:` section under the brain name. For instance, to enable the extrinsic signal +in addition to a small curiosity reward and a GAIL reward signal, you would define your `reward_signals` as follows: + +```yaml +reward_signals: + extrinsic: + strength: 1.0 + gamma: 0.99 + curiosity: + strength: 0.02 + gamma: 0.99 + encoding_size: 256 + gail: + strength: 0.01 + gamma: 0.99 + encoding_size: 128 + demo_path: demos/ExpertPyramid.demo +``` + +Each reward signal should define at least two parameters, `strength` and `gamma`, in addition +to any class-specific hyperparameters.
Note that to remove a reward signal, you should delete +its entry entirely from `reward_signals`. At least one reward signal should be left defined +at all times. + +## Reward Signal Types +As part of the toolkit, we provide three reward signal types as hyperparameters - Extrinsic, Curiosity, and GAIL. + +### Extrinsic Reward Signal + +The `extrinsic` reward signal is simply the reward given by the +[environment](Learning-Environment-Design.md). Remove it to force the agent +to ignore the environment reward. + +#### Strength + +`strength` is the factor by which to multiply the raw +reward. Typical ranges will vary depending on the reward signal. + +Typical Range: `1.0` + +#### Gamma + +`gamma` corresponds to the discount factor for future rewards. This can be +thought of as how far into the future the agent should care about possible +rewards. In situations when the agent should be acting in the present in order +to prepare for rewards in the distant future, this value should be large. In +cases when rewards are more immediate, it can be smaller. + +Typical Range: `0.8` - `0.995` + +### Curiosity Reward Signal + +The `curiosity` reward signal enables the Intrinsic Curiosity Module. This is an implementation +of the approach described in "Curiosity-driven Exploration by Self-supervised Prediction" +by Pathak, et al. It trains two networks: +* an inverse model, which takes the current and next observation of the agent, encodes them, and +uses the encoding to predict the action that was taken between the observations +* a forward model, which takes the encoded current observation and action, and predicts the +next encoded observation. + +The loss of the forward model (the difference between the predicted and actual encoded observations) is used as the intrinsic reward, so the more surprised the model is, the larger the reward will be. + +For more information, see +* https://arxiv.org/abs/1705.05363 +* https://pathak22.github.io/noreward-rl/ +* https://blogs.unity3d.com/2018/06/26/solving-sparse-reward-tasks-with-curiosity/ + +#### Strength + +In this case, `strength` corresponds to the magnitude of the curiosity reward generated +by the intrinsic curiosity module. This should be scaled in order to ensure it is large enough +to not be overwhelmed by extrinsic reward signals in the environment. +Likewise, it should not be so large that it overwhelms the extrinsic reward signal. + +Typical Range: `0.001` - `0.1` + +#### Gamma + +`gamma` corresponds to the discount factor for future rewards. + +Typical Range: `0.8` - `0.995` + +#### (Optional) Encoding Size + +`encoding_size` corresponds to the size of the encoding used by the intrinsic curiosity model. +This value should be small enough to encourage the ICM to compress the original +observation, but not so small that it prevents the ICM from learning the dynamics of +the environment. + +Default Value: `64` + +Typical Range: `64` - `256` + +#### (Optional) Learning Rate + +`learning_rate` is the learning rate used to update the intrinsic curiosity module. +This should typically be decreased if training is unstable, and the curiosity loss is unstable. + +Default Value: `3e-4` + +Typical Range: `1e-5` - `1e-3` + +#### (Optional) Num Epochs + +`num_epoch` is the number of passes to make through the experience buffer when performing gradient +descent optimization for the ICM. This should typically be set to the same value used for PPO.
+ +Default Value: `3` + +Typical Range: `3` - `10` + +### GAIL Reward Signal + +GAIL, or [Generative Adversarial Imitation Learning](https://arxiv.org/abs/1606.03476), is an +imitation learning algorithm that uses an adversarial approach, in a similar vein to GANs +(Generative Adversarial Networks). In this framework, a second neural network, the +discriminator, is taught to distinguish whether an observation/action is from a demonstration or +produced by the agent. This discriminator can then examine a new observation/action and provide a +reward based on how close it believes this new observation/action is to the provided demonstrations. + +At each training step, the agent tries to learn how to maximize this reward. Then, the +discriminator is trained to better distinguish between demonstrations and agent state/actions. +In this way, while the agent gets better and better at mimicking the demonstrations, the +discriminator keeps getting stricter and stricter and the agent must try harder to "fool" it. + +This approach, when compared to [Behavioral Cloning](Training-Behavioral-Cloning.md), requires +far fewer demonstrations to be provided. After all, we are still learning a policy that happens +to be similar to the demonstrations, not directly copying the behavior of the demonstrations. It +is especially effective when combined with an Extrinsic signal. However, the GAIL reward signal can +also be used independently to purely learn from demonstrations. + +Using GAIL requires recorded demonstrations from your Unity environment. See the +[imitation learning guide](Training-Imitation-Learning.md) to learn more about recording demonstrations. + +#### Strength + +`strength` is the factor by which to multiply the raw reward. Note that when using GAIL +with an Extrinsic Signal, this value should be set lower if your demonstrations are +suboptimal (e.g. from a human), so that a trained agent will focus on receiving extrinsic +rewards instead of exactly copying the demonstrations. Keep the strength below about 0.1 in those cases. + +Typical Range: `0.01` - `1.0` + +#### Gamma + +`gamma` corresponds to the discount factor for future rewards. + +Typical Range: `0.8` - `0.9` + +#### Demo Path + +`demo_path` is the path to your `.demo` file or directory of `.demo` files. See the +[imitation learning guide](Training-Imitation-Learning.md). + +#### (Optional) Encoding Size + +`encoding_size` corresponds to the size of the hidden layer used by the discriminator. +This value should be small enough to encourage the discriminator to compress the original +observation, but not so small that it prevents the discriminator from learning to differentiate between +demonstrated and actual behavior. Dramatically increasing this size will also negatively affect +training times. + +Default Value: `64` + +Typical Range: `64` - `256` + +#### (Optional) Learning Rate + +`learning_rate` is the learning rate used to update the discriminator. +This should typically be decreased if training is unstable, and the GAIL loss is unstable. + +Default Value: `3e-4` + +Typical Range: `1e-5` - `1e-3` + +#### (Optional) Use Actions + +`use_actions` determines whether the discriminator should discriminate based on both +observations and actions, or just observations. Set to `True` if you want the agent to +mimic the actions from the demonstrations, and `False` if you'd rather have the agent +visit the same states as in the demonstrations but with possibly different actions.
+Setting to `False` is more likely to be stable, especially with imperfect demonstrations, +but may learn slower. + +Default Value: `false` + +#### (Optional) Variational Discriminator Bottleneck + +`use_vail` enables a [variational bottleneck](https://arxiv.org/abs/1810.00821) within the +GAIL discriminator. This forces the discriminator to learn a more general representation +and reduces its tendency to be "too good" at discriminating, making learning more stable. +However, it does increase training time. Enable this if you notice your imitation learning is +unstable, or unable to learn the task at hand. + +Default Value: `false` + +#### (Optional) Samples Per Update + +`samples_per_update` is the maximum number of samples to use during each discriminator update. You may +want to lower this if your buffer size is very large to avoid overfitting the discriminator on current data. +If set to 0, we will use the minimum of buffer size and the number of demonstration samples. + +Default Value: `0` + +Typical Range: Approximately equal to [`buffer_size`](Training-PPO.md) + +#### (Optional) Num Epochs + +`num_epoch` is the number of passes to make through the experience buffer when performing gradient +descent optimization for the discriminator. To avoid overfitting, this should typically be set to +the same as or less than the value used for PPO. + +Default Value: `3` + +Typical Range: `1` - `10` \ No newline at end of file diff --git a/docs/Training-Behavioral-Cloning.md b/docs/Training-Behavioral-Cloning.md new file mode 100644 index 0000000000..427c8db515 --- /dev/null +++ b/docs/Training-Behavioral-Cloning.md @@ -0,0 +1,92 @@ +# Training with Behavioral Cloning + +There are a variety of possible imitation learning algorithms that can +be used; the simplest of these is Behavioral Cloning. It works by collecting +demonstrations from a teacher, and then simply using them to directly learn a +policy, in the same way that supervised learning works for image classification +or other traditional Machine Learning tasks. + +## Offline Training + +With offline behavioral cloning, we can use demonstrations (`.demo` files) +generated using the `Demonstration Recorder` as the dataset used to train a behavior. + +1. Choose an agent you would like to have learn to imitate a set of demonstrations. +2. Record a set of demonstrations using the `Demonstration Recorder` (see [here](Training-Imitation-Learning.md)). + For illustrative purposes we will refer to this file as `AgentRecording.demo`. +3. Build the scene, assigning the agent a Learning Brain, and set the Brain to + Control in the Broadcast Hub. For more information on Brains, see + [here](Learning-Environment-Design-Brains.md). +4. Open the `config/offline_bc_config.yaml` file. +5. Modify the `demo_path` parameter in the file to reference the path to the + demonstration file recorded in step 2. In our case this is: + `./UnitySDK/Assets/Demonstrations/AgentRecording.demo` +6. Launch `mlagents-learn`, providing `./config/offline_bc_config.yaml` + as the config parameter, and include the `--run-id` and `--train` options as usual. + Provide your environment as the `--env` parameter if it has been compiled + as standalone, or omit it to train in the editor. +7. (Optional) Observe training performance using TensorBoard. + +This will use the demonstration file to train a neural network driven agent +to directly imitate the actions provided in the demonstration. The environment +will launch and be used for evaluating the agent's performance during training.
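For example, putting steps 5 and 6 together, a launch command for offline behavioral cloning might look like the following; the run id `bc-banana-1` is an arbitrary example, and `--env` can be omitted to train in the Editor:

```sh
mlagents-learn ./config/offline_bc_config.yaml --run-id=bc-banana-1 --train
```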
+ +## Online Training + +It is also possible to provide demonstrations in realtime during training, +without pre-recording a demonstration file. The steps to do this are as follows: + +1. First create two Brains, one which will be the "Teacher," and the other which + will be the "Student." We will assume that the names of the Brain + Assets are "Teacher" and "Student" respectively. +2. The "Teacher" Brain must be a **Player Brain**. You must properly + configure the inputs to map to the corresponding actions. +3. The "Student" Brain must be a **Learning Brain**. +4. The Brain Parameters of both the "Teacher" and "Student" Brains must be + compatible with the agent. +5. Drag both the "Teacher" and "Student" Brain into the Academy's `Broadcast Hub` + and check the `Control` checkbox on the "Student" Brain. +6. Link the Brains to the desired Agents (one Agent as the teacher and at least + one Agent as a student). +7. In `config/online_bc_config.yaml`, add an entry for the "Student" Brain. Set + the `trainer` parameter of this entry to `online_bc`, and the + `brain_to_imitate` parameter to the name of the teacher Brain: "Teacher". + Additionally, set `batches_per_epoch`, which controls how much training to do + each moment. Increase the `max_steps` option if you'd like to keep training + the Agents for a longer period of time. +8. Launch the training process with `mlagents-learn config/online_bc_config.yaml + --train --slow`, and press the :arrow_forward: button in Unity when the + message _"Start training by pressing the Play button in the Unity Editor"_ is + displayed on the screen +9. From the Unity window, control the Agent with the Teacher Brain by providing + "teacher demonstrations" of the behavior you would like to see. +10. Watch as the Agent(s) with the student Brain attached begin to behave + similarly to the demonstrations. +11. Once the Student Agents are exhibiting the desired behavior, end the training + process with `CTL+C` from the command line. +12. Move the resulting `*.nn` file into the `TFModels` subdirectory of the + Assets folder (or a subdirectory within Assets of your choosing) , and use + with `Learning` Brain. + +**BC Teacher Helper** + +We provide a convenience utility, `BC Teacher Helper` component that you can add +to the Teacher Agent. + +

+ BC Teacher Helper +

+ +This utility enables you to use keyboard shortcuts to do the following: + +1. Start and stop recording experiences. This is useful in case you'd like to + interact with the game _but not have the agents learn from these + interactions_. The default command to toggle this is to press `R` on the + keyboard. + +2. Reset the training buffer. This enables you to instruct the agents to forget + their buffer of recent experiences. This is useful if you'd like to get them + to quickly learn a new behavior. The default command to reset the buffer is + to press `C` on the keyboard. diff --git a/docs/Training-Generalized-Reinforcement-Learning-Agents.md b/docs/Training-Generalized-Reinforcement-Learning-Agents.md new file mode 100644 index 0000000000..29210781ce --- /dev/null +++ b/docs/Training-Generalized-Reinforcement-Learning-Agents.md @@ -0,0 +1,171 @@ +# Training Generalized Reinforcement Learning Agents + +One of the challenges of training and testing agents on the same +environment is that the agents tend to overfit. The result is that the +agents are unable to generalize to any tweaks or variations in the environment. +This is analogous to a model being trained and tested on an identical dataset +in supervised learning. This becomes problematic in cases where environments +are randomly instantiated with varying objects or properties. + +To make agents robust and generalizable to different environments, the agent +should be trained over multiple variations of the environment. Using this approach +for training, the agent will be better suited to adapt (with higher performance) +to future unseen variations of the environment. + +_Example of variations of the 3D Ball environment._ + +Ball scale of 0.5 | Ball scale of 4 +:-------------------------:|:-------------------------: +![](images/3dball_small.png) | ![](images/3dball_big.png) + +## Introducing Generalization Using Reset Parameters + +To enable variations in the environments, we implemented `Reset Parameters`. We +also included different sampling methods and the ability to create new kinds of +sampling methods for each `Reset Parameter`. In the 3D ball environment example displayed +in the figure above, the reset parameters are `gravity`, `ball_mass` and `ball_scale`. + + +## How to Enable Generalization Using Reset Parameters + +We first need to provide a way to modify the environment by supplying a set of `Reset Parameters` +and vary them over time. This provision can be done either deterministically or randomly. + +This is done by assigning each `Reset Parameter` a `sampler-type` (such as a uniform sampler), +which determines how to sample a `Reset +Parameter`. If a `sampler-type` isn't provided for a +`Reset Parameter`, the parameter maintains the default value throughout the +training procedure, remaining unchanged. The samplers for all the `Reset Parameters` +are handled by a **Sampler Manager**, which also handles the generation of new +values for the reset parameters when needed. + +To set up the Sampler Manager, we create a YAML file that specifies how we wish to +generate new samples for each `Reset Parameter`. In this file, we specify the samplers and the +`resampling-interval` (the number of simulation steps after which reset parameters are +resampled). Below is an example of a sampler file for the 3D ball environment.
+ +```yaml +resampling-interval: 5000 + +mass: + sampler-type: "uniform" + min_value: 0.5 + max_value: 10 + +gravity: + sampler-type: "multirange_uniform" + intervals: [[7, 10], [15, 20]] + +scale: + sampler-type: "uniform" + min_value: 0.75 + max_value: 3 + +``` + +Below is the explanation of the fields in the above example. + +* `resampling-interval` - Specifies the number of steps for the agent to +train under a particular environment configuration before resetting the +environment with a new sample of `Reset Parameters`. + +* `Reset Parameter` - Name of the `Reset Parameter` like `mass`, `gravity` and `scale`. This should match the name +specified in the academy of the intended environment for which the agent is +being trained. If a parameter specified in the file doesn't exist in the +environment, then this parameter will be ignored. Within each `Reset Parameter` + + * `sampler-type` - Specify the sampler type to use for the `Reset Parameter`. + This is a string that should exist in the `Sampler Factory` (explained + below). + + * `sampler-type-sub-arguments` - Specify the sub-arguments depending on the `sampler-type`. + In the example above, this would correspond to the `intervals` + under the `sampler-type` `"multirange_uniform"` for the `Reset Parameter` called gravity`. + The key name should match the name of the corresponding argument in the sampler definition. + (See below) + +The Sampler Manager allocates a sampler type for each `Reset Parameter` by using the *Sampler Factory*, +which maintains a dictionary mapping of string keys to sampler objects. The available sampler types +to be used for each `Reset Parameter` is available in the Sampler Factory. + +### Included Sampler Types + +Below is a list of included `sampler-type` as part of the toolkit. + +* `uniform` - Uniform sampler + * Uniformly samples a single float value between defined endpoints. + The sub-arguments for this sampler to specify the interval + endpoints are as below. The sampling is done in the range of + [`min_value`, `max_value`). + + * **sub-arguments** - `min_value`, `max_value` + +* `gaussian` - Gaussian sampler + * Samples a single float value from the distribution characterized by + the mean and standard deviation. The sub-arguments to specify the + gaussian distribution to use are as below. + + * **sub-arguments** - `mean`, `st_dev` + +* `multirange_uniform` - Multirange uniform sampler + * Uniformly samples a single float value between the specified intervals. + Samples by first performing a weight pick of an interval from the list + of intervals (weighted based on interval width) and samples uniformly + from the selected interval (half-closed interval, same as the uniform + sampler). This sampler can take an arbitrary number of intervals in a + list in the following format: + [[`interval_1_min`, `interval_1_max`], [`interval_2_min`, `interval_2_max`], ...] + + * **sub-arguments** - `intervals` + +The implementation of the samplers can be found at `ml-agents-envs/mlagents/envs/sampler_class.py`. + +### Defining a New Sampler Type + +If you want to define your own sampler type, you must first inherit the *Sampler* +base class (included in the `sampler_class` file) and preserve the interface. +Once the class for the required method is specified, it must be registered in the Sampler Factory. + +This can be done by subscribing to the *register_sampler* method of the SamplerFactory. 
The command +is as follows: + +`SamplerFactory.register_sampler(*custom_sampler_string_key*, *custom_sampler_object*)` + +Once the new sampler is registered with the Sampler Factory, it can be used to sample any +`Reset Parameter`. For example, let's say a new sampler type was implemented as below and we register +the `CustomSampler` class with the string `custom-sampler` in the Sampler Factory. + +```python +import numpy as np +from mlagents.envs.sampler_class import Sampler + +class CustomSampler(Sampler): + + def __init__(self, argA, argB, argC): + self.possible_vals = [argA, argB, argC] + + def sample_all(self): + return np.random.choice(self.possible_vals) +``` + +Now we need to specify the new sampler type in the sampler YAML file. For example, we use this new +sampler type for the `Reset Parameter` *mass*. + +```yaml +mass: + sampler-type: "custom-sampler" + argB: 1 + argA: 2 + argC: 3 +``` + +### Training with Generalization Using Reset Parameters + +After the sampler YAML file is defined, we proceed by launching `mlagents-learn` and specifying +our configured sampler file with the `--sampler` flag. For example, if we wanted to train the +3D ball agent with generalization using `Reset Parameters` with the `config/3dball_generalize.yaml` +sampling setup, we would run + +```sh +mlagents-learn config/trainer_config.yaml --sampler=config/3dball_generalize.yaml +--run-id=3D-Ball-generalization --train +``` + +We can observe progress and metrics via TensorBoard. diff --git a/docs/Training-Imitation-Learning.md b/docs/Training-Imitation-Learning.md index 027564cbb6..679568a339 100644 --- a/docs/Training-Imitation-Learning.md +++ b/docs/Training-Imitation-Learning.md @@ -1,4 +1,4 @@ -# Imitation Learning +# Training with Imitation Learning It is often more intuitive to simply demonstrate the behavior we want an agent to perform, rather than attempting to have it learn via trial-and-error methods. @@ -10,131 +10,81 @@ from the game and actions from a game controller to guide the medic's behavior. Imitation Learning uses pairs of observations and actions from a demonstration to learn a policy. [Video Link](https://youtu.be/kpb8ZkMBFYs). -## Recording Demonstrations - -It is possible to record demonstrations of agent behavior from the Unity Editor, -and save them as assets. These demonstrations contain information on the -observations, actions, and rewards for a given agent during the recording session. -They can be managed from the Editor, as well as used for training with Offline -Behavioral Cloning (see below). - -In order to record demonstrations from an agent, add the `Demonstration Recorder` -component to a GameObject in the scene which contains an `Agent` component. -Once added, it is possible to name the demonstration that will be recorded -from the agent. - -

- BC Teacher Helper -

- -When `Record` is checked, a demonstration will be created whenever the scene -is played from the Editor. Depending on the complexity of the task, anywhere -from a few minutes or a few hours of demonstration data may be necessary to -be useful for imitation learning. When you have recorded enough data, end -the Editor play session, and a `.demo` file will be created in the -`Assets/Demonstrations` folder. This file contains the demonstrations. -Clicking on the file will provide metadata about the demonstration in the -inspector. +Imitation learning can also be used to help reinforcement learning. Especially in +environments with sparse (i.e., infrequent or rare) rewards, the agent may never see +the reward and thus not learn from it. Curiosity (which is available in the toolkit) +helps the agent explore, but in some cases +it is easier to show the agent how to achieve the reward. In these cases, +imitation learning combined with reinforcement learning can dramatically +reduce the time the agent takes to solve the environment. +For instance, on the [Pyramids environment](Learning-Environment-Examples.md#pyramids), +using 6 episodes of demonstrations can reduce training steps by more than 4 times. +See PreTraining + GAIL + Curiosity + RL below.

- BC Teacher Helper + Using Demonstrations with Reinforcement Learning

- - -## Training with Behavioral Cloning - -There are a variety of possible imitation learning algorithms which can -be used, the simplest one of them is Behavioral Cloning. It works by collecting -demonstrations from a teacher, and then simply uses them to directly learn a -policy, in the same way the supervised learning for image classification -or other traditional Machine Learning tasks work. - -### Offline Training +The ML-Agents toolkit provides several ways to learn from demonstrations. -With offline behavioral cloning, we can use demonstrations (`.demo` files) -generated using the `Demonstration Recorder` as the dataset used to train a behavior. +* To train using GAIL (Generative Adversarial Imitaiton Learning) you can add the + [GAIL reward signal](Reward-Signals.md#the-gail-reward-signal). GAIL can be + used with or without environment rewards, and works well when there are a limited + number of demonstrations. +* To help bootstrap reinforcement learning, you can enable + [pretraining](Training-PPO.md#optional-pretraining-using-demonstrations) + on the PPO trainer, in addition to using a small GAIL reward signal. +* To train an agent to exactly mimic demonstrations, you can use the + [Behavioral Cloning](Training-Behavioral-Cloning.md) trainer. Behavioral Cloning can be + used offline and online (in-editor), and learns very quickly. However, it usually is ineffective + on more complex environments without a large number of demonstrations. -1. Choose an agent you would like to learn to imitate some set of demonstrations. -2. Record a set of demonstration using the `Demonstration Recorder` (see above). - For illustrative purposes we will refer to this file as `AgentRecording.demo`. -3. Build the scene, assigning the agent a Learning Brain, and set the Brain to - Control in the Broadcast Hub. For more information on Brains, see - [here](Learning-Environment-Design-Brains.md). -4. Open the `config/offline_bc_config.yaml` file. -5. Modify the `demo_path` parameter in the file to reference the path to the - demonstration file recorded in step 2. In our case this is: - `./UnitySDK/Assets/Demonstrations/AgentRecording.demo` -6. Launch `mlagent-learn`, providing `./config/offline_bc_config.yaml` - as the config parameter, and include the `--run-id` and `--train` as usual. - Provide your environment as the `--env` parameter if it has been compiled - as standalone, or omit to train in the editor. -7. (Optional) Observe training performance using TensorBoard. +### How to Choose -This will use the demonstration file to train a neural network driven agent -to directly imitate the actions provided in the demonstration. The environment -will launch and be used for evaluating the agent's performance during training. +If you want to help your agents learn (especially with environments that have sparse rewards) +using pre-recorded demonstrations, you can generally enable both GAIL and Pretraining. +An example of this is provided for the Pyramids example environment under + `PyramidsLearning` in `config/gail_config.yaml`. -### Online Training +If you want to train purely from demonstrations, GAIL is generally the preferred approach, especially +if you have few (<10) episodes of demonstrations. An example of this is provided for the Crawler example +environment under `CrawlerStaticLearning` in `config/gail_config.yaml`. -It is also possible to provide demonstrations in realtime during training, -without pre-recording a demonstration file. 
The steps to do this are as follows: +If you have plenty of demonstrations and/or a very simple environment, Behavioral Cloning +(online and offline) can be effective and quick. However, it cannot be combined with RL. -1. First create two Brains, one which will be the "Teacher," and the other which - will be the "Student." We will assume that the names of the Brain - Assets are "Teacher" and "Student" respectively. -2. The "Teacher" Brain must be a **Player Brain**. You must properly - configure the inputs to map to the corresponding actions. -3. The "Student" Brain must be a **Learning Brain**. -4. The Brain Parameters of both the "Teacher" and "Student" Brains must be - compatible with the agent. -5. Drag both the "Teacher" and "Student" Brain into the Academy's `Broadcast Hub` - and check the `Control` checkbox on the "Student" Brain. -6. Link the Brains to the desired Agents (one Agent as the teacher and at least - one Agent as a student). -7. In `config/online_bc_config.yaml`, add an entry for the "Student" Brain. Set - the `trainer` parameter of this entry to `online_bc`, and the - `brain_to_imitate` parameter to the name of the teacher Brain: "Teacher". - Additionally, set `batches_per_epoch`, which controls how much training to do - each moment. Increase the `max_steps` option if you'd like to keep training - the Agents for a longer period of time. -8. Launch the training process with `mlagents-learn config/online_bc_config.yaml - --train --slow`, and press the :arrow_forward: button in Unity when the - message _"Start training by pressing the Play button in the Unity Editor"_ is - displayed on the screen -9. From the Unity window, control the Agent with the Teacher Brain by providing - "teacher demonstrations" of the behavior you would like to see. -10. Watch as the Agent(s) with the student Brain attached begin to behave - similarly to the demonstrations. -11. Once the Student Agents are exhibiting the desired behavior, end the training - process with `CTL+C` from the command line. -12. Move the resulting `*.nn` file into the `TFModels` subdirectory of the - Assets folder (or a subdirectory within Assets of your choosing) , and use - with `Learning` Brain. +## Recording Demonstrations -**BC Teacher Helper** +It is possible to record demonstrations of agent behavior from the Unity Editor, +and save them as assets. These demonstrations contain information on the +observations, actions, and rewards for a given agent during the recording session. +They can be managed from the Editor, as well as used for training with Offline +Behavioral Cloning and GAIL. -We provide a convenience utility, `BC Teacher Helper` component that you can add -to the Teacher Agent. +In order to record demonstrations from an agent, add the `Demonstration Recorder` +component to a GameObject in the scene which contains an `Agent` component. +Once added, it is possible to name the demonstration that will be recorded +from the agent.

- BC Teacher Helper

-This utility enables you to use keyboard shortcuts to do the following: - -1. To start and stop recording experiences. This is useful in case you'd like to - interact with the game _but not have the agents learn from these - interactions_. The default command to toggle this is to press `R` on the - keyboard. +When `Record` is checked, a demonstration will be created whenever the scene +is played from the Editor. Depending on the complexity of the task, anywhere +from a few minutes or a few hours of demonstration data may be necessary to +be useful for imitation learning. When you have recorded enough data, end +the Editor play session, and a `.demo` file will be created in the +`Assets/Demonstrations` folder. This file contains the demonstrations. +Clicking on the file will provide metadata about the demonstration in the +inspector. -2. Reset the training buffer. This enables you to instruct the agents to forget - their buffer of recent experiences. This is useful if you'd like to get them - to quickly learn a new behavior. The default command to reset the buffer is - to press `C` on the keyboard. +

+ BC Teacher Helper +
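To make the GAIL + pretraining combination described in the How to Choose section concrete, a trainer configuration entry could look roughly like the sketch below. This is a hand-written illustration rather than the actual contents of `config/gail_config.yaml`; the brain name and demo path are placeholders, and the values are taken from the examples elsewhere in these docs:

```yaml
PyramidsLearning:
  reward_signals:
    extrinsic:
      strength: 1.0
      gamma: 0.99
    gail:
      strength: 0.01
      gamma: 0.99
      encoding_size: 128
      demo_path: demos/ExpertPyramid.demo
  pretraining:
    demo_path: demos/ExpertPyramid.demo
    strength: 0.5
    steps: 10000
```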

diff --git a/docs/Training-ML-Agents.md b/docs/Training-ML-Agents.md index da597e584b..5a9a749119 100644 --- a/docs/Training-ML-Agents.md +++ b/docs/Training-ML-Agents.md @@ -91,65 +91,65 @@ While this example used the default training hyperparameters, you can edit the [training_config.yaml file](#training-config-file) with a text editor to set different values. -### Command line training options +### Command Line Training Options In addition to passing the path of the Unity executable containing your training environment, you can set the following command line options when invoking `mlagents-learn`: -* `--env=` - Specify an executable environment to train. -* `--curriculum=` – Specify a curriculum JSON file for defining the +* `--env=`: Specify an executable environment to train. +* `--curriculum=`: Specify a curriculum JSON file for defining the lessons for curriculum training. See [Curriculum Training](Training-Curriculum-Learning.md) for more information. -* `--keep-checkpoints=` – Specify the maximum number of model checkpoints to +* `--sampler=`: Specify a sampler YAML file for defining the + sampler for generalization training. See [Generalization + Training](Training-Generalized-Reinforcement-Learning-Agents.md) for more information. +* `--keep-checkpoints=`: Specify the maximum number of model checkpoints to keep. Checkpoints are saved after the number of steps specified by the `save-freq` option. Once the maximum number of checkpoints has been reached, the oldest checkpoint is deleted when saving a new checkpoint. Defaults to 5. -* `--lesson=` – Specify which lesson to start with when performing curriculum +* `--lesson=`: Specify which lesson to start with when performing curriculum training. Defaults to 0. -* `--load` – If set, the training code loads an already trained model to +* `--load`: If set, the training code loads an already trained model to initialize the neural network before training. The learning code looks for the model in `models//` (which is also where it saves models at the end of training). When not set (the default), the neural network weights are randomly initialized and an existing model is not loaded. -* `--num-runs=` - Sets the number of concurrent training sessions to perform. +* `--num-runs=`: Sets the number of concurrent training sessions to perform. Default is set to 1. Set to higher values when benchmarking performance and multiple training sessions is desired. Training sessions are independent, and do not improve learning performance. -* `--run-id=` – Specifies an identifier for each training run. This +* `--run-id=`: Specifies an identifier for each training run. This identifier is used to name the subdirectories in which the trained model and summary statistics are saved as well as the saved model itself. The default id is "ppo". If you use TensorBoard to view the training statistics, always set a unique run-id for each training run. (The statistics for all runs with the same id are combined as if they were produced by a the same session.) -* `--save-freq=` Specifies how often (in steps) to save the model during +* `--save-freq=`: Specifies how often (in steps) to save the model during training. Defaults to 50000. -* `--seed=` – Specifies a number to use as a seed for the random number +* `--seed=`: Specifies a number to use as a seed for the random number generator used by the training code. 
-* `--slow` – Specify this option to run the Unity environment at normal, game +* `--slow`: Specify this option to run the Unity environment at normal, game speed. The `--slow` mode uses the **Time Scale** and **Target Frame Rate** specified in the Academy's **Inference Configuration**. By default, training runs using the speeds specified in your Academy's **Training Configuration**. See [Academy Properties](Learning-Environment-Design-Academy.md#academy-properties). -* `--train` – Specifies whether to train model or only run in inference mode. +* `--train`: Specifies whether to train model or only run in inference mode. When training, **always** use the `--train` option. -* `--num-envs=` - Specifies the number of concurrent Unity environment instances to collect +* `--num-envs=`: Specifies the number of concurrent Unity environment instances to collect experiences from when training. Defaults to 1. -* `--base-port` - Specifies the starting port. Each concurrent Unity environment instance will get assigned a port sequentially, starting from the `base-port`. Each instance will use the port `(base_port + worker_id)`, where the `worker_id` is sequential IDs given to each instance from 0 to `num_envs - 1`. Default is 5005. -* `--docker-target-name=
` – The Docker Volume on which to store curriculum, +* `--base-port`: Specifies the starting port. Each concurrent Unity environment instance will get assigned a port sequentially, starting from the `base-port`. Each instance will use the port `(base_port + worker_id)`, where the `worker_id` is sequential IDs given to each instance from 0 to `num_envs - 1`. Default is 5005. +* `--docker-target-name=
`: The Docker Volume on which to store curriculum, executable and model files. See [Using Docker](Using-Docker.md). -* `--no-graphics` - Specify this option to run the Unity executable in +* `--no-graphics`: Specify this option to run the Unity executable in `-batchmode` and doesn't initialize the graphics driver. Use this only if your training doesn't involve visual observations (reading from Pixels). See [here](https://docs.unity3d.com/Manual/CommandLineArguments.html) for more details. -* `--debug` - Specify this option to run ML-Agents in debug mode and log Trainer - Metrics to a CSV stored in the `summaries` directory. The metrics stored are: - brain name, time to update policy, time since start of training, time for last experience collection, number of experiences used for training, mean return. This - option is not available currently for Imitation Learning. +* `--debug`: Specify this option to enable debug-level logging for some parts of the code. -### Training config file +### Training Config File The training config files `config/trainer_config.yaml`, `config/online_bc_config.yaml` and `config/offline_bc_config.yaml` specifies the @@ -170,10 +170,7 @@ environments are included in the provided config file. | brain\_to\_imitate | For online imitation learning, the name of the GameObject containing the Brain component to imitate. | (online)BC | | demo_path | For offline imitation learning, the file path of the recorded demonstration file | (offline)BC | | buffer_size | The number of experiences to collect before updating the policy model. | PPO | -| curiosity\_enc\_size | The size of the encoding to use in the forward and inverse models in the Curiosity module. | PPO | -| curiosity_strength | Magnitude of intrinsic reward generated by Intrinsic Curiosity Module. | PPO | | epsilon | Influences how rapidly the policy can evolve during training. | PPO | -| gamma | The reward discount rate for the Generalized Advantage Estimator (GAE). | PPO | | hidden_units | The number of units in the hidden layers of the neural network. | PPO, BC | | lambd | The regularization parameter. | PPO | | learning_rate | The initial learning rate for gradient descent. | PPO, BC | @@ -182,13 +179,15 @@ environments are included in the provided config file. | normalize | Whether to automatically normalize observations. | PPO | | num_epoch | The number of passes to make through the experience buffer when performing gradient descent optimization. | PPO | | num_layers | The number of hidden layers in the neural network. | PPO, BC | +| pretraining | Use demonstrations to bootstrap the policy neural network. See [Pretraining Using Demonstrations](Training-PPO.md#optional-pretraining-using-demonstrations). | PPO | +| reward_signals | The reward signals used to train the policy. Enable Curiosity and GAIL here. See [Reward Signals](Reward-Signals.md) for configuration options. | PPO | | sequence_length | Defines how long the sequences of experiences must be while training. Only used for training with a recurrent neural network. See [Using Recurrent Neural Networks](Feature-Memory.md). | PPO, BC | | summary_freq | How often, in steps, to save training statistics. This determines the number of data points shown by TensorBoard. | PPO, BC | | time_horizon | How many steps of experience to collect per-agent before adding it to the experience buffer. | PPO, (online)BC | -| trainer | The type of training to perform: "ppo" or "imitation". 
| PPO, BC | -| use_curiosity | Train using an additional intrinsic reward signal generated from Intrinsic Curiosity Module. | PPO | +| trainer | The type of training to perform: "ppo", "offline_bc" or "online_bc". | PPO, BC | | use_recurrent | Train using a recurrent neural network. See [Using Recurrent Neural Networks](Feature-Memory.md). | PPO, BC | + \*PPO = Proximal Policy Optimization, BC = Behavioral Cloning (Imitation) For specific advice on setting hyperparameters based on the type of training you @@ -198,9 +197,25 @@ are conducting, see: * [Using Recurrent Neural Networks](Feature-Memory.md) * [Training with Curriculum Learning](Training-Curriculum-Learning.md) * [Training with Imitation Learning](Training-Imitation-Learning.md) +* [Training Generalized Reinforcement Learning Agents](Training-Generalized-Reinforcement-Learning-Agents.md) You can also compare the [example environments](Learning-Environment-Examples.md) to the corresponding sections of the `config/trainer_config.yaml` file for each example to see how the hyperparameters and other configuration variables have been changed from the defaults. + +### Debugging and Profiling +If you enable the `--debug` flag in the command line, the trainer metrics are logged to a CSV file +stored in the `summaries` directory. The metrics stored are: + * brain name + * time to update policy + * time since start of training + * time for last experience collection + * number of experiences used for training + * mean return + +This option is not available currently for Behavioral Cloning. + +Additionally, we have included basic [Profiling in Python](Profiling-Python.md) as part of the toolkit. +This information is also saved in the `summaries` directory. diff --git a/docs/Training-PPO.md b/docs/Training-PPO.md index 2735207974..a2bd53844b 100644 --- a/docs/Training-PPO.md +++ b/docs/Training-PPO.md @@ -7,6 +7,10 @@ observations to the best action an agent can take in a given state. The ML-Agents PPO algorithm is implemented in TensorFlow and runs in a separate Python process (communicating with the running Unity application over a socket). +To train an agent, you will need to provide the agent one or more reward signals which +the agent should attempt to maximize. See [Reward Signals](Reward-Signals.md) +for the available reward signals and the corresponding hyperparameters. + See [Training ML-Agents](Training-ML-Agents.md) for instructions on running the training program, `learn.py`. @@ -18,11 +22,10 @@ If you are using curriculum training to pace the difficulty of the learning task presented to an agent, see [Training with Curriculum Learning](Training-Curriculum-Learning.md). -For information about imitation learning, which uses a different training -algorithm, see +For information about imitation learning from demonstrations, see [Training with Imitation Learning](Training-Imitation-Learning.md). -## Best Practices when training with PPO +## Best Practices Training with PPO Successfully training a Reinforcement Learning model often involves tuning the training hyperparameters. This guide contains some best practices for tuning the @@ -31,15 +34,19 @@ of performance you would like. ## Hyperparameters -### Gamma +### Reward Signals -`gamma` corresponds to the discount factor for future rewards. This can be -thought of as how far into the future the agent should care about possible -rewards. In situations when the agent should be acting in the present in order -to prepare for rewards in the distant future, this value should be large. 
In
-cases when rewards are more immediate, it can be smaller.
+In reinforcement learning, the goal is to learn a Policy that maximizes reward.
+At a base level, the reward is given by the environment. However, we could imagine
+rewarding the agent for various different behaviors. For instance, we could reward
+the agent for exploring new states, rather than just when an explicit reward is given.
+Furthermore, we could mix reward signals to help the learning process.
 
-Typical Range: `0.8` - `0.995`
+Using `reward_signals` allows you to define [reward signals](Reward-Signals.md).
+The ML-Agents toolkit provides three reward signals by default: the Extrinsic (environment)
+reward signal; the Curiosity reward signal, which can be used to encourage exploration in
+sparse extrinsic reward environments; and the GAIL reward signal. Please see [Reward Signals](Reward-Signals.md)
+for additional details.
 
 ### Lambda
 
@@ -160,6 +167,19 @@ variables, this should be larger.
 
 Typical Range: `32` - `512`
 
+### (Optional) Visual Encoder Type
+
+`vis_encode_type` corresponds to the encoder type for encoding visual observations.
+Valid options include:
+* `simple` (default): a simple encoder which consists of two convolutional layers
+* `nature_cnn`: CNN implementation proposed by Mnih et al. (https://www.nature.com/articles/nature14236),
+consisting of three convolutional layers
+* `resnet`: IMPALA Resnet implementation (https://arxiv.org/abs/1802.01561),
+consisting of three stacked layers, each with two residual blocks, making a
+much larger network than the other two.
+
+Options: `simple`, `nature_cnn`, `resnet`
+
 ## (Optional) Recurrent Neural Network Hyperparameters
 
 The below hyperparameters are only used when `use_recurrent` is set to true.
 
@@ -184,29 +204,72 @@ the agent will need to remember in order to successfully complete the task.
 
 Typical Range: `64` - `512`
 
-## (Optional) Intrinsic Curiosity Module Hyperparameters
+## (Optional) Pretraining Using Demonstrations
+
+In some cases, you might want to bootstrap the agent's policy using behavior recorded
+from a player. This can help guide the agent towards the reward. Pretraining adds
+training operations that mimic a demonstration rather than attempting to maximize reward.
+It is essentially equivalent to running [behavioral cloning](Training-Behavioral-Cloning.md)
+in-line with PPO.
+
+To use pretraining, add a `pretraining` section to the trainer_config. For instance:
+
+```
+    pretraining:
+        demo_path: ./demos/ExpertPyramid.demo
+        strength: 0.5
+        steps: 10000
+```
+
+Below are the available hyperparameters for pretraining.
+
+### Strength
+
+`strength` corresponds to the learning rate of the imitation relative to the learning
+rate of PPO, and roughly corresponds to how strongly we allow the behavioral cloning
+to influence the policy.
+
+Typical Range: `0.1` - `0.5`
+
+### Demo Path
 
-The below hyperparameters are only used when `use_curiosity` is set to true.
+`demo_path` is the path to your `.demo` file or directory of `.demo` files.
+See the [imitation learning guide](Training-Imitation-Learning.md) for more on `.demo` files.
 
-### Curiosity Encoding Size
+### Steps
 
-`curiosity_enc_size` corresponds to the size of the hidden layer used to encode
-the observations within the intrinsic curiosity module. This value should be
-small enough to encourage the curiosity module to compress the original
-observation, but also not too small to prevent it from learning the dynamics of
-the environment.
+During pretraining, it is often desirable to stop using demonstrations after the agent has +"seen" rewards, and allow it to optimize past the available demonstrations and/or generalize +outside of the provided demonstrations. `steps` corresponds to the training steps over which +pretraining is active. The learning rate of the pretrainer will anneal over the steps. Set +the steps to 0 for constant imitation over the entire training run. + +### (Optional) Batch Size + +`batch_size` is the number of demonstration experiences used for one iteration of a gradient +descent update. If not specified, it will default to the `batch_size` defined for PPO. + +Typical Range (Continuous): `512` - `5120` + +Typical Range (Discrete): `32` - `512` + +### (Optional) Number of Epochs + +`num_epoch` is the number of passes through the experience buffer during +gradient descent. If not specified, it will default to the number of epochs set for PPO. + +Typical Range: `3` - `10` -Typical Range: `64` - `256` +### (Optional) Samples Per Update -### Curiosity Strength +`samples_per_update` is the maximum number of samples +to use during each imitation update. You may want to lower this if your demonstration +dataset is very large to avoid overfitting the policy on demonstrations. Set to 0 +to train over all of the demonstrations at each update step. -`curiosity_strength` corresponds to the magnitude of the intrinsic reward -generated by the intrinsic curiosity module. This should be scaled in order to -ensure it is large enough to not be overwhelmed by extrinsic reward signals in -the environment. Likewise it should not be too large to overwhelm the extrinsic -reward signal. +Default Value: `0` (all) -Typical Range: `0.1` - `0.001` +Typical Range: Approximately equal to PPO's `buffer_size` ## Training Statistics diff --git a/docs/Training-on-Amazon-Web-Service.md b/docs/Training-on-Amazon-Web-Service.md index 26e61d303b..b89e8a1e65 100644 --- a/docs/Training-on-Amazon-Web-Service.md +++ b/docs/Training-on-Amazon-Web-Service.md @@ -14,7 +14,7 @@ headless mode, you need to enable X Server. After launching your EC2 instance using the ami and ssh into it, run the following commands to enable it: -```console +```sh # Start the X Server, press Enter to come to the command line $ sudo /usr/bin/X :0 & @@ -88,7 +88,7 @@ linux executables which use visual observations. #### Install and setup Xorg: - ```console + ```sh # Install Xorg $ sudo apt-get update $ sudo apt-get install -y xserver-xorg mesa-utils @@ -107,7 +107,7 @@ linux executables which use visual observations. #### Update and setup Nvidia driver: - ```console + ```sh # Download and install the latest Nvidia driver for ubuntu # Please refer to http://download.nvidia.com/XFree86/Linux-#x86_64/latest.txt $ wget http://download.nvidia.com/XFree86/Linux-x86_64/390.87/NVIDIA-Linux-x86_64-390.87.run @@ -122,13 +122,13 @@ linux executables which use visual observations. #### Restart the EC2 instance: - ```console + ```sh sudo reboot now ``` #### Make sure there are no Xorg processes running: - ```console + ```sh # Kill any possible running Xorg processes # Note that you might have to run this command multiple times depending on # how Xorg is configured. @@ -175,7 +175,7 @@ linux executables which use visual observations. #### Ensure the Xorg is correctly configured: - ```console + ```sh # For more information on glxgears, see ftp://www.x.org/pub/X11R6.8.1/doc/glxgears.1.html. 
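   # (glxgears is provided by the mesa-utils package installed during the Xorg setup above.)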
  $ glxgears
  # If Xorg is configured correctly, you should see the following message
@@ -201,12 +201,12 @@ Headless Mode, you have to setup the X Server to enable training.)
 6. Upload the executable to your EC2 instance within `ml-agents` folder.
 7. Change the permissions of the executable.
 
-    ```console
+    ```sh
     chmod +x <env-name>.x86_64
     ```
 
 8. (Without Headless Mode) Start X Server and use it for display:
 
-    ```console
+    ```sh
     # Start the X Server, press Enter to come back to the command line
     $ sudo /usr/bin/X :0 &
 
@@ -240,7 +240,7 @@ Headless Mode, you have to setup the X Server to enable training.)
 If you've built your Linux executable, but forget to copy over the
 corresponding _Data folder, you will see error message like the following:
 
-```console
+```sh
 Set current directory to /home/ubuntu/ml-agents/ml-agents
 Found path: /home/ubuntu/ml-agents/ml-agents/3dball_linux.x86_64
 no boot config - using default values
@@ -275,13 +275,13 @@ It would be also really helpful to check your /home/ubuntu/.config/unity3d/ \
+docker run -it --name <container-name> \
           --mount type=bind,source="$(pwd)"/unity-volume,target=/unity-volume \
           -p 5005:5005 \
+          -p 6006:6006 \
          <image-name>:latest \
          --docker-target-name=unity-volume \
          <trainer-config-file> \
@@ -118,13 +119,14 @@ Notes on argument values:
 
 To train with a `3DBall` environment executable, the command would be:
 
 ```sh
-docker run --name 3DBallContainer.first.trial \
+docker run -it --name 3DBallContainer.first.trial \
            --mount type=bind,source="$(pwd)"/unity-volume,target=/unity-volume \
            -p 5005:5005 \
+           -p 6006:6006 \
            balance.ball.v0.1:latest 3DBall \
            --docker-target-name=unity-volume \
            trainer_config.yaml \
-           --env=3DBall
+           --env=3DBall \
            --train \
            --run-id=3dball_first_trial
 ```
 
@@ -134,6 +136,21 @@ For more detail on Docker mounts, check out
 
 **NOTE** If you are training using docker for environments that use visual observations, you may need to increase the default memory that Docker allocates for the container. For example, see [here](https://docs.docker.com/docker-for-mac/#advanced) for instructions for Docker for Mac.
 
+### Running Tensorboard
+
+You can run Tensorboard to monitor your training instance on http://localhost:6006:
+
+```sh
+docker exec -it <container-name> tensorboard --logdir=/unity-volume/summaries --host=0.0.0.0
+```
+
+With our previous 3DBall example, this command would look like this:
+```sh
+docker exec -it 3DBallContainer.first.trial tensorboard --logdir=/unity-volume/summaries --host=0.0.0.0
+```
+
+For more details on Tensorboard, check out the documentation about [Using Tensorboard](Using-Tensorboard.md).
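+
+If you also want to follow the trainer's console output (not just the Tensorboard summaries),
+the standard `docker logs` command can be used with the container name passed to `docker run`;
+for example, with the container from the 3DBall example above:
+
+```sh
+docker logs -f 3DBallContainer.first.trial
+```
+
+Press `Ctrl+C` to stop following the logs; this does not stop the container itself.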
+ ### Stopping Container and Saving State If you are satisfied with the training progress, you can stop the Docker diff --git a/docs/images/3dball_big.png b/docs/images/3dball_big.png new file mode 100644 index 0000000000..9fdfa0241e Binary files /dev/null and b/docs/images/3dball_big.png differ diff --git a/docs/images/3dball_small.png b/docs/images/3dball_small.png new file mode 100644 index 0000000000..449d7d32b5 Binary files /dev/null and b/docs/images/3dball_small.png differ diff --git a/docs/images/mlagents-ImitationAndRL.png b/docs/images/mlagents-ImitationAndRL.png new file mode 100644 index 0000000000..ffa61d1b11 Binary files /dev/null and b/docs/images/mlagents-ImitationAndRL.png differ diff --git a/docs/localized/KR/README.md b/docs/localized/KR/README.md new file mode 100644 index 0000000000..5a3709847f --- /dev/null +++ b/docs/localized/KR/README.md @@ -0,0 +1,86 @@ + + + + +# Unity ML-Agents Toolkit (Beta) v0.9 +[![docs badge](https://img.shields.io/badge/docs-reference-blue.svg)](docs/Readme.md) +[![license badge](https://img.shields.io/badge/license-Apache--2.0-green.svg)](LICENSE) + +**Unity Machine Learning Agents Toolkit** (ML-Agents) 은 지능형 에이전트를 학습시키기 위한 +환경을 제공하여 게임 또는 시뮬레이션을 만들 수 있게 해주는 오픈소스 유니티 플러그인 입니다. 사용하기 쉬운 +파이썬 API를 통해 강화학습, 모방학습, 신경진화 또는 다른 기계학습 방법론을 사용하여 에이전트들을 학습시킬 수 있습니다. +우리는 또한 게임 개발자와 개발에 대해 취미를 가지신 분들이 2D, 3D 그리고 VR/AR 게임들에 사용할 지능형 에이전트를 +쉽게 훈련시킬 수 있도록하는 최신 알고리즘 구현체를 ([텐서플로우]([https://www.tensorflow.org/](https://www.tensorflow.org/)) 기반)을 제공합니다. 학습된 에이전트들은 +NPC의 행동 제어(다중 에이전트, 적대적 에이전트 등), 게임 빌드 테스트 자동화, 그리고 출시 전 게임 설계 검증 등을 포함한 다양한 목적을 위해 사용될 수 있습니다. +ML-Agents toolkit은 유니티의 풍부한 환경에서 인공지능 에이전트 개발을 위한 중심 플랫폼을 제공함으로써 더욱 광범위한 연구와 게임 개발이 진행되도록 하며 이에 따라 게임 개발자들과 AI 연구원들 모두에게 도움을 줍니다. + +## 특징 + +* 파이썬을 통한 유니티 환경 제어 +* 10가지 이상의 유니티 환경 샘플 +* 여러 환경 구성 및 학습 시나리오 제공 +* 심층 강화 학습을 사용하여 기억력이 향상된 에이전트 학습 +* 쉽게 정의 가능한 커리큘럼 학습 시나리오 +* 지도 학습을 위한 에이전트 행동 브로드캐스팅 +* 모방 학습 지원 기본 제공 +* 온 디맨드 의사 결정을 통한 유연한 에이전트 제어 +* 환경 속 네트워크 출력의 시각화 +* [도커(Docker)]([https://www.docker.com/](https://www.docker.com/))를 통한 설정 단순화 +* [gym]([https://gym.openai.com/](https://gym.openai.com/))과 같은 학습 환경 +* 유니티 인터페이스 엔진 활용 +* 유니티 환경 인스턴스를 동시에 사용하는 학습 + +## 문서화 + +* 설치와 사용법 외에 더 많은 정보는 [설명서 홈](docs/Readme.md)을 참고해주십시오. +* 만약 유니티 AI 플랫폼에 관한 토론에 관심있는 연구원이라면 유니티와 ML-Agents Toolkit에 관한 [논문](https://arxiv.org/abs/1809.02627)을 참고해 주십시오. 또한 이 논문을 인용하는 것에 관한 사항은 아래의 인용 부분을 참조하십시오. +* 만약 이전 버전의 ML-Agents toolkit을 사용하고 있다면 [이전 버전 마이그레이션 가이드](docs/Migrating.md)를 확인해주십시오. + +## 추가 리소스 + +블로그에 ML-Agents와 관련된 시리즈의 게시물을 게시하였습니다(영어). 
+ +* 강화 학습 개념 개요 ([multi-armed bandit](https://blogs.unity3d.com/kr/2017/06/26/unity-ai-themed-blog-entries/) 과 [Q-learning](https://blogs.unity3d.com/kr/2017/08/22/unity-ai-reinforcement-learning-with-q-learning/)) +* [실제 게임에서 Machine Learning 에이전트 사용하기: 초보자 가이드](https://blogs.unity3d.com/kr/2017/12/11/using-machine-learning-agents-in-a-real-game-a-beginners-guide/) +* [첫번째 ML-Agents 챌린지](https://connect.unity.com/challenges/ml-agents-1)의 수상자 관련 [포스트](https://blogs.unity3d.com/kr/2018/02/28/introducing-the-winners-of-the-first-ml-agents-challenge/) +* 안전한 도시 설계를 위한 유니티 사용 방법 개요 관련 [포스트](https://blogs.unity3d.com/kr/2018/01/23/designing-safer-cities-through-simulations/) + +유니티에서 제공하는 문서 뿐만 아니라 관련된 기사들이 있습니다: + +* [유니티 AI - 유니티의 3D 인공지능](https://www.youtube.com/watch?v=bqsfkGbBU6k) +* [머신러닝을 배우는 게임 개발자](https://mikecann.co.uk/machine-learning/a-game-developer-learns-machine-learning-intent/) +* [인텔 아키텍쳐 전용 Unity Technologies ML-Agents 둘러보기](https://software.intel.com/en-us/articles/explore-unity-technologies-ml-agents-exclusively-on-intel-architecture) + +## 커뮤니티 그리고 피드백 + +ML-Agents toolkit은 오픈소스 프로젝트이며 컨트리뷰션을 환영합니다. 만약 컨트리뷰션을 원하시는 경우 +[컨트리뷰션 가이드라인](CONTRIBUTING.md)과 [행동 규칙](CODE_OF_CONDUCT.md)을 검토해주십시오. + +만약 ML-Agents toolkit을 사용하며 문제가 생긴다면, 가능한 많은 세부 사항을 포함하여 [이슈 제출](https://github.com/Unity-Technologies/ml-agents/issues)을 해주십시오. + +여러분의 의견은 저희에게 매우 중요합니다. Unity ML-Agents Toolkit에 관련된 여러분의 의견을 통해서 저희는 계속해서 +발전하고 성장할 수 있습니다. 단 몇 분만 사용하여 [저희에게 알려주세요](https://github.com/Unity-Technologies/ml-agents/issues/1454). + + +다른 의견과 피드백은 ML-Agents 팀과 직접 연락부탁드립니다. (ml-agents@unity3d.com) + + +## 라이센스 + +[Apache License 2.0](LICENSE) + +## 인용 + +만약 Unity 또는 the ML-Agents Toolkit을 사용하여 연구를 수행할 경우 다음 논문을 참고 자료로 인용하여 주시길 바랍니다: + +Juliani, A., Berges, V., Vckay, E., Gao, Y., Henry, H., Mattar, M., Lange, D. (2018). Unity: A General Platform for Intelligent Agents. *arXiv preprint arXiv:1809.02627.* https://github.com/Unity-Technologies/ml-agents. + + + +## 한글 번역 + +유니티 ML-Agents 관련 문서의 한글 번역은 [장현준(Hyeonjun Jang)][https://github.com/JangHyeonJun], [민규식 (Kyushik Min)]([https://github.com/Kyushik](https://github.com/Kyushik))에 의해 진행되었습니다. 내용상 오류나 오탈자가 있는 경우 각 문서의 번역을 진행한 사람의 이메일을 통해 연락주시면 감사드리겠습니다. + +장현준: totok682@naver.com + +민규식: kyushikmin@gmail.com \ No newline at end of file diff --git a/docs/localized/KR/docs/Installation-Windows.md b/docs/localized/KR/docs/Installation-Windows.md new file mode 100644 index 0000000000..fde671f589 --- /dev/null +++ b/docs/localized/KR/docs/Installation-Windows.md @@ -0,0 +1,304 @@ +# Windows ڸ ML-Agents Toolkit ġ + +ML-Agents toolkit Windows 10 մϴ. ٸ Windows ε ML-Agents toolkit + ʾҽϴ. , ML-Agents toolkit Windows VM(Bootcamp Ǵ ó +ȯ ) ʾҽϴ . + +ML-Agents toolkit ϱ , Ʒ Ȱ ó Python 䱸Ǵ Python Ű ġؾ մϴ. + ̵ GPU н(ڸ ) ٷϴ. +, ML-Agents toolkit GPU н ʿ Ǵ Ư ׿ ʿ ֽϴ. + +## ܰ 1: Anaconda Python ġ + +Windows Anaconda [ٿε](https://www.anaconda.com/download/#windows)ϰ ġϽʽÿ. +Anaconda ν, ٸ Python и ȯ濡 ֽϴ. +Python 2 ̻ ʱ Python 3.5 Ǵ 3.6 ʿմϴ. ̵忡 츮 +Python 3.6 Anaconda 5.1 Դϴ. +([64-bit](https://repo.continuum.io/archive/Anaconda3-5.1.0-Windows-x86_64.exe) +Ǵ [32-bit](https://repo.continuum.io/archive/Anaconda3-5.1.0-Windows-x86.exe) +ũ). + +

+ Anaconda Install +

+ +Ʈ _advanced installation options_ ϴ õ Ȳ ɼ Ͻʽÿ. + +

+ Anaconda Install +

+ +ġ Ŀ ݵ __Anaconda Navigator__ Ϸؾ մϴ. +Windows Ž â, _anaconda navigator_ ŸϿ Anaconda Navigator ֽϴ. + +ȯ Ǿ ʴٸ `conda` ɾ Ÿ +"conda is not recognized as internal or external command" Դϴ. +̸ ذϱ Ȯ ȯ ʿմϴ. + +Ž â `ȯ ` Ÿ Ͽ ( Ű ų Ʒ ư ֽϴ). + __ý ȯ __ ɼ ҷɴϴ. + +

+ edit env variables +

+ + ɼǿ __ȯ __ ư Ŭϰ. Ʒ __ý __ "Path" Ŭϰ __ __ ŬϿ path ߰Ͻʽÿ. + +```console +%UserProfile%\Anaconda3\Scripts +%UserProfile%\Anaconda3\Scripts\conda.exe +%UserProfile%\Anaconda3 +%UserProfile%\Anaconda3\python.exe +``` + +## ܰ 2: ο Conda ȯ Ȱȭ + +ML-Agents toolkit Բ ο [Conda ȯ](https://conda.io/docs/) Դϴ. + ۾ ġ Ű ȯ濡 ѵȴٴ ǹմϴ. ̴ ٸ ȯ̳ ٸ ̽ ġ + ġ ʽϴ. ML-Agents ׻ Conda ȯ Ȱȭ Ѿ մϴ. + +ο Conda ȯ , ο Anaconda Ʈ(Ž â _Anaconda Prompt_ Ŭ) +ɾ Ÿ Ͻʽÿ: + +```sh +conda create -n ml-agents python=3.6 +``` + + Ű ġϱ ޼ `y` Ÿϰ ͸ ʽÿ _(ͳ Ǿִ ȮϽʽÿ)_. + 䱸Ǵ Ű ݵ ġؾ մϴ. ο Conda ȯ濡 Python 3.6 Ǹ ml-agents ȣ˴ϴ. + +

+ Anaconda Install +

+ +ռ ȯ ̿ϱ ݵ Ȱȭ ؾմϴ. _(Ŀ ɾ ȯ ֽϴ)_. + Anaconda Ʈ ɾ Ÿ Ͻʽÿ: + +```sh +activate ml-agents +``` + +Ȱȭ Ŀ `(ml-agents)` ڰ տ Ÿ ֽϴ. + +, `tensorflow` ġմϴ. ̽ Ű ġϱ ϴ `pip` Ű ý۸ Ͽ ġ ֽϴ. +ֽ TensorFlow ۵ Ƿ, ġ 1.7.1 Ȯؾ մϴ. Anaconda Ʈ â + ɾ Ÿ Ͻʽÿ._(ͳ Ǿ ִ ȮϿ ֽʽÿ)_: + +```sh +pip install tensorflow==1.7.1 +``` + +## ܰ 3: ʼ ̽ Ű ġ + +ML-Agents toolkit ̽ Ű Դϴ. `pip` Ͽ ̽ Ӽ ġϽʽÿ. + +ML-Agents Toolkit Ұ ǻͿ Ǿ ʾҴٸ Ͻʽÿ. Git ([ٿε](https://git-scm.com/download/win))ϰ +Ų ɾ Anaconda Ʈâ ԷϿ ֽϴ. _( Ʈ â ִٸ `activate ml-agents` ŸϿ +ml-agents Conda ȯ Ȱȭ Ǿִ ȮϽʽÿ)_: + +```sh +git clone https://github.com/Unity-Technologies/ml-agents.git +``` + + Git ϰ ʴٸ [ũ](https://github.com/Unity-Technologies/ml-agents/archive/master.zip) ٿε ֽϴ. + +`UnitySDK` 丮 Ʈ ߰ Ƽ ּ ԵǾ ֽϴ. ϴµ Ǵ [ ȯ](Learning-Environment-Examples.md) ֽϴ. + +`ml-agents` 丮 Ƽ ȯ ԰ ϴ ȭн Ʈ̳ ̽ Ű ԵǾ ֽϴ. + +`ml-agents-envs` 丮 `ml-agents` Ű ӵǴ Ƽ ̽ ̽ API ԵǾ ֽϴ. + +`gym-unity` 丮 OpenAI Gym ̽ Ű ԵǾ ֽϴ. + +`mlagents-learn` Ʈ̳ ȯ 丮 ȿ ʿϹǷ, ٿε 丮 ġ Ͻʽÿ. +ͳ Ǿ Ȯϰ Anaconda Ʈ ɾ Ÿ Ͻʽÿt: + +```console +pip install mlagents +``` + +ML-Agents toolkit ʿ ̽ Ű ġ Ϸ Դϴ. + +Windows pip Ͽ Ư ̽ Ű ġ Ű ij д ֽϴ. + ذ ֽϴ: + +```console +pip install mlagents --no-cache-dir +``` + +`--no-cache-dir` pip ij Ȱȭ Ѵٴ Դϴ. + + +### ġ + + `ml-agents` Ǵ `ml-agents-envs` ϰ ʹٸ, PyPi ƴ ҷ Ű ġؾ մϴ. +̸ , `ml-agents` `ml-agents-envs` ġؾ մϴ. + + `C:\Downloads` ġ ֽϴ. ϰų ٿε +Anaconda Ʈ ml-agents 丮 ml-agents 丮 Ͻʽÿ: + +```console +cd C:\Downloads\ml-agents +``` + + 丮 Ͻʽÿ: + +```console +cd ml-agents-envs +pip install -e . +cd .. +cd ml-agents +pip install -e . +``` + +`-e` ÷׸ Ͽ pip ϸ ̽ ְ `mlagents-learn` ݿ˴ϴ. +`mlagents` Ű `mlagents_envs` ̰, ٸ ġϸ PyPi `mlagents_envs` ġ ֱ + Ű ġϴ ߿մϴ. + +## (ɼ) Step 4: ML-Agents Toolkit GPU н + +ML-Agents toolkit GPU ʿ н ߿ PPO ˰ ӵ ũ մϴ( Ŀ GPU ֽϴ). + ̵ GPU н ϰ ڸ ̵ Դϴ. GPU CUDA ȣȯǴ Ȯؾ մϴ. +[](https://developer.nvidia.com/cuda-gpus) Nvidia Ȯ ֽʽÿ. + + ML-Agents toolkit CUDA 9.0 cuDNN 7.0.5 ˴ϴ. + +### Nvidia CUDA toolkit ġ + +Nvidia ī̺꿡 CUDA Ŷ(toolkit) 9.0 [ٿε](https://developer.nvidia.com/cuda-toolkit-archive)ϰ ġϽʽÿ. +ML-Agents toolkit Ű CUDA Ŷ GPU ̺귯, +-ȭ , C/C++(־ Ʃ 2017) Ϸ, Ÿ ̺귯 մϴ. + ̵忡 [9.0.176](https://developer.nvidia.com/compute/cuda/9.0/Prod/network_installers/cuda_9.0.176_win10_network-exe)) մϴ. + +ġϱ , __ Ƽ Ǵ ־ Ʃ ߴ__ ȮϿ ֽʽÿ. + +ν緯 ϰ Express ɼ Ͻʽÿ. CUDA Ŷ ġ 丮 ֽʽÿ. ̵忡, +`C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v9.0` ο ġմϴ. + +### Nvidia cuDNN ̺귯 ġ + +Nvidia cuDNN ̺귯 [ٿε](https://developer.nvidia.com/cudnn)ϰ ġϽʽÿ. +cuDNN Ű ⺻ Ǵ GPU ̺귯. ٿε Nvidia Developer Program ؾ Դϴ(). + +

+ cuDNN membership required +

+ +ϰ cuDNN [ٿε ](https://developer.nvidia.com/cudnn) ưʽÿ. +ª 翡 ؾ ֽϴ. When you get to the list +cuDNN Ʈ __ܰ 1 ġ CUDA Ŷ ´ ٿεϰ ִ ȮϽʽÿ.__ ̵忡, +CUDA Ŷ 9.0 7.0.5 մϴ +([ٿε ũ](https://developer.nvidia.com/compute/machine-learning/cudnn/secure/v7.0.5/prod/9.0_20171129/cudnn-9.0-windows10-x64-v7)). + +cuDNN ٿε Ŀ, CUDA Ŷ 丮ȿ ( )ؾ մϴ. +cuDNN zip ȿ `bin`, `include`, ׸ `lib` ֽϴ. + +

+ cuDNN zip files +

+ + CUDA Ŷ 丮ȿ Ͻʽÿ. +CUDA Ŷ 丮 `C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v9.0` ġ ֽϴ. + +

+ cuda toolkit directory +

+ +### ȯ + +1 ȯ 2 ߰ؾ մϴ. + +ȯ ϱ , Ž â `ȯ ` Ÿ Ͽ ( Ű ų Ʒ ư ֽϴ). + __ý ȯ __ ɼ ҷɴϴ. + +

+ edit env variables +

+ + ɼǿ __ȯ __ ư Ŭϰ ý __ __ ŬϽʽÿ _( ƴ Ʒ __ý __ ȮϽʽÿ). + +

+ new system variable +

+ +__ ̸__ `CUDA_HOME` ϰ CUDA Ŷ 丮 θ Է ֽʽÿ. + ̵忡 丮 δ `C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v9.0` Դϴ. Ȯ ư ֽʽÿ. + +

+ system variable names and values +

+ +2 __ȯ __ â ׸ Ʒ ι° ڽ __ý __ȿ, +`Path` ã Ŭϰ ____ư ʽÿ. Ʈ 2 丮 ߰ Դϴ. 丮 ϴ: + +```console +C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v9.0\lib\x64 +C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v9.0\extras\CUPTI\libx64 +``` + + 丮 ġ ġ 丮 ȮϽʽÿ. _ҹڿ Ͻʽÿ_. + +

+ Path variables +

+ +### TensorFlow GPU ġ + +, `pip` Ͽ 1.7.1. `tensorflow-gpu` ġϽʽÿ . ml-agents Conda ȯ Ȱȭ Ų Anaconda Ʈ +CPU TensorFlow ϰ GPU TensorFlow ġϱ ɾ Ÿ Ͻʽÿ _(ͳ Ǿ ִ ȮϽʽÿ)_: + +```sh +pip uninstall tensorflow +pip install tensorflow-gpu==1.7.1 +``` + +, ġǾ ְ, Tensorflow GPU νϰ ִ ׽Ʈؾմϴ. + Anaconda Ʈ Python ȣϿ ϴ: + +```sh +python +``` + +׸ ɾ Ÿ Ͻʽÿ: + +```python +import tensorflow as tf + +sess = tf.Session(config=tf.ConfigProto(log_device_placement=True)) +``` + + Դϴ: + +```console +Found device 0 with properties ... +``` + +## Acknowledgments + + ̵带 ʾ ۼ +[Jason Weimann](https://unity3d.college/2017/10/25/machine-learning-in-unity3d-setting-up-the-environment-tensorflow-for-agentml-on-windows-10/) + +[Nitish S. Mutha](http://blog.nitishmutha.com/tensorflow/2017/01/22/TensorFlow-with-gpu-for-windows.html) + 帳ϴ. + +## ѱ + +ش ѱ [ (Hyeonjun Jang)]([https://github.com/janghyeonjun](https://github.com/janghyeonjun)) Ǿϴ. Żڰ ִ totok682@naver.com ֽø 帮ڽϴ. diff --git a/docs/localized/KR/docs/Installation.md b/docs/localized/KR/docs/Installation.md new file mode 100644 index 0000000000..44b50d3f36 --- /dev/null +++ b/docs/localized/KR/docs/Installation.md @@ -0,0 +1,104 @@ +# 설치 + +ML-Agents를 설치하고 사용하기 위해 유니티를 설치해야 하고 이 Repository(저장소)를 +Clone(복제)하고 추가종속성을 가지는 Python(파이썬)을 설치해야합니다. 아래 Subsection(하위섹션)에서는 Docker(도커) 설정 외에도 +각 단계를 개괄적으로 설명합니다. + +## **Unity 2017.4** 또는 이후의 버전을 설치하십시오. + +[다운로드](https://store.unity.com/kr/download)하고 설치하십시오. 만약 저희의 도커 설정(차후에 소개할)을 사용하고 싶다면, +유니티를 설치할 때, Linux Build Support를 설정하십시오. + +

+ Linux Build Support +

+ +## Windows 사용자 +Windows에서 환경을 설정하기 위해, [세부 사항](Installation-Windows.md)에 설정 방법에 대해 작성하였습니다. +Mac과 Linux는 다음 가이드를 확인해주십시오. + +## Mac 또는 Unix 사용자 + +### ML-Agents Toolkit 저장소 복제 + +유니티 설치 후에 ML-Agents Toolkit 깃허브 저장소를 설치하고 싶을 것입니다. + +```sh +git clone https://github.com/Unity-Technologies/ml-agents.git +``` + +`UnitySDK` 하위 디렉토리에는 프로젝트에 추가할 유니티 애셋이 포함되어 있습니다. +또한 시작하는데 도움이 되는 많은 [예제 환경](Learning-Environment-Examples.md)들이 있습니다. + +`ml-agents` 하위 디렉토리에는 유니티 환경과 함게 사용하는 심층 강화학습 트레이너 파이썬 패키지가 포함되어 있습니다. + +`ml-agents-envs` 하위 디렉토리에는 `ml-agents` 패키지에 종속되는 유니티의 인터페이스를 위한 파이썬 API가 포함되어 있습니다. + +`gym-unity` 하위 디렉토리에는 OpenAI Gym의 인터페이스를 위한 패키지가 포함되어 있습니다. + +### 파이썬과 mlagents 패키지 설치 + +ML-Agents toolkit을 사용하기 위해 [setup.py file](../ml-agents/setup.py)에 나열된 종속성과 함께 파이썬 3.6이 필요합니다. +주요 종속성의 일부는 다음을 포함합니다: + +- [TensorFlow](Background-TensorFlow.md) (Requires a CPU w/ AVX support) +- [Jupyter](Background-Jupyter.md) + +Python 3.6이 만약 설치되어 있지 않다면, [다운로드](https://www.python.org/downloads/)하고 설치하십시오. + +만약 당신의 파이썬 환경이 `pip3`을 포함하지 않는다면, 다음 +[지시사항](https://packaging.python.org/guides/installing-using-linux-tools/#installing-pip-setuptools-wheel-with-linux-package-managers) +을 따라서 설치하십시오. + +종속성과 `mlagents` 파이썬 패키지를 설치하기 위해 다음 명령어를 실행하십시오: + +```sh +pip3 install mlagents +``` + +이 명령어를 통해 PyPi로 부터(복제된 저장소가 아닌) `ml-agents`가 설치될 것입니다. +만약 성공적으로 설치를 완료 했다면, `mlagents-learn --help` 명령어를 실행할 수 있을 것입니다. +명령어를 실행하면 유니티 로고와 `mlagents-learn`에서 사용할 수 있는 명령어 라인 매개변수들을 볼 수 있습니다. + +**주의:** + +- 현재 Python 3.7 또는 Python 3.5을 지원하지 않습니다. +- 만약 Anaconda를 사용하고 TensorFlow에 문제가 있다면, 다음 + [링크](https://www.tensorflow.org/install/pip)에서 Anaconda 환경에서 어떻게 TensorFlow를 설치하는지 확인하십시오. +### 개발을 위한 설치방법 + +만약 `ml-agents` 또는 `ml-agents-envs`를 수정하고 싶다면, PyPi가 아닌 복제된 저장소로 부터 패키지를 설치해야 합니다. +이를 위해, `ml-agents`와 `ml-agents-envs`를 각각 설치해야 합니다. 저장소의 루트 디렉토리에서 다음 명령어를 실행하십시오: + +```sh +cd ml-agents-envs +pip3 install -e ./ +cd .. +cd ml-agents +pip3 install -e ./ +``` + +`-e` 플래그를 사용하여 pip를 실행 하면 파이썬 파일을 직접 변경할 수 있고 `mlagents-learn`를 실행할 때 반영됩니다. +`mlagents` 패키지가 `mlagents_envs`에 의존적이고, 다른 순서로 설치하면 PyPi로 부터 `mlagents_envs`를 +설치할 수 있기 때문에 이 순서대로 패키지를 설치하는 것은 중요합니다. + +## 도커 기반 설치 + +만약 ML-Agents를 위해 도커를 사용하고 싶다면, [이 가이드](Using-Docker.md)를 따라하십시오. + +## 다음 단계 + +[기초 가이드](Basic-Guide.md) 페이지에는 유니티 내에서 ML-Agents toolkit의 설정 및 학습된 모델 실행, +환경 구축, 학습 방법에 대한 여러 짧은 튜토리얼을 포함하고 있습니다. + +## 도움말 + +ML-Agents와 관련된 문제가 발생하면 저희의 [FAQ](FAQ.md)와 [제약 사항](Limitations.md) 페이지를 참고해 주십시오. +만약 문제에 대한 아무것도 찾을 수 없다면 OS, Pythons 버전 및 정확한 오류 메세지와 함께 [이슈 제출](https://github.com/Unity-Technologies/ml-agents/issues)을 해주십시오. + + +## 한글 번역 + +해당 문서의 한글 번역은 [장현준 (Hyeonjun Jang)]([https://github.com/janghyeonjun](https://github.com/janghyeonjun))에 의해 진행되었습니다. 내용상 오류나 오탈자가 있는 경우 totok682@naver.com 으로 연락주시면 감사드리겠습니다. \ No newline at end of file diff --git a/docs/localized/KR/docs/Training-Imitation-Learning.md b/docs/localized/KR/docs/Training-Imitation-Learning.md new file mode 100644 index 0000000000..dc4a24b9d1 --- /dev/null +++ b/docs/localized/KR/docs/Training-Imitation-Learning.md @@ -0,0 +1,85 @@ +# 모방학습을 통한 에이전트 학습 + +에이전트가 시행착오를 통해 스스로 학습하는 것보다 단순히 에이전트가 수행하기를 원하는 행동을 우리가 알려주는 것이 더 직관적일 수 있습니다. 위생병 NPC를 학습하기 위한 [실행 예시](ML-Agents-Overview.md#running-example-training-npc-behaviors) 문서 내용에 대해 생각해보겠습니다. 보상 함수를 이용하여 위생병의 행동을 간접적으로 학습하는 것이 아니라 게임에서 얻어진 관측 (observation)과 게임 컨트롤러를 통해 얻어진 행동들 (actions)의 실제 데이터를 통해 위생병의 행동을 결정하도록 학습합니다. 모방학습 (Imitation Learning)은 실제 플레이를 통해 얻어진 관측과 행동 데이터 쌍을 이용하여 에이전트의 정책을 학습합니다. [비디오 링크](https://youtu.be/kpb8ZkMBFYs). 
+ +## 시범 (Demonstration) 데이터 기록 + +유니티 에디터를 이용하여 에이전트의 플레이를 기록하고 에셋으로 저장하는 것이 가능합니다. 이런 플레이 데이터에는 기록을 진행하는 동안의 관측, 행동 그리고 보상 정보가 포함됩니다. 이것들은 데이터를 통해 관리가 가능하며 Behavioral Cloning과 같은 오프라인 학습에 사용될 수 있습니다. (아래 내용 참고) + +에이전트의 플레이 데이터를 기록하기 위해서는 씬(Scene)에서 `Agent` 컴포넌트를 포함하고 있는 GameObject에 `Demonstration Recorder` 컴포넌트를 추가해주어야 합니다. 일단 추가되고나면 에이전트로부터 플레이 데이터를 기록할 수 있게 됩니다. + +

+ BC Teacher Helper +

+ +`Record`가 체크되는 경우 씬이 실행되면 데이터가 생성됩니다. 환경의 난이도에 따라 모방학습에 사용하기 위해 몇분에서 몇시간 정도 플레이 데이터를 수집해야합니다. 충분한 데이터가 기록되었으면 유니티 상에서 게임의 실행을 정지합니다. 그렇게 하면 `.demo` 파일이 `Assets/Demonstations` 폴더 내부에 생성됩니다. 이 파일에는 에이전트의 플레이 데이터가 저장되어 있습니다. 이 파일을 클릭하면 인스펙터 상에 데모 파일에 대한 정보를 아래와 같이 알려줍니다. + +

+ BC Teacher Helper +

+ + +## Behavioral Cloning을 통한 학습 + +모방학습을 위한 다양한 알고리즘이 존재하며 모방학습 알고리즘 중 가장 간단한 알고리즘이 Behavioral Cloning 입니다. 이 알고리즘은 마치 이미지 분류를 위한 지도학습 (Supervised Learning)이나 기타 고전적인 머신러닝 기법들처럼 전문가의 플레이로부터 수집된 데이터를 직접적으로 모방하도록 정책 (Policy)을 학습합니다. + + +### 오프라인 학습 + +오프라인 Behavioral Cloning에서 우리는 에이전트의 행동을 학습하기 위해 `Demonstration Recorder`를 통해 생성된 `demo` 파일을 데이터 셋으로 이용합니다. + +1. 전문가의 플레이 데이터를 모방하도록 학습하는 에이전트 선택 +2. `Demonstration Recorder`를 이용하여 전문가의 플레이를 기록합니다. (위의 내용 참고) + 앞으로 설명을 위해 이 기록된 파일의 이름을 `AgentRecording.demo`라고 하겠습니다. +3. 씬을 빌드하고 에이전트에게 러닝 브레인 (Learning Brain)을 할당합니다. 그리고 아카데미의 Broadcast Hub에서 이 브레인의 Control을 체크해줍니다. 브레인에 대한 정보가 필요하시면 다음의 [문서](Learning-Environment-Design-Brains.md)를 참고해주세요. +4. `config/offline_bc_config.yaml` 파일을 열어줍니다. +5. `demo_path` 파라미터를 스텝 2에서 기록한 데모 파일의 경로로 수정해줍니다. 이번 예시의 경우 설정된 경로는 다음과 같습니다: `./UnitySDK/Assets/Demonstrations/AgentRecording.demo` +6. `./config/offline_bc_config.yaml` 을 설정 파라미터로 하는 mlagent-learn을 실행하며 `--run-id` 와 `--train` 을 입력합니다. 빌드된 환경이 standalone으로 컴파일되었거나 에디터에서 train이 생략된 경우 `--env` 파라미터에 빌드된 환경의 경로를 기입해주세요. + ​ +7. (선택적) 텐서 보드를 활용하여 학습 성능을 확인해보세요!. + +위 방법은 데모 파일을 이용하여 에이전트가 직접적으로 전문가의 행동을 따라하도록 인공신경망을 학습하는 기법입니다. 환경은 학습이 진행되는 동안 에이전트의 성능을 평가하기 위해 실행되며 사용될 것입니다. + +### 온라인 학습 + +미리 생성된 데모 파일 없이 학습이 진행되는 동안 실시간으로 전문가의 플레이 데이터를 제공하며 에이전트를 학습하는 것도 간으합니다. 이 방법은 다음의 단계를 따라 진행됩니다: +without pre-recording a demonstration file. The steps to do this are as follows: + +1. 먼저 두개의 브레인들을 생성합니다. 하나는 "선생님"이 될 것이고 하나는 "학생"이 될 것입니다. 이번 예시에서는 두개의 브레인 에셋의 이름을 각각 "Teacher"와 "Student"로 설정할 것입니다. +2. "Teacher" 브레인은 반드시 **플레이어 브레인 (Player Brain)**이어야 합니다. +3. "Student" 브레인은 반드시 **러닝 브레인 (Learning Brain)**이어야 합니다. +4. "Teacher" 브레인과 "Student" 브레인의 파라미터는 에이전트에서 설정한대로 동일하게 설정되어야 합니다. +5. "Teacher" 브레인과 "Student" 브레인을 아카데미의 `Broadcast Hub`에 추가하고 "Student" 브레인의 `Control` 체크박스에 체크를 해줍니다. +6. 브레인들을 원하는 에이전트들에게 연결해줍니다. (하나의 에이전트는 선생님으로 설정되어야 하며 적어도 하나의 에이전트는 학생으로 설정되어야 합니다). +7. `config/online_bc_config.yaml` 파일에서, "Student" 브레인에 대한 항목을 추가해야합니다. `trainer` 파라미터를 `online_bc`로 설정하고 `brain_to_imitate` 파라미터를 선생님 에이전트의 브레인 이름인 "Teacher"로 설정합니다. 추가적으로 각 순간마다 얼마나 많은 학습을 진행할지 결정하는 `batches_per_epoch`를 설정합니다. 에이전트를 더 오랜 기간동안 학습하고 싶은 경우 `max_steps` 값을 증가시켜주세요. +8. `mlagents-learn config/online_bc_config.yaml + ​--train —slow`를 통해 학습과정을 실행하고 화면에 _"Start training by pressing the Play button in the Unity Editor"_ 라는 메세지가 출력되면 유니티의 :arrow_forward: 버튼을 눌러주세요 +9. 유니티 윈도우 상에서 선생님 브레인을 가진 에이전트를 제어하면서 원하는대로 플레이 데이터를 생성합니다. +10. 학생 브레인을 가진 에이전트(들)을 살펴보면 선생님 브레인을 가진 에이전트의 플레이와 유사하게 행동하기 시작합니다. +11. 학생 에이전트들이 원하는대로 행동하게 되면 커멘드 라인에서 `CTL+C`를 눌러서 학습을 중단하십시오. +12. 생성된 `*.nn` 파일을 Assets 폴더의 하위 폴더인 `TFModels` 폴더로 이동시키고 이 파일을 `러닝` 브레인에 사용하세요. + +**BC Teacher Helper** + +더 편리한 사용을 위해서, `BC Teacher Helper` 컴포넌트를 선생님 에이전트에 사용할 수 있습니다. + +

+ BC Teacher Helper +

+ +이것을 사용하면 다음과 같은 키보드 단축키를 사용할 수 있습니다: + +1. 기록을 시작하거나 중단할 수 있습니다. 이것은 에이전트를 통해 게임을 플레이하되 에이전트가 학습은 되지 않도록 사용할 때 유용합니다. 이것에 대한 기본적인 실행은 키보드의 `R` 버튼을 누르면 됩니다. +2. 트레이닝 버퍼를 리셋합니다. 이 명령을 통해 에이전트가 최근의 경험에 대한 버퍼를 비우도록 설정합니다. 이것은 에이전트가 빠르게 새로운 행동을 배우게 하고싶을때 사용하면 유용합니다. 버퍼를 리셋하기 위한 기본 명령은 키보드의 `C` 버튼을 누르면 됩니다. + + + +## 한글 번역 + +해당 문서의 한글 번역은 [민규식 (Kyushik Min)]([https://github.com/Kyushik](https://github.com/Kyushik))에 의해 진행되었습니다. 내용상 오류나 오탈자가 있는 경우 kyushikmin@gmail.com 으로 연락주시면 감사드리겠습니다. \ No newline at end of file diff --git a/docs/localized/KR/docs/Training-PPO.md b/docs/localized/KR/docs/Training-PPO.md new file mode 100644 index 0000000000..d61af58b84 --- /dev/null +++ b/docs/localized/KR/docs/Training-PPO.md @@ -0,0 +1,151 @@ +# Proximal Policy Optimization를 이용한 학습 + +ML-Agents는 [Proximal Policy Optimization (PPO)](https://blog.openai.com/openai-baselines-ppo/) 라는 강화학습 기법을 사용합니다. +PPO는 에이전트의 관측 (Observation)을 통해 에이전트가 주어진 상태에서 최선의 행동을 선택할 수 있도록 하는 이상적인 함수를 인공신경망을 이용하여 근사하는 기법입니다. ML-agents의 PPO 알고리즘은 텐서플로우로 구현되었으며 별도의 파이썬 프로세스 (소켓 통신을 통해 실행중인 유니티 프로그램과 통신)에서 실행됩니다. + +에이전트를 학습하기 위해서 사용자는 에이전트가 최대화하도록 시도하는 보상 시그널을 하나 혹은 그 이상 설정해야합니다. 사용 가능한 보상 시그널들과 관련된 하이퍼파라미터에 대해서는 [보상 시그널](Training-RewardSignals.md) 문서를 참고해주십시오. + +`learn.py`를 이용하여 학습 프로그램을 실행하는 방법은 [ML-Agents 학습](Training-ML-Agents.md) 문서를 참고해주십시오. + +만약 에이전트에게 기억력을 부여하기 위해 순환 신경망 (Recurrent Neural Network, RNN)을 사용하는 경우, 순환 신경망에 대한 구체적인 학습 방법을 설명하는 [순환 신경망 사용하기](Feature-Memory.md) 문서를 참고해주십시오. + + +만약 에이전트에게 제시된 문제의 난이도를 점차적으로 증가시키며 학습하는 커리큘럼 학습 (Curriculum Learning)을 사용하는 경우 [커리큘럼 학습을 통한 에이전트 학습](Training-Curriculum-Learning.md) 문서를 참고해주십니오. + +모방 학습 (Imitation Learning)에 대한 정보를 얻고 싶으시다면 [모방 학습을 통한 에이전트 학습](Training-Imitation-Learning.md) 문서를 참고해주십시오. + + + +## PPO 학습을 위한 하이퍼파라미터 + +강화학습 모델을 성공적으로 학습하기 위해서는 학습과 관련된 하이퍼파라미터 튜닝이 필요합니다. 이 가이드는 기본적인 파라미터들을 이용하여 학습했을 때 사용자가 원하는 성능을 만족하지 못한 경우 파라미터 튜닝을 수행하는 방법에 대해 설명합니다. + +## 하이퍼파라미터 + +### Reward Signals + +강화학습에서 목표는 보상을 최대로 하는 정책 (Policy)을 학습하는 것입니다. 기본적으로 보상은 환경으로부터 주어집니다. 그러나 우리는 다양한 다른 행동을 통해 에이전트에게 보상을 주는 것을 생각해볼 수 있습니다. 예를 들어 에이전트가 새로운 상태를 탐험했을 때 에이전트에게 보상을 줄 수 있습니다. 이런 보상 시그널을 추가하여 학습 과정에 도움을 줄 수도 있습니다. + +`reward_signals`는 [보상 시그널](Training-RewardSignals.md)을 정의합니다. ML-Agents는 기본적으로 두개의 보상 시그널을 제공합니다. 하나는 외부 (환경) 보상이며 다른 하나는 호기심 (Curiosity) 보상입니다. 이 호기심 보상은 외부 보상이 희소성을 가지는 환경 (Sparse Extrinsic Reward Environment)에서 더 다양한 탐험을 수행할 수 있도록 도와줍니다. + +### Lambda + +`lambd` 는 `람다(lambda)` 파라미터를 의미하며 일반화된 이득 추정 (Generalized Advantage Estimate, [GAE]((https://arxiv.org/abs/1506.02438))) 계산에 사용됩니다. 이는 업데이트된 가치를 예측할 때 현재 예측된 가치에 얼마나 의존할지 결정하는 값입니다. 이 값이 낮으면 현재 예측된 가치에 더 의존하는 것을 의미하며 (높은 편향 (bias) 발생 가능), 값이 높으면 환경을 통해 얻은 실제 보상에 더 의존하는 것을 의미합니다 (높은 분산 (variance) 발생 가능). 즉 이 파라미터를 어떻게 선택하냐에 따라 두 특성간에 트레이드오프 (trade-off)가 존재합니다. 또한 이 파라미터를 적절하게 선택하면 더 안정적인 학습이 가능합니다. + +일반적인 범위: `0.9` - `0.95` + +### Buffer Size + +`buffer_size` 는 모델 학습을 시작하기 전 얼마나 많은 경험들(관측, 행동, 보상 등)을 저장할지 결정합니다. **이 값은 `batch_size`의 배수로 설정되어야 합니다.** 일반적으로 큰 `buffer_size`는 더 안정적인 학습을 가능하게 합니다. + +일반적인 범위: `2048` - `409600` + +### Batch Size + +`batch_size` 는 한번의 경사하강(Gradient Descent) 업데이트를 수행할 때 사용할 경험들의 수를 의미합니다. **이 값은 항상 `buffer_size`의 약수로 설정되어야 합니다.** 만약 연속적인 행동 공간 (Continuous Action Space) 환경을 사용하는 경우 이 값은 크게 설정되어야 합니다 (1000의 단위). 만약 이산적인 행동 공간 (Discrete Action Space) 환경을 사용하는 경우 이 값은 더 작게 설정되어야 합니다. (10의 단위). + +일반적인 범위 (연속적인 행동): `512` - `5120` + +일반적인 범위 (이산적인 행동): `32` - `512` + +### Number of Epochs + +`num_epoch` 는 경사 하강 (Gradient Descent) 학습 동안 경험 버퍼 (Experience Buffer) 데이터에 대해 학습을 몇번 수행할 지 결정합니다. 
`batch_size`가 클수록 이 값도 커져야합니다. 이 값을 줄이면 더 안정적인 업데이트가 보장되지만 학습 속도가 느려집니다. + +일반적인 범위: `3` - `10` + +### Learning Rate + +`learning_rate` 는 경사 하강 (Gradient Descent) 학습의 정도를 결정합니다. 학습이 불안정하고 에이전트가 얻는 보상이 증가하지 않는 경우 일반적으로 학습률을 감소시킵니다. + +일반적인 범위: `1e-5` - `1e-3` + +### Time Horizon + +`time_horizon` 은 경험 버퍼 (Experience Buffer)에 저장하기 전 에이전트당 수집할 경험의 스텝 수를 의미합니다. 에피소드가 끝나기 전에 이 한도에 도달하면 가치 평가를 통해 에이전트의 현재 상태로부터 기대되는 전체 보상을 예측합니다. 따라서 이 값의 설정에 따라 덜 편향되지만 분산이 커질수도 있고 (긴 time horizon), 더 편향 (bias)되지만 분산 (variance)이 작아질 수도 있습니다 (짧은 time horizon). 한 에피소드 동안 보상이 빈번하게 발생하는 경우나 에피소드가 엄청나게 긴 경우에는 time horizon 값은 작게 설정하는 것이 이상적입니다. 이 값은 에이전트가 취하는 일련의 행동 내에서 중요한 행동을 모두 포착할 수 있을 만큼 큰 값을 가져야 합니다. + +일반적인 범위: `32` - `2048` + +### Max Steps + +`max_steps` 은 학습 과정 동안 얼마나 많은 시뮬레이션 스텝 (프레임 스킵을 곱한만큼) 을 실행할지 결정합니다. 이 값은 복잡한 문제일수록 크게 설정해야합니다. + +일반적인 범위: `5e5` - `1e7` + +### Beta + +`beta` 는 엔트로피 정규화 (Entropy Regulazation)의 정도를 결정하며 이를 통해 정책을 더 랜덤하게 만들 수 있습니다. 이 값을 통해 에이전트는 학습 동안 액션 공간을 적절하게 탐험할 수 있습니다. 이 값을 증가시키면 에이전트가 더 많이 랜덤 행동을 취하게 됩니다. 엔트로피 (텐서보드를 통해 측정 가능)는 보상이 증가함에 따라 서서히 크기를 감소시켜야합니다. 만약 엔트로피가 너무 빠르게 떨어지면 `beta`를 증가시켜야합니다. 만약 엔트로피가 너무 느리게 떨어지면 `beta`를 감소시켜야 합니다. + +일반적인 범위: 1e-4 - 1e-2 + +### Epsilon + +`epsilon` 은 경사 하강 업데이트 동안 사용하는 이전 정책과 새로운 정책 사이의 비율을 일정 범위의 크기로 제한하는 값입니다. 이 값이 작게 설정되면 더 안정적인 학습이 가능하지만 학습이 느리게 진행될 것입니다. + +일반적인 범위: `0.1` - `0.3` + +### Normalize + +`normalize`는 벡터 관측 (Vector Observation) 입력을 정규화 (Normalization)할지 결정합니다. 이 정규화는 벡터 관측의 이동 평균 및 분산을 기반으로 수행합니다. 정규화는 복잡하고 연속적인 제어 문제에서 도움이 될 수 있지만 단순하고 이산적인 제어 문제에서는 정규화를 사용하는 것이 좋지 않을 수 있습니다. + +### Number of Layers + +`num_layers` 는 관측 입력 후 혹은 시각적 관측 (Visual Observation)의 CNN 인코딩 이후 몇개의 은닉층 (Hidden Layer)을 사용할지 결정합니다. 간단한 문제에서는 적은 수의 층을 사용하여 빠르고 효율적으로 학습해야합니다. 복잡한 제어 문제에서는 많은 층을 사용할 필요가 있습니다. + +일반적인 범위: `1` - `3` + +### Hidden Units + +`hidden_units` 은 인공신경망의 각 완전연결층 (Fully Connected Layer)에 몇개의 유닛을 사용할지 결정합니다. 최적의 행동이 관측 입력의 간단한 조합으로 결정되는 단순한 문제에 대해서는 이 값을 작게 설정합니다. 최적의 행동이 관측 입력의 복잡한 관계에 의해 결정되는 어려운 문제에 대해서는 이 값을 크게 설정합니다. + +일반적인 범위: `32` - `512` + +## (선택적) 순환신경망의 하이퍼파라미터 + +아래의 하이퍼파라미터들은 `use_recurrent` 이 참(True)으로 결정된 경우에만 사용합니다. + +### Sequence Length + +`sequence_length` 는 학습 동안 네트워크를 통과하는 연속적인 경험들의 길이를 의미합니다. 에이전트가 긴 시간에 대해 기억해야하는 정보가 있다면 이 값을 충분히 길게 설정해야합니다. 예를 들어 에이전트가 물체의 속도를 기억해야하는 경우 이 값은 작게 설정해도 괜찮습니다. 만약 에이전트가 에피소드 초반에 한번 주어진 정보를 계속 기억해야한다면 이 값을 크게 설정해야 합니다. + +일반적인 범위: `4` - `128` + +### Memory Size + +`memory_size` 는 순환신경망의 은닉 상태(hidden state)를 저장하는데 사용되는 배열의 크기를 의미합니다. 이 값은 반드시 4의 배수로 설정되어야 하며 에이전트가 임무를 성공적으로 완수하기 위해서 기억해야하는 정보의 양에 따라 크기를 조절해야합니다. + +일반적인 범위: `64` - `512` + +## Training Statistics + +학습의 상태를 확인하려면 텐서보드 (TensorBoard)를 사용해야합니다. 텐서보드를 실행하고 사용하는 것에 대한 정보를 알고싶으신 경우 이 [문서](./Getting-Started-with-Balance-Ball.md#observing-training-progress)를 참고해주십시오. + +### Cumulative Reward + +보상은 일반적으로 지속적으로 증가하는 경향을 가져야합니다. 작은 기복이 발생할수는 있습니다. 문제의 복잡도에 따라 수백만 스텝의 학습이 진행되어도 보상이 증가하지 않을수도 있습니다. + +### Entropy + +이 값은 브레인이 결정이 얼마나 무작위인지 나타냅니다. 이 값은 학습이 진행되는 동안 지속적으로 감소해야합니다. 만약 이 값이 너무 빠르게 감소하거나 아예 감소하지 않는 경우 `beta`의 크기를 조절해야합니다. (이산적인 행동 공간을 사용하는 경우) + +### Learning Rate + +이 값은 시간이 지남에 따라 선형적으로 감소합니다. + +### Policy Loss + +이 값들은 학습이 진행되는 동안 진동합니다. 일반적으로 이 값들은 1보다 작아야합니다. + +### Value Estimate + +이 값들은 누적 보상이 증가함에 따라 커져야합니다. 이 값들은 주어진 시점에서 에이전트가 스스로 받을 것이라 예측하는 미래의 보상이 얼마나 될것인지를 나타냅니다. + +### Value Loss + +이 값들은 보상이 증가하면 증가하고 보상이 안정되면 감소합니다. + + + +## 한글 번역 + +해당 문서의 한글 번역은 [민규식 (Kyushik Min)]([https://github.com/Kyushik](https://github.com/Kyushik))에 의해 진행되었습니다. 내용상 오류나 오탈자가 있는 경우 kyushikmin@gmail.com 으로 연락주시면 감사드리겠습니다. 
\ No newline at end of file diff --git a/docs/localized/KR/docs/Using-Docker.md b/docs/localized/KR/docs/Using-Docker.md new file mode 100644 index 0000000000..947479296d --- /dev/null +++ b/docs/localized/KR/docs/Using-Docker.md @@ -0,0 +1,123 @@ +# ML-Agents 용 도커 사용법 + +도커를 사용해 추론과 학습을 하고자하는 Windows와 Mac 사용자를 위한 솔루션을 제공합니다. +이것은 Python과 TensorFlow 설치를 피하고자 하는 분에게 매력적인 옵션이 될 것입니다. 현재 설정은 TensorFlow와 Unity가 _CPU를 통해서만_ +계산하도록 합니다. 따라서 도커 시뮬레이션은 GPU를 사용하지 않고 시각적 렌더링을 위해 [`Xvfb`](https://en.wikipedia.org/wiki/Xvfb)를 사용합니다. +`Xvfb`는 `ML-Agents`(또는 다른 응용 프로그램)가 가상으로 렌더링을 할 수 있게하는 유틸리티 입니다. 즉, `ML-Agents`를 실행하는 기계가 GPU를 가지고 있거나 +디스플레이를 가지고 있다고 가정하지 않습니다. 이것은 카메라 기반의 시각적 관찰 요소가 포함된 환경은 더욱 느려질 수도 있음을 의미합니다. + +## 요구사항 + +- 유니티 _Linux Build Support_ 컴포넌트 +- [도커](https://www.docker.com) + +## 설치 + +- 유니티 인스톨러를 [다운로드](https://unity3d.com/kr/get-unity/download)하고 _Linux Build Support_ 컴포넌트를 추가하십시오. + +- 도커가 설치되어 있지 않다면 [다운로드](https://www.docker.com/community-edition#/download)하고 설치 하십시오. + +- 호스트 머신과 분리된 환경에서 도커를 실행하기 때문에, 호스트 머신안에 마운트된 디렉토리는 트레이너 환경 설정 파일, + 유니티 실행 파일, 커리큘럼 파일과 TensorFlow 그래프와 같은 데이터를 공유하기위해 사용됩니다. + 이를 위해, 편의상 비어있는 `unity-volume` 디렉토리를 저장소의 루트에 만들었으나, 다른 디렉토리의 사용은 자유롭게 할 수 있습니다. + 이 가이드의 나머지 부분에서는 `unity-volume` 디렉토리가 사용된다고 가정하고 진행됩니다. + +## 사용법 + +ML-Agents 용 도커 사용에는 세 단계가 포함됩니다.: 특정 플래그를 사용하여 유니티 환경 빌드, 도커 컨테이너 빌드 +마지막으로, 컨테이너 실행. 만약 ML-Agents 용 유니티 환경 빌드에 익숙하지 않다면, [3D 밸런스 볼 예제와 함께 시작하기](Getting-Started-with-Balance-Ball.md) 가이드를 먼저 읽으십시오. + +### 환경 빌드 (옵션) + +_학습을 위해 에디터 사용을 원한다면 이 단계를 건너뛸 수 있습니다._ + +도커는 일반적으로 호스트 머신과 (리눅스) 커널을 공유하는 컨테이너를 실행하기 때문에, +유니티 환경은 리눅스 플랫폼이 구축되어야 합니다. 유니티 환경을 빌드할 때, 빌드 세팅 창(Build Settings window)에서 +다음 옵션을 선택해 주십시오: + +- 타겟 플랫폼을 `리눅스`로 설정 (Set the _Target Platform_ to `Linux`) +- _아키텍처_를 `x86_64'로 설정 (Set the _Architecture_ to `x86_64`) +- 환경에서 시각적인 관찰을 필요로 하지않는다면, `headless` 옵션을 선택할 수 있습니다 (아래 사진 참조). + +`빌드` (Build)를 클릭하고, 환경 이름을 선택하고 (예시: `3DBall`) 출력 디레토리를 `unity-volume`으로 설정하십시오. +빌드 후에, 파일 `<환경 이름>.x86_64` 와 하위디렉토리 `<환경 이름>_Data/` 가 `unity-volume` 에 생성 되어있는지 확인하십시오. + +![도커를 위한 빌드 설정](images/docker_build_settings.png) + +### 도커 컨테이너 빌드 + +첫 번째, 도커 머신이 시스템에서 작동하는지 확인하십시오. 저장소의 최상단에서 다음 명령어를 호출하여 +도커 컨테이너를 빌드하십시오: + +```sh +docker build -t . +``` + +``을 도커 이미지 이름으로 바꾸십시오, 예시: `balance.ball.v0.1`. + +### 도커 컨테이너 실행 + +저장소의 최상단에서 다음 명령어를 호출하여 도커 컨테이너를 실행하십시오: + +```sh +docker run --name \ + --mount type=bind,source="$(pwd)"/unity-volume,target=/unity-volume \ + -p 5005:5005 \ + :latest \ + --docker-target-name=unity-volume \ + \ + --env= \ + --train \ + --run-id= +``` + +인수(argument) 값 정보: + +- `` 은 컨테이너를 구분하기위해 사용됩니다 (컨테이너를 인터럽트하거나 종료시킬 때). +이것은 선택사항이며 설정하지 않았을 경우 도커는 랜덤한 이름을 생성합니다. _도커 이미지를 실행할 때마다 +고유한 이름을 가져야함에 유의하십시오._ +- `` 컨테이너를 빌드할 때 사용할 image name을 참조합니다. +- `` __(옵션)__: 리눅스 실행파일과 함께 학습을 할 경우, 인수 값이 실행파일의 이름이 된다. +에디터에서 학습을 할 경우, `` 인수를 전달하지 말고 유니티에서 _"Start training by pressing + the Play button in the Unity Editor"_ 메세지가 화면에 표시될 때 :arrow_forward: 버튼을 누르십시오. +- `source`: 유니티 실행파일을 저장할 호스트 운영체제의 경로를 참조합니다. +- `target`: 도커가`source` 경로에 이 이름을 가진 디스크로 마운트하도록 합니다. +- `docker-target-name`: ML-Agents 파이썬 패키지에게 유니티 실행파일을 읽고 그래프를 저장할 수 있는 디스크의 이름을 알려준다. +**그러므로 `target`과 동일한 값을 가져야 합니다.** +- `trainer-config-file`, `train`, `run-id`: ML-Agents 인자들은 `mlagents-learn`로 전달됩니다. 트레이너 설정 파일의 이름 `trainer-config-file`, +알고리즘을 학습하는 `train`, 그리고 각 실험에 고유한 식별자를 태깅하는데 사용되는 `run-id`. +컨테이너가 파일에 접근할 수 있도록 trainer-config 파일을 `unity-volume` 안에 둘 것을 권장합니다. 
+ +`3DBall` 환경 실행파일을 학습하기 위해 다음 명령어가 사용됩니다: + +```sh +docker run --name 3DBallContainer.first.trial \ + --mount type=bind,source="$(pwd)"/unity-volume,target=/unity-volume \ + -p 5005:5005 \ + balance.ball.v0.1:latest 3DBall \ + --docker-target-name=unity-volume \ + trainer_config.yaml \ + --env=3DBall + --train \ + --run-id=3dball_first_trial +``` + +도커 마운트에 대한 세부 사항은 도커의 [이 문서](https://docs.docker.com/storage/bind-mounts/)를 참고해 주십시오. + +**참고** 도커를 사용해 시각적인 관찰을 포함한 환경을 학습할 경우, 콘테이너를 위해 할당한 도커의 디폴트 메모리를 늘려야할 것입니다. +예를 들어, [여기](https://docs.docker.com/docker-for-mac/#advanced) Mac 사용자를 위한 도커 지시사항을 봐주십시오. + +### 컨테이너 중지 및 상태 저장 + +학습 진행 상황에 만족했을 경우, 상태를 저장하는 동안 `Ctrl+C` or `⌘+C` (Mac) 키를 사용하거나 다음 명령어를 통해 도커 컨테이너를 중지할 수 있습니다: + +```sh +docker kill --signal=SIGINT +``` + +`` 은 `docker run` 명령어에 지정된 컨테이너 이름입니다. 지정하지 않으면 무작위로 생성되며`docker container ls`를 통해 확인할 수 있습니다. + + +## 한글 번역 + +해당 문서의 한글 번역은 [장현준 (Hyeonjun Jang)]([https://github.com/janghyeonjun](https://github.com/janghyeonjun))에 의해 진행되었습니다. 내용상 오류나 오탈자가 있는 경우 totok682@naver.com 으로 연락주시면 감사드리겠습니다. diff --git a/docs/localized/KR/docs/images/3dball_learning_brain.png b/docs/localized/KR/docs/images/3dball_learning_brain.png new file mode 100644 index 0000000000..1f4a4440ed Binary files /dev/null and b/docs/localized/KR/docs/images/3dball_learning_brain.png differ diff --git a/docs/localized/KR/docs/images/3dballhard.png b/docs/localized/KR/docs/images/3dballhard.png new file mode 100644 index 0000000000..a452167157 Binary files /dev/null and b/docs/localized/KR/docs/images/3dballhard.png differ diff --git a/docs/localized/KR/docs/images/academy.png b/docs/localized/KR/docs/images/academy.png new file mode 100644 index 0000000000..62f3e5f8e5 Binary files /dev/null and b/docs/localized/KR/docs/images/academy.png differ diff --git a/docs/localized/KR/docs/images/agent.png b/docs/localized/KR/docs/images/agent.png new file mode 100644 index 0000000000..1918afe54d Binary files /dev/null and b/docs/localized/KR/docs/images/agent.png differ diff --git a/docs/localized/KR/docs/images/anaconda_default.PNG b/docs/localized/KR/docs/images/anaconda_default.PNG new file mode 100644 index 0000000000..9d65d81b69 Binary files /dev/null and b/docs/localized/KR/docs/images/anaconda_default.PNG differ diff --git a/docs/localized/KR/docs/images/anaconda_install.PNG b/docs/localized/KR/docs/images/anaconda_install.PNG new file mode 100644 index 0000000000..237c59cae7 Binary files /dev/null and b/docs/localized/KR/docs/images/anaconda_install.PNG differ diff --git a/docs/localized/KR/docs/images/balance.png b/docs/localized/KR/docs/images/balance.png new file mode 100644 index 0000000000..ee9049f6af Binary files /dev/null and b/docs/localized/KR/docs/images/balance.png differ diff --git a/docs/localized/KR/docs/images/banana.png b/docs/localized/KR/docs/images/banana.png new file mode 100644 index 0000000000..c485f22126 Binary files /dev/null and b/docs/localized/KR/docs/images/banana.png differ diff --git a/docs/localized/KR/docs/images/bananaimitation.png b/docs/localized/KR/docs/images/bananaimitation.png new file mode 100644 index 0000000000..7fcf2fed87 Binary files /dev/null and b/docs/localized/KR/docs/images/bananaimitation.png differ diff --git a/docs/localized/KR/docs/images/banner.png b/docs/localized/KR/docs/images/banner.png new file mode 100644 index 0000000000..9068615db9 Binary files /dev/null and b/docs/localized/KR/docs/images/banner.png differ diff --git a/docs/localized/KR/docs/images/basic.png b/docs/localized/KR/docs/images/basic.png new 
file mode 100644 index 0000000000..0824ac6091 Binary files /dev/null and b/docs/localized/KR/docs/images/basic.png differ diff --git a/docs/localized/KR/docs/images/bc_teacher_helper.png b/docs/localized/KR/docs/images/bc_teacher_helper.png new file mode 100644 index 0000000000..0f188f36ef Binary files /dev/null and b/docs/localized/KR/docs/images/bc_teacher_helper.png differ diff --git a/docs/localized/KR/docs/images/bouncer.png b/docs/localized/KR/docs/images/bouncer.png new file mode 100644 index 0000000000..b36764bfa3 Binary files /dev/null and b/docs/localized/KR/docs/images/bouncer.png differ diff --git a/docs/localized/KR/docs/images/brain.png b/docs/localized/KR/docs/images/brain.png new file mode 100644 index 0000000000..b7e45cdfb7 Binary files /dev/null and b/docs/localized/KR/docs/images/brain.png differ diff --git a/docs/localized/KR/docs/images/broadcast.png b/docs/localized/KR/docs/images/broadcast.png new file mode 100644 index 0000000000..5428110aef Binary files /dev/null and b/docs/localized/KR/docs/images/broadcast.png differ diff --git a/docs/localized/KR/docs/images/conda_new.PNG b/docs/localized/KR/docs/images/conda_new.PNG new file mode 100644 index 0000000000..96d6cc8bf4 Binary files /dev/null and b/docs/localized/KR/docs/images/conda_new.PNG differ diff --git a/docs/localized/KR/docs/images/crawler.png b/docs/localized/KR/docs/images/crawler.png new file mode 100644 index 0000000000..3b5c46050a Binary files /dev/null and b/docs/localized/KR/docs/images/crawler.png differ diff --git a/docs/localized/KR/docs/images/cuDNN_membership_required.png b/docs/localized/KR/docs/images/cuDNN_membership_required.png new file mode 100644 index 0000000000..6a7ffc6cd2 Binary files /dev/null and b/docs/localized/KR/docs/images/cuDNN_membership_required.png differ diff --git a/docs/localized/KR/docs/images/cuda_toolkit_directory.PNG b/docs/localized/KR/docs/images/cuda_toolkit_directory.PNG new file mode 100644 index 0000000000..304ec7fc57 Binary files /dev/null and b/docs/localized/KR/docs/images/cuda_toolkit_directory.PNG differ diff --git a/docs/localized/KR/docs/images/cudnn_zip_files.PNG b/docs/localized/KR/docs/images/cudnn_zip_files.PNG new file mode 100644 index 0000000000..9170f34f94 Binary files /dev/null and b/docs/localized/KR/docs/images/cudnn_zip_files.PNG differ diff --git a/docs/localized/KR/docs/images/curriculum.png b/docs/localized/KR/docs/images/curriculum.png new file mode 100644 index 0000000000..e62c256b81 Binary files /dev/null and b/docs/localized/KR/docs/images/curriculum.png differ diff --git a/docs/localized/KR/docs/images/curriculum_progress.png b/docs/localized/KR/docs/images/curriculum_progress.png new file mode 100644 index 0000000000..adbced8261 Binary files /dev/null and b/docs/localized/KR/docs/images/curriculum_progress.png differ diff --git a/docs/localized/KR/docs/images/demo_component.png b/docs/localized/KR/docs/images/demo_component.png new file mode 100644 index 0000000000..6cc78380bb Binary files /dev/null and b/docs/localized/KR/docs/images/demo_component.png differ diff --git a/docs/localized/KR/docs/images/demo_inspector.png b/docs/localized/KR/docs/images/demo_inspector.png new file mode 100644 index 0000000000..9cb7a60980 Binary files /dev/null and b/docs/localized/KR/docs/images/demo_inspector.png differ diff --git a/docs/localized/KR/docs/images/docker_build_settings.png b/docs/localized/KR/docs/images/docker_build_settings.png new file mode 100644 index 0000000000..e0325a34b6 Binary files /dev/null and 
b/docs/localized/KR/docs/images/docker_build_settings.png differ diff --git a/docs/localized/KR/docs/images/edit_env_var.png b/docs/localized/KR/docs/images/edit_env_var.png new file mode 100644 index 0000000000..2fd622c431 Binary files /dev/null and b/docs/localized/KR/docs/images/edit_env_var.png differ diff --git a/docs/localized/KR/docs/images/edit_env_var_kr.png b/docs/localized/KR/docs/images/edit_env_var_kr.png new file mode 100644 index 0000000000..c94f359e6c Binary files /dev/null and b/docs/localized/KR/docs/images/edit_env_var_kr.png differ diff --git a/docs/localized/KR/docs/images/gridworld.png b/docs/localized/KR/docs/images/gridworld.png new file mode 100644 index 0000000000..f31d6e01a1 Binary files /dev/null and b/docs/localized/KR/docs/images/gridworld.png differ diff --git a/docs/localized/KR/docs/images/hallway.png b/docs/localized/KR/docs/images/hallway.png new file mode 100644 index 0000000000..776c73f70e Binary files /dev/null and b/docs/localized/KR/docs/images/hallway.png differ diff --git a/docs/localized/KR/docs/images/image-banner.png b/docs/localized/KR/docs/images/image-banner.png new file mode 100644 index 0000000000..1c705bb00e Binary files /dev/null and b/docs/localized/KR/docs/images/image-banner.png differ diff --git a/docs/localized/KR/docs/images/internal_brain.png b/docs/localized/KR/docs/images/internal_brain.png new file mode 100644 index 0000000000..1a434e03fc Binary files /dev/null and b/docs/localized/KR/docs/images/internal_brain.png differ diff --git a/docs/localized/KR/docs/images/learning_environment.png b/docs/localized/KR/docs/images/learning_environment.png new file mode 100644 index 0000000000..df590a0b9f Binary files /dev/null and b/docs/localized/KR/docs/images/learning_environment.png differ diff --git a/docs/localized/KR/docs/images/learning_environment_basic.png b/docs/localized/KR/docs/images/learning_environment_basic.png new file mode 100644 index 0000000000..14431e8c50 Binary files /dev/null and b/docs/localized/KR/docs/images/learning_environment_basic.png differ diff --git a/docs/localized/KR/docs/images/learning_environment_example.png b/docs/localized/KR/docs/images/learning_environment_example.png new file mode 100644 index 0000000000..7ffaa030b6 Binary files /dev/null and b/docs/localized/KR/docs/images/learning_environment_example.png differ diff --git a/docs/localized/KR/docs/images/math.png b/docs/localized/KR/docs/images/math.png new file mode 100644 index 0000000000..6cd1b696dd Binary files /dev/null and b/docs/localized/KR/docs/images/math.png differ diff --git a/docs/localized/KR/docs/images/ml-agents-LSTM.png b/docs/localized/KR/docs/images/ml-agents-LSTM.png new file mode 100644 index 0000000000..313a8edc1f Binary files /dev/null and b/docs/localized/KR/docs/images/ml-agents-LSTM.png differ diff --git a/docs/localized/KR/docs/images/mlagents-3DBallHierarchy.png b/docs/localized/KR/docs/images/mlagents-3DBallHierarchy.png new file mode 100644 index 0000000000..a853848a48 Binary files /dev/null and b/docs/localized/KR/docs/images/mlagents-3DBallHierarchy.png differ diff --git a/docs/localized/KR/docs/images/mlagents-BuildWindow.png b/docs/localized/KR/docs/images/mlagents-BuildWindow.png new file mode 100644 index 0000000000..4eae2512d6 Binary files /dev/null and b/docs/localized/KR/docs/images/mlagents-BuildWindow.png differ diff --git a/docs/localized/KR/docs/images/mlagents-NewProject.png b/docs/localized/KR/docs/images/mlagents-NewProject.png new file mode 100644 index 0000000000..81f5d994ef Binary files /dev/null 
and b/docs/localized/KR/docs/images/mlagents-NewProject.png differ diff --git a/docs/localized/KR/docs/images/mlagents-NewTutAcademy.png b/docs/localized/KR/docs/images/mlagents-NewTutAcademy.png new file mode 100644 index 0000000000..d3bf3289a8 Binary files /dev/null and b/docs/localized/KR/docs/images/mlagents-NewTutAcademy.png differ diff --git a/docs/localized/KR/docs/images/mlagents-NewTutAssignBrain.png b/docs/localized/KR/docs/images/mlagents-NewTutAssignBrain.png new file mode 100644 index 0000000000..b657046c88 Binary files /dev/null and b/docs/localized/KR/docs/images/mlagents-NewTutAssignBrain.png differ diff --git a/docs/localized/KR/docs/images/mlagents-NewTutBlock.png b/docs/localized/KR/docs/images/mlagents-NewTutBlock.png new file mode 100644 index 0000000000..8ef983a869 Binary files /dev/null and b/docs/localized/KR/docs/images/mlagents-NewTutBlock.png differ diff --git a/docs/localized/KR/docs/images/mlagents-NewTutBrain.png b/docs/localized/KR/docs/images/mlagents-NewTutBrain.png new file mode 100644 index 0000000000..23a5093d81 Binary files /dev/null and b/docs/localized/KR/docs/images/mlagents-NewTutBrain.png differ diff --git a/docs/localized/KR/docs/images/mlagents-NewTutFloor.png b/docs/localized/KR/docs/images/mlagents-NewTutFloor.png new file mode 100644 index 0000000000..7c070c40ee Binary files /dev/null and b/docs/localized/KR/docs/images/mlagents-NewTutFloor.png differ diff --git a/docs/localized/KR/docs/images/mlagents-NewTutHierarchy.png b/docs/localized/KR/docs/images/mlagents-NewTutHierarchy.png new file mode 100644 index 0000000000..d1c4e350c5 Binary files /dev/null and b/docs/localized/KR/docs/images/mlagents-NewTutHierarchy.png differ diff --git a/docs/localized/KR/docs/images/mlagents-NewTutSphere.png b/docs/localized/KR/docs/images/mlagents-NewTutSphere.png new file mode 100644 index 0000000000..55d6e3cb47 Binary files /dev/null and b/docs/localized/KR/docs/images/mlagents-NewTutSphere.png differ diff --git a/docs/localized/KR/docs/images/mlagents-NewTutSplash.png b/docs/localized/KR/docs/images/mlagents-NewTutSplash.png new file mode 100644 index 0000000000..0e6efc2181 Binary files /dev/null and b/docs/localized/KR/docs/images/mlagents-NewTutSplash.png differ diff --git a/docs/localized/KR/docs/images/mlagents-Open3DBall.png b/docs/localized/KR/docs/images/mlagents-Open3DBall.png new file mode 100644 index 0000000000..840ad6b64f Binary files /dev/null and b/docs/localized/KR/docs/images/mlagents-Open3DBall.png differ diff --git a/docs/localized/KR/docs/images/mlagents-RollerAgentStats.png b/docs/localized/KR/docs/images/mlagents-RollerAgentStats.png new file mode 100644 index 0000000000..f1cde7cda2 Binary files /dev/null and b/docs/localized/KR/docs/images/mlagents-RollerAgentStats.png differ diff --git a/docs/localized/KR/docs/images/mlagents-SetBrainToTrain.png b/docs/localized/KR/docs/images/mlagents-SetBrainToTrain.png new file mode 100644 index 0000000000..9fa8347e3d Binary files /dev/null and b/docs/localized/KR/docs/images/mlagents-SetBrainToTrain.png differ diff --git a/docs/localized/KR/docs/images/mlagents-SetExternalBrain.png b/docs/localized/KR/docs/images/mlagents-SetExternalBrain.png new file mode 100644 index 0000000000..7637fa50c0 Binary files /dev/null and b/docs/localized/KR/docs/images/mlagents-SetExternalBrain.png differ diff --git a/docs/localized/KR/docs/images/mlagents-TensorBoard.png b/docs/localized/KR/docs/images/mlagents-TensorBoard.png new file mode 100644 index 0000000000..a4e3fde36f Binary files /dev/null and 
b/docs/localized/KR/docs/images/mlagents-TensorBoard.png differ diff --git a/docs/localized/KR/docs/images/monitor.png b/docs/localized/KR/docs/images/monitor.png new file mode 100644 index 0000000000..da342630c6 Binary files /dev/null and b/docs/localized/KR/docs/images/monitor.png differ diff --git a/docs/localized/KR/docs/images/new_system_variable.PNG b/docs/localized/KR/docs/images/new_system_variable.PNG new file mode 100644 index 0000000000..b27365977a Binary files /dev/null and b/docs/localized/KR/docs/images/new_system_variable.PNG differ diff --git a/docs/localized/KR/docs/images/new_system_variable_kr.PNG b/docs/localized/KR/docs/images/new_system_variable_kr.PNG new file mode 100644 index 0000000000..14c879a333 Binary files /dev/null and b/docs/localized/KR/docs/images/new_system_variable_kr.PNG differ diff --git a/docs/localized/KR/docs/images/normalization.png b/docs/localized/KR/docs/images/normalization.png new file mode 100644 index 0000000000..768c436971 Binary files /dev/null and b/docs/localized/KR/docs/images/normalization.png differ diff --git a/docs/localized/KR/docs/images/path_variables.PNG b/docs/localized/KR/docs/images/path_variables.PNG new file mode 100644 index 0000000000..35745c56a5 Binary files /dev/null and b/docs/localized/KR/docs/images/path_variables.PNG differ diff --git a/docs/localized/KR/docs/images/path_variables_kr.PNG b/docs/localized/KR/docs/images/path_variables_kr.PNG new file mode 100644 index 0000000000..b7193984cc Binary files /dev/null and b/docs/localized/KR/docs/images/path_variables_kr.PNG differ diff --git a/docs/localized/KR/docs/images/platform_prefab.png b/docs/localized/KR/docs/images/platform_prefab.png new file mode 100644 index 0000000000..9eed9e3c1d Binary files /dev/null and b/docs/localized/KR/docs/images/platform_prefab.png differ diff --git a/docs/localized/KR/docs/images/player_brain.png b/docs/localized/KR/docs/images/player_brain.png new file mode 100644 index 0000000000..043f0d9c1b Binary files /dev/null and b/docs/localized/KR/docs/images/player_brain.png differ diff --git a/docs/localized/KR/docs/images/push.png b/docs/localized/KR/docs/images/push.png new file mode 100644 index 0000000000..661e5721be Binary files /dev/null and b/docs/localized/KR/docs/images/push.png differ diff --git a/docs/localized/KR/docs/images/pyramids.png b/docs/localized/KR/docs/images/pyramids.png new file mode 100644 index 0000000000..9d26a7d8cc Binary files /dev/null and b/docs/localized/KR/docs/images/pyramids.png differ diff --git a/docs/localized/KR/docs/images/reacher.png b/docs/localized/KR/docs/images/reacher.png new file mode 100644 index 0000000000..2311f6bc65 Binary files /dev/null and b/docs/localized/KR/docs/images/reacher.png differ diff --git a/docs/localized/KR/docs/images/rl_cycle.png b/docs/localized/KR/docs/images/rl_cycle.png new file mode 100644 index 0000000000..2283360dd7 Binary files /dev/null and b/docs/localized/KR/docs/images/rl_cycle.png differ diff --git a/docs/localized/KR/docs/images/running-a-pretrained-model.gif b/docs/localized/KR/docs/images/running-a-pretrained-model.gif new file mode 100644 index 0000000000..8d9e5929c8 Binary files /dev/null and b/docs/localized/KR/docs/images/running-a-pretrained-model.gif differ diff --git a/docs/localized/KR/docs/images/scene-hierarchy.png b/docs/localized/KR/docs/images/scene-hierarchy.png new file mode 100644 index 0000000000..b045adeff1 Binary files /dev/null and b/docs/localized/KR/docs/images/scene-hierarchy.png differ diff --git 
a/docs/localized/KR/docs/images/soccer.png b/docs/localized/KR/docs/images/soccer.png new file mode 100644 index 0000000000..26d59de6d3 Binary files /dev/null and b/docs/localized/KR/docs/images/soccer.png differ diff --git a/docs/localized/KR/docs/images/splitbar.png b/docs/localized/KR/docs/images/splitbar.png new file mode 100644 index 0000000000..8f3a2c2b8b Binary files /dev/null and b/docs/localized/KR/docs/images/splitbar.png differ diff --git a/docs/localized/KR/docs/images/system_variable_name_value.PNG b/docs/localized/KR/docs/images/system_variable_name_value.PNG new file mode 100644 index 0000000000..ae3a47d623 Binary files /dev/null and b/docs/localized/KR/docs/images/system_variable_name_value.PNG differ diff --git a/docs/localized/KR/docs/images/system_variable_name_value_kr.PNG b/docs/localized/KR/docs/images/system_variable_name_value_kr.PNG new file mode 100644 index 0000000000..447cab78c4 Binary files /dev/null and b/docs/localized/KR/docs/images/system_variable_name_value_kr.PNG differ diff --git a/docs/localized/KR/docs/images/tennis.png b/docs/localized/KR/docs/images/tennis.png new file mode 100644 index 0000000000..817f8a8ef9 Binary files /dev/null and b/docs/localized/KR/docs/images/tennis.png differ diff --git a/docs/localized/KR/docs/images/unity-logo-rgb.png b/docs/localized/KR/docs/images/unity-logo-rgb.png new file mode 100644 index 0000000000..4a37a405cc Binary files /dev/null and b/docs/localized/KR/docs/images/unity-logo-rgb.png differ diff --git a/docs/localized/KR/docs/images/unity-wide.png b/docs/localized/KR/docs/images/unity-wide.png new file mode 100644 index 0000000000..1668b46745 Binary files /dev/null and b/docs/localized/KR/docs/images/unity-wide.png differ diff --git a/docs/localized/KR/docs/images/unity_linux_build_support.png b/docs/localized/KR/docs/images/unity_linux_build_support.png new file mode 100644 index 0000000000..c253efcae6 Binary files /dev/null and b/docs/localized/KR/docs/images/unity_linux_build_support.png differ diff --git a/docs/localized/KR/docs/images/visual-observation-combination.png b/docs/localized/KR/docs/images/visual-observation-combination.png new file mode 100644 index 0000000000..a40b37752c Binary files /dev/null and b/docs/localized/KR/docs/images/visual-observation-combination.png differ diff --git a/docs/localized/KR/docs/images/visual-observation-debug.png b/docs/localized/KR/docs/images/visual-observation-debug.png new file mode 100644 index 0000000000..32449c963b Binary files /dev/null and b/docs/localized/KR/docs/images/visual-observation-debug.png differ diff --git a/docs/localized/KR/docs/images/visual-observation-rawimage.png b/docs/localized/KR/docs/images/visual-observation-rawimage.png new file mode 100644 index 0000000000..03142985aa Binary files /dev/null and b/docs/localized/KR/docs/images/visual-observation-rawimage.png differ diff --git a/docs/localized/KR/docs/images/visual-observation-rendertexture.png b/docs/localized/KR/docs/images/visual-observation-rendertexture.png new file mode 100644 index 0000000000..d2f8c7f662 Binary files /dev/null and b/docs/localized/KR/docs/images/visual-observation-rendertexture.png differ diff --git a/docs/localized/KR/docs/images/visual-observation.png b/docs/localized/KR/docs/images/visual-observation.png new file mode 100644 index 0000000000..bfc3144049 Binary files /dev/null and b/docs/localized/KR/docs/images/visual-observation.png differ diff --git a/docs/localized/KR/docs/images/walker.png b/docs/localized/KR/docs/images/walker.png new file mode 100644 
index 0000000000..af901fa943 Binary files /dev/null and b/docs/localized/KR/docs/images/walker.png differ diff --git a/docs/localized/KR/docs/images/wall.png b/docs/localized/KR/docs/images/wall.png new file mode 100644 index 0000000000..41430ca279 Binary files /dev/null and b/docs/localized/KR/docs/images/wall.png differ diff --git a/docs/localized/zh-CN/README.md b/docs/localized/zh-CN/README.md index 0d38124091..20ef6a4cdd 100755 --- a/docs/localized/zh-CN/README.md +++ b/docs/localized/zh-CN/README.md @@ -1,6 +1,6 @@ -# Unity ML-Agents 工具包(Beta) +# Unity ML-Agents 工具包(Beta) v0.3.1 **注意:** 本文档为v0.3版本文档的部分翻译版,目前并不会随着英文版文档更新而更新。若要查看更新更全的英文版文档,请查看[这里](https://github.com/Unity-Technologies/ml-agents)。 diff --git a/gym-unity/gym_unity/envs/unity_env.py b/gym-unity/gym_unity/envs/unity_env.py index 86079798a8..1cece2bb12 100755 --- a/gym-unity/gym_unity/envs/unity_env.py +++ b/gym-unity/gym_unity/envs/unity_env.py @@ -28,13 +28,13 @@ class UnityEnv(gym.Env): def __init__( self, environment_filename: str, - worker_id=0, - use_visual=False, - uint8_visual=False, - multiagent=False, - flatten_branched=False, - no_graphics=False, - allow_multiple_visual_obs=False, + worker_id: int = 0, + use_visual: bool = False, + uint8_visual: bool = False, + multiagent: bool = False, + flatten_branched: bool = False, + no_graphics: bool = False, + allow_multiple_visual_obs: bool = False, ): """ Environment initialization @@ -43,7 +43,8 @@ def __init__( :param use_visual: Whether to use visual observation or vector observation. :param uint8_visual: Return visual observations as uint8 (0-255) matrices instead of float (0.0-1.0). :param multiagent: Whether to run in multi-agent mode (lists of obs, reward, done). - :param flatten_branched: If True, turn branched discrete action spaces into a Discrete space rather than MultiDiscrete. + :param flatten_branched: If True, turn branched discrete action spaces into a Discrete space rather than + MultiDiscrete. :param no_graphics: Whether to run the Unity simulator in no-graphics mode :param allow_multiple_visual_obs: If True, return a list of visual observations instead of only one. 
""" @@ -218,16 +219,14 @@ def step(self, action): def _single_step(self, info): if self.use_visual: visual_obs = info.visual_observations - if isinstance(visual_obs, list): - visual_obs = np.array(visual_obs) if self._allow_multiple_visual_obs: visual_obs_list = [] for obs in visual_obs: - visual_obs_list.append(self._preprocess_single(obs[0, :, :, :])) + visual_obs_list.append(self._preprocess_single(obs[0])) self.visual_obs = visual_obs_list else: - self.visual_obs = self._preprocess_single(visual_obs[0][0, :, :, :]) + self.visual_obs = self._preprocess_single(visual_obs[0][0]) default_observation = self.visual_obs else: diff --git a/gym-unity/setup.py b/gym-unity/setup.py index 92a6bbdc4d..f5ccf499c3 100755 --- a/gym-unity/setup.py +++ b/gym-unity/setup.py @@ -4,12 +4,12 @@ setup( name="gym_unity", - version="0.4.2", + version="0.4.3", description="Unity Machine Learning Agents Gym Interface", license="Apache License 2.0", author="Unity Technologies", author_email="ML-Agents@unity3d.com", url="https://github.com/Unity-Technologies/ml-agents", packages=find_packages(), - install_requires=["gym", "mlagents_envs==0.8.2"], + install_requires=["gym", "mlagents_envs==0.9.0"], ) diff --git a/ml-agents-envs/mlagents/envs/__init__.py b/ml-agents-envs/mlagents/envs/__init__.py index 9cac8d1f0b..566aa3c957 100644 --- a/ml-agents-envs/mlagents/envs/__init__.py +++ b/ml-agents-envs/mlagents/envs/__init__.py @@ -1,3 +1,5 @@ -from .brain import * +from .brain import AllBrainInfo, BrainInfo, BrainParameters +from .action_info import ActionInfo, ActionInfoOutputs +from .policy import Policy from .environment import * from .exception import * diff --git a/ml-agents/mlagents/trainers/action_info.py b/ml-agents-envs/mlagents/envs/action_info.py similarity index 65% rename from ml-agents/mlagents/trainers/action_info.py rename to ml-agents-envs/mlagents/envs/action_info.py index a7c36c53bb..f6bd4561fc 100644 --- a/ml-agents/mlagents/trainers/action_info.py +++ b/ml-agents-envs/mlagents/envs/action_info.py @@ -1,9 +1,11 @@ from typing import NamedTuple, Any, Dict, Optional +ActionInfoOutputs = Optional[Dict[str, Any]] + class ActionInfo(NamedTuple): action: Any memory: Any text: Any value: Any - outputs: Optional[Dict[str, Any]] + outputs: ActionInfoOutputs diff --git a/ml-agents-envs/mlagents/envs/base_unity_environment.py b/ml-agents-envs/mlagents/envs/base_unity_environment.py index ff6caf5ffc..d24a460a8b 100644 --- a/ml-agents-envs/mlagents/envs/base_unity_environment.py +++ b/ml-agents-envs/mlagents/envs/base_unity_environment.py @@ -12,7 +12,9 @@ def step( pass @abstractmethod - def reset(self, config=None, train_mode=True) -> AllBrainInfo: + def reset( + self, config=None, train_mode=True, custom_reset_parameters=None + ) -> AllBrainInfo: pass @property @@ -27,7 +29,7 @@ def external_brains(self) -> Dict[str, BrainParameters]: @property @abstractmethod - def reset_parameters(self) -> Dict[str, str]: + def reset_parameters(self) -> Dict[str, float]: pass @abstractmethod diff --git a/ml-agents-envs/mlagents/envs/brain.py b/ml-agents-envs/mlagents/envs/brain.py index 816dd13ac9..4fdd90a38a 100644 --- a/ml-agents-envs/mlagents/envs/brain.py +++ b/ml-agents-envs/mlagents/envs/brain.py @@ -84,7 +84,7 @@ def merge_memories(m1, m2, agents1, agents2): return np.append(m1, m2, axis=0) @staticmethod - def process_pixels(image_bytes, gray_scale): + def process_pixels(image_bytes: bytes, gray_scale: bool) -> np.ndarray: """ Converts byte array observation image into numpy array, re-sizes it, and optionally 
converts it to grey scale @@ -92,8 +92,8 @@ def process_pixels(image_bytes, gray_scale): :param image_bytes: input byte array corresponding to image :return: processed numpy array of observation from environment """ - s = bytearray(image_bytes) - image = Image.open(io.BytesIO(s)) + image_bytearray = bytearray(image_bytes) + image = Image.open(io.BytesIO(image_bytearray)) s = np.array(image) / 255.0 if gray_scale: s = np.mean(s, axis=2) @@ -101,11 +101,11 @@ def process_pixels(image_bytes, gray_scale): return s @staticmethod - def from_agent_proto(agent_info_list, brain_params): + def from_agent_proto(worker_id: int, agent_info_list, brain_params): """ Converts list of agent infos to BrainInfo. """ - vis_obs = [] + vis_obs: List[np.ndarray] = [] for i in range(brain_params.number_visual_observations): obs = [ BrainInfo.process_pixels( @@ -157,13 +157,14 @@ def from_agent_proto(agent_info_list, brain_params): vector_obs = np.nan_to_num( np.array([x.stacked_vector_observation for x in agent_info_list]) ) + agents = [f"${worker_id}-{x.id}" for x in agent_info_list] brain_info = BrainInfo( visual_observation=vis_obs, vector_observation=vector_obs, text_observations=[x.text_observation for x in agent_info_list], memory=memory, reward=[x.reward if not np.isnan(x.reward) else 0 for x in agent_info_list], - agents=[x.id for x in agent_info_list], + agents=agents, local_done=[x.done for x in agent_info_list], vector_action=np.array([x.stored_vector_actions for x in agent_info_list]), text_action=[list(x.stored_text_actions) for x in agent_info_list], @@ -174,17 +175,19 @@ def from_agent_proto(agent_info_list, brain_params): return brain_info -def safe_concat_lists(l1: Optional[List], l2: Optional[List]): - if l1 is None and l2 is None: - return None - if l1 is None and l2 is not None: - return l2.copy() - if l1 is not None and l2 is None: - return l1.copy() +def safe_concat_lists(l1: Optional[List], l2: Optional[List]) -> Optional[List]: + if l1 is None: + if l2 is None: + return None + else: + return l2.copy() else: - copy = l1.copy() - copy.extend(l2) - return copy + if l2 is None: + return l1.copy() + else: + copy = l1.copy() + copy.extend(l2) + return copy def safe_concat_np_ndarray(a1: Optional[np.ndarray], a2: Optional[np.ndarray]): diff --git a/ml-agents-envs/mlagents/envs/communicator.py b/ml-agents-envs/mlagents/envs/communicator.py index 220c5487d2..393403c260 100644 --- a/ml-agents-envs/mlagents/envs/communicator.py +++ b/ml-agents-envs/mlagents/envs/communicator.py @@ -1,4 +1,5 @@ import logging +from typing import Optional from .communicator_objects import UnityOutput, UnityInput @@ -21,7 +22,7 @@ def initialize(self, inputs: UnityInput) -> UnityOutput: :return: UnityOutput: The initialization output sent by Unity """ - def exchange(self, inputs: UnityInput) -> UnityOutput: + def exchange(self, inputs: UnityInput) -> Optional[UnityOutput]: """ Used to send an input and receive an output from the Environment :param inputs: The UnityInput that needs to be sent the Environment diff --git a/ml-agents-envs/mlagents/envs/communicator_objects/agent_action_proto_pb2.pyi b/ml-agents-envs/mlagents/envs/communicator_objects/agent_action_proto_pb2.pyi new file mode 100644 index 0000000000..4df09226eb --- /dev/null +++ b/ml-agents-envs/mlagents/envs/communicator_objects/agent_action_proto_pb2.pyi @@ -0,0 +1,51 @@ +# @generated by generate_proto_mypy_stubs.py. Do not edit! 
+import sys +from google.protobuf.internal.containers import ( + RepeatedScalarFieldContainer as google___protobuf___internal___containers___RepeatedScalarFieldContainer, +) + +from google.protobuf.message import ( + Message as google___protobuf___message___Message, +) + +from mlagents.envs.communicator_objects.custom_action_pb2 import ( + CustomAction as mlagents___envs___communicator_objects___custom_action_pb2___CustomAction, +) + +from typing import ( + Iterable as typing___Iterable, + Optional as typing___Optional, + Text as typing___Text, +) + +from typing_extensions import ( + Literal as typing_extensions___Literal, +) + + +class AgentActionProto(google___protobuf___message___Message): + vector_actions = ... # type: google___protobuf___internal___containers___RepeatedScalarFieldContainer[float] + text_actions = ... # type: typing___Text + memories = ... # type: google___protobuf___internal___containers___RepeatedScalarFieldContainer[float] + value = ... # type: float + + @property + def custom_action(self) -> mlagents___envs___communicator_objects___custom_action_pb2___CustomAction: ... + + def __init__(self, + vector_actions : typing___Optional[typing___Iterable[float]] = None, + text_actions : typing___Optional[typing___Text] = None, + memories : typing___Optional[typing___Iterable[float]] = None, + value : typing___Optional[float] = None, + custom_action : typing___Optional[mlagents___envs___communicator_objects___custom_action_pb2___CustomAction] = None, + ) -> None: ... + @classmethod + def FromString(cls, s: bytes) -> AgentActionProto: ... + def MergeFrom(self, other_msg: google___protobuf___message___Message) -> None: ... + def CopyFrom(self, other_msg: google___protobuf___message___Message) -> None: ... + if sys.version_info >= (3,): + def HasField(self, field_name: typing_extensions___Literal[u"custom_action"]) -> bool: ... + def ClearField(self, field_name: typing_extensions___Literal[u"custom_action",u"memories",u"text_actions",u"value",u"vector_actions"]) -> None: ... + else: + def HasField(self, field_name: typing_extensions___Literal[u"custom_action",b"custom_action"]) -> bool: ... + def ClearField(self, field_name: typing_extensions___Literal[b"custom_action",b"memories",b"text_actions",b"value",b"vector_actions"]) -> None: ... diff --git a/ml-agents-envs/mlagents/envs/communicator_objects/agent_info_proto_pb2.pyi b/ml-agents-envs/mlagents/envs/communicator_objects/agent_info_proto_pb2.pyi new file mode 100644 index 0000000000..595aa70ef5 --- /dev/null +++ b/ml-agents-envs/mlagents/envs/communicator_objects/agent_info_proto_pb2.pyi @@ -0,0 +1,65 @@ +# @generated by generate_proto_mypy_stubs.py. Do not edit! +import sys +from google.protobuf.internal.containers import ( + RepeatedScalarFieldContainer as google___protobuf___internal___containers___RepeatedScalarFieldContainer, +) + +from google.protobuf.message import ( + Message as google___protobuf___message___Message, +) + +from mlagents.envs.communicator_objects.custom_observation_pb2 import ( + CustomObservation as mlagents___envs___communicator_objects___custom_observation_pb2___CustomObservation, +) + +from typing import ( + Iterable as typing___Iterable, + Optional as typing___Optional, + Text as typing___Text, +) + +from typing_extensions import ( + Literal as typing_extensions___Literal, +) + + +class AgentInfoProto(google___protobuf___message___Message): + stacked_vector_observation = ... # type: google___protobuf___internal___containers___RepeatedScalarFieldContainer[float] + visual_observations = ... 
# type: google___protobuf___internal___containers___RepeatedScalarFieldContainer[bytes] + text_observation = ... # type: typing___Text + stored_vector_actions = ... # type: google___protobuf___internal___containers___RepeatedScalarFieldContainer[float] + stored_text_actions = ... # type: typing___Text + memories = ... # type: google___protobuf___internal___containers___RepeatedScalarFieldContainer[float] + reward = ... # type: float + done = ... # type: bool + max_step_reached = ... # type: bool + id = ... # type: int + action_mask = ... # type: google___protobuf___internal___containers___RepeatedScalarFieldContainer[bool] + + @property + def custom_observation(self) -> mlagents___envs___communicator_objects___custom_observation_pb2___CustomObservation: ... + + def __init__(self, + stacked_vector_observation : typing___Optional[typing___Iterable[float]] = None, + visual_observations : typing___Optional[typing___Iterable[bytes]] = None, + text_observation : typing___Optional[typing___Text] = None, + stored_vector_actions : typing___Optional[typing___Iterable[float]] = None, + stored_text_actions : typing___Optional[typing___Text] = None, + memories : typing___Optional[typing___Iterable[float]] = None, + reward : typing___Optional[float] = None, + done : typing___Optional[bool] = None, + max_step_reached : typing___Optional[bool] = None, + id : typing___Optional[int] = None, + action_mask : typing___Optional[typing___Iterable[bool]] = None, + custom_observation : typing___Optional[mlagents___envs___communicator_objects___custom_observation_pb2___CustomObservation] = None, + ) -> None: ... + @classmethod + def FromString(cls, s: bytes) -> AgentInfoProto: ... + def MergeFrom(self, other_msg: google___protobuf___message___Message) -> None: ... + def CopyFrom(self, other_msg: google___protobuf___message___Message) -> None: ... + if sys.version_info >= (3,): + def HasField(self, field_name: typing_extensions___Literal[u"custom_observation"]) -> bool: ... + def ClearField(self, field_name: typing_extensions___Literal[u"action_mask",u"custom_observation",u"done",u"id",u"max_step_reached",u"memories",u"reward",u"stacked_vector_observation",u"stored_text_actions",u"stored_vector_actions",u"text_observation",u"visual_observations"]) -> None: ... + else: + def HasField(self, field_name: typing_extensions___Literal[u"custom_observation",b"custom_observation"]) -> bool: ... + def ClearField(self, field_name: typing_extensions___Literal[b"action_mask",b"custom_observation",b"done",b"id",b"max_step_reached",b"memories",b"reward",b"stacked_vector_observation",b"stored_text_actions",b"stored_vector_actions",b"text_observation",b"visual_observations"]) -> None: ... diff --git a/ml-agents-envs/mlagents/envs/communicator_objects/brain_parameters_proto_pb2.pyi b/ml-agents-envs/mlagents/envs/communicator_objects/brain_parameters_proto_pb2.pyi new file mode 100644 index 0000000000..ce3d0da775 --- /dev/null +++ b/ml-agents-envs/mlagents/envs/communicator_objects/brain_parameters_proto_pb2.pyi @@ -0,0 +1,60 @@ +# @generated by generate_proto_mypy_stubs.py. Do not edit! 
+import sys +from google.protobuf.internal.containers import ( + RepeatedCompositeFieldContainer as google___protobuf___internal___containers___RepeatedCompositeFieldContainer, + RepeatedScalarFieldContainer as google___protobuf___internal___containers___RepeatedScalarFieldContainer, +) + +from google.protobuf.message import ( + Message as google___protobuf___message___Message, +) + +from mlagents.envs.communicator_objects.resolution_proto_pb2 import ( + ResolutionProto as mlagents___envs___communicator_objects___resolution_proto_pb2___ResolutionProto, +) + +from mlagents.envs.communicator_objects.space_type_proto_pb2 import ( + SpaceTypeProto as mlagents___envs___communicator_objects___space_type_proto_pb2___SpaceTypeProto, +) + +from typing import ( + Iterable as typing___Iterable, + Optional as typing___Optional, + Text as typing___Text, +) + +from typing_extensions import ( + Literal as typing_extensions___Literal, +) + + +class BrainParametersProto(google___protobuf___message___Message): + vector_observation_size = ... # type: int + num_stacked_vector_observations = ... # type: int + vector_action_size = ... # type: google___protobuf___internal___containers___RepeatedScalarFieldContainer[int] + vector_action_descriptions = ... # type: google___protobuf___internal___containers___RepeatedScalarFieldContainer[typing___Text] + vector_action_space_type = ... # type: mlagents___envs___communicator_objects___space_type_proto_pb2___SpaceTypeProto + brain_name = ... # type: typing___Text + is_training = ... # type: bool + + @property + def camera_resolutions(self) -> google___protobuf___internal___containers___RepeatedCompositeFieldContainer[mlagents___envs___communicator_objects___resolution_proto_pb2___ResolutionProto]: ... + + def __init__(self, + vector_observation_size : typing___Optional[int] = None, + num_stacked_vector_observations : typing___Optional[int] = None, + vector_action_size : typing___Optional[typing___Iterable[int]] = None, + camera_resolutions : typing___Optional[typing___Iterable[mlagents___envs___communicator_objects___resolution_proto_pb2___ResolutionProto]] = None, + vector_action_descriptions : typing___Optional[typing___Iterable[typing___Text]] = None, + vector_action_space_type : typing___Optional[mlagents___envs___communicator_objects___space_type_proto_pb2___SpaceTypeProto] = None, + brain_name : typing___Optional[typing___Text] = None, + is_training : typing___Optional[bool] = None, + ) -> None: ... + @classmethod + def FromString(cls, s: bytes) -> BrainParametersProto: ... + def MergeFrom(self, other_msg: google___protobuf___message___Message) -> None: ... + def CopyFrom(self, other_msg: google___protobuf___message___Message) -> None: ... + if sys.version_info >= (3,): + def ClearField(self, field_name: typing_extensions___Literal[u"brain_name",u"camera_resolutions",u"is_training",u"num_stacked_vector_observations",u"vector_action_descriptions",u"vector_action_size",u"vector_action_space_type",u"vector_observation_size"]) -> None: ... + else: + def ClearField(self, field_name: typing_extensions___Literal[b"brain_name",b"camera_resolutions",b"is_training",b"num_stacked_vector_observations",b"vector_action_descriptions",b"vector_action_size",b"vector_action_space_type",b"vector_observation_size"]) -> None: ... 
diff --git a/ml-agents-envs/mlagents/envs/communicator_objects/command_proto_pb2.pyi b/ml-agents-envs/mlagents/envs/communicator_objects/command_proto_pb2.pyi new file mode 100644 index 0000000000..7672e0ac55 --- /dev/null +++ b/ml-agents-envs/mlagents/envs/communicator_objects/command_proto_pb2.pyi @@ -0,0 +1,32 @@ +# @generated by generate_proto_mypy_stubs.py. Do not edit! +import sys +from google.protobuf.descriptor import ( + EnumDescriptor as google___protobuf___descriptor___EnumDescriptor, +) + +from google.protobuf.message import ( + Message as google___protobuf___message___Message, +) + +from typing import ( + List as typing___List, + Tuple as typing___Tuple, + cast as typing___cast, +) + + +class CommandProto(int): + DESCRIPTOR: google___protobuf___descriptor___EnumDescriptor = ... + @classmethod + def Name(cls, number: int) -> str: ... + @classmethod + def Value(cls, name: str) -> CommandProto: ... + @classmethod + def keys(cls) -> typing___List[str]: ... + @classmethod + def values(cls) -> typing___List[CommandProto]: ... + @classmethod + def items(cls) -> typing___List[typing___Tuple[str, CommandProto]]: ... +STEP = typing___cast(CommandProto, 0) +RESET = typing___cast(CommandProto, 1) +QUIT = typing___cast(CommandProto, 2) diff --git a/ml-agents-envs/mlagents/envs/communicator_objects/custom_action_pb2.pyi b/ml-agents-envs/mlagents/envs/communicator_objects/custom_action_pb2.pyi new file mode 100644 index 0000000000..0c328c343b --- /dev/null +++ b/ml-agents-envs/mlagents/envs/communicator_objects/custom_action_pb2.pyi @@ -0,0 +1,15 @@ +# @generated by generate_proto_mypy_stubs.py. Do not edit! +import sys +from google.protobuf.message import ( + Message as google___protobuf___message___Message, +) + + +class CustomAction(google___protobuf___message___Message): + + def __init__(self, + ) -> None: ... + @classmethod + def FromString(cls, s: bytes) -> CustomAction: ... + def MergeFrom(self, other_msg: google___protobuf___message___Message) -> None: ... + def CopyFrom(self, other_msg: google___protobuf___message___Message) -> None: ... diff --git a/ml-agents-envs/mlagents/envs/communicator_objects/custom_observation_pb2.pyi b/ml-agents-envs/mlagents/envs/communicator_objects/custom_observation_pb2.pyi new file mode 100644 index 0000000000..ccab14e3f2 --- /dev/null +++ b/ml-agents-envs/mlagents/envs/communicator_objects/custom_observation_pb2.pyi @@ -0,0 +1,15 @@ +# @generated by generate_proto_mypy_stubs.py. Do not edit! +import sys +from google.protobuf.message import ( + Message as google___protobuf___message___Message, +) + + +class CustomObservation(google___protobuf___message___Message): + + def __init__(self, + ) -> None: ... + @classmethod + def FromString(cls, s: bytes) -> CustomObservation: ... + def MergeFrom(self, other_msg: google___protobuf___message___Message) -> None: ... + def CopyFrom(self, other_msg: google___protobuf___message___Message) -> None: ... diff --git a/ml-agents-envs/mlagents/envs/communicator_objects/custom_reset_parameters_pb2.pyi b/ml-agents-envs/mlagents/envs/communicator_objects/custom_reset_parameters_pb2.pyi new file mode 100644 index 0000000000..26a1da540d --- /dev/null +++ b/ml-agents-envs/mlagents/envs/communicator_objects/custom_reset_parameters_pb2.pyi @@ -0,0 +1,15 @@ +# @generated by generate_proto_mypy_stubs.py. Do not edit! 
+import sys +from google.protobuf.message import ( + Message as google___protobuf___message___Message, +) + + +class CustomResetParameters(google___protobuf___message___Message): + + def __init__(self, + ) -> None: ... + @classmethod + def FromString(cls, s: bytes) -> CustomResetParameters: ... + def MergeFrom(self, other_msg: google___protobuf___message___Message) -> None: ... + def CopyFrom(self, other_msg: google___protobuf___message___Message) -> None: ... diff --git a/ml-agents-envs/mlagents/envs/communicator_objects/demonstration_meta_proto_pb2.pyi b/ml-agents-envs/mlagents/envs/communicator_objects/demonstration_meta_proto_pb2.pyi new file mode 100644 index 0000000000..c905e9927d --- /dev/null +++ b/ml-agents-envs/mlagents/envs/communicator_objects/demonstration_meta_proto_pb2.pyi @@ -0,0 +1,38 @@ +# @generated by generate_proto_mypy_stubs.py. Do not edit! +import sys +from google.protobuf.message import ( + Message as google___protobuf___message___Message, +) + +from typing import ( + Optional as typing___Optional, + Text as typing___Text, +) + +from typing_extensions import ( + Literal as typing_extensions___Literal, +) + + +class DemonstrationMetaProto(google___protobuf___message___Message): + api_version = ... # type: int + demonstration_name = ... # type: typing___Text + number_steps = ... # type: int + number_episodes = ... # type: int + mean_reward = ... # type: float + + def __init__(self, + api_version : typing___Optional[int] = None, + demonstration_name : typing___Optional[typing___Text] = None, + number_steps : typing___Optional[int] = None, + number_episodes : typing___Optional[int] = None, + mean_reward : typing___Optional[float] = None, + ) -> None: ... + @classmethod + def FromString(cls, s: bytes) -> DemonstrationMetaProto: ... + def MergeFrom(self, other_msg: google___protobuf___message___Message) -> None: ... + def CopyFrom(self, other_msg: google___protobuf___message___Message) -> None: ... + if sys.version_info >= (3,): + def ClearField(self, field_name: typing_extensions___Literal[u"api_version",u"demonstration_name",u"mean_reward",u"number_episodes",u"number_steps"]) -> None: ... + else: + def ClearField(self, field_name: typing_extensions___Literal[b"api_version",b"demonstration_name",b"mean_reward",b"number_episodes",b"number_steps"]) -> None: ... diff --git a/ml-agents-envs/mlagents/envs/communicator_objects/engine_configuration_proto_pb2.pyi b/ml-agents-envs/mlagents/envs/communicator_objects/engine_configuration_proto_pb2.pyi new file mode 100644 index 0000000000..5d4220fbdc --- /dev/null +++ b/ml-agents-envs/mlagents/envs/communicator_objects/engine_configuration_proto_pb2.pyi @@ -0,0 +1,39 @@ +# @generated by generate_proto_mypy_stubs.py. Do not edit! +import sys +from google.protobuf.message import ( + Message as google___protobuf___message___Message, +) + +from typing import ( + Optional as typing___Optional, +) + +from typing_extensions import ( + Literal as typing_extensions___Literal, +) + + +class EngineConfigurationProto(google___protobuf___message___Message): + width = ... # type: int + height = ... # type: int + quality_level = ... # type: int + time_scale = ... # type: float + target_frame_rate = ... # type: int + show_monitor = ... 
# type: bool + + def __init__(self, + width : typing___Optional[int] = None, + height : typing___Optional[int] = None, + quality_level : typing___Optional[int] = None, + time_scale : typing___Optional[float] = None, + target_frame_rate : typing___Optional[int] = None, + show_monitor : typing___Optional[bool] = None, + ) -> None: ... + @classmethod + def FromString(cls, s: bytes) -> EngineConfigurationProto: ... + def MergeFrom(self, other_msg: google___protobuf___message___Message) -> None: ... + def CopyFrom(self, other_msg: google___protobuf___message___Message) -> None: ... + if sys.version_info >= (3,): + def ClearField(self, field_name: typing_extensions___Literal[u"height",u"quality_level",u"show_monitor",u"target_frame_rate",u"time_scale",u"width"]) -> None: ... + else: + def ClearField(self, field_name: typing_extensions___Literal[b"height",b"quality_level",b"show_monitor",b"target_frame_rate",b"time_scale",b"width"]) -> None: ... diff --git a/ml-agents-envs/mlagents/envs/communicator_objects/environment_parameters_proto_pb2.pyi b/ml-agents-envs/mlagents/envs/communicator_objects/environment_parameters_proto_pb2.pyi new file mode 100644 index 0000000000..daba639760 --- /dev/null +++ b/ml-agents-envs/mlagents/envs/communicator_objects/environment_parameters_proto_pb2.pyi @@ -0,0 +1,61 @@ +# @generated by generate_proto_mypy_stubs.py. Do not edit! +import sys +from google.protobuf.message import ( + Message as google___protobuf___message___Message, +) + +from mlagents.envs.communicator_objects.custom_reset_parameters_pb2 import ( + CustomResetParameters as mlagents___envs___communicator_objects___custom_reset_parameters_pb2___CustomResetParameters, +) + +from typing import ( + Mapping as typing___Mapping, + MutableMapping as typing___MutableMapping, + Optional as typing___Optional, + Text as typing___Text, +) + +from typing_extensions import ( + Literal as typing_extensions___Literal, +) + + +class EnvironmentParametersProto(google___protobuf___message___Message): + class FloatParametersEntry(google___protobuf___message___Message): + key = ... # type: typing___Text + value = ... # type: float + + def __init__(self, + key : typing___Optional[typing___Text] = None, + value : typing___Optional[float] = None, + ) -> None: ... + @classmethod + def FromString(cls, s: bytes) -> EnvironmentParametersProto.FloatParametersEntry: ... + def MergeFrom(self, other_msg: google___protobuf___message___Message) -> None: ... + def CopyFrom(self, other_msg: google___protobuf___message___Message) -> None: ... + if sys.version_info >= (3,): + def ClearField(self, field_name: typing_extensions___Literal[u"key",u"value"]) -> None: ... + else: + def ClearField(self, field_name: typing_extensions___Literal[b"key",b"value"]) -> None: ... + + + @property + def float_parameters(self) -> typing___MutableMapping[typing___Text, float]: ... + + @property + def custom_reset_parameters(self) -> mlagents___envs___communicator_objects___custom_reset_parameters_pb2___CustomResetParameters: ... + + def __init__(self, + float_parameters : typing___Optional[typing___Mapping[typing___Text, float]] = None, + custom_reset_parameters : typing___Optional[mlagents___envs___communicator_objects___custom_reset_parameters_pb2___CustomResetParameters] = None, + ) -> None: ... + @classmethod + def FromString(cls, s: bytes) -> EnvironmentParametersProto: ... + def MergeFrom(self, other_msg: google___protobuf___message___Message) -> None: ... + def CopyFrom(self, other_msg: google___protobuf___message___Message) -> None: ... 
+ if sys.version_info >= (3,): + def HasField(self, field_name: typing_extensions___Literal[u"custom_reset_parameters"]) -> bool: ... + def ClearField(self, field_name: typing_extensions___Literal[u"custom_reset_parameters",u"float_parameters"]) -> None: ... + else: + def HasField(self, field_name: typing_extensions___Literal[u"custom_reset_parameters",b"custom_reset_parameters"]) -> bool: ... + def ClearField(self, field_name: typing_extensions___Literal[b"custom_reset_parameters",b"float_parameters"]) -> None: ... diff --git a/ml-agents-envs/mlagents/envs/communicator_objects/header_pb2.pyi b/ml-agents-envs/mlagents/envs/communicator_objects/header_pb2.pyi new file mode 100644 index 0000000000..3852336713 --- /dev/null +++ b/ml-agents-envs/mlagents/envs/communicator_objects/header_pb2.pyi @@ -0,0 +1,32 @@ +# @generated by generate_proto_mypy_stubs.py. Do not edit! +import sys +from google.protobuf.message import ( + Message as google___protobuf___message___Message, +) + +from typing import ( + Optional as typing___Optional, + Text as typing___Text, +) + +from typing_extensions import ( + Literal as typing_extensions___Literal, +) + + +class Header(google___protobuf___message___Message): + status = ... # type: int + message = ... # type: typing___Text + + def __init__(self, + status : typing___Optional[int] = None, + message : typing___Optional[typing___Text] = None, + ) -> None: ... + @classmethod + def FromString(cls, s: bytes) -> Header: ... + def MergeFrom(self, other_msg: google___protobuf___message___Message) -> None: ... + def CopyFrom(self, other_msg: google___protobuf___message___Message) -> None: ... + if sys.version_info >= (3,): + def ClearField(self, field_name: typing_extensions___Literal[u"message",u"status"]) -> None: ... + else: + def ClearField(self, field_name: typing_extensions___Literal[b"message",b"status"]) -> None: ... diff --git a/ml-agents-envs/mlagents/envs/communicator_objects/resolution_proto_pb2.pyi b/ml-agents-envs/mlagents/envs/communicator_objects/resolution_proto_pb2.pyi new file mode 100644 index 0000000000..a9068d4833 --- /dev/null +++ b/ml-agents-envs/mlagents/envs/communicator_objects/resolution_proto_pb2.pyi @@ -0,0 +1,33 @@ +# @generated by generate_proto_mypy_stubs.py. Do not edit! +import sys +from google.protobuf.message import ( + Message as google___protobuf___message___Message, +) + +from typing import ( + Optional as typing___Optional, +) + +from typing_extensions import ( + Literal as typing_extensions___Literal, +) + + +class ResolutionProto(google___protobuf___message___Message): + width = ... # type: int + height = ... # type: int + gray_scale = ... # type: bool + + def __init__(self, + width : typing___Optional[int] = None, + height : typing___Optional[int] = None, + gray_scale : typing___Optional[bool] = None, + ) -> None: ... + @classmethod + def FromString(cls, s: bytes) -> ResolutionProto: ... + def MergeFrom(self, other_msg: google___protobuf___message___Message) -> None: ... + def CopyFrom(self, other_msg: google___protobuf___message___Message) -> None: ... + if sys.version_info >= (3,): + def ClearField(self, field_name: typing_extensions___Literal[u"gray_scale",u"height",u"width"]) -> None: ... + else: + def ClearField(self, field_name: typing_extensions___Literal[b"gray_scale",b"height",b"width"]) -> None: ... 
diff --git a/ml-agents-envs/mlagents/envs/communicator_objects/space_type_proto_pb2.pyi b/ml-agents-envs/mlagents/envs/communicator_objects/space_type_proto_pb2.pyi new file mode 100644 index 0000000000..8dae72559b --- /dev/null +++ b/ml-agents-envs/mlagents/envs/communicator_objects/space_type_proto_pb2.pyi @@ -0,0 +1,31 @@ +# @generated by generate_proto_mypy_stubs.py. Do not edit! +import sys +from google.protobuf.descriptor import ( + EnumDescriptor as google___protobuf___descriptor___EnumDescriptor, +) + +from google.protobuf.message import ( + Message as google___protobuf___message___Message, +) + +from typing import ( + List as typing___List, + Tuple as typing___Tuple, + cast as typing___cast, +) + + +class SpaceTypeProto(int): + DESCRIPTOR: google___protobuf___descriptor___EnumDescriptor = ... + @classmethod + def Name(cls, number: int) -> str: ... + @classmethod + def Value(cls, name: str) -> SpaceTypeProto: ... + @classmethod + def keys(cls) -> typing___List[str]: ... + @classmethod + def values(cls) -> typing___List[SpaceTypeProto]: ... + @classmethod + def items(cls) -> typing___List[typing___Tuple[str, SpaceTypeProto]]: ... +discrete = typing___cast(SpaceTypeProto, 0) +continuous = typing___cast(SpaceTypeProto, 1) diff --git a/ml-agents-envs/mlagents/envs/communicator_objects/unity_input_pb2.pyi b/ml-agents-envs/mlagents/envs/communicator_objects/unity_input_pb2.pyi new file mode 100644 index 0000000000..372ebce4cc --- /dev/null +++ b/ml-agents-envs/mlagents/envs/communicator_objects/unity_input_pb2.pyi @@ -0,0 +1,45 @@ +# @generated by generate_proto_mypy_stubs.py. Do not edit! +import sys +from google.protobuf.message import ( + Message as google___protobuf___message___Message, +) + +from mlagents.envs.communicator_objects.unity_rl_initialization_input_pb2 import ( + UnityRLInitializationInput as mlagents___envs___communicator_objects___unity_rl_initialization_input_pb2___UnityRLInitializationInput, +) + +from mlagents.envs.communicator_objects.unity_rl_input_pb2 import ( + UnityRLInput as mlagents___envs___communicator_objects___unity_rl_input_pb2___UnityRLInput, +) + +from typing import ( + Optional as typing___Optional, +) + +from typing_extensions import ( + Literal as typing_extensions___Literal, +) + + +class UnityInput(google___protobuf___message___Message): + + @property + def rl_input(self) -> mlagents___envs___communicator_objects___unity_rl_input_pb2___UnityRLInput: ... + + @property + def rl_initialization_input(self) -> mlagents___envs___communicator_objects___unity_rl_initialization_input_pb2___UnityRLInitializationInput: ... + + def __init__(self, + rl_input : typing___Optional[mlagents___envs___communicator_objects___unity_rl_input_pb2___UnityRLInput] = None, + rl_initialization_input : typing___Optional[mlagents___envs___communicator_objects___unity_rl_initialization_input_pb2___UnityRLInitializationInput] = None, + ) -> None: ... + @classmethod + def FromString(cls, s: bytes) -> UnityInput: ... + def MergeFrom(self, other_msg: google___protobuf___message___Message) -> None: ... + def CopyFrom(self, other_msg: google___protobuf___message___Message) -> None: ... + if sys.version_info >= (3,): + def HasField(self, field_name: typing_extensions___Literal[u"rl_initialization_input",u"rl_input"]) -> bool: ... + def ClearField(self, field_name: typing_extensions___Literal[u"rl_initialization_input",u"rl_input"]) -> None: ... 
+ else: + def HasField(self, field_name: typing_extensions___Literal[u"rl_initialization_input",b"rl_initialization_input",u"rl_input",b"rl_input"]) -> bool: ... + def ClearField(self, field_name: typing_extensions___Literal[b"rl_initialization_input",b"rl_input"]) -> None: ... diff --git a/ml-agents-envs/mlagents/envs/communicator_objects/unity_message_pb2.pyi b/ml-agents-envs/mlagents/envs/communicator_objects/unity_message_pb2.pyi new file mode 100644 index 0000000000..04d221534b --- /dev/null +++ b/ml-agents-envs/mlagents/envs/communicator_objects/unity_message_pb2.pyi @@ -0,0 +1,53 @@ +# @generated by generate_proto_mypy_stubs.py. Do not edit! +import sys +from google.protobuf.message import ( + Message as google___protobuf___message___Message, +) + +from mlagents.envs.communicator_objects.header_pb2 import ( + Header as mlagents___envs___communicator_objects___header_pb2___Header, +) + +from mlagents.envs.communicator_objects.unity_input_pb2 import ( + UnityInput as mlagents___envs___communicator_objects___unity_input_pb2___UnityInput, +) + +from mlagents.envs.communicator_objects.unity_output_pb2 import ( + UnityOutput as mlagents___envs___communicator_objects___unity_output_pb2___UnityOutput, +) + +from typing import ( + Optional as typing___Optional, +) + +from typing_extensions import ( + Literal as typing_extensions___Literal, +) + + +class UnityMessage(google___protobuf___message___Message): + + @property + def header(self) -> mlagents___envs___communicator_objects___header_pb2___Header: ... + + @property + def unity_output(self) -> mlagents___envs___communicator_objects___unity_output_pb2___UnityOutput: ... + + @property + def unity_input(self) -> mlagents___envs___communicator_objects___unity_input_pb2___UnityInput: ... + + def __init__(self, + header : typing___Optional[mlagents___envs___communicator_objects___header_pb2___Header] = None, + unity_output : typing___Optional[mlagents___envs___communicator_objects___unity_output_pb2___UnityOutput] = None, + unity_input : typing___Optional[mlagents___envs___communicator_objects___unity_input_pb2___UnityInput] = None, + ) -> None: ... + @classmethod + def FromString(cls, s: bytes) -> UnityMessage: ... + def MergeFrom(self, other_msg: google___protobuf___message___Message) -> None: ... + def CopyFrom(self, other_msg: google___protobuf___message___Message) -> None: ... + if sys.version_info >= (3,): + def HasField(self, field_name: typing_extensions___Literal[u"header",u"unity_input",u"unity_output"]) -> bool: ... + def ClearField(self, field_name: typing_extensions___Literal[u"header",u"unity_input",u"unity_output"]) -> None: ... + else: + def HasField(self, field_name: typing_extensions___Literal[u"header",b"header",u"unity_input",b"unity_input",u"unity_output",b"unity_output"]) -> bool: ... + def ClearField(self, field_name: typing_extensions___Literal[b"header",b"unity_input",b"unity_output"]) -> None: ... diff --git a/ml-agents-envs/mlagents/envs/communicator_objects/unity_output_pb2.pyi b/ml-agents-envs/mlagents/envs/communicator_objects/unity_output_pb2.pyi new file mode 100644 index 0000000000..b7a66ff436 --- /dev/null +++ b/ml-agents-envs/mlagents/envs/communicator_objects/unity_output_pb2.pyi @@ -0,0 +1,45 @@ +# @generated by generate_proto_mypy_stubs.py. Do not edit! 
+import sys +from google.protobuf.message import ( + Message as google___protobuf___message___Message, +) + +from mlagents.envs.communicator_objects.unity_rl_initialization_output_pb2 import ( + UnityRLInitializationOutput as mlagents___envs___communicator_objects___unity_rl_initialization_output_pb2___UnityRLInitializationOutput, +) + +from mlagents.envs.communicator_objects.unity_rl_output_pb2 import ( + UnityRLOutput as mlagents___envs___communicator_objects___unity_rl_output_pb2___UnityRLOutput, +) + +from typing import ( + Optional as typing___Optional, +) + +from typing_extensions import ( + Literal as typing_extensions___Literal, +) + + +class UnityOutput(google___protobuf___message___Message): + + @property + def rl_output(self) -> mlagents___envs___communicator_objects___unity_rl_output_pb2___UnityRLOutput: ... + + @property + def rl_initialization_output(self) -> mlagents___envs___communicator_objects___unity_rl_initialization_output_pb2___UnityRLInitializationOutput: ... + + def __init__(self, + rl_output : typing___Optional[mlagents___envs___communicator_objects___unity_rl_output_pb2___UnityRLOutput] = None, + rl_initialization_output : typing___Optional[mlagents___envs___communicator_objects___unity_rl_initialization_output_pb2___UnityRLInitializationOutput] = None, + ) -> None: ... + @classmethod + def FromString(cls, s: bytes) -> UnityOutput: ... + def MergeFrom(self, other_msg: google___protobuf___message___Message) -> None: ... + def CopyFrom(self, other_msg: google___protobuf___message___Message) -> None: ... + if sys.version_info >= (3,): + def HasField(self, field_name: typing_extensions___Literal[u"rl_initialization_output",u"rl_output"]) -> bool: ... + def ClearField(self, field_name: typing_extensions___Literal[u"rl_initialization_output",u"rl_output"]) -> None: ... + else: + def HasField(self, field_name: typing_extensions___Literal[u"rl_initialization_output",b"rl_initialization_output",u"rl_output",b"rl_output"]) -> bool: ... + def ClearField(self, field_name: typing_extensions___Literal[b"rl_initialization_output",b"rl_output"]) -> None: ... diff --git a/ml-agents-envs/mlagents/envs/communicator_objects/unity_rl_initialization_input_pb2.pyi b/ml-agents-envs/mlagents/envs/communicator_objects/unity_rl_initialization_input_pb2.pyi new file mode 100644 index 0000000000..d85a1f54eb --- /dev/null +++ b/ml-agents-envs/mlagents/envs/communicator_objects/unity_rl_initialization_input_pb2.pyi @@ -0,0 +1,29 @@ +# @generated by generate_proto_mypy_stubs.py. Do not edit! +import sys +from google.protobuf.message import ( + Message as google___protobuf___message___Message, +) + +from typing import ( + Optional as typing___Optional, +) + +from typing_extensions import ( + Literal as typing_extensions___Literal, +) + + +class UnityRLInitializationInput(google___protobuf___message___Message): + seed = ... # type: int + + def __init__(self, + seed : typing___Optional[int] = None, + ) -> None: ... + @classmethod + def FromString(cls, s: bytes) -> UnityRLInitializationInput: ... + def MergeFrom(self, other_msg: google___protobuf___message___Message) -> None: ... + def CopyFrom(self, other_msg: google___protobuf___message___Message) -> None: ... + if sys.version_info >= (3,): + def ClearField(self, field_name: typing_extensions___Literal[u"seed"]) -> None: ... + else: + def ClearField(self, field_name: typing_extensions___Literal[b"seed"]) -> None: ... 
diff --git a/ml-agents-envs/mlagents/envs/communicator_objects/unity_rl_initialization_output_pb2.pyi b/ml-agents-envs/mlagents/envs/communicator_objects/unity_rl_initialization_output_pb2.pyi new file mode 100644 index 0000000000..6cccfeb9bb --- /dev/null +++ b/ml-agents-envs/mlagents/envs/communicator_objects/unity_rl_initialization_output_pb2.pyi @@ -0,0 +1,57 @@ +# @generated by generate_proto_mypy_stubs.py. Do not edit! +import sys +from google.protobuf.internal.containers import ( + RepeatedCompositeFieldContainer as google___protobuf___internal___containers___RepeatedCompositeFieldContainer, +) + +from google.protobuf.message import ( + Message as google___protobuf___message___Message, +) + +from mlagents.envs.communicator_objects.brain_parameters_proto_pb2 import ( + BrainParametersProto as mlagents___envs___communicator_objects___brain_parameters_proto_pb2___BrainParametersProto, +) + +from mlagents.envs.communicator_objects.environment_parameters_proto_pb2 import ( + EnvironmentParametersProto as mlagents___envs___communicator_objects___environment_parameters_proto_pb2___EnvironmentParametersProto, +) + +from typing import ( + Iterable as typing___Iterable, + Optional as typing___Optional, + Text as typing___Text, +) + +from typing_extensions import ( + Literal as typing_extensions___Literal, +) + + +class UnityRLInitializationOutput(google___protobuf___message___Message): + name = ... # type: typing___Text + version = ... # type: typing___Text + log_path = ... # type: typing___Text + + @property + def brain_parameters(self) -> google___protobuf___internal___containers___RepeatedCompositeFieldContainer[mlagents___envs___communicator_objects___brain_parameters_proto_pb2___BrainParametersProto]: ... + + @property + def environment_parameters(self) -> mlagents___envs___communicator_objects___environment_parameters_proto_pb2___EnvironmentParametersProto: ... + + def __init__(self, + name : typing___Optional[typing___Text] = None, + version : typing___Optional[typing___Text] = None, + log_path : typing___Optional[typing___Text] = None, + brain_parameters : typing___Optional[typing___Iterable[mlagents___envs___communicator_objects___brain_parameters_proto_pb2___BrainParametersProto]] = None, + environment_parameters : typing___Optional[mlagents___envs___communicator_objects___environment_parameters_proto_pb2___EnvironmentParametersProto] = None, + ) -> None: ... + @classmethod + def FromString(cls, s: bytes) -> UnityRLInitializationOutput: ... + def MergeFrom(self, other_msg: google___protobuf___message___Message) -> None: ... + def CopyFrom(self, other_msg: google___protobuf___message___Message) -> None: ... + if sys.version_info >= (3,): + def HasField(self, field_name: typing_extensions___Literal[u"environment_parameters"]) -> bool: ... + def ClearField(self, field_name: typing_extensions___Literal[u"brain_parameters",u"environment_parameters",u"log_path",u"name",u"version"]) -> None: ... + else: + def HasField(self, field_name: typing_extensions___Literal[u"environment_parameters",b"environment_parameters"]) -> bool: ... + def ClearField(self, field_name: typing_extensions___Literal[b"brain_parameters",b"environment_parameters",b"log_path",b"name",b"version"]) -> None: ... 
diff --git a/ml-agents-envs/mlagents/envs/communicator_objects/unity_rl_input_pb2.pyi b/ml-agents-envs/mlagents/envs/communicator_objects/unity_rl_input_pb2.pyi new file mode 100644 index 0000000000..1724000b4a --- /dev/null +++ b/ml-agents-envs/mlagents/envs/communicator_objects/unity_rl_input_pb2.pyi @@ -0,0 +1,99 @@ +# @generated by generate_proto_mypy_stubs.py. Do not edit! +import sys +from google.protobuf.internal.containers import ( + RepeatedCompositeFieldContainer as google___protobuf___internal___containers___RepeatedCompositeFieldContainer, +) + +from google.protobuf.message import ( + Message as google___protobuf___message___Message, +) + +from mlagents.envs.communicator_objects.agent_action_proto_pb2 import ( + AgentActionProto as mlagents___envs___communicator_objects___agent_action_proto_pb2___AgentActionProto, +) + +from mlagents.envs.communicator_objects.command_proto_pb2 import ( + CommandProto as mlagents___envs___communicator_objects___command_proto_pb2___CommandProto, +) + +from mlagents.envs.communicator_objects.environment_parameters_proto_pb2 import ( + EnvironmentParametersProto as mlagents___envs___communicator_objects___environment_parameters_proto_pb2___EnvironmentParametersProto, +) + +from typing import ( + Iterable as typing___Iterable, + Mapping as typing___Mapping, + MutableMapping as typing___MutableMapping, + Optional as typing___Optional, + Text as typing___Text, +) + +from typing_extensions import ( + Literal as typing_extensions___Literal, +) + + +class UnityRLInput(google___protobuf___message___Message): + class ListAgentActionProto(google___protobuf___message___Message): + + @property + def value(self) -> google___protobuf___internal___containers___RepeatedCompositeFieldContainer[mlagents___envs___communicator_objects___agent_action_proto_pb2___AgentActionProto]: ... + + def __init__(self, + value : typing___Optional[typing___Iterable[mlagents___envs___communicator_objects___agent_action_proto_pb2___AgentActionProto]] = None, + ) -> None: ... + @classmethod + def FromString(cls, s: bytes) -> UnityRLInput.ListAgentActionProto: ... + def MergeFrom(self, other_msg: google___protobuf___message___Message) -> None: ... + def CopyFrom(self, other_msg: google___protobuf___message___Message) -> None: ... + if sys.version_info >= (3,): + def ClearField(self, field_name: typing_extensions___Literal[u"value"]) -> None: ... + else: + def ClearField(self, field_name: typing_extensions___Literal[b"value"]) -> None: ... + + class AgentActionsEntry(google___protobuf___message___Message): + key = ... # type: typing___Text + + @property + def value(self) -> UnityRLInput.ListAgentActionProto: ... + + def __init__(self, + key : typing___Optional[typing___Text] = None, + value : typing___Optional[UnityRLInput.ListAgentActionProto] = None, + ) -> None: ... + @classmethod + def FromString(cls, s: bytes) -> UnityRLInput.AgentActionsEntry: ... + def MergeFrom(self, other_msg: google___protobuf___message___Message) -> None: ... + def CopyFrom(self, other_msg: google___protobuf___message___Message) -> None: ... + if sys.version_info >= (3,): + def HasField(self, field_name: typing_extensions___Literal[u"value"]) -> bool: ... + def ClearField(self, field_name: typing_extensions___Literal[u"key",u"value"]) -> None: ... + else: + def HasField(self, field_name: typing_extensions___Literal[u"value",b"value"]) -> bool: ... + def ClearField(self, field_name: typing_extensions___Literal[b"key",b"value"]) -> None: ... + + is_training = ... # type: bool + command = ... 
# type: mlagents___envs___communicator_objects___command_proto_pb2___CommandProto + + @property + def agent_actions(self) -> typing___MutableMapping[typing___Text, UnityRLInput.ListAgentActionProto]: ... + + @property + def environment_parameters(self) -> mlagents___envs___communicator_objects___environment_parameters_proto_pb2___EnvironmentParametersProto: ... + + def __init__(self, + agent_actions : typing___Optional[typing___Mapping[typing___Text, UnityRLInput.ListAgentActionProto]] = None, + environment_parameters : typing___Optional[mlagents___envs___communicator_objects___environment_parameters_proto_pb2___EnvironmentParametersProto] = None, + is_training : typing___Optional[bool] = None, + command : typing___Optional[mlagents___envs___communicator_objects___command_proto_pb2___CommandProto] = None, + ) -> None: ... + @classmethod + def FromString(cls, s: bytes) -> UnityRLInput: ... + def MergeFrom(self, other_msg: google___protobuf___message___Message) -> None: ... + def CopyFrom(self, other_msg: google___protobuf___message___Message) -> None: ... + if sys.version_info >= (3,): + def HasField(self, field_name: typing_extensions___Literal[u"environment_parameters"]) -> bool: ... + def ClearField(self, field_name: typing_extensions___Literal[u"agent_actions",u"command",u"environment_parameters",u"is_training"]) -> None: ... + else: + def HasField(self, field_name: typing_extensions___Literal[u"environment_parameters",b"environment_parameters"]) -> bool: ... + def ClearField(self, field_name: typing_extensions___Literal[b"agent_actions",b"command",b"environment_parameters",b"is_training"]) -> None: ... diff --git a/ml-agents-envs/mlagents/envs/communicator_objects/unity_rl_output_pb2.pyi b/ml-agents-envs/mlagents/envs/communicator_objects/unity_rl_output_pb2.pyi new file mode 100644 index 0000000000..a75e130d8e --- /dev/null +++ b/ml-agents-envs/mlagents/envs/communicator_objects/unity_rl_output_pb2.pyi @@ -0,0 +1,83 @@ +# @generated by generate_proto_mypy_stubs.py. Do not edit! +import sys +from google.protobuf.internal.containers import ( + RepeatedCompositeFieldContainer as google___protobuf___internal___containers___RepeatedCompositeFieldContainer, +) + +from google.protobuf.message import ( + Message as google___protobuf___message___Message, +) + +from mlagents.envs.communicator_objects.agent_info_proto_pb2 import ( + AgentInfoProto as mlagents___envs___communicator_objects___agent_info_proto_pb2___AgentInfoProto, +) + +from typing import ( + Iterable as typing___Iterable, + Mapping as typing___Mapping, + MutableMapping as typing___MutableMapping, + Optional as typing___Optional, + Text as typing___Text, +) + +from typing_extensions import ( + Literal as typing_extensions___Literal, +) + + +class UnityRLOutput(google___protobuf___message___Message): + class ListAgentInfoProto(google___protobuf___message___Message): + + @property + def value(self) -> google___protobuf___internal___containers___RepeatedCompositeFieldContainer[mlagents___envs___communicator_objects___agent_info_proto_pb2___AgentInfoProto]: ... + + def __init__(self, + value : typing___Optional[typing___Iterable[mlagents___envs___communicator_objects___agent_info_proto_pb2___AgentInfoProto]] = None, + ) -> None: ... + @classmethod + def FromString(cls, s: bytes) -> UnityRLOutput.ListAgentInfoProto: ... + def MergeFrom(self, other_msg: google___protobuf___message___Message) -> None: ... + def CopyFrom(self, other_msg: google___protobuf___message___Message) -> None: ... 
+ if sys.version_info >= (3,): + def ClearField(self, field_name: typing_extensions___Literal[u"value"]) -> None: ... + else: + def ClearField(self, field_name: typing_extensions___Literal[b"value"]) -> None: ... + + class AgentInfosEntry(google___protobuf___message___Message): + key = ... # type: typing___Text + + @property + def value(self) -> UnityRLOutput.ListAgentInfoProto: ... + + def __init__(self, + key : typing___Optional[typing___Text] = None, + value : typing___Optional[UnityRLOutput.ListAgentInfoProto] = None, + ) -> None: ... + @classmethod + def FromString(cls, s: bytes) -> UnityRLOutput.AgentInfosEntry: ... + def MergeFrom(self, other_msg: google___protobuf___message___Message) -> None: ... + def CopyFrom(self, other_msg: google___protobuf___message___Message) -> None: ... + if sys.version_info >= (3,): + def HasField(self, field_name: typing_extensions___Literal[u"value"]) -> bool: ... + def ClearField(self, field_name: typing_extensions___Literal[u"key",u"value"]) -> None: ... + else: + def HasField(self, field_name: typing_extensions___Literal[u"value",b"value"]) -> bool: ... + def ClearField(self, field_name: typing_extensions___Literal[b"key",b"value"]) -> None: ... + + global_done = ... # type: bool + + @property + def agentInfos(self) -> typing___MutableMapping[typing___Text, UnityRLOutput.ListAgentInfoProto]: ... + + def __init__(self, + global_done : typing___Optional[bool] = None, + agentInfos : typing___Optional[typing___Mapping[typing___Text, UnityRLOutput.ListAgentInfoProto]] = None, + ) -> None: ... + @classmethod + def FromString(cls, s: bytes) -> UnityRLOutput: ... + def MergeFrom(self, other_msg: google___protobuf___message___Message) -> None: ... + def CopyFrom(self, other_msg: google___protobuf___message___Message) -> None: ... + if sys.version_info >= (3,): + def ClearField(self, field_name: typing_extensions___Literal[u"agentInfos",u"global_done"]) -> None: ... + else: + def ClearField(self, field_name: typing_extensions___Literal[b"agentInfos",b"global_done"]) -> None: ... diff --git a/ml-agents-envs/mlagents/envs/communicator_objects/unity_to_external_pb2.py b/ml-agents-envs/mlagents/envs/communicator_objects/unity_to_external_pb2.py index a42ef09178..d1a8c7f60a 100644 --- a/ml-agents-envs/mlagents/envs/communicator_objects/unity_to_external_pb2.py +++ b/ml-agents-envs/mlagents/envs/communicator_objects/unity_to_external_pb2.py @@ -1,3 +1,4 @@ +# -*- coding: utf-8 -*- # Generated by the protocol buffer compiler. DO NOT EDIT! # source: mlagents/envs/communicator_objects/unity_to_external.proto diff --git a/ml-agents-envs/mlagents/envs/communicator_objects/unity_to_external_pb2.pyi b/ml-agents-envs/mlagents/envs/communicator_objects/unity_to_external_pb2.pyi new file mode 100644 index 0000000000..8bba4ff746 --- /dev/null +++ b/ml-agents-envs/mlagents/envs/communicator_objects/unity_to_external_pb2.pyi @@ -0,0 +1,6 @@ +# @generated by generate_proto_mypy_stubs.py. Do not edit! 
+import sys +from google.protobuf.message import ( + Message as google___protobuf___message___Message, +) + diff --git a/ml-agents-envs/mlagents/envs/env_manager.py b/ml-agents-envs/mlagents/envs/env_manager.py new file mode 100644 index 0000000000..afe917b18d --- /dev/null +++ b/ml-agents-envs/mlagents/envs/env_manager.py @@ -0,0 +1,39 @@ +from abc import ABC, abstractmethod +from typing import List, Dict, NamedTuple, Optional +from mlagents.envs import AllBrainInfo, BrainParameters, Policy, ActionInfo + + +class StepInfo(NamedTuple): + previous_all_brain_info: Optional[AllBrainInfo] + current_all_brain_info: AllBrainInfo + brain_name_to_action_info: Optional[Dict[str, ActionInfo]] + + +class EnvManager(ABC): + def __init__(self): + self.policies: Dict[str, Policy] = {} + + def set_policy(self, brain_name: str, policy: Policy) -> None: + self.policies[brain_name] = policy + + @abstractmethod + def step(self) -> List[StepInfo]: + pass + + @abstractmethod + def reset(self, config=None, train_mode=True) -> List[StepInfo]: + pass + + @property + @abstractmethod + def external_brains(self) -> Dict[str, BrainParameters]: + pass + + @property + @abstractmethod + def reset_parameters(self) -> Dict[str, float]: + pass + + @abstractmethod + def close(self): + pass diff --git a/ml-agents-envs/mlagents/envs/environment.py b/ml-agents-envs/mlagents/envs/environment.py index ccf1a5f49c..d642d792b6 100644 --- a/ml-agents-envs/mlagents/envs/environment.py +++ b/ml-agents-envs/mlagents/envs/environment.py @@ -7,9 +7,11 @@ from typing import * from mlagents.envs.base_unity_environment import BaseUnityEnvironment +from mlagents.envs.timers import timed, hierarchical_timer from .brain import AllBrainInfo, BrainInfo, BrainParameters from .exception import ( UnityEnvironmentException, + UnityCommunicationException, UnityActionException, UnityTimeOutException, ) @@ -48,6 +50,7 @@ def __init__( docker_training: bool = False, no_graphics: bool = False, timeout_wait: int = 30, + args: list = [], ): """ Starts a new unity environment and establishes a connection with the environment. @@ -61,12 +64,13 @@ def __init__( :bool no_graphics: Whether to run the Unity simulator in no-graphics mode :int timeout_wait: Time (in seconds) to wait for connection from environment. :bool train_mode: Whether to run in training mode, speeding up the simulation, by default. + :list args: Addition Unity command line arguments """ atexit.register(self._close) self.port = base_port + worker_id self._buffer_size = 12000 - self._version_ = "API-8" + self._version_ = "API-9" self._loaded = ( False ) # If true, this means the environment was successfully loaded @@ -74,6 +78,7 @@ def __init__( None ) # The process that is started. If None, no process was started self.communicator = self.get_communicator(worker_id, base_port, timeout_wait) + self.worker_id = worker_id # If the environment name is None, a new environment will not be launched # and the communicator will directly try to connect to an existing unity environment. @@ -84,7 +89,7 @@ def __init__( "the worker-id must be 0 in order to connect with the Editor." ) if file_name is not None: - self.executable_launcher(file_name, docker_training, no_graphics) + self.executable_launcher(file_name, docker_training, no_graphics, args) else: logger.info( "Start training by pressing the Play button in the Unity Editor." 
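Not part of the patch: a minimal usage sketch of the constructor changes above (the new args parameter and the worker_id bookkeeping). The build name and the extra command line flag are illustrative placeholders, not values taken from this diff.

from mlagents.envs import UnityEnvironment

# Launch a built environment on port base_port + worker_id and forward extra
# command line arguments to the Unity player (the flag shown is hypothetical).
env = UnityEnvironment(
    file_name="3DBall",        # placeholder build name
    worker_id=0,
    no_graphics=True,
    args=["-force-opengl"],    # additional Unity command line arguments
)
try:
    env.reset(train_mode=True)
finally:
    env.close()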
@@ -106,13 +111,13 @@ def __init__( "{1}.\nPlease go to https://github.com/Unity-Technologies/ml-agents to download the latest version " "of ML-Agents.".format(self._version_, self._unity_version) ) - self._n_agents = {} - self._global_done = None + self._n_agents: Dict[str, int] = {} + self._global_done: Optional[bool] = None self._academy_name = aca_params.name self._log_path = aca_params.log_path - self._brains = {} - self._brain_names = [] - self._external_brain_names = [] + self._brains: Dict[str, BrainParameters] = {} + self._brain_names: List[str] = [] + self._external_brain_names: List[str] = [] for brain_param in aca_params.brain_parameters: self._brain_names += [brain_param.brain_name] self._brains[brain_param.brain_name] = BrainParameters.from_proto( @@ -179,7 +184,7 @@ def external_brains(self): def reset_parameters(self): return self._resetParameters - def executable_launcher(self, file_name, docker_training, no_graphics): + def executable_launcher(self, file_name, docker_training, no_graphics, args): cwd = os.getcwd() file_name = ( file_name.strip() @@ -249,10 +254,11 @@ def executable_launcher(self, file_name, docker_training, no_graphics): "--port", str(self.port), ] + + args ) else: self.proc1 = subprocess.Popen( - [launch_string, "--port", str(self.port)] + [launch_string, "--port", str(self.port)] + args ) else: """ @@ -338,7 +344,7 @@ def reset( self._generate_reset_input(train_mode, config, custom_reset_parameters) ) if outputs is None: - raise KeyboardInterrupt + raise UnityCommunicationException("Communicator has stopped.") rl_output = outputs.rl_output s = self._get_state(rl_output) self._global_done = s[1] @@ -348,6 +354,7 @@ def reset( else: raise UnityEnvironmentException("No Unity environment is loaded.") + @timed def step( self, vector_action=None, @@ -373,7 +380,18 @@ def step( custom_action = {} if custom_action is None else custom_action # Check that environment is loaded, and episode is currently running. - if self._loaded and not self._global_done and self._global_done is not None: + if not self._loaded: + raise UnityEnvironmentException("No Unity environment is loaded.") + elif self._global_done: + raise UnityActionException( + "The episode is completed. Reset the environment with 'reset()'" + ) + elif self.global_done is None: + raise UnityActionException( + "You cannot conduct step without first calling reset. " + "Reset the environment with 'reset()'" + ) + else: if isinstance(vector_action, self.SINGLE_BRAIN_ACTION_TYPES): if self._num_external_brains == 1: vector_action = {self._external_brain_names[0]: vector_action} @@ -547,30 +565,19 @@ def step( ) ) - outputs = self.communicator.exchange( - self._generate_step_input( - vector_action, memory, text_action, value, custom_action - ) + step_input = self._generate_step_input( + vector_action, memory, text_action, value, custom_action ) + with hierarchical_timer("communicator.exchange"): + outputs = self.communicator.exchange(step_input) if outputs is None: - raise KeyboardInterrupt + raise UnityCommunicationException("Communicator has stopped.") rl_output = outputs.rl_output state = self._get_state(rl_output) self._global_done = state[1] for _b in self._external_brain_names: self._n_agents[_b] = len(state[0][_b].agents) return state[0] - elif not self._loaded: - raise UnityEnvironmentException("No Unity environment is loaded.") - elif self._global_done: - raise UnityActionException( - "The episode is completed. 
Reset the environment with 'reset()'" - ) - elif self.global_done is None: - raise UnityActionException( - "You cannot conduct step without first calling reset. " - "Reset the environment with 'reset()'" - ) def close(self): """ @@ -607,7 +614,7 @@ def _flatten(cls, arr) -> List[float]: arr = [float(x) for x in arr] return arr - def _get_state(self, output: UnityRLOutput) -> (AllBrainInfo, bool): + def _get_state(self, output: UnityRLOutput) -> Tuple[AllBrainInfo, bool]: """ Collects experience information from all external brains in environment at current step. :return: a dictionary of BrainInfo objects. @@ -617,13 +624,14 @@ def _get_state(self, output: UnityRLOutput) -> (AllBrainInfo, bool): for brain_name in output.agentInfos: agent_info_list = output.agentInfos[brain_name].value _data[brain_name] = BrainInfo.from_agent_proto( - agent_info_list, self.brains[brain_name] + self.worker_id, agent_info_list, self.brains[brain_name] ) return _data, global_done + @timed def _generate_step_input( self, vector_action, memory, text_action, value, custom_action - ) -> UnityRLInput: + ) -> UnityInput: rl_in = UnityRLInput() for b in vector_action: n_agents = self._n_agents[b] @@ -647,7 +655,7 @@ def _generate_step_input( def _generate_reset_input( self, training, config, custom_reset_parameters - ) -> UnityRLInput: + ) -> UnityInput: rl_in = UnityRLInput() rl_in.is_training = training rl_in.environment_parameters.CopyFrom(EnvironmentParametersProto()) @@ -668,7 +676,7 @@ def send_academy_parameters( return self.communicator.initialize(inputs).rl_initialization_output @staticmethod - def wrap_unity_input(rl_input: UnityRLInput) -> UnityOutput: + def wrap_unity_input(rl_input: UnityRLInput) -> UnityInput: result = UnityInput() result.rl_input.CopyFrom(rl_input) return result diff --git a/ml-agents-envs/mlagents/envs/exception.py b/ml-agents-envs/mlagents/envs/exception.py index d250bf2843..f1c0bed80c 100644 --- a/ml-agents-envs/mlagents/envs/exception.py +++ b/ml-agents-envs/mlagents/envs/exception.py @@ -19,6 +19,14 @@ class UnityEnvironmentException(UnityException): pass +class UnityCommunicationException(UnityException): + """ + Related to errors with the communicator. + """ + + pass + + class UnityActionException(UnityException): """ Related to errors with sending actions. @@ -27,6 +35,14 @@ class UnityActionException(UnityException): pass +class SamplerException(UnityException): + """ + Related to errors with the sampler actions. + """ + + pass + + class UnityTimeOutException(UnityException): """ Related to errors with communication timeouts. @@ -38,15 +54,15 @@ def __init__(self, message, log_file_path=None): with open(log_file_path, "r") as f: printing = False unity_error = "\n" - for l in f: - l = l.strip() - if (l == "Exception") or (l == "Error"): + for line in f: + line = line.strip() + if (line == "Exception") or (line == "Error"): printing = True unity_error += "----------------------\n" - if l == "": + if line == "": printing = False if printing: - unity_error += l + "\n" + unity_error += line + "\n" logger.info(unity_error) logger.error( "An error might have occured in the environment. " @@ -54,7 +70,7 @@ def __init__(self, message, log_file_path=None): log_file_path ) ) - except: + except Exception: logger.error( "An error might have occured in the environment. " "No UnitySDK.log file could be found." 
diff --git a/ml-agents-envs/mlagents/envs/mock_communicator.py b/ml-agents-envs/mlagents/envs/mock_communicator.py index c5ab43ebb1..da0cdf7b53 100755 --- a/ml-agents-envs/mlagents/envs/mock_communicator.py +++ b/ml-agents-envs/mlagents/envs/mock_communicator.py @@ -54,7 +54,7 @@ def initialize(self, inputs: UnityInput) -> UnityOutput: is_training=True, ) rl_init = UnityRLInitializationOutput( - name="RealFakeAcademy", version="API-8", log_path="", brain_parameters=[bp] + name="RealFakeAcademy", version="API-9", log_path="", brain_parameters=[bp] ) return UnityOutput(rl_initialization_output=rl_init) @@ -91,7 +91,7 @@ def exchange(self, inputs: UnityInput) -> UnityOutput: try: fake_brain = inputs.rl_input.agent_actions["RealFakeBrain"] global_done = fake_brain.value[0].vector_actions[0] == -1 - except: + except Exception: pass result = UnityRLOutput(global_done=global_done, agentInfos=dict_agent_info) return UnityOutput(rl_output=result) diff --git a/ml-agents-envs/mlagents/envs/policy.py b/ml-agents-envs/mlagents/envs/policy.py new file mode 100644 index 0000000000..89bb884e94 --- /dev/null +++ b/ml-agents-envs/mlagents/envs/policy.py @@ -0,0 +1,10 @@ +from abc import ABC, abstractmethod + +from mlagents.envs import BrainInfo +from mlagents.envs import ActionInfo + + +class Policy(ABC): + @abstractmethod + def get_action(self, brain_info: BrainInfo) -> ActionInfo: + pass diff --git a/ml-agents-envs/mlagents/envs/rpc_communicator.py b/ml-agents-envs/mlagents/envs/rpc_communicator.py index 5f75ceddc4..27cb3f6731 100644 --- a/ml-agents-envs/mlagents/envs/rpc_communicator.py +++ b/ml-agents-envs/mlagents/envs/rpc_communicator.py @@ -1,5 +1,6 @@ import logging import grpc +from typing import Optional import socket from multiprocessing import Pipe @@ -62,7 +63,7 @@ def create_server(self): self.server.add_insecure_port("[::]:" + str(self.port)) self.server.start() self.is_open = True - except: + except Exception: raise UnityWorkerInUseException(self.worker_id) def check_port(self, port): @@ -94,7 +95,7 @@ def initialize(self, inputs: UnityInput) -> UnityOutput: self.unity_to_external.parent_conn.recv() return aca_param - def exchange(self, inputs: UnityInput) -> UnityOutput: + def exchange(self, inputs: UnityInput) -> Optional[UnityOutput]: message = UnityMessage() message.header.status = 200 message.unity_input.CopyFrom(inputs) diff --git a/ml-agents-envs/mlagents/envs/sampler_class.py b/ml-agents-envs/mlagents/envs/sampler_class.py new file mode 100644 index 0000000000..cfff08253d --- /dev/null +++ b/ml-agents-envs/mlagents/envs/sampler_class.py @@ -0,0 +1,134 @@ +import numpy as np +from typing import * +from functools import * +from collections import OrderedDict +from abc import ABC, abstractmethod + +from .exception import SamplerException + + +class Sampler(ABC): + @abstractmethod + def sample_parameter(self) -> float: + pass + + +class UniformSampler(Sampler): + """ + Uniformly draws a single sample in the range [min_value, max_value). + """ + + def __init__( + self, min_value: Union[int, float], max_value: Union[int, float], **kwargs + ) -> None: + self.min_value = min_value + self.max_value = max_value + + def sample_parameter(self) -> float: + return np.random.uniform(self.min_value, self.max_value) + + +class MultiRangeUniformSampler(Sampler): + """ + Draws a single sample uniformly from the intervals provided. The sampler + first picks an interval based on a weighted selection, with the weights + assigned to an interval based on its range. 
After picking the range, + it proceeds to pick a value uniformly in that range. + """ + + def __init__(self, intervals: List[List[Union[int, float]]], **kwargs) -> None: + self.intervals = intervals + # Measure the length of the intervals + interval_lengths = [abs(x[1] - x[0]) for x in self.intervals] + cum_interval_length = sum(interval_lengths) + # Assign weights to an interval proportionate to the interval size + self.interval_weights = [x / cum_interval_length for x in interval_lengths] + + def sample_parameter(self) -> float: + cur_min, cur_max = self.intervals[ + np.random.choice(len(self.intervals), p=self.interval_weights) + ] + return np.random.uniform(cur_min, cur_max) + + +class GaussianSampler(Sampler): + """ + Draw a single sample value from a normal (gaussian) distribution. + This sampler is characterized by the mean and the standard deviation. + """ + + def __init__( + self, mean: Union[float, int], st_dev: Union[float, int], **kwargs + ) -> None: + self.mean = mean + self.st_dev = st_dev + + def sample_parameter(self) -> float: + return np.random.normal(self.mean, self.st_dev) + + +class SamplerFactory: + """ + Maintain a directory of all samplers available. + Add new samplers using the register_sampler method. + """ + + NAME_TO_CLASS = { + "uniform": UniformSampler, + "gaussian": GaussianSampler, + "multirange_uniform": MultiRangeUniformSampler, + } + + @staticmethod + def register_sampler(name: str, sampler_cls: Type[Sampler]) -> None: + SamplerFactory.NAME_TO_CLASS[name] = sampler_cls + + @staticmethod + def init_sampler_class(name: str, params: Dict[str, Any]): + if name not in SamplerFactory.NAME_TO_CLASS: + raise SamplerException( + name + " sampler is not registered in the SamplerFactory." + " Use the register_sample method to register the string" + " associated to your sampler in the SamplerFactory." + ) + sampler_cls = SamplerFactory.NAME_TO_CLASS[name] + try: + return sampler_cls(**params) + except TypeError: + raise SamplerException( + "The sampler class associated to the " + name + " key in the factory " + "was not provided the required arguments. Please ensure that the sampler " + "config file consists of the appropriate keys for this sampler class." + ) + + +class SamplerManager: + def __init__(self, reset_param_dict: Dict[str, Any]) -> None: + self.reset_param_dict = reset_param_dict if reset_param_dict else {} + assert isinstance(self.reset_param_dict, dict) + self.samplers: Dict[str, Sampler] = {} + for param_name, cur_param_dict in self.reset_param_dict.items(): + if "sampler-type" not in cur_param_dict: + raise SamplerException( + "'sampler_type' argument hasn't been supplied for the {0} parameter".format( + param_name + ) + ) + sampler_name = cur_param_dict.pop("sampler-type") + param_sampler = SamplerFactory.init_sampler_class( + sampler_name, cur_param_dict + ) + + self.samplers[param_name] = param_sampler + + def is_empty(self) -> bool: + """ + Check for if sampler_manager is empty. 
+ """ + return not bool(self.samplers) + + def sample_all(self) -> Dict[str, float]: + res = {} + for param_name, param_sampler in list(self.samplers.items()): + res[param_name] = param_sampler.sample_parameter() + return res diff --git a/ml-agents-envs/mlagents/envs/simple_env_manager.py b/ml-agents-envs/mlagents/envs/simple_env_manager.py new file mode 100644 index 0000000000..0a4495539e --- /dev/null +++ b/ml-agents-envs/mlagents/envs/simple_env_manager.py @@ -0,0 +1,81 @@ +from typing import Any, Dict, List + +from mlagents.envs.base_unity_environment import BaseUnityEnvironment +from mlagents.envs.env_manager import EnvManager, StepInfo +from mlagents.envs.timers import timed +from mlagents.envs import ActionInfo, BrainParameters + + +class SimpleEnvManager(EnvManager): + """ + Simple implementation of the EnvManager interface that only handles one BaseUnityEnvironment at a time. + This is generally only useful for testing; see SubprocessEnvManager for a production-quality implementation. + """ + + def __init__(self, env: BaseUnityEnvironment): + super().__init__() + self.env = env + self.previous_step: StepInfo = StepInfo(None, {}, None) + self.previous_all_action_info: Dict[str, ActionInfo] = {} + + def step(self) -> List[StepInfo]: + + all_action_info = self._take_step(self.previous_step) + self.previous_all_action_info = all_action_info + + if self.env.global_done: + all_brain_info = self.env.reset() + else: + actions = {} + memories = {} + texts = {} + values = {} + for brain_name, action_info in all_action_info.items(): + actions[brain_name] = action_info.action + memories[brain_name] = action_info.memory + texts[brain_name] = action_info.text + values[brain_name] = action_info.value + all_brain_info = self.env.step(actions, memories, texts, values) + step_brain_info = all_brain_info + + step_info = StepInfo( + self.previous_step.current_all_brain_info, + step_brain_info, + self.previous_all_action_info, + ) + self.previous_step = step_info + return [step_info] + + def reset( + self, + config: Dict[str, float] = None, + train_mode: bool = True, + custom_reset_parameters: Any = None, + ) -> List[StepInfo]: # type: ignore + all_brain_info = self.env.reset( + config=config, + train_mode=train_mode, + custom_reset_parameters=custom_reset_parameters, + ) + self.previous_step = StepInfo(None, all_brain_info, None) + return [self.previous_step] + + @property + def external_brains(self) -> Dict[str, BrainParameters]: + return self.env.external_brains + + @property + def reset_parameters(self) -> Dict[str, float]: + return self.env.reset_parameters + + def close(self): + self.env.close() + + @timed + def _take_step(self, last_step: StepInfo) -> Dict[str, ActionInfo]: + all_action_info: Dict[str, ActionInfo] = {} + for brain_name, brain_info in last_step.current_all_brain_info.items(): + all_action_info[brain_name] = self.policies[brain_name].get_action( + brain_info + ) + return all_action_info diff --git a/ml-agents-envs/mlagents/envs/socket_communicator.py b/ml-agents-envs/mlagents/envs/socket_communicator.py index 3b06fa0d72..c9668064aa 100644 --- a/ml-agents-envs/mlagents/envs/socket_communicator.py +++ b/ml-agents-envs/mlagents/envs/socket_communicator.py @@ -1,6 +1,7 @@ import logging import socket import struct +from typing import Optional from .communicator import Communicator from .communicator_objects import UnityMessage, UnityOutput, UnityInput @@ -31,7 +32,7 @@ def initialize(self, inputs: UnityInput) -> UnityOutput: self._socket = socket.socket(socket.AF_INET, 
socket.SOCK_STREAM) self._socket.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) self._socket.bind(("localhost", self.port)) - except: + except Exception: raise UnityTimeOutException( "Couldn't start socket communication because worker number {} is still in use. " "You may need to manually close a previously opened environment " @@ -42,7 +43,7 @@ def initialize(self, inputs: UnityInput) -> UnityOutput: self._socket.listen(1) self._conn, _ = self._socket.accept() self._conn.settimeout(30) - except: + except Exception: raise UnityTimeOutException( "The Unity environment took too long to respond. Make sure that :\n" "\t The environment does not need user interaction to launch\n" @@ -65,14 +66,14 @@ def _communicator_receive(self): s = s[4:] while len(s) != message_length: s += self._conn.recv(self._buffer_size) - except socket.timeout as e: + except socket.timeout: raise UnityTimeOutException("The environment took too long to respond.") return s def _communicator_send(self, message): self._conn.send(struct.pack("I", len(message)) + message) - def exchange(self, inputs: UnityInput) -> UnityOutput: + def exchange(self, inputs: UnityInput) -> Optional[UnityOutput]: message = UnityMessage() message.header.status = 200 message.unity_input.CopyFrom(inputs) diff --git a/ml-agents-envs/mlagents/envs/subprocess_env_manager.py b/ml-agents-envs/mlagents/envs/subprocess_env_manager.py new file mode 100644 index 0000000000..babb20382c --- /dev/null +++ b/ml-agents-envs/mlagents/envs/subprocess_env_manager.py @@ -0,0 +1,256 @@ +from typing import * +import cloudpickle + +from mlagents.envs import UnityEnvironment +from mlagents.envs.exception import UnityCommunicationException +from multiprocessing import Process, Pipe, Queue +from multiprocessing.connection import Connection +from queue import Empty as EmptyQueueException +from mlagents.envs.base_unity_environment import BaseUnityEnvironment +from mlagents.envs.env_manager import EnvManager, StepInfo +from mlagents.envs.timers import ( + TimerNode, + timed, + hierarchical_timer, + reset_timers, + get_timer_root, +) +from mlagents.envs import AllBrainInfo, BrainParameters, ActionInfo + + +class EnvironmentCommand(NamedTuple): + name: str + payload: Any = None + + +class EnvironmentResponse(NamedTuple): + name: str + worker_id: int + payload: Any + + +class StepResponse(NamedTuple): + all_brain_info: AllBrainInfo + timer_root: Optional[TimerNode] + + +class UnityEnvWorker: + def __init__(self, process: Process, worker_id: int, conn: Connection): + self.process = process + self.worker_id = worker_id + self.conn = conn + self.previous_step: StepInfo = StepInfo(None, {}, None) + self.previous_all_action_info: Dict[str, ActionInfo] = {} + self.waiting = False + + def send(self, name: str, payload=None): + try: + cmd = EnvironmentCommand(name, payload) + self.conn.send(cmd) + except (BrokenPipeError, EOFError): + raise UnityCommunicationException("UnityEnvironment worker: send failed.") + + def recv(self) -> EnvironmentResponse: + try: + response: EnvironmentResponse = self.conn.recv() + return response + except (BrokenPipeError, EOFError): + raise UnityCommunicationException("UnityEnvironment worker: recv failed.") + + def close(self): + try: + self.conn.send(EnvironmentCommand("close")) + except (BrokenPipeError, EOFError): + pass + self.process.join() + + +def worker( + parent_conn: Connection, step_queue: Queue, pickled_env_factory: str, worker_id: int +): + env_factory: Callable[[int], UnityEnvironment] = cloudpickle.loads( + pickled_env_factory + ) + 
env = env_factory(worker_id) + + def _send_response(cmd_name, payload): + parent_conn.send(EnvironmentResponse(cmd_name, worker_id, payload)) + + try: + while True: + cmd: EnvironmentCommand = parent_conn.recv() + if cmd.name == "step": + all_action_info = cmd.payload + # When an environment is "global_done" it means automatic agent reset won't occur, so we need + # to perform an academy reset. + if env.global_done: + all_brain_info = env.reset() + else: + actions = {} + memories = {} + texts = {} + values = {} + for brain_name, action_info in all_action_info.items(): + actions[brain_name] = action_info.action + memories[brain_name] = action_info.memory + texts[brain_name] = action_info.text + values[brain_name] = action_info.value + all_brain_info = env.step(actions, memories, texts, values) + # The timers in this process are independent from all the processes and the "main" process + # So after we send back the root timer, we can safely clear them. + # Note that we could randomly return timers a fraction of the time if we wanted to reduce + # the data transferred. + step_response = StepResponse(all_brain_info, get_timer_root()) + step_queue.put(EnvironmentResponse("step", worker_id, step_response)) + reset_timers() + elif cmd.name == "external_brains": + _send_response("external_brains", env.external_brains) + elif cmd.name == "reset_parameters": + _send_response("reset_parameters", env.reset_parameters) + elif cmd.name == "reset": + all_brain_info = env.reset( + cmd.payload[0], cmd.payload[1], cmd.payload[2] + ) + _send_response("reset", all_brain_info) + elif cmd.name == "global_done": + _send_response("global_done", env.global_done) + elif cmd.name == "close": + break + except (KeyboardInterrupt, UnityCommunicationException): + print("UnityEnvironment worker: environment stopping.") + step_queue.put(EnvironmentResponse("env_close", worker_id, None)) + finally: + step_queue.close() + env.close() + + +class SubprocessEnvManager(EnvManager): + def __init__( + self, env_factory: Callable[[int], BaseUnityEnvironment], n_env: int = 1 + ): + super().__init__() + self.env_workers: List[UnityEnvWorker] = [] + self.step_queue: Queue = Queue() + for worker_idx in range(n_env): + self.env_workers.append( + self.create_worker(worker_idx, self.step_queue, env_factory) + ) + + @staticmethod + def create_worker( + worker_id: int, + step_queue: Queue, + env_factory: Callable[[int], BaseUnityEnvironment], + ) -> UnityEnvWorker: + parent_conn, child_conn = Pipe() + + # Need to use cloudpickle for the env factory function since function objects aren't picklable + # on Windows as of Python 3.6. + pickled_env_factory = cloudpickle.dumps(env_factory) + child_process = Process( + target=worker, args=(child_conn, step_queue, pickled_env_factory, worker_id) + ) + child_process.start() + return UnityEnvWorker(child_process, worker_id, parent_conn) + + def _queue_steps(self) -> None: + for env_worker in self.env_workers: + if not env_worker.waiting: + env_action_info = self._take_step(env_worker.previous_step) + env_worker.previous_all_action_info = env_action_info + env_worker.send("step", env_action_info) + env_worker.waiting = True + + def step(self) -> List[StepInfo]: + # Queue steps for any workers which aren't in the "waiting" state. 
+ self._queue_steps() + + worker_steps: List[EnvironmentResponse] = [] + step_workers: Set[int] = set() + # Poll the step queue for completed steps from environment workers until we retrieve + # 1 or more, which we will then return as StepInfos + while len(worker_steps) < 1: + try: + while True: + step = self.step_queue.get_nowait() + if step.name == "env_close": + raise UnityCommunicationException( + "At least one of the environments has closed." + ) + self.env_workers[step.worker_id].waiting = False + if step.worker_id not in step_workers: + worker_steps.append(step) + step_workers.add(step.worker_id) + except EmptyQueueException: + pass + + step_infos = self._postprocess_steps(worker_steps) + return step_infos + + def reset( + self, config=None, train_mode=True, custom_reset_parameters=None + ) -> List[StepInfo]: + while any([ew.waiting for ew in self.env_workers]): + if not self.step_queue.empty(): + step = self.step_queue.get_nowait() + self.env_workers[step.worker_id].waiting = False + # First enqueue reset commands for all workers so that they reset in parallel + for ew in self.env_workers: + ew.send("reset", (config, train_mode, custom_reset_parameters)) + # Next (synchronously) collect the reset observations from each worker in sequence + for ew in self.env_workers: + ew.previous_step = StepInfo(None, ew.recv().payload, None) + return list(map(lambda ew: ew.previous_step, self.env_workers)) + + @property + def external_brains(self) -> Dict[str, BrainParameters]: + self.env_workers[0].send("external_brains") + return self.env_workers[0].recv().payload + + @property + def reset_parameters(self) -> Dict[str, float]: + self.env_workers[0].send("reset_parameters") + return self.env_workers[0].recv().payload + + def close(self) -> None: + self.step_queue.close() + self.step_queue.join_thread() + for env_worker in self.env_workers: + env_worker.close() + + def _postprocess_steps( + self, env_steps: List[EnvironmentResponse] + ) -> List[StepInfo]: + step_infos = [] + timer_nodes = [] + for step in env_steps: + payload: StepResponse = step.payload + env_worker = self.env_workers[step.worker_id] + new_step = StepInfo( + env_worker.previous_step.current_all_brain_info, + payload.all_brain_info, + env_worker.previous_all_action_info, + ) + step_infos.append(new_step) + env_worker.previous_step = new_step + + if payload.timer_root: + timer_nodes.append(payload.timer_root) + + if timer_nodes: + with hierarchical_timer("workers") as main_timer_node: + for worker_timer_node in timer_nodes: + main_timer_node.merge( + worker_timer_node, root_name="worker_root", is_parallel=True + ) + + return step_infos + + @timed + def _take_step(self, last_step: StepInfo) -> Dict[str, ActionInfo]: + all_action_info: Dict[str, ActionInfo] = {} + for brain_name, brain_info in last_step.current_all_brain_info.items(): + all_action_info[brain_name] = self.policies[brain_name].get_action( + brain_info + ) + return all_action_info diff --git a/ml-agents-envs/mlagents/envs/subprocess_environment.py b/ml-agents-envs/mlagents/envs/subprocess_environment.py deleted file mode 100644 index 48b493fa66..0000000000 --- a/ml-agents-envs/mlagents/envs/subprocess_environment.py +++ /dev/null @@ -1,224 +0,0 @@ -from typing import * -import copy -import numpy as np -import cloudpickle - -from mlagents.envs import UnityEnvironment -from multiprocessing import Process, Pipe -from multiprocessing.connection import Connection -from mlagents.envs.base_unity_environment import BaseUnityEnvironment -from mlagents.envs import AllBrainInfo, 
UnityEnvironmentException - - -class EnvironmentCommand(NamedTuple): - name: str - payload: Any = None - - -class EnvironmentResponse(NamedTuple): - name: str - worker_id: int - payload: Any - - -class UnityEnvWorker(NamedTuple): - process: Process - worker_id: int - conn: Connection - - def send(self, name: str, payload=None): - try: - cmd = EnvironmentCommand(name, payload) - self.conn.send(cmd) - except (BrokenPipeError, EOFError): - raise KeyboardInterrupt - - def recv(self) -> EnvironmentResponse: - try: - response: EnvironmentResponse = self.conn.recv() - return response - except (BrokenPipeError, EOFError): - raise KeyboardInterrupt - - def close(self): - try: - self.conn.send(EnvironmentCommand("close")) - except (BrokenPipeError, EOFError): - pass - self.process.join() - - -def worker(parent_conn: Connection, pickled_env_factory: str, worker_id: int): - env_factory: Callable[[int], UnityEnvironment] = cloudpickle.loads( - pickled_env_factory - ) - env = env_factory(worker_id) - - def _send_response(cmd_name, payload): - parent_conn.send(EnvironmentResponse(cmd_name, worker_id, payload)) - - try: - while True: - cmd: EnvironmentCommand = parent_conn.recv() - if cmd.name == "step": - vector_action, memory, text_action, value = cmd.payload - if env.global_done: - all_brain_info = env.reset() - else: - all_brain_info = env.step(vector_action, memory, text_action, value) - _send_response("step", all_brain_info) - elif cmd.name == "external_brains": - _send_response("external_brains", env.external_brains) - elif cmd.name == "reset_parameters": - _send_response("reset_parameters", env.reset_parameters) - elif cmd.name == "reset": - all_brain_info = env.reset(cmd.payload[0], cmd.payload[1]) - _send_response("reset", all_brain_info) - elif cmd.name == "global_done": - _send_response("global_done", env.global_done) - elif cmd.name == "close": - break - except KeyboardInterrupt: - print("UnityEnvironment worker: keyboard interrupt") - finally: - env.close() - - -class SubprocessUnityEnvironment(BaseUnityEnvironment): - def __init__( - self, env_factory: Callable[[int], BaseUnityEnvironment], n_env: int = 1 - ): - self.envs = [] - self.env_agent_counts = {} - self.waiting = False - for worker_id in range(n_env): - self.envs.append(self.create_worker(worker_id, env_factory)) - - @staticmethod - def create_worker( - worker_id: int, env_factory: Callable[[int], BaseUnityEnvironment] - ) -> UnityEnvWorker: - parent_conn, child_conn = Pipe() - - # Need to use cloudpickle for the env factory function since function objects aren't picklable - # on Windows as of Python 3.6. - pickled_env_factory = cloudpickle.dumps(env_factory) - child_process = Process( - target=worker, args=(child_conn, pickled_env_factory, worker_id) - ) - child_process.start() - return UnityEnvWorker(child_process, worker_id, parent_conn) - - def step_async( - self, vector_action, memory=None, text_action=None, value=None - ) -> None: - if self.waiting: - raise UnityEnvironmentException( - "Tried to take an environment step bore previous step has completed." - ) - - agent_counts_cum = {} - for brain_name in self.env_agent_counts.keys(): - agent_counts_cum[brain_name] = np.cumsum(self.env_agent_counts[brain_name]) - - # Split the actions provided by the previous set of agent counts, and send the step - # commands to the workers. 
- for worker_id, env in enumerate(self.envs): - env_actions = {} - env_memory = {} - env_text_action = {} - env_value = {} - for brain_name in self.env_agent_counts.keys(): - start_ind = 0 - if worker_id > 0: - start_ind = agent_counts_cum[brain_name][worker_id - 1] - end_ind = agent_counts_cum[brain_name][worker_id] - if vector_action.get(brain_name) is not None: - env_actions[brain_name] = vector_action[brain_name][ - start_ind:end_ind - ] - if memory and memory.get(brain_name) is not None: - env_memory[brain_name] = memory[brain_name][start_ind:end_ind] - if text_action and text_action.get(brain_name) is not None: - env_text_action[brain_name] = text_action[brain_name][ - start_ind:end_ind - ] - if value and value.get(brain_name) is not None: - env_value[brain_name] = value[brain_name][start_ind:end_ind] - - env.send("step", (env_actions, env_memory, env_text_action, env_value)) - self.waiting = True - - def step_await(self) -> AllBrainInfo: - if not self.waiting: - raise UnityEnvironmentException( - "Tried to await an environment step, but no async step was taken." - ) - - steps = [self.envs[i].recv() for i in range(len(self.envs))] - self._get_agent_counts(map(lambda s: s.payload, steps)) - combined_brain_info = self._merge_step_info(steps) - self.waiting = False - return combined_brain_info - - def step( - self, vector_action=None, memory=None, text_action=None, value=None - ) -> AllBrainInfo: - self.step_async(vector_action, memory, text_action, value) - return self.step_await() - - def reset(self, config=None, train_mode=True) -> AllBrainInfo: - self._broadcast_message("reset", (config, train_mode)) - reset_results = [self.envs[i].recv() for i in range(len(self.envs))] - self._get_agent_counts(map(lambda r: r.payload, reset_results)) - - return self._merge_step_info(reset_results) - - @property - def global_done(self): - self._broadcast_message("global_done") - dones: List[EnvironmentResponse] = [ - self.envs[i].recv().payload for i in range(len(self.envs)) - ] - return all(dones) - - @property - def external_brains(self): - self.envs[0].send("external_brains") - return self.envs[0].recv().payload - - @property - def reset_parameters(self): - self.envs[0].send("reset_parameters") - return self.envs[0].recv().payload - - def close(self): - for env in self.envs: - env.close() - - def _get_agent_counts(self, step_list: Iterable[AllBrainInfo]): - for i, step in enumerate(step_list): - for brain_name, brain_info in step.items(): - if brain_name not in self.env_agent_counts.keys(): - self.env_agent_counts[brain_name] = [0] * len(self.envs) - self.env_agent_counts[brain_name][i] = len(brain_info.agents) - - @staticmethod - def _merge_step_info(env_steps: List[EnvironmentResponse]) -> AllBrainInfo: - accumulated_brain_info: AllBrainInfo = None - for env_step in env_steps: - all_brain_info: AllBrainInfo = env_step.payload - for brain_name, brain_info in all_brain_info.items(): - for i in range(len(brain_info.agents)): - brain_info.agents[i] = ( - str(env_step.worker_id) + "-" + str(brain_info.agents[i]) - ) - if accumulated_brain_info: - accumulated_brain_info[brain_name].merge(brain_info) - if not accumulated_brain_info: - accumulated_brain_info = copy.deepcopy(all_brain_info) - return accumulated_brain_info - - def _broadcast_message(self, name: str, payload=None): - for env in self.envs: - env.send(name, payload) diff --git a/ml-agents-envs/mlagents/envs/tests/test_sampler_class.py b/ml-agents-envs/mlagents/envs/tests/test_sampler_class.py new file mode 100644 index 
0000000000..cf9bdc40f9 --- /dev/null +++ b/ml-agents-envs/mlagents/envs/tests/test_sampler_class.py @@ -0,0 +1,97 @@ +from math import isclose +import pytest + +from mlagents.envs.sampler_class import SamplerManager +from mlagents.envs.sampler_class import ( + UniformSampler, + MultiRangeUniformSampler, + GaussianSampler, +) +from mlagents.envs.exception import UnityException + + +def sampler_config_1(): + return { + "mass": {"sampler-type": "uniform", "min_value": 5, "max_value": 10}, + "gravity": { + "sampler-type": "multirange_uniform", + "intervals": [[8, 11], [15, 20]], + }, + } + + +def check_value_in_intervals(val, intervals): + check_in_bounds = [a <= val <= b for a, b in intervals] + return any(check_in_bounds) + + +def test_sampler_config_1(): + config = sampler_config_1() + sampler = SamplerManager(config) + + assert sampler.is_empty() is False + assert isinstance(sampler.samplers["mass"], UniformSampler) + assert isinstance(sampler.samplers["gravity"], MultiRangeUniformSampler) + + cur_sample = sampler.sample_all() + + # Check uniform sampler for mass + assert sampler.samplers["mass"].min_value == config["mass"]["min_value"] + assert sampler.samplers["mass"].max_value == config["mass"]["max_value"] + assert config["mass"]["min_value"] <= cur_sample["mass"] + assert config["mass"]["max_value"] >= cur_sample["mass"] + + # Check multirange_uniform sampler for gravity + assert sampler.samplers["gravity"].intervals == config["gravity"]["intervals"] + assert check_value_in_intervals( + cur_sample["gravity"], sampler.samplers["gravity"].intervals + ) + + +def sampler_config_2(): + return {"angle": {"sampler-type": "gaussian", "mean": 0, "st_dev": 1}} + + +def test_sampler_config_2(): + config = sampler_config_2() + sampler = SamplerManager(config) + assert sampler.is_empty() is False + assert isinstance(sampler.samplers["angle"], GaussianSampler) + + # Check angle gaussian sampler + assert sampler.samplers["angle"].mean == config["angle"]["mean"] + assert sampler.samplers["angle"].st_dev == config["angle"]["st_dev"] + + +def test_empty_samplers(): + empty_sampler = SamplerManager({}) + assert empty_sampler.is_empty() + empty_cur_sample = empty_sampler.sample_all() + assert empty_cur_sample == {} + + none_sampler = SamplerManager(None) + assert none_sampler.is_empty() + none_cur_sample = none_sampler.sample_all() + assert none_cur_sample == {} + + +def incorrect_uniform_sampler(): + # Do not specify required arguments to uniform sampler + return {"mass": {"sampler-type": "uniform", "min-value": 10}} + + +def incorrect_sampler_config(): + # Do not specify 'sampler-type' key + return {"mass": {"min-value": 2, "max-value": 30}} + + +def test_incorrect_uniform_sampler(): + config = incorrect_uniform_sampler() + with pytest.raises(UnityException): + SamplerManager(config) + + +def test_incorrect_sampler(): + config = incorrect_sampler_config() + with pytest.raises(UnityException): + SamplerManager(config) diff --git a/ml-agents-envs/mlagents/envs/tests/test_subprocess_env_manager.py b/ml-agents-envs/mlagents/envs/tests/test_subprocess_env_manager.py new file mode 100644 index 0000000000..3852af6cb2 --- /dev/null +++ b/ml-agents-envs/mlagents/envs/tests/test_subprocess_env_manager.py @@ -0,0 +1,138 @@ +import unittest.mock as mock +from unittest.mock import Mock, MagicMock +import unittest +import cloudpickle +from queue import Empty as EmptyQueue + +from mlagents.envs.subprocess_env_manager import ( + SubprocessEnvManager, + EnvironmentResponse, + EnvironmentCommand, + worker, + 
StepResponse, +) +from mlagents.envs.base_unity_environment import BaseUnityEnvironment + + +def mock_env_factory(worker_id: int): + return mock.create_autospec(spec=BaseUnityEnvironment) + + +class MockEnvWorker: + def __init__(self, worker_id, resp=None): + self.worker_id = worker_id + self.process = None + self.conn = None + self.send = Mock() + self.recv = Mock(return_value=resp) + self.waiting = False + + +class SubprocessEnvManagerTest(unittest.TestCase): + def test_environments_are_created(self): + SubprocessEnvManager.create_worker = MagicMock() + env = SubprocessEnvManager(mock_env_factory, 2) + # Creates two processes + env.create_worker.assert_has_calls( + [ + mock.call(0, env.step_queue, mock_env_factory), + mock.call(1, env.step_queue, mock_env_factory), + ] + ) + self.assertEqual(len(env.env_workers), 2) + + def test_worker_step_resets_on_global_done(self): + env_mock = Mock() + env_mock.reset = Mock(return_value="reset_data") + env_mock.global_done = True + + def mock_global_done_env_factory(worker_id: int): + return env_mock + + mock_parent_connection = Mock() + mock_step_queue = Mock() + step_command = EnvironmentCommand("step", (None, None, None, None)) + close_command = EnvironmentCommand("close") + mock_parent_connection.recv.side_effect = [step_command, close_command] + mock_parent_connection.send = Mock() + + worker( + mock_parent_connection, + mock_step_queue, + cloudpickle.dumps(mock_global_done_env_factory), + 0, + ) + + # recv called twice to get step and close command + self.assertEqual(mock_parent_connection.recv.call_count, 2) + + expected_step_response = StepResponse( + all_brain_info="reset_data", timer_root=mock.ANY + ) + + # worker returns the data from the reset + mock_step_queue.put.assert_called_with( + EnvironmentResponse("step", 0, expected_step_response) + ) + + def test_reset_passes_reset_params(self): + SubprocessEnvManager.create_worker = lambda em, worker_id, step_queue, env_factory: MockEnvWorker( + worker_id, EnvironmentResponse("reset", worker_id, worker_id) + ) + manager = SubprocessEnvManager(mock_env_factory, 1) + params = {"test": "params"} + manager.reset(params, False) + manager.env_workers[0].send.assert_called_with("reset", (params, False, None)) + + def test_reset_collects_results_from_all_envs(self): + SubprocessEnvManager.create_worker = lambda em, worker_id, step_queue, env_factory: MockEnvWorker( + worker_id, EnvironmentResponse("reset", worker_id, worker_id) + ) + manager = SubprocessEnvManager(mock_env_factory, 4) + + params = {"test": "params"} + res = manager.reset(params) + for i, env in enumerate(manager.env_workers): + env.send.assert_called_with("reset", (params, True, None)) + env.recv.assert_called() + # Check that the "last steps" are set to the value returned for each step + self.assertEqual( + manager.env_workers[i].previous_step.current_all_brain_info, i + ) + assert res == list(map(lambda ew: ew.previous_step, manager.env_workers)) + + def test_step_takes_steps_for_all_non_waiting_envs(self): + SubprocessEnvManager.create_worker = lambda em, worker_id, step_queue, env_factory: MockEnvWorker( + worker_id, EnvironmentResponse("step", worker_id, worker_id) + ) + manager = SubprocessEnvManager(mock_env_factory, 3) + manager.step_queue = Mock() + manager.step_queue.get_nowait.side_effect = [ + EnvironmentResponse("step", 0, StepResponse(0, None)), + EnvironmentResponse("step", 1, StepResponse(1, None)), + EmptyQueue(), + ] + step_mock = Mock() + last_steps = [Mock(), Mock(), Mock()] + manager.env_workers[0].previous_step 
= last_steps[0] + manager.env_workers[1].previous_step = last_steps[1] + manager.env_workers[2].previous_step = last_steps[2] + manager.env_workers[2].waiting = True + manager._take_step = Mock(return_value=step_mock) + res = manager.step() + for i, env in enumerate(manager.env_workers): + if i < 2: + env.send.assert_called_with("step", step_mock) + manager.step_queue.get_nowait.assert_called() + # Check that the "last steps" are set to the value returned for each step + self.assertEqual( + manager.env_workers[i].previous_step.current_all_brain_info, i + ) + self.assertEqual( + manager.env_workers[i].previous_step.previous_all_brain_info, + last_steps[i].current_all_brain_info, + ) + assert res == [ + manager.env_workers[0].previous_step, + manager.env_workers[1].previous_step, + ] diff --git a/ml-agents-envs/mlagents/envs/tests/test_subprocess_unity_environment.py b/ml-agents-envs/mlagents/envs/tests/test_subprocess_unity_environment.py deleted file mode 100644 index 831eaef5d5..0000000000 --- a/ml-agents-envs/mlagents/envs/tests/test_subprocess_unity_environment.py +++ /dev/null @@ -1,123 +0,0 @@ -import unittest.mock as mock -from unittest.mock import Mock, MagicMock -import unittest - -from mlagents.envs.subprocess_environment import * -from mlagents.envs import UnityEnvironmentException, BrainInfo - - -def mock_env_factory(worker_id: int): - return mock.create_autospec(spec=BaseUnityEnvironment) - - -class MockEnvWorker: - def __init__(self, worker_id): - self.worker_id = worker_id - self.process = None - self.conn = None - self.send = MagicMock() - self.recv = MagicMock() - - -class SubprocessEnvironmentTest(unittest.TestCase): - def test_environments_are_created(self): - SubprocessUnityEnvironment.create_worker = MagicMock() - env = SubprocessUnityEnvironment(mock_env_factory, 2) - # Creates two processes - self.assertEqual( - env.create_worker.call_args_list, - [mock.call(0, mock_env_factory), mock.call(1, mock_env_factory)], - ) - self.assertEqual(len(env.envs), 2) - - def test_step_async_fails_when_waiting(self): - env = SubprocessUnityEnvironment(mock_env_factory, 0) - env.waiting = True - with self.assertRaises(UnityEnvironmentException): - env.step_async(vector_action=[]) - - @staticmethod - def test_step_async_splits_input_by_agent_count(): - env = SubprocessUnityEnvironment(mock_env_factory, 0) - env.env_agent_counts = {"MockBrain": [1, 3, 5]} - env.envs = [MockEnvWorker(0), MockEnvWorker(1), MockEnvWorker(2)] - env_0_actions = [[1.0, 2.0]] - env_1_actions = [[3.0, 4.0]] * 3 - env_2_actions = [[5.0, 6.0]] * 5 - vector_action = {"MockBrain": env_0_actions + env_1_actions + env_2_actions} - env.step_async(vector_action=vector_action) - env.envs[0].send.assert_called_with( - "step", ({"MockBrain": env_0_actions}, {}, {}, {}) - ) - env.envs[1].send.assert_called_with( - "step", ({"MockBrain": env_1_actions}, {}, {}, {}) - ) - env.envs[2].send.assert_called_with( - "step", ({"MockBrain": env_2_actions}, {}, {}, {}) - ) - - def test_step_async_sets_waiting(self): - env = SubprocessUnityEnvironment(mock_env_factory, 0) - env.step_async(vector_action=[]) - self.assertTrue(env.waiting) - - def test_step_await_fails_if_not_waiting(self): - env = SubprocessUnityEnvironment(mock_env_factory, 0) - with self.assertRaises(UnityEnvironmentException): - env.step_await() - - def test_step_await_combines_brain_info(self): - all_brain_info_env0 = { - "MockBrain": BrainInfo( - [], [[1.0, 2.0], [1.0, 2.0]], [], agents=[1, 2], memory=np.zeros((0, 0)) - ) - } - all_brain_info_env1 = { - 
"MockBrain": BrainInfo( - [], [[3.0, 4.0]], [], agents=[3], memory=np.zeros((0, 0)) - ) - } - env_worker_0 = MockEnvWorker(0) - env_worker_0.recv.return_value = EnvironmentResponse( - "step", 0, all_brain_info_env0 - ) - env_worker_1 = MockEnvWorker(1) - env_worker_1.recv.return_value = EnvironmentResponse( - "step", 1, all_brain_info_env1 - ) - env = SubprocessUnityEnvironment(mock_env_factory, 0) - env.envs = [env_worker_0, env_worker_1] - env.waiting = True - combined_braininfo = env.step_await()["MockBrain"] - self.assertEqual( - combined_braininfo.vector_observations.tolist(), - [[1.0, 2.0], [1.0, 2.0], [3.0, 4.0]], - ) - self.assertEqual(combined_braininfo.agents, ["0-1", "0-2", "1-3"]) - - def test_step_resets_on_global_done(self): - env_mock = Mock() - env_mock.reset = Mock(return_value="reset_data") - env_mock.global_done = True - - def mock_global_done_env_factory(worker_id: int): - return env_mock - - mock_parent_connection = Mock() - step_command = EnvironmentCommand("step", (None, None, None, None)) - close_command = EnvironmentCommand("close") - mock_parent_connection.recv = Mock() - mock_parent_connection.recv.side_effect = [step_command, close_command] - mock_parent_connection.send = Mock() - - worker( - mock_parent_connection, cloudpickle.dumps(mock_global_done_env_factory), 0 - ) - - # recv called twice to get step and close command - self.assertEqual(mock_parent_connection.recv.call_count, 2) - - # worker returns the data from the reset - mock_parent_connection.send.assert_called_with( - EnvironmentResponse("step", 0, "reset_data") - ) diff --git a/ml-agents-envs/mlagents/envs/tests/test_timers.py b/ml-agents-envs/mlagents/envs/tests/test_timers.py new file mode 100644 index 0000000000..95ea64a224 --- /dev/null +++ b/ml-agents-envs/mlagents/envs/tests/test_timers.py @@ -0,0 +1,96 @@ +from unittest import mock + +from mlagents.envs import timers + + +@timers.timed +def decorated_func(x: int = 0, y: float = 1.0) -> str: + return f"{x} + {y} = {x + y}" + + +def test_timers() -> None: + with mock.patch( + "mlagents.envs.timers._global_timer_stack", new_callable=timers.TimerStack + ) as test_timer: + # First, run some simple code + with timers.hierarchical_timer("top_level"): + for i in range(3): + with timers.hierarchical_timer("multiple"): + decorated_func() + + raised = False + try: + with timers.hierarchical_timer("raises"): + raise RuntimeError("timeout!") + except RuntimeError: + raised = True + + with timers.hierarchical_timer("post_raise"): + assert raised + pass + + # We expect the hierarchy to look like + # (root) + # top_level + # multiple + # decorated_func + # raises + # post_raise + root = test_timer.root + assert root.children.keys() == {"top_level"} + + top_level = root.children["top_level"] + assert top_level.children.keys() == {"multiple", "raises", "post_raise"} + + # make sure the scope was closed properly when the exception was raised + raises = top_level.children["raises"] + assert raises.count == 1 + + multiple = top_level.children["multiple"] + assert multiple.count == 3 + + timer_tree = test_timer.get_timing_tree() + + expected_tree = { + "name": "root", + "total": mock.ANY, + "count": 1, + "self": mock.ANY, + "children": [ + { + "name": "top_level", + "total": mock.ANY, + "count": 1, + "self": mock.ANY, + "children": [ + { + "name": "multiple", + "total": mock.ANY, + "count": 3, + "self": mock.ANY, + "children": [ + { + "name": "decorated_func", + "total": mock.ANY, + "count": 3, + "self": mock.ANY, + } + ], + }, + { + "name": "raises", + "total": 
mock.ANY, + "count": 1, + "self": mock.ANY, + }, + { + "name": "post_raise", + "total": mock.ANY, + "count": 1, + "self": mock.ANY, + }, + ], + } + ], + } + assert timer_tree == expected_tree diff --git a/ml-agents-envs/mlagents/envs/timers.py b/ml-agents-envs/mlagents/envs/timers.py new file mode 100644 index 0000000000..d2eb43f50e --- /dev/null +++ b/ml-agents-envs/mlagents/envs/timers.py @@ -0,0 +1,236 @@ +# # Unity ML-Agents Toolkit +from time import perf_counter + +from contextlib import contextmanager +from typing import Any, Callable, Dict, Generator, TypeVar + +""" +Lightweight, hierarchical timers for profiling sections of code. + +Example: + +@timed +def foo(t): + time.sleep(t) + +def main(): + for i in range(3): + foo(i + 1) + with hierarchical_timer("context"): + foo(1) + + print(get_timer_tree()) + +This would produce a timer tree like + (root) + "foo" + "context" + "foo" + +The total time and counts are tracked for each block of code; in this example "foo" and "context.foo" are considered +distinct blocks, and are tracked separately. + +The decorator and contextmanager are equivalent; the context manager may be more useful if you want more control +over the timer name, or are splitting up multiple sections of a large function. +""" + + +class TimerNode: + """ + Represents the time spent in a block of code. + """ + + __slots__ = ["children", "total", "count", "is_parallel"] + + def __init__(self): + # Note that since dictionary keys are the node names, we don't explicitly store the name on the TimerNode. + self.children: Dict[str, TimerNode] = {} + self.total: float = 0.0 + self.count: int = 0 + self.is_parallel = False + + def get_child(self, name: str) -> "TimerNode": + """ + Get the child node corresponding to the name (and create if it doesn't already exist). + """ + child = self.children.get(name) + if child is None: + child = TimerNode() + self.children[name] = child + return child + + def add_time(self, elapsed: float) -> None: + """ + Accumulate the time spent in the node (and increment the count). + """ + self.total += elapsed + self.count += 1 + + def merge(self, other: "TimerNode", root_name: str = None, is_parallel=True): + """ + Add the other node to this node, then do the same recursively on its children. + :param other: The other node to merge + :param root_name: Optional name of the root node being merged. + :param is_parallel: Whether or not the code block was executed in parallel. + :return: + """ + if root_name: + node = self.get_child(root_name) + else: + node = self + + node.total += other.total + node.count += other.count + node.is_parallel |= is_parallel + for other_child_name, other_child_node in other.children.items(): + child = node.get_child(other_child_name) + child.merge(other_child_node, is_parallel=is_parallel) + + +class TimerStack: + """ + Tracks all the time spent. Users shouldn't use this directly, they should use the contextmanager below to make + sure that pushes and pops are already matched. + """ + + __slots__ = ["root", "stack", "start_time"] + + def __init__(self): + self.root = TimerNode() + self.stack = [self.root] + self.start_time = perf_counter() + + def reset(self): + self.root = TimerNode() + self.stack = [self.root] + self.start_time = perf_counter() + + def push(self, name: str) -> TimerNode: + """ + Called when entering a new block of code that is timed (e.g. with a contextmanager). 
+ """ + current_node: TimerNode = self.stack[-1] + next_node = current_node.get_child(name) + self.stack.append(next_node) + return next_node + + def pop(self) -> None: + """ + Called when exiting a new block of code that is timed (e.g. with a contextmanager). + """ + self.stack.pop() + + def get_root(self) -> TimerNode: + """ + Update the total time and count of the root name, and return it. + """ + root = self.root + root.total = perf_counter() - self.start_time + root.count = 1 + return root + + def get_timing_tree(self, node: TimerNode = None) -> Dict[str, Any]: + """ + Recursively build a tree of timings, suitable for output/archiving. + """ + res: Dict[str, Any] = {} + if node is None: + # Special case the root - total is time since it was created, and count is 1 + node = self.get_root() + res["name"] = "root" + + res["total"] = node.total + res["count"] = node.count + + if node.is_parallel: + # Note when the block ran in parallel, so that it's less confusing that a timer is less that its children. + res["is_parallel"] = True + + child_total = 0.0 + child_list = [] + for child_name, child_node in node.children.items(): + child_res: Dict[str, Any] = { + "name": child_name, + **self.get_timing_tree(child_node), + } + child_list.append(child_res) + child_total += child_res["total"] + + # "self" time is total time minus all time spent on children + res["self"] = max(0.0, node.total - child_total) + if child_list: + res["children"] = child_list + + return res + + +# Global instance of a TimerStack. This is generally all that we need for profiling, but you can potentially +# create multiple instances and pass them to the contextmanager +_global_timer_stack = TimerStack() + + +@contextmanager +def hierarchical_timer(name: str, timer_stack: TimerStack = None) -> Generator: + """ + Creates a scoped timer around a block of code. This time spent will automatically be incremented when + the context manager exits. + """ + timer_stack = timer_stack or _global_timer_stack + timer_node = timer_stack.push(name) + start_time = perf_counter() + + try: + # The wrapped code block will run here. + yield timer_node + finally: + # This will trigger either when the context manager exits, or an exception is raised. + # We'll accumulate the time, and the exception (if any) gets raised automatically. + elapsed = perf_counter() - start_time + timer_node.add_time(elapsed) + timer_stack.pop() + + +# This is used to ensure the signature of the decorated function is preserved +# See also https://github.com/python/mypy/issues/3157 +FuncT = TypeVar("FuncT", bound=Callable[..., Any]) + + +def timed(func: FuncT) -> FuncT: + """ + Decorator for timing a function or method. The name of the timer will be the qualified name of the function. + Usage: + @timed + def my_func(x, y): + return x + y + Note that because this doesn't take arguments, the global timer stack is always used. 
+ """ + + def wrapped(*args, **kwargs): + with hierarchical_timer(func.__qualname__): + return func(*args, **kwargs) + + return wrapped # type: ignore + + +def get_timer_tree(timer_stack: TimerStack = None) -> Dict[str, Any]: + """ + Return the tree of timings from the TimerStack as a dictionary (or the global stack if none is provided) + """ + timer_stack = timer_stack or _global_timer_stack + return timer_stack.get_timing_tree() + + +def get_timer_root(timer_stack: TimerStack = None) -> TimerNode: + """ + Get the root TimerNode of the timer_stack (or the global TimerStack if not specified) + """ + timer_stack = timer_stack or _global_timer_stack + return timer_stack.get_root() + + +def reset_timers(timer_stack: TimerStack = None) -> None: + """ + Reset the timer_stack (or the global TimerStack if not specified) + """ + timer_stack = timer_stack or _global_timer_stack + timer_stack.reset() diff --git a/ml-agents-envs/setup.py b/ml-agents-envs/setup.py index 66b7357f8b..6edacf1721 100644 --- a/ml-agents-envs/setup.py +++ b/ml-agents-envs/setup.py @@ -5,7 +5,7 @@ setup( name="mlagents_envs", - version="0.8.2", + version="0.9.0", description="Unity Machine Learning Agents Interface", url="https://github.com/Unity-Technologies/ml-agents", author="Unity Technologies", @@ -24,7 +24,7 @@ "pytest>=3.2.2,<4.0.0", "protobuf>=3.6,<3.7", "grpcio>=1.11.0,<1.12.0", - "cloudpickle==0.8.1", + "cloudpickle", ], python_requires=">=3.5,<3.8", ) diff --git a/ml-agents/mlagents/trainers/__init__.py b/ml-agents/mlagents/trainers/__init__.py index cf18c6e1e4..dd957220f2 100644 --- a/ml-agents/mlagents/trainers/__init__.py +++ b/ml-agents/mlagents/trainers/__init__.py @@ -1,11 +1,10 @@ -from .action_info import * from .buffer import * from .curriculum import * from .meta_curriculum import * from .models import * from .trainer_metrics import * from .trainer import * -from .policy import * +from .tf_policy import * from .trainer_controller import * from .bc.models import * from .bc.offline_trainer import * diff --git a/ml-agents/mlagents/trainers/barracuda.py b/ml-agents/mlagents/trainers/barracuda.py index afa2a8aaf1..dba755186d 100644 --- a/ml-agents/mlagents/trainers/barracuda.py +++ b/ml-agents/mlagents/trainers/barracuda.py @@ -9,6 +9,7 @@ BARRACUDA_VERSION = 16 + # Definition of Barracuda model class Model: def __init__(self): @@ -119,7 +120,7 @@ def topologicalSortUtil(self, v, visited, stack): # Recur for all the vertices adjacent to this vertex for i in self.graph[v]: - if visited[i] == False: + if not visited[i]: self.topologicalSortUtil(i, visited, stack) # Push current vertex to stack which stores result @@ -135,7 +136,7 @@ def topologicalSort(self): # Call the recursive helper function to store Topological # Sort starting from all vertices one by one for i in range(self.V): - if visited[i] == False: + if not visited[i]: self.topologicalSortUtil(i, visited, stack) # print(stack) @@ -523,7 +524,7 @@ def write_int64(self, d): def write_shape(self, s): self.write_int32(len(s)) for el in s: - self.write_int32(el if el != None else -1) + self.write_int32(el if el is not None else -1) def close(self): self.f.close() @@ -560,7 +561,7 @@ def write(model, filename): w.write_int32(len(model.layers)) for l in model.layers: - assert not l.name in l.inputs + assert l.name not in l.inputs w.write_str(l.name) w.write_int32(l.type) diff --git a/ml-agents/mlagents/trainers/bc/online_trainer.py b/ml-agents/mlagents/trainers/bc/online_trainer.py index 935a46f830..44aa6ad2c4 100644 --- 
a/ml-agents/mlagents/trainers/bc/online_trainer.py +++ b/ml-agents/mlagents/trainers/bc/online_trainer.py @@ -6,6 +6,7 @@ import numpy as np from mlagents.envs import AllBrainInfo +from mlagents.trainers import ActionInfoOutputs from mlagents.trainers.bc.trainer import BCTrainer logger = logging.getLogger("mlagents.trainers") @@ -62,8 +63,11 @@ def __str__(self): ) def add_experiences( - self, curr_info: AllBrainInfo, next_info: AllBrainInfo, take_action_outputs - ): + self, + curr_info: AllBrainInfo, + next_info: AllBrainInfo, + take_action_outputs: ActionInfoOutputs, + ) -> None: """ Adds experiences to each agent's experience history. :param curr_info: Current AllBrainInfo (Dictionary of all current brains and corresponding BrainInfo). @@ -124,7 +128,9 @@ def add_experiences( curr_info, next_info, take_action_outputs ) - def process_experiences(self, current_info: AllBrainInfo, next_info: AllBrainInfo): + def process_experiences( + self, current_info: AllBrainInfo, next_info: AllBrainInfo + ) -> None: """ Checks agent histories for processing condition, and processes them as necessary. Processing involves calculating value and advantage targets for model updating step. diff --git a/ml-agents/mlagents/trainers/bc/policy.py b/ml-agents/mlagents/trainers/bc/policy.py index 6944a794f4..10faf7fab9 100644 --- a/ml-agents/mlagents/trainers/bc/policy.py +++ b/ml-agents/mlagents/trainers/bc/policy.py @@ -2,12 +2,12 @@ import numpy as np from mlagents.trainers.bc.models import BehavioralCloningModel -from mlagents.trainers.policy import Policy +from mlagents.trainers.tf_policy import TFPolicy logger = logging.getLogger("mlagents.trainers") -class BCPolicy(Policy): +class BCPolicy(TFPolicy): def __init__(self, seed, brain, trainer_parameters, load): """ :param seed: Random seed. @@ -57,7 +57,7 @@ def evaluate(self, brain_info): self.model.sequence_length: 1, } - feed_dict = self._fill_eval_dict(feed_dict, brain_info) + feed_dict = self.fill_eval_dict(feed_dict, brain_info) if self.use_recurrent: if brain_info.memories.shape[1] == 0: brain_info.memories = self.make_empty_memory(len(brain_info.agents)) diff --git a/ml-agents/mlagents/trainers/bc/trainer.py b/ml-agents/mlagents/trainers/bc/trainer.py index f57ce47aaa..9b2c1553cc 100644 --- a/ml-agents/mlagents/trainers/bc/trainer.py +++ b/ml-agents/mlagents/trainers/bc/trainer.py @@ -8,6 +8,7 @@ import tensorflow as tf from mlagents.envs import AllBrainInfo +from mlagents.trainers import ActionInfoOutputs from mlagents.trainers.bc.policy import BCPolicy from mlagents.trainers.buffer import Buffer from mlagents.trainers.trainer import Trainer @@ -66,27 +67,19 @@ def get_step(self): """ return self.policy.get_current_step() - @property - def get_last_reward(self): - """ - Returns the last reward the trainer has had - :return: the new last reward - """ - if len(self.stats["Environment/Cumulative Reward"]) > 0: - return np.mean(self.stats["Environment/Cumulative Reward"]) - else: - return 0 - - def increment_step_and_update_last_reward(self): + def increment_step(self): """ - Increment the step count of the trainer and Updates the last reward + Increment the step count of the trainer """ self.policy.increment_step() return def add_experiences( - self, curr_info: AllBrainInfo, next_info: AllBrainInfo, take_action_outputs - ): + self, + curr_info: AllBrainInfo, + next_info: AllBrainInfo, + take_action_outputs: ActionInfoOutputs, + ) -> None: """ Adds experiences to each agent's experience history. 
:param curr_info: Current AllBrainInfo (Dictionary of all current brains and corresponding BrainInfo). @@ -114,7 +107,9 @@ def add_experiences( self.episode_steps[agent_id] = 0 self.episode_steps[agent_id] += 1 - def process_experiences(self, current_info: AllBrainInfo, next_info: AllBrainInfo): + def process_experiences( + self, current_info: AllBrainInfo, next_info: AllBrainInfo + ) -> None: """ Checks agent histories for processing condition, and processes them as necessary. Processing involves calculating value and advantage targets for model updating step. @@ -131,6 +126,7 @@ def process_experiences(self, current_info: AllBrainInfo, next_info: AllBrainInf self.stats["Environment/Episode Length"].append( self.episode_steps.get(agent_id, 0) ) + self.reward_buffer.appendleft(self.cumulative_rewards.get(agent_id, 0)) self.cumulative_rewards[agent_id] = 0 self.episode_steps[agent_id] = 0 diff --git a/ml-agents/mlagents/trainers/buffer.py b/ml-agents/mlagents/trainers/buffer.py index 9096fb33d1..e873d0f593 100644 --- a/ml-agents/mlagents/trainers/buffer.py +++ b/ml-agents/mlagents/trainers/buffer.py @@ -38,9 +38,9 @@ def __str__(self): def append(self, element, padding_value=0): """ - Adds an element to this list. Also lets you change the padding + Adds an element to this list. Also lets you change the padding type, so that it can be set on append (e.g. action_masks should - be padded with 1.) + be padded with 1.) :param element: The element to append to the list. :param padding_value: The value used to pad when get_batch is called. """ @@ -178,13 +178,13 @@ def check_length(self, key_list): """ if len(key_list) < 2: return True - l = None + length = None for key in key_list: if key not in self.keys(): return False - if (l is not None) and (l != len(self[key])): + if (length is not None) and (length != len(self[key])): return False - l = len(self[key]) + length = len(self[key]) return True def shuffle(self, key_list=None): diff --git a/ml-agents/mlagents/trainers/components/__init__.py b/ml-agents/mlagents/trainers/components/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/ml-agents/mlagents/trainers/components/bc/__init__.py b/ml-agents/mlagents/trainers/components/bc/__init__.py new file mode 100644 index 0000000000..159875b09f --- /dev/null +++ b/ml-agents/mlagents/trainers/components/bc/__init__.py @@ -0,0 +1 @@ +from .module import BCModule diff --git a/ml-agents/mlagents/trainers/components/bc/model.py b/ml-agents/mlagents/trainers/components/bc/model.py new file mode 100644 index 0000000000..230613013d --- /dev/null +++ b/ml-agents/mlagents/trainers/components/bc/model.py @@ -0,0 +1,80 @@ +import tensorflow as tf +import numpy as np +from mlagents.trainers.models import LearningModel + + +class BCModel(object): + def __init__( + self, + policy_model: LearningModel, + learning_rate: float = 3e-4, + anneal_steps: int = 0, + ): + """ + Tensorflow operations to perform Behavioral Cloning on a Policy model + :param policy_model: The policy of the learning algorithm + :param lr: The initial learning Rate for behavioral cloning + :param anneal_steps: Number of steps over which to anneal BC training + """ + self.policy_model = policy_model + self.expert_visual_in = self.policy_model.visual_in + self.obs_in_expert = self.policy_model.vector_in + self.make_inputs() + self.create_loss(learning_rate, anneal_steps) + + def make_inputs(self) -> None: + """ + Creates the input layers for the discriminator + """ + self.done_expert = tf.placeholder(shape=[None, 1], 
dtype=tf.float32) + self.done_policy = tf.placeholder(shape=[None, 1], dtype=tf.float32) + + if self.policy_model.brain.vector_action_space_type == "continuous": + action_length = self.policy_model.act_size[0] + self.action_in_expert = tf.placeholder( + shape=[None, action_length], dtype=tf.float32 + ) + self.expert_action = tf.identity(self.action_in_expert) + else: + action_length = len(self.policy_model.act_size) + self.action_in_expert = tf.placeholder( + shape=[None, action_length], dtype=tf.int32 + ) + self.expert_action = tf.concat( + [ + tf.one_hot(self.action_in_expert[:, i], act_size) + for i, act_size in enumerate(self.policy_model.act_size) + ], + axis=1, + ) + + def create_loss(self, learning_rate: float, anneal_steps: int) -> None: + """ + Creates the loss and update nodes for the BC module + :param learning_rate: The learning rate for the optimizer + :param anneal_steps: Number of steps over which to anneal the learning_rate + """ + selected_action = self.policy_model.output + if self.policy_model.brain.vector_action_space_type == "continuous": + self.loss = tf.reduce_mean( + tf.squared_difference(selected_action, self.expert_action) + ) + else: + log_probs = self.policy_model.all_log_probs + self.loss = tf.reduce_mean( + -tf.log(tf.nn.softmax(log_probs) + 1e-7) * self.expert_action + ) + + if anneal_steps > 0: + self.annealed_learning_rate = tf.train.polynomial_decay( + learning_rate, + self.policy_model.global_step, + anneal_steps, + 0.0, + power=1.0, + ) + else: + self.annealed_learning_rate = learning_rate + + optimizer = tf.train.AdamOptimizer(learning_rate=self.annealed_learning_rate) + self.update_batch = optimizer.minimize(self.loss) diff --git a/ml-agents/mlagents/trainers/components/bc/module.py b/ml-agents/mlagents/trainers/components/bc/module.py new file mode 100644 index 0000000000..f0fb18b57e --- /dev/null +++ b/ml-agents/mlagents/trainers/components/bc/module.py @@ -0,0 +1,173 @@ +from typing import Dict, Any +import numpy as np + +from mlagents.trainers.tf_policy import TFPolicy +from .model import BCModel +from mlagents.trainers.demo_loader import demo_to_buffer +from mlagents.trainers.trainer import UnityTrainerException + + +class BCModule: + def __init__( + self, + policy: TFPolicy, + policy_learning_rate: float, + default_batch_size: int, + default_num_epoch: int, + strength: float, + demo_path: str, + steps: int, + batch_size: int = None, + num_epoch: int = None, + samples_per_update: int = 0, + ): + """ + A BC trainer that can be used inline with RL, especially for pretraining. + :param policy: The policy of the learning model + :param policy_learning_rate: The initial Learning Rate of the policy. Used to set an appropriate learning rate + for the pretrainer. + :param default_batch_size: The default batch size to use if batch_size isn't provided. + :param default_num_epoch: The default num_epoch to use if num_epoch isn't provided. + :param strength: The proportion of learning rate used to update through BC. + :param steps: The number of steps to anneal BC training over. 0 for continuous training. + :param demo_path: The path to the demonstration file. + :param batch_size: The batch size to use during BC training. + :param num_epoch: Number of epochs to train for during each update. + :param samples_per_update: Maximum number of samples to train on during each pretraining update. 
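A rough sketch of how this module is wired up, assuming an already-initialized `TFPolicy` named `policy` and a hypothetical demo path; the actual call site (inside the trainer) is outside this excerpt:

```python
from mlagents.trainers.components.bc import BCModule

# Keys mirror the "pretraining" section of trainer_config checked by check_config below.
pretraining_config = {
    "strength": 0.5,                     # fraction of the policy learning rate used for BC
    "demo_path": "./demos/Expert.demo",  # placeholder path
    "steps": 10000,                      # anneal BC over this many steps (0 = no annealing)
}
BCModule.check_config(pretraining_config)  # raises UnityTrainerException if a required key is missing

bc_module = BCModule(
    policy,
    policy_learning_rate=3e-4,
    default_batch_size=64,
    default_num_epoch=3,
    **pretraining_config,
)
stats = bc_module.update()  # e.g. {"Losses/Pretraining Loss": ...}
```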
+ """ + self.policy = policy + self.current_lr = policy_learning_rate * strength + self.model = BCModel(policy.model, self.current_lr, steps) + _, self.demonstration_buffer = demo_to_buffer(demo_path, policy.sequence_length) + + self.batch_size = batch_size if batch_size else default_batch_size + self.num_epoch = num_epoch if num_epoch else default_num_epoch + self.n_sequences = max( + min( + self.batch_size, len(self.demonstration_buffer.update_buffer["actions"]) + ) + // policy.sequence_length, + 1, + ) + + self.has_updated = False + self.use_recurrent = self.policy.use_recurrent + self.samples_per_update = samples_per_update + self.out_dict = { + "loss": self.model.loss, + "update": self.model.update_batch, + "learning_rate": self.model.annealed_learning_rate, + } + + @staticmethod + def check_config(config_dict: Dict[str, Any]) -> None: + """ + Check the pretraining config for the required keys. + :param config_dict: Pretraining section of trainer_config + """ + param_keys = ["strength", "demo_path", "steps"] + for k in param_keys: + if k not in config_dict: + raise UnityTrainerException( + "The required pre-training hyper-parameter {0} was not defined. Please check your \ + trainer YAML file.".format( + k + ) + ) + + def update(self) -> Dict[str, Any]: + """ + Updates model using buffer. + :param max_batches: The maximum number of batches to use per update. + :return: The loss of the update. + """ + # Don't continue training if the learning rate has reached 0, to reduce training time. + if self.current_lr <= 0: + return {"Losses/Pretraining Loss": 0} + + batch_losses = [] + possible_demo_batches = ( + len(self.demonstration_buffer.update_buffer["actions"]) // self.n_sequences + ) + possible_batches = possible_demo_batches + + max_batches = self.samples_per_update // self.n_sequences + + n_epoch = self.num_epoch + for _ in range(n_epoch): + self.demonstration_buffer.update_buffer.shuffle() + if max_batches == 0: + num_batches = possible_batches + else: + num_batches = min(possible_batches, max_batches) + for i in range(num_batches): + demo_update_buffer = self.demonstration_buffer.update_buffer + start = i * self.n_sequences + end = (i + 1) * self.n_sequences + mini_batch_demo = demo_update_buffer.make_mini_batch(start, end) + run_out = self._update_batch(mini_batch_demo, self.n_sequences) + loss = run_out["loss"] + self.current_lr = run_out["learning_rate"] + batch_losses.append(loss) + self.has_updated = True + update_stats = {"Losses/Pretraining Loss": np.mean(batch_losses)} + return update_stats + + def _update_batch( + self, mini_batch_demo: Dict[str, Any], n_sequences: int + ) -> Dict[str, Any]: + """ + Helper function for update_batch. 
+ """ + feed_dict = { + self.policy.model.batch_size: n_sequences, + self.policy.model.sequence_length: self.policy.sequence_length, + } + if self.policy.model.brain.vector_action_space_type == "continuous": + feed_dict[self.model.action_in_expert] = mini_batch_demo["actions"].reshape( + [-1, self.policy.model.brain.vector_action_space_size[0]] + ) + feed_dict[self.policy.model.epsilon] = np.random.normal( + size=(1, self.policy.model.act_size[0]) + ) + else: + feed_dict[self.model.action_in_expert] = mini_batch_demo["actions"].reshape( + [-1, len(self.policy.model.brain.vector_action_space_size)] + ) + feed_dict[self.policy.model.action_masks] = np.ones( + ( + self.n_sequences, + sum(self.policy.model.brain.vector_action_space_size), + ) + ) + if self.policy.model.brain.vector_observation_space_size > 0: + apparent_obs_size = ( + self.policy.model.brain.vector_observation_space_size + * self.policy.model.brain.num_stacked_vector_observations + ) + feed_dict[self.policy.model.vector_in] = mini_batch_demo[ + "vector_obs" + ].reshape([-1, apparent_obs_size]) + for i, _ in enumerate(self.policy.model.visual_in): + visual_obs = mini_batch_demo["visual_obs%d" % i] + if self.policy.sequence_length > 1 and self.policy.use_recurrent: + (_batch, _seq, _w, _h, _c) = visual_obs.shape + feed_dict[self.policy.model.visual_in[i]] = visual_obs.reshape( + [-1, _w, _h, _c] + ) + else: + feed_dict[self.policy.model.visual_in[i]] = visual_obs + if self.use_recurrent: + feed_dict[self.policy.model.memory_in] = np.zeros( + [self.n_sequences, self.policy.m_size] + ) + if not self.policy.model.brain.vector_action_space_type == "continuous": + feed_dict[self.policy.model.prev_action] = mini_batch_demo[ + "prev_action" + ].reshape([-1, len(self.policy.model.act_size)]) + + network_out = self.policy.sess.run( + list(self.out_dict.values()), feed_dict=feed_dict + ) + run_out = dict(zip(list(self.out_dict.keys()), network_out)) + return run_out diff --git a/ml-agents/mlagents/trainers/components/reward_signals/__init__.py b/ml-agents/mlagents/trainers/components/reward_signals/__init__.py new file mode 100644 index 0000000000..ce6aee0671 --- /dev/null +++ b/ml-agents/mlagents/trainers/components/reward_signals/__init__.py @@ -0,0 +1 @@ +from .reward_signal import * diff --git a/ml-agents/mlagents/trainers/components/reward_signals/curiosity/__init__.py b/ml-agents/mlagents/trainers/components/reward_signals/curiosity/__init__.py new file mode 100644 index 0000000000..4d3549a137 --- /dev/null +++ b/ml-agents/mlagents/trainers/components/reward_signals/curiosity/__init__.py @@ -0,0 +1 @@ +from .signal import CuriosityRewardSignal diff --git a/ml-agents/mlagents/trainers/components/reward_signals/curiosity/model.py b/ml-agents/mlagents/trainers/components/reward_signals/curiosity/model.py new file mode 100644 index 0000000000..fd60b6306f --- /dev/null +++ b/ml-agents/mlagents/trainers/components/reward_signals/curiosity/model.py @@ -0,0 +1,185 @@ +from typing import List, Tuple +import tensorflow as tf +from mlagents.trainers.models import LearningModel + + +class CuriosityModel(object): + def __init__( + self, + policy_model: LearningModel, + encoding_size: int = 128, + learning_rate: float = 3e-4, + ): + """ + Creates the curiosity model for the Curiosity reward Generator + :param policy_model: The model being used by the learning policy + :param encoding_size: The size of the encoding for the Curiosity module + :param learning_rate: The learning rate for the curiosity module + """ + self.encoding_size = encoding_size 
+ self.policy_model = policy_model + self.next_visual_in: List[tf.Tensor] = [] + encoded_state, encoded_next_state = self.create_curiosity_encoders() + self.create_inverse_model(encoded_state, encoded_next_state) + self.create_forward_model(encoded_state, encoded_next_state) + self.create_loss(learning_rate) + + def create_curiosity_encoders(self) -> Tuple[tf.Tensor, tf.Tensor]: + """ + Creates state encoders for current and future observations. + Used for implementation of Curiosity-driven Exploration by Self-supervised Prediction + See https://arxiv.org/abs/1705.05363 for more details. + :return: current and future state encoder tensors. + """ + encoded_state_list = [] + encoded_next_state_list = [] + + if self.policy_model.vis_obs_size > 0: + self.next_visual_in = [] + visual_encoders = [] + next_visual_encoders = [] + for i in range(self.policy_model.vis_obs_size): + # Create input ops for next (t+1) visual observations. + next_visual_input = LearningModel.create_visual_input( + self.policy_model.brain.camera_resolutions[i], + name="next_visual_observation_" + str(i), + ) + self.next_visual_in.append(next_visual_input) + + # Create the encoder ops for current and next visual input. + # Note that these encoders are siamese. + encoded_visual = self.policy_model.create_visual_observation_encoder( + self.policy_model.visual_in[i], + self.encoding_size, + LearningModel.swish, + 1, + "stream_{}_visual_obs_encoder".format(i), + False, + ) + + encoded_next_visual = self.policy_model.create_visual_observation_encoder( + self.next_visual_in[i], + self.encoding_size, + LearningModel.swish, + 1, + "stream_{}_visual_obs_encoder".format(i), + True, + ) + visual_encoders.append(encoded_visual) + next_visual_encoders.append(encoded_next_visual) + + hidden_visual = tf.concat(visual_encoders, axis=1) + hidden_next_visual = tf.concat(next_visual_encoders, axis=1) + encoded_state_list.append(hidden_visual) + encoded_next_state_list.append(hidden_next_visual) + + if self.policy_model.vec_obs_size > 0: + # Create the encoder ops for current and next vector input. + # Note that these encoders are siamese. + # Create input op for next (t+1) vector observation. + self.next_vector_in = tf.placeholder( + shape=[None, self.policy_model.vec_obs_size], + dtype=tf.float32, + name="next_vector_observation", + ) + + encoded_vector_obs = self.policy_model.create_vector_observation_encoder( + self.policy_model.vector_in, + self.encoding_size, + LearningModel.swish, + 2, + "vector_obs_encoder", + False, + ) + encoded_next_vector_obs = self.policy_model.create_vector_observation_encoder( + self.next_vector_in, + self.encoding_size, + LearningModel.swish, + 2, + "vector_obs_encoder", + True, + ) + encoded_state_list.append(encoded_vector_obs) + encoded_next_state_list.append(encoded_next_vector_obs) + + encoded_state = tf.concat(encoded_state_list, axis=1) + encoded_next_state = tf.concat(encoded_next_state_list, axis=1) + return encoded_state, encoded_next_state + + def create_inverse_model( + self, encoded_state: tf.Tensor, encoded_next_state: tf.Tensor + ) -> None: + """ + Creates inverse model TensorFlow ops for Curiosity module. + Predicts action taken given current and future encoded states. + :param encoded_state: Tensor corresponding to encoded current state. + :param encoded_next_state: Tensor corresponding to encoded next state. 
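In equations, the inverse head built here, the forward head in `create_forward_model`, and the weights hard-coded in `create_loss` below amount to:

$$
\begin{aligned}
L_{\text{inverse}} &= \mathbb{E}\big[\operatorname{CE}(\hat{a}_t,\, a_t)\big]
  \quad\text{(mean squared error for continuous actions)}\\
r^{\text{curiosity}}_t &= \tfrac{1}{2}\,\big\lVert \hat{\phi}(s_{t+1}) - \phi(s_{t+1}) \big\rVert_2^2,
  \qquad L_{\text{forward}} = \mathbb{E}\big[r^{\text{curiosity}}_t\big]\\
L_{\text{curiosity}} &= 10\,\big(0.2\,L_{\text{forward}} + 0.8\,L_{\text{inverse}}\big)
\end{aligned}
$$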
+ """ + combined_input = tf.concat([encoded_state, encoded_next_state], axis=1) + hidden = tf.layers.dense(combined_input, 256, activation=LearningModel.swish) + if self.policy_model.brain.vector_action_space_type == "continuous": + pred_action = tf.layers.dense( + hidden, self.policy_model.act_size[0], activation=None + ) + squared_difference = tf.reduce_sum( + tf.squared_difference(pred_action, self.policy_model.selected_actions), + axis=1, + ) + self.inverse_loss = tf.reduce_mean( + tf.dynamic_partition(squared_difference, self.policy_model.mask, 2)[1] + ) + else: + pred_action = tf.concat( + [ + tf.layers.dense( + hidden, self.policy_model.act_size[i], activation=tf.nn.softmax + ) + for i in range(len(self.policy_model.act_size)) + ], + axis=1, + ) + cross_entropy = tf.reduce_sum( + -tf.log(pred_action + 1e-10) * self.policy_model.selected_actions, + axis=1, + ) + self.inverse_loss = tf.reduce_mean( + tf.dynamic_partition(cross_entropy, self.policy_model.mask, 2)[1] + ) + + def create_forward_model( + self, encoded_state: tf.Tensor, encoded_next_state: tf.Tensor + ) -> None: + """ + Creates forward model TensorFlow ops for Curiosity module. + Predicts encoded future state based on encoded current state and given action. + :param encoded_state: Tensor corresponding to encoded current state. + :param encoded_next_state: Tensor corresponding to encoded next state. + """ + combined_input = tf.concat( + [encoded_state, self.policy_model.selected_actions], axis=1 + ) + hidden = tf.layers.dense(combined_input, 256, activation=LearningModel.swish) + pred_next_state = tf.layers.dense( + hidden, + self.encoding_size + * ( + self.policy_model.vis_obs_size + int(self.policy_model.vec_obs_size > 0) + ), + activation=None, + ) + squared_difference = 0.5 * tf.reduce_sum( + tf.squared_difference(pred_next_state, encoded_next_state), axis=1 + ) + self.intrinsic_reward = squared_difference + self.forward_loss = tf.reduce_mean( + tf.dynamic_partition(squared_difference, self.policy_model.mask, 2)[1] + ) + + def create_loss(self, learning_rate: float) -> None: + """ + Creates the loss node of the model as well as the update_batch optimizer to update the model. + :param learning_rate: The learning rate for the optimizer. + """ + self.loss = 10 * (0.2 * self.forward_loss + 0.8 * self.inverse_loss) + optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate) + self.update_batch = optimizer.minimize(self.loss) diff --git a/ml-agents/mlagents/trainers/components/reward_signals/curiosity/signal.py b/ml-agents/mlagents/trainers/components/reward_signals/curiosity/signal.py new file mode 100644 index 0000000000..f384c6a35c --- /dev/null +++ b/ml-agents/mlagents/trainers/components/reward_signals/curiosity/signal.py @@ -0,0 +1,193 @@ +from typing import Any, Dict, List +import numpy as np +from mlagents.envs.brain import BrainInfo + +from mlagents.trainers.buffer import Buffer +from mlagents.trainers.components.reward_signals import RewardSignal, RewardSignalResult +from mlagents.trainers.components.reward_signals.curiosity.model import CuriosityModel +from mlagents.trainers.tf_policy import TFPolicy + + +class CuriosityRewardSignal(RewardSignal): + def __init__( + self, + policy: TFPolicy, + strength: float, + gamma: float, + encoding_size: int = 128, + learning_rate: float = 3e-4, + num_epoch: int = 3, + ): + """ + Creates the Curiosity reward generator + :param policy: The Learning Policy + :param strength: The scaling parameter for the reward. 
The scaled reward will be the unscaled + reward multiplied by the strength parameter + :param gamma: The time discounting factor used for this reward. + :param encoding_size: The size of the hidden encoding layer for the ICM + :param learning_rate: The learning rate for the ICM. + :param num_epoch: The number of epochs to train over the training buffer for the ICM. + """ + super().__init__(policy, strength, gamma) + self.model = CuriosityModel( + policy.model, encoding_size=encoding_size, learning_rate=learning_rate + ) + self.num_epoch = num_epoch + self.use_terminal_states = False + self.update_dict = { + "forward_loss": self.model.forward_loss, + "inverse_loss": self.model.inverse_loss, + "update": self.model.update_batch, + } + self.has_updated = False + + def evaluate( + self, current_info: BrainInfo, next_info: BrainInfo + ) -> RewardSignalResult: + """ + Evaluates the reward for the agents present in current_info given the next_info + :param current_info: The current BrainInfo. + :param next_info: The BrainInfo from the next timestep. + :return: a RewardSignalResult of (scaled intrinsic reward, unscaled intrinsic reward) provided by the generator + """ + if len(current_info.agents) == 0: + return [] + + feed_dict = { + self.policy.model.batch_size: len(next_info.vector_observations), + self.policy.model.sequence_length: 1, + } + feed_dict = self.policy.fill_eval_dict(feed_dict, brain_info=current_info) + if self.policy.use_continuous_act: + feed_dict[ + self.policy.model.selected_actions + ] = next_info.previous_vector_actions + else: + feed_dict[ + self.policy.model.action_holder + ] = next_info.previous_vector_actions + for i in range(self.policy.model.vis_obs_size): + feed_dict[self.model.next_visual_in[i]] = next_info.visual_observations[i] + if self.policy.use_vec_obs: + feed_dict[self.model.next_vector_in] = next_info.vector_observations + if self.policy.use_recurrent: + if current_info.memories.shape[1] == 0: + current_info.memories = self.policy.make_empty_memory( + len(current_info.agents) + ) + feed_dict[self.policy.model.memory_in] = current_info.memories + unscaled_reward = self.policy.sess.run( + self.model.intrinsic_reward, feed_dict=feed_dict + ) + scaled_reward = np.clip( + unscaled_reward * float(self.has_updated) * self.strength, 0, 1 + ) + return RewardSignalResult(scaled_reward, unscaled_reward) + + @classmethod + def check_config( + cls, config_dict: Dict[str, Any], param_keys: List[str] = None + ) -> None: + """ + Checks the config and throw an exception if a hyperparameter is missing. Curiosity requires strength, + gamma, and encoding size at minimum. + """ + param_keys = ["strength", "gamma", "encoding_size"] + super().check_config(config_dict, param_keys) + + def update(self, update_buffer: Buffer, num_sequences: int) -> Dict[str, float]: + """ + Updates Curiosity model using training buffer. Divides training buffer into mini batches and performs + gradient descent. + :param update_buffer: Update buffer from which to pull data from. + :param num_sequences: Number of sequences in the update buffer. + :return: Dict of stats that should be reported to Tensorboard. 
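A hedged sketch of standalone use, assuming the surrounding trainer supplies `policy` (a `TFPolicy`), `current_info`/`next_info` (`BrainInfo`), an update `Buffer`, and `num_sequences`:

```python
from mlagents.trainers.components.reward_signals.curiosity import CuriosityRewardSignal

config = {"strength": 0.02, "gamma": 0.99, "encoding_size": 128}
CuriosityRewardSignal.check_config(config)       # strength, gamma, encoding_size are required
curiosity = CuriosityRewardSignal(policy, **config)

# RewardSignalResult is a namedtuple of (scaled_reward, unscaled_reward)
scaled, unscaled = curiosity.evaluate(current_info, next_info)
stats = curiosity.update(update_buffer, num_sequences)
# stats contains "Losses/Curiosity Forward Loss" and "Losses/Curiosity Inverse Loss"
```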
+ """ + forward_total: List[float] = [] + inverse_total: List[float] = [] + for _ in range(self.num_epoch): + update_buffer.shuffle() + buffer = update_buffer + for l in range(len(update_buffer["actions"]) // num_sequences): + start = l * num_sequences + end = (l + 1) * num_sequences + run_out_curio = self._update_batch( + buffer.make_mini_batch(start, end), num_sequences + ) + inverse_total.append(run_out_curio["inverse_loss"]) + forward_total.append(run_out_curio["forward_loss"]) + + update_stats = { + "Losses/Curiosity Forward Loss": np.mean(forward_total), + "Losses/Curiosity Inverse Loss": np.mean(inverse_total), + } + return update_stats + + def _update_batch( + self, mini_batch: Dict[str, np.ndarray], num_sequences: int + ) -> Dict[str, float]: + """ + Updates model using buffer. + :param num_sequences: Number of trajectories in batch. + :param mini_batch: Experience batch. + :return: Output from update process. + """ + feed_dict = { + self.policy.model.batch_size: num_sequences, + self.policy.model.sequence_length: self.policy.sequence_length, + self.policy.model.mask_input: mini_batch["masks"].flatten(), + self.policy.model.advantage: mini_batch["advantages"].reshape([-1, 1]), + self.policy.model.all_old_log_probs: mini_batch["action_probs"].reshape( + [-1, sum(self.policy.model.act_size)] + ), + } + if self.policy.use_continuous_act: + feed_dict[self.policy.model.output_pre] = mini_batch["actions_pre"].reshape( + [-1, self.policy.model.act_size[0]] + ) + feed_dict[self.policy.model.epsilon] = mini_batch[ + "random_normal_epsilon" + ].reshape([-1, self.policy.model.act_size[0]]) + else: + feed_dict[self.policy.model.action_holder] = mini_batch["actions"].reshape( + [-1, len(self.policy.model.act_size)] + ) + if self.policy.use_recurrent: + feed_dict[self.policy.model.prev_action] = mini_batch[ + "prev_action" + ].reshape([-1, len(self.policy.model.act_size)]) + feed_dict[self.policy.model.action_masks] = mini_batch[ + "action_mask" + ].reshape([-1, sum(self.policy.brain.vector_action_space_size)]) + if self.policy.use_vec_obs: + feed_dict[self.policy.model.vector_in] = mini_batch["vector_obs"].reshape( + [-1, self.policy.vec_obs_size] + ) + feed_dict[self.model.next_vector_in] = mini_batch["next_vector_in"].reshape( + [-1, self.policy.vec_obs_size] + ) + if self.policy.model.vis_obs_size > 0: + for i, _ in enumerate(self.policy.model.visual_in): + _obs = mini_batch["visual_obs%d" % i] + if self.policy.sequence_length > 1 and self.policy.use_recurrent: + (_batch, _seq, _w, _h, _c) = _obs.shape + feed_dict[self.policy.model.visual_in[i]] = _obs.reshape( + [-1, _w, _h, _c] + ) + else: + feed_dict[self.policy.model.visual_in[i]] = _obs + for i, _ in enumerate(self.policy.model.visual_in): + _obs = mini_batch["next_visual_obs%d" % i] + if self.policy.sequence_length > 1 and self.policy.use_recurrent: + (_batch, _seq, _w, _h, _c) = _obs.shape + feed_dict[self.model.next_visual_in[i]] = _obs.reshape( + [-1, _w, _h, _c] + ) + else: + feed_dict[self.model.next_visual_in[i]] = _obs + if self.policy.use_recurrent: + mem_in = mini_batch["memory"][:, 0, :] + feed_dict[self.policy.model.memory_in] = mem_in + self.has_updated = True + run_out = self.policy._execute_model(feed_dict, self.update_dict) + return run_out diff --git a/ml-agents/mlagents/trainers/components/reward_signals/extrinsic/__init__.py b/ml-agents/mlagents/trainers/components/reward_signals/extrinsic/__init__.py new file mode 100644 index 0000000000..9f8dc9d33b --- /dev/null +++ 
b/ml-agents/mlagents/trainers/components/reward_signals/extrinsic/__init__.py @@ -0,0 +1 @@ +from .signal import ExtrinsicRewardSignal diff --git a/ml-agents/mlagents/trainers/components/reward_signals/extrinsic/signal.py b/ml-agents/mlagents/trainers/components/reward_signals/extrinsic/signal.py new file mode 100644 index 0000000000..b229cfaf00 --- /dev/null +++ b/ml-agents/mlagents/trainers/components/reward_signals/extrinsic/signal.py @@ -0,0 +1,49 @@ +from typing import Any, Dict, List +import numpy as np +from mlagents.envs.brain import BrainInfo + +from mlagents.trainers.buffer import Buffer +from mlagents.trainers.components.reward_signals import RewardSignal, RewardSignalResult +from mlagents.trainers.tf_policy import TFPolicy + + +class ExtrinsicRewardSignal(RewardSignal): + def __init__(self, policy: TFPolicy, strength: float, gamma: float): + """ + The extrinsic reward generator. Returns the reward received by the environment + :param policy: The Policy object (e.g. PPOPolicy) that this Reward Signal will apply to. + :param strength: The strength of the reward. The reward's raw value will be multiplied by this value. + :param gamma: The time discounting factor used for this reward. + :return: An ExtrinsicRewardSignal object. + """ + super().__init__(policy, strength, gamma) + + @classmethod + def check_config( + cls, config_dict: Dict[str, Any], param_keys: List[str] = None + ) -> None: + """ + Checks the config and throw an exception if a hyperparameter is missing. Extrinsic requires strength and gamma + at minimum. + """ + param_keys = ["strength", "gamma"] + super().check_config(config_dict, param_keys) + + def evaluate( + self, current_info: BrainInfo, next_info: BrainInfo + ) -> RewardSignalResult: + """ + Evaluates the reward for the agents present in current_info given the next_info + :param current_info: The current BrainInfo. + :param next_info: The BrainInfo from the next timestep. + :return: a RewardSignalResult of (scaled intrinsic reward, unscaled intrinsic reward) provided by the generator + """ + unscaled_reward = np.array(next_info.rewards) + scaled_reward = self.strength * unscaled_reward + return RewardSignalResult(scaled_reward, unscaled_reward) + + def update(self, update_buffer: Buffer, num_sequences: int) -> Dict[str, float]: + """ + This method does nothing, as there is nothing to update. + """ + return {} diff --git a/ml-agents/mlagents/trainers/components/reward_signals/gail/__init__.py b/ml-agents/mlagents/trainers/components/reward_signals/gail/__init__.py new file mode 100644 index 0000000000..77c38345ea --- /dev/null +++ b/ml-agents/mlagents/trainers/components/reward_signals/gail/__init__.py @@ -0,0 +1 @@ +from .signal import GAILRewardSignal diff --git a/ml-agents/mlagents/trainers/components/reward_signals/gail/model.py b/ml-agents/mlagents/trainers/components/reward_signals/gail/model.py new file mode 100644 index 0000000000..e26d969da3 --- /dev/null +++ b/ml-agents/mlagents/trainers/components/reward_signals/gail/model.py @@ -0,0 +1,300 @@ +from typing import Tuple, List + +import tensorflow as tf +from mlagents.trainers.models import LearningModel + +EPSILON = 1e-7 + + +class GAILModel(object): + def __init__( + self, + policy_model: LearningModel, + h_size: int = 128, + learning_rate: float = 3e-4, + encoding_size: int = 64, + use_actions: bool = False, + use_vail: bool = False, + gradient_penalty_weight: float = 10.0, + ): + """ + The initializer for the GAIL reward generator. 
+ https://arxiv.org/abs/1606.03476 + :param policy_model: The policy of the learning algorithm + :param h_size: Size of the hidden layer for the discriminator + :param learning_rate: The learning Rate for the discriminator + :param encoding_size: The encoding size for the encoder + :param use_actions: Whether or not to use actions to discriminate + :param use_vail: Whether or not to use a variational bottleneck for the + discriminator. See https://arxiv.org/abs/1810.00821. + """ + self.h_size = h_size + self.z_size = 128 + self.alpha = 0.0005 + self.mutual_information = 0.5 + self.policy_model = policy_model + self.encoding_size = encoding_size + self.gradient_penalty_weight = gradient_penalty_weight + self.use_vail = use_vail + self.use_actions = use_actions # True # Not using actions + self.make_beta() + self.make_inputs() + self.create_network() + self.create_loss(learning_rate) + + def make_beta(self) -> None: + """ + Creates the beta parameter and its updater for GAIL + """ + self.beta = tf.get_variable( + "gail_beta", + [], + trainable=False, + dtype=tf.float32, + initializer=tf.ones_initializer(), + ) + self.kl_div_input = tf.placeholder(shape=[], dtype=tf.float32) + new_beta = tf.maximum( + self.beta + self.alpha * (self.kl_div_input - self.mutual_information), + EPSILON, + ) + self.update_beta = tf.assign(self.beta, new_beta) + + def make_inputs(self) -> None: + """ + Creates the input layers for the discriminator + """ + self.done_expert = tf.placeholder(shape=[None, 1], dtype=tf.float32) + self.done_policy = tf.placeholder(shape=[None, 1], dtype=tf.float32) + + if self.policy_model.brain.vector_action_space_type == "continuous": + action_length = self.policy_model.act_size[0] + self.action_in_expert = tf.placeholder( + shape=[None, action_length], dtype=tf.float32 + ) + self.expert_action = tf.identity(self.action_in_expert) + else: + action_length = len(self.policy_model.act_size) + self.action_in_expert = tf.placeholder( + shape=[None, action_length], dtype=tf.int32 + ) + self.expert_action = tf.concat( + [ + tf.one_hot(self.action_in_expert[:, i], act_size) + for i, act_size in enumerate(self.policy_model.act_size) + ], + axis=1, + ) + + encoded_policy_list = [] + encoded_expert_list = [] + + if self.policy_model.vec_obs_size > 0: + self.obs_in_expert = tf.placeholder( + shape=[None, self.policy_model.vec_obs_size], dtype=tf.float32 + ) + if self.policy_model.normalize: + encoded_expert_list.append( + self.policy_model.normalize_vector_obs(self.obs_in_expert) + ) + encoded_policy_list.append( + self.policy_model.normalize_vector_obs(self.policy_model.vector_in) + ) + else: + encoded_expert_list.append(self.obs_in_expert) + encoded_policy_list.append(self.policy_model.vector_in) + + if self.policy_model.vis_obs_size > 0: + self.expert_visual_in: List[tf.Tensor] = [] + visual_policy_encoders = [] + visual_expert_encoders = [] + for i in range(self.policy_model.vis_obs_size): + # Create input ops for next (t+1) visual observations. 
+ visual_input = self.policy_model.create_visual_input( + self.policy_model.brain.camera_resolutions[i], + name="visual_observation_" + str(i), + ) + self.expert_visual_in.append(visual_input) + + encoded_policy_visual = self.policy_model.create_visual_observation_encoder( + self.policy_model.visual_in[i], + self.encoding_size, + LearningModel.swish, + 1, + "stream_{}_visual_obs_encoder".format(i), + False, + ) + + encoded_expert_visual = self.policy_model.create_visual_observation_encoder( + self.expert_visual_in[i], + self.encoding_size, + LearningModel.swish, + 1, + "stream_{}_visual_obs_encoder".format(i), + True, + ) + visual_policy_encoders.append(encoded_policy_visual) + visual_expert_encoders.append(encoded_expert_visual) + hidden_policy_visual = tf.concat(visual_policy_encoders, axis=1) + hidden_expert_visual = tf.concat(visual_expert_encoders, axis=1) + encoded_policy_list.append(hidden_policy_visual) + encoded_expert_list.append(hidden_expert_visual) + + self.encoded_expert = tf.concat(encoded_expert_list, axis=1) + self.encoded_policy = tf.concat(encoded_policy_list, axis=1) + + def create_encoder( + self, state_in: tf.Tensor, action_in: tf.Tensor, done_in: tf.Tensor, reuse: bool + ) -> Tuple[tf.Tensor, tf.Tensor, tf.Tensor]: + """ + Creates the encoder for the discriminator + :param state_in: The encoded observation input + :param action_in: The action input + :param done_in: The done flags input + :param reuse: If true, the weights will be shared with the previous encoder created + """ + with tf.variable_scope("GAIL_model"): + if self.use_actions: + concat_input = tf.concat([state_in, action_in, done_in], axis=1) + else: + concat_input = state_in + + hidden_1 = tf.layers.dense( + concat_input, + self.h_size, + activation=LearningModel.swish, + name="d_hidden_1", + reuse=reuse, + ) + + hidden_2 = tf.layers.dense( + hidden_1, + self.h_size, + activation=LearningModel.swish, + name="d_hidden_2", + reuse=reuse, + ) + + z_mean = None + if self.use_vail: + # Latent representation + z_mean = tf.layers.dense( + hidden_2, + self.z_size, + reuse=reuse, + name="z_mean", + kernel_initializer=LearningModel.scaled_init(0.01), + ) + + self.noise = tf.random_normal(tf.shape(z_mean), dtype=tf.float32) + + # Sampled latent code + self.z = z_mean + self.z_sigma * self.noise * self.use_noise + estimate_input = self.z + else: + estimate_input = hidden_2 + + estimate = tf.layers.dense( + estimate_input, + 1, + activation=tf.nn.sigmoid, + name="d_estimate", + reuse=reuse, + ) + return estimate, z_mean, concat_input + + def create_network(self) -> None: + """ + Helper for creating the intrinsic reward nodes + """ + if self.use_vail: + self.z_sigma = tf.get_variable( + "sigma_vail", + self.z_size, + dtype=tf.float32, + initializer=tf.ones_initializer(), + ) + self.z_sigma_sq = self.z_sigma * self.z_sigma + self.z_log_sigma_sq = tf.log(self.z_sigma_sq + EPSILON) + self.use_noise = tf.placeholder( + shape=[1], dtype=tf.float32, name="NoiseLevel" + ) + self.expert_estimate, self.z_mean_expert, _ = self.create_encoder( + self.encoded_expert, self.expert_action, self.done_expert, reuse=False + ) + self.policy_estimate, self.z_mean_policy, _ = self.create_encoder( + self.encoded_policy, + self.policy_model.selected_actions, + self.done_policy, + reuse=True, + ) + self.discriminator_score = tf.reshape( + self.policy_estimate, [-1], name="GAIL_reward" + ) + self.intrinsic_reward = -tf.log(1.0 - self.discriminator_score + EPSILON) + + def create_gradient_magnitude(self) -> tf.Tensor: + """ + Gradient penalty 
from https://arxiv.org/pdf/1704.00028. Adds stability esp. + for off-policy. Compute gradients w.r.t randomly interpolated input. + """ + expert = [self.encoded_expert, self.expert_action, self.done_expert] + policy = [ + self.encoded_policy, + self.policy_model.selected_actions, + self.done_policy, + ] + interp = [] + for _expert_in, _policy_in in zip(expert, policy): + alpha = tf.random_uniform(tf.shape(_expert_in)) + interp.append(alpha * _expert_in + (1 - alpha) * _policy_in) + + grad_estimate, _, grad_input = self.create_encoder( + interp[0], interp[1], interp[2], reuse=True + ) + + grad = tf.gradients(grad_estimate, [grad_input])[0] + + # Norm's gradient could be NaN at 0. Use our own safe_norm + safe_norm = tf.sqrt(tf.reduce_sum(grad ** 2, axis=-1) + EPSILON) + gradient_mag = tf.reduce_mean(tf.pow(safe_norm - 1, 2)) + + return gradient_mag + + def create_loss(self, learning_rate: float) -> None: + """ + Creates the loss and update nodes for the GAIL reward generator + :param learning_rate: The learning rate for the optimizer + """ + self.mean_expert_estimate = tf.reduce_mean(self.expert_estimate) + self.mean_policy_estimate = tf.reduce_mean(self.policy_estimate) + + self.discriminator_loss = -tf.reduce_mean( + tf.log(self.expert_estimate + EPSILON) + + tf.log(1.0 - self.policy_estimate + EPSILON) + ) + + if self.use_vail: + # KL divergence loss (encourage latent representation to be normal) + self.kl_loss = tf.reduce_mean( + -tf.reduce_sum( + 1 + + self.z_log_sigma_sq + - 0.5 * tf.square(self.z_mean_expert) + - 0.5 * tf.square(self.z_mean_policy) + - tf.exp(self.z_log_sigma_sq), + 1, + ) + ) + self.loss = ( + self.beta * (self.kl_loss - self.mutual_information) + + self.discriminator_loss + ) + else: + self.loss = self.discriminator_loss + + if self.gradient_penalty_weight > 0.0: + self.loss += self.gradient_penalty_weight * self.create_gradient_magnitude() + + optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate) + self.update_batch = optimizer.minimize(self.loss) diff --git a/ml-agents/mlagents/trainers/components/reward_signals/gail/signal.py b/ml-agents/mlagents/trainers/components/reward_signals/gail/signal.py new file mode 100644 index 0000000000..a7c923ac4d --- /dev/null +++ b/ml-agents/mlagents/trainers/components/reward_signals/gail/signal.py @@ -0,0 +1,272 @@ +from typing import Any, Dict, List +import logging +import numpy as np +import tensorflow as tf + +from mlagents.envs.brain import BrainInfo +from mlagents.trainers.buffer import Buffer +from mlagents.trainers.components.reward_signals import RewardSignal, RewardSignalResult +from mlagents.trainers.tf_policy import TFPolicy +from .model import GAILModel +from mlagents.trainers.demo_loader import demo_to_buffer + +LOGGER = logging.getLogger("mlagents.trainers") + + +class GAILRewardSignal(RewardSignal): + def __init__( + self, + policy: TFPolicy, + strength: float, + gamma: float, + demo_path: str, + num_epoch: int = 3, + encoding_size: int = 64, + learning_rate: float = 3e-4, + samples_per_update: int = 0, + use_actions: bool = False, + use_vail: bool = False, + ): + """ + The GAIL Reward signal generator. https://arxiv.org/abs/1606.03476 + :param policy: The policy of the learning model + :param strength: The scaling parameter for the reward. The scaled reward will be the unscaled + reward multiplied by the strength parameter + :param gamma: The time discounting factor used for this reward. 
+ :param demo_path: The path to the demonstration file + :param num_epoch: The number of epochs to train over the training buffer for the discriminator. + :param encoding_size: The size of the the hidden layers of the discriminator + :param learning_rate: The Learning Rate used during GAIL updates. + :param samples_per_update: The maximum number of samples to update during GAIL updates. + :param use_actions: Whether or not to use the actions for the discriminator. + :param use_vail: Whether or not to use a variational bottleneck for the discriminator. + See https://arxiv.org/abs/1810.00821. + """ + super().__init__(policy, strength, gamma) + self.num_epoch = num_epoch + self.samples_per_update = samples_per_update + self.use_terminal_states = False + + self.model = GAILModel( + policy.model, 128, learning_rate, encoding_size, use_actions, use_vail + ) + _, self.demonstration_buffer = demo_to_buffer(demo_path, policy.sequence_length) + self.has_updated = False + + def evaluate( + self, current_info: BrainInfo, next_info: BrainInfo + ) -> RewardSignalResult: + if len(current_info.agents) == 0: + return [] + + feed_dict: Dict[tf.Tensor, Any] = { + self.policy.model.batch_size: len(next_info.vector_observations), + self.policy.model.sequence_length: 1, + } + if self.model.use_vail: + feed_dict[self.model.use_noise] = [0] + + feed_dict = self.policy.fill_eval_dict(feed_dict, brain_info=current_info) + feed_dict[self.model.done_policy] = np.reshape(next_info.local_done, [-1, 1]) + if self.policy.use_continuous_act: + feed_dict[ + self.policy.model.selected_actions + ] = next_info.previous_vector_actions + else: + feed_dict[ + self.policy.model.action_holder + ] = next_info.previous_vector_actions + if self.policy.use_recurrent: + if current_info.memories.shape[1] == 0: + current_info.memories = self.policy.make_empty_memory( + len(current_info.agents) + ) + feed_dict[self.policy.model.memory_in] = current_info.memories + unscaled_reward = self.policy.sess.run( + self.model.intrinsic_reward, feed_dict=feed_dict + ) + scaled_reward = unscaled_reward * float(self.has_updated) * self.strength + return RewardSignalResult(scaled_reward, unscaled_reward) + + @classmethod + def check_config( + cls, config_dict: Dict[str, Any], param_keys: List[str] = None + ) -> None: + """ + Checks the config and throw an exception if a hyperparameter is missing. GAIL requires strength and gamma + at minimum. + """ + param_keys = ["strength", "gamma", "demo_path"] + super().check_config(config_dict, param_keys) + + def update(self, update_buffer: Buffer, n_sequences: int) -> Dict[str, float]: + """ + Updates model using buffer. + :param update_buffer: The policy buffer containing the trajectories for the current policy. + :param n_sequences: The number of sequences from demo and policy used in each mini batch. + :return: The loss of the update. 
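For reference, the quantities this update drives, as implemented by `GAILModel` earlier in the diff (with $\epsilon = 10^{-7}$, information constraint $I_c = 0.5$, $\beta$ step size $\alpha = 5\times10^{-4}$, and a gradient-penalty weight that defaults to 10):

$$
\begin{aligned}
r^{\text{GAIL}}_t &= -\log\big(1 - D(s_t, a_t) + \epsilon\big)\\
L_D &= -\,\mathbb{E}_{\text{expert}}\big[\log D\big] \;-\; \mathbb{E}_{\text{policy}}\big[\log(1 - D)\big]\\
L &= L_D \;+\; \beta\,(L_{\text{KL}} - I_c)\ \text{[VAIL only]} \;+\; \lambda_{\text{gp}}\,L_{\text{grad}}\\
\beta &\leftarrow \max\big(\beta + \alpha\,(\mathrm{KL} - I_c),\ \epsilon\big)
\end{aligned}
$$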
+ """ + batch_losses = [] + # Divide by 2 since we have two buffers, so we have roughly the same batch size + n_sequences = max(n_sequences // 2, 1) + possible_demo_batches = ( + len(self.demonstration_buffer.update_buffer["actions"]) // n_sequences + ) + possible_policy_batches = len(update_buffer["actions"]) // n_sequences + possible_batches = min(possible_policy_batches, possible_demo_batches) + + max_batches = self.samples_per_update // n_sequences + + kl_loss = [] + policy_estimate = [] + expert_estimate = [] + z_log_sigma_sq = [] + z_mean_expert = [] + z_mean_policy = [] + + n_epoch = self.num_epoch + for _epoch in range(n_epoch): + self.demonstration_buffer.update_buffer.shuffle() + update_buffer.shuffle() + if max_batches == 0: + num_batches = possible_batches + else: + num_batches = min(possible_batches, max_batches) + for i in range(num_batches): + demo_update_buffer = self.demonstration_buffer.update_buffer + policy_update_buffer = update_buffer + start = i * n_sequences + end = (i + 1) * n_sequences + mini_batch_demo = demo_update_buffer.make_mini_batch(start, end) + mini_batch_policy = policy_update_buffer.make_mini_batch(start, end) + run_out = self._update_batch(mini_batch_demo, mini_batch_policy) + loss = run_out["gail_loss"] + + policy_estimate.append(run_out["policy_estimate"]) + expert_estimate.append(run_out["expert_estimate"]) + if self.model.use_vail: + kl_loss.append(run_out["kl_loss"]) + z_log_sigma_sq.append(run_out["z_log_sigma_sq"]) + z_mean_policy.append(run_out["z_mean_policy"]) + z_mean_expert.append(run_out["z_mean_expert"]) + + batch_losses.append(loss) + self.has_updated = True + + print_list = ["n_epoch", "beta", "policy_estimate", "expert_estimate"] + print_vals = [ + n_epoch, + self.policy.sess.run(self.model.beta), + np.mean(policy_estimate), + np.mean(expert_estimate), + ] + if self.model.use_vail: + print_list += [ + "kl_loss", + "z_mean_expert", + "z_mean_policy", + "z_log_sigma_sq", + ] + print_vals += [ + np.mean(kl_loss), + np.mean(z_mean_expert), + np.mean(z_mean_policy), + np.mean(z_log_sigma_sq), + ] + LOGGER.debug( + "GAIL Debug:\n\t\t" + + "\n\t\t".join( + "{0}: {1}".format(_name, _val) + for _name, _val in zip(print_list, print_vals) + ) + ) + update_stats = {"Losses/GAIL Loss": np.mean(batch_losses)} + return update_stats + + def _update_batch( + self, + mini_batch_demo: Dict[str, np.ndarray], + mini_batch_policy: Dict[str, np.ndarray], + ) -> Dict[str, float]: + """ + Helper method for update. + :param mini_batch_demo: A mini batch of expert trajectories + :param mini_batch_policy: A mini batch of trajectories sampled from the current policy + :return: Output from update process. 
+ """ + feed_dict: Dict[tf.Tensor, Any] = { + self.model.done_expert: mini_batch_demo["done"].reshape([-1, 1]), + self.model.done_policy: mini_batch_policy["done"].reshape([-1, 1]), + } + + if self.model.use_vail: + feed_dict[self.model.use_noise] = [1] + + if self.policy.use_continuous_act: + feed_dict[self.policy.model.selected_actions] = mini_batch_policy[ + "actions" + ].reshape([-1, self.policy.model.act_size[0]]) + feed_dict[self.model.action_in_expert] = mini_batch_demo["actions"].reshape( + [-1, self.policy.model.act_size[0]] + ) + else: + feed_dict[self.policy.model.action_holder] = mini_batch_policy[ + "actions" + ].reshape([-1, len(self.policy.model.act_size)]) + feed_dict[self.model.action_in_expert] = mini_batch_demo["actions"].reshape( + [-1, len(self.policy.model.act_size)] + ) + + if self.policy.use_vis_obs > 0: + for i in range(len(self.policy.model.visual_in)): + policy_obs = mini_batch_policy["visual_obs%d" % i] + if self.policy.sequence_length > 1 and self.policy.use_recurrent: + (_batch, _seq, _w, _h, _c) = policy_obs.shape + feed_dict[self.policy.model.visual_in[i]] = policy_obs.reshape( + [-1, _w, _h, _c] + ) + else: + feed_dict[self.policy.model.visual_in[i]] = policy_obs + + demo_obs = mini_batch_demo["visual_obs%d" % i] + if self.policy.sequence_length > 1 and self.policy.use_recurrent: + (_batch, _seq, _w, _h, _c) = demo_obs.shape + feed_dict[self.model.expert_visual_in[i]] = demo_obs.reshape( + [-1, _w, _h, _c] + ) + else: + feed_dict[self.model.expert_visual_in[i]] = demo_obs + if self.policy.use_vec_obs: + feed_dict[self.policy.model.vector_in] = mini_batch_policy[ + "vector_obs" + ].reshape([-1, self.policy.vec_obs_size]) + feed_dict[self.model.obs_in_expert] = mini_batch_demo["vector_obs"].reshape( + [-1, self.policy.vec_obs_size] + ) + + out_dict = { + "gail_loss": self.model.loss, + "update_batch": self.model.update_batch, + "policy_estimate": self.model.policy_estimate, + "expert_estimate": self.model.expert_estimate, + } + if self.model.use_vail: + out_dict["kl_loss"] = self.model.kl_loss + out_dict["z_log_sigma_sq"] = self.model.z_log_sigma_sq + out_dict["z_mean_expert"] = self.model.z_mean_expert + out_dict["z_mean_policy"] = self.model.z_mean_policy + + run_out = self.policy.sess.run(out_dict, feed_dict=feed_dict) + if self.model.use_vail: + self.update_beta(run_out["kl_loss"]) + return run_out + + def update_beta(self, kl_div: float) -> None: + """ + Updates the Beta parameter with the latest kl_divergence value. + The larger Beta, the stronger the importance of the kl divergence in the loss function. 
+ :param kl_div: The KL divergence + """ + self.policy.sess.run( + self.model.update_beta, feed_dict={self.model.kl_div_input: kl_div} + ) diff --git a/ml-agents/mlagents/trainers/components/reward_signals/reward_signal.py b/ml-agents/mlagents/trainers/components/reward_signals/reward_signal.py new file mode 100644 index 0000000000..7ca2f46a56 --- /dev/null +++ b/ml-agents/mlagents/trainers/components/reward_signals/reward_signal.py @@ -0,0 +1,79 @@ +import logging +from typing import Any, Dict, List +from collections import namedtuple +import numpy as np +import abc + +import tensorflow as tf + +from mlagents.envs.brain import BrainInfo +from mlagents.trainers.trainer import UnityTrainerException +from mlagents.trainers.tf_policy import TFPolicy +from mlagents.trainers.buffer import Buffer + +logger = logging.getLogger("mlagents.trainers") + +RewardSignalResult = namedtuple( + "RewardSignalResult", ["scaled_reward", "unscaled_reward"] +) + + +class RewardSignal(abc.ABC): + def __init__(self, policy: TFPolicy, strength: float, gamma: float): + """ + Initializes a reward signal. At minimum, you must pass in the policy it is being applied to, + the reward strength, and the gamma (discount factor.) + :param policy: The Policy object (e.g. PPOPolicy) that this Reward Signal will apply to. + :param strength: The strength of the reward. The reward's raw value will be multiplied by this value. + :param gamma: The time discounting factor used for this reward. + :return: A RewardSignal object. + """ + class_name = self.__class__.__name__ + short_name = class_name.replace("RewardSignal", "") + self.stat_name = f"Policy/{short_name} Reward" + self.value_name = f"Policy/{short_name} Value Estimate" + # Terminate discounted reward computation at Done. Can disable to mitigate positive bias in rewards with + # no natural end, e.g. GAIL or Curiosity + self.use_terminal_states = True + self.gamma = gamma + self.policy = policy + self.strength = strength + + def evaluate( + self, current_info: BrainInfo, next_info: BrainInfo + ) -> RewardSignalResult: + """ + Evaluates the reward for the agents present in current_info given the next_info + :param current_info: The current BrainInfo. + :param next_info: The BrainInfo from the next timestep. + :return: a RewardSignalResult of (scaled intrinsic reward, unscaled intrinsic reward) provided by the generator + """ + return RewardSignalResult( + self.strength * np.zeros(len(current_info.agents)), + np.zeros(len(current_info.agents)), + ) + + def update(self, update_buffer: Buffer, num_sequences: int) -> Dict[str, float]: + """ + If the reward signal has an internal model (e.g. GAIL or Curiosity), update that model. + :param update_buffer: An AgentBuffer that contains the live data from which to update. + :param n_sequences: The number of sequences in the training buffer. + :return: A dict of {"Stat Name": stat} to be added to Tensorboard + """ + return {} + + @classmethod + def check_config( + cls, config_dict: Dict[str, Any], param_keys: List[str] = None + ) -> None: + """ + Check the config dict, and throw an error if there are missing hyperparameters. 
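The `reward_signal_factory.py` added just below maps the string keys used in trainer configuration to these classes. A sketch of building a set of signals from a config dict (the `policy` object and the hyperparameter values are placeholders):

```python
from mlagents.trainers.components.reward_signals.reward_signal_factory import (
    create_reward_signal,
)

reward_signal_configs = {
    "extrinsic": {"strength": 1.0, "gamma": 0.99},
    "curiosity": {"strength": 0.02, "gamma": 0.99, "encoding_size": 128},
    "gail": {"strength": 0.01, "gamma": 0.99, "demo_path": "./demos/Expert.demo"},
}

# check_config is called inside the factory; unknown names or bad parameters
# raise UnityTrainerException.
reward_signals = {
    name: create_reward_signal(policy, name, cfg)
    for name, cfg in reward_signal_configs.items()
}
```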
+ """ + param_keys = param_keys or [] + for k in param_keys: + if k not in config_dict: + raise UnityTrainerException( + "The hyper-parameter {0} could not be found for {1}.".format( + k, cls.__name__ + ) + ) diff --git a/ml-agents/mlagents/trainers/components/reward_signals/reward_signal_factory.py b/ml-agents/mlagents/trainers/components/reward_signals/reward_signal_factory.py new file mode 100644 index 0000000000..0866147e21 --- /dev/null +++ b/ml-agents/mlagents/trainers/components/reward_signals/reward_signal_factory.py @@ -0,0 +1,45 @@ +import logging +from typing import Any, Dict, Type + +from mlagents.trainers.trainer import UnityTrainerException +from mlagents.trainers.components.reward_signals.reward_signal import RewardSignal +from mlagents.trainers.components.reward_signals.extrinsic.signal import ( + ExtrinsicRewardSignal, +) +from mlagents.trainers.components.reward_signals.gail.signal import GAILRewardSignal +from mlagents.trainers.components.reward_signals.curiosity.signal import ( + CuriosityRewardSignal, +) +from mlagents.trainers.tf_policy import TFPolicy + +logger = logging.getLogger("mlagents.trainers") + + +NAME_TO_CLASS: Dict[str, Type[RewardSignal]] = { + "extrinsic": ExtrinsicRewardSignal, + "curiosity": CuriosityRewardSignal, + "gail": GAILRewardSignal, +} + + +def create_reward_signal( + policy: TFPolicy, name: str, config_entry: Dict[str, Any] +) -> RewardSignal: + """ + Creates a reward signal class based on the name and config entry provided as a dict. + :param policy: The policy class which the reward will be applied to. + :param name: The name of the reward signal + :param config_entry: The config entries for that reward signal + :return: The reward signal class instantiated + """ + rcls = NAME_TO_CLASS.get(name) + if not rcls: + raise UnityTrainerException("Unknown reward signal type {0}".format(name)) + rcls.check_config(config_entry) + try: + class_inst = rcls(policy, **config_entry) + except TypeError: + raise UnityTrainerException( + "Unknown parameters given for reward signal {0}".format(name) + ) + return class_inst diff --git a/ml-agents/mlagents/trainers/demo_loader.py b/ml-agents/mlagents/trainers/demo_loader.py index ccf96d8f8c..2043ea4d85 100644 --- a/ml-agents/mlagents/trainers/demo_loader.py +++ b/ml-agents/mlagents/trainers/demo_loader.py @@ -1,15 +1,23 @@ import pathlib import logging import os +from typing import List, Tuple from mlagents.trainers.buffer import Buffer from mlagents.envs.brain import BrainParameters, BrainInfo -from mlagents.envs.communicator_objects import * -from google.protobuf.internal.decoder import _DecodeVarint32 +from mlagents.envs.communicator_objects import ( + AgentInfoProto, + BrainParametersProto, + DemonstrationMetaProto, +) +from google.protobuf.internal.decoder import _DecodeVarint32 # type: ignore + logger = logging.getLogger("mlagents.trainers") -def make_demo_buffer(brain_infos, brain_params, sequence_length): +def make_demo_buffer( + brain_infos: List[BrainInfo], brain_params: BrainParameters, sequence_length: int +) -> Buffer: # Create and populate buffer using experiences demo_buffer = Buffer() for idx, experience in enumerate(brain_infos): @@ -29,6 +37,9 @@ def make_demo_buffer(brain_infos, brain_params, sequence_length): current_brain_info.vector_observations[0] ) demo_buffer[0]["actions"].append(next_brain_info.previous_vector_actions[0]) + demo_buffer[0]["prev_action"].append( + current_brain_info.previous_vector_actions[0] + ) if next_brain_info.local_done[0]: demo_buffer.append_update_buffer( 0, 
batch_size=None, training_length=sequence_length @@ -40,7 +51,9 @@ def make_demo_buffer(brain_infos, brain_params, sequence_length): return demo_buffer -def demo_to_buffer(file_path, sequence_length): +def demo_to_buffer( + file_path: str, sequence_length: int +) -> Tuple[BrainParameters, Buffer]: """ Loads demonstration file and uses it to fill training buffer. :param file_path: Location of demonstration file (.demo). @@ -52,7 +65,7 @@ def demo_to_buffer(file_path, sequence_length): return brain_params, demo_buffer -def load_demonstration(file_path): +def load_demonstration(file_path: str) -> Tuple[BrainParameters, List[BrainInfo], int]: """ Loads and parses a demonstration file. :param file_path: Location of demonstration file (.demo). @@ -61,42 +74,52 @@ def load_demonstration(file_path): # First 32 bytes of file dedicated to meta-data. INITIAL_POS = 33 - - if not os.path.isfile(file_path): + file_paths = [] + if os.path.isdir(file_path): + all_files = os.listdir(file_path) + for _file in all_files: + if _file.endswith(".demo"): + file_paths.append(os.path.join(file_path, _file)) + if not all_files: + raise ValueError("There are no '.demo' files in the provided directory.") + elif os.path.isfile(file_path): + file_paths.append(file_path) + file_extension = pathlib.Path(file_path).suffix + if file_extension != ".demo": + raise ValueError( + "The file is not a '.demo' file. Please provide a file with the " + "correct extension." + ) + else: raise FileNotFoundError( - "The demonstration file {} does not exist.".format(file_path) - ) - file_extension = pathlib.Path(file_path).suffix - if file_extension != ".demo": - raise ValueError( - "The file is not a '.demo' file. Please provide a file with the " - "correct extension." + "The demonstration file or directory {} does not exist.".format(file_path) ) brain_params = None brain_infos = [] - data = open(file_path, "rb").read() - next_pos, pos, obs_decoded = 0, 0, 0 total_expected = 0 - while pos < len(data): - next_pos, pos = _DecodeVarint32(data, pos) - if obs_decoded == 0: - meta_data_proto = DemonstrationMetaProto() - meta_data_proto.ParseFromString(data[pos : pos + next_pos]) - total_expected = meta_data_proto.number_steps - pos = INITIAL_POS - if obs_decoded == 1: - brain_param_proto = BrainParametersProto() - brain_param_proto.ParseFromString(data[pos : pos + next_pos]) - brain_params = BrainParameters.from_proto(brain_param_proto) - pos += next_pos - if obs_decoded > 1: - agent_info = AgentInfoProto() - agent_info.ParseFromString(data[pos : pos + next_pos]) - brain_info = BrainInfo.from_agent_proto([agent_info], brain_params) - brain_infos.append(brain_info) - if len(brain_infos) == total_expected: - break - pos += next_pos - obs_decoded += 1 + for _file_path in file_paths: + data = open(_file_path, "rb").read() + next_pos, pos, obs_decoded = 0, 0, 0 + while pos < len(data): + next_pos, pos = _DecodeVarint32(data, pos) + if obs_decoded == 0: + meta_data_proto = DemonstrationMetaProto() + meta_data_proto.ParseFromString(data[pos : pos + next_pos]) + total_expected += meta_data_proto.number_steps + pos = INITIAL_POS + if obs_decoded == 1: + brain_param_proto = BrainParametersProto() + brain_param_proto.ParseFromString(data[pos : pos + next_pos]) + brain_params = BrainParameters.from_proto(brain_param_proto) + pos += next_pos + if obs_decoded > 1: + agent_info = AgentInfoProto() + agent_info.ParseFromString(data[pos : pos + next_pos]) + brain_info = BrainInfo.from_agent_proto(0, [agent_info], brain_params) + brain_infos.append(brain_info) 
+                if len(brain_infos) == total_expected:
+                    break
+                pos += next_pos
+            obs_decoded += 1
     return brain_params, brain_infos, total_expected
diff --git a/ml-agents/mlagents/trainers/exception.py b/ml-agents/mlagents/trainers/exception.py
index e780e925db..d9b9921081 100644
--- a/ml-agents/mlagents/trainers/exception.py
+++ b/ml-agents/mlagents/trainers/exception.py
@@ -23,3 +23,5 @@ class MetaCurriculumError(TrainerError):
     """
     Any error related to the configuration of a metacurriculum.
     """
+
+    pass
diff --git a/ml-agents/mlagents/trainers/learn.py b/ml-agents/mlagents/trainers/learn.py
index 6a4f23e13d..c1ca879107 100644
--- a/ml-agents/mlagents/trainers/learn.py
+++ b/ml-agents/mlagents/trainers/learn.py
@@ -9,19 +9,22 @@ import numpy as np
 import yaml
 from docopt import docopt
-from typing import Optional, Callable
+from typing import Any, Callable, Dict, Optional
 from mlagents.trainers.trainer_controller import TrainerController
 from mlagents.trainers.exception import TrainerError
 from mlagents.trainers import MetaCurriculumError, MetaCurriculum
 from mlagents.envs import UnityEnvironment
-from mlagents.envs.exception import UnityEnvironmentException
+from mlagents.envs.sampler_class import SamplerManager
+from mlagents.envs.exception import UnityEnvironmentException, SamplerException
 from mlagents.envs.base_unity_environment import BaseUnityEnvironment
-from mlagents.envs.subprocess_environment import SubprocessUnityEnvironment
+from mlagents.envs.subprocess_env_manager import SubprocessEnvManager
-def run_training(sub_id: int, run_seed: int, run_options, process_queue):
+def run_training(
+    sub_id: int, run_seed: int, run_options: Dict[str, Any], process_queue: Queue
+) -> None:
     """
     Launches training session.
     :param process_queue: Queue used to send signal back to main.
@@ -52,6 +55,10 @@ def run_training(sub_id: int, run_seed: int, run_options, process_queue):
     fast_simulation = not bool(run_options["--slow"])
     no_graphics = run_options["--no-graphics"]
     trainer_config_path = run_options["<trainer-config-path>"]
+    sampler_file_path = (
+        run_options["--sampler"] if run_options["--sampler"] != "None" else None
+    )
+
     # Recognize and use docker volume if one is passed as an argument
     if not docker_target_name:
         model_path = "./models/{run_id}-{sub_id}".format(run_id=run_id, sub_id=sub_id)
@@ -81,8 +88,11 @@ def run_training(sub_id: int, run_seed: int, run_options, process_queue):
         run_seed,
         base_port + (sub_id * num_envs),
     )
-    env = SubprocessUnityEnvironment(env_factory, num_envs)
+    env = SubprocessEnvManager(env_factory, num_envs)
     maybe_meta_curriculum = try_create_meta_curriculum(curriculum_folder, env)
+    sampler_manager, resampling_interval = create_sampler_manager(
+        sampler_file_path, env.reset_parameters
+    )
     # Create controller and begin training.
     tc = TrainerController(
@@ -95,9 +105,10 @@ def run_training(sub_id: int, run_seed: int, run_options, process_queue):
         train_model,
         keep_checkpoints,
         lesson,
-        env.external_brains,
         run_seed,
         fast_simulation,
+        sampler_manager,
+        resampling_interval,
     )
     # Signal that environment has been launched.
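For reference, the `--sampler` file consumed above is read with `load_config` and handed to `create_sampler_manager` (next hunk), which only validates the `resampling-interval` key; the per-parameter entries are interpreted by `SamplerManager`, which is outside this diff. A hedged illustration of the parsed structure:

```python
# Hypothetical result of load_config(sampler_file_path); only "resampling-interval"
# is checked in this diff, everything else is passed through to SamplerManager.
sampler_config = {
    "resampling-interval": 5000,  # environment steps between re-draws of reset parameters
    "mass": {"sampler-type": "uniform", "min_value": 0.5, "max_value": 10},
}

resample_interval = sampler_config.pop("resampling-interval")
assert isinstance(resample_interval, int) and resample_interval > 0
```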
@@ -107,8 +118,30 @@ def run_training(sub_id: int, run_seed: int, run_options, process_queue): tc.start_learning(env, trainer_config) +def create_sampler_manager(sampler_file_path, env_reset_params): + sampler_config = None + resample_interval = None + if sampler_file_path is not None: + sampler_config = load_config(sampler_file_path) + if ("resampling-interval") in sampler_config: + # Filter arguments that do not exist in the environment + resample_interval = sampler_config.pop("resampling-interval") + if (resample_interval <= 0) or (not isinstance(resample_interval, int)): + raise SamplerException( + "Specified resampling-interval is not valid. Please provide" + " a positive integer value for resampling-interval" + ) + else: + raise SamplerException( + "Resampling interval was not specified in the sampler file." + " Please specify it with the 'resampling-interval' key in the sampler config file." + ) + sampler_manager = SamplerManager(sampler_config) + return sampler_manager, resample_interval + + def try_create_meta_curriculum( - curriculum_folder: Optional[str], env: BaseUnityEnvironment + curriculum_folder: Optional[str], env: SubprocessEnvManager ) -> Optional[MetaCurriculum]: if curriculum_folder is None: return None @@ -151,7 +184,7 @@ def prepare_for_docker_run(docker_target_name, env_path): return env_path -def load_config(trainer_config_path): +def load_config(trainer_config_path: str) -> Dict[str, Any]: try: with open(trainer_config_path) as data_file: trainer_config = yaml.safe_load(data_file) @@ -233,7 +266,7 @@ def main(): """ ) - except: + except Exception: print("\n\n\tUnity Technologies\n") _USAGE = """ @@ -242,22 +275,23 @@ def main(): mlagents-learn --help Options: - --env= Name of the Unity executable [default: None]. - --curriculum= Curriculum json directory for environment [default: None]. - --keep-checkpoints= How many model checkpoints to keep [default: 5]. - --lesson= Start learning from this lesson [default: 0]. - --load Whether to load the model or randomly initialize [default: False]. - --run-id= The directory name for model and summary statistics [default: ppo]. - --num-runs= Number of concurrent training sessions [default: 1]. - --save-freq= Frequency at which to save model [default: 50000]. - --seed= Random seed used for training [default: -1]. - --slow Whether to run the game at training speed [default: False]. - --train Whether to train model, or only run inference [default: False]. - --base-port= Base port for environment communication [default: 5005]. - --num-envs= Number of parallel environments to use for training [default: 1] - --docker-target-name=
Docker volume to store training-specific files [default: None].
-      --no-graphics              Whether to run the environment in no-graphics mode [default: False].
-      --debug                    Whether to run ML-Agents in debug mode with detailed logging [default: False].
+      --env=<file>               Name of the Unity executable [default: None].
+      --curriculum=<directory>   Curriculum json directory for environment [default: None].
+      --sampler=<file>           Reset parameter yaml file for environment [default: None].
+      --keep-checkpoints=<n>     How many model checkpoints to keep [default: 5].
+      --lesson=<n>               Start learning from this lesson [default: 0].
+      --load                     Whether to load the model or randomly initialize [default: False].
+      --run-id=<path>            The directory name for model and summary statistics [default: ppo].
+      --num-runs=<n>             Number of concurrent training sessions [default: 1].
+      --save-freq=<n>            Frequency at which to save model [default: 50000].
+      --seed=<n>                 Random seed used for training [default: -1].
+      --slow                     Whether to run the game at training speed [default: False].
+      --train                    Whether to train model, or only run inference [default: False].
+      --base-port=<n>            Base port for environment communication [default: 5005].
+      --num-envs=<n>             Number of parallel environments to use for training [default: 1]
+      --docker-target-name=<dt>
Docker volume to store training-specific files [default: None]. + --no-graphics Whether to run the environment in no-graphics mode [default: False]. + --debug Whether to run ML-Agents in debug mode with detailed logging [default: False]. """ options = docopt(_USAGE) diff --git a/ml-agents/mlagents/trainers/models.py b/ml-agents/mlagents/trainers/models.py index 2d14768dbd..880eaa315e 100644 --- a/ml-agents/mlagents/trainers/models.py +++ b/ml-agents/mlagents/trainers/models.py @@ -1,4 +1,6 @@ import logging +from enum import Enum +from typing import Any, Callable, Dict import numpy as np import tensorflow as tf @@ -6,15 +8,27 @@ logger = logging.getLogger("mlagents.trainers") +ActivationFunction = Callable[[tf.Tensor], tf.Tensor] + + +class EncoderType(Enum): + SIMPLE = "simple" + NATURE_CNN = "nature_cnn" + RESNET = "resnet" + class LearningModel(object): _version_number_ = 2 - def __init__(self, m_size, normalize, use_recurrent, brain, seed): + def __init__( + self, m_size, normalize, use_recurrent, brain, seed, stream_names=None + ): tf.set_random_seed(seed) self.brain = brain self.vector_in = None - self.global_step, self.increment_step = self.create_global_steps() + self.global_step, self.increment_step, self.steps_to_increment = ( + self.create_global_steps() + ) self.visual_in = [] self.batch_size = tf.placeholder(shape=None, dtype=tf.int32, name="batch_size") self.sequence_length = tf.placeholder( @@ -22,6 +36,7 @@ def __init__(self, m_size, normalize, use_recurrent, brain, seed): ) self.mask_input = tf.placeholder(shape=[None], dtype=tf.float32, name="masks") self.mask = tf.cast(self.mask_input, tf.int32) + self.stream_names = stream_names or [] self.use_recurrent = use_recurrent if self.use_recurrent: self.m_size = m_size @@ -67,16 +82,23 @@ def create_global_steps(): global_step = tf.Variable( 0, name="global_step", trainable=False, dtype=tf.int32 ) - increment_step = tf.assign(global_step, tf.add(global_step, 1)) - return global_step, increment_step + steps_to_increment = tf.placeholder( + shape=[], dtype=tf.int32, name="steps_to_increment" + ) + increment_step = tf.assign(global_step, tf.add(global_step, steps_to_increment)) + return global_step, increment_step, steps_to_increment + + @staticmethod + def scaled_init(scale): + return c_layers.variance_scaling_initializer(scale) @staticmethod - def swish(input_activation): + def swish(input_activation: tf.Tensor) -> tf.Tensor: """Swish activation function. For more info: https://arxiv.org/abs/1710.05941""" return tf.multiply(input_activation, tf.nn.sigmoid(input_activation)) @staticmethod - def create_visual_input(camera_parameters, name): + def create_visual_input(camera_parameters: Dict[str, Any], name: str) -> tf.Tensor: """ Creates image input op. :param camera_parameters: Parameters for visual observation from BrainInfo. 
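A small usage sketch for the new global-step ops defined above: the step is now advanced by a feed-controlled amount instead of always by one, so a trainer can credit a whole batch of collected steps at once (session and model names are illustrative):

```python
def increment_global_step(sess, model, n_steps: int) -> int:
    # Advance the shared global step by n_steps and return its new value.
    _, step = sess.run(
        [model.increment_step, model.global_step],
        feed_dict={model.steps_to_increment: n_steps},
    )
    return step
```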
@@ -108,53 +130,72 @@ def create_vector_input(self, name="vector_observation"): shape=[None, self.vec_obs_size], dtype=tf.float32, name=name ) if self.normalize: - self.running_mean = tf.get_variable( - "running_mean", - [self.vec_obs_size], - trainable=False, - dtype=tf.float32, - initializer=tf.zeros_initializer(), - ) - self.running_variance = tf.get_variable( - "running_variance", - [self.vec_obs_size], - trainable=False, - dtype=tf.float32, - initializer=tf.ones_initializer(), - ) - self.update_mean, self.update_variance = self.create_normalizer_update( - self.vector_in - ) - - self.normalized_state = tf.clip_by_value( - (self.vector_in - self.running_mean) - / tf.sqrt( - self.running_variance / (tf.cast(self.global_step, tf.float32) + 1) - ), - -5, - 5, - name="normalized_state", - ) - return self.normalized_state + self.create_normalizer(self.vector_in) + return self.normalize_vector_obs(self.vector_in) else: return self.vector_in + def normalize_vector_obs(self, vector_obs): + normalized_state = tf.clip_by_value( + (vector_obs - self.running_mean) + / tf.sqrt( + self.running_variance + / (tf.cast(self.normalization_steps, tf.float32) + 1) + ), + -5, + 5, + name="normalized_state", + ) + return normalized_state + + def create_normalizer(self, vector_obs): + self.normalization_steps = tf.get_variable( + "normalization_steps", + [], + trainable=False, + dtype=tf.int32, + initializer=tf.ones_initializer(), + ) + self.running_mean = tf.get_variable( + "running_mean", + [self.vec_obs_size], + trainable=False, + dtype=tf.float32, + initializer=tf.zeros_initializer(), + ) + self.running_variance = tf.get_variable( + "running_variance", + [self.vec_obs_size], + trainable=False, + dtype=tf.float32, + initializer=tf.ones_initializer(), + ) + self.update_normalization = self.create_normalizer_update(vector_obs) + def create_normalizer_update(self, vector_input): mean_current_observation = tf.reduce_mean(vector_input, axis=0) new_mean = self.running_mean + ( mean_current_observation - self.running_mean - ) / tf.cast(tf.add(self.global_step, 1), tf.float32) + ) / tf.cast(tf.add(self.normalization_steps, 1), tf.float32) new_variance = self.running_variance + (mean_current_observation - new_mean) * ( mean_current_observation - self.running_mean ) update_mean = tf.assign(self.running_mean, new_mean) update_variance = tf.assign(self.running_variance, new_variance) - return update_mean, update_variance + update_norm_step = tf.assign( + self.normalization_steps, self.normalization_steps + 1 + ) + return tf.group([update_mean, update_variance, update_norm_step]) @staticmethod def create_vector_observation_encoder( - observation_input, h_size, activation, num_layers, scope, reuse - ): + observation_input: tf.Tensor, + h_size: int, + activation: ActivationFunction, + num_layers: int, + scope: str, + reuse: bool, + ) -> tf.Tensor: """ Builds a set of hidden state encoders. :param reuse: Whether to re-use the weights within the same scope. @@ -179,16 +220,22 @@ def create_vector_observation_encoder( return hidden def create_visual_observation_encoder( - self, image_input, h_size, activation, num_layers, scope, reuse - ): + self, + image_input: tf.Tensor, + h_size: int, + activation: ActivationFunction, + num_layers: int, + scope: str, + reuse: bool, + ) -> tf.Tensor: """ - Builds a set of visual (CNN) encoders. - :param reuse: Whether to re-use the weights within the same scope. - :param scope: The scope of the graph within which to create the ops. + Builds a set of resnet visual encoders. 
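The refactored normalizer above tracks its own `normalization_steps` counter instead of reusing the global step. For intuition, a numpy sketch that mirrors the same incremental mean/variance update and the clipped normalization (this restates the TF ops shown here, it is not a new API):

```python
import numpy as np

def update_normalizer(mean, variance, steps, batch_obs):
    # One update with a batch of vector observations (cf. create_normalizer_update).
    batch_mean = batch_obs.mean(axis=0)
    new_mean = mean + (batch_mean - mean) / (steps + 1)
    new_variance = variance + (batch_mean - new_mean) * (batch_mean - mean)
    return new_mean, new_variance, steps + 1

def normalize(obs, mean, variance, steps):
    # cf. normalize_vector_obs: divide by the running std estimate, then clip to [-5, 5].
    return np.clip((obs - mean) / np.sqrt(variance / (steps + 1)), -5, 5)
```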
:param image_input: The placeholder for the image input to use. :param h_size: Hidden layer size. :param activation: What type of activation function to use for layers. :param num_layers: number of hidden layers to create. + :param scope: The scope of the graph within which to create the ops. + :param reuse: Whether to re-use the weights within the same scope. :return: List of hidden layer tensors. """ with tf.variable_scope(scope): @@ -218,6 +265,129 @@ def create_visual_observation_encoder( ) return hidden_flat + def create_nature_cnn_visual_observation_encoder( + self, + image_input: tf.Tensor, + h_size: int, + activation: ActivationFunction, + num_layers: int, + scope: str, + reuse: bool, + ) -> tf.Tensor: + """ + Builds a set of resnet visual encoders. + :param image_input: The placeholder for the image input to use. + :param h_size: Hidden layer size. + :param activation: What type of activation function to use for layers. + :param num_layers: number of hidden layers to create. + :param scope: The scope of the graph within which to create the ops. + :param reuse: Whether to re-use the weights within the same scope. + :return: List of hidden layer tensors. + """ + with tf.variable_scope(scope): + conv1 = tf.layers.conv2d( + image_input, + 32, + kernel_size=[8, 8], + strides=[4, 4], + activation=tf.nn.elu, + reuse=reuse, + name="conv_1", + ) + conv2 = tf.layers.conv2d( + conv1, + 64, + kernel_size=[4, 4], + strides=[2, 2], + activation=tf.nn.elu, + reuse=reuse, + name="conv_2", + ) + conv3 = tf.layers.conv2d( + conv2, + 64, + kernel_size=[3, 3], + strides=[1, 1], + activation=tf.nn.elu, + reuse=reuse, + name="conv_3", + ) + hidden = c_layers.flatten(conv3) + + with tf.variable_scope(scope + "/" + "flat_encoding"): + hidden_flat = self.create_vector_observation_encoder( + hidden, h_size, activation, num_layers, scope, reuse + ) + return hidden_flat + + def create_resnet_visual_observation_encoder( + self, + image_input: tf.Tensor, + h_size: int, + activation: ActivationFunction, + num_layers: int, + scope: str, + reuse: bool, + ) -> tf.Tensor: + """ + Builds a set of resnet visual encoders. + :param image_input: The placeholder for the image input to use. + :param h_size: Hidden layer size. + :param activation: What type of activation function to use for layers. + :param num_layers: number of hidden layers to create. + :param scope: The scope of the graph within which to create the ops. + :param reuse: Whether to re-use the weights within the same scope. + :return: List of hidden layer tensors. 
+ """ + n_channels = [16, 32, 32] # channel for each stack + n_blocks = 2 # number of residual blocks + with tf.variable_scope(scope): + hidden = image_input + for i, ch in enumerate(n_channels): + hidden = tf.layers.conv2d( + hidden, + ch, + kernel_size=[3, 3], + strides=[1, 1], + reuse=reuse, + name="layer%dconv_1" % i, + ) + hidden = tf.layers.max_pooling2d( + hidden, pool_size=[3, 3], strides=[2, 2], padding="same" + ) + # create residual blocks + for j in range(n_blocks): + block_input = hidden + hidden = tf.nn.relu(hidden) + hidden = tf.layers.conv2d( + hidden, + ch, + kernel_size=[3, 3], + strides=[1, 1], + padding="same", + reuse=reuse, + name="layer%d_%d_conv1" % (i, j), + ) + hidden = tf.nn.relu(hidden) + hidden = tf.layers.conv2d( + hidden, + ch, + kernel_size=[3, 3], + strides=[1, 1], + padding="same", + reuse=reuse, + name="layer%d_%d_conv2" % (i, j), + ) + hidden = tf.add(block_input, hidden) + hidden = tf.nn.relu(hidden) + hidden = c_layers.flatten(hidden) + + with tf.variable_scope(scope + "/" + "flat_encoding"): + hidden_flat = self.create_vector_observation_encoder( + hidden, h_size, activation, num_layers, scope, reuse + ) + return hidden_flat + @staticmethod def create_discrete_action_masking_layer(all_logits, action_masks, action_size): """ @@ -262,7 +432,13 @@ def create_discrete_action_masking_layer(all_logits, action_masks, action_size): ), ) - def create_observation_streams(self, num_streams, h_size, num_layers): + def create_observation_streams( + self, + num_streams: int, + h_size: int, + num_layers: int, + vis_encode_type: EncoderType = EncoderType.SIMPLE, + ) -> tf.Tensor: """ Creates encoding stream for observations. :param num_streams: Number of streams to create. @@ -286,16 +462,39 @@ def create_observation_streams(self, num_streams, h_size, num_layers): visual_encoders = [] hidden_state, hidden_visual = None, None if self.vis_obs_size > 0: - for j in range(brain.number_visual_observations): - encoded_visual = self.create_visual_observation_encoder( - self.visual_in[j], - h_size, - activation_fn, - num_layers, - "main_graph_{}_encoder{}".format(i, j), - False, - ) - visual_encoders.append(encoded_visual) + if vis_encode_type == EncoderType.RESNET: + for j in range(brain.number_visual_observations): + encoded_visual = self.create_resnet_visual_observation_encoder( + self.visual_in[j], + h_size, + activation_fn, + num_layers, + "main_graph_{}_encoder{}".format(i, j), + False, + ) + visual_encoders.append(encoded_visual) + elif vis_encode_type == EncoderType.NATURE_CNN: + for j in range(brain.number_visual_observations): + encoded_visual = self.create_nature_cnn_visual_observation_encoder( + self.visual_in[j], + h_size, + activation_fn, + num_layers, + "main_graph_{}_encoder{}".format(i, j), + False, + ) + visual_encoders.append(encoded_visual) + else: + for j in range(brain.number_visual_observations): + encoded_visual = self.create_visual_observation_encoder( + self.visual_in[j], + h_size, + activation_fn, + num_layers, + "main_graph_{}_encoder{}".format(i, j), + False, + ) + visual_encoders.append(encoded_visual) hidden_visual = tf.concat(visual_encoders, axis=1) if brain.vector_observation_space_size > 0: hidden_state = self.create_vector_observation_encoder( @@ -333,26 +532,45 @@ def create_recurrent_encoder(input_state, memory_in, sequence_length, name="lstm m_size = memory_in.get_shape().as_list()[1] lstm_input_state = tf.reshape(input_state, shape=[-1, sequence_length, s_size]) memory_in = tf.reshape(memory_in[:, :], [-1, m_size]) - _half_point = 
int(m_size / 2) + half_point = int(m_size / 2) with tf.variable_scope(name): - rnn_cell = tf.contrib.rnn.BasicLSTMCell(_half_point) + rnn_cell = tf.contrib.rnn.BasicLSTMCell(half_point) lstm_vector_in = tf.contrib.rnn.LSTMStateTuple( - memory_in[:, :_half_point], memory_in[:, _half_point:] + memory_in[:, :half_point], memory_in[:, half_point:] ) recurrent_output, lstm_state_out = tf.nn.dynamic_rnn( rnn_cell, lstm_input_state, initial_state=lstm_vector_in ) - recurrent_output = tf.reshape(recurrent_output, shape=[-1, _half_point]) + recurrent_output = tf.reshape(recurrent_output, shape=[-1, half_point]) return recurrent_output, tf.concat([lstm_state_out.c, lstm_state_out.h], axis=1) - def create_cc_actor_critic(self, h_size, num_layers): + def create_value_heads(self, stream_names, hidden_input): + """ + Creates one value estimator head for each reward signal in stream_names. + Also creates the node corresponding to the mean of all the value heads in self.value. + self.value_head is a dictionary of stream name to node containing the value estimator head for that signal. + :param stream_names: The list of reward signal names + :param hidden_input: The last layer of the Critic. The heads will consist of one dense hidden layer on top + of the hidden input. + """ + self.value_heads = {} + for name in stream_names: + value = tf.layers.dense(hidden_input, 1, name="{}_value".format(name)) + self.value_heads[name] = value + self.value = tf.reduce_mean(list(self.value_heads.values()), 0) + + def create_cc_actor_critic( + self, h_size: int, num_layers: int, vis_encode_type: EncoderType + ) -> None: """ Creates Continuous control actor-critic model. :param h_size: Size of hidden linear layers. :param num_layers: Number of hidden linear layers. """ - hidden_streams = self.create_observation_streams(2, h_size, num_layers) + hidden_streams = self.create_observation_streams( + 2, h_size, num_layers, vis_encode_type + ) if self.use_recurrent: self.memory_in = tf.placeholder( @@ -386,14 +604,14 @@ def create_cc_actor_critic(self, h_size, num_layers): kernel_initializer=c_layers.variance_scaling_initializer(factor=0.01), ) - log_sigma_sq = tf.get_variable( + self.log_sigma_sq = tf.get_variable( "log_sigma_squared", [self.act_size[0]], dtype=tf.float32, initializer=tf.zeros_initializer(), ) - sigma_sq = tf.exp(log_sigma_sq) + sigma_sq = tf.exp(self.log_sigma_sq) self.epsilon = tf.placeholder( shape=[None, self.act_size[0]], dtype=tf.float32, name="epsilon" @@ -408,15 +626,16 @@ def create_cc_actor_critic(self, h_size, num_layers): all_probs = ( -0.5 * tf.square(tf.stop_gradient(self.output_pre) - mu) / sigma_sq - 0.5 * tf.log(2.0 * np.pi) - - 0.5 * log_sigma_sq + - 0.5 * self.log_sigma_sq ) self.all_log_probs = tf.identity(all_probs, name="action_probs") - self.entropy = 0.5 * tf.reduce_mean(tf.log(2 * np.pi * np.e) + log_sigma_sq) + self.entropy = 0.5 * tf.reduce_mean( + tf.log(2 * np.pi * np.e) + self.log_sigma_sq + ) - value = tf.layers.dense(hidden_value, 1, activation=None) - self.value = tf.identity(value, name="value_estimate") + self.create_value_heads(self.stream_names, hidden_value) self.all_old_log_probs = tf.placeholder( shape=[None, self.act_size[0]], dtype=tf.float32, name="old_probabilities" @@ -430,13 +649,17 @@ def create_cc_actor_critic(self, h_size, num_layers): (tf.identity(self.all_old_log_probs)), axis=1, keepdims=True ) - def create_dc_actor_critic(self, h_size, num_layers): + def create_dc_actor_critic( + self, h_size: int, num_layers: int, vis_encode_type: EncoderType + ) -> None: """ 
Creates Discrete control actor-critic model. :param h_size: Size of hidden linear layers. :param num_layers: Number of hidden linear layers. """ - hidden_streams = self.create_observation_streams(1, h_size, num_layers) + hidden_streams = self.create_observation_streams( + 1, h_size, num_layers, vis_encode_type + ) hidden = hidden_streams[0] if self.use_recurrent: @@ -488,8 +711,7 @@ def create_dc_actor_critic(self, h_size, num_layers): self.output = tf.identity(output) self.normalized_logits = tf.identity(normalized_logits, name="action") - value = tf.layers.dense(hidden, 1, activation=None) - self.value = tf.identity(value, name="value_estimate") + self.create_value_heads(self.stream_names, hidden) self.action_holder = tf.placeholder( shape=[None, len(policy_branches)], dtype=tf.int32, name="action_holder" diff --git a/ml-agents/mlagents/trainers/ppo/models.py b/ml-agents/mlagents/trainers/ppo/models.py index 2b6758f2d5..225387c132 100644 --- a/ml-agents/mlagents/trainers/ppo/models.py +++ b/ml-agents/mlagents/trainers/ppo/models.py @@ -2,7 +2,7 @@ import numpy as np import tensorflow as tf -from mlagents.trainers.models import LearningModel +from mlagents.trainers.models import LearningModel, EncoderType logger = logging.getLogger("mlagents.trainers") @@ -20,10 +20,9 @@ def __init__( use_recurrent=False, num_layers=2, m_size=None, - use_curiosity=False, - curiosity_strength=0.01, - curiosity_enc_size=128, seed=0, + stream_names=None, + vis_encode_type=EncoderType.SIMPLE, ): """ Takes a Unity environment and model-specific hyper-parameters and returns the @@ -33,35 +32,29 @@ def __init__( :param h_size: Size of hidden layers :param epsilon: Value for policy-divergence threshold. :param beta: Strength of entropy regularization. - :return: a sub-class of PPOAgent tailored to the environment. :param max_step: Total number of training steps. :param normalize: Whether to normalize vector observation input. :param use_recurrent: Whether to use an LSTM layer in the network. :param num_layers Number of hidden layers between encoded input and policy & value layers :param m_size: Size of brain memory. + :param seed: Seed to use for initialization of model. + :param stream_names: List of names of value streams. Usually, a list of the Reward Signals being used. + :return: a sub-class of PPOAgent tailored to the environment. 
""" - LearningModel.__init__(self, m_size, normalize, use_recurrent, brain, seed) - self.use_curiosity = use_curiosity + LearningModel.__init__( + self, m_size, normalize, use_recurrent, brain, seed, stream_names + ) if num_layers < 1: num_layers = 1 - self.last_reward, self.new_reward, self.update_reward = ( - self.create_reward_encoder() - ) if brain.vector_action_space_type == "continuous": - self.create_cc_actor_critic(h_size, num_layers) + self.create_cc_actor_critic(h_size, num_layers, vis_encode_type) self.entropy = tf.ones_like(tf.reshape(self.value, [-1])) * self.entropy else: - self.create_dc_actor_critic(h_size, num_layers) - if self.use_curiosity: - self.curiosity_enc_size = curiosity_enc_size - self.curiosity_strength = curiosity_strength - encoded_state, encoded_next_state = self.create_curiosity_encoders() - self.create_inverse_model(encoded_state, encoded_next_state) - self.create_forward_model(encoded_state, encoded_next_state) - self.create_ppo_optimizer( + self.create_dc_actor_critic(h_size, num_layers, vis_encode_type) + self.create_losses( self.log_probs, self.old_log_probs, - self.value, + self.value_heads, self.entropy, beta, epsilon, @@ -69,171 +62,31 @@ def __init__( max_step, ) - @staticmethod - def create_reward_encoder(): - """Creates TF ops to track and increment recent average cumulative reward.""" - last_reward = tf.Variable( - 0, name="last_reward", trainable=False, dtype=tf.float32 - ) - new_reward = tf.placeholder(shape=[], dtype=tf.float32, name="new_reward") - update_reward = tf.assign(last_reward, new_reward) - return last_reward, new_reward, update_reward - - def create_curiosity_encoders(self): - """ - Creates state encoders for current and future observations. - Used for implementation of Curiosity-driven Exploration by Self-supervised Prediction - See https://arxiv.org/abs/1705.05363 for more details. - :return: current and future state encoder tensors. - """ - encoded_state_list = [] - encoded_next_state_list = [] - - if self.vis_obs_size > 0: - self.next_visual_in = [] - visual_encoders = [] - next_visual_encoders = [] - for i in range(self.vis_obs_size): - # Create input ops for next (t+1) visual observations. - next_visual_input = self.create_visual_input( - self.brain.camera_resolutions[i], - name="next_visual_observation_" + str(i), - ) - self.next_visual_in.append(next_visual_input) - - # Create the encoder ops for current and next visual input. Not that these encoders are siamese. - encoded_visual = self.create_visual_observation_encoder( - self.visual_in[i], - self.curiosity_enc_size, - self.swish, - 1, - "stream_{}_visual_obs_encoder".format(i), - False, - ) - - encoded_next_visual = self.create_visual_observation_encoder( - self.next_visual_in[i], - self.curiosity_enc_size, - self.swish, - 1, - "stream_{}_visual_obs_encoder".format(i), - True, - ) - visual_encoders.append(encoded_visual) - next_visual_encoders.append(encoded_next_visual) - - hidden_visual = tf.concat(visual_encoders, axis=1) - hidden_next_visual = tf.concat(next_visual_encoders, axis=1) - encoded_state_list.append(hidden_visual) - encoded_next_state_list.append(hidden_next_visual) - - if self.vec_obs_size > 0: - # Create the encoder ops for current and next vector input. Not that these encoders are siamese. - # Create input op for next (t+1) vector observation. 
- self.next_vector_in = tf.placeholder( - shape=[None, self.vec_obs_size], - dtype=tf.float32, - name="next_vector_observation", - ) - - encoded_vector_obs = self.create_vector_observation_encoder( - self.vector_in, - self.curiosity_enc_size, - self.swish, - 2, - "vector_obs_encoder", - False, - ) - encoded_next_vector_obs = self.create_vector_observation_encoder( - self.next_vector_in, - self.curiosity_enc_size, - self.swish, - 2, - "vector_obs_encoder", - True, - ) - encoded_state_list.append(encoded_vector_obs) - encoded_next_state_list.append(encoded_next_vector_obs) - - encoded_state = tf.concat(encoded_state_list, axis=1) - encoded_next_state = tf.concat(encoded_next_state_list, axis=1) - return encoded_state, encoded_next_state - - def create_inverse_model(self, encoded_state, encoded_next_state): - """ - Creates inverse model TensorFlow ops for Curiosity module. - Predicts action taken given current and future encoded states. - :param encoded_state: Tensor corresponding to encoded current state. - :param encoded_next_state: Tensor corresponding to encoded next state. - """ - combined_input = tf.concat([encoded_state, encoded_next_state], axis=1) - hidden = tf.layers.dense(combined_input, 256, activation=self.swish) - if self.brain.vector_action_space_type == "continuous": - pred_action = tf.layers.dense(hidden, self.act_size[0], activation=None) - squared_difference = tf.reduce_sum( - tf.squared_difference(pred_action, self.selected_actions), axis=1 - ) - self.inverse_loss = tf.reduce_mean( - tf.dynamic_partition(squared_difference, self.mask, 2)[1] - ) - else: - pred_action = tf.concat( - [ - tf.layers.dense(hidden, self.act_size[i], activation=tf.nn.softmax) - for i in range(len(self.act_size)) - ], - axis=1, - ) - cross_entropy = tf.reduce_sum( - -tf.log(pred_action + 1e-10) * self.selected_actions, axis=1 - ) - self.inverse_loss = tf.reduce_mean( - tf.dynamic_partition(cross_entropy, self.mask, 2)[1] - ) - - def create_forward_model(self, encoded_state, encoded_next_state): - """ - Creates forward model TensorFlow ops for Curiosity module. - Predicts encoded future state based on encoded current state and given action. - :param encoded_state: Tensor corresponding to encoded current state. - :param encoded_next_state: Tensor corresponding to encoded next state. - """ - combined_input = tf.concat([encoded_state, self.selected_actions], axis=1) - hidden = tf.layers.dense(combined_input, 256, activation=self.swish) - # We compare against the concatenation of all observation streams, hence `self.vis_obs_size + int(self.vec_obs_size > 0)`. - pred_next_state = tf.layers.dense( - hidden, - self.curiosity_enc_size * (self.vis_obs_size + int(self.vec_obs_size > 0)), - activation=None, - ) - - squared_difference = 0.5 * tf.reduce_sum( - tf.squared_difference(pred_next_state, encoded_next_state), axis=1 - ) - self.intrinsic_reward = tf.clip_by_value( - self.curiosity_strength * squared_difference, 0, 1 - ) - self.forward_loss = tf.reduce_mean( - tf.dynamic_partition(squared_difference, self.mask, 2)[1] - ) - - def create_ppo_optimizer( - self, probs, old_probs, value, entropy, beta, epsilon, lr, max_step + def create_losses( + self, probs, old_probs, value_heads, entropy, beta, epsilon, lr, max_step ): """ Creates training-specific Tensorflow ops for PPO models. 
:param probs: Current policy probabilities :param old_probs: Past policy probabilities - :param value: Current value estimate + :param value_heads: Value estimate tensors from each value stream :param beta: Entropy regularization strength :param entropy: Current policy entropy :param epsilon: Value for policy-divergence threshold :param lr: Learning rate :param max_step: Total number of training steps. """ - self.returns_holder = tf.placeholder( - shape=[None], dtype=tf.float32, name="discounted_rewards" - ) + self.returns_holders = {} + self.old_values = {} + for name in value_heads.keys(): + returns_holder = tf.placeholder( + shape=[None], dtype=tf.float32, name="{}_returns".format(name) + ) + old_value = tf.placeholder( + shape=[None], dtype=tf.float32, name="{}_value_estimate".format(name) + ) + self.returns_holders[name] = returns_holder + self.old_values[name] = old_value self.advantage = tf.placeholder( shape=[None, 1], dtype=tf.float32, name="advantages" ) @@ -241,33 +94,32 @@ def create_ppo_optimizer( lr, self.global_step, max_step, 1e-10, power=1.0 ) - self.old_value = tf.placeholder( - shape=[None], dtype=tf.float32, name="old_value_estimates" - ) - decay_epsilon = tf.train.polynomial_decay( epsilon, self.global_step, max_step, 0.1, power=1.0 ) decay_beta = tf.train.polynomial_decay( beta, self.global_step, max_step, 1e-5, power=1.0 ) - optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate) - - clipped_value_estimate = self.old_value + tf.clip_by_value( - tf.reduce_sum(value, axis=1) - self.old_value, -decay_epsilon, decay_epsilon - ) - v_opt_a = tf.squared_difference( - self.returns_holder, tf.reduce_sum(value, axis=1) - ) - v_opt_b = tf.squared_difference(self.returns_holder, clipped_value_estimate) - self.value_loss = tf.reduce_mean( - tf.dynamic_partition(tf.maximum(v_opt_a, v_opt_b), self.mask, 2)[1] - ) + value_losses = [] + for name, head in value_heads.items(): + clipped_value_estimate = self.old_values[name] + tf.clip_by_value( + tf.reduce_sum(head, axis=1) - self.old_values[name], + -decay_epsilon, + decay_epsilon, + ) + v_opt_a = tf.squared_difference( + self.returns_holders[name], tf.reduce_sum(head, axis=1) + ) + v_opt_b = tf.squared_difference( + self.returns_holders[name], clipped_value_estimate + ) + value_loss = tf.reduce_mean( + tf.dynamic_partition(tf.maximum(v_opt_a, v_opt_b), self.mask, 2)[1] + ) + value_losses.append(value_loss) + self.value_loss = tf.reduce_mean(value_losses) - # Here we calculate PPO policy loss. In continuous control this is done independently for each action gaussian - # and then averaged together. This provides significantly better performance than treating the probability - # as an average of probabilities, or as a joint probability. 
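The loop above builds one clipped value loss per reward stream and averages them. A numpy restatement of the per-stream term, for readers comparing against the single-stream version being removed (masking by `self.mask` is omitted here for brevity):

```python
import numpy as np

def clipped_value_loss(returns, values, old_values, eps):
    # PPO-style clipped value loss for one reward stream (cf. create_losses).
    clipped = old_values + np.clip(values - old_values, -eps, eps)
    return float(np.mean(np.maximum((returns - values) ** 2, (returns - clipped) ** 2)))

# The final self.value_loss is the mean of this quantity over all value heads
# (for example "extrinsic" and "gail").
```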
r_theta = tf.exp(probs - old_probs) p_opt_a = r_theta * self.advantage p_opt_b = ( @@ -285,6 +137,6 @@ def create_ppo_optimizer( * tf.reduce_mean(tf.dynamic_partition(entropy, self.mask, 2)[1]) ) - if self.use_curiosity: - self.loss += 10 * (0.2 * self.forward_loss + 0.8 * self.inverse_loss) + def create_ppo_optimizer(self): + optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate) self.update_batch = optimizer.minimize(self.loss) diff --git a/ml-agents/mlagents/trainers/ppo/policy.py b/ml-agents/mlagents/trainers/ppo/policy.py index 351a0bc791..cb52a700db 100644 --- a/ml-agents/mlagents/trainers/ppo/policy.py +++ b/ml-agents/mlagents/trainers/ppo/policy.py @@ -1,13 +1,22 @@ import logging import numpy as np +from typing import Any, Dict +import tensorflow as tf +from mlagents.envs.timers import timed +from mlagents.trainers import BrainInfo, ActionInfo +from mlagents.trainers.models import EncoderType from mlagents.trainers.ppo.models import PPOModel -from mlagents.trainers.policy import Policy +from mlagents.trainers.tf_policy import TFPolicy +from mlagents.trainers.components.reward_signals.reward_signal_factory import ( + create_reward_signal, +) +from mlagents.trainers.components.bc.module import BCModule logger = logging.getLogger("mlagents.trainers") -class PPOPolicy(Policy): +class PPOPolicy(TFPolicy): def __init__(self, seed, brain, trainer_params, is_training, load): """ Policy for Proximal Policy Optimization Networks. @@ -18,9 +27,10 @@ def __init__(self, seed, brain, trainer_params, is_training, load): :param load: Whether a pre-trained model will be loaded or a new one created. """ super().__init__(seed, brain, trainer_params) - self.has_updated = False - self.use_curiosity = bool(trainer_params["use_curiosity"]) + reward_signal_configs = trainer_params["reward_signals"] + + self.reward_signals = {} with self.graph.as_default(): self.model = PPOModel( brain, @@ -33,11 +43,32 @@ def __init__(self, seed, brain, trainer_params, is_training, load): use_recurrent=trainer_params["use_recurrent"], num_layers=int(trainer_params["num_layers"]), m_size=self.m_size, - use_curiosity=bool(trainer_params["use_curiosity"]), - curiosity_strength=float(trainer_params["curiosity_strength"]), - curiosity_enc_size=float(trainer_params["curiosity_enc_size"]), seed=seed, + stream_names=list(reward_signal_configs.keys()), + vis_encode_type=EncoderType( + trainer_params.get("vis_encode_type", "simple") + ), ) + self.model.create_ppo_optimizer() + + # Create reward signals + for reward_signal, config in reward_signal_configs.items(): + self.reward_signals[reward_signal] = create_reward_signal( + self, reward_signal, config + ) + + # Create pretrainer if needed + if "pretraining" in trainer_params: + BCModule.check_config(trainer_params["pretraining"]) + self.bc_module = BCModule( + self, + policy_learning_rate=trainer_params["learning_rate"], + default_batch_size=trainer_params["batch_size"], + default_num_epoch=trainer_params["num_epoch"], + **trainer_params["pretraining"], + ) + else: + self.bc_module = None if load: self._load_graph() @@ -47,7 +78,7 @@ def __init__(self, seed, brain, trainer_params, is_training, load): self.inference_dict = { "action": self.model.output, "log_probs": self.model.all_log_probs, - "value": self.model.value, + "value": self.model.value_heads, "entropy": self.model.entropy, "learning_rate": self.model.learning_rate, } @@ -55,19 +86,23 @@ def __init__(self, seed, brain, trainer_params, is_training, load): self.inference_dict["pre_action"] = 
self.model.output_pre if self.use_recurrent: self.inference_dict["memory_out"] = self.model.memory_out - if is_training and self.use_vec_obs and trainer_params["normalize"]: - self.inference_dict["update_mean"] = self.model.update_mean - self.inference_dict["update_variance"] = self.model.update_variance + if ( + is_training + and self.use_vec_obs + and trainer_params["normalize"] + and not load + ): + self.inference_dict["update_mean"] = self.model.update_normalization + + self.total_policy_loss = self.model.policy_loss self.update_dict = { "value_loss": self.model.value_loss, - "policy_loss": self.model.policy_loss, + "policy_loss": self.total_policy_loss, "update_batch": self.model.update_batch, } - if self.use_curiosity: - self.update_dict["forward_loss"] = self.model.forward_loss - self.update_dict["inverse_loss"] = self.model.inverse_loss + @timed def evaluate(self, brain_info): """ Evaluates policy for the agent experiences provided. @@ -94,12 +129,13 @@ def evaluate(self, brain_info): size=(len(brain_info.vector_observations), self.model.act_size[0]) ) feed_dict[self.model.epsilon] = epsilon - feed_dict = self._fill_eval_dict(feed_dict, brain_info) + feed_dict = self.fill_eval_dict(feed_dict, brain_info) run_out = self._execute_model(feed_dict, self.inference_dict) if self.use_continuous_act: run_out["random_normal_epsilon"] = epsilon return run_out + @timed def update(self, mini_batch, num_sequences): """ Updates model using buffer. @@ -111,13 +147,19 @@ def update(self, mini_batch, num_sequences): self.model.batch_size: num_sequences, self.model.sequence_length: self.sequence_length, self.model.mask_input: mini_batch["masks"].flatten(), - self.model.returns_holder: mini_batch["discounted_returns"].flatten(), - self.model.old_value: mini_batch["value_estimates"].flatten(), self.model.advantage: mini_batch["advantages"].reshape([-1, 1]), self.model.all_old_log_probs: mini_batch["action_probs"].reshape( [-1, sum(self.model.act_size)] ), } + for name in self.reward_signals: + feed_dict[self.model.returns_holders[name]] = mini_batch[ + "{}_returns".format(name) + ].flatten() + feed_dict[self.model.old_values[name]] = mini_batch[ + "{}_value_estimates".format(name) + ].flatten() + if self.use_continuous_act: feed_dict[self.model.output_pre] = mini_batch["actions_pre"].reshape( [-1, self.model.act_size[0]] @@ -140,10 +182,6 @@ def update(self, mini_batch, num_sequences): feed_dict[self.model.vector_in] = mini_batch["vector_obs"].reshape( [-1, self.vec_obs_size] ) - if self.use_curiosity: - feed_dict[self.model.next_vector_in] = mini_batch[ - "next_vector_in" - ].reshape([-1, self.vec_obs_size]) if self.model.vis_obs_size > 0: for i, _ in enumerate(self.model.visual_in): _obs = mini_batch["visual_obs%d" % i] @@ -152,71 +190,28 @@ def update(self, mini_batch, num_sequences): feed_dict[self.model.visual_in[i]] = _obs.reshape([-1, _w, _h, _c]) else: feed_dict[self.model.visual_in[i]] = _obs - if self.use_curiosity: - for i, _ in enumerate(self.model.visual_in): - _obs = mini_batch["next_visual_obs%d" % i] - if self.sequence_length > 1 and self.use_recurrent: - (_batch, _seq, _w, _h, _c) = _obs.shape - feed_dict[self.model.next_visual_in[i]] = _obs.reshape( - [-1, _w, _h, _c] - ) - else: - feed_dict[self.model.next_visual_in[i]] = _obs if self.use_recurrent: mem_in = mini_batch["memory"][:, 0, :] feed_dict[self.model.memory_in] = mem_in - self.has_updated = True run_out = self._execute_model(feed_dict, self.update_dict) return run_out - def get_intrinsic_rewards(self, curr_info, next_info): 
- """ - Generates intrinsic reward used for Curiosity-based training. - :BrainInfo curr_info: Current BrainInfo. - :BrainInfo next_info: Next BrainInfo. - :return: Intrinsic rewards for all agents. - """ - if self.use_curiosity: - if len(curr_info.agents) == 0: - return [] - - feed_dict = { - self.model.batch_size: len(next_info.vector_observations), - self.model.sequence_length: 1, - } - if self.use_continuous_act: - feed_dict[ - self.model.selected_actions - ] = next_info.previous_vector_actions - else: - feed_dict[self.model.action_holder] = next_info.previous_vector_actions - for i in range(self.model.vis_obs_size): - feed_dict[self.model.visual_in[i]] = curr_info.visual_observations[i] - feed_dict[self.model.next_visual_in[i]] = next_info.visual_observations[ - i - ] - if self.use_vec_obs: - feed_dict[self.model.vector_in] = curr_info.vector_observations - feed_dict[self.model.next_vector_in] = next_info.vector_observations - if self.use_recurrent: - if curr_info.memories.shape[1] == 0: - curr_info.memories = self.make_empty_memory(len(curr_info.agents)) - feed_dict[self.model.memory_in] = curr_info.memories - intrinsic_rewards = self.sess.run( - self.model.intrinsic_reward, feed_dict=feed_dict - ) * float(self.has_updated) - return intrinsic_rewards - else: - return None - - def get_value_estimate(self, brain_info, idx): + def get_value_estimates( + self, brain_info: BrainInfo, idx: int, done: bool + ) -> Dict[str, float]: """ Generates value estimates for bootstrapping. :param brain_info: BrainInfo to be used for bootstrapping. :param idx: Index in BrainInfo of agent. - :return: Value estimate. + :param done: Whether or not this is the last element of the episode, in which case the value estimate will be 0. + :return: The value estimate dictionary with key being the name of the reward signal and the value the + corresponding value estimate. """ - feed_dict = {self.model.batch_size: 1, self.model.sequence_length: 1} + + feed_dict: Dict[tf.Tensor, Any] = { + self.model.batch_size: 1, + self.model.sequence_length: 1, + } for i in range(len(brain_info.visual_observations)): feed_dict[self.model.visual_in[i]] = [ brain_info.visual_observations[i][idx] @@ -231,21 +226,37 @@ def get_value_estimate(self, brain_info, idx): feed_dict[self.model.prev_action] = brain_info.previous_vector_actions[ idx ].reshape([-1, len(self.model.act_size)]) - value_estimate = self.sess.run(self.model.value, feed_dict) - return value_estimate + value_estimates = self.sess.run(self.model.value_heads, feed_dict) - def get_last_reward(self): - """ - Returns the last reward the trainer has had - :return: the new last reward - """ - return self.sess.run(self.model.last_reward) + value_estimates = {k: float(v) for k, v in value_estimates.items()} + + # If we're done, reassign all of the value estimates that need terminal states. + if done: + for k in value_estimates: + if self.reward_signals[k].use_terminal_states: + value_estimates[k] = 0.0 - def update_reward(self, new_reward): + return value_estimates + + def get_action(self, brain_info: BrainInfo) -> ActionInfo: """ - Updates reward value for policy. - :param new_reward: New reward to save. + Decides actions given observations information, and takes them in environment. + :param brain_info: A dictionary of brain names and BrainInfo from environment. 
+ :return: an ActionInfo containing action, memories, values and an object + to be passed to add experiences """ - self.sess.run( - self.model.update_reward, feed_dict={self.model.new_reward: new_reward} + if len(brain_info.agents) == 0: + return ActionInfo([], [], [], None, None) + + run_out = self.evaluate(brain_info) + mean_values = np.mean( + np.array(list(run_out.get("value").values())), axis=0 + ).flatten() + + return ActionInfo( + action=run_out.get("action"), + memory=run_out.get("memory_out"), + text=None, + value=mean_values, + outputs=run_out, ) diff --git a/ml-agents/mlagents/trainers/ppo/trainer.py b/ml-agents/mlagents/trainers/ppo/trainer.py index f2185c2876..8310c1c77f 100644 --- a/ml-agents/mlagents/trainers/ppo/trainer.py +++ b/ml-agents/mlagents/trainers/ppo/trainer.py @@ -1,18 +1,18 @@ # # Unity ML-Agents Toolkit # ## ML-Agent Learning (PPO) -# Contains an implementation of PPO as described (https://arxiv.org/abs/1707.06347). +# Contains an implementation of PPO as described in: https://arxiv.org/abs/1707.06347 import logging -from collections import deque +from collections import defaultdict +from typing import List, Any import numpy as np -import tensorflow as tf from mlagents.envs import AllBrainInfo, BrainInfo from mlagents.trainers.buffer import Buffer from mlagents.trainers.ppo.policy import PPOPolicy -from mlagents.trainers.trainer import Trainer - +from mlagents.trainers.trainer import Trainer, UnityTrainerException +from mlagents.envs.action_info import ActionInfoOutputs logger = logging.getLogger("mlagents.trainers") @@ -26,18 +26,18 @@ def __init__( """ Responsible for collecting experiences and training PPO model. :param trainer_parameters: The parameters for the trainer (dictionary). + :param reward_buff_cap: Max reward history to track in the reward buffer :param training: Whether the trainer is set for training. :param load: Whether the model should be loaded. :param seed: The seed the model will be initialized with :param run_id: The identifier of the current run """ - super(PPOTrainer, self).__init__(brain, trainer_parameters, training, run_id) + super().__init__(brain, trainer_parameters, training, run_id, reward_buff_cap) self.param_keys = [ "batch_size", "beta", "buffer_size", "epsilon", - "gamma", "hidden_units", "lambd", "learning_rate", @@ -51,47 +51,40 @@ def __init__( "use_recurrent", "summary_path", "memory_size", - "use_curiosity", - "curiosity_strength", - "curiosity_enc_size", "model_path", + "reward_signals", ] - self.check_param_keys() - self.use_curiosity = bool(trainer_parameters["use_curiosity"]) + + # Make sure we have at least one reward_signal + if not self.trainer_parameters["reward_signals"]: + raise UnityTrainerException( + "No reward signals were defined. At least one must be used with {}.".format( + self.__class__.__name__ + ) + ) + self.step = 0 self.policy = PPOPolicy(seed, brain, trainer_parameters, self.is_training, load) - stats = { - "Environment/Cumulative Reward": [], - "Environment/Episode Length": [], - "Policy/Value Estimate": [], - "Policy/Entropy": [], - "Losses/Value Loss": [], - "Losses/Policy Loss": [], - "Policy/Learning Rate": [], - } - if self.use_curiosity: - stats["Losses/Forward Loss"] = [] - stats["Losses/Inverse Loss"] = [] - stats["Policy/Curiosity Reward"] = [] - self.intrinsic_rewards = {} + stats = defaultdict(list) + # collected_rewards is a dictionary from name of reward signal to a dictionary of agent_id to cumulative reward + # used for reporting only. 
We always want to report the environment reward to Tensorboard, regardless + # of what reward signals are actually present. + self.collected_rewards = {"environment": {}} + for _reward_signal in self.policy.reward_signals.keys(): + self.collected_rewards[_reward_signal] = {} + self.stats = stats self.training_buffer = Buffer() - self.cumulative_rewards = {} - self._reward_buffer = deque(maxlen=reward_buff_cap) self.episode_steps = {} def __str__(self): - return """Hyperparameters for the PPO Trainer of brain {0}: \n{1}""".format( + return """Hyperparameters for the {0} of brain {1}: \n{2}""".format( + self.__class__.__name__, self.brain_name, - "\n".join( - [ - "\t{0}:\t{1}".format(x, self.trainer_parameters[x]) - for x in self.param_keys - ] - ), + self.dict_to_str(self.trainer_parameters, 0), ) @property @@ -117,34 +110,24 @@ def get_step(self): """ return self.step - @property - def reward_buffer(self): + def increment_step(self, n_steps: int) -> None: """ - Returns the reward buffer. The reward buffer contains the cumulative - rewards of the most recent episodes completed by agents using this - trainer. - :return: the reward buffer. - """ - return self._reward_buffer + Increment the step count of the trainer - def increment_step_and_update_last_reward(self): - """ - Increment the step count of the trainer and Updates the last reward + :param n_steps: number of steps to increment the step count by """ - if len(self.stats["Environment/Cumulative Reward"]) > 0: - mean_reward = np.mean(self.stats["Environment/Cumulative Reward"]) - self.policy.update_reward(mean_reward) - self.policy.increment_step() - self.step = self.policy.get_current_step() + self.step = self.policy.increment_step(n_steps) def construct_curr_info(self, next_info: BrainInfo) -> BrainInfo: """ - Constructs a BrainInfo which contains the most recent previous experiences for all agents info + Constructs a BrainInfo which contains the most recent previous experiences for all agents which correspond to the agents in a provided next_info. :BrainInfo next_info: A t+1 BrainInfo. :return: curr_info: Reconstructed BrainInfo to match agents of next_info. """ - visual_observations = [[]] + visual_observations: List[List[Any]] = [ + [] + ] # TODO add types to brain.py methods vector_observations = [] text_observations = [] memories = [] @@ -205,8 +188,8 @@ def add_experiences( self, curr_all_info: AllBrainInfo, next_all_info: AllBrainInfo, - take_action_outputs, - ): + take_action_outputs: ActionInfoOutputs, + ) -> None: """ Adds experiences to each agent's experience history. :param curr_all_info: Dictionary of all current brains and corresponding BrainInfo. 
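For context, the `reward_signals` entry that the trainer now requires maps each signal name to its hyperparameters; every key also gets its own value head, return/advantage stream, and `collected_rewards` bucket. An illustrative fragment of `trainer_parameters` (the values shown are examples, not defaults taken from this PR):

```python
trainer_parameters = {
    # ... the usual PPO hyperparameters ...
    "reward_signals": {
        "extrinsic": {"strength": 1.0, "gamma": 0.99},
        "curiosity": {"strength": 0.02, "gamma": 0.99},
    },
}

for name in trainer_parameters["reward_signals"]:
    # One value head, one GAE/return stream, and one reporting bucket per signal.
    print(name)
```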
@@ -215,13 +198,14 @@ def add_experiences( """ self.trainer_metrics.start_experience_collection_timer() if take_action_outputs: - self.stats["Policy/Value Estimate"].append( - take_action_outputs["value"].mean() - ) self.stats["Policy/Entropy"].append(take_action_outputs["entropy"].mean()) self.stats["Policy/Learning Rate"].append( take_action_outputs["learning_rate"] ) + for name, signal in self.policy.reward_signals.items(): + self.stats[signal.value_name].append( + np.mean(take_action_outputs["value"][name]) + ) curr_info = curr_all_info[self.brain_name] next_info = next_all_info[self.brain_name] @@ -237,7 +221,9 @@ def add_experiences( else: curr_to_use = curr_info - intrinsic_rewards = self.policy.get_intrinsic_rewards(curr_to_use, next_info) + tmp_rewards_dict = {} + for name, signal in self.policy.reward_signals.items(): + tmp_rewards_dict[name] = signal.evaluate(curr_to_use, next_info) for agent_id in next_info.agents: stored_info = self.training_buffer[agent_id].last_brain_info @@ -285,45 +271,55 @@ def add_experiences( stored_info.action_masks[idx], padding_value=1 ) a_dist = stored_take_action_outputs["log_probs"] + # value is a dictionary from name of reward to value estimate of the value head value = stored_take_action_outputs["value"] self.training_buffer[agent_id]["actions"].append(actions[idx]) self.training_buffer[agent_id]["prev_action"].append( stored_info.previous_vector_actions[idx] ) self.training_buffer[agent_id]["masks"].append(1.0) - if self.use_curiosity: - self.training_buffer[agent_id]["rewards"].append( - next_info.rewards[next_idx] + intrinsic_rewards[next_idx] - ) - else: - self.training_buffer[agent_id]["rewards"].append( - next_info.rewards[next_idx] - ) - self.training_buffer[agent_id]["action_probs"].append(a_dist[idx]) - self.training_buffer[agent_id]["value_estimates"].append( - value[idx][0] + self.training_buffer[agent_id]["done"].append( + next_info.local_done[next_idx] ) - if agent_id not in self.cumulative_rewards: - self.cumulative_rewards[agent_id] = 0 - self.cumulative_rewards[agent_id] += next_info.rewards[next_idx] - if self.use_curiosity: - if agent_id not in self.intrinsic_rewards: - self.intrinsic_rewards[agent_id] = 0 - self.intrinsic_rewards[agent_id] += intrinsic_rewards[next_idx] + + for name, reward_result in tmp_rewards_dict.items(): + # 0 because we use the scaled reward to train the agent + self.training_buffer[agent_id][ + "{}_rewards".format(name) + ].append(reward_result.scaled_reward[next_idx]) + self.training_buffer[agent_id][ + "{}_value_estimates".format(name) + ].append(value[name][idx][0]) + + self.training_buffer[agent_id]["action_probs"].append(a_dist[idx]) + + for name, rewards in self.collected_rewards.items(): + if agent_id not in rewards: + rewards[agent_id] = 0 + if name == "environment": + # Report the reward from the environment + rewards[agent_id] += np.array(next_info.rewards)[next_idx] + else: + # Report the reward signals + rewards[agent_id] += tmp_rewards_dict[name].scaled_reward[ + next_idx + ] + if not next_info.local_done[next_idx]: if agent_id not in self.episode_steps: self.episode_steps[agent_id] = 0 self.episode_steps[agent_id] += 1 self.trainer_metrics.end_experience_collection_timer() - def process_experiences(self, current_info: AllBrainInfo, new_info: AllBrainInfo): + def process_experiences( + self, current_info: AllBrainInfo, new_info: AllBrainInfo + ) -> None: """ Checks agent histories for processing condition, and processes them as necessary. 
Processing involves calculating value and advantage targets for model updating step. :param current_info: Dictionary of all current brains and corresponding BrainInfo. :param new_info: Dictionary of all next brains and corresponding BrainInfo. """ - self.trainer_metrics.start_experience_collection_timer() info = new_info[self.brain_name] for l in range(len(info.agents)): agent_actions = self.training_buffer[info.agents[l]]["actions"] @@ -332,34 +328,51 @@ def process_experiences(self, current_info: AllBrainInfo, new_info: AllBrainInfo or len(agent_actions) > self.trainer_parameters["time_horizon"] ) and len(agent_actions) > 0: agent_id = info.agents[l] - if info.local_done[l] and not info.max_reached[l]: - value_next = 0.0 + if info.max_reached[l]: + bootstrapping_info = self.training_buffer[agent_id].last_brain_info + idx = bootstrapping_info.agents.index(agent_id) else: - if info.max_reached[l]: - bootstrapping_info = self.training_buffer[ - agent_id - ].last_brain_info - idx = bootstrapping_info.agents.index(agent_id) - else: - bootstrapping_info = info - idx = l - value_next = self.policy.get_value_estimate(bootstrapping_info, idx) - - self.training_buffer[agent_id]["advantages"].set( - get_gae( - rewards=self.training_buffer[agent_id]["rewards"].get_batch(), - value_estimates=self.training_buffer[agent_id][ - "value_estimates" - ].get_batch(), - value_next=value_next, - gamma=self.trainer_parameters["gamma"], + bootstrapping_info = info + idx = l + value_next = self.policy.get_value_estimates( + bootstrapping_info, + idx, + info.local_done[l] and not info.max_reached[l], + ) + + tmp_advantages = [] + tmp_returns = [] + for name in self.policy.reward_signals: + bootstrap_value = value_next[name] + + local_rewards = self.training_buffer[agent_id][ + "{}_rewards".format(name) + ].get_batch() + local_value_estimates = self.training_buffer[agent_id][ + "{}_value_estimates".format(name) + ].get_batch() + local_advantage = get_gae( + rewards=local_rewards, + value_estimates=local_value_estimates, + value_next=bootstrap_value, + gamma=self.policy.reward_signals[name].gamma, lambd=self.trainer_parameters["lambd"], ) - ) - self.training_buffer[agent_id]["discounted_returns"].set( - self.training_buffer[agent_id]["advantages"].get_batch() - + self.training_buffer[agent_id]["value_estimates"].get_batch() - ) + local_return = local_advantage + local_value_estimates + # This is later use as target for the different value estimates + self.training_buffer[agent_id]["{}_returns".format(name)].set( + local_return + ) + self.training_buffer[agent_id]["{}_advantage".format(name)].set( + local_advantage + ) + tmp_advantages.append(local_advantage) + tmp_returns.append(local_return) + + global_advantages = list(np.mean(np.array(tmp_advantages), axis=0)) + global_returns = list(np.mean(np.array(tmp_returns), axis=0)) + self.training_buffer[agent_id]["advantages"].set(global_advantages) + self.training_buffer[agent_id]["discounted_returns"].set(global_returns) self.training_buffer.append_update_buffer( agent_id, @@ -369,26 +382,25 @@ def process_experiences(self, current_info: AllBrainInfo, new_info: AllBrainInfo self.training_buffer[agent_id].reset_agent() if info.local_done[l]: - self.cumulative_returns_since_policy_update.append( - self.cumulative_rewards.get(agent_id, 0) - ) - self.stats["Environment/Cumulative Reward"].append( - self.cumulative_rewards.get(agent_id, 0) - ) - self.reward_buffer.appendleft( - self.cumulative_rewards.get(agent_id, 0) - ) self.stats["Environment/Episode Length"].append( 
self.episode_steps.get(agent_id, 0) ) - self.cumulative_rewards[agent_id] = 0 self.episode_steps[agent_id] = 0 - if self.use_curiosity: - self.stats["Policy/Curiosity Reward"].append( - self.intrinsic_rewards.get(agent_id, 0) - ) - self.intrinsic_rewards[agent_id] = 0 - self.trainer_metrics.end_experience_collection_timer() + for name, rewards in self.collected_rewards.items(): + if name == "environment": + self.cumulative_returns_since_policy_update.append( + rewards.get(agent_id, 0) + ) + self.stats["Environment/Cumulative Reward"].append( + rewards.get(agent_id, 0) + ) + self.reward_buffer.appendleft(rewards.get(agent_id, 0)) + rewards[agent_id] = 0 + else: + self.stats[ + self.policy.reward_signals[name].stat_name + ].append(rewards.get(agent_id, 0)) + rewards[agent_id] = 0 def end_episode(self): """ @@ -396,13 +408,11 @@ def end_episode(self): Get only called when the academy resets. """ self.training_buffer.reset_local_buffers() - for agent_id in self.cumulative_rewards: - self.cumulative_rewards[agent_id] = 0 for agent_id in self.episode_steps: self.episode_steps[agent_id] = 0 - if self.use_curiosity: - for agent_id in self.intrinsic_rewards: - self.intrinsic_rewards[agent_id] = 0 + for rewards in self.collected_rewards.values(): + for agent_id in rewards: + rewards[agent_id] = 0 def is_ready_update(self): """ @@ -417,6 +427,7 @@ def is_ready_update(self): def update_policy(self): """ Uses demonstration_buffer to update the policy. + The reward signal generators must be updated in this method at their own pace. """ self.trainer_metrics.start_policy_update_timer( number_experiences=len(self.training_buffer.update_buffer["actions"]), @@ -426,7 +437,7 @@ def update_policy(self): n_sequences = max( int(self.trainer_parameters["batch_size"] / self.policy.sequence_length), 1 ) - value_total, policy_total, forward_total, inverse_total = [], [], [], [] + value_total, policy_total = [], [] advantages = self.training_buffer.update_buffer["advantages"].get_batch() self.training_buffer.update_buffer["advantages"].set( (advantages - advantages.mean()) / (advantages.std() + 1e-10) @@ -445,14 +456,18 @@ def update_policy(self): ) value_total.append(run_out["value_loss"]) policy_total.append(np.abs(run_out["policy_loss"])) - if self.use_curiosity: - inverse_total.append(run_out["inverse_loss"]) - forward_total.append(run_out["forward_loss"]) self.stats["Losses/Value Loss"].append(np.mean(value_total)) self.stats["Losses/Policy Loss"].append(np.mean(policy_total)) - if self.use_curiosity: - self.stats["Losses/Forward Loss"].append(np.mean(forward_total)) - self.stats["Losses/Inverse Loss"].append(np.mean(inverse_total)) + for _, reward_signal in self.policy.reward_signals.items(): + update_stats = reward_signal.update( + self.training_buffer.update_buffer, n_sequences + ) + for stat, val in update_stats.items(): + self.stats[stat].append(val) + if self.policy.bc_module: + update_stats = self.policy.bc_module.update() + for stat, val in update_stats.items(): + self.stats[stat].append(val) self.training_buffer.reset_update_buffer() self.trainer_metrics.end_policy_update() @@ -483,7 +498,7 @@ def get_gae(rewards, value_estimates, value_next=0.0, gamma=0.99, lambd=0.95): :param lambd: GAE weighing factor. :return: list of advantage estimates for time-steps t to T. 
""" - value_estimates = np.asarray(value_estimates.tolist() + [value_next]) + value_estimates = np.append(value_estimates, value_next) delta_t = rewards + gamma * value_estimates[1:] - value_estimates[:-1] advantage = discount_rewards(r=delta_t, gamma=gamma * lambd) return advantage diff --git a/ml-agents/mlagents/trainers/tensorflow_to_barracuda.py b/ml-agents/mlagents/trainers/tensorflow_to_barracuda.py index 97ec2cc8a0..3e7ec4b0fe 100644 --- a/ml-agents/mlagents/trainers/tensorflow_to_barracuda.py +++ b/ml-agents/mlagents/trainers/tensorflow_to_barracuda.py @@ -52,7 +52,9 @@ id=1, rank=2, out_shapes=lambda shapes: [ - [shapes[0][0], 1, 1, shapes[0][1]], # W + [shapes[0][0], 1, 1, shapes[0][1]] + if len(shapes[0]) > 1 + else [1, 1, 1, 1], # W [1, 1, 1, shapes[-1][-1]], # B ], patch_data=lambda data: [data[0], data[1]], @@ -324,9 +326,14 @@ "ConcatV2", "Identity", ] - ): "BasicLSTM", - repr([re.compile("^lstm/"), "Reshape", "ConcatV2", "Identity"]): "BasicLSTM", - repr(["Reshape", re.compile("^lstm_[a-z]*/"), "Reshape", "ConcatV2"]): "BasicLSTM", + ): "BasicLSTMReshapeOut", + repr( + [re.compile("^lstm/"), "Reshape", "ConcatV2", "Identity"] + ): "BasicLSTMReshapeOut", + repr( + ["Reshape", re.compile("^lstm_[a-z]*/"), "Reshape", "ConcatV2"] + ): "BasicLSTMReshapeOut", + repr(["Reshape", re.compile("^lstm_[a-z]*/"), "ConcatV2"]): "BasicLSTMConcatOut", repr(["Sigmoid", "Mul"]): "Swish", repr(["Mul", "Abs", "Mul", "Add"]): "LeakyRelu", repr( @@ -376,7 +383,8 @@ def order_by(args, names): op="Flatten", input=[ inputs[-1] - ], # take only the last input, assume all other arguments are trivial (like sequence_length==1 always in ML-agents LSTM nets) + ], # take only the last input, assume all other arguments are trivial (like sequence_length==1 + # always in ML-agents LSTM nets) ), "Reshape": lambda nodes, inputs, tensors, context: Struct( op="Reshape", @@ -522,7 +530,8 @@ def order_by(args, names): input=[i for i in inputs] + [t.name for t in tensors][1:][ -2: - ], # [1:] - skips the 0th tensor, since Conv2DBackpropInput 0th tensor is 'input_sizes' (which differs from other Conv layers) + ], # [1:] - skips the 0th tensor, since Conv2DBackpropInput 0th tensor is 'input_sizes' + # (which differs from other Conv layers) # [-2:] - take only last 2 tensors, this allows to process large patterns with the same code padding=get_attr(by_op(nodes, "Conv2DBackpropInput"), "padding"), strides=get_attr(by_op(nodes, "Conv2DBackpropInput"), "strides"), @@ -544,15 +553,19 @@ def order_by(args, names): "SquaredDifference": lambda nodes, inputs, tensors, _: sqr_diff( nodes[-1].name, inputs[0], inputs[1] ), - "BasicLSTM": lambda nodes, inputs, tensors, context: basic_lstm( + "BasicLSTMReshapeOut": lambda nodes, inputs, tensors, context: basic_lstm( nodes, inputs, tensors, context, find_type="Reshape" ), + "BasicLSTMConcatOut": lambda nodes, inputs, tensors, context: basic_lstm( + nodes, inputs, tensors, context, find_type="ConcatV2" + ), "Swish": lambda nodes, inputs, tensors, _: Struct(op="Swish", input=inputs), "LeakyRelu": lambda nodes, inputs, tensors, _: Struct(op="LeakyRelu", input=inputs), # TODO:'Round' # TODO:'Rsqrt' } + # Debug def debug(s): print(s) @@ -783,7 +796,8 @@ def strided_slice( end = end.astype(np.int32).tolist() strides = strides.astype(np.int32).tolist() - # StridedSlice range and mask descriptions: https://www.tensorflow.org/api_docs/cc/class/tensorflow/ops/strided-slice + # StridedSlice range and mask descriptions: + # 
https://www.tensorflow.org/api_docs/cc/class/tensorflow/ops/strided-slice # TODO: I don't think elipsis and newaxis would work together well with current implementation assert len(begin) == len(end) @@ -1071,7 +1085,7 @@ def process_layer(layer, context, args): else: activation = "Linear" - if not class_name in known_classes: + if class_name not in known_classes: if class_name in requires_runtime_flag: print("SKIP:", class_name, "layer is used only for training") else: @@ -1091,7 +1105,6 @@ def process_layer(layer, context, args): auto_pad = get_attr(layer, "padding") # layer.attr['padding'].s.decode("utf-8") pads = get_attr(layer, "pads") strides = get_attr(layer, "strides") # layer.attr['strides'].list.i - dilations = get_attr(layer, "dilations") # layer.attr['dilations'].list.i pool_size = get_attr(layer, "ksize") # layer.attr['ksize'].list.i shape = get_attr(layer, "shape") starts = get_attr(layer, "starts") @@ -1105,11 +1118,11 @@ def process_layer(layer, context, args): alpha = get_attr(layer, "alpha", default=1) beta = get_attr(layer, "beta") - if activation and not activation in known_activations: + if activation and activation not in known_activations: print("IGNORED: unknown activation", activation) - if auto_pad and not auto_pad in known_paddings: + if auto_pad and auto_pad not in known_paddings: print("IGNORED: unknown padding", auto_pad) - if data_frmt and not data_frmt in supported_data_formats: + if data_frmt and data_frmt not in supported_data_formats: print("UNSUPPORTED: data format", data_frmt) o_l.activation = known_activations.get(activation) or 0 @@ -1198,7 +1211,7 @@ def process_layer(layer, context, args): -1 not in input_ranks ) # for rank() lambda all input ranks have to be known (not -1) rank = rank(input_ranks) - if rank == None: + if rank is None: def all_elements_equal(arr): # http://stackoverflow.com/q/3844948/ return arr.count(arr[0]) == len(arr) @@ -1335,7 +1348,8 @@ def get_tensors(pattern_nodes): # filter only inputs that are coming from nodes that are outside this pattern # preserve the order pattern_nodes = [n.name for n in pattern_nodes] + tensor_names - # inputs_from_outside_pattern = remove_duplicates_from_list([i for i in inputs_to_op_nodes if nodes_by_name[i] not in pattern_nodes]) + # inputs_from_outside_pattern = remove_duplicates_from_list([i for i in inputs_to_op_nodes if + # nodes_by_name[i] not in pattern_nodes]) inputs_from_outside_pattern = remove_duplicates_from_list( [i for i in inputs_to_op_nodes if i not in pattern_nodes] ) @@ -1496,7 +1510,8 @@ def convert( Converts a TensorFlow model into a Barracuda model. :param source_file: The TensorFlow Model :param target_file: The name of the file the converted model will be saved to - :param trim_unused_by_output: The regexp to match output nodes to remain in the model. All other uconnected nodes will be removed. + :param trim_unused_by_output: The regexp to match output nodes to remain in the model. + All other unconnected nodes will be removed. 
:param verbose: If True, will display debug messages :param compress_f16: If true, the float values will be converted to f16 :return: @@ -1554,7 +1569,6 @@ def is_unconnected_identity(layer): o_model.layers = cleanup_layers(o_model.layers) all_inputs = {i for l in o_model.layers for i in l.inputs} - embedded_tensors = {t.name for l in o_model.layers for t in l.tensors} # Trim if trim_unused_by_output: diff --git a/ml-agents/mlagents/trainers/tests/mock_brain.py b/ml-agents/mlagents/trainers/tests/mock_brain.py new file mode 100644 index 0000000000..1fcfbbc710 --- /dev/null +++ b/ml-agents/mlagents/trainers/tests/mock_brain.py @@ -0,0 +1,141 @@ +import unittest.mock as mock +import pytest +import numpy as np + +from mlagents.trainers.buffer import Buffer + + +def create_mock_brainparams( + number_visual_observations=0, + num_stacked_vector_observations=1, + vector_action_space_type="continuous", + vector_observation_space_size=3, + vector_action_space_size=None, +): + """ + Creates a mock BrainParameters object with parameters. + """ + # Avoid using mutable object as default param + if vector_action_space_size is None: + vector_action_space_size = [2] + mock_brain = mock.Mock() + mock_brain.return_value.number_visual_observations = number_visual_observations + mock_brain.return_value.num_stacked_vector_observations = ( + num_stacked_vector_observations + ) + mock_brain.return_value.vector_action_space_type = vector_action_space_type + mock_brain.return_value.vector_observation_space_size = ( + vector_observation_space_size + ) + camrez = {"blackAndWhite": False, "height": 84, "width": 84} + mock_brain.return_value.camera_resolutions = [camrez] * number_visual_observations + mock_brain.return_value.vector_action_space_size = vector_action_space_size + return mock_brain() + + +def create_mock_braininfo( + num_agents=1, + num_vector_observations=0, + num_vis_observations=0, + num_vector_acts=2, + discrete=False, +): + """ + Creates a mock BrainInfo with observations. Imitates constant + vector/visual observations, rewards, dones, and agents. + + :int num_agents: Number of "agents" to imitate in your BrainInfo values. 
+ :int num_vector_observations: Number of "observations" in your observation space + :int num_vis_observations: Number of "observations" in your observation space + :int num_vector_acts: Number of actions in your action space + :bool discrete: Whether or not action space is discrete + """ + mock_braininfo = mock.Mock() + + mock_braininfo.return_value.visual_observations = num_vis_observations * [ + np.ones((num_agents, 84, 84, 3)) + ] + mock_braininfo.return_value.vector_observations = np.array( + num_agents * [num_vector_observations * [1]] + ) + if discrete: + mock_braininfo.return_value.previous_vector_actions = np.array( + num_agents * [1 * [0.5]] + ) + mock_braininfo.return_value.action_masks = np.array( + num_agents * [num_vector_acts * [1.0]] + ) + else: + mock_braininfo.return_value.previous_vector_actions = np.array( + num_agents * [num_vector_acts * [0.5]] + ) + mock_braininfo.return_value.memories = np.ones((num_agents, 8)) + mock_braininfo.return_value.rewards = num_agents * [1.0] + mock_braininfo.return_value.local_done = num_agents * [False] + mock_braininfo.return_value.text_observations = num_agents * [""] + mock_braininfo.return_value.agents = range(0, num_agents) + return mock_braininfo() + + +def setup_mock_unityenvironment(mock_env, mock_brain, mock_braininfo): + """ + Takes a mock UnityEnvironment and adds the appropriate properties, defined by the mock + BrainParameters and BrainInfo. + + :Mock mock_env: A mock UnityEnvironment, usually empty. + :Mock mock_brain: A mock Brain object that specifies the params of this environment. + :Mock mock_braininfo: A mock BrainInfo object that will be returned at each step and reset. + """ + mock_env.return_value.academy_name = "MockAcademy" + mock_env.return_value.brains = {"MockBrain": mock_brain} + mock_env.return_value.external_brain_names = ["MockBrain"] + mock_env.return_value.brain_names = ["MockBrain"] + mock_env.return_value.reset.return_value = {"MockBrain": mock_braininfo} + mock_env.return_value.step.return_value = {"MockBrain": mock_braininfo} + + +def simulate_rollout(env, policy, buffer_init_samples): + brain_info_list = [] + for i in range(buffer_init_samples): + brain_info_list.append(env.step()[env.brain_names[0]]) + buffer = create_buffer(brain_info_list, policy.brain, policy.sequence_length) + return buffer + + +def create_buffer(brain_infos, brain_params, sequence_length): + buffer = Buffer() + # Make a buffer + for idx, experience in enumerate(brain_infos): + if idx > len(brain_infos) - 2: + break + current_brain_info = brain_infos[idx] + next_brain_info = brain_infos[idx + 1] + buffer[0].last_brain_info = current_brain_info + buffer[0]["done"].append(next_brain_info.local_done[0]) + buffer[0]["rewards"].append(next_brain_info.rewards[0]) + for i in range(brain_params.number_visual_observations): + buffer[0]["visual_obs%d" % i].append( + current_brain_info.visual_observations[i][0] + ) + buffer[0]["next_visual_obs%d" % i].append( + current_brain_info.visual_observations[i][0] + ) + if brain_params.vector_observation_space_size > 0: + buffer[0]["vector_obs"].append(current_brain_info.vector_observations[0]) + buffer[0]["next_vector_in"].append( + current_brain_info.vector_observations[0] + ) + buffer[0]["actions"].append(next_brain_info.previous_vector_actions[0]) + buffer[0]["prev_action"].append(current_brain_info.previous_vector_actions[0]) + buffer[0]["masks"].append(1.0) + buffer[0]["advantages"].append(1.0) + buffer[0]["action_probs"].append(np.ones(buffer[0]["actions"][0].shape)) + 
buffer[0]["actions_pre"].append(np.ones(buffer[0]["actions"][0].shape)) + buffer[0]["random_normal_epsilon"].append( + np.ones(buffer[0]["actions"][0].shape) + ) + buffer[0]["action_mask"].append(np.ones(buffer[0]["actions"][0].shape)) + buffer[0]["memory"].append(np.ones(8)) + + buffer.append_update_buffer(0, batch_size=None, training_length=sequence_length) + return buffer diff --git a/ml-agents/mlagents/trainers/tests/test_bcmodule.py b/ml-agents/mlagents/trainers/tests/test_bcmodule.py new file mode 100644 index 0000000000..0eee0f4d2e --- /dev/null +++ b/ml-agents/mlagents/trainers/tests/test_bcmodule.py @@ -0,0 +1,158 @@ +import unittest.mock as mock +import pytest +import mlagents.trainers.tests.mock_brain as mb + +import numpy as np +import yaml +import os + +from mlagents.trainers.ppo.policy import PPOPolicy + + +@pytest.fixture +def dummy_config(): + return yaml.safe_load( + """ + trainer: ppo + batch_size: 32 + beta: 5.0e-3 + buffer_size: 512 + epsilon: 0.2 + hidden_units: 128 + lambd: 0.95 + learning_rate: 3.0e-4 + max_steps: 5.0e4 + normalize: true + num_epoch: 5 + num_layers: 2 + time_horizon: 64 + sequence_length: 64 + summary_freq: 1000 + use_recurrent: false + memory_size: 8 + pretraining: + demo_path: ./demos/ExpertPyramid.demo + strength: 1.0 + steps: 10000000 + reward_signals: + extrinsic: + strength: 1.0 + gamma: 0.99 + """ + ) + + +def create_mock_3dball_brain(): + mock_brain = mb.create_mock_brainparams( + vector_action_space_type="continuous", + vector_action_space_size=[2], + vector_observation_space_size=8, + ) + return mock_brain + + +def create_mock_banana_brain(): + mock_brain = mb.create_mock_brainparams( + number_visual_observations=1, + vector_action_space_type="discrete", + vector_action_space_size=[3, 3, 3, 2], + vector_observation_space_size=0, + ) + return mock_brain + + +def create_ppo_policy_with_bc_mock( + mock_env, mock_brain, dummy_config, use_rnn, demo_file +): + mock_braininfo = mb.create_mock_braininfo(num_agents=12, num_vector_observations=8) + mb.setup_mock_unityenvironment(mock_env, mock_brain, mock_braininfo) + env = mock_env() + + trainer_parameters = dummy_config + model_path = env.brain_names[0] + trainer_parameters["model_path"] = model_path + trainer_parameters["keep_checkpoints"] = 3 + trainer_parameters["use_recurrent"] = use_rnn + trainer_parameters["pretraining"]["demo_path"] = ( + os.path.dirname(os.path.abspath(__file__)) + "/" + demo_file + ) + policy = PPOPolicy(0, mock_brain, trainer_parameters, False, False) + return env, policy + + +# Test default values +@mock.patch("mlagents.envs.UnityEnvironment") +def test_bcmodule_defaults(mock_env, dummy_config): + # See if default values match + mock_brain = create_mock_3dball_brain() + env, policy = create_ppo_policy_with_bc_mock( + mock_env, mock_brain, dummy_config, False, "test.demo" + ) + assert policy.bc_module.num_epoch == dummy_config["num_epoch"] + assert policy.bc_module.batch_size == dummy_config["batch_size"] + env.close() + # Assign strange values and see if it overrides properly + dummy_config["pretraining"]["num_epoch"] = 100 + dummy_config["pretraining"]["batch_size"] = 10000 + env, policy = create_ppo_policy_with_bc_mock( + mock_env, mock_brain, dummy_config, False, "test.demo" + ) + assert policy.bc_module.num_epoch == 100 + assert policy.bc_module.batch_size == 10000 + env.close() + + +# Test with continuous control env and vector actions +@mock.patch("mlagents.envs.UnityEnvironment") +def test_bcmodule_update(mock_env, dummy_config): + mock_brain = 
create_mock_3dball_brain() + env, policy = create_ppo_policy_with_bc_mock( + mock_env, mock_brain, dummy_config, False, "test.demo" + ) + stats = policy.bc_module.update() + for _, item in stats.items(): + assert isinstance(item, np.float32) + env.close() + + +# Test with RNN +@mock.patch("mlagents.envs.UnityEnvironment") +def test_bcmodule_rnn_update(mock_env, dummy_config): + mock_brain = create_mock_3dball_brain() + env, policy = create_ppo_policy_with_bc_mock( + mock_env, mock_brain, dummy_config, True, "test.demo" + ) + stats = policy.bc_module.update() + for _, item in stats.items(): + assert isinstance(item, np.float32) + env.close() + + +# Test with discrete control and visual observations +@mock.patch("mlagents.envs.UnityEnvironment") +def test_bcmodule_dc_visual_update(mock_env, dummy_config): + mock_brain = create_mock_banana_brain() + env, policy = create_ppo_policy_with_bc_mock( + mock_env, mock_brain, dummy_config, False, "testdcvis.demo" + ) + stats = policy.bc_module.update() + for _, item in stats.items(): + assert isinstance(item, np.float32) + env.close() + + +# Test with discrete control, visual observations and RNN +@mock.patch("mlagents.envs.UnityEnvironment") +def test_bcmodule_rnn_dc_update(mock_env, dummy_config): + mock_brain = create_mock_banana_brain() + env, policy = create_ppo_policy_with_bc_mock( + mock_env, mock_brain, dummy_config, True, "testdcvis.demo" + ) + stats = policy.bc_module.update() + for _, item in stats.items(): + assert isinstance(item, np.float32) + env.close() + + +if __name__ == "__main__": + pytest.main() diff --git a/ml-agents/mlagents/trainers/tests/test_demo_dir/test.demo b/ml-agents/mlagents/trainers/tests/test_demo_dir/test.demo new file mode 100644 index 0000000000..3148108ca0 Binary files /dev/null and b/ml-agents/mlagents/trainers/tests/test_demo_dir/test.demo differ diff --git a/ml-agents/mlagents/trainers/tests/test_demo_dir/test2.demo b/ml-agents/mlagents/trainers/tests/test_demo_dir/test2.demo new file mode 100644 index 0000000000..3148108ca0 Binary files /dev/null and b/ml-agents/mlagents/trainers/tests/test_demo_dir/test2.demo differ diff --git a/ml-agents/mlagents/trainers/tests/test_demo_dir/test3.demo b/ml-agents/mlagents/trainers/tests/test_demo_dir/test3.demo new file mode 100644 index 0000000000..3148108ca0 Binary files /dev/null and b/ml-agents/mlagents/trainers/tests/test_demo_dir/test3.demo differ diff --git a/ml-agents/mlagents/trainers/tests/test_demo_loader.py b/ml-agents/mlagents/trainers/tests/test_demo_loader.py index b6c029780e..765c2df18e 100644 --- a/ml-agents/mlagents/trainers/tests/test_demo_loader.py +++ b/ml-agents/mlagents/trainers/tests/test_demo_loader.py @@ -16,3 +16,16 @@ def test_load_demo(): demo_buffer = make_demo_buffer(brain_infos, brain_parameters, 1) assert len(demo_buffer.update_buffer["actions"]) == total_expected - 1 + + +def test_load_demo_dir(): + path_prefix = os.path.dirname(os.path.abspath(__file__)) + brain_parameters, brain_infos, total_expected = load_demonstration( + path_prefix + "/test_demo_dir" + ) + assert brain_parameters.brain_name == "Ball3DBrain" + assert brain_parameters.vector_observation_space_size == 8 + assert len(brain_infos) == total_expected + + demo_buffer = make_demo_buffer(brain_infos, brain_parameters, 1) + assert len(demo_buffer.update_buffer["actions"]) == total_expected - 1 diff --git a/ml-agents/mlagents/trainers/tests/test_environments/__init__.py b/ml-agents/mlagents/trainers/tests/test_environments/__init__.py new file mode 100644 index 
0000000000..e69de29bb2 diff --git a/ml-agents/mlagents/trainers/tests/test_environments/test_simple.py b/ml-agents/mlagents/trainers/tests/test_environments/test_simple.py new file mode 100644 index 0000000000..b9ebe4db1a --- /dev/null +++ b/ml-agents/mlagents/trainers/tests/test_environments/test_simple.py @@ -0,0 +1,177 @@ +import yaml +import math +import tempfile +from typing import Any, Dict + + +from mlagents.trainers.trainer_controller import TrainerController +from mlagents.envs.base_unity_environment import BaseUnityEnvironment +from mlagents.envs import BrainInfo, AllBrainInfo, BrainParameters +from mlagents.envs.communicator_objects import AgentInfoProto +from mlagents.envs.simple_env_manager import SimpleEnvManager +from mlagents.envs.sampler_class import SamplerManager + + +BRAIN_NAME = __name__ +OBS_SIZE = 1 +STEP_SIZE = 0.1 + +TIME_PENALTY = 0.001 +MIN_STEPS = int(1.0 / STEP_SIZE) + 1 +SUCCESS_REWARD = 1.0 + MIN_STEPS * TIME_PENALTY + + +def clamp(x, min_val, max_val): + return max(min_val, min(x, max_val)) + + +class Simple1DEnvironment(BaseUnityEnvironment): + """ + Very simple "game" - the agent has a position on [-1, 1], gets a reward of 1 if it reaches 1, and a reward of -1 if + it reaches -1. The position is incremented by the action amount (clamped to [-step_size, step_size]). + """ + + def __init__(self): + self._brains: Dict[str, BrainParameters] = {} + self._brains[BRAIN_NAME] = BrainParameters( + brain_name=BRAIN_NAME, + vector_observation_space_size=OBS_SIZE, + num_stacked_vector_observations=1, + camera_resolutions=[], + vector_action_space_size=[1], + vector_action_descriptions=["moveDirection"], + vector_action_space_type=1, # "continuous" + ) + + # state + self.position = 0.0 + self.step_count = 0 + + def step( + self, + vector_action: Dict[str, Any] = None, + memory: Dict[str, Any] = None, + text_action: Dict[str, Any] = None, + value: Dict[str, Any] = None, + ) -> AllBrainInfo: + assert vector_action is not None + + delta = vector_action[BRAIN_NAME][0][0] + delta = clamp(delta, -STEP_SIZE, STEP_SIZE) + self.position += delta + self.position = clamp(self.position, -1, 1) + self.step_count += 1 + done = self.position >= 1.0 or self.position <= -1.0 + if done: + reward = SUCCESS_REWARD * self.position + else: + reward = -TIME_PENALTY + + agent_info = AgentInfoProto( + stacked_vector_observation=[self.position] * OBS_SIZE, + reward=reward, + done=done, + ) + + if done: + self._reset_agent() + + return { + BRAIN_NAME: BrainInfo.from_agent_proto( + 0, [agent_info], self._brains[BRAIN_NAME] + ) + } + + def _reset_agent(self): + self.position = 0.0 + self.step_count = 0 + + def reset( + self, + config: Dict[str, float] = None, + train_mode: bool = True, + custom_reset_parameters: Any = None, + ) -> AllBrainInfo: # type: ignore + self._reset_agent() + + agent_info = AgentInfoProto( + stacked_vector_observation=[self.position] * OBS_SIZE, + done=False, + max_step_reached=False, + ) + return { + BRAIN_NAME: BrainInfo.from_agent_proto( + 0, [agent_info], self._brains[BRAIN_NAME] + ) + } + + @property + def global_done(self): + return False + + @property + def external_brains(self) -> Dict[str, BrainParameters]: + return self._brains + + @property + def reset_parameters(self) -> Dict[str, str]: + return {} + + def close(self): + pass + + +def test_simple(): + config = """ + default: + trainer: ppo + batch_size: 16 + beta: 5.0e-3 + buffer_size: 64 + epsilon: 0.2 + hidden_units: 128 + lambd: 0.95 + learning_rate: 5.0e-3 + max_steps: 2500 + memory_size: 256 + normalize: 
false + num_epoch: 3 + num_layers: 2 + time_horizon: 64 + sequence_length: 64 + summary_freq: 500 + use_recurrent: false + reward_signals: + extrinsic: + strength: 1.0 + gamma: 0.99 + """ + # Create controller and begin training. + with tempfile.TemporaryDirectory() as dir: + run_id = "id" + save_freq = 99999 + tc = TrainerController( + dir, + dir, + run_id, + save_freq, + meta_curriculum=None, + load=False, + train=True, + keep_checkpoints=1, + lesson=None, + training_seed=1337, + fast_simulation=True, + sampler_manager=SamplerManager(None), + resampling_interval=None, + ) + + # Begin training + env = Simple1DEnvironment() + env_manager = SimpleEnvManager(env) + trainer_config = yaml.safe_load(config) + tc.start_learning(env_manager, trainer_config) + + for brain_name, mean_reward in tc._get_measure_vals().items(): + assert not math.isnan(mean_reward) + assert mean_reward > 0.99 diff --git a/ml-agents/mlagents/trainers/tests/test_learn.py b/ml-agents/mlagents/trainers/tests/test_learn.py index bf8ddd028c..6809208cee 100644 --- a/ml-agents/mlagents/trainers/tests/test_learn.py +++ b/ml-agents/mlagents/trainers/tests/test_learn.py @@ -1,4 +1,3 @@ -import unittest.mock as mock import pytest from unittest.mock import * from mlagents.trainers import learn, TrainerController @@ -22,13 +21,17 @@ def basic_options(): "--no-graphics": False, "": "basic_path", "--debug": False, + "--sampler": None, } -@patch("mlagents.trainers.learn.SubprocessUnityEnvironment") +@patch("mlagents.trainers.learn.SamplerManager") +@patch("mlagents.trainers.learn.SubprocessEnvManager") @patch("mlagents.trainers.learn.create_environment_factory") @patch("mlagents.trainers.learn.load_config") -def test_run_training(load_config, create_environment_factory, subproc_env_mock): +def test_run_training( + load_config, create_environment_factory, subproc_env_mock, sampler_manager_mock +): mock_env = MagicMock() mock_env.external_brain_names = [] mock_env.academy_name = "TestAcademyName" @@ -50,16 +53,20 @@ def test_run_training(load_config, create_environment_factory, subproc_env_mock) False, 5, 0, - subproc_env_mock.return_value.external_brains, 0, True, + sampler_manager_mock.return_value, + None, ) -@patch("mlagents.trainers.learn.SubprocessUnityEnvironment") +@patch("mlagents.trainers.learn.SamplerManager") +@patch("mlagents.trainers.learn.SubprocessEnvManager") @patch("mlagents.trainers.learn.create_environment_factory") @patch("mlagents.trainers.learn.load_config") -def test_docker_target_path(load_config, create_environment_factory, subproc_env_mock): +def test_docker_target_path( + load_config, create_environment_factory, subproc_env_mock, sampler_manager_mock +): mock_env = MagicMock() mock_env.external_brain_names = [] mock_env.academy_name = "TestAcademyName" diff --git a/ml-agents/mlagents/trainers/tests/test_policy.py b/ml-agents/mlagents/trainers/tests/test_policy.py index 136e9a148b..a84a3b3d77 100644 --- a/ml-agents/mlagents/trainers/tests/test_policy.py +++ b/ml-agents/mlagents/trainers/tests/test_policy.py @@ -1,4 +1,4 @@ -from mlagents.trainers.policy import * +from mlagents.trainers.tf_policy import * from unittest.mock import MagicMock @@ -14,7 +14,7 @@ def basic_params(): def test_take_action_returns_empty_with_no_agents(): test_seed = 3 - policy = Policy(test_seed, basic_mock_brain(), basic_params()) + policy = TFPolicy(test_seed, basic_mock_brain(), basic_params()) no_agent_brain_info = BrainInfo([], [], [], agents=[]) result = policy.get_action(no_agent_brain_info) assert result == ActionInfo([], [], 
[], None, None) @@ -22,7 +22,7 @@ def test_take_action_returns_empty_with_no_agents(): def test_take_action_returns_nones_on_missing_values(): test_seed = 3 - policy = Policy(test_seed, basic_mock_brain(), basic_params()) + policy = TFPolicy(test_seed, basic_mock_brain(), basic_params()) policy.evaluate = MagicMock(return_value={}) brain_info_with_agents = BrainInfo([], [], [], agents=["an-agent-id"]) result = policy.get_action(brain_info_with_agents) @@ -31,7 +31,7 @@ def test_take_action_returns_nones_on_missing_values(): def test_take_action_returns_action_info_when_available(): test_seed = 3 - policy = Policy(test_seed, basic_mock_brain(), basic_params()) + policy = TFPolicy(test_seed, basic_mock_brain(), basic_params()) policy_eval_out = { "action": np.array([1.0]), "memory_out": np.array([2.5]), diff --git a/ml-agents/mlagents/trainers/tests/test_ppo.py b/ml-agents/mlagents/trainers/tests/test_ppo.py index 67c9948d2e..2b595f97fc 100644 --- a/ml-agents/mlagents/trainers/tests/test_ppo.py +++ b/ml-agents/mlagents/trainers/tests/test_ppo.py @@ -6,9 +6,9 @@ import yaml from mlagents.trainers.ppo.models import PPOModel -from mlagents.trainers.ppo.trainer import discount_rewards +from mlagents.trainers.ppo.trainer import PPOTrainer, discount_rewards from mlagents.trainers.ppo.policy import PPOPolicy -from mlagents.envs import UnityEnvironment +from mlagents.envs import UnityEnvironment, BrainParameters from mlagents.envs.mock_communicator import MockCommunicator @@ -21,7 +21,6 @@ def dummy_config(): beta: 5.0e-3 buffer_size: 512 epsilon: 0.2 - gamma: 0.99 hidden_units: 128 lambd: 0.95 learning_rate: 3.0e-4 @@ -34,9 +33,12 @@ def dummy_config(): summary_freq: 1000 use_recurrent: false memory_size: 8 - use_curiosity: false curiosity_strength: 0.0 curiosity_enc_size: 1 + reward_signals: + extrinsic: + strength: 1.0 + gamma: 0.99 """ ) @@ -64,6 +66,44 @@ def test_ppo_policy_evaluate(mock_communicator, mock_launcher, dummy_config): env.close() +@mock.patch("mlagents.envs.UnityEnvironment.executable_launcher") +@mock.patch("mlagents.envs.UnityEnvironment.get_communicator") +def test_ppo_get_value_estimates(mock_communicator, mock_launcher, dummy_config): + tf.reset_default_graph() + mock_communicator.return_value = MockCommunicator( + discrete_action=False, visual_inputs=0 + ) + env = UnityEnvironment(" ") + brain_infos = env.reset() + brain_info = brain_infos[env.brain_names[0]] + + trainer_parameters = dummy_config + model_path = env.brain_names[0] + trainer_parameters["model_path"] = model_path + trainer_parameters["keep_checkpoints"] = 3 + policy = PPOPolicy( + 0, env.brains[env.brain_names[0]], trainer_parameters, False, False + ) + run_out = policy.get_value_estimates(brain_info, 0, done=False) + for key, val in run_out.items(): + assert type(key) is str + assert type(val) is float + + run_out = policy.get_value_estimates(brain_info, 0, done=True) + for key, val in run_out.items(): + assert type(key) is str + assert val == 0.0 + + # Check if we ignore terminal states properly + policy.reward_signals["extrinsic"].use_terminal_states = False + run_out = policy.get_value_estimates(brain_info, 0, done=True) + for key, val in run_out.items(): + assert type(key) is str + assert val != 0.0 + + env.close() + + @mock.patch("mlagents.envs.UnityEnvironment.executable_launcher") @mock.patch("mlagents.envs.UnityEnvironment.get_communicator") def test_ppo_model_cc_vector(mock_communicator, mock_launcher): @@ -267,158 +307,6 @@ def test_ppo_model_cc_vector_rnn(mock_communicator, mock_launcher): 
env.close() -@mock.patch("mlagents.envs.UnityEnvironment.executable_launcher") -@mock.patch("mlagents.envs.UnityEnvironment.get_communicator") -def test_ppo_model_dc_vector_curio(mock_communicator, mock_launcher): - tf.reset_default_graph() - with tf.Session() as sess: - with tf.variable_scope("FakeGraphScope"): - mock_communicator.return_value = MockCommunicator( - discrete_action=True, visual_inputs=0 - ) - env = UnityEnvironment(" ") - model = PPOModel(env.brains["RealFakeBrain"], use_curiosity=True) - init = tf.global_variables_initializer() - sess.run(init) - - run_list = [ - model.output, - model.all_log_probs, - model.value, - model.entropy, - model.learning_rate, - model.intrinsic_reward, - ] - feed_dict = { - model.batch_size: 2, - model.sequence_length: 1, - model.vector_in: np.array([[1, 2, 3, 1, 2, 3], [3, 4, 5, 3, 4, 5]]), - model.next_vector_in: np.array( - [[1, 2, 3, 1, 2, 3], [3, 4, 5, 3, 4, 5]] - ), - model.action_holder: [[0], [0]], - model.action_masks: np.ones([2, 2]), - } - sess.run(run_list, feed_dict=feed_dict) - env.close() - - -@mock.patch("mlagents.envs.UnityEnvironment.executable_launcher") -@mock.patch("mlagents.envs.UnityEnvironment.get_communicator") -def test_ppo_model_cc_vector_curio(mock_communicator, mock_launcher): - tf.reset_default_graph() - with tf.Session() as sess: - with tf.variable_scope("FakeGraphScope"): - mock_communicator.return_value = MockCommunicator( - discrete_action=False, visual_inputs=0 - ) - env = UnityEnvironment(" ") - model = PPOModel(env.brains["RealFakeBrain"], use_curiosity=True) - init = tf.global_variables_initializer() - sess.run(init) - - run_list = [ - model.output, - model.all_log_probs, - model.value, - model.entropy, - model.learning_rate, - model.intrinsic_reward, - ] - feed_dict = { - model.batch_size: 2, - model.sequence_length: 1, - model.vector_in: np.array([[1, 2, 3, 1, 2, 3], [3, 4, 5, 3, 4, 5]]), - model.next_vector_in: np.array( - [[1, 2, 3, 1, 2, 3], [3, 4, 5, 3, 4, 5]] - ), - model.output: [[0.0, 0.0], [0.0, 0.0]], - model.epsilon: np.array([[0, 1], [2, 3]]), - } - sess.run(run_list, feed_dict=feed_dict) - env.close() - - -@mock.patch("mlagents.envs.UnityEnvironment.executable_launcher") -@mock.patch("mlagents.envs.UnityEnvironment.get_communicator") -def test_ppo_model_dc_visual_curio(mock_communicator, mock_launcher): - tf.reset_default_graph() - with tf.Session() as sess: - with tf.variable_scope("FakeGraphScope"): - mock_communicator.return_value = MockCommunicator( - discrete_action=True, visual_inputs=2 - ) - env = UnityEnvironment(" ") - model = PPOModel(env.brains["RealFakeBrain"], use_curiosity=True) - init = tf.global_variables_initializer() - sess.run(init) - - run_list = [ - model.output, - model.all_log_probs, - model.value, - model.entropy, - model.learning_rate, - model.intrinsic_reward, - ] - feed_dict = { - model.batch_size: 2, - model.sequence_length: 1, - model.vector_in: np.array([[1, 2, 3, 1, 2, 3], [3, 4, 5, 3, 4, 5]]), - model.next_vector_in: np.array( - [[1, 2, 3, 1, 2, 3], [3, 4, 5, 3, 4, 5]] - ), - model.action_holder: [[0], [0]], - model.visual_in[0]: np.ones([2, 40, 30, 3]), - model.visual_in[1]: np.ones([2, 40, 30, 3]), - model.next_visual_in[0]: np.ones([2, 40, 30, 3]), - model.next_visual_in[1]: np.ones([2, 40, 30, 3]), - model.action_masks: np.ones([2, 2]), - } - sess.run(run_list, feed_dict=feed_dict) - env.close() - - -@mock.patch("mlagents.envs.UnityEnvironment.executable_launcher") -@mock.patch("mlagents.envs.UnityEnvironment.get_communicator") -def 
test_ppo_model_cc_visual_curio(mock_communicator, mock_launcher): - tf.reset_default_graph() - with tf.Session() as sess: - with tf.variable_scope("FakeGraphScope"): - mock_communicator.return_value = MockCommunicator( - discrete_action=False, visual_inputs=2 - ) - env = UnityEnvironment(" ") - model = PPOModel(env.brains["RealFakeBrain"], use_curiosity=True) - init = tf.global_variables_initializer() - sess.run(init) - - run_list = [ - model.output, - model.all_log_probs, - model.value, - model.entropy, - model.learning_rate, - model.intrinsic_reward, - ] - feed_dict = { - model.batch_size: 2, - model.sequence_length: 1, - model.vector_in: np.array([[1, 2, 3, 1, 2, 3], [3, 4, 5, 3, 4, 5]]), - model.next_vector_in: np.array( - [[1, 2, 3, 1, 2, 3], [3, 4, 5, 3, 4, 5]] - ), - model.output: [[0.0, 0.0], [0.0, 0.0]], - model.visual_in[0]: np.ones([2, 40, 30, 3]), - model.visual_in[1]: np.ones([2, 40, 30, 3]), - model.next_visual_in[0]: np.ones([2, 40, 30, 3]), - model.next_visual_in[1]: np.ones([2, 40, 30, 3]), - model.epsilon: np.array([[0, 1], [2, 3]]), - } - sess.run(run_list, feed_dict=feed_dict) - env.close() - - def test_rl_functions(): rewards = np.array([0.0, 0.0, 0.0, 1.0]) gamma = 0.9 @@ -426,5 +314,46 @@ def test_rl_functions(): np.testing.assert_array_almost_equal(returns, np.array([0.729, 0.81, 0.9, 1.0])) +def test_trainer_increment_step(): + trainer_params = { + "trainer": "ppo", + "batch_size": 2048, + "beta": 0.005, + "buffer_size": 20480, + "epsilon": 0.2, + "gamma": 0.995, + "hidden_units": 512, + "lambd": 0.95, + "learning_rate": 0.0003, + "max_steps": "2e6", + "memory_size": 256, + "normalize": True, + "num_epoch": 3, + "num_layers": 3, + "time_horizon": 1000, + "sequence_length": 64, + "summary_freq": 3000, + "use_recurrent": False, + "use_curiosity": False, + "curiosity_strength": 0.01, + "curiosity_enc_size": 128, + "summary_path": "./summaries/test_trainer_summary", + "model_path": "./models/test_trainer_models/TestModel", + "keep_checkpoints": 5, + "reward_signals": {"extrinsic": {"strength": 1.0, "gamma": 0.99}}, + } + brain_params = BrainParameters("test_brain", 1, 1, [], [2], [], 0) + + trainer = PPOTrainer(brain_params, 0, trainer_params, True, False, 0, "0") + policy_mock = mock.Mock() + step_count = 10 + policy_mock.increment_step = mock.Mock(return_value=step_count) + trainer.policy = policy_mock + + trainer.increment_step(5) + policy_mock.increment_step.assert_called_with(5) + assert trainer.step == 10 + + if __name__ == "__main__": pytest.main() diff --git a/ml-agents/mlagents/trainers/tests/test_reward_signals.py b/ml-agents/mlagents/trainers/tests/test_reward_signals.py new file mode 100644 index 0000000000..1b695788c3 --- /dev/null +++ b/ml-agents/mlagents/trainers/tests/test_reward_signals.py @@ -0,0 +1,226 @@ +import unittest.mock as mock +import pytest +import mlagents.trainers.tests.mock_brain as mb + +import numpy as np +import tensorflow as tf +import yaml +import os + +from mlagents.trainers.ppo.models import PPOModel +from mlagents.trainers.ppo.trainer import discount_rewards +from mlagents.trainers.ppo.policy import PPOPolicy +from mlagents.trainers.demo_loader import make_demo_buffer +from mlagents.envs import UnityEnvironment +from mlagents.envs.mock_communicator import MockCommunicator + + +@pytest.fixture +def dummy_config(): + return yaml.safe_load( + """ + trainer: ppo + batch_size: 32 + beta: 5.0e-3 + buffer_size: 512 + epsilon: 0.2 + hidden_units: 128 + lambd: 0.95 + learning_rate: 3.0e-4 + max_steps: 5.0e4 + normalize: true + num_epoch: 5 
+ num_layers: 2 + time_horizon: 64 + sequence_length: 64 + summary_freq: 1000 + use_recurrent: false + memory_size: 8 + curiosity_strength: 0.0 + curiosity_enc_size: 1 + reward_signals: + extrinsic: + strength: 1.0 + gamma: 0.99 + """ + ) + + +@pytest.fixture +def gail_dummy_config(): + return { + "gail": { + "strength": 0.1, + "gamma": 0.9, + "encoding_size": 128, + "demo_path": os.path.dirname(os.path.abspath(__file__)) + "/test.demo", + } + } + + +@pytest.fixture +def curiosity_dummy_config(): + return {"curiosity": {"strength": 0.1, "gamma": 0.9, "encoding_size": 128}} + + +VECTOR_ACTION_SPACE = [2] +VECTOR_OBS_SPACE = 8 +DISCRETE_ACTION_SPACE = [2] +BUFFER_INIT_SAMPLES = 20 +NUM_AGENTS = 12 + + +def create_ppo_policy_mock( + mock_env, dummy_config, reward_signal_config, use_rnn, use_discrete, use_visual +): + + if not use_visual: + mock_brain = mb.create_mock_brainparams( + vector_action_space_type="discrete" if use_discrete else "continuous", + vector_action_space_size=DISCRETE_ACTION_SPACE + if use_discrete + else VECTOR_ACTION_SPACE, + vector_observation_space_size=VECTOR_OBS_SPACE, + ) + mock_braininfo = mb.create_mock_braininfo( + num_agents=NUM_AGENTS, + num_vector_observations=VECTOR_OBS_SPACE, + num_vector_acts=sum( + DISCRETE_ACTION_SPACE if use_discrete else VECTOR_ACTION_SPACE + ), + discrete=use_discrete, + ) + else: + mock_brain = mb.create_mock_brainparams( + vector_action_space_type="discrete" if use_discrete else "continuous", + vector_action_space_size=DISCRETE_ACTION_SPACE + if use_discrete + else VECTOR_ACTION_SPACE, + vector_observation_space_size=0, + number_visual_observations=1, + ) + mock_braininfo = mb.create_mock_braininfo( + num_agents=NUM_AGENTS, + num_vis_observations=1, + num_vector_acts=sum( + DISCRETE_ACTION_SPACE if use_discrete else VECTOR_ACTION_SPACE + ), + discrete=use_discrete, + ) + mb.setup_mock_unityenvironment(mock_env, mock_brain, mock_braininfo) + env = mock_env() + + trainer_parameters = dummy_config + model_path = env.brain_names[0] + trainer_parameters["model_path"] = model_path + trainer_parameters["keep_checkpoints"] = 3 + trainer_parameters["reward_signals"].update(reward_signal_config) + trainer_parameters["use_recurrent"] = use_rnn + policy = PPOPolicy(0, mock_brain, trainer_parameters, False, False) + return env, policy + + +def reward_signal_eval(env, policy, reward_signal_name): + brain_infos = env.reset() + brain_info = brain_infos[env.brain_names[0]] + next_brain_info = env.step()[env.brain_names[0]] + # Test evaluate + rsig_result = policy.reward_signals[reward_signal_name].evaluate( + brain_info, next_brain_info + ) + assert rsig_result.scaled_reward.shape == (NUM_AGENTS,) + assert rsig_result.unscaled_reward.shape == (NUM_AGENTS,) + + +def reward_signal_update(env, policy, reward_signal_name): + buffer = mb.simulate_rollout(env, policy, BUFFER_INIT_SAMPLES) + out = policy.reward_signals[reward_signal_name].update(buffer.update_buffer, 2) + assert type(out) is dict + + +@mock.patch("mlagents.envs.UnityEnvironment") +def test_gail_cc(mock_env, dummy_config, gail_dummy_config): + env, policy = create_ppo_policy_mock( + mock_env, dummy_config, gail_dummy_config, False, False, False + ) + reward_signal_eval(env, policy, "gail") + reward_signal_update(env, policy, "gail") + + +@mock.patch("mlagents.envs.UnityEnvironment") +def test_gail_dc(mock_env, dummy_config, gail_dummy_config): + env, policy = create_ppo_policy_mock( + mock_env, dummy_config, gail_dummy_config, False, True, False + ) + reward_signal_eval(env, policy, "gail") + 
reward_signal_update(env, policy, "gail") + + +@mock.patch("mlagents.envs.UnityEnvironment") +def test_gail_visual(mock_env, dummy_config, gail_dummy_config): + gail_dummy_config["gail"]["demo_path"] = ( + os.path.dirname(os.path.abspath(__file__)) + "/testdcvis.demo" + ) + env, policy = create_ppo_policy_mock( + mock_env, dummy_config, gail_dummy_config, False, True, True + ) + reward_signal_eval(env, policy, "gail") + reward_signal_update(env, policy, "gail") + + +@mock.patch("mlagents.envs.UnityEnvironment") +def test_gail_rnn(mock_env, dummy_config, gail_dummy_config): + env, policy = create_ppo_policy_mock( + mock_env, dummy_config, gail_dummy_config, True, False, False + ) + reward_signal_eval(env, policy, "gail") + reward_signal_update(env, policy, "gail") + + +@mock.patch("mlagents.envs.UnityEnvironment") +def test_curiosity_cc(mock_env, dummy_config, curiosity_dummy_config): + env, policy = create_ppo_policy_mock( + mock_env, dummy_config, curiosity_dummy_config, False, False, False + ) + reward_signal_eval(env, policy, "curiosity") + reward_signal_update(env, policy, "curiosity") + + +@mock.patch("mlagents.envs.UnityEnvironment") +def test_curiosity_dc(mock_env, dummy_config, curiosity_dummy_config): + env, policy = create_ppo_policy_mock( + mock_env, dummy_config, curiosity_dummy_config, False, True, False + ) + reward_signal_eval(env, policy, "curiosity") + reward_signal_update(env, policy, "curiosity") + + +@mock.patch("mlagents.envs.UnityEnvironment") +def test_curiosity_visual(mock_env, dummy_config, curiosity_dummy_config): + env, policy = create_ppo_policy_mock( + mock_env, dummy_config, curiosity_dummy_config, False, False, True + ) + reward_signal_eval(env, policy, "curiosity") + reward_signal_update(env, policy, "curiosity") + + +@mock.patch("mlagents.envs.UnityEnvironment") +def test_curiosity_rnn(mock_env, dummy_config, curiosity_dummy_config): + env, policy = create_ppo_policy_mock( + mock_env, dummy_config, curiosity_dummy_config, True, False, False + ) + reward_signal_eval(env, policy, "curiosity") + reward_signal_update(env, policy, "curiosity") + + +@mock.patch("mlagents.envs.UnityEnvironment") +def test_extrinsic(mock_env, dummy_config, curiosity_dummy_config): + env, policy = create_ppo_policy_mock( + mock_env, dummy_config, curiosity_dummy_config, False, False, False + ) + reward_signal_eval(env, policy, "extrinsic") + reward_signal_update(env, policy, "extrinsic") + + +if __name__ == "__main__": + pytest.main() diff --git a/ml-agents/mlagents/trainers/tests/test_trainer_controller.py b/ml-agents/mlagents/trainers/tests/test_trainer_controller.py index ed97024a3f..b0125b22c5 100644 --- a/ml-agents/mlagents/trainers/tests/test_trainer_controller.py +++ b/ml-agents/mlagents/trainers/tests/test_trainer_controller.py @@ -1,4 +1,3 @@ -import json import os from unittest.mock import * @@ -11,7 +10,9 @@ from mlagents.trainers.ppo.trainer import PPOTrainer from mlagents.trainers.bc.offline_trainer import OfflineBCTrainer from mlagents.trainers.bc.online_trainer import OnlineBCTrainer +from mlagents.envs.subprocess_env_manager import StepInfo from mlagents.envs.exception import UnityEnvironmentException +from mlagents.envs.sampler_class import SamplerManager @pytest.fixture @@ -148,7 +149,7 @@ def dummy_bad_config(): @pytest.fixture -def basic_trainer_controller(brain_info): +def basic_trainer_controller(): return TrainerController( model_path="test_model_path", summaries_dir="test_summaries_dir", @@ -159,9 +160,10 @@ def basic_trainer_controller(brain_info): 
train=True, keep_checkpoints=False, lesson=None, - external_brains={"testbrain": brain_info}, training_seed=99, fast_simulation=True, + sampler_manager=SamplerManager(None), + resampling_interval=None, ) @@ -169,36 +171,54 @@ def basic_trainer_controller(brain_info): @patch("tensorflow.set_random_seed") def test_initialization_seed(numpy_random_seed, tensorflow_set_seed): seed = 27 - TrainerController("", "", "1", 1, None, True, False, False, None, {}, seed, True) + TrainerController( + "", + "", + "1", + 1, + None, + True, + False, + False, + None, + seed, + True, + SamplerManager(None), + None, + ) numpy_random_seed.assert_called_with(seed) tensorflow_set_seed.assert_called_with(seed) def assert_bc_trainer_constructed( - trainer_cls, input_config, tc, expected_brain_info, expected_config + trainer_cls, input_config, tc, expected_brain_params, expected_config ): - def mock_constructor(self, brain, trainer_params, training, load, seed, run_id): - assert brain == expected_brain_info - assert trainer_params == expected_config + external_brains = {"testbrain": expected_brain_params} + + def mock_constructor(self, brain, trainer_parameters, training, load, seed, run_id): + assert brain == expected_brain_params + assert trainer_parameters == expected_config assert training == tc.train_model assert load == tc.load_model assert seed == tc.seed assert run_id == tc.run_id with patch.object(trainer_cls, "__init__", mock_constructor): - tc.initialize_trainers(input_config) + tc.initialize_trainers(input_config, external_brains) assert "testbrain" in tc.trainers assert isinstance(tc.trainers["testbrain"], trainer_cls) def assert_ppo_trainer_constructed( - input_config, tc, expected_brain_info, expected_config, expected_reward_buff_cap=0 + input_config, tc, expected_brain_params, expected_config, expected_reward_buff_cap=1 ): + external_brains = {"testbrain": expected_brain_params} + def mock_constructor( self, brain, reward_buff_cap, trainer_parameters, training, load, seed, run_id ): self.trainer_metrics = TrainerMetrics("", "") - assert brain == expected_brain_info + assert brain == expected_brain_params assert trainer_parameters == expected_config assert reward_buff_cap == expected_reward_buff_cap assert training == tc.train_model @@ -207,15 +227,15 @@ def mock_constructor( assert run_id == tc.run_id with patch.object(PPOTrainer, "__init__", mock_constructor): - tc.initialize_trainers(input_config) + tc.initialize_trainers(input_config, external_brains) assert "testbrain" in tc.trainers assert isinstance(tc.trainers["testbrain"], PPOTrainer) -@patch("mlagents.envs.BrainInfo") -def test_initialize_trainer_parameters_uses_defaults(BrainInfoMock): - brain_info_mock = BrainInfoMock() - tc = basic_trainer_controller(brain_info_mock) +@patch("mlagents.envs.BrainParameters") +def test_initialize_trainer_parameters_uses_defaults(BrainParametersMock): + brain_params_mock = BrainParametersMock() + tc = basic_trainer_controller() full_config = dummy_offline_bc_config() expected_config = full_config["default"] @@ -224,14 +244,14 @@ def test_initialize_trainer_parameters_uses_defaults(BrainInfoMock): expected_config["keep_checkpoints"] = tc.keep_checkpoints assert_bc_trainer_constructed( - OfflineBCTrainer, full_config, tc, brain_info_mock, expected_config + OfflineBCTrainer, full_config, tc, brain_params_mock, expected_config ) -@patch("mlagents.envs.BrainInfo") -def test_initialize_trainer_parameters_override_defaults(BrainInfoMock): - brain_info_mock = BrainInfoMock() - tc = 
basic_trainer_controller(brain_info_mock) +@patch("mlagents.envs.BrainParameters") +def test_initialize_trainer_parameters_override_defaults(BrainParametersMock): + brain_params_mock = BrainParametersMock() + tc = basic_trainer_controller() full_config = dummy_offline_bc_config_with_override() expected_config = full_config["default"] @@ -243,14 +263,14 @@ def test_initialize_trainer_parameters_override_defaults(BrainInfoMock): expected_config["normalize"] = False assert_bc_trainer_constructed( - OfflineBCTrainer, full_config, tc, brain_info_mock, expected_config + OfflineBCTrainer, full_config, tc, brain_params_mock, expected_config ) -@patch("mlagents.envs.BrainInfo") -def test_initialize_online_bc_trainer(BrainInfoMock): - brain_info_mock = BrainInfoMock() - tc = basic_trainer_controller(brain_info_mock) +@patch("mlagents.envs.BrainParameters") +def test_initialize_online_bc_trainer(BrainParametersMock): + brain_params_mock = BrainParametersMock() + tc = basic_trainer_controller() full_config = dummy_online_bc_config() expected_config = full_config["default"] @@ -259,14 +279,14 @@ def test_initialize_online_bc_trainer(BrainInfoMock): expected_config["keep_checkpoints"] = tc.keep_checkpoints assert_bc_trainer_constructed( - OnlineBCTrainer, full_config, tc, brain_info_mock, expected_config + OnlineBCTrainer, full_config, tc, brain_params_mock, expected_config ) -@patch("mlagents.envs.BrainInfo") -def test_initialize_ppo_trainer(BrainInfoMock): - brain_info_mock = BrainInfoMock() - tc = basic_trainer_controller(brain_info_mock) +@patch("mlagents.envs.BrainParameters") +def test_initialize_ppo_trainer(BrainParametersMock): + brain_params_mock = BrainParametersMock() + tc = basic_trainer_controller() full_config = dummy_config() expected_config = full_config["default"] @@ -274,23 +294,17 @@ def test_initialize_ppo_trainer(BrainInfoMock): expected_config["model_path"] = tc.model_path + "/testbrain" expected_config["keep_checkpoints"] = tc.keep_checkpoints - assert_ppo_trainer_constructed(full_config, tc, brain_info_mock, expected_config) + assert_ppo_trainer_constructed(full_config, tc, brain_params_mock, expected_config) -@patch("mlagents.envs.BrainInfo") -def test_initialize_invalid_trainer_raises_exception(BrainInfoMock): - brain_info_mock = BrainInfoMock() - tc = basic_trainer_controller(brain_info_mock) +@patch("mlagents.envs.BrainParameters") +def test_initialize_invalid_trainer_raises_exception(BrainParametersMock): + tc = basic_trainer_controller() bad_config = dummy_bad_config() + external_brains = {"testbrain": BrainParametersMock()} - try: - tc.initialize_trainers(bad_config) - assert ( - 1 == 0, - "Initialize trainers with bad config did not raise an exception.", - ) - except UnityEnvironmentException: - pass + with pytest.raises(UnityEnvironmentException): + tc.initialize_trainers(bad_config, external_brains) def trainer_controller_with_start_learning_mocks(): @@ -300,18 +314,19 @@ def trainer_controller_with_start_learning_mocks(): trainer_mock.parameters = {"some": "parameter"} trainer_mock.write_tensorboard_text = MagicMock() - brain_info_mock = MagicMock() - tc = basic_trainer_controller(brain_info_mock) + tc = basic_trainer_controller() tc.initialize_trainers = MagicMock() tc.trainers = {"testbrain": trainer_mock} - tc.take_step = MagicMock() + tc.advance = MagicMock() + tc.trainers["testbrain"].get_step = 0 - def take_step_sideeffect(env, curr_info): + def take_step_sideeffect(env): tc.trainers["testbrain"].get_step += 1 if tc.trainers["testbrain"].get_step > 10: raise 
KeyboardInterrupt + return 1 - tc.take_step.side_effect = take_step_sideeffect + tc.advance.side_effect = take_step_sideeffect tc._export_graph = MagicMock() tc._save_model = MagicMock() @@ -329,12 +344,15 @@ def test_start_learning_trains_forever_if_no_train_model(tf_reset_graph): env_mock = MagicMock() env_mock.close = MagicMock() env_mock.reset = MagicMock() + env_mock.external_brains = MagicMock() tc.start_learning(env_mock, trainer_config) tf_reset_graph.assert_called_once() - tc.initialize_trainers.assert_called_once_with(trainer_config) + tc.initialize_trainers.assert_called_once_with( + trainer_config, env_mock.external_brains + ) env_mock.reset.assert_called_once() - assert tc.take_step.call_count == 11 + assert tc.advance.call_count == 11 tc._export_graph.assert_not_called() tc._save_model.assert_not_called() env_mock.close.assert_called_once() @@ -350,14 +368,17 @@ def test_start_learning_trains_until_max_steps_then_saves(tf_reset_graph): env_mock = MagicMock() env_mock.close = MagicMock() env_mock.reset = MagicMock(return_value=brain_info_mock) + env_mock.external_brains = MagicMock() tc.start_learning(env_mock, trainer_config) tf_reset_graph.assert_called_once() - tc.initialize_trainers.assert_called_once_with(trainer_config) + tc.initialize_trainers.assert_called_once_with( + trainer_config, env_mock.external_brains + ) env_mock.reset.assert_called_once() - assert tc.take_step.call_count == trainer_mock.get_max_steps + 1 + assert tc.advance.call_count == trainer_mock.get_max_steps + 1 env_mock.close.assert_called_once() - tc._save_model.assert_called_once_with(steps=6) + tc._save_model.assert_called_once() def test_start_learning_updates_meta_curriculum_lesson_number(): @@ -387,8 +408,7 @@ def trainer_controller_with_take_step_mocks(): trainer_mock.parameters = {"some": "parameter"} trainer_mock.write_tensorboard_text = MagicMock() - brain_info_mock = MagicMock() - tc = basic_trainer_controller(brain_info_mock) + tc = basic_trainer_controller() tc.trainers = {"testbrain": trainer_mock} return tc, trainer_mock @@ -397,38 +417,25 @@ def trainer_controller_with_take_step_mocks(): def test_take_step_adds_experiences_to_trainer_and_trains(): tc, trainer_mock = trainer_controller_with_take_step_mocks() - curr_info_mock = MagicMock() - brain_info_mock = MagicMock() - curr_info_mock.__getitem__ = MagicMock(return_value=brain_info_mock) + old_step_info = StepInfo(Mock(), Mock(), MagicMock()) + new_step_info = StepInfo(Mock(), Mock(), MagicMock()) trainer_mock.is_ready_update = MagicMock(return_value=True) env_mock = MagicMock() - env_step_output_mock = MagicMock() - env_mock.step = MagicMock(return_value=env_step_output_mock) - env_mock.close = MagicMock() - env_mock.reset = MagicMock(return_value=curr_info_mock) + env_mock.step.return_value = [new_step_info] + env_mock.reset.return_value = [old_step_info] env_mock.global_done = False - action_output_mock = ActionInfo( - "action", "memory", "actiontext", "value", {"some": "output"} - ) - trainer_mock.get_action = MagicMock(return_value=action_output_mock) - - tc.take_step(env_mock, curr_info_mock) + tc.advance(env_mock) env_mock.reset.assert_not_called() - trainer_mock.get_action.assert_called_once_with(brain_info_mock) - env_mock.step.assert_called_once_with( - vector_action={"testbrain": action_output_mock.action}, - memory={"testbrain": action_output_mock.memory}, - text_action={"testbrain": action_output_mock.text}, - value={"testbrain": action_output_mock.value}, - ) + env_mock.step.assert_called_once() 
trainer_mock.add_experiences.assert_called_once_with( - curr_info_mock, env_step_output_mock, action_output_mock.outputs + new_step_info.previous_all_brain_info, + new_step_info.current_all_brain_info, + new_step_info.brain_name_to_action_info["testbrain"].outputs, ) trainer_mock.process_experiences.assert_called_once_with( - curr_info_mock, env_step_output_mock + new_step_info.previous_all_brain_info, new_step_info.current_all_brain_info ) trainer_mock.update_policy.assert_called_once() - trainer_mock.write_summary.assert_called_once() - trainer_mock.increment_step_and_update_last_reward.assert_called_once() + trainer_mock.increment_step.assert_called_once() diff --git a/ml-agents/mlagents/trainers/tests/testdcvis.demo b/ml-agents/mlagents/trainers/tests/testdcvis.demo new file mode 100644 index 0000000000..b46b1c664b Binary files /dev/null and b/ml-agents/mlagents/trainers/tests/testdcvis.demo differ diff --git a/ml-agents/mlagents/trainers/policy.py b/ml-agents/mlagents/trainers/tf_policy.py similarity index 93% rename from ml-agents/mlagents/trainers/policy.py rename to ml-agents/mlagents/trainers/tf_policy.py index f4df934c11..da8f32dfb1 100644 --- a/ml-agents/mlagents/trainers/policy.py +++ b/ml-agents/mlagents/trainers/tf_policy.py @@ -1,12 +1,16 @@ import logging +from typing import Any, Dict + import numpy as np import tensorflow as tf -from mlagents.trainers import ActionInfo, UnityException +from mlagents.trainers import UnityException +from mlagents.envs import Policy, ActionInfo from tensorflow.python.tools import freeze_graph from mlagents.trainers import tensorflow_to_barracuda as tf2bc from mlagents.envs import BrainInfo + logger = logging.getLogger("mlagents.trainers") @@ -18,7 +22,7 @@ class UnityPolicyException(UnityException): pass -class Policy(object): +class TFPolicy(Policy): """ Contains a learning model, and the necessary functions to interact with it to perform evaluate and updating. @@ -93,7 +97,7 @@ def _load_graph(self): ) self.saver.restore(self.sess, ckpt.model_checkpoint_path) - def evaluate(self, brain_info: BrainInfo): + def evaluate(self, brain_info: BrainInfo) -> Dict[str, Any]: """ Evaluates policy for the agent experiences provided. :param brain_info: BrainInfo input to network. @@ -140,7 +144,7 @@ def _execute_model(self, feed_dict, out_dict): run_out = dict(zip(list(out_dict.keys()), network_out)) return run_out - def _fill_eval_dict(self, feed_dict, brain_info): + def fill_eval_dict(self, feed_dict, brain_info): for i, _ in enumerate(brain_info.visual_observations): feed_dict[self.model.visual_in[i]] = brain_info.visual_observations[i] if self.use_vec_obs: @@ -165,11 +169,16 @@ def get_current_step(self): step = self.sess.run(self.model.global_step) return step - def increment_step(self): + def increment_step(self, n_steps): """ Increments model step. 
""" - self.sess.run(self.model.increment_step) + out_dict = { + "global_step": self.model.global_step, + "increment_step": self.model.increment_step, + } + feed_dict = {self.model.steps_to_increment: n_steps} + return self.sess.run(out_dict, feed_dict=feed_dict)["global_step"] def get_inference_vars(self): """ diff --git a/ml-agents/mlagents/trainers/trainer.py b/ml-agents/mlagents/trainers/trainer.py index 6353b1b56a..71faf83b1d 100644 --- a/ml-agents/mlagents/trainers/trainer.py +++ b/ml-agents/mlagents/trainers/trainer.py @@ -3,9 +3,9 @@ import os import tensorflow as tf import numpy as np +from collections import deque -from mlagents.envs import UnityException, AllBrainInfo, BrainInfo -from mlagents.trainers import ActionInfo +from mlagents.envs import UnityException, AllBrainInfo, ActionInfoOutputs from mlagents.trainers import TrainerMetrics LOGGER = logging.getLogger("mlagents.trainers") @@ -22,7 +22,7 @@ class UnityTrainerException(UnityException): class Trainer(object): """This class is the base class for the mlagents.envs.trainers""" - def __init__(self, brain, trainer_parameters, training, run_id): + def __init__(self, brain, trainer_parameters, training, run_id, reward_buff_cap=1): """ Responsible for collecting experiences and training a neural network model. :BrainParameters brain: Brain to be trained. @@ -45,6 +45,7 @@ def __init__(self, brain, trainer_parameters, training, run_id): ) self.summary_writer = tf.summary.FileWriter(self.summary_path) self.policy = None + self._reward_buffer = deque(maxlen=reward_buff_cap) def __str__(self): return """{} Trainer""".format(self.__class__) @@ -57,6 +58,28 @@ def check_param_keys(self): "brain {2}.".format(k, self.__class__, self.brain_name) ) + def dict_to_str(self, param_dict, num_tabs): + """ + Takes a parameter dictionary and converts it to a human-readable string. + Recurses if there are multiple levels of dict. Used to print out hyperaparameters. + param: param_dict: A Dictionary of key, value parameters. + return: A string version of this dictionary. + """ + if not isinstance(param_dict, dict): + return param_dict + else: + append_newline = "\n" if num_tabs > 0 else "" + return append_newline + "\n".join( + [ + "\t" + + " " * num_tabs + + "{0}:\t{1}".format( + x, self.dict_to_str(param_dict[x], num_tabs + 1) + ) + for x in param_dict + ] + ) + @property def parameters(self): """ @@ -88,35 +111,27 @@ def get_step(self): raise UnityTrainerException("The get_step property was not implemented.") @property - def get_last_reward(self): - """ - Returns the last reward the trainer has had - :return: the new last reward + def reward_buffer(self): """ - raise UnityTrainerException("The get_last_reward property was not implemented.") - - def increment_step_and_update_last_reward(self): - """ - Increment the step count of the trainer and updates the last reward + Returns the reward buffer. The reward buffer contains the cumulative + rewards of the most recent episodes completed by agents using this + trainer. + :return: the reward buffer. """ - raise UnityTrainerException( - "The increment_step_and_update_last_reward method was not implemented." - ) + return self._reward_buffer - def get_action(self, curr_info: BrainInfo) -> ActionInfo: + def increment_step(self, n_steps: int) -> None: """ - Get an action using this trainer's current policy. - :param curr_info: Current BrainInfo. - :return: The ActionInfo given by the policy given the BrainInfo. 
+ Increment the step count of the trainer """ - self.trainer_metrics.start_experience_collection_timer() - action = self.policy.get_action(curr_info) - self.trainer_metrics.end_experience_collection_timer() - return action + raise UnityTrainerException("The increment_step method was not implemented.") def add_experiences( - self, curr_info: AllBrainInfo, next_info: AllBrainInfo, take_action_outputs - ): + self, + curr_info: AllBrainInfo, + next_info: AllBrainInfo, + take_action_outputs: ActionInfoOutputs, + ) -> None: """ Adds experiences to each agent's experience history. :param curr_info: Current AllBrainInfo. @@ -125,7 +140,9 @@ def add_experiences( """ raise UnityTrainerException("The add_experiences method was not implemented.") - def process_experiences(self, current_info: AllBrainInfo, next_info: AllBrainInfo): + def process_experiences( + self, current_info: AllBrainInfo, next_info: AllBrainInfo + ) -> None: """ Checks agent histories for processing condition, and processes them as necessary. Processing involves calculating value and advantage targets for model updating step. @@ -175,7 +192,9 @@ def write_training_metrics(self): """ self.trainer_metrics.write_training_metrics() - def write_summary(self, global_step, delta_train_start, lesson_num=0): + def write_summary( + self, global_step: int, delta_train_start: float, lesson_num: int = 0 + ) -> None: """ Saves training statistics to Tensorboard. :param delta_train_start: Time elapsed since training started. @@ -191,17 +210,18 @@ def write_summary(self, global_step, delta_train_start, lesson_num=0): if self.is_training and self.get_step <= self.get_max_steps else "Not Training." ) + step = min(self.get_step, self.get_max_steps) if len(self.stats["Environment/Cumulative Reward"]) > 0: mean_reward = np.mean(self.stats["Environment/Cumulative Reward"]) LOGGER.info( " {}: {}: Step: {}. " "Time Elapsed: {:0.3f} s " "Mean " - "Reward: {" - ":0.3f}. Std of Reward: {:0.3f}. {}".format( + "Reward: {:0.3f}" + ". Std of Reward: {:0.3f}. {}".format( self.run_id, self.brain_name, - min(self.get_step, self.get_max_steps), + step, delta_train_start, mean_reward, np.std(self.stats["Environment/Cumulative Reward"]), @@ -211,7 +231,7 @@ def write_summary(self, global_step, delta_train_start, lesson_num=0): else: LOGGER.info( " {}: {}: Step: {}. No episode was completed since last summary. {}".format( - self.run_id, self.brain_name, self.get_step, is_training + self.run_id, self.brain_name, step, is_training ) ) summary = tf.Summary() @@ -221,7 +241,7 @@ def write_summary(self, global_step, delta_train_start, lesson_num=0): summary.value.add(tag="{}".format(key), simple_value=stat_mean) self.stats[key] = [] summary.value.add(tag="Environment/Lesson", simple_value=lesson_num) - self.summary_writer.add_summary(summary, self.get_step) + self.summary_writer.add_summary(summary, step) self.summary_writer.flush() def write_tensorboard_text(self, key, input_dict): @@ -241,7 +261,7 @@ def write_tensorboard_text(self, key, input_dict): ) s = sess.run(s_op) self.summary_writer.add_summary(s, self.get_step) - except: + except Exception: LOGGER.info( "Cannot write text summary for Tensorboard. Tensorflow version must be r1.2 or above." 
) diff --git a/ml-agents/mlagents/trainers/trainer_controller.py b/ml-agents/mlagents/trainers/trainer_controller.py index e1af120cc1..cfa7911ae4 100644 --- a/ml-agents/mlagents/trainers/trainer_controller.py +++ b/ml-agents/mlagents/trainers/trainer_controller.py @@ -3,23 +3,30 @@ """Launches trainers for each External Brains in a Unity Environment.""" import os +import json import logging -import shutil -import sys from typing import * import numpy as np import tensorflow as tf from time import time -from mlagents.envs import AllBrainInfo, BrainParameters -from mlagents.envs.base_unity_environment import BaseUnityEnvironment -from mlagents.envs.exception import UnityEnvironmentException -from mlagents.trainers import Trainer +from mlagents.envs import BrainParameters +from mlagents.envs.env_manager import StepInfo +from mlagents.envs.env_manager import EnvManager +from mlagents.envs.exception import ( + UnityEnvironmentException, + UnityCommunicationException, +) +from mlagents.envs.sampler_class import SamplerManager +from mlagents.envs.timers import hierarchical_timer, get_timer_tree, timed +from mlagents.trainers import Trainer, TrainerMetrics from mlagents.trainers.ppo.trainer import PPOTrainer from mlagents.trainers.bc.offline_trainer import OfflineBCTrainer from mlagents.trainers.bc.online_trainer import OnlineBCTrainer from mlagents.trainers.meta_curriculum import MetaCurriculum +from mlagents.envs.base_unity_environment import BaseUnityEnvironment +from mlagents.envs.subprocess_env_manager import SubprocessEnvManager class TrainerController(object): @@ -34,9 +41,10 @@ def __init__( train: bool, keep_checkpoints: int, lesson: Optional[int], - external_brains: Dict[str, BrainParameters], training_seed: int, fast_simulation: bool, + sampler_manager: SamplerManager, + resampling_interval: Optional[int], ): """ :param model_path: Path to save the model. @@ -48,14 +56,13 @@ def __init__( :param train: Whether to train model, or only run inference. :param keep_checkpoints: How many model checkpoints to keep. :param lesson: Start learning from this lesson. - :param external_brains: dictionary of external brain names to BrainInfo objects. :param training_seed: Seed to use for Numpy and Tensorflow random number generation. + :param sampler_manager: SamplerManager object handles samplers for resampling the reset parameters. + :param resampling_interval: Specifies number of simulation steps after which reset parameters are resampled. 
""" self.model_path = model_path self.summaries_dir = summaries_dir - self.external_brains = external_brains - self.external_brain_names = external_brains.keys() self.logger = logging.getLogger("mlagents.envs") self.run_id = run_id self.save_freq = save_freq @@ -65,17 +72,18 @@ def __init__( self.keep_checkpoints = keep_checkpoints self.trainers: Dict[str, Trainer] = {} self.trainer_metrics: Dict[str, TrainerMetrics] = {} - self.global_step = 0 self.meta_curriculum = meta_curriculum self.seed = training_seed self.training_start_time = time() self.fast_simulation = fast_simulation np.random.seed(self.seed) tf.set_random_seed(self.seed) + self.sampler_manager = sampler_manager + self.resampling_interval = resampling_interval def _get_measure_vals(self): + brain_names_to_measure_vals = {} if self.meta_curriculum: - brain_names_to_measure_vals = {} for ( brain_name, curriculum, @@ -89,25 +97,25 @@ def _get_measure_vals(self): elif curriculum.measure == "reward": measure_val = np.mean(self.trainers[brain_name].reward_buffer) brain_names_to_measure_vals[brain_name] = measure_val - return brain_names_to_measure_vals else: - return None + for brain_name, trainer in self.trainers.items(): + measure_val = np.mean(trainer.reward_buffer) + brain_names_to_measure_vals[brain_name] = measure_val + return brain_names_to_measure_vals - def _save_model(self, steps=0): + def _save_model(self): """ Saves current model to checkpoint folder. - :param steps: Current number of steps in training process. - :param saver: Tensorflow saver for session. """ for brain_name in self.trainers.keys(): self.trainers[brain_name].save_model() self.logger.info("Saved Model") - def _save_model_when_interrupted(self, steps=0): + def _save_model_when_interrupted(self): self.logger.info( - "Learning was interrupted. Please wait " "while the graph is generated." + "Learning was interrupted. Please wait while the graph is generated." ) - self._save_model(steps) + self._save_model() def _write_training_metrics(self): """ @@ -118,6 +126,16 @@ def _write_training_metrics(self): if brain_name in self.trainer_metrics: self.trainers[brain_name].write_training_metrics() + def _write_timing_tree(self) -> None: + timing_path = f"{self.summaries_dir}/{self.run_id}_timers.json" + try: + with open(timing_path, "w") as f: + json.dump(get_timer_tree(), f, indent=2) + except FileNotFoundError: + self.logger.warning( + f"Unable to save to {timing_path}. Make sure the directory exists" + ) + def _export_graph(self): """ Exports latest saved models to .nn format for Unity embedding. 
@@ -125,13 +143,17 @@ def _export_graph(self): for brain_name in self.trainers.keys(): self.trainers[brain_name].export_model() - def initialize_trainers(self, trainer_config: Dict[str, Dict[str, str]]): + def initialize_trainers( + self, + trainer_config: Dict[str, Any], + external_brains: Dict[str, BrainParameters], + ) -> None: """ Initialization of the trainers :param trainer_config: The configurations of the trainers """ trainer_parameters_dict = {} - for brain_name in self.external_brains: + for brain_name in external_brains: trainer_parameters = trainer_config["default"].copy() trainer_parameters["summary_path"] = "{basedir}/{name}".format( basedir=self.summaries_dir, name=str(self.run_id) + "_" + brain_name @@ -141,44 +163,47 @@ def initialize_trainers(self, trainer_config: Dict[str, Dict[str, str]]): ) trainer_parameters["keep_checkpoints"] = self.keep_checkpoints if brain_name in trainer_config: - _brain_key = brain_name + _brain_key: Any = brain_name while not isinstance(trainer_config[_brain_key], dict): _brain_key = trainer_config[_brain_key] - for k in trainer_config[_brain_key]: - trainer_parameters[k] = trainer_config[_brain_key][k] + trainer_parameters.update(trainer_config[_brain_key]) trainer_parameters_dict[brain_name] = trainer_parameters.copy() - for brain_name in self.external_brains: + for brain_name in external_brains: if trainer_parameters_dict[brain_name]["trainer"] == "offline_bc": self.trainers[brain_name] = OfflineBCTrainer( - self.external_brains[brain_name], - trainer_parameters_dict[brain_name], - self.train_model, - self.load_model, - self.seed, - self.run_id, + brain=external_brains[brain_name], + trainer_parameters=trainer_parameters_dict[brain_name], + training=self.train_model, + load=self.load_model, + seed=self.seed, + run_id=self.run_id, ) elif trainer_parameters_dict[brain_name]["trainer"] == "online_bc": self.trainers[brain_name] = OnlineBCTrainer( - self.external_brains[brain_name], - trainer_parameters_dict[brain_name], - self.train_model, - self.load_model, - self.seed, - self.run_id, + brain=external_brains[brain_name], + trainer_parameters=trainer_parameters_dict[brain_name], + training=self.train_model, + load=self.load_model, + seed=self.seed, + run_id=self.run_id, ) elif trainer_parameters_dict[brain_name]["trainer"] == "ppo": - self.trainers[brain_name] = PPOTrainer( - self.external_brains[brain_name], - self.meta_curriculum.brains_to_curriculums[ + # Find lesson length based on the form of learning + if self.meta_curriculum: + lesson_length = self.meta_curriculum.brains_to_curriculums[ brain_name ].min_lesson_length - if self.meta_curriculum - else 0, - trainer_parameters_dict[brain_name], - self.train_model, - self.load_model, - self.seed, - self.run_id, + else: + lesson_length = 1 + + self.trainers[brain_name] = PPOTrainer( + brain=external_brains[brain_name], + reward_buff_cap=lesson_length, + trainer_parameters=trainer_parameters_dict[brain_name], + training=self.train_model, + load=self.load_model, + seed=self.seed, + run_id=self.run_id, ) self.trainer_metrics[brain_name] = self.trainers[ brain_name @@ -203,22 +228,49 @@ def _create_model_path(model_path): "permissions are set correctly.".format(model_path) ) - def _reset_env(self, env: BaseUnityEnvironment): + def _reset_env(self, env: EnvManager) -> List[StepInfo]: """Resets the environment. Returns: A Data structure corresponding to the initial reset state of the environment. 
""" - if self.meta_curriculum is not None: - return env.reset( - train_mode=self.fast_simulation, - config=self.meta_curriculum.get_config(), - ) - else: - return env.reset(train_mode=self.fast_simulation) + sampled_reset_param = self.sampler_manager.sample_all() + new_meta_curriculum_config = ( + self.meta_curriculum.get_config() if self.meta_curriculum else {} + ) + sampled_reset_param.update(new_meta_curriculum_config) + return env.reset(train_mode=self.fast_simulation, config=sampled_reset_param) + + def _should_save_model(self, global_step: int) -> bool: + return ( + global_step % self.save_freq == 0 and global_step != 0 and self.train_model + ) + + def _not_done_training(self) -> bool: + return ( + any([t.get_step <= t.get_max_steps for k, t in self.trainers.items()]) + or not self.train_model + ) + + def write_to_tensorboard(self, global_step: int) -> None: + for brain_name, trainer in self.trainers.items(): + # Write training statistics to Tensorboard. + delta_train_start = time() - self.training_start_time + if self.meta_curriculum is not None: + trainer.write_summary( + global_step, + delta_train_start, + lesson_num=self.meta_curriculum.brains_to_curriculums[ + brain_name + ].lesson_num, + ) + else: + trainer.write_summary(global_step, delta_train_start) - def start_learning(self, env: BaseUnityEnvironment, trainer_config): + def start_learning( + self, env_manager: EnvManager, trainer_config: Dict[str, Any] + ) -> None: # TODO: Should be able to start learning at different lesson numbers # for each curriculum. if self.meta_curriculum is not None: @@ -228,42 +280,54 @@ def start_learning(self, env: BaseUnityEnvironment, trainer_config): tf.reset_default_graph() # Prevent a single session from taking all GPU memory. - self.initialize_trainers(trainer_config) + self.initialize_trainers(trainer_config, env_manager.external_brains) for _, t in self.trainers.items(): self.logger.info(t) + global_step = 0 + if self.train_model: for brain_name, trainer in self.trainers.items(): trainer.write_tensorboard_text("Hyperparameters", trainer.parameters) try: - curr_info = self._reset_env(env) - while ( - any([t.get_step <= t.get_max_steps for k, t in self.trainers.items()]) - or not self.train_model - ): - new_info = self.take_step(env, curr_info) - self.global_step += 1 - if ( - self.global_step % self.save_freq == 0 - and self.global_step != 0 - and self.train_model - ): - # Save Tensorflow model - self._save_model(steps=self.global_step) - curr_info = new_info + for brain_name, trainer in self.trainers.items(): + env_manager.set_policy(brain_name, trainer.policy) + self._reset_env(env_manager) + while self._not_done_training(): + n_steps = self.advance(env_manager) + for i in range(n_steps): + global_step += 1 + self.reset_env_if_ready(env_manager, global_step) + if self._should_save_model(global_step): + # Save Tensorflow model + self._save_model() + self.write_to_tensorboard(global_step) # Final save Tensorflow model - if self.global_step != 0 and self.train_model: - self._save_model(steps=self.global_step) - except KeyboardInterrupt: + if global_step != 0 and self.train_model: + self._save_model() + except (KeyboardInterrupt, UnityCommunicationException): if self.train_model: - self._save_model_when_interrupted(steps=self.global_step) + self._save_model_when_interrupted() pass - env.close() if self.train_model: self._write_training_metrics() self._export_graph() + self._write_timing_tree() + env_manager.close() + + def end_trainer_episodes( + self, env: BaseUnityEnvironment, 
lessons_incremented: Dict[str, bool] + ) -> None: + self._reset_env(env) + # Reward buffers reset takes place only for curriculum learning + # else no reset. + for brain_name, trainer in self.trainers.items(): + trainer.end_episode() + for brain_name, changed in lessons_incremented.items(): + if changed: + self.trainers[brain_name].reward_buffer.clear() - def take_step(self, env: BaseUnityEnvironment, curr_info: AllBrainInfo): + def reset_env_if_ready(self, env: BaseUnityEnvironment, steps: int) -> None: if self.meta_curriculum: # Get the sizes of the reward buffers. reward_buff_sizes = { @@ -279,62 +343,47 @@ def take_step(self, env: BaseUnityEnvironment, curr_info: AllBrainInfo): # If any lessons were incremented or the environment is # ready to be reset - if self.meta_curriculum and any(lessons_incremented.values()): - curr_info = self._reset_env(env) - for brain_name, trainer in self.trainers.items(): - trainer.end_episode() - for brain_name, changed in lessons_incremented.items(): - if changed: - self.trainers[brain_name].reward_buffer.clear() - - # Decide and take an action - take_action_vector = {} - take_action_memories = {} - take_action_text = {} - take_action_value = {} - take_action_outputs = {} - for brain_name, trainer in self.trainers.items(): - action_info = trainer.get_action(curr_info[brain_name]) - take_action_vector[brain_name] = action_info.action - take_action_memories[brain_name] = action_info.memory - take_action_text[brain_name] = action_info.text - take_action_value[brain_name] = action_info.value - take_action_outputs[brain_name] = action_info.outputs - time_start_step = time() - new_info = env.step( - vector_action=take_action_vector, - memory=take_action_memories, - text_action=take_action_text, - value=take_action_value, + meta_curriculum_reset = any(lessons_incremented.values()) + + # Check if we are performing generalization training and we have finished the + # specified number of steps for the lesson + + generalization_reset = ( + not self.sampler_manager.is_empty() + and (steps != 0) + and (self.resampling_interval) + and (steps % self.resampling_interval == 0) ) - delta_time_step = time() - time_start_step + if meta_curriculum_reset or generalization_reset: + self.end_trainer_episodes(env, lessons_incremented) + + @timed + def advance(self, env: SubprocessEnvManager) -> int: + with hierarchical_timer("env_step"): + time_start_step = time() + new_step_infos = env.step() + delta_time_step = time() - time_start_step + + for step_info in new_step_infos: + for brain_name, trainer in self.trainers.items(): + if brain_name in self.trainer_metrics: + self.trainer_metrics[brain_name].add_delta_step(delta_time_step) + trainer.add_experiences( + step_info.previous_all_brain_info, + step_info.current_all_brain_info, + step_info.brain_name_to_action_info[brain_name].outputs, + ) + trainer.process_experiences( + step_info.previous_all_brain_info, step_info.current_all_brain_info + ) for brain_name, trainer in self.trainers.items(): if brain_name in self.trainer_metrics: self.trainer_metrics[brain_name].add_delta_step(delta_time_step) - trainer.add_experiences( - curr_info, new_info, take_action_outputs[brain_name] - ) - trainer.process_experiences(curr_info, new_info) - if ( - trainer.is_ready_update() - and self.train_model - and trainer.get_step <= trainer.get_max_steps - ): - # Perform gradient descent with experience buffer - - trainer.update_policy() - # Write training statistics to Tensorboard. 
- delta_train_start = time() - self.training_start_time - if self.meta_curriculum is not None: - trainer.write_summary( - self.global_step, - delta_train_start, - lesson_num=self.meta_curriculum.brains_to_curriculums[ - brain_name - ].lesson_num, - ) - else: - trainer.write_summary(self.global_step, delta_train_start) if self.train_model and trainer.get_step <= trainer.get_max_steps: - trainer.increment_step_and_update_last_reward() - return new_info + trainer.increment_step(len(new_step_infos)) + if trainer.is_ready_update(): + # Perform gradient descent with experience buffer + with hierarchical_timer("update_policy"): + trainer.update_policy() + env.set_policy(brain_name, trainer.policy) + return len(new_step_infos) diff --git a/ml-agents/mlagents/trainers/trainer_metrics.py b/ml-agents/mlagents/trainers/trainer_metrics.py index 1f058f253c..9438c5b9f2 100644 --- a/ml-agents/mlagents/trainers/trainer_metrics.py +++ b/ml-agents/mlagents/trainers/trainer_metrics.py @@ -2,6 +2,7 @@ import logging import csv from time import time +from typing import List, Optional LOGGER = logging.getLogger("mlagents.trainers") FIELD_NAMES = [ @@ -27,23 +28,23 @@ def __init__(self, path: str, brain_name: str): """ self.path = path self.brain_name = brain_name - self.rows = [] - self.time_start_experience_collection = None + self.rows: List[List[Optional[str]]] = [] + self.time_start_experience_collection: Optional[float] = None self.time_training_start = time() - self.last_buffer_length = None - self.last_mean_return = None - self.time_policy_update_start = None - self.delta_last_experience_collection = None - self.delta_policy_update = None + self.last_buffer_length: Optional[int] = None + self.last_mean_return: Optional[float] = None + self.time_policy_update_start: Optional[float] = None + self.delta_last_experience_collection: Optional[float] = None + self.delta_policy_update: Optional[float] = None - def start_experience_collection_timer(self): + def start_experience_collection_timer(self) -> None: """ Inform Metrics class that experience collection is starting. Intended to be idempotent """ if self.time_start_experience_collection is None: self.time_start_experience_collection = time() - def end_experience_collection_timer(self): + def end_experience_collection_timer(self) -> None: """ Inform Metrics class that experience collection is done. """ @@ -55,7 +56,7 @@ def end_experience_collection_timer(self): self.delta_last_experience_collection += curr_delta self.time_start_experience_collection = None - def add_delta_step(self, delta: float): + def add_delta_step(self, delta: float) -> None: """ Inform Metrics class about time to step in environment. """ @@ -64,7 +65,9 @@ def add_delta_step(self, delta: float): else: self.delta_last_experience_collection = delta - def start_policy_update_timer(self, number_experiences: int, mean_return: float): + def start_policy_update_timer( + self, number_experiences: int, mean_return: float + ) -> None: """ Inform Metrics class that policy update has started. :int number_experiences: Number of experiences in Buffer at this point. 
@@ -74,8 +77,8 @@ def start_policy_update_timer(self, number_experiences: int, mean_return: float)
         self.last_mean_return = mean_return
         self.time_policy_update_start = time()
 
-    def _add_row(self, delta_train_start):
-        row = [self.brain_name]
+    def _add_row(self, delta_train_start: float) -> None:
+        row: List[Optional[str]] = [self.brain_name]
         row.extend(
             format(c, ".3f") if isinstance(c, float) else c
             for c in [
@@ -89,7 +92,7 @@ def _add_row(self, delta_train_start):
         self.delta_last_experience_collection = None
         self.rows.append(row)
 
-    def end_policy_update(self):
+    def end_policy_update(self) -> None:
         """
         Inform Metrics class that policy update has started.
         """
@@ -115,7 +118,7 @@ def end_policy_update(self):
         )
         self._add_row(delta_train_start)
 
-    def write_training_metrics(self):
+    def write_training_metrics(self) -> None:
         """
         Write Training Metrics to CSV
         """
diff --git a/ml-agents/setup.py b/ml-agents/setup.py
index 355caa3f2a..c7486140ef 100644
--- a/ml-agents/setup.py
+++ b/ml-agents/setup.py
@@ -10,7 +10,7 @@
 
 setup(
     name="mlagents",
-    version="0.8.2",
+    version="0.9.0",
     description="Unity Machine Learning Agents",
     long_description=long_description,
     long_description_content_type="text/markdown",
@@ -29,7 +29,7 @@
     ),
     zip_safe=False,
     install_requires=[
-        "mlagents_envs==0.8.2",
+        "mlagents_envs==0.9.0",
         "tensorflow>=1.7,<1.8",
         "Pillow>=4.2.1",
         "matplotlib",
diff --git a/protobuf-definitions/README.md b/protobuf-definitions/README.md
index 281a35fb1a..da3bacbcc0 100644
--- a/protobuf-definitions/README.md
+++ b/protobuf-definitions/README.md
@@ -4,29 +4,40 @@ Contains relevant definitions needed to generate probobuf files used in [ML-Agen
 
 ## Requirements
 
-* grpc 1.14.1
 * protobuf 3.6.0
+* grpcio-tools 1.11.1
+* Grpc.Tools 1.14.1
 
 ## Set-up & Installation
 
+First, we will follow these steps once to install protobuf and grpcio-tools via your terminal.
+Assume the ml-agents repository is checked out to a folder named $MLAGENTS_ROOT.
+**Note:** If you're using Anaconda, don't forget to activate the ml-agents environment first.
+
 `pip install protobuf==3.6.0 --force`
 
-`pip install grpcio-tools`
+`pip install grpcio-tools==1.11.1`
+
+`pip install mypy-protobuf`
+
+If you don't have it already, download the latest version of [nuget](https://www.nuget.org/downloads).
+Navigate to your installation of nuget and run the following:
 
-`nuget install Grpc.Tools` into known directory.
+`nuget install Grpc.Tools -Version 1.14.1 -OutputDirectory $MLAGENTS_ROOT\protobuf-definitions`
 
 ### Installing Protobuf Compiler
 
 On Mac: `brew install protobuf`
 
-On Windows & Linux: [See here](https://github.com/google/protobuf/blob/master/src/README.md).
-
 ## Running
 
-1. Install pre-requisites.
-2. Un-comment line 4 in `make.bat`, and set to correct Grpc.Tools sub-directory.
-3. Run `make.bat`
-4. In the generated `UnityToExternalGrpc.cs` file in the `UnitySDK/Assets/ML-Agents/Scripts/CommunicatorObjects` folder, you will need to add the following to the beginning of the file
+Whenever you change the fields of a custom message, you must follow the steps below to create C# and Python files corresponding to the new message.
+
+1. Open a terminal. **Note:** If you're using Anaconda, don't forget to activate the ml-agents environment first.
+2. Un-comment line 7 in `make.bat` (for Windows, use `make_for_win.bat`), and set it to the correct Grpc.Tools sub-directory.
+3. Run the `.bat` from the terminal by navigating to `$MLAGENTS_ROOT\protobuf-definitions` and entering `make.bat` (for Windows, use `make_for_win.bat`)
+4. Note any errors generated that may result from setting the wrong directory in step 2.
+5. In the generated `UnityToExternalGrpc.cs` file in the `$MLAGENTS_ROOT/UnitySDK/Assets/ML-Agents/Scripts/CommunicatorObjects` folder, you will need to add the following to the beginning of the file:
 
 ```csharp
 # if UNITY_EDITOR || UNITY_STANDALONE_WIN || UNITY_STANDALONE_OSX || UNITY_STANDALONE_LINUX
@@ -38,3 +49,17 @@ On Windows & Linux: [See here](https://github.com/google/protobuf/blob/master/sr
 ```
 
 This is to make sure the generated code does not try to access the Grpc library on platforms that are not supported by Grpc.
+
+Finally, re-install the mlagents packages by running the following commands from the same `$MLAGENTS_ROOT\protobuf-definitions` directory.
+
+```
+cd ..
+cd ml-agents-envs
+pip install -e .
+cd ..
+cd ml-agents
+pip install -e .
+mlagents-learn
+```
+
+The final line will test if everything was generated and installed correctly. If it worked, you should see the Unity logo.
diff --git a/protobuf-definitions/make.bat b/protobuf-definitions/make.bat
index f0dbeb926d..f084c1b9c3 100755
--- a/protobuf-definitions/make.bat
+++ b/protobuf-definitions/make.bat
@@ -22,7 +22,7 @@ mkdir -p $DST_DIR_P/$PYTHON_PACKAGE
 # generate proto objects in python and C#
 
 protoc --proto_path=proto --csharp_out=$DST_DIR_C $SRC_DIR/*.proto
-protoc --proto_path=proto --python_out=$DST_DIR_P $SRC_DIR/*.proto
+protoc --proto_path=proto --python_out=$DST_DIR_P --mypy_out=$DST_DIR_P $SRC_DIR/*.proto
 
 # grpc
 
diff --git a/protobuf-definitions/make_for_win.bat b/protobuf-definitions/make_for_win.bat
index 26abe5afc2..3c729465cd 100644
--- a/protobuf-definitions/make_for_win.bat
+++ b/protobuf-definitions/make_for_win.bat
@@ -3,7 +3,8 @@ rem variables
 rem GRPC-TOOLS required. Install with `nuget install Grpc.Tools`.
 rem Then un-comment and replace [DIRECTORY] with location of files.
 rem For example, on Windows, you might have something like:
-rem set COMPILER=Grpc.Tools.1.14.1/tools/windows_x64
+rem set COMPILER=Grpc.Tools.1.14.1\tools\windows_x64
+rem set COMPILER=[DIRECTORY]
 
 set SRC_DIR=proto\mlagents\envs\communicator_objects
 set DST_DIR_C=..\UnitySDK\Assets\ML-Agents\Scripts\CommunicatorObjects
@@ -21,8 +22,8 @@ mkdir %DST_DIR_P%\%PYTHON_PACKAGE%
 rem generate proto objects in python and C#
 
 for %%i in (%SRC_DIR%\*.proto) do (
-    protoc --proto_path=proto --csharp_out=%DST_DIR_C% %%i
-    protoc --proto_path=proto --python_out=%DST_DIR_P% %%i
+    %COMPILER%\protoc --proto_path=proto --csharp_out=%DST_DIR_C% %%i
+    %COMPILER%\protoc --proto_path=proto --python_out=%DST_DIR_P% %%i
 )
 
 rem grpc
 
@@ -30,7 +31,7 @@ rem grpc
 set GRPC=unity_to_external.proto
 
 %COMPILER%\protoc --proto_path=proto --csharp_out %DST_DIR_C% --grpc_out %DST_DIR_C% %SRC_DIR%\%GRPC% --plugin=protoc-gen-grpc=%COMPILER%\grpc_csharp_plugin.exe
-python3 -m grpc_tools.protoc --proto_path=proto --python_out=%DST_DIR_P% --grpc_python_out=%DST_DIR_P% %SRC_DIR%\%GRPC%
+python -m grpc_tools.protoc --proto_path=proto --python_out=%DST_DIR_P% --grpc_python_out=%DST_DIR_P% %SRC_DIR%\%GRPC%
 
 rem Generate the init file for the python module
 rem rm -f $DST_DIR_P/$PYTHON_PACKAGE/__init__.py
diff --git a/setup.cfg b/setup.cfg
new file mode 100644
index 0000000000..7573255fe6
--- /dev/null
+++ b/setup.cfg
@@ -0,0 +1,19 @@
+[coverage:report]
+# Run "pytest --cov=mlagents" to see the current coverage percentage.
+# Run "pytest --cov=mlagents --cov-report html" to get a nice visualization of what is/isn't covered in html format.
+fail_under = 60 + + +[flake8] +# black will apply a line length of 88 to code but not docstrings/comments +# This seems like a decent compromise between readability and redoing all the docstrings. +max-line-length=120 + +ignore = + # Black tends to introduce things flake8 doesn't like, such as "line break before binary operator" + # or whitespace before ':'. Rather than fight with black, just ignore these for now. + W503, E203, + + # "may be undefined, or defined from star imports" and related warnings + # We should stop doing these, but for now, leave them in. + F405, F403, F401, diff --git a/utils/validate_meta_files.py b/utils/validate_meta_files.py index 0ea828ee8b..01eedb0ce1 100644 --- a/utils/validate_meta_files.py +++ b/utils/validate_meta_files.py @@ -33,10 +33,12 @@ def main(): num_matched += 1 if unmatched: - raise Exception(f"Mismatch between expected files and their .meta files: {sorted(unmatched)}") + raise Exception( + f"Mismatch between expected files and their .meta files: {sorted(unmatched)}" + ) print(f"Found {num_matched} correctly matched files") if __name__ == "__main__": - main() \ No newline at end of file + main()
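
For readers skimming this patch, here is a minimal, self-contained Python sketch of the data flow the refactored `TrainerController.advance` relies on, mirroring `test_take_step_adds_experiences_to_trainer_and_trains` above. The `StepInfo`, `ActionInfo`, and trainer classes below are illustrative stand-ins defined locally, not the real `mlagents` implementations:

```python
from collections import namedtuple

# Stand-ins for illustration only; the real classes live in mlagents.envs.
StepInfo = namedtuple(
    "StepInfo",
    ["previous_all_brain_info", "current_all_brain_info", "brain_name_to_action_info"],
)
ActionInfo = namedtuple("ActionInfo", ["outputs"])


class RecordingTrainer:
    """Records the calls that the advance() loop makes on a trainer."""

    def __init__(self):
        self.experiences = []
        self.step = 0

    def add_experiences(self, previous_info, current_info, outputs):
        self.experiences.append((previous_info, current_info, outputs))

    def process_experiences(self, previous_info, current_info):
        pass  # value/advantage bookkeeping would happen here

    def increment_step(self, n_steps):
        self.step += n_steps


def advance(step_infos, trainers):
    """Feed a batch of StepInfo objects to every trainer, then bump step counts."""
    for step_info in step_infos:
        for brain_name, trainer in trainers.items():
            outputs = step_info.brain_name_to_action_info[brain_name].outputs
            trainer.add_experiences(
                step_info.previous_all_brain_info,
                step_info.current_all_brain_info,
                outputs,
            )
            trainer.process_experiences(
                step_info.previous_all_brain_info, step_info.current_all_brain_info
            )
    for trainer in trainers.values():
        trainer.increment_step(len(step_infos))
    return len(step_infos)


if __name__ == "__main__":
    trainers = {"testbrain": RecordingTrainer()}
    batch = [
        StepInfo(
            previous_all_brain_info={"testbrain": "prev"},
            current_all_brain_info={"testbrain": "curr"},
            brain_name_to_action_info={"testbrain": ActionInfo(outputs={})},
        )
    ]
    print(advance(batch, trainers), trainers["testbrain"].step)  # -> 1 1
```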