diff --git a/UnitySDK/Assets/ML-Agents/Examples/3DBall/TFModels/3DBallHardLearning.nn b/UnitySDK/Assets/ML-Agents/Examples/3DBall/TFModels/3DBallHardLearning.nn index f5c74dc2a9..38ca710c2e 100644 Binary files a/UnitySDK/Assets/ML-Agents/Examples/3DBall/TFModels/3DBallHardLearning.nn and b/UnitySDK/Assets/ML-Agents/Examples/3DBall/TFModels/3DBallHardLearning.nn differ diff --git a/UnitySDK/Assets/ML-Agents/Examples/3DBall/TFModels/3DBallLearning.nn b/UnitySDK/Assets/ML-Agents/Examples/3DBall/TFModels/3DBallLearning.nn index dbe8ca2626..16c4b0baea 100644 Binary files a/UnitySDK/Assets/ML-Agents/Examples/3DBall/TFModels/3DBallLearning.nn and b/UnitySDK/Assets/ML-Agents/Examples/3DBall/TFModels/3DBallLearning.nn differ diff --git a/UnitySDK/Assets/ML-Agents/Examples/BananaCollectors/TFModels/BananaLearning.nn b/UnitySDK/Assets/ML-Agents/Examples/BananaCollectors/TFModels/BananaLearning.nn index ac8a519cec..f83aa78394 100644 Binary files a/UnitySDK/Assets/ML-Agents/Examples/BananaCollectors/TFModels/BananaLearning.nn and b/UnitySDK/Assets/ML-Agents/Examples/BananaCollectors/TFModels/BananaLearning.nn differ diff --git a/UnitySDK/Assets/ML-Agents/Examples/Basic/TFModels/BasicLearning.nn b/UnitySDK/Assets/ML-Agents/Examples/Basic/TFModels/BasicLearning.nn index 9d94677cf5..7ded379ad9 100644 Binary files a/UnitySDK/Assets/ML-Agents/Examples/Basic/TFModels/BasicLearning.nn and b/UnitySDK/Assets/ML-Agents/Examples/Basic/TFModels/BasicLearning.nn differ diff --git a/UnitySDK/Assets/ML-Agents/Examples/Bouncer/TFModels/BouncerLearning.nn b/UnitySDK/Assets/ML-Agents/Examples/Bouncer/TFModels/BouncerLearning.nn index 4ca329a0d4..f647e15ca4 100644 Binary files a/UnitySDK/Assets/ML-Agents/Examples/Bouncer/TFModels/BouncerLearning.nn and b/UnitySDK/Assets/ML-Agents/Examples/Bouncer/TFModels/BouncerLearning.nn differ diff --git a/UnitySDK/Assets/ML-Agents/Examples/Crawler/TFModels/CrawlerDynamicLearning.nn b/UnitySDK/Assets/ML-Agents/Examples/Crawler/TFModels/CrawlerDynamicLearning.nn index 6562f3bc85..87403f0d24 100644 Binary files a/UnitySDK/Assets/ML-Agents/Examples/Crawler/TFModels/CrawlerDynamicLearning.nn and b/UnitySDK/Assets/ML-Agents/Examples/Crawler/TFModels/CrawlerDynamicLearning.nn differ diff --git a/UnitySDK/Assets/ML-Agents/Examples/Crawler/TFModels/CrawlerStaticLearning.nn b/UnitySDK/Assets/ML-Agents/Examples/Crawler/TFModels/CrawlerStaticLearning.nn index 8734082436..bd37244fc0 100644 Binary files a/UnitySDK/Assets/ML-Agents/Examples/Crawler/TFModels/CrawlerStaticLearning.nn and b/UnitySDK/Assets/ML-Agents/Examples/Crawler/TFModels/CrawlerStaticLearning.nn differ diff --git a/UnitySDK/Assets/ML-Agents/Examples/GridWorld/TFModels/GridWorldLearning.nn b/UnitySDK/Assets/ML-Agents/Examples/GridWorld/TFModels/GridWorldLearning.nn index 1cfd3cc679..c5fc081004 100644 Binary files a/UnitySDK/Assets/ML-Agents/Examples/GridWorld/TFModels/GridWorldLearning.nn and b/UnitySDK/Assets/ML-Agents/Examples/GridWorld/TFModels/GridWorldLearning.nn differ diff --git a/UnitySDK/Assets/ML-Agents/Examples/Hallway/TFModels/HallwayLearning.nn b/UnitySDK/Assets/ML-Agents/Examples/Hallway/TFModels/HallwayLearning.nn index 960ed82e2d..9ec4299254 100644 Binary files a/UnitySDK/Assets/ML-Agents/Examples/Hallway/TFModels/HallwayLearning.nn and b/UnitySDK/Assets/ML-Agents/Examples/Hallway/TFModels/HallwayLearning.nn differ diff --git a/UnitySDK/Assets/ML-Agents/Examples/PushBlock/TFModels/PushBlockLearning.nn b/UnitySDK/Assets/ML-Agents/Examples/PushBlock/TFModels/PushBlockLearning.nn index 6c2f960098..b80b164554 100644 
Binary files a/UnitySDK/Assets/ML-Agents/Examples/PushBlock/TFModels/PushBlockLearning.nn and b/UnitySDK/Assets/ML-Agents/Examples/PushBlock/TFModels/PushBlockLearning.nn differ diff --git a/UnitySDK/Assets/ML-Agents/Examples/Pyramids/TFModels/PyramidsLearning.nn b/UnitySDK/Assets/ML-Agents/Examples/Pyramids/TFModels/PyramidsLearning.nn index 72def7accf..c6da2cfc8e 100644 Binary files a/UnitySDK/Assets/ML-Agents/Examples/Pyramids/TFModels/PyramidsLearning.nn and b/UnitySDK/Assets/ML-Agents/Examples/Pyramids/TFModels/PyramidsLearning.nn differ diff --git a/UnitySDK/Assets/ML-Agents/Examples/Reacher/TFModels/ReacherLearning.nn b/UnitySDK/Assets/ML-Agents/Examples/Reacher/TFModels/ReacherLearning.nn index 6d93c6afd7..a1967b4b96 100644 Binary files a/UnitySDK/Assets/ML-Agents/Examples/Reacher/TFModels/ReacherLearning.nn and b/UnitySDK/Assets/ML-Agents/Examples/Reacher/TFModels/ReacherLearning.nn differ diff --git a/UnitySDK/Assets/ML-Agents/Examples/Soccer/TFModels/GoalieLearning.nn b/UnitySDK/Assets/ML-Agents/Examples/Soccer/TFModels/GoalieLearning.nn index f4700c929e..3fa9a8fccb 100644 Binary files a/UnitySDK/Assets/ML-Agents/Examples/Soccer/TFModels/GoalieLearning.nn and b/UnitySDK/Assets/ML-Agents/Examples/Soccer/TFModels/GoalieLearning.nn differ diff --git a/UnitySDK/Assets/ML-Agents/Examples/Soccer/TFModels/StrikerLearning.nn b/UnitySDK/Assets/ML-Agents/Examples/Soccer/TFModels/StrikerLearning.nn index 51d9a092d1..36946f2cfc 100644 Binary files a/UnitySDK/Assets/ML-Agents/Examples/Soccer/TFModels/StrikerLearning.nn and b/UnitySDK/Assets/ML-Agents/Examples/Soccer/TFModels/StrikerLearning.nn differ diff --git a/UnitySDK/Assets/ML-Agents/Examples/Tennis/TFModels/TennisLearning.nn b/UnitySDK/Assets/ML-Agents/Examples/Tennis/TFModels/TennisLearning.nn index 7ed0efa125..6a8287c873 100644 Binary files a/UnitySDK/Assets/ML-Agents/Examples/Tennis/TFModels/TennisLearning.nn and b/UnitySDK/Assets/ML-Agents/Examples/Tennis/TFModels/TennisLearning.nn differ diff --git a/UnitySDK/Assets/ML-Agents/Examples/Walker/TFModels/WalkerLearning.nn b/UnitySDK/Assets/ML-Agents/Examples/Walker/TFModels/WalkerLearning.nn index 55899f37a7..759cbb4b1d 100644 Binary files a/UnitySDK/Assets/ML-Agents/Examples/Walker/TFModels/WalkerLearning.nn and b/UnitySDK/Assets/ML-Agents/Examples/Walker/TFModels/WalkerLearning.nn differ diff --git a/UnitySDK/Assets/ML-Agents/Examples/WallJump/TFModels/BigWallJumpLearning.nn b/UnitySDK/Assets/ML-Agents/Examples/WallJump/TFModels/BigWallJumpLearning.nn index 2d97135c03..3dbb8babae 100644 Binary files a/UnitySDK/Assets/ML-Agents/Examples/WallJump/TFModels/BigWallJumpLearning.nn and b/UnitySDK/Assets/ML-Agents/Examples/WallJump/TFModels/BigWallJumpLearning.nn differ diff --git a/UnitySDK/Assets/ML-Agents/Examples/WallJump/TFModels/SmallWallJumpLearning.nn b/UnitySDK/Assets/ML-Agents/Examples/WallJump/TFModels/SmallWallJumpLearning.nn index 084c4097f6..630aed3361 100644 Binary files a/UnitySDK/Assets/ML-Agents/Examples/WallJump/TFModels/SmallWallJumpLearning.nn and b/UnitySDK/Assets/ML-Agents/Examples/WallJump/TFModels/SmallWallJumpLearning.nn differ diff --git a/docs/Installation.md b/docs/Installation.md index 198d201748..bb87d88b17 100644 --- a/docs/Installation.md +++ b/docs/Installation.md @@ -63,7 +63,7 @@ If you installed this correctly, you should be able to run `mlagents-learn --help`, after which you will see the Unity logo and the command line parameters you can use with `mlagents-learn`. 
-By installing the `mlagents` package, its dependencies listed in the [setup.py file](../ml-agents/setup.py) are also installed. +By installing the `mlagents` package, the dependencies listed in the [setup.py file](../ml-agents/setup.py) are also installed. Some of the primary dependencies include: - [TensorFlow](Background-TensorFlow.md) (Requires a CPU w/ AVX support) diff --git a/docs/Learning-Environment-Examples.md b/docs/Learning-Environment-Examples.md index dcdcbbf947..4930ecf291 100644 --- a/docs/Learning-Environment-Examples.md +++ b/docs/Learning-Environment-Examples.md @@ -32,7 +32,7 @@ If you would like to contribute environments, please see our * Vector Observation space: One variable corresponding to current state. * Vector Action space: (Discrete) Two possible actions (Move left, move right). - * Visual Observations: None. + * Visual Observations: None * Reset Parameters: None * Benchmark Mean Reward: 0.94 @@ -56,7 +56,7 @@ If you would like to contribute environments, please see our * Vector Action space: (Continuous) Size of 2, with one value corresponding to X-rotation, and the other to Z-rotation. * Visual Observations: None. -* Reset Parameters: Three, corresponding to the following: +* Reset Parameters: Three * scale: Specifies the scale of the ball in the 3 dimensions (equal across the three dimensions) * Default: 1 * Recommended Minimum: 0.2 @@ -116,8 +116,8 @@ If you would like to contribute environments, please see our of ball and racket. * Vector Action space: (Continuous) Size of 2, corresponding to movement toward net or away from net, and jumping. - * Visual Observations: None. -* Reset Parameters: Three, corresponding to the following: + * Visual Observations: None +* Reset Parameters: Three * angle: Angle of the racket from the vertical (Y) axis. * Default: 55 * Recommended Minimum: 35 @@ -153,7 +153,7 @@ If you would like to contribute environments, please see our `VisualPushBlock` scene. __The visual observation version of this environment does not train with the provided default training parameters.__ -* Reset Parameters: Four, corresponding to the following: +* Reset Parameters: Four * block_scale: Scale of the block along the x and z dimensions * Default: 2 * Recommended Minimum: 0.5 @@ -194,8 +194,8 @@ If you would like to contribute environments, please see our * Rotation (3 possible actions: Rotate Left, Rotate Right, No Action) * Side Motion (3 possible actions: Left, Right, No Action) * Jump (2 possible actions: Jump, No Action) - * Visual Observations: None. -* Reset Parameters: 4, corresponding to the height of the possible walls. + * Visual Observations: None +* Reset Parameters: Four * Benchmark Mean Reward (Big & Small Wall Brain): 0.8 ## [Reacher](https://youtu.be/2N9EoF6pQyE) @@ -213,7 +213,7 @@ If you would like to contribute environments, please see our * Vector Action space: (Continuous) Size of 4, corresponding to torque applicable to two joints. * Visual Observations: None. -* Reset Parameters: Five, corresponding to the following +* Reset Parameters: Five * goal_size: radius of the goal zone * Default: 5 * Recommended Minimum: 1 @@ -254,7 +254,7 @@ If you would like to contribute environments, please see our angular acceleration of the body. * Vector Action space: (Continuous) Size of 20, corresponding to target rotations for joints. - * Visual Observations: None. 
+ * Visual Observations: None * Reset Parameters: None * Benchmark Mean Reward for `CrawlerStaticTarget`: 2000 * Benchmark Mean Reward for `CrawlerDynamicTarget`: 400 @@ -284,7 +284,7 @@ If you would like to contribute environments, please see our `VisualBanana` scene. __The visual observation version of this environment does not train with the provided default training parameters.__ -* Reset Parameters: Two, corresponding to the following +* Reset Parameters: Two * laser_length: Length of the laser used by the agent * Default: 1 * Recommended Minimum: 0.2 @@ -318,7 +318,7 @@ If you would like to contribute environments, please see our `VisualHallway` scene. __The visual observation version of this environment does not train with the provided default training parameters.__ -* Reset Parameters: None. +* Reset Parameters: None * Benchmark Mean Reward: 0.7 * To speed up training, you can enable curiosity by adding `use_curiosity: true` in `config/trainer_config.yaml` * Optional Imitation Learning scene: `HallwayIL`. @@ -340,8 +340,8 @@ If you would like to contribute environments, please see our banana. * Vector Action space: (Continuous) 3 corresponding to agent force applied for the jump. - * Visual Observations: None. -* Reset Parameters: Two, corresponding to the following + * Visual Observations: None +* Reset Parameters: Two * banana_scale: The scale of the banana in the 3 dimensions * Default: 150 * Recommended Minimum: 50 @@ -375,8 +375,8 @@ If you would like to contribute environments, please see our * Striker: 6 actions corresponding to forward, backward, sideways movement, as well as rotation. * Goalie: 4 actions corresponding to forward, backward, sideways movement. - * Visual Observations: None. -* Reset Parameters: Two, corresponding to the following: + * Visual Observations: None +* Reset Parameters: Two * ball_scale: Specifies the scale of the ball in the 3 dimensions (equal across the three dimensions) * Default: 7.5 * Recommended minimum: 4 @@ -409,8 +409,8 @@ If you would like to contribute environments, please see our velocity, and angular velocities of each limb, along with goal direction. * Vector Action space: (Continuous) Size of 39, corresponding to target rotations applicable to the joints. - * Visual Observations: None. -* Reset Parameters: Four, corresponding to the following + * Visual Observations: None +* Reset Parameters: Four * gravity: Magnitude of gravity * Default: 9.81 * Recommended Minimum: @@ -450,6 +450,6 @@ If you would like to contribute environments, please see our `VisualPyramids` scene. __The visual observation version of this environment does not train with the provided default training parameters.__ -* Reset Parameters: None. +* Reset Parameters: None * Optional Imitation Learning scene: `PyramidsIL`. * Benchmark Mean Reward: 1.75 diff --git a/docs/ML-Agents-Overview.md b/docs/ML-Agents-Overview.md index daeb770745..f194c64ef0 100644 --- a/docs/ML-Agents-Overview.md +++ b/docs/ML-Agents-Overview.md @@ -319,11 +319,11 @@ imitation learning algorithm will then use these pairs of observations and actions from the human player to learn a policy. [Video Link](https://youtu.be/kpb8ZkMBFYs). -ML-Agents provides ways to both learn directly from demonstrations as well as -use demonstrations to help speed up reward-based training, and two algorithms to do -so (Generative Adversarial Imitation Learning and Behavioral Cloning). The -[Training with Imitation Learning](Training-Imitation-Learning.md) tutorial -covers these features in more depth. 
+The toolkit provides a way to learn directly from demonstrations, as well as use them +to help speed up reward-based training (RL). We include two algorithms called +Behavioral Cloning (BC) and Generative Adversarial Imitation Learning (GAIL). The +[Training with Imitation Learning](Training-Imitation-Learning.md) tutorial covers these +features in more depth. ## Flexible Training Scenarios @@ -408,6 +408,14 @@ training process. learn more about adding visual observations to an agent [here](Learning-Environment-Design-Agents.md#multiple-visual-observations). +- **Training with Reset Parameter Sampling** - To train agents to adapt + to changes in their environment (i.e., generalization), the agent should be exposed + to several variations of the environment. Similar to Curriculum Learning, + where environments become more difficult as the agent learns, the toolkit provides + a way to randomly sample Reset Parameters of the environment during training. See + [Training Generalized Reinforcement Learning Agents](Training-Generalized-Reinforcement-Learning-Agents.md) + to learn more about this feature. + - **Broadcasting** - As discussed earlier, a Learning Brain sends the observations for all its Agents to the Python API when dragged into the Academy's `Broadcast Hub` with the `Control` checkbox checked. This is helpful @@ -422,14 +430,6 @@ training process. the broadcasting feature [here](Learning-Environment-Design-Brains.md#using-the-broadcast-feature). -- **Training with Environment Parameter Sampling** - To train agents to be robust - to changes in its environment (i.e., generalization), the agent should be exposed - to a variety of environment variations. Similarly to Curriculum Learning, which - allows environments to get more difficult as the agent learns, we also provide - a way to randomly resample aspects of the environment during training. See - [Training with Environment Parameter Sampling](Training-Generalization-Learning.md) - to learn more about this feature. - - **Docker Set-up (Experimental)** - To facilitate setting up ML-Agents without installing Python or TensorFlow directly, we provide a [guide](Using-Docker.md) on how to create and run a Docker container. diff --git a/docs/Migrating.md b/docs/Migrating.md index 7c99341e12..9ab3dafce6 100644 --- a/docs/Migrating.md +++ b/docs/Migrating.md @@ -1,5 +1,26 @@ # Migrating +## Migrating from ML-Agents toolkit v0.8 to v0.9 + +### Important Changes +* We have changed the way reward signals (including Curiosity) are defined in the +`trainer_config.yaml`. +* When using multiple environments, every "step" is recorded in TensorBoard. +* The steps in the command line console correspond to a single step of a single environment. +Previously, each step corresponded to one step for all environments (i.e., `num_envs` steps). + +#### Steps to Migrate +* If you were overriding any of the following parameters in your config file, remove them +from the top-level config and follow the steps below: + * `gamma`: Define a new `extrinsic` reward signal and set its `gamma` to your new gamma. + * `use_curiosity`, `curiosity_strength`, `curiosity_enc_size`: Define a `curiosity` reward signal + and set its `strength` to `curiosity_strength`, and `encoding_size` to `curiosity_enc_size`. Give it + the same `gamma` as your `extrinsic` signal to mimic previous behavior. +See [Reward Signals](Reward-Signals.md) for more information on defining reward signals.
+* TensorBoards generated when running multiple environments in v0.8 are not comparable to those generated in +v0.9 in terms of step count. Multiply your v0.8 step count by `num_envs` for an approximate comparison. +You may need to change `max_steps` in your config as appropriate as well. + ## Migrating from ML-Agents toolkit v0.7 to v0.8 ### Important Changes diff --git a/docs/Profiling.md b/docs/Profiling-Python.md similarity index 92% rename from docs/Profiling.md rename to docs/Profiling-Python.md index 1fc28dd314..45904b883e 100644 --- a/docs/Profiling.md +++ b/docs/Profiling-Python.md @@ -1,7 +1,7 @@ -# Profiling ML-Agents in Python +# Profiling in Python -ML-Agents provides a lightweight profiling system, in order to identity hotspots in the training process and help spot -regressions from changes. +As part of the ML-Agents toolkit, we provide a lightweight profiling system, +in order to identify hotspots in the training process and help spot regressions from changes. Timers are hierarchical, meaning that the time tracked in a block of code can be further split into other blocks if desired. This also means that a function that is called from multiple places in the code will appear in multiple @@ -24,7 +24,6 @@ class TrainerController: You can also use the `hierarchical_timer` context manager. - ``` python with hierarchical_timer("communicator.exchange"): outputs = self.communicator.exchange(step_input) diff --git a/docs/Readme.md b/docs/Readme.md index fdad80e4f5..f85ae59d80 100644 --- a/docs/Readme.md +++ b/docs/Readme.md @@ -39,6 +39,7 @@ * [Training with Curriculum Learning](Training-Curriculum-Learning.md) * [Training with Imitation Learning](Training-Imitation-Learning.md) * [Training with LSTM](Feature-Memory.md) +* [Training Generalized Reinforcement Learning Agents](Training-Generalized-Reinforcement-Learning-Agents.md) * [Training on the Cloud with Amazon Web Services](Training-on-Amazon-Web-Service.md) * [Training on the Cloud with Microsoft Azure](Training-on-Microsoft-Azure.md) * [Training Using Concurrent Unity Instances](Training-Using-Concurrent-Unity-Instances.md) diff --git a/docs/Training-RewardSignals.md b/docs/Reward-Signals.md similarity index 73% rename from docs/Training-RewardSignals.md rename to docs/Reward-Signals.md index 2f62402f1b..0b44185766 100644 --- a/docs/Training-RewardSignals.md +++ b/docs/Reward-Signals.md @@ -18,9 +18,9 @@ The `curiosity` reward signal helps your agent explore when extrinsic rewards ar ## Enabling Reward Signals Reward signals, like other hyperparameters, are defined in the trainer config `.yaml` file. An -example is provided in `config/trainer_config.yaml`. To enable a reward signal, add it to the +example is provided in `config/trainer_config.yaml` and `config/gail_config.yaml`. To enable a reward signal, add it to the `reward_signals:` section under the brain name. For instance, to enable the extrinsic signal -in addition to a small curiosity reward, you would define your `reward_signals` as follows: +in addition to a small curiosity reward and a GAIL reward signal, you would define your `reward_signals` as follows: ```yaml reward_signals: @@ -28,9 +28,14 @@ reward_signals: strength: 1.0 gamma: 0.99 curiosity: + strength: 0.02 + gamma: 0.99 + encoding_size: 256 + gail: strength: 0.01 gamma: 0.99 encoding_size: 128 + demo_path: demos/ExpertPyramid.demo ``` Each reward signal should define at least two parameters, `strength` and `gamma`, in addition @@ -39,8 +44,9 @@ its entry entirely from `reward_signals`.
At least one reward signal should be left enabled at all times. ## Reward Signal Types +As part of the toolkit, we provide three reward signal types that can be configured as hyperparameters - Extrinsic, Curiosity, and GAIL. -### The Extrinsic Reward Signal +### Extrinsic Reward Signal The `extrinsic` reward signal is simply the reward given by the [environment](Learning-Environment-Design.md). Remove it to force the agent @@ -63,10 +69,10 @@ cases when rewards are more immediate, it can be smaller. Typical Range: `0.8` - `0.995` -### The Curiosity Reward Signal +### Curiosity Reward Signal -The `curiosity` Reward Signal enables the Intrinsic Curiosity Module. This is an implementation -of the approach described in "Curiosity-driven Exploration by Self-supervised Prediction" +The `curiosity` reward signal enables the Intrinsic Curiosity Module. This is an implementation +of the approach described in "Curiosity-driven Exploration by Self-supervised Prediction" by Pathak, et al. It trains two networks: * an inverse model, which takes the current and next observation of the agent, encodes them, and uses the encoding to predict the action that was taken between the observations @@ -80,11 +86,11 @@ For more information, see * https://pathak22.github.io/noreward-rl/ * https://blogs.unity3d.com/2018/06/26/solving-sparse-reward-tasks-with-curiosity/ -#### Strength +#### Strength -In this case, `strength` corresponds to the magnitude of the curiosity reward generated -by the intrinsic curiosity module. This should be scaled in order to ensure it is large enough -to not be overwhelmed by extrinsic reward signals in the environment. +In this case, `strength` corresponds to the magnitude of the curiosity reward generated +by the intrinsic curiosity module. This should be scaled in order to ensure it is large enough +to not be overwhelmed by extrinsic reward signals in the environment. Likewise it should not be too large to overwhelm the extrinsic reward signal. Typical Range: `0.001` - `0.1` @@ -95,7 +101,7 @@ Typical Range: `0.001` - `0.1` Typical Range: `0.8` - `0.995` -#### Encoding Size +#### (Optional) Encoding Size `encoding_size` corresponds to the size of the encoding used by the intrinsic curiosity model. This value should be small enough to encourage the ICM to compress the original @@ -106,50 +112,59 @@ Default Value: `64` Typical Range: `64` - `256` -#### Learning Rate +#### (Optional) Learning Rate -`learning_rate` is the learning rate used to update the intrinsic curiosity module. +`learning_rate` is the learning rate used to update the intrinsic curiosity module. This should typically be decreased if training is unstable, and the curiosity loss is unstable. Default Value: `3e-4` -Typical Range: `1e-5` - `1e-3` +Typical Range: `1e-5` - `1e-3` + +#### (Optional) Num Epochs + +`num_epoch` The number of passes to make through the experience buffer when performing gradient +descent optimization for the ICM. This typically should be set to the same as used for PPO. + +Default Value: `3` -### The GAIL Reward Signal +Typical Range: `3` - `10` -GAIL, or [Generative Adversarial Imitation Learning](https://arxiv.org/abs/1606.03476), is an -imitation learning algorithm that uses an adversarial approach, in a similar vein to GANs +### GAIL Reward Signal + +GAIL, or [Generative Adversarial Imitation Learning](https://arxiv.org/abs/1606.03476), is an +imitation learning algorithm that uses an adversarial approach, in a similar vein to GANs (Generative Adversarial Networks).
In this framework, a second neural network, the -discriminator, is taught to distinguish whether an observation/action is from a demonstration, or -produced by the agent. This discriminator can the examine a new observation/action and provide it a -reward based on how close it believes this new observation/action is to the provided demonstrations. +discriminator, is taught to distinguish whether an observation/action is from a demonstration or +produced by the agent. This discriminator can then examine a new observation/action and provide it a +reward based on how close it believes this new observation/action is to the provided demonstrations. -At each training step, the agent tries to learn how to maximize this reward. Then, the -discriminator is trained to better distinguish between demonstrations and agent state/actions. +At each training step, the agent tries to learn how to maximize this reward. Then, the +discriminator is trained to better distinguish between demonstrations and agent state/actions. In this way, while the agent gets better and better at mimicking the demonstrations, the -discriminator keeps getting stricter and stricter and the agent must try harder to "fool" it. +discriminator keeps getting stricter and stricter and the agent must try harder to "fool" it. -This approach, when compared to [Behavioral Cloning](Training-BehavioralCloning.md), requires +This approach, when compared to [Behavioral Cloning](Training-Behavioral-Cloning.md), requires far fewer demonstrations to be provided. After all, we are still learning a policy that happens -to be similar to the demonstration, not directly copying the behavior of the demonstrations. It -is also especially effective when combined with an Extrinsic signal, but can also be used -independently to purely learn from demonstration. +to be similar to the demonstrations, not directly copying the behavior of the demonstrations. It +is especially effective when combined with an Extrinsic signal. However, the GAIL reward signal can +also be used independently to purely learn from demonstrations. -Using GAIL requires recorded demonstrations from your Unity environment. See the +Using GAIL requires recorded demonstrations from your Unity environment. See the [imitation learning guide](Training-Imitation-Learning.md) to learn more about recording demonstrations. -#### Strength +#### Strength `strength` is the factor by which to multiply the raw reward. Note that when using GAIL -with an Extrinsic Signal, this value should be set lower if your demonstrations are -suboptimal (e.g. from a human), so that a trained agent will focus on receiving extrinsic -rewards instead of exactly copying the demonstrations. Keep the strength below about 0.1 in those cases. +with an Extrinsic Signal, this value should be set lower if your demonstrations are +suboptimal (e.g. from a human), so that a trained agent will focus on receiving extrinsic +rewards instead of exactly copying the demonstrations. Keep the strength below about 0.1 in those cases. Typical Range: `0.01` - `1.0` #### Gamma -`gamma` corresponds to the discount factor for future rewards. +`gamma` corresponds to the discount factor for future rewards. Typical Range: `0.8` - `0.9` @@ -158,54 +173,64 @@ Typical Range: `0.8` - `0.9` `demo_path` is the path to your `.demo` file or directory of `.demo` files. See the [imitation learning guide](Training-Imitation-Learning.md).
-#### Encoding Size +#### (Optional) Encoding Size -`encoding_size` corresponds to the size of the hidden layer used by the discriminator. +`encoding_size` corresponds to the size of the hidden layer used by the discriminator. This value should be small enough to encourage the discriminator to compress the original -observation, but also not too small to prevent it from learning to differentiate between +observation, but also not too small to prevent it from learning to differentiate between demonstrated and actual behavior. Dramatically increasing this size will also negatively affect -training times. +training times. Default Value: `64` Typical Range: `64` - `256` -#### Learning Rate +#### (Optional) Learning Rate -`learning_rate` is the learning rate used to update the discriminator. +`learning_rate` is the learning rate used to update the discriminator. This should typically be decreased if training is unstable, and the GAIL loss is unstable. Default Value: `3e-4` -Typical Range: `1e-5` - `1e-3` +Typical Range: `1e-5` - `1e-3` -#### Use Actions +#### (Optional) Use Actions -`use_actions` determines whether the discriminator should discriminate based on both +`use_actions` determines whether the discriminator should discriminate based on both observations and actions, or just observations. Set to `True` if you want the agent to mimic the actions from the demonstrations, and `False` if you'd rather have the agent -visit the same states as in the demonstrations but with possibly different actions. +visit the same states as in the demonstrations but with possibly different actions. Setting to `False` is more likely to be stable, especially with imperfect demonstrations, -but may learn slower. +but may learn slower. + +Default Value: `false` + +#### (Optional) Variational Discriminator Bottleneck + +`use_vail` enables a [variational bottleneck](https://arxiv.org/abs/1810.00821) within the +GAIL discriminator. This forces the discriminator to learn a more general representation +and reduces its tendency to be "too good" at discriminating, making learning more stable. +However, it does increase training time. Enable this if you notice your imitation learning is +unstable, or unable to learn the task at hand. Default Value: `false` #### (Optional) Samples Per Update -`samples_per_update` is the maximum number of samples to use during each discriminator update. You may -want to lower this if your buffer size is very large to avoid overfitting the discriminator on current data. -If set to 0, we will use the minimum of buffer size and the number of demonstration samples. +`samples_per_update` is the maximum number of samples to use during each discriminator update. You may +want to lower this if your buffer size is very large to avoid overfitting the discriminator on current data. +If set to 0, we will use the minimum of buffer size and the number of demonstration samples. Default Value: `0` Typical Range: Approximately equal to [`buffer_size`](Training-PPO.md) -#### (Optional) Variational Discriminator Bottleneck +#### (Optional) Num Epochs -`use_vail` enables a [variational bottleneck](https://arxiv.org/abs/1810.00821) within the -GAIL discriminator. This forces the discriminator to learn a more general representation -and reduces its tendency to be "too good" at discriminating, making learning more stable. -However, it does increase training time. Enable this if you notice your imitation learning is -unstable, or unable to learn the task at hand. 
+`num_epoch` The number of passes to make through the experience buffer when performing gradient +descent optimization for the discriminator. To avoid overfitting, this typically should be set to +the same as or less than used for PPO. -Default Value: `false` +Default Value: `3` + +Typical Range: `1` - `10` \ No newline at end of file diff --git a/docs/Training-BehavioralCloning.md b/docs/Training-Behavioral-Cloning.md similarity index 100% rename from docs/Training-BehavioralCloning.md rename to docs/Training-Behavioral-Cloning.md diff --git a/docs/Training-Generalization-Learning.md b/docs/Training-Generalization-Learning.md deleted file mode 100644 index 79dea8da9e..0000000000 --- a/docs/Training-Generalization-Learning.md +++ /dev/null @@ -1,157 +0,0 @@ -# Training Generalized Reinforcement Learning Agents - -Reinforcement learning has a rather unique setup as opposed to supervised and -unsupervised learning. Agents here are trained and tested on the same exact -environment, which is analogous to a model being trained and tested on an -identical dataset in supervised learning! This setting results in overfitting; -the inability of the agent to generalize to slight tweaks or variations in the -environment. This is problematic in instances when environments are randomly -instantiated with varying properties. To make agents robust, one approach is to -train an agent over multiple variations of the environment. The agent is -trained in this approach with the intent that it learns to adapt its performance -to future unseen variations of the environment. - -Ball scale of 0.5 | Ball scale of 4 -:-------------------------:|:-------------------------: -![](images/3dball_small.png) | ![](images/3dball_big.png) - -_Variations of the 3D Ball environment._ - -To vary environments, we first decide what parameters to vary in an -environment. We call these parameters `Reset Parameters`. In the 3D ball -environment example displayed in the figure above, the reset parameters are -`gravity`, `ball_mass` and `ball_scale`. - - -## How-to - -For generalization training, we need to provide a way to modify the environment -by supplying a set of reset parameters, and vary them over time. This provision -can be done either deterministically or randomly. - -This is done by assigning each reset parameter a sampler, which samples a reset -parameter value (such as a uniform sampler). If a sampler isn't provided for a -reset parameter, the parameter maintains the default value throughout the -training procedure, remaining unchanged. The samplers for all the reset parameters -are handled by a **Sampler Manager**, which also handles the generation of new -values for the reset parameters when needed. - -To setup the Sampler Manager, we setup a YAML file that specifies how we wish to -generate new samples. In this file, we specify the samplers and the -`resampling-interval` (number of simulation steps after which reset parameters are -resampled). Below is an example of a sampler file for the 3D ball environment. - -```yaml -resampling-interval: 5000 - -mass: - sampler-type: "uniform" - min_value: 0.5 - max_value: 10 - -gravity: - sampler-type: "multirange_uniform" - intervals: [[7, 10], [15, 20]] - -scale: - sampler-type: "uniform" - min_value: 0.75 - max_value: 3 - -``` - -* `resampling-interval` (int) - Specifies the number of steps for agent to -train under a particular environment configuration before resetting the -environment with a new sample of reset parameters. 
- -* `parameter_name` - Name of the reset parameter. This should match the name -specified in the academy of the intended environment for which the agent is -being trained. If a parameter specified in the file doesn't exist in the -environment, then this specification will be ignored. - - * `sampler-type` - Specify the sampler type to use for the reset parameter. - This is a string that should exist in the `Sampler Factory` (explained - below). - - * `sub-arguments` - Specify the characteristic parameters for the sampler. - In the example sampler file above, this would correspond to the `intervals` - key under the `multirange_uniform` sampler for the gravity reset parameter. - The key name should match the name of the corresponding argument in the sampler definition. (Look at defining a new sampler method) - - -The sampler manager allocates a sampler for a reset parameter by using the *Sampler Factory*, which maintains a dictionary mapping of string keys to sampler objects. The available samplers to be used for reset parameter resampling is as available in the Sampler Factory. - -#### Possible Sampler Types - -The currently implemented samplers that can be used with the `sampler-type` arguments are: - -* `uniform` - Uniform sampler - * Uniformly samples a single float value between defined endpoints. - The sub-arguments for this sampler to specify the interval - endpoints are as below. The sampling is done in the range of - [`min_value`, `max_value`). - - * **sub-arguments** - `min_value`, `max_value` - -* `gaussian` - Gaussian sampler - * Samples a single float value from the distribution characterized by - the mean and standard deviation. The sub-arguments to specify the - gaussian distribution to use are as below. - - * **sub-arguments** - `mean`, `st_dev` - -* `multirange_uniform` - Multirange Uniform sampler - * Uniformly samples a single float value between the specified intervals. - Samples by first performing a weight pick of an interval from the list - of intervals (weighted based on interval width) and samples uniformly - from the selected interval (half-closed interval, same as the uniform - sampler). This sampler can take an arbitrary number of intervals in a - list in the following format: - [[`interval_1_min`, `interval_1_max`], [`interval_2_min`, `interval_2_max`], ...] - - * **sub-arguments** - `intervals` - - -The implementation of the samplers can be found at `ml-agents-envs/mlagents/envs/sampler_class.py`. - -### Defining a new sampler method - -Custom sampling techniques must inherit from the *Sampler* base class (included in the `sampler_class` file) and preserve the interface. Once the class for the required method is specified, it must be registered in the Sampler Factory. - -This can be done by subscribing to the *register_sampler* method of the SamplerFactory. The command is as follows: - -`SamplerFactory.register_sampler(*custom_sampler_string_key*, *custom_sampler_object*)` - -Once the Sampler Factory reflects the new register, the custom sampler can be used for resampling reset parameter. For demonstration, lets say our sampler was implemented as below, and we register the `CustomSampler` class with the string `custom-sampler` in the Sampler Factory. - -```python -class CustomSampler(Sampler): - - def __init__(self, argA, argB, argC): - self.possible_vals = [argA, argB, argC] - - def sample_all(self): - return np.random.choice(self.possible_vals) -``` - -Now we need to specify this sampler in the sampler file. 
Lets say we wish to use this sampler for the reset parameter *mass*; the sampler file would specify the same for mass as the following (any order of the subarguments is valid). -```yaml -mass: - sampler-type: "custom-sampler" - argB: 1 - argA: 2 - argC: 3 -``` - -With the sampler file setup, we can proceed to train our agent as explained in the next section. - -### Training with Generalization Learning - -We first begin with setting up the sampler file. After the sampler file is defined and configured, we proceed by launching `mlagents-learn` and specify our configured sampler file with the `--sampler` flag. To demonstrate, if we wanted to train a 3D ball agent with generalization using the `config/3dball_generalize.yaml` sampling setup, we can run - -```sh -mlagents-learn config/trainer_config.yaml --sampler=config/3dball_generalize.yaml --run-id=3D-Ball-generalization --train -``` - -We can observe progress and metrics via Tensorboard. diff --git a/docs/Training-Generalized-Reinforcement-Learning-Agents.md b/docs/Training-Generalized-Reinforcement-Learning-Agents.md new file mode 100644 index 0000000000..29210781ce --- /dev/null +++ b/docs/Training-Generalized-Reinforcement-Learning-Agents.md @@ -0,0 +1,171 @@ +# Training Generalized Reinforcement Learning Agents + +One of the challenges of training and testing agents on the same +environment is that the agents tend to overfit. The result is that the +agents are unable to generalize to any tweaks or variations in the environment. +This is analogous to a model being trained and tested on an identical dataset +in supervised learning. This becomes problematic in cases where environments +are randomly instantiated with varying objects or properties. + +To make agents robust and generalizable to different environments, the agent +should be trained over multiple variations of the environment. Using this approach +for training, the agent will be better suited to adapt (with higher performance) +to future unseen variations of the environment. + +_Example of variations of the 3D Ball environment._ + +Ball scale of 0.5 | Ball scale of 4 +:-------------------------:|:-------------------------: +![](images/3dball_small.png) | ![](images/3dball_big.png) + +## Introducing Generalization Using Reset Parameters + +To enable variations in the environments, we implemented `Reset Parameters`. We +also included different sampling methods and the ability to create new kinds of +sampling methods for each `Reset Parameter`. In the 3D ball environment example displayed +in the figure above, the reset parameters are `gravity`, `ball_mass` and `ball_scale`. + + +## How to Enable Generalization Using Reset Parameters + +We first need to provide a way to modify the environment by supplying a set of `Reset Parameters` +and vary them over time. This provision can be done either deterministically or randomly. + +This is done by assigning each `Reset Parameter` a `sampler-type` (such as a uniform sampler), +which determines how to sample a `Reset +Parameter`. If a `sampler-type` isn't provided for a +`Reset Parameter`, the parameter maintains the default value throughout the +training procedure, remaining unchanged. The samplers for all the `Reset Parameters` +are handled by a **Sampler Manager**, which also handles the generation of new +values for the reset parameters when needed. + +To set up the Sampler Manager, we create a YAML file that specifies how we wish to +generate new samples for each `Reset Parameter`.
In this file, we specify the samplers and the +`resampling-interval` (the number of simulation steps after which reset parameters are +resampled). Below is an example of a sampler file for the 3D ball environment. + +```yaml +resampling-interval: 5000 + +mass: + sampler-type: "uniform" + min_value: 0.5 + max_value: 10 + +gravity: + sampler-type: "multirange_uniform" + intervals: [[7, 10], [15, 20]] + +scale: + sampler-type: "uniform" + min_value: 0.75 + max_value: 3 + +``` + +Below is the explanation of the fields in the above example. + +* `resampling-interval` - Specifies the number of steps for the agent to +train under a particular environment configuration before resetting the +environment with a new sample of `Reset Parameters`. + +* `Reset Parameter` - Name of the `Reset Parameter` like `mass`, `gravity` and `scale`. This should match the name +specified in the academy of the intended environment for which the agent is +being trained. If a parameter specified in the file doesn't exist in the +environment, then this parameter will be ignored. Within each `Reset Parameter`: + + * `sampler-type` - Specify the sampler type to use for the `Reset Parameter`. + This is a string that should exist in the `Sampler Factory` (explained + below). + + * `sampler-type-sub-arguments` - Specify the sub-arguments depending on the `sampler-type`. + In the example above, this would correspond to the `intervals` + under the `sampler-type` `"multirange_uniform"` for the `Reset Parameter` called `gravity`. + The key name should match the name of the corresponding argument in the sampler definition. + (See below) + +The Sampler Manager allocates a sampler type for each `Reset Parameter` by using the *Sampler Factory*, +which maintains a dictionary mapping of string keys to sampler objects. The sampler types +available for each `Reset Parameter` are listed in the Sampler Factory. + +### Included Sampler Types + +Below is a list of the `sampler-type` values included as part of the toolkit. + +* `uniform` - Uniform sampler + * Uniformly samples a single float value between defined endpoints. + The sub-arguments for this sampler to specify the interval + endpoints are as below. The sampling is done in the range of + [`min_value`, `max_value`). + + * **sub-arguments** - `min_value`, `max_value` + +* `gaussian` - Gaussian sampler + * Samples a single float value from the distribution characterized by + the mean and standard deviation. The sub-arguments to specify the + gaussian distribution to use are as below. + + * **sub-arguments** - `mean`, `st_dev` + +* `multirange_uniform` - Multirange uniform sampler + * Uniformly samples a single float value between the specified intervals. + Samples by first performing a weight pick of an interval from the list + of intervals (weighted based on interval width) and samples uniformly + from the selected interval (half-closed interval, same as the uniform + sampler). This sampler can take an arbitrary number of intervals in a + list in the following format: + [[`interval_1_min`, `interval_1_max`], [`interval_2_min`, `interval_2_max`], ...] + + * **sub-arguments** - `intervals` + +The implementation of the samplers can be found at `ml-agents-envs/mlagents/envs/sampler_class.py`. + +### Defining a New Sampler Type + +If you want to define your own sampler type, you must first inherit the *Sampler* +base class (included in the `sampler_class` file) and preserve the interface. +Once the class for the required method is specified, it must be registered in the Sampler Factory.
+ +This can be done by subscribing to the *register_sampler* method of the SamplerFactory. The command +is as follows: + +`SamplerFactory.register_sampler(*custom_sampler_string_key*, *custom_sampler_object*)` + +Once the Sampler Factory reflects the new registration, the new sampler type can be used to sample any +`Reset Parameter`. For example, let's say a new sampler type was implemented as below and we register +the `CustomSampler` class with the string `custom-sampler` in the Sampler Factory. + +```python +class CustomSampler(Sampler): + + def __init__(self, argA, argB, argC): + self.possible_vals = [argA, argB, argC] + + def sample_all(self): + return np.random.choice(self.possible_vals) +``` + +Now we need to specify the new sampler type in the sampler YAML file. For example, we use this new +sampler type for the `Reset Parameter` *mass*. + +```yaml +mass: + sampler-type: "custom-sampler" + argB: 1 + argA: 2 + argC: 3 +``` + +### Training with Generalization Using Reset Parameters + +After the sampler YAML file is defined, we proceed by launching `mlagents-learn` and specify +our configured sampler file with the `--sampler` flag. For example, if we wanted to train the +3D ball agent with generalization using `Reset Parameters` with the `config/3dball_generalize.yaml` +sampling setup, we would run + +```sh +mlagents-learn config/trainer_config.yaml --sampler=config/3dball_generalize.yaml +--run-id=3D-Ball-generalization --train +``` + +We can observe progress and metrics via Tensorboard. diff --git a/docs/Training-Imitation-Learning.md b/docs/Training-Imitation-Learning.md index 2b834d5a2f..679568a339 100644 --- a/docs/Training-Imitation-Learning.md +++ b/docs/Training-Imitation-Learning.md @@ -1,4 +1,4 @@ -# Imitation Learning +# Training with Imitation Learning It is often more intuitive to simply demonstrate the behavior we want an agent to perform, rather than attempting to have it learn via trial-and-error methods. @@ -12,29 +12,32 @@ from a demonstration to learn a policy. [Video Link](https://youtu.be/kpb8ZkMBFY Imitation learning can also be used to help reinforcement learning. Especially in environments with sparse (i.e., infrequent or rare) rewards, the agent may never see -the reward and thus not learn from it. Curiosity helps the agent explore, but in some cases -it is easier to just show the agent how to achieve the reward. In these cases, -imitation learning can dramatically reduce the time it takes to solve the environment. +the reward and thus not learn from it. Curiosity (which is available in the toolkit) +helps the agent explore, but in some cases +it is easier to show the agent how to achieve the reward. In these cases, +imitation learning combined with reinforcement learning can dramatically +reduce the time the agent takes to solve the environment. For instance, on the [Pyramids environment](Learning-Environment-Examples.md#pyramids), -just 6 episodes of demonstrations can reduce training steps by more than 4 times. +using 6 episodes of demonstrations can reduce training steps by more than 4 times. +See PreTraining + GAIL + Curiosity + RL below.

[Image: Using Demonstrations with Reinforcement Learning]

-ML-Agents provides several ways to learn from demonstrations. +The ML-Agents toolkit provides several ways to learn from demonstrations. * To train using GAIL (Generative Adversarial Imitaiton Learning) you can add the - [GAIL reward signal](Training-RewardSignals.md#the-gail-reward-signal). GAIL can be + [GAIL reward signal](Reward-Signals.md#the-gail-reward-signal). GAIL can be used with or without environment rewards, and works well when there are a limited number of demonstrations. * To help bootstrap reinforcement learning, you can enable [pretraining](Training-PPO.md#optional-pretraining-using-demonstrations) on the PPO trainer, in addition to using a small GAIL reward signal. * To train an agent to exactly mimic demonstrations, you can use the - [Behavioral Cloning](Training-BehavioralCloning.md) trainer. Behavioral Cloning can be + [Behavioral Cloning](Training-Behavioral-Cloning.md) trainer. Behavioral Cloning can be used offline and online (in-editor), and learns very quickly. However, it usually is ineffective on more complex environments without a large number of demonstrations. @@ -58,7 +61,7 @@ It is possible to record demonstrations of agent behavior from the Unity Editor, and save them as assets. These demonstrations contain information on the observations, actions, and rewards for a given agent during the recording session. They can be managed from the Editor, as well as used for training with Offline -Behavioral Cloning (see below). +Behavioral Cloning and GAIL. In order to record demonstrations from an agent, add the `Demonstration Recorder` component to a GameObject in the scene which contains an `Agent` component. diff --git a/docs/Training-ML-Agents.md b/docs/Training-ML-Agents.md index a36bfcca71..5a9a749119 100644 --- a/docs/Training-ML-Agents.md +++ b/docs/Training-ML-Agents.md @@ -91,65 +91,65 @@ While this example used the default training hyperparameters, you can edit the [training_config.yaml file](#training-config-file) with a text editor to set different values. -### Command line training options +### Command Line Training Options In addition to passing the path of the Unity executable containing your training environment, you can set the following command line options when invoking `mlagents-learn`: -* `--env=` - Specify an executable environment to train. -* `--curriculum=` – Specify a curriculum JSON file for defining the +* `--env=`: Specify an executable environment to train. +* `--curriculum=`: Specify a curriculum JSON file for defining the lessons for curriculum training. See [Curriculum Training](Training-Curriculum-Learning.md) for more information. -* `--sampler=` - Specify a sampler YAML file for defining the +* `--sampler=`: Specify a sampler YAML file for defining the sampler for generalization training. See [Generalization - Training](Training-Generalization-Learning.md) for more information. -* `--keep-checkpoints=` – Specify the maximum number of model checkpoints to + Training](Training-Generalized-Reinforcement-Learning-Agents.md) for more information. +* `--keep-checkpoints=`: Specify the maximum number of model checkpoints to keep. Checkpoints are saved after the number of steps specified by the `save-freq` option. Once the maximum number of checkpoints has been reached, the oldest checkpoint is deleted when saving a new checkpoint. Defaults to 5. -* `--lesson=` – Specify which lesson to start with when performing curriculum +* `--lesson=`: Specify which lesson to start with when performing curriculum training. Defaults to 0. 
-* `--load` – If set, the training code loads an already trained model to +* `--load`: If set, the training code loads an already trained model to initialize the neural network before training. The learning code looks for the model in `models//` (which is also where it saves models at the end of training). When not set (the default), the neural network weights are randomly initialized and an existing model is not loaded. -* `--num-runs=` - Sets the number of concurrent training sessions to perform. +* `--num-runs=`: Sets the number of concurrent training sessions to perform. Default is set to 1. Set to higher values when benchmarking performance and multiple training sessions is desired. Training sessions are independent, and do not improve learning performance. -* `--run-id=` – Specifies an identifier for each training run. This +* `--run-id=`: Specifies an identifier for each training run. This identifier is used to name the subdirectories in which the trained model and summary statistics are saved as well as the saved model itself. The default id is "ppo". If you use TensorBoard to view the training statistics, always set a unique run-id for each training run. (The statistics for all runs with the same id are combined as if they were produced by a the same session.) -* `--save-freq=` Specifies how often (in steps) to save the model during +* `--save-freq=`: Specifies how often (in steps) to save the model during training. Defaults to 50000. -* `--seed=` – Specifies a number to use as a seed for the random number +* `--seed=`: Specifies a number to use as a seed for the random number generator used by the training code. -* `--slow` – Specify this option to run the Unity environment at normal, game +* `--slow`: Specify this option to run the Unity environment at normal, game speed. The `--slow` mode uses the **Time Scale** and **Target Frame Rate** specified in the Academy's **Inference Configuration**. By default, training runs using the speeds specified in your Academy's **Training Configuration**. See [Academy Properties](Learning-Environment-Design-Academy.md#academy-properties). -* `--train` – Specifies whether to train model or only run in inference mode. +* `--train`: Specifies whether to train model or only run in inference mode. When training, **always** use the `--train` option. -* `--num-envs=` - Specifies the number of concurrent Unity environment instances to collect +* `--num-envs=`: Specifies the number of concurrent Unity environment instances to collect experiences from when training. Defaults to 1. -* `--base-port` - Specifies the starting port. Each concurrent Unity environment instance will get assigned a port sequentially, starting from the `base-port`. Each instance will use the port `(base_port + worker_id)`, where the `worker_id` is sequential IDs given to each instance from 0 to `num_envs - 1`. Default is 5005. -* `--docker-target-name=
` – The Docker Volume on which to store curriculum, +* `--base-port`: Specifies the starting port. Each concurrent Unity environment instance will get assigned a port sequentially, starting from the `base-port`. Each instance will use the port `(base_port + worker_id)`, where the `worker_id` is sequential IDs given to each instance from 0 to `num_envs - 1`. Default is 5005. +* `--docker-target-name=
`: The Docker Volume on which to store curriculum, executable and model files. See [Using Docker](Using-Docker.md). -* `--no-graphics` - Specify this option to run the Unity executable in +* `--no-graphics`: Specify this option to run the Unity executable in `-batchmode` and doesn't initialize the graphics driver. Use this only if your training doesn't involve visual observations (reading from Pixels). See [here](https://docs.unity3d.com/Manual/CommandLineArguments.html) for more details. -* `--debug` - Specify this option to enable debug-level logging for some parts of the code. +* `--debug`: Specify this option to enable debug-level logging for some parts of the code. -### Training config file +### Training Config File The training config files `config/trainer_config.yaml`, `config/online_bc_config.yaml` and `config/offline_bc_config.yaml` specifies the @@ -180,7 +180,7 @@ environments are included in the provided config file. | num_epoch | The number of passes to make through the experience buffer when performing gradient descent optimization. | PPO | | num_layers | The number of hidden layers in the neural network. | PPO, BC | | pretraining | Use demonstrations to bootstrap the policy neural network. See [Pretraining Using Demonstrations](Training-PPO.md#optional-pretraining-using-demonstrations). | PPO | -| reward_signals | The reward signals used to train the policy. Enable Curiosity and GAIL here. See [Reward Signals](Training-RewardSignals.md) for configuration options. | PPO | +| reward_signals | The reward signals used to train the policy. Enable Curiosity and GAIL here. See [Reward Signals](Reward-Signals.md) for configuration options. | PPO | | sequence_length | Defines how long the sequences of experiences must be while training. Only used for training with a recurrent neural network. See [Using Recurrent Neural Networks](Feature-Memory.md). | PPO, BC | | summary_freq | How often, in steps, to save training statistics. This determines the number of data points shown by TensorBoard. | PPO, BC | | time_horizon | How many steps of experience to collect per-agent before adding it to the experience buffer. | PPO, (online)BC | @@ -196,8 +196,8 @@ are conducting, see: * [Training with PPO](Training-PPO.md) * [Using Recurrent Neural Networks](Feature-Memory.md) * [Training with Curriculum Learning](Training-Curriculum-Learning.md) -* [Training with Environment Parameter Sampling](Training-Generalization-Learning.md) * [Training with Imitation Learning](Training-Imitation-Learning.md) +* [Training Generalized Reinforcement Learning Agents](Training-Generalized-Reinforcement-Learning-Agents.md) You can also compare the [example environments](Learning-Environment-Examples.md) @@ -205,15 +205,17 @@ to the corresponding sections of the `config/trainer_config.yaml` file for each example to see how the hyperparameters and other configuration variables have been changed from the defaults. -### Output metrics -Trainer Metrics are logged to a CSV stored in the `summaries` directory. The metrics stored are: +### Debugging and Profiling +If you enable the `--debug` flag in the command line, the trainer metrics are logged to a CSV file +stored in the `summaries` directory. The metrics stored are: * brain name * time to update policy * time since start of training * time for last experience collection * number of experiences used for training - * mean return - + * mean return + This option is not available currently for Behavioral Cloning. 
-[Profiling](Profiling.md) information is also saved in the `summaries` directory. +Additionally, we have included basic [Profiling in Python](Profiling-Python.md) as part of the toolkit. +This information is also saved in the `summaries` directory. diff --git a/docs/Training-PPO.md b/docs/Training-PPO.md index d6cdddbd85..a2bd53844b 100644 --- a/docs/Training-PPO.md +++ b/docs/Training-PPO.md @@ -8,7 +8,7 @@ ML-Agents PPO algorithm is implemented in TensorFlow and runs in a separate Python process (communicating with the running Unity application over a socket). To train an agent, you will need to provide the agent one or more reward signals which -the agent should attempt to maximize. See [Reward Signals](Training-RewardSignals.md) +the agent should attempt to maximize. See [Reward Signals](Reward-Signals.md) for the available reward signals and the corresponding hyperparameters. See [Training ML-Agents](Training-ML-Agents.md) for instructions on running the @@ -25,7 +25,7 @@ Learning](Training-Curriculum-Learning.md). For information about imitation learning from demonstrations, see [Training with Imitation Learning](Training-Imitation-Learning.md). -## Best Practices when training with PPO +## Best Practices Training with PPO Successfully training a Reinforcement Learning model often involves tuning the training hyperparameters. This guide contains some best practices for tuning the @@ -42,10 +42,11 @@ rewarding the agent for various different behaviors. For instance, we could rewa the agent for exploring new states, rather than just when an explicit reward is given. Furthermore, we could mix reward signals to help the learning process. -`reward_signals` provides a section to define [reward signals.](Training-RewardSignals.md) -ML-Agents provides two reward signals by default, the Extrinsic (environment) reward, and the -Curiosity reward, which can be used to encourage exploration in sparse extrinsic reward -environments. +Using `reward_signals` allows you to define [reward signals.](Reward-Signals.md) +The ML-Agents toolkit provides three reward signals by default, the Extrinsic (environment) +reward signal, the Curiosity reward signal, which can be used to encourage exploration in +sparse extrinsic reward environments, and the GAIL reward signal. Please see [Reward Signals](Reward-Signals.md) +for additional details. ### Lambda @@ -171,10 +172,10 @@ Typical Range: `32` - `512` `vis_encode_type` corresponds to the encoder type for encoding visual observations. Valid options include: * `simple` (default): a simple encoder which consists of two convolutional layers -* `nature_cnn`: CNN implementation proposed by Mnih et al.(https://www.nature.com/articles/nature14236), +* `nature_cnn`: CNN implementation proposed by Mnih et al.(https://www.nature.com/articles/nature14236), consisting of three convolutional layers * `resnet`: IMPALA Resnet implementation (https://arxiv.org/abs/1802.01561), -consisting of three stacked layers, each with two risidual blocks, making a +consisting of three stacked layers, each with two risidual blocks, making a much larger network than the other two. Options: `simple`, `nature_cnn`, `resnet` @@ -206,9 +207,9 @@ Typical Range: `64` - `512` ## (Optional) Pretraining Using Demonstrations In some cases, you might want to bootstrap the agent's policy using behavior recorded -from a player. This can help guide the agent towards the reward. Pretraining adds -training operations that mimic a demonstration rather than attempting to maximize reward. 
-It is essentially equivalent to running [behavioral cloning](./Training-BehavioralCloning.md) +from a player. This can help guide the agent towards the reward. Pretraining adds +training operations that mimic a demonstration rather than attempting to maximize reward. +It is essentially equivalent to running [behavioral cloning](Training-Behavioral-Cloning.md) in-line with PPO. To use pretraining, add a `pretraining` section to the trainer_config. For instance: @@ -226,22 +227,22 @@ Below are the avaliable hyperparameters for pretraining. `strength` corresponds to the learning rate of the imitation relative to the learning rate of PPO, and roughly corresponds to how strongly we allow the behavioral cloning -to influence the policy. +to influence the policy. Typical Range: `0.1` - `0.5` ### Demo Path -`demo_path` is the path to your `.demo` file or directory of `.demo` files. +`demo_path` is the path to your `.demo` file or directory of `.demo` files. See the [imitation learning guide](Training-Imitation-Learning.md) for more on `.demo` files. ### Steps -During pretraining, it is often desirable to stop using demonstrations after the agent has +During pretraining, it is often desirable to stop using demonstrations after the agent has "seen" rewards, and allow it to optimize past the available demonstrations and/or generalize outside of the provided demonstrations. `steps` corresponds to the training steps over which -pretraining is active. The learning rate of the pretrainer will anneal over the steps. Set -the steps to 0 for constant imitation over the entire training run. +pretraining is active. The learning rate of the pretrainer will anneal over the steps. Set +the steps to 0 for constant imitation over the entire training run. ### (Optional) Batch Size @@ -263,7 +264,7 @@ Typical Range: `3` - `10` `samples_per_update` is the maximum number of samples to use during each imitation update. You may want to lower this if your demonstration -dataset is very large to avoid overfitting the policy on demonstrations. Set to 0 +dataset is very large to avoid overfitting the policy on demonstrations. Set to 0 to train over all of the demonstrations at each update step. Default Value: `0` (all) diff --git a/docs/localized/KR/README.md b/docs/localized/KR/README.md index 03d6b1a92e..5a3709847f 100644 --- a/docs/localized/KR/README.md +++ b/docs/localized/KR/README.md @@ -1,18 +1,18 @@ - + -# Unity ML-Agents Toolkit (Beta) + + +# Unity ML-Agents Toolkit (Beta) v0.9 [![docs badge](https://img.shields.io/badge/docs-reference-blue.svg)](docs/Readme.md) [![license badge](https://img.shields.io/badge/license-Apache--2.0-green.svg)](LICENSE) **Unity Machine Learning Agents Toolkit** (ML-Agents) 은 지능형 에이전트를 학습시키기 위한 환경을 제공하여 게임 또는 시뮬레이션을 만들 수 있게 해주는 오픈소스 유니티 플러그인 입니다. 사용하기 쉬운 파이썬 API를 통해 강화학습, 모방학습, 신경진화 또는 다른 기계학습 방법론을 사용하여 에이전트들을 학습시킬 수 있습니다. -우리는 또한 게임 개발자와 취미를 가지신 분들이 2D, 3D 그리고 VR/AR 게임들의 지능형 에이전트를 -쉽게 훈련시킬수 있도록하는 최신 알고리즘 구현(TensorFlow에 기반하여)을 제공합니다. 학습된 에이전트들은 -NPC의 행동 제어(다중-에이전트 또는 적대 관계와 같은 다양한 설정 속에서), 게임 빌드 테스트 자동화 -그리고 출시 전 게임 설계 검증 등을 포함한 다양한 목적을 위해 사용될 수 있습니다. -ML-Agents toolkit은 유니티의 풍부한 환경에서 AI의 발전을 위한 중심 플랫폼을 제공함으로써 -더욱 광범위한 연구와 게임 개발자 커뮤니티가 만들어지도록 하기 때문에 게임 개발자들과 AI 연구원들 모두에게 상호적으로 이익이 됩니다. +우리는 또한 게임 개발자와 개발에 대해 취미를 가지신 분들이 2D, 3D 그리고 VR/AR 게임들에 사용할 지능형 에이전트를 +쉽게 훈련시킬 수 있도록하는 최신 알고리즘 구현체를 ([텐서플로우]([https://www.tensorflow.org/](https://www.tensorflow.org/)) 기반)을 제공합니다. 학습된 에이전트들은 +NPC의 행동 제어(다중 에이전트, 적대적 에이전트 등), 게임 빌드 테스트 자동화, 그리고 출시 전 게임 설계 검증 등을 포함한 다양한 목적을 위해 사용될 수 있습니다. 
+ML-Agents toolkit은 유니티의 풍부한 환경에서 인공지능 에이전트 개발을 위한 중심 플랫폼을 제공함으로써 더욱 광범위한 연구와 게임 개발이 진행되도록 하며 이에 따라 게임 개발자들과 AI 연구원들 모두에게 도움을 줍니다. ## 특징 @@ -20,54 +20,45 @@ ML-Agents toolkit은 유니티의 풍부한 환경에서 AI의 발전을 위한 * 10가지 이상의 유니티 환경 샘플 * 여러 환경 구성 및 학습 시나리오 제공 * 심층 강화 학습을 사용하여 기억력이 향상된 에이전트 학습 -* 쉽게 정의 가능한 학습 시나리오 교육과정 +* 쉽게 정의 가능한 커리큘럼 학습 시나리오 * 지도 학습을 위한 에이전트 행동 브로드캐스팅 -* 기본 제공되는 모방 학습 지원 -* 온-디맨드(수요 기반) 의사 결정을 통한 유연한 에이전트 제어 +* 모방 학습 지원 기본 제공 +* 온 디맨드 의사 결정을 통한 유연한 에이전트 제어 * 환경 속 네트워크 출력의 시각화 -* 독커(Docker)를 통한 간단한 설정 -* gym과 같은 학습 환경 +* [도커(Docker)]([https://www.docker.com/](https://www.docker.com/))를 통한 설정 단순화 +* [gym]([https://gym.openai.com/](https://gym.openai.com/))과 같은 학습 환경 * 유니티 인터페이스 엔진 활용 -* 유니티 환경 인스턴스를 동시에 사용하는 교육 +* 유니티 환경 인스턴스를 동시에 사용하는 학습 ## 문서화 * 설치와 사용법 외에 더 많은 정보는 [설명서 홈](docs/Readme.md)을 참고해주십시오. -* 만약 유니티 AI 플랫폼에 관한 토론에 관심있는 연구원이라면 유니티와 ML-Agents Toolkit에 관한 출판 전 -[참조 논문](https://arxiv.org/abs/1809.02627)을 참고해 주십시오. 또한 이 논문을 인용하는 것에 관한 사항은 아래를 참조하십시오. +* 만약 유니티 AI 플랫폼에 관한 토론에 관심있는 연구원이라면 유니티와 ML-Agents Toolkit에 관한 [논문](https://arxiv.org/abs/1809.02627)을 참고해 주십시오. 또한 이 논문을 인용하는 것에 관한 사항은 아래의 인용 부분을 참조하십시오. * 만약 이전 버전의 ML-Agents toolkit을 사용하고 있다면 [이전 버전 마이그레이션 가이드](docs/Migrating.md)를 확인해주십시오. ## 추가 리소스 블로그에 ML-Agents와 관련된 시리즈의 게시물을 게시하였습니다(영어). -* 강화 학습 개념 개요 - ([multi-armed bandit](https://blogs.unity3d.com/kr/2017/06/26/unity-ai-themed-blog-entries/) - and - [Q-learning](https://blogs.unity3d.com/kr/2017/08/22/unity-ai-reinforcement-learning-with-q-learning/)) -* [Using Machine Learning Agents in a real game: a beginner’s guide](https://blogs.unity3d.com/kr/2017/12/11/using-machine-learning-agents-in-a-real-game-a-beginners-guide/) -* [포스트](https://blogs.unity3d.com/kr/2018/02/28/introducing-the-winners-of-the-first-ml-agents-challenge/) - [first ML-Agents Challenge](https://connect.unity.com/challenges/ml-agents-1)의 승자 발표 -* [포스트](https://blogs.unity3d.com/kr/2018/01/23/designing-safer-cities-through-simulations/) - 안전한 도시 설계를 위한 유니티 사용 방법 개요. 
+* 강화 학습 개념 개요 ([multi-armed bandit](https://blogs.unity3d.com/kr/2017/06/26/unity-ai-themed-blog-entries/) 과 [Q-learning](https://blogs.unity3d.com/kr/2017/08/22/unity-ai-reinforcement-learning-with-q-learning/)) +* [실제 게임에서 Machine Learning 에이전트 사용하기: 초보자 가이드](https://blogs.unity3d.com/kr/2017/12/11/using-machine-learning-agents-in-a-real-game-a-beginners-guide/) +* [첫번째 ML-Agents 챌린지](https://connect.unity.com/challenges/ml-agents-1)의 수상자 관련 [포스트](https://blogs.unity3d.com/kr/2018/02/28/introducing-the-winners-of-the-first-ml-agents-challenge/) +* 안전한 도시 설계를 위한 유니티 사용 방법 개요 관련 [포스트](https://blogs.unity3d.com/kr/2018/01/23/designing-safer-cities-through-simulations/) -저희의 문서 뿐만 아니라 관련된 기사들이 있습니다: +유니티에서 제공하는 문서 뿐만 아니라 관련된 기사들이 있습니다: -* [Unity AI - Unity 3D Artificial Intelligence](https://www.youtube.com/watch?v=bqsfkGbBU6k) -* [A Game Developer Learns Machine Learning](https://mikecann.co.uk/machine-learning/a-game-developer-learns-machine-learning-intent/) -* [Explore Unity Technologies ML-Agents Exclusively on Intel Architecture](https://software.intel.com/en-us/articles/explore-unity-technologies-ml-agents-exclusively-on-intel-architecture) +* [유니티 AI - 유니티의 3D 인공지능](https://www.youtube.com/watch?v=bqsfkGbBU6k) +* [머신러닝을 배우는 게임 개발자](https://mikecann.co.uk/machine-learning/a-game-developer-learns-machine-learning-intent/) +* [인텔 아키텍쳐 전용 Unity Technologies ML-Agents 둘러보기](https://software.intel.com/en-us/articles/explore-unity-technologies-ml-agents-exclusively-on-intel-architecture) ## 커뮤니티 그리고 피드백 -ML-Agents toolkit은 오픈소스 프로젝트 이며 컨트리뷰션을 환영하고 격려합니다. -만약 컨트리뷰트를 원하시면 저희의 -[컨트리뷰션 가이드라인](CONTRIBUTING.md)과 -[행동 규칙](CODE_OF_CONDUCT.md)을 검토해주십시오. +ML-Agents toolkit은 오픈소스 프로젝트이며 컨트리뷰션을 환영합니다. 만약 컨트리뷰션을 원하시는 경우 +[컨트리뷰션 가이드라인](CONTRIBUTING.md)과 [행동 규칙](CODE_OF_CONDUCT.md)을 검토해주십시오. -만약 ML-Agents toolkit을 사용하며 문제가 생긴다면, -가능한 많은 세부 사항을 포함하여 [이슈 제출](https://github.com/Unity-Technologies/ml-agents/issues)을 해주십시오. +만약 ML-Agents toolkit을 사용하며 문제가 생긴다면, 가능한 많은 세부 사항을 포함하여 [이슈 제출](https://github.com/Unity-Technologies/ml-agents/issues)을 해주십시오. -여러분의 의견은 저희에게 매우 중요합니다. Unity ML-Agents Toolkit에 관하여 단지 듣기만 해도 저희는 계속해서 +여러분의 의견은 저희에게 매우 중요합니다. Unity ML-Agents Toolkit에 관련된 여러분의 의견을 통해서 저희는 계속해서 발전하고 성장할 수 있습니다. 단 몇 분만 사용하여 [저희에게 알려주세요](https://github.com/Unity-Technologies/ml-agents/issues/1454). @@ -83,3 +74,13 @@ ML-Agents toolkit은 오픈소스 프로젝트 이며 컨트리뷰션을 환영 만약 Unity 또는 the ML-Agents Toolkit을 사용하여 연구를 수행할 경우 다음 논문을 참고 자료로 인용하여 주시길 바랍니다: Juliani, A., Berges, V., Vckay, E., Gao, Y., Henry, H., Mattar, M., Lange, D. (2018). Unity: A General Platform for Intelligent Agents. *arXiv preprint arXiv:1809.02627.* https://github.com/Unity-Technologies/ml-agents. + + + +## 한글 번역 + +유니티 ML-Agents 관련 문서의 한글 번역은 [장현준(Hyeonjun Jang)][https://github.com/JangHyeonJun], [민규식 (Kyushik Min)]([https://github.com/Kyushik](https://github.com/Kyushik))에 의해 진행되었습니다. 내용상 오류나 오탈자가 있는 경우 각 문서의 번역을 진행한 사람의 이메일을 통해 연락주시면 감사드리겠습니다. + +장현준: totok682@naver.com + +민규식: kyushikmin@gmail.com \ No newline at end of file diff --git a/docs/localized/KR/docs/Installation-Windows.md b/docs/localized/KR/docs/Installation-Windows.md new file mode 100644 index 0000000000..fde671f589 --- /dev/null +++ b/docs/localized/KR/docs/Installation-Windows.md @@ -0,0 +1,304 @@ +# Windows ڸ ML-Agents Toolkit ġ + +ML-Agents toolkit Windows 10 մϴ. ٸ Windows ε ML-Agents toolkit + ʾҽϴ. , ML-Agents toolkit Windows VM(Bootcamp Ǵ ó +ȯ ) ʾҽϴ . + +ML-Agents toolkit ϱ , Ʒ Ȱ ó Python 䱸Ǵ Python Ű ġؾ մϴ. + ̵ GPU н(ڸ ) ٷϴ. +, ML-Agents toolkit GPU н ʿ Ǵ Ư ׿ ʿ ֽϴ. 
+ +## ܰ 1: Anaconda Python ġ + +Windows Anaconda [ٿε](https://www.anaconda.com/download/#windows)ϰ ġϽʽÿ. +Anaconda ν, ٸ Python и ȯ濡 ֽϴ. +Python 2 ̻ ʱ Python 3.5 Ǵ 3.6 ʿմϴ. ̵忡 츮 +Python 3.6 Anaconda 5.1 Դϴ. +([64-bit](https://repo.continuum.io/archive/Anaconda3-5.1.0-Windows-x86_64.exe) +Ǵ [32-bit](https://repo.continuum.io/archive/Anaconda3-5.1.0-Windows-x86.exe) +ũ). + +

+ *(image: Anaconda Install)*

+ During the installation, take note of the _advanced installation options_ and select the recommended options shown in the screenshot below.

+ *(image: Anaconda Install)*

+ +ġ Ŀ ݵ __Anaconda Navigator__ Ϸؾ մϴ. +Windows Ž â, _anaconda navigator_ ŸϿ Anaconda Navigator ֽϴ. + +ȯ Ǿ ʴٸ `conda` ɾ Ÿ +"conda is not recognized as internal or external command" Դϴ. +̸ ذϱ Ȯ ȯ ʿմϴ. + +Ž â `ȯ ` Ÿ Ͽ ( Ű ų Ʒ ư ֽϴ). + __ý ȯ __ ɼ ҷɴϴ. + +

+ *(image: edit env variables)*

+ + ɼǿ __ȯ __ ư Ŭϰ. Ʒ __ý __ "Path" Ŭϰ __ __ ŬϿ path ߰Ͻʽÿ. + +```console +%UserProfile%\Anaconda3\Scripts +%UserProfile%\Anaconda3\Scripts\conda.exe +%UserProfile%\Anaconda3 +%UserProfile%\Anaconda3\python.exe +``` + +## ܰ 2: ο Conda ȯ Ȱȭ + +ML-Agents toolkit Բ ο [Conda ȯ](https://conda.io/docs/) Դϴ. + ۾ ġ Ű ȯ濡 ѵȴٴ ǹմϴ. ̴ ٸ ȯ̳ ٸ ̽ ġ + ġ ʽϴ. ML-Agents ׻ Conda ȯ Ȱȭ Ѿ մϴ. + +ο Conda ȯ , ο Anaconda Ʈ(Ž â _Anaconda Prompt_ Ŭ) +ɾ Ÿ Ͻʽÿ: + +```sh +conda create -n ml-agents python=3.6 +``` + + Ű ġϱ ޼ `y` Ÿϰ ͸ ʽÿ _(ͳ Ǿִ ȮϽʽÿ)_. + 䱸Ǵ Ű ݵ ġؾ մϴ. ο Conda ȯ濡 Python 3.6 Ǹ ml-agents ȣ˴ϴ. + +

+ *(image: Anaconda Install)*

+ +ռ ȯ ̿ϱ ݵ Ȱȭ ؾմϴ. _(Ŀ ɾ ȯ ֽϴ)_. + Anaconda Ʈ ɾ Ÿ Ͻʽÿ: + +```sh +activate ml-agents +``` + +Ȱȭ Ŀ `(ml-agents)` ڰ տ Ÿ ֽϴ. + +, `tensorflow` ġմϴ. ̽ Ű ġϱ ϴ `pip` Ű ý۸ Ͽ ġ ֽϴ. +ֽ TensorFlow ۵ Ƿ, ġ 1.7.1 Ȯؾ մϴ. Anaconda Ʈ â + ɾ Ÿ Ͻʽÿ._(ͳ Ǿ ִ ȮϿ ֽʽÿ)_: + +```sh +pip install tensorflow==1.7.1 +``` + +## ܰ 3: ʼ ̽ Ű ġ + +ML-Agents toolkit ̽ Ű Դϴ. `pip` Ͽ ̽ Ӽ ġϽʽÿ. + +ML-Agents Toolkit Ұ ǻͿ Ǿ ʾҴٸ Ͻʽÿ. Git ([ٿε](https://git-scm.com/download/win))ϰ +Ų ɾ Anaconda Ʈâ ԷϿ ֽϴ. _( Ʈ â ִٸ `activate ml-agents` ŸϿ +ml-agents Conda ȯ Ȱȭ Ǿִ ȮϽʽÿ)_: + +```sh +git clone https://github.com/Unity-Technologies/ml-agents.git +``` + + Git ϰ ʴٸ [ũ](https://github.com/Unity-Technologies/ml-agents/archive/master.zip) ٿε ֽϴ. + +`UnitySDK` 丮 Ʈ ߰ Ƽ ּ ԵǾ ֽϴ. ϴµ Ǵ [ ȯ](Learning-Environment-Examples.md) ֽϴ. + +`ml-agents` 丮 Ƽ ȯ ԰ ϴ ȭн Ʈ̳ ̽ Ű ԵǾ ֽϴ. + +`ml-agents-envs` 丮 `ml-agents` Ű ӵǴ Ƽ ̽ ̽ API ԵǾ ֽϴ. + +`gym-unity` 丮 OpenAI Gym ̽ Ű ԵǾ ֽϴ. + +`mlagents-learn` Ʈ̳ ȯ 丮 ȿ ʿϹǷ, ٿε 丮 ġ Ͻʽÿ. +ͳ Ǿ Ȯϰ Anaconda Ʈ ɾ Ÿ Ͻʽÿt: + +```console +pip install mlagents +``` + +ML-Agents toolkit ʿ ̽ Ű ġ Ϸ Դϴ. + +Windows pip Ͽ Ư ̽ Ű ġ Ű ij д ֽϴ. + ذ ֽϴ: + +```console +pip install mlagents --no-cache-dir +``` + +`--no-cache-dir` pip ij Ȱȭ Ѵٴ Դϴ. + + +### ġ + + `ml-agents` Ǵ `ml-agents-envs` ϰ ʹٸ, PyPi ƴ ҷ Ű ġؾ մϴ. +̸ , `ml-agents` `ml-agents-envs` ġؾ մϴ. + + `C:\Downloads` ġ ֽϴ. ϰų ٿε +Anaconda Ʈ ml-agents 丮 ml-agents 丮 Ͻʽÿ: + +```console +cd C:\Downloads\ml-agents +``` + + 丮 Ͻʽÿ: + +```console +cd ml-agents-envs +pip install -e . +cd .. +cd ml-agents +pip install -e . +``` + +`-e` ÷׸ Ͽ pip ϸ ̽ ְ `mlagents-learn` ݿ˴ϴ. +`mlagents` Ű `mlagents_envs` ̰, ٸ ġϸ PyPi `mlagents_envs` ġ ֱ + Ű ġϴ ߿մϴ. + +## (ɼ) Step 4: ML-Agents Toolkit GPU н + +ML-Agents toolkit GPU ʿ н ߿ PPO ˰ ӵ ũ մϴ( Ŀ GPU ֽϴ). + ̵ GPU н ϰ ڸ ̵ Դϴ. GPU CUDA ȣȯǴ Ȯؾ մϴ. +[](https://developer.nvidia.com/cuda-gpus) Nvidia Ȯ ֽʽÿ. + + ML-Agents toolkit CUDA 9.0 cuDNN 7.0.5 ˴ϴ. + +### Nvidia CUDA toolkit ġ + +Nvidia ī̺꿡 CUDA Ŷ(toolkit) 9.0 [ٿε](https://developer.nvidia.com/cuda-toolkit-archive)ϰ ġϽʽÿ. +ML-Agents toolkit Ű CUDA Ŷ GPU ̺귯, +-ȭ , C/C++(־ Ʃ 2017) Ϸ, Ÿ ̺귯 մϴ. + ̵忡 [9.0.176](https://developer.nvidia.com/compute/cuda/9.0/Prod/network_installers/cuda_9.0.176_win10_network-exe)) մϴ. + +ġϱ , __ Ƽ Ǵ ־ Ʃ ߴ__ ȮϿ ֽʽÿ. + +ν緯 ϰ Express ɼ Ͻʽÿ. CUDA Ŷ ġ 丮 ֽʽÿ. ̵忡, +`C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v9.0` ο ġմϴ. + +### Nvidia cuDNN ̺귯 ġ + +Nvidia cuDNN ̺귯 [ٿε](https://developer.nvidia.com/cudnn)ϰ ġϽʽÿ. +cuDNN Ű ⺻ Ǵ GPU ̺귯. ٿε Nvidia Developer Program ؾ Դϴ(). + +

+ *(image: cuDNN membership required)*

+ Once you have registered, go back to the cuDNN [download page](https://developer.nvidia.com/cudnn). You may be asked to fill out a short survey. When you get to the list of cuDNN releases, __make sure you download the version that matches the CUDA toolkit you installed in Step 1.__ In this guide, that is version 7.0.5 for CUDA toolkit 9.0 ([direct download link](https://developer.nvidia.com/compute/machine-learning/cudnn/secure/v7.0.5/prod/9.0_20171129/cudnn-9.0-windows10-x64-v7)).
+
+ After downloading cuDNN, you need to extract (unzip) the files into the CUDA toolkit directory. Inside the cuDNN zip file there are three folders: `bin`, `include`, and `lib`.

+ *(image: cuDNN zip files)*

+ Copy these three folders into the CUDA toolkit directory. The CUDA toolkit directory is located at `C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v9.0`.

+ *(image: cuda toolkit directory)*
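If you want to confirm that the cuDNN `bin`, `include`, and `lib` contents were actually copied into the CUDA toolkit directory, a small check such as the following may help. It only assumes the default install path used in this guide; adjust the path if you installed CUDA 9.0 elsewhere.

```python
from pathlib import Path

# Default CUDA 9.0 location used in this guide; change it if you installed
# the toolkit somewhere else.
cuda_dir = Path(r"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v9.0")

for sub in ("bin", "include", "lib"):
    target = cuda_dir / sub
    print(f"{target}: {'found' if target.is_dir() else 'MISSING'}")
```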

+ ### Set Environment Variables
+
+ You will need to add one environment variable and two path entries.
+
+ To set the environment variable, type `environment variables` in the search bar (this can be reached by hitting the Windows key or the bottom-left Windows button). This should bring up the __Edit the system environment variables__ option.

+ *(image: edit env variables)*

+ From that option, click the __Environment Variables...__ button, and then under __System variables__ click __New...__ _(be sure to use the __System variables__ section at the bottom, not the user variables at the top)_.

+ *(image: new system variable)*

+ For the __Variable name__, enter `CUDA_HOME`, and for the value, enter the directory path of the CUDA toolkit. In this guide, that directory path is `C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v9.0`. Press the OK button.

+ *(image: system variable names and values)*

+ To set the two path variables, stay in the __Environment Variables__ window and look at the second box at the bottom, __System variables__. Find the `Path` variable, click it, and press the __Edit...__ button. You will add two directories to the list. The directories are:
+
+```console
+C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v9.0\lib\x64
+C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v9.0\extras\CUPTI\libx64
+```
+
+ Make sure these directories match the location where you installed the CUDA toolkit. _Be careful about capitalization and spelling_.

+ *(image: Path variables)*
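Before moving on to the GPU build of TensorFlow, it can be worth confirming from a fresh prompt that the variables above are visible to new processes. The sketch below only checks the values described in this section (`CUDA_HOME` and the two `Path` entries); the comparison is a simple exact-string match, so adjust it if your entries differ in case or trailing separators.

```python
import os

# The values this section asks you to configure.
expected_cuda_home = r"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v9.0"
expected_path_entries = [
    r"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v9.0\lib\x64",
    r"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v9.0\extras\CUPTI\libx64",
]

print("CUDA_HOME =", os.environ.get("CUDA_HOME", "<not set>"))

path_entries = os.environ.get("PATH", "").split(os.pathsep)
for entry in expected_path_entries:
    print(f"{entry}: {'on PATH' if entry in path_entries else 'NOT on PATH'}")
```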

+ +### TensorFlow GPU ġ + +, `pip` Ͽ 1.7.1. `tensorflow-gpu` ġϽʽÿ . ml-agents Conda ȯ Ȱȭ Ų Anaconda Ʈ +CPU TensorFlow ϰ GPU TensorFlow ġϱ ɾ Ÿ Ͻʽÿ _(ͳ Ǿ ִ ȮϽʽÿ)_: + +```sh +pip uninstall tensorflow +pip install tensorflow-gpu==1.7.1 +``` + +, ġǾ ְ, Tensorflow GPU νϰ ִ ׽Ʈؾմϴ. + Anaconda Ʈ Python ȣϿ ϴ: + +```sh +python +``` + +׸ ɾ Ÿ Ͻʽÿ: + +```python +import tensorflow as tf + +sess = tf.Session(config=tf.ConfigProto(log_device_placement=True)) +``` + + Դϴ: + +```console +Found device 0 with properties ... +``` + +## Acknowledgments + + ̵带 ʾ ۼ +[Jason Weimann](https://unity3d.college/2017/10/25/machine-learning-in-unity3d-setting-up-the-environment-tensorflow-for-agentml-on-windows-10/) + +[Nitish S. Mutha](http://blog.nitishmutha.com/tensorflow/2017/01/22/TensorFlow-with-gpu-for-windows.html) + 帳ϴ. + +## ѱ + +ش ѱ [ (Hyeonjun Jang)]([https://github.com/janghyeonjun](https://github.com/janghyeonjun)) Ǿϴ. Żڰ ִ totok682@naver.com ֽø 帮ڽϴ. diff --git a/docs/localized/KR/docs/Installation.md b/docs/localized/KR/docs/Installation.md new file mode 100644 index 0000000000..44b50d3f36 --- /dev/null +++ b/docs/localized/KR/docs/Installation.md @@ -0,0 +1,104 @@ +# 설치 + +ML-Agents를 설치하고 사용하기 위해 유니티를 설치해야 하고 이 Repository(저장소)를 +Clone(복제)하고 추가종속성을 가지는 Python(파이썬)을 설치해야합니다. 아래 Subsection(하위섹션)에서는 Docker(도커) 설정 외에도 +각 단계를 개괄적으로 설명합니다. + +## **Unity 2017.4** 또는 이후의 버전을 설치하십시오. + +[다운로드](https://store.unity.com/kr/download)하고 설치하십시오. 만약 저희의 도커 설정(차후에 소개할)을 사용하고 싶다면, +유니티를 설치할 때, Linux Build Support를 설정하십시오. + +

+ *(image: Linux Build Support)*

+ +## Windows 사용자 +Windows에서 환경을 설정하기 위해, [세부 사항](Installation-Windows.md)에 설정 방법에 대해 작성하였습니다. +Mac과 Linux는 다음 가이드를 확인해주십시오. + +## Mac 또는 Unix 사용자 + +### ML-Agents Toolkit 저장소 복제 + +유니티 설치 후에 ML-Agents Toolkit 깃허브 저장소를 설치하고 싶을 것입니다. + +```sh +git clone https://github.com/Unity-Technologies/ml-agents.git +``` + +`UnitySDK` 하위 디렉토리에는 프로젝트에 추가할 유니티 애셋이 포함되어 있습니다. +또한 시작하는데 도움이 되는 많은 [예제 환경](Learning-Environment-Examples.md)들이 있습니다. + +`ml-agents` 하위 디렉토리에는 유니티 환경과 함게 사용하는 심층 강화학습 트레이너 파이썬 패키지가 포함되어 있습니다. + +`ml-agents-envs` 하위 디렉토리에는 `ml-agents` 패키지에 종속되는 유니티의 인터페이스를 위한 파이썬 API가 포함되어 있습니다. + +`gym-unity` 하위 디렉토리에는 OpenAI Gym의 인터페이스를 위한 패키지가 포함되어 있습니다. + +### 파이썬과 mlagents 패키지 설치 + +ML-Agents toolkit을 사용하기 위해 [setup.py file](../ml-agents/setup.py)에 나열된 종속성과 함께 파이썬 3.6이 필요합니다. +주요 종속성의 일부는 다음을 포함합니다: + +- [TensorFlow](Background-TensorFlow.md) (Requires a CPU w/ AVX support) +- [Jupyter](Background-Jupyter.md) + +Python 3.6이 만약 설치되어 있지 않다면, [다운로드](https://www.python.org/downloads/)하고 설치하십시오. + +만약 당신의 파이썬 환경이 `pip3`을 포함하지 않는다면, 다음 +[지시사항](https://packaging.python.org/guides/installing-using-linux-tools/#installing-pip-setuptools-wheel-with-linux-package-managers) +을 따라서 설치하십시오. + +종속성과 `mlagents` 파이썬 패키지를 설치하기 위해 다음 명령어를 실행하십시오: + +```sh +pip3 install mlagents +``` + +이 명령어를 통해 PyPi로 부터(복제된 저장소가 아닌) `ml-agents`가 설치될 것입니다. +만약 성공적으로 설치를 완료 했다면, `mlagents-learn --help` 명령어를 실행할 수 있을 것입니다. +명령어를 실행하면 유니티 로고와 `mlagents-learn`에서 사용할 수 있는 명령어 라인 매개변수들을 볼 수 있습니다. + +**주의:** + +- 현재 Python 3.7 또는 Python 3.5을 지원하지 않습니다. +- 만약 Anaconda를 사용하고 TensorFlow에 문제가 있다면, 다음 + [링크](https://www.tensorflow.org/install/pip)에서 Anaconda 환경에서 어떻게 TensorFlow를 설치하는지 확인하십시오. +### 개발을 위한 설치방법 + +만약 `ml-agents` 또는 `ml-agents-envs`를 수정하고 싶다면, PyPi가 아닌 복제된 저장소로 부터 패키지를 설치해야 합니다. +이를 위해, `ml-agents`와 `ml-agents-envs`를 각각 설치해야 합니다. 저장소의 루트 디렉토리에서 다음 명령어를 실행하십시오: + +```sh +cd ml-agents-envs +pip3 install -e ./ +cd .. +cd ml-agents +pip3 install -e ./ +``` + +`-e` 플래그를 사용하여 pip를 실행 하면 파이썬 파일을 직접 변경할 수 있고 `mlagents-learn`를 실행할 때 반영됩니다. +`mlagents` 패키지가 `mlagents_envs`에 의존적이고, 다른 순서로 설치하면 PyPi로 부터 `mlagents_envs`를 +설치할 수 있기 때문에 이 순서대로 패키지를 설치하는 것은 중요합니다. + +## 도커 기반 설치 + +만약 ML-Agents를 위해 도커를 사용하고 싶다면, [이 가이드](Using-Docker.md)를 따라하십시오. + +## 다음 단계 + +[기초 가이드](Basic-Guide.md) 페이지에는 유니티 내에서 ML-Agents toolkit의 설정 및 학습된 모델 실행, +환경 구축, 학습 방법에 대한 여러 짧은 튜토리얼을 포함하고 있습니다. + +## 도움말 + +ML-Agents와 관련된 문제가 발생하면 저희의 [FAQ](FAQ.md)와 [제약 사항](Limitations.md) 페이지를 참고해 주십시오. +만약 문제에 대한 아무것도 찾을 수 없다면 OS, Pythons 버전 및 정확한 오류 메세지와 함께 [이슈 제출](https://github.com/Unity-Technologies/ml-agents/issues)을 해주십시오. + + +## 한글 번역 + +해당 문서의 한글 번역은 [장현준 (Hyeonjun Jang)]([https://github.com/janghyeonjun](https://github.com/janghyeonjun))에 의해 진행되었습니다. 내용상 오류나 오탈자가 있는 경우 totok682@naver.com 으로 연락주시면 감사드리겠습니다. \ No newline at end of file diff --git a/docs/localized/KR/docs/Migrating.md b/docs/localized/KR/docs/Migrating.md deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/docs/localized/KR/docs/Readme.md b/docs/localized/KR/docs/Readme.md deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/docs/localized/KR/docs/Training-Imitation-Learning.md b/docs/localized/KR/docs/Training-Imitation-Learning.md new file mode 100644 index 0000000000..dc4a24b9d1 --- /dev/null +++ b/docs/localized/KR/docs/Training-Imitation-Learning.md @@ -0,0 +1,85 @@ +# 모방학습을 통한 에이전트 학습 + +에이전트가 시행착오를 통해 스스로 학습하는 것보다 단순히 에이전트가 수행하기를 원하는 행동을 우리가 알려주는 것이 더 직관적일 수 있습니다. 위생병 NPC를 학습하기 위한 [실행 예시](ML-Agents-Overview.md#running-example-training-npc-behaviors) 문서 내용에 대해 생각해보겠습니다. 
보상 함수를 이용하여 위생병의 행동을 간접적으로 학습하는 것이 아니라 게임에서 얻어진 관측 (observation)과 게임 컨트롤러를 통해 얻어진 행동들 (actions)의 실제 데이터를 통해 위생병의 행동을 결정하도록 학습합니다. 모방학습 (Imitation Learning)은 실제 플레이를 통해 얻어진 관측과 행동 데이터 쌍을 이용하여 에이전트의 정책을 학습합니다. [비디오 링크](https://youtu.be/kpb8ZkMBFYs). + +## 시범 (Demonstration) 데이터 기록 + +유니티 에디터를 이용하여 에이전트의 플레이를 기록하고 에셋으로 저장하는 것이 가능합니다. 이런 플레이 데이터에는 기록을 진행하는 동안의 관측, 행동 그리고 보상 정보가 포함됩니다. 이것들은 데이터를 통해 관리가 가능하며 Behavioral Cloning과 같은 오프라인 학습에 사용될 수 있습니다. (아래 내용 참고) + +에이전트의 플레이 데이터를 기록하기 위해서는 씬(Scene)에서 `Agent` 컴포넌트를 포함하고 있는 GameObject에 `Demonstration Recorder` 컴포넌트를 추가해주어야 합니다. 일단 추가되고나면 에이전트로부터 플레이 데이터를 기록할 수 있게 됩니다. + +

+ *(image: BC Teacher Helper)*

+ +`Record`가 체크되는 경우 씬이 실행되면 데이터가 생성됩니다. 환경의 난이도에 따라 모방학습에 사용하기 위해 몇분에서 몇시간 정도 플레이 데이터를 수집해야합니다. 충분한 데이터가 기록되었으면 유니티 상에서 게임의 실행을 정지합니다. 그렇게 하면 `.demo` 파일이 `Assets/Demonstations` 폴더 내부에 생성됩니다. 이 파일에는 에이전트의 플레이 데이터가 저장되어 있습니다. 이 파일을 클릭하면 인스펙터 상에 데모 파일에 대한 정보를 아래와 같이 알려줍니다. + +

+ *(image: BC Teacher Helper)*

+ + +## Behavioral Cloning을 통한 학습 + +모방학습을 위한 다양한 알고리즘이 존재하며 모방학습 알고리즘 중 가장 간단한 알고리즘이 Behavioral Cloning 입니다. 이 알고리즘은 마치 이미지 분류를 위한 지도학습 (Supervised Learning)이나 기타 고전적인 머신러닝 기법들처럼 전문가의 플레이로부터 수집된 데이터를 직접적으로 모방하도록 정책 (Policy)을 학습합니다. + + +### 오프라인 학습 + +오프라인 Behavioral Cloning에서 우리는 에이전트의 행동을 학습하기 위해 `Demonstration Recorder`를 통해 생성된 `demo` 파일을 데이터 셋으로 이용합니다. + +1. 전문가의 플레이 데이터를 모방하도록 학습하는 에이전트 선택 +2. `Demonstration Recorder`를 이용하여 전문가의 플레이를 기록합니다. (위의 내용 참고) + 앞으로 설명을 위해 이 기록된 파일의 이름을 `AgentRecording.demo`라고 하겠습니다. +3. 씬을 빌드하고 에이전트에게 러닝 브레인 (Learning Brain)을 할당합니다. 그리고 아카데미의 Broadcast Hub에서 이 브레인의 Control을 체크해줍니다. 브레인에 대한 정보가 필요하시면 다음의 [문서](Learning-Environment-Design-Brains.md)를 참고해주세요. +4. `config/offline_bc_config.yaml` 파일을 열어줍니다. +5. `demo_path` 파라미터를 스텝 2에서 기록한 데모 파일의 경로로 수정해줍니다. 이번 예시의 경우 설정된 경로는 다음과 같습니다: `./UnitySDK/Assets/Demonstrations/AgentRecording.demo` +6. `./config/offline_bc_config.yaml` 을 설정 파라미터로 하는 mlagent-learn을 실행하며 `--run-id` 와 `--train` 을 입력합니다. 빌드된 환경이 standalone으로 컴파일되었거나 에디터에서 train이 생략된 경우 `--env` 파라미터에 빌드된 환경의 경로를 기입해주세요. + ​ +7. (선택적) 텐서 보드를 활용하여 학습 성능을 확인해보세요!. + +위 방법은 데모 파일을 이용하여 에이전트가 직접적으로 전문가의 행동을 따라하도록 인공신경망을 학습하는 기법입니다. 환경은 학습이 진행되는 동안 에이전트의 성능을 평가하기 위해 실행되며 사용될 것입니다. + +### 온라인 학습 + +미리 생성된 데모 파일 없이 학습이 진행되는 동안 실시간으로 전문가의 플레이 데이터를 제공하며 에이전트를 학습하는 것도 간으합니다. 이 방법은 다음의 단계를 따라 진행됩니다: +without pre-recording a demonstration file. The steps to do this are as follows: + +1. 먼저 두개의 브레인들을 생성합니다. 하나는 "선생님"이 될 것이고 하나는 "학생"이 될 것입니다. 이번 예시에서는 두개의 브레인 에셋의 이름을 각각 "Teacher"와 "Student"로 설정할 것입니다. +2. "Teacher" 브레인은 반드시 **플레이어 브레인 (Player Brain)**이어야 합니다. +3. "Student" 브레인은 반드시 **러닝 브레인 (Learning Brain)**이어야 합니다. +4. "Teacher" 브레인과 "Student" 브레인의 파라미터는 에이전트에서 설정한대로 동일하게 설정되어야 합니다. +5. "Teacher" 브레인과 "Student" 브레인을 아카데미의 `Broadcast Hub`에 추가하고 "Student" 브레인의 `Control` 체크박스에 체크를 해줍니다. +6. 브레인들을 원하는 에이전트들에게 연결해줍니다. (하나의 에이전트는 선생님으로 설정되어야 하며 적어도 하나의 에이전트는 학생으로 설정되어야 합니다). +7. `config/online_bc_config.yaml` 파일에서, "Student" 브레인에 대한 항목을 추가해야합니다. `trainer` 파라미터를 `online_bc`로 설정하고 `brain_to_imitate` 파라미터를 선생님 에이전트의 브레인 이름인 "Teacher"로 설정합니다. 추가적으로 각 순간마다 얼마나 많은 학습을 진행할지 결정하는 `batches_per_epoch`를 설정합니다. 에이전트를 더 오랜 기간동안 학습하고 싶은 경우 `max_steps` 값을 증가시켜주세요. +8. `mlagents-learn config/online_bc_config.yaml + ​--train —slow`를 통해 학습과정을 실행하고 화면에 _"Start training by pressing the Play button in the Unity Editor"_ 라는 메세지가 출력되면 유니티의 :arrow_forward: 버튼을 눌러주세요 +9. 유니티 윈도우 상에서 선생님 브레인을 가진 에이전트를 제어하면서 원하는대로 플레이 데이터를 생성합니다. +10. 학생 브레인을 가진 에이전트(들)을 살펴보면 선생님 브레인을 가진 에이전트의 플레이와 유사하게 행동하기 시작합니다. +11. 학생 에이전트들이 원하는대로 행동하게 되면 커멘드 라인에서 `CTL+C`를 눌러서 학습을 중단하십시오. +12. 생성된 `*.nn` 파일을 Assets 폴더의 하위 폴더인 `TFModels` 폴더로 이동시키고 이 파일을 `러닝` 브레인에 사용하세요. + +**BC Teacher Helper** + +더 편리한 사용을 위해서, `BC Teacher Helper` 컴포넌트를 선생님 에이전트에 사용할 수 있습니다. + +

+ *(image: BC Teacher Helper)*

+ +이것을 사용하면 다음과 같은 키보드 단축키를 사용할 수 있습니다: + +1. 기록을 시작하거나 중단할 수 있습니다. 이것은 에이전트를 통해 게임을 플레이하되 에이전트가 학습은 되지 않도록 사용할 때 유용합니다. 이것에 대한 기본적인 실행은 키보드의 `R` 버튼을 누르면 됩니다. +2. 트레이닝 버퍼를 리셋합니다. 이 명령을 통해 에이전트가 최근의 경험에 대한 버퍼를 비우도록 설정합니다. 이것은 에이전트가 빠르게 새로운 행동을 배우게 하고싶을때 사용하면 유용합니다. 버퍼를 리셋하기 위한 기본 명령은 키보드의 `C` 버튼을 누르면 됩니다. + + + +## 한글 번역 + +해당 문서의 한글 번역은 [민규식 (Kyushik Min)]([https://github.com/Kyushik](https://github.com/Kyushik))에 의해 진행되었습니다. 내용상 오류나 오탈자가 있는 경우 kyushikmin@gmail.com 으로 연락주시면 감사드리겠습니다. \ No newline at end of file diff --git a/docs/localized/KR/docs/Training-PPO.md b/docs/localized/KR/docs/Training-PPO.md new file mode 100644 index 0000000000..d61af58b84 --- /dev/null +++ b/docs/localized/KR/docs/Training-PPO.md @@ -0,0 +1,151 @@ +# Proximal Policy Optimization를 이용한 학습 + +ML-Agents는 [Proximal Policy Optimization (PPO)](https://blog.openai.com/openai-baselines-ppo/) 라는 강화학습 기법을 사용합니다. +PPO는 에이전트의 관측 (Observation)을 통해 에이전트가 주어진 상태에서 최선의 행동을 선택할 수 있도록 하는 이상적인 함수를 인공신경망을 이용하여 근사하는 기법입니다. ML-agents의 PPO 알고리즘은 텐서플로우로 구현되었으며 별도의 파이썬 프로세스 (소켓 통신을 통해 실행중인 유니티 프로그램과 통신)에서 실행됩니다. + +에이전트를 학습하기 위해서 사용자는 에이전트가 최대화하도록 시도하는 보상 시그널을 하나 혹은 그 이상 설정해야합니다. 사용 가능한 보상 시그널들과 관련된 하이퍼파라미터에 대해서는 [보상 시그널](Training-RewardSignals.md) 문서를 참고해주십시오. + +`learn.py`를 이용하여 학습 프로그램을 실행하는 방법은 [ML-Agents 학습](Training-ML-Agents.md) 문서를 참고해주십시오. + +만약 에이전트에게 기억력을 부여하기 위해 순환 신경망 (Recurrent Neural Network, RNN)을 사용하는 경우, 순환 신경망에 대한 구체적인 학습 방법을 설명하는 [순환 신경망 사용하기](Feature-Memory.md) 문서를 참고해주십시오. + + +만약 에이전트에게 제시된 문제의 난이도를 점차적으로 증가시키며 학습하는 커리큘럼 학습 (Curriculum Learning)을 사용하는 경우 [커리큘럼 학습을 통한 에이전트 학습](Training-Curriculum-Learning.md) 문서를 참고해주십니오. + +모방 학습 (Imitation Learning)에 대한 정보를 얻고 싶으시다면 [모방 학습을 통한 에이전트 학습](Training-Imitation-Learning.md) 문서를 참고해주십시오. + + + +## PPO 학습을 위한 하이퍼파라미터 + +강화학습 모델을 성공적으로 학습하기 위해서는 학습과 관련된 하이퍼파라미터 튜닝이 필요합니다. 이 가이드는 기본적인 파라미터들을 이용하여 학습했을 때 사용자가 원하는 성능을 만족하지 못한 경우 파라미터 튜닝을 수행하는 방법에 대해 설명합니다. + +## 하이퍼파라미터 + +### Reward Signals + +강화학습에서 목표는 보상을 최대로 하는 정책 (Policy)을 학습하는 것입니다. 기본적으로 보상은 환경으로부터 주어집니다. 그러나 우리는 다양한 다른 행동을 통해 에이전트에게 보상을 주는 것을 생각해볼 수 있습니다. 예를 들어 에이전트가 새로운 상태를 탐험했을 때 에이전트에게 보상을 줄 수 있습니다. 이런 보상 시그널을 추가하여 학습 과정에 도움을 줄 수도 있습니다. + +`reward_signals`는 [보상 시그널](Training-RewardSignals.md)을 정의합니다. ML-Agents는 기본적으로 두개의 보상 시그널을 제공합니다. 하나는 외부 (환경) 보상이며 다른 하나는 호기심 (Curiosity) 보상입니다. 이 호기심 보상은 외부 보상이 희소성을 가지는 환경 (Sparse Extrinsic Reward Environment)에서 더 다양한 탐험을 수행할 수 있도록 도와줍니다. + +### Lambda + +`lambd` 는 `람다(lambda)` 파라미터를 의미하며 일반화된 이득 추정 (Generalized Advantage Estimate, [GAE]((https://arxiv.org/abs/1506.02438))) 계산에 사용됩니다. 이는 업데이트된 가치를 예측할 때 현재 예측된 가치에 얼마나 의존할지 결정하는 값입니다. 이 값이 낮으면 현재 예측된 가치에 더 의존하는 것을 의미하며 (높은 편향 (bias) 발생 가능), 값이 높으면 환경을 통해 얻은 실제 보상에 더 의존하는 것을 의미합니다 (높은 분산 (variance) 발생 가능). 즉 이 파라미터를 어떻게 선택하냐에 따라 두 특성간에 트레이드오프 (trade-off)가 존재합니다. 또한 이 파라미터를 적절하게 선택하면 더 안정적인 학습이 가능합니다. + +일반적인 범위: `0.9` - `0.95` + +### Buffer Size + +`buffer_size` 는 모델 학습을 시작하기 전 얼마나 많은 경험들(관측, 행동, 보상 등)을 저장할지 결정합니다. **이 값은 `batch_size`의 배수로 설정되어야 합니다.** 일반적으로 큰 `buffer_size`는 더 안정적인 학습을 가능하게 합니다. + +일반적인 범위: `2048` - `409600` + +### Batch Size + +`batch_size` 는 한번의 경사하강(Gradient Descent) 업데이트를 수행할 때 사용할 경험들의 수를 의미합니다. **이 값은 항상 `buffer_size`의 약수로 설정되어야 합니다.** 만약 연속적인 행동 공간 (Continuous Action Space) 환경을 사용하는 경우 이 값은 크게 설정되어야 합니다 (1000의 단위). 만약 이산적인 행동 공간 (Discrete Action Space) 환경을 사용하는 경우 이 값은 더 작게 설정되어야 합니다. (10의 단위). + +일반적인 범위 (연속적인 행동): `512` - `5120` + +일반적인 범위 (이산적인 행동): `32` - `512` + +### Number of Epochs + +`num_epoch` 는 경사 하강 (Gradient Descent) 학습 동안 경험 버퍼 (Experience Buffer) 데이터에 대해 학습을 몇번 수행할 지 결정합니다. 
`batch_size`가 클수록 이 값도 커져야합니다. 이 값을 줄이면 더 안정적인 업데이트가 보장되지만 학습 속도가 느려집니다. + +일반적인 범위: `3` - `10` + +### Learning Rate + +`learning_rate` 는 경사 하강 (Gradient Descent) 학습의 정도를 결정합니다. 학습이 불안정하고 에이전트가 얻는 보상이 증가하지 않는 경우 일반적으로 학습률을 감소시킵니다. + +일반적인 범위: `1e-5` - `1e-3` + +### Time Horizon + +`time_horizon` 은 경험 버퍼 (Experience Buffer)에 저장하기 전 에이전트당 수집할 경험의 스텝 수를 의미합니다. 에피소드가 끝나기 전에 이 한도에 도달하면 가치 평가를 통해 에이전트의 현재 상태로부터 기대되는 전체 보상을 예측합니다. 따라서 이 값의 설정에 따라 덜 편향되지만 분산이 커질수도 있고 (긴 time horizon), 더 편향 (bias)되지만 분산 (variance)이 작아질 수도 있습니다 (짧은 time horizon). 한 에피소드 동안 보상이 빈번하게 발생하는 경우나 에피소드가 엄청나게 긴 경우에는 time horizon 값은 작게 설정하는 것이 이상적입니다. 이 값은 에이전트가 취하는 일련의 행동 내에서 중요한 행동을 모두 포착할 수 있을 만큼 큰 값을 가져야 합니다. + +일반적인 범위: `32` - `2048` + +### Max Steps + +`max_steps` 은 학습 과정 동안 얼마나 많은 시뮬레이션 스텝 (프레임 스킵을 곱한만큼) 을 실행할지 결정합니다. 이 값은 복잡한 문제일수록 크게 설정해야합니다. + +일반적인 범위: `5e5` - `1e7` + +### Beta + +`beta` 는 엔트로피 정규화 (Entropy Regulazation)의 정도를 결정하며 이를 통해 정책을 더 랜덤하게 만들 수 있습니다. 이 값을 통해 에이전트는 학습 동안 액션 공간을 적절하게 탐험할 수 있습니다. 이 값을 증가시키면 에이전트가 더 많이 랜덤 행동을 취하게 됩니다. 엔트로피 (텐서보드를 통해 측정 가능)는 보상이 증가함에 따라 서서히 크기를 감소시켜야합니다. 만약 엔트로피가 너무 빠르게 떨어지면 `beta`를 증가시켜야합니다. 만약 엔트로피가 너무 느리게 떨어지면 `beta`를 감소시켜야 합니다. + +일반적인 범위: 1e-4 - 1e-2 + +### Epsilon + +`epsilon` 은 경사 하강 업데이트 동안 사용하는 이전 정책과 새로운 정책 사이의 비율을 일정 범위의 크기로 제한하는 값입니다. 이 값이 작게 설정되면 더 안정적인 학습이 가능하지만 학습이 느리게 진행될 것입니다. + +일반적인 범위: `0.1` - `0.3` + +### Normalize + +`normalize`는 벡터 관측 (Vector Observation) 입력을 정규화 (Normalization)할지 결정합니다. 이 정규화는 벡터 관측의 이동 평균 및 분산을 기반으로 수행합니다. 정규화는 복잡하고 연속적인 제어 문제에서 도움이 될 수 있지만 단순하고 이산적인 제어 문제에서는 정규화를 사용하는 것이 좋지 않을 수 있습니다. + +### Number of Layers + +`num_layers` 는 관측 입력 후 혹은 시각적 관측 (Visual Observation)의 CNN 인코딩 이후 몇개의 은닉층 (Hidden Layer)을 사용할지 결정합니다. 간단한 문제에서는 적은 수의 층을 사용하여 빠르고 효율적으로 학습해야합니다. 복잡한 제어 문제에서는 많은 층을 사용할 필요가 있습니다. + +일반적인 범위: `1` - `3` + +### Hidden Units + +`hidden_units` 은 인공신경망의 각 완전연결층 (Fully Connected Layer)에 몇개의 유닛을 사용할지 결정합니다. 최적의 행동이 관측 입력의 간단한 조합으로 결정되는 단순한 문제에 대해서는 이 값을 작게 설정합니다. 최적의 행동이 관측 입력의 복잡한 관계에 의해 결정되는 어려운 문제에 대해서는 이 값을 크게 설정합니다. + +일반적인 범위: `32` - `512` + +## (선택적) 순환신경망의 하이퍼파라미터 + +아래의 하이퍼파라미터들은 `use_recurrent` 이 참(True)으로 결정된 경우에만 사용합니다. + +### Sequence Length + +`sequence_length` 는 학습 동안 네트워크를 통과하는 연속적인 경험들의 길이를 의미합니다. 에이전트가 긴 시간에 대해 기억해야하는 정보가 있다면 이 값을 충분히 길게 설정해야합니다. 예를 들어 에이전트가 물체의 속도를 기억해야하는 경우 이 값은 작게 설정해도 괜찮습니다. 만약 에이전트가 에피소드 초반에 한번 주어진 정보를 계속 기억해야한다면 이 값을 크게 설정해야 합니다. + +일반적인 범위: `4` - `128` + +### Memory Size + +`memory_size` 는 순환신경망의 은닉 상태(hidden state)를 저장하는데 사용되는 배열의 크기를 의미합니다. 이 값은 반드시 4의 배수로 설정되어야 하며 에이전트가 임무를 성공적으로 완수하기 위해서 기억해야하는 정보의 양에 따라 크기를 조절해야합니다. + +일반적인 범위: `64` - `512` + +## Training Statistics + +학습의 상태를 확인하려면 텐서보드 (TensorBoard)를 사용해야합니다. 텐서보드를 실행하고 사용하는 것에 대한 정보를 알고싶으신 경우 이 [문서](./Getting-Started-with-Balance-Ball.md#observing-training-progress)를 참고해주십시오. + +### Cumulative Reward + +보상은 일반적으로 지속적으로 증가하는 경향을 가져야합니다. 작은 기복이 발생할수는 있습니다. 문제의 복잡도에 따라 수백만 스텝의 학습이 진행되어도 보상이 증가하지 않을수도 있습니다. + +### Entropy + +이 값은 브레인이 결정이 얼마나 무작위인지 나타냅니다. 이 값은 학습이 진행되는 동안 지속적으로 감소해야합니다. 만약 이 값이 너무 빠르게 감소하거나 아예 감소하지 않는 경우 `beta`의 크기를 조절해야합니다. (이산적인 행동 공간을 사용하는 경우) + +### Learning Rate + +이 값은 시간이 지남에 따라 선형적으로 감소합니다. + +### Policy Loss + +이 값들은 학습이 진행되는 동안 진동합니다. 일반적으로 이 값들은 1보다 작아야합니다. + +### Value Estimate + +이 값들은 누적 보상이 증가함에 따라 커져야합니다. 이 값들은 주어진 시점에서 에이전트가 스스로 받을 것이라 예측하는 미래의 보상이 얼마나 될것인지를 나타냅니다. + +### Value Loss + +이 값들은 보상이 증가하면 증가하고 보상이 안정되면 감소합니다. + + + +## 한글 번역 + +해당 문서의 한글 번역은 [민규식 (Kyushik Min)]([https://github.com/Kyushik](https://github.com/Kyushik))에 의해 진행되었습니다. 내용상 오류나 오탈자가 있는 경우 kyushikmin@gmail.com 으로 연락주시면 감사드리겠습니다. 
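To tie the hyperparameters above together, here is an illustrative sketch (values are only examples chosen inside the typical ranges listed in this document, not recommendations for any particular environment). It also checks the one hard constraint stated above: `buffer_size` should be a multiple of `batch_size`.

```python
# Example values inside the typical ranges given above; tune per environment.
ppo_config = {
    "trainer": "ppo",
    "batch_size": 1024,       # continuous actions: 512 - 5120
    "buffer_size": 10240,     # must be a multiple of batch_size
    "num_epoch": 3,           # 3 - 10
    "learning_rate": 3.0e-4,  # 1e-5 - 1e-3
    "time_horizon": 64,       # 32 - 2048
    "max_steps": 5.0e5,       # 5e5 - 1e7
    "beta": 5.0e-3,           # 1e-4 - 1e-2
    "epsilon": 0.2,           # 0.1 - 0.3
    "lambd": 0.95,            # 0.9 - 0.95
    "num_layers": 2,          # 1 - 3
    "hidden_units": 128,      # 32 - 512
    "normalize": True,
}

# Constraint stated in this document.
assert ppo_config["buffer_size"] % ppo_config["batch_size"] == 0
print(ppo_config)
```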
\ No newline at end of file diff --git a/docs/localized/KR/docs/Using-Docker.md b/docs/localized/KR/docs/Using-Docker.md new file mode 100644 index 0000000000..947479296d --- /dev/null +++ b/docs/localized/KR/docs/Using-Docker.md @@ -0,0 +1,123 @@ +# ML-Agents 용 도커 사용법 + +도커를 사용해 추론과 학습을 하고자하는 Windows와 Mac 사용자를 위한 솔루션을 제공합니다. +이것은 Python과 TensorFlow 설치를 피하고자 하는 분에게 매력적인 옵션이 될 것입니다. 현재 설정은 TensorFlow와 Unity가 _CPU를 통해서만_ +계산하도록 합니다. 따라서 도커 시뮬레이션은 GPU를 사용하지 않고 시각적 렌더링을 위해 [`Xvfb`](https://en.wikipedia.org/wiki/Xvfb)를 사용합니다. +`Xvfb`는 `ML-Agents`(또는 다른 응용 프로그램)가 가상으로 렌더링을 할 수 있게하는 유틸리티 입니다. 즉, `ML-Agents`를 실행하는 기계가 GPU를 가지고 있거나 +디스플레이를 가지고 있다고 가정하지 않습니다. 이것은 카메라 기반의 시각적 관찰 요소가 포함된 환경은 더욱 느려질 수도 있음을 의미합니다. + +## 요구사항 + +- 유니티 _Linux Build Support_ 컴포넌트 +- [도커](https://www.docker.com) + +## 설치 + +- 유니티 인스톨러를 [다운로드](https://unity3d.com/kr/get-unity/download)하고 _Linux Build Support_ 컴포넌트를 추가하십시오. + +- 도커가 설치되어 있지 않다면 [다운로드](https://www.docker.com/community-edition#/download)하고 설치 하십시오. + +- 호스트 머신과 분리된 환경에서 도커를 실행하기 때문에, 호스트 머신안에 마운트된 디렉토리는 트레이너 환경 설정 파일, + 유니티 실행 파일, 커리큘럼 파일과 TensorFlow 그래프와 같은 데이터를 공유하기위해 사용됩니다. + 이를 위해, 편의상 비어있는 `unity-volume` 디렉토리를 저장소의 루트에 만들었으나, 다른 디렉토리의 사용은 자유롭게 할 수 있습니다. + 이 가이드의 나머지 부분에서는 `unity-volume` 디렉토리가 사용된다고 가정하고 진행됩니다. + +## 사용법 + +ML-Agents 용 도커 사용에는 세 단계가 포함됩니다.: 특정 플래그를 사용하여 유니티 환경 빌드, 도커 컨테이너 빌드 +마지막으로, 컨테이너 실행. 만약 ML-Agents 용 유니티 환경 빌드에 익숙하지 않다면, [3D 밸런스 볼 예제와 함께 시작하기](Getting-Started-with-Balance-Ball.md) 가이드를 먼저 읽으십시오. + +### 환경 빌드 (옵션) + +_학습을 위해 에디터 사용을 원한다면 이 단계를 건너뛸 수 있습니다._ + +도커는 일반적으로 호스트 머신과 (리눅스) 커널을 공유하는 컨테이너를 실행하기 때문에, +유니티 환경은 리눅스 플랫폼이 구축되어야 합니다. 유니티 환경을 빌드할 때, 빌드 세팅 창(Build Settings window)에서 +다음 옵션을 선택해 주십시오: + +- 타겟 플랫폼을 `리눅스`로 설정 (Set the _Target Platform_ to `Linux`) +- _아키텍처_를 `x86_64'로 설정 (Set the _Architecture_ to `x86_64`) +- 환경에서 시각적인 관찰을 필요로 하지않는다면, `headless` 옵션을 선택할 수 있습니다 (아래 사진 참조). + +`빌드` (Build)를 클릭하고, 환경 이름을 선택하고 (예시: `3DBall`) 출력 디레토리를 `unity-volume`으로 설정하십시오. +빌드 후에, 파일 `<환경 이름>.x86_64` 와 하위디렉토리 `<환경 이름>_Data/` 가 `unity-volume` 에 생성 되어있는지 확인하십시오. + +![도커를 위한 빌드 설정](images/docker_build_settings.png) + +### 도커 컨테이너 빌드 + +첫 번째, 도커 머신이 시스템에서 작동하는지 확인하십시오. 저장소의 최상단에서 다음 명령어를 호출하여 +도커 컨테이너를 빌드하십시오: + +```sh +docker build -t . +``` + +``을 도커 이미지 이름으로 바꾸십시오, 예시: `balance.ball.v0.1`. + +### 도커 컨테이너 실행 + +저장소의 최상단에서 다음 명령어를 호출하여 도커 컨테이너를 실행하십시오: + +```sh +docker run --name \ + --mount type=bind,source="$(pwd)"/unity-volume,target=/unity-volume \ + -p 5005:5005 \ + :latest \ + --docker-target-name=unity-volume \ + \ + --env= \ + --train \ + --run-id= +``` + +인수(argument) 값 정보: + +- `` 은 컨테이너를 구분하기위해 사용됩니다 (컨테이너를 인터럽트하거나 종료시킬 때). +이것은 선택사항이며 설정하지 않았을 경우 도커는 랜덤한 이름을 생성합니다. _도커 이미지를 실행할 때마다 +고유한 이름을 가져야함에 유의하십시오._ +- `` 컨테이너를 빌드할 때 사용할 image name을 참조합니다. +- `` __(옵션)__: 리눅스 실행파일과 함께 학습을 할 경우, 인수 값이 실행파일의 이름이 된다. +에디터에서 학습을 할 경우, `` 인수를 전달하지 말고 유니티에서 _"Start training by pressing + the Play button in the Unity Editor"_ 메세지가 화면에 표시될 때 :arrow_forward: 버튼을 누르십시오. +- `source`: 유니티 실행파일을 저장할 호스트 운영체제의 경로를 참조합니다. +- `target`: 도커가`source` 경로에 이 이름을 가진 디스크로 마운트하도록 합니다. +- `docker-target-name`: ML-Agents 파이썬 패키지에게 유니티 실행파일을 읽고 그래프를 저장할 수 있는 디스크의 이름을 알려준다. +**그러므로 `target`과 동일한 값을 가져야 합니다.** +- `trainer-config-file`, `train`, `run-id`: ML-Agents 인자들은 `mlagents-learn`로 전달됩니다. 트레이너 설정 파일의 이름 `trainer-config-file`, +알고리즘을 학습하는 `train`, 그리고 각 실험에 고유한 식별자를 태깅하는데 사용되는 `run-id`. +컨테이너가 파일에 접근할 수 있도록 trainer-config 파일을 `unity-volume` 안에 둘 것을 권장합니다. 
+ +`3DBall` 환경 실행파일을 학습하기 위해 다음 명령어가 사용됩니다: + +```sh +docker run --name 3DBallContainer.first.trial \ + --mount type=bind,source="$(pwd)"/unity-volume,target=/unity-volume \ + -p 5005:5005 \ + balance.ball.v0.1:latest 3DBall \ + --docker-target-name=unity-volume \ + trainer_config.yaml \ + --env=3DBall + --train \ + --run-id=3dball_first_trial +``` + +도커 마운트에 대한 세부 사항은 도커의 [이 문서](https://docs.docker.com/storage/bind-mounts/)를 참고해 주십시오. + +**참고** 도커를 사용해 시각적인 관찰을 포함한 환경을 학습할 경우, 콘테이너를 위해 할당한 도커의 디폴트 메모리를 늘려야할 것입니다. +예를 들어, [여기](https://docs.docker.com/docker-for-mac/#advanced) Mac 사용자를 위한 도커 지시사항을 봐주십시오. + +### 컨테이너 중지 및 상태 저장 + +학습 진행 상황에 만족했을 경우, 상태를 저장하는 동안 `Ctrl+C` or `⌘+C` (Mac) 키를 사용하거나 다음 명령어를 통해 도커 컨테이너를 중지할 수 있습니다: + +```sh +docker kill --signal=SIGINT +``` + +`` 은 `docker run` 명령어에 지정된 컨테이너 이름입니다. 지정하지 않으면 무작위로 생성되며`docker container ls`를 통해 확인할 수 있습니다. + + +## 한글 번역 + +해당 문서의 한글 번역은 [장현준 (Hyeonjun Jang)]([https://github.com/janghyeonjun](https://github.com/janghyeonjun))에 의해 진행되었습니다. 내용상 오류나 오탈자가 있는 경우 totok682@naver.com 으로 연락주시면 감사드리겠습니다. diff --git a/docs/localized/KR/docs/images/3dball_learning_brain.png b/docs/localized/KR/docs/images/3dball_learning_brain.png new file mode 100644 index 0000000000..1f4a4440ed Binary files /dev/null and b/docs/localized/KR/docs/images/3dball_learning_brain.png differ diff --git a/docs/localized/KR/docs/images/3dballhard.png b/docs/localized/KR/docs/images/3dballhard.png new file mode 100644 index 0000000000..a452167157 Binary files /dev/null and b/docs/localized/KR/docs/images/3dballhard.png differ diff --git a/docs/localized/KR/docs/images/academy.png b/docs/localized/KR/docs/images/academy.png index 19e42d5d60..62f3e5f8e5 100644 Binary files a/docs/localized/KR/docs/images/academy.png and b/docs/localized/KR/docs/images/academy.png differ diff --git a/docs/localized/KR/docs/images/agent.png b/docs/localized/KR/docs/images/agent.png index 4504666f30..1918afe54d 100644 Binary files a/docs/localized/KR/docs/images/agent.png and b/docs/localized/KR/docs/images/agent.png differ diff --git a/docs/localized/KR/docs/images/bananaimitation.png b/docs/localized/KR/docs/images/bananaimitation.png new file mode 100644 index 0000000000..7fcf2fed87 Binary files /dev/null and b/docs/localized/KR/docs/images/bananaimitation.png differ diff --git a/docs/localized/KR/docs/images/brain.png b/docs/localized/KR/docs/images/brain.png index 7cc9749361..b7e45cdfb7 100644 Binary files a/docs/localized/KR/docs/images/brain.png and b/docs/localized/KR/docs/images/brain.png differ diff --git a/docs/localized/KR/docs/images/broadcast.png b/docs/localized/KR/docs/images/broadcast.png index 0bf73e8e78..5428110aef 100644 Binary files a/docs/localized/KR/docs/images/broadcast.png and b/docs/localized/KR/docs/images/broadcast.png differ diff --git a/docs/localized/KR/docs/images/crawler.png b/docs/localized/KR/docs/images/crawler.png index 6123e86670..3b5c46050a 100644 Binary files a/docs/localized/KR/docs/images/crawler.png and b/docs/localized/KR/docs/images/crawler.png differ diff --git a/docs/localized/KR/docs/images/cuDNN_membership_required.png b/docs/localized/KR/docs/images/cuDNN_membership_required.png index 40a9ee0abc..6a7ffc6cd2 100644 Binary files a/docs/localized/KR/docs/images/cuDNN_membership_required.png and b/docs/localized/KR/docs/images/cuDNN_membership_required.png differ diff --git a/docs/localized/KR/docs/images/cuda_toolkit_directory.PNG b/docs/localized/KR/docs/images/cuda_toolkit_directory.PNG index d097d17905..304ec7fc57 100644 Binary files 
a/docs/localized/KR/docs/images/cuda_toolkit_directory.PNG and b/docs/localized/KR/docs/images/cuda_toolkit_directory.PNG differ diff --git a/docs/localized/KR/docs/images/cudnn_zip_files.PNG b/docs/localized/KR/docs/images/cudnn_zip_files.PNG index 532d888c76..9170f34f94 100644 Binary files a/docs/localized/KR/docs/images/cudnn_zip_files.PNG and b/docs/localized/KR/docs/images/cudnn_zip_files.PNG differ diff --git a/docs/localized/KR/docs/images/demo_component.png b/docs/localized/KR/docs/images/demo_component.png new file mode 100644 index 0000000000..6cc78380bb Binary files /dev/null and b/docs/localized/KR/docs/images/demo_component.png differ diff --git a/docs/localized/KR/docs/images/demo_inspector.png b/docs/localized/KR/docs/images/demo_inspector.png new file mode 100644 index 0000000000..9cb7a60980 Binary files /dev/null and b/docs/localized/KR/docs/images/demo_inspector.png differ diff --git a/docs/localized/KR/docs/images/edit_env_var_kr.png b/docs/localized/KR/docs/images/edit_env_var_kr.png new file mode 100644 index 0000000000..c94f359e6c Binary files /dev/null and b/docs/localized/KR/docs/images/edit_env_var_kr.png differ diff --git a/docs/localized/KR/docs/images/image-banner.png b/docs/localized/KR/docs/images/image-banner.png new file mode 100644 index 0000000000..1c705bb00e Binary files /dev/null and b/docs/localized/KR/docs/images/image-banner.png differ diff --git a/docs/localized/KR/docs/images/mlagents-3DBallHierarchy.png b/docs/localized/KR/docs/images/mlagents-3DBallHierarchy.png index 166d506d5a..a853848a48 100644 Binary files a/docs/localized/KR/docs/images/mlagents-3DBallHierarchy.png and b/docs/localized/KR/docs/images/mlagents-3DBallHierarchy.png differ diff --git a/docs/localized/KR/docs/images/mlagents-NewProject.png b/docs/localized/KR/docs/images/mlagents-NewProject.png index e8120309cd..81f5d994ef 100644 Binary files a/docs/localized/KR/docs/images/mlagents-NewProject.png and b/docs/localized/KR/docs/images/mlagents-NewProject.png differ diff --git a/docs/localized/KR/docs/images/mlagents-NewTutAcademy.png b/docs/localized/KR/docs/images/mlagents-NewTutAcademy.png index 02e2a4d210..d3bf3289a8 100644 Binary files a/docs/localized/KR/docs/images/mlagents-NewTutAcademy.png and b/docs/localized/KR/docs/images/mlagents-NewTutAcademy.png differ diff --git a/docs/localized/KR/docs/images/mlagents-NewTutAssignBrain.png b/docs/localized/KR/docs/images/mlagents-NewTutAssignBrain.png index 9e00edbc4a..b657046c88 100644 Binary files a/docs/localized/KR/docs/images/mlagents-NewTutAssignBrain.png and b/docs/localized/KR/docs/images/mlagents-NewTutAssignBrain.png differ diff --git a/docs/localized/KR/docs/images/mlagents-NewTutBlock.png b/docs/localized/KR/docs/images/mlagents-NewTutBlock.png index 32eac7fe1e..8ef983a869 100644 Binary files a/docs/localized/KR/docs/images/mlagents-NewTutBlock.png and b/docs/localized/KR/docs/images/mlagents-NewTutBlock.png differ diff --git a/docs/localized/KR/docs/images/mlagents-NewTutBrain.png b/docs/localized/KR/docs/images/mlagents-NewTutBrain.png index b130395452..23a5093d81 100644 Binary files a/docs/localized/KR/docs/images/mlagents-NewTutBrain.png and b/docs/localized/KR/docs/images/mlagents-NewTutBrain.png differ diff --git a/docs/localized/KR/docs/images/mlagents-NewTutFloor.png b/docs/localized/KR/docs/images/mlagents-NewTutFloor.png index b3e71bb11a..7c070c40ee 100644 Binary files a/docs/localized/KR/docs/images/mlagents-NewTutFloor.png and b/docs/localized/KR/docs/images/mlagents-NewTutFloor.png differ diff --git 
a/docs/localized/KR/docs/images/mlagents-NewTutHierarchy.png b/docs/localized/KR/docs/images/mlagents-NewTutHierarchy.png index 549040098f..d1c4e350c5 100644 Binary files a/docs/localized/KR/docs/images/mlagents-NewTutHierarchy.png and b/docs/localized/KR/docs/images/mlagents-NewTutHierarchy.png differ diff --git a/docs/localized/KR/docs/images/mlagents-NewTutSphere.png b/docs/localized/KR/docs/images/mlagents-NewTutSphere.png index 982e1e3a3b..55d6e3cb47 100644 Binary files a/docs/localized/KR/docs/images/mlagents-NewTutSphere.png and b/docs/localized/KR/docs/images/mlagents-NewTutSphere.png differ diff --git a/docs/localized/KR/docs/images/mlagents-NewTutSplash.png b/docs/localized/KR/docs/images/mlagents-NewTutSplash.png index 25d8e80d21..0e6efc2181 100644 Binary files a/docs/localized/KR/docs/images/mlagents-NewTutSplash.png and b/docs/localized/KR/docs/images/mlagents-NewTutSplash.png differ diff --git a/docs/localized/KR/docs/images/mlagents-Open3DBall.png b/docs/localized/KR/docs/images/mlagents-Open3DBall.png index f59092a5bb..840ad6b64f 100644 Binary files a/docs/localized/KR/docs/images/mlagents-Open3DBall.png and b/docs/localized/KR/docs/images/mlagents-Open3DBall.png differ diff --git a/docs/localized/KR/docs/images/mlagents-RollerAgentStats.png b/docs/localized/KR/docs/images/mlagents-RollerAgentStats.png new file mode 100644 index 0000000000..f1cde7cda2 Binary files /dev/null and b/docs/localized/KR/docs/images/mlagents-RollerAgentStats.png differ diff --git a/docs/localized/KR/docs/images/mlagents-SetBrainToTrain.png b/docs/localized/KR/docs/images/mlagents-SetBrainToTrain.png new file mode 100644 index 0000000000..9fa8347e3d Binary files /dev/null and b/docs/localized/KR/docs/images/mlagents-SetBrainToTrain.png differ diff --git a/docs/localized/KR/docs/images/mlagents-TensorBoard.png b/docs/localized/KR/docs/images/mlagents-TensorBoard.png index ffe7055e4f..a4e3fde36f 100644 Binary files a/docs/localized/KR/docs/images/mlagents-TensorBoard.png and b/docs/localized/KR/docs/images/mlagents-TensorBoard.png differ diff --git a/docs/localized/KR/docs/images/new_system_variable.PNG b/docs/localized/KR/docs/images/new_system_variable.PNG index d9ea72a72d..b27365977a 100644 Binary files a/docs/localized/KR/docs/images/new_system_variable.PNG and b/docs/localized/KR/docs/images/new_system_variable.PNG differ diff --git a/docs/localized/KR/docs/images/new_system_variable_kr.PNG b/docs/localized/KR/docs/images/new_system_variable_kr.PNG new file mode 100644 index 0000000000..14c879a333 Binary files /dev/null and b/docs/localized/KR/docs/images/new_system_variable_kr.PNG differ diff --git a/docs/localized/KR/docs/images/path_variables.PNG b/docs/localized/KR/docs/images/path_variables.PNG index c6455327d7..35745c56a5 100644 Binary files a/docs/localized/KR/docs/images/path_variables.PNG and b/docs/localized/KR/docs/images/path_variables.PNG differ diff --git a/docs/localized/KR/docs/images/path_variables_kr.PNG b/docs/localized/KR/docs/images/path_variables_kr.PNG new file mode 100644 index 0000000000..b7193984cc Binary files /dev/null and b/docs/localized/KR/docs/images/path_variables_kr.PNG differ diff --git a/docs/localized/KR/docs/images/platform_prefab.png b/docs/localized/KR/docs/images/platform_prefab.png new file mode 100644 index 0000000000..9eed9e3c1d Binary files /dev/null and b/docs/localized/KR/docs/images/platform_prefab.png differ diff --git a/docs/localized/KR/docs/images/pyramids.png b/docs/localized/KR/docs/images/pyramids.png new file mode 100644 index 
0000000000..9d26a7d8cc Binary files /dev/null and b/docs/localized/KR/docs/images/pyramids.png differ diff --git a/docs/localized/KR/docs/images/running-a-pretrained-model.gif b/docs/localized/KR/docs/images/running-a-pretrained-model.gif new file mode 100644 index 0000000000..8d9e5929c8 Binary files /dev/null and b/docs/localized/KR/docs/images/running-a-pretrained-model.gif differ diff --git a/docs/localized/KR/docs/images/system_variable_name_value.PNG b/docs/localized/KR/docs/images/system_variable_name_value.PNG index 85d1d18b6c..ae3a47d623 100644 Binary files a/docs/localized/KR/docs/images/system_variable_name_value.PNG and b/docs/localized/KR/docs/images/system_variable_name_value.PNG differ diff --git a/docs/localized/KR/docs/images/system_variable_name_value_kr.PNG b/docs/localized/KR/docs/images/system_variable_name_value_kr.PNG new file mode 100644 index 0000000000..447cab78c4 Binary files /dev/null and b/docs/localized/KR/docs/images/system_variable_name_value_kr.PNG differ diff --git a/docs/localized/KR/docs/images/visual-observation-combination.png b/docs/localized/KR/docs/images/visual-observation-combination.png new file mode 100644 index 0000000000..a40b37752c Binary files /dev/null and b/docs/localized/KR/docs/images/visual-observation-combination.png differ diff --git a/docs/localized/KR/docs/images/visual-observation-debug.png b/docs/localized/KR/docs/images/visual-observation-debug.png new file mode 100644 index 0000000000..32449c963b Binary files /dev/null and b/docs/localized/KR/docs/images/visual-observation-debug.png differ diff --git a/docs/localized/KR/docs/images/visual-observation-rawimage.png b/docs/localized/KR/docs/images/visual-observation-rawimage.png new file mode 100644 index 0000000000..03142985aa Binary files /dev/null and b/docs/localized/KR/docs/images/visual-observation-rawimage.png differ diff --git a/docs/localized/KR/docs/images/visual-observation-rendertexture.png b/docs/localized/KR/docs/images/visual-observation-rendertexture.png new file mode 100644 index 0000000000..d2f8c7f662 Binary files /dev/null and b/docs/localized/KR/docs/images/visual-observation-rendertexture.png differ diff --git a/docs/localized/KR/docs/images/visual-observation.png b/docs/localized/KR/docs/images/visual-observation.png index f89a71489e..bfc3144049 100644 Binary files a/docs/localized/KR/docs/images/visual-observation.png and b/docs/localized/KR/docs/images/visual-observation.png differ diff --git a/docs/localized/KR/docs/images/walker.png b/docs/localized/KR/docs/images/walker.png new file mode 100644 index 0000000000..af901fa943 Binary files /dev/null and b/docs/localized/KR/docs/images/walker.png differ diff --git a/docs/localized/zh-CN/README.md b/docs/localized/zh-CN/README.md index 0d38124091..20ef6a4cdd 100755 --- a/docs/localized/zh-CN/README.md +++ b/docs/localized/zh-CN/README.md @@ -1,6 +1,6 @@ -# Unity ML-Agents 工具包(Beta) +# Unity ML-Agents 工具包(Beta) v0.3.1 **注意:** 本文档为v0.3版本文档的部分翻译版,目前并不会随着英文版文档更新而更新。若要查看更新更全的英文版文档,请查看[这里](https://github.com/Unity-Technologies/ml-agents)。 diff --git a/ml-agents-envs/mlagents/envs/environment.py b/ml-agents-envs/mlagents/envs/environment.py index b6acf890cd..d642d792b6 100644 --- a/ml-agents-envs/mlagents/envs/environment.py +++ b/ml-agents-envs/mlagents/envs/environment.py @@ -11,6 +11,7 @@ from .brain import AllBrainInfo, BrainInfo, BrainParameters from .exception import ( UnityEnvironmentException, + UnityCommunicationException, UnityActionException, UnityTimeOutException, ) @@ -343,7 +344,7 @@ def reset( 
self._generate_reset_input(train_mode, config, custom_reset_parameters) ) if outputs is None: - raise KeyboardInterrupt + raise UnityCommunicationException("Communicator has stopped.") rl_output = outputs.rl_output s = self._get_state(rl_output) self._global_done = s[1] @@ -570,7 +571,7 @@ def step( with hierarchical_timer("communicator.exchange"): outputs = self.communicator.exchange(step_input) if outputs is None: - raise KeyboardInterrupt + raise UnityCommunicationException("Communicator has stopped.") rl_output = outputs.rl_output state = self._get_state(rl_output) self._global_done = state[1] diff --git a/ml-agents-envs/mlagents/envs/exception.py b/ml-agents-envs/mlagents/envs/exception.py index 7824740c47..f1c0bed80c 100644 --- a/ml-agents-envs/mlagents/envs/exception.py +++ b/ml-agents-envs/mlagents/envs/exception.py @@ -19,6 +19,14 @@ class UnityEnvironmentException(UnityException): pass +class UnityCommunicationException(UnityException): + """ + Related to errors with the communicator. + """ + + pass + + class UnityActionException(UnityException): """ Related to errors with sending actions. diff --git a/ml-agents-envs/mlagents/envs/subprocess_env_manager.py b/ml-agents-envs/mlagents/envs/subprocess_env_manager.py index 679e548956..babb20382c 100644 --- a/ml-agents-envs/mlagents/envs/subprocess_env_manager.py +++ b/ml-agents-envs/mlagents/envs/subprocess_env_manager.py @@ -2,6 +2,7 @@ import cloudpickle from mlagents.envs import UnityEnvironment +from mlagents.envs.exception import UnityCommunicationException from multiprocessing import Process, Pipe, Queue from multiprocessing.connection import Connection from queue import Empty as EmptyQueueException @@ -47,14 +48,14 @@ def send(self, name: str, payload=None): cmd = EnvironmentCommand(name, payload) self.conn.send(cmd) except (BrokenPipeError, EOFError): - raise KeyboardInterrupt + raise UnityCommunicationException("UnityEnvironment worker: send failed.") def recv(self) -> EnvironmentResponse: try: response: EnvironmentResponse = self.conn.recv() return response except (BrokenPipeError, EOFError): - raise KeyboardInterrupt + raise UnityCommunicationException("UnityEnvironment worker: recv failed.") def close(self): try: @@ -115,8 +116,9 @@ def _send_response(cmd_name, payload): _send_response("global_done", env.global_done) elif cmd.name == "close": break - except KeyboardInterrupt: - print("UnityEnvironment worker: keyboard interrupt") + except (KeyboardInterrupt, UnityCommunicationException): + print("UnityEnvironment worker: environment stopping.") + step_queue.put(EnvironmentResponse("env_close", worker_id, None)) finally: step_queue.close() env.close() @@ -171,6 +173,10 @@ def step(self) -> List[StepInfo]: try: while True: step = self.step_queue.get_nowait() + if step.name == "env_close": + raise UnityCommunicationException( + "At least one of the environments has closed." + ) self.env_workers[step.worker_id].waiting = False if step.worker_id not in step_workers: worker_steps.append(step) diff --git a/ml-agents/mlagents/trainers/components/reward_signals/gail/signal.py b/ml-agents/mlagents/trainers/components/reward_signals/gail/signal.py index 891f12a69a..a7c923ac4d 100644 --- a/ml-agents/mlagents/trainers/components/reward_signals/gail/signal.py +++ b/ml-agents/mlagents/trainers/components/reward_signals/gail/signal.py @@ -34,6 +34,7 @@ def __init__( reward multiplied by the strength parameter :param gamma: The time discounting factor used for this reward. 
:param demo_path: The path to the demonstration file + :param num_epoch: The number of epochs to train over the training buffer for the discriminator. :param encoding_size: The size of the the hidden layers of the discriminator :param learning_rate: The Learning Rate used during GAIL updates. :param samples_per_update: The maximum number of samples to update during GAIL updates. diff --git a/ml-agents/mlagents/trainers/tensorflow_to_barracuda.py b/ml-agents/mlagents/trainers/tensorflow_to_barracuda.py index 69a9ebdaf4..3e7ec4b0fe 100644 --- a/ml-agents/mlagents/trainers/tensorflow_to_barracuda.py +++ b/ml-agents/mlagents/trainers/tensorflow_to_barracuda.py @@ -52,7 +52,9 @@ id=1, rank=2, out_shapes=lambda shapes: [ - [shapes[0][0], 1, 1, shapes[0][1]], # W + [shapes[0][0], 1, 1, shapes[0][1]] + if len(shapes[0]) > 1 + else [1, 1, 1, 1], # W [1, 1, 1, shapes[-1][-1]], # B ], patch_data=lambda data: [data[0], data[1]], @@ -324,9 +326,14 @@ "ConcatV2", "Identity", ] - ): "BasicLSTM", - repr([re.compile("^lstm/"), "Reshape", "ConcatV2", "Identity"]): "BasicLSTM", - repr(["Reshape", re.compile("^lstm_[a-z]*/"), "Reshape", "ConcatV2"]): "BasicLSTM", + ): "BasicLSTMReshapeOut", + repr( + [re.compile("^lstm/"), "Reshape", "ConcatV2", "Identity"] + ): "BasicLSTMReshapeOut", + repr( + ["Reshape", re.compile("^lstm_[a-z]*/"), "Reshape", "ConcatV2"] + ): "BasicLSTMReshapeOut", + repr(["Reshape", re.compile("^lstm_[a-z]*/"), "ConcatV2"]): "BasicLSTMConcatOut", repr(["Sigmoid", "Mul"]): "Swish", repr(["Mul", "Abs", "Mul", "Add"]): "LeakyRelu", repr( @@ -546,9 +553,12 @@ def order_by(args, names): "SquaredDifference": lambda nodes, inputs, tensors, _: sqr_diff( nodes[-1].name, inputs[0], inputs[1] ), - "BasicLSTM": lambda nodes, inputs, tensors, context: basic_lstm( + "BasicLSTMReshapeOut": lambda nodes, inputs, tensors, context: basic_lstm( nodes, inputs, tensors, context, find_type="Reshape" ), + "BasicLSTMConcatOut": lambda nodes, inputs, tensors, context: basic_lstm( + nodes, inputs, tensors, context, find_type="ConcatV2" + ), "Swish": lambda nodes, inputs, tensors, _: Struct(op="Swish", input=inputs), "LeakyRelu": lambda nodes, inputs, tensors, _: Struct(op="LeakyRelu", input=inputs), # TODO:'Round' diff --git a/ml-agents/mlagents/trainers/trainer_controller.py b/ml-agents/mlagents/trainers/trainer_controller.py index 607fc4ede1..cfa7911ae4 100644 --- a/ml-agents/mlagents/trainers/trainer_controller.py +++ b/ml-agents/mlagents/trainers/trainer_controller.py @@ -14,7 +14,10 @@ from mlagents.envs import BrainParameters from mlagents.envs.env_manager import StepInfo from mlagents.envs.env_manager import EnvManager -from mlagents.envs.exception import UnityEnvironmentException +from mlagents.envs.exception import ( + UnityEnvironmentException, + UnityCommunicationException, +) from mlagents.envs.sampler_class import SamplerManager from mlagents.envs.timers import hierarchical_timer, get_timer_tree, timed from mlagents.trainers import Trainer, TrainerMetrics @@ -302,15 +305,15 @@ def start_learning( # Final save Tensorflow model if global_step != 0 and self.train_model: self._save_model() - except KeyboardInterrupt: + except (KeyboardInterrupt, UnityCommunicationException): if self.train_model: self._save_model_when_interrupted() pass - env_manager.close() if self.train_model: self._write_training_metrics() self._export_graph() self._write_timing_tree() + env_manager.close() def end_trainer_episodes( self, env: BaseUnityEnvironment, lessons_incremented: Dict[str, bool]
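The source changes above replace the old `KeyboardInterrupt` behavior with a dedicated `UnityCommunicationException` whenever the communicator stops. A minimal sketch of how calling code might react to this, assuming the `mlagents.envs` API shown in this diff (the environment name below is a placeholder):

```python
from mlagents.envs import UnityEnvironment
from mlagents.envs.exception import UnityCommunicationException

# Placeholder build name; pass file_name=None to connect to the Unity Editor.
env = UnityEnvironment(file_name="3DBall", worker_id=0)

try:
    env.reset(train_mode=True)
    for _ in range(100):
        # Real code would pass vector_action=... for each external brain here.
        env.step()
except UnityCommunicationException:
    # With this change, a stopped communicator no longer surfaces as a
    # KeyboardInterrupt, so callers can handle it explicitly.
    print("The Unity environment stopped communicating; shutting down.")
finally:
    env.close()
```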