-
Notifications
You must be signed in to change notification settings - Fork 4.4k
Fix timers when using multithreading. #3901
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
|
Example from 3DBall with the fix {
"name": "root",
"gauges": {
"ppo_3DBall.Policy.Entropy.mean": {
"value": 1.3003549575805664,
"min": 1.3003549575805664,
"max": 1.4189382791519165,
"count": 12
},
"ppo_3DBall.Environment.EpisodeLength.mean": {
"value": 971.5,
"min": 21.158379373848987,
"max": 971.5,
"count": 12
},
"ppo_3DBall.Policy.ExtrinsicValueEstimate.mean": {
"value": 3.8920631408691406,
"min": -0.03331759199500084,
"max": 3.8920631408691406,
"count": 12
},
"ppo_3DBall.Environment.CumulativeReward.mean": {
"value": 97.06668154398601,
"min": 1.113284105744309,
"max": 97.06668154398601,
"count": 12
},
"ppo_3DBall.Policy.ExtrinsicReward.mean": {
"value": 97.06668154398601,
"min": 1.113284105744309,
"max": 97.06668154398601,
"count": 12
},
"ppo_3DBall.IsTraining.mean": {
"value": 1.0,
"min": 1.0,
"max": 1.0,
"count": 12
},
"ppo_3DBall.Losses.ValueLoss.mean": {
"value": 8.972146034240723,
"min": 0.3149506449699402,
"max": 11.991447448730469,
"count": 11
},
"ppo_3DBall.Losses.PolicyLoss.mean": {
"value": 0.10021013021469116,
"min": 0.09831372648477554,
"max": 0.10745824873447418,
"count": 11
},
"ppo_3DBall.Policy.LearningRate.mean": {
"value": 0.0002184834738727659,
"min": 0.0002184834738727659,
"max": 0.0002927808091044426,
"count": 11
}
},
"metadata": {
"timer_format_version": "0.1.0",
"start_time_seconds": "1588192751",
"python_version": "3.7.6 (default, Dec 30 2019, 19:38:28) \n[Clang 11.0.0 (clang-1100.0.33.16)]",
"command_line_arguments": "/Users/foo/code/ml-agents/venv/bin/mlagents-learn config/trainer_config.yaml --force --env=Project/Build/ball --no-graphics",
"mlagents_version": "0.16.0",
"mlagents_envs_version": "0.16.0",
"communication_protocol_version": "0.17.0",
"tensorflow_version": "2.0.1",
"end_time_seconds": "1588192850"
},
"total": 99.22794142299999,
"count": 1,
"self": 0.004021215999983951,
"children": {
"run_training.setup": {
"total": 0.01560470500000033,
"count": 1,
"self": 0.01560470500000033
},
"TrainerController.start_learning": {
"total": 99.208315502,
"count": 1,
"self": 9.104267448999181,
"children": {
"TrainerController._reset_env": {
"total": 0.8152601319999997,
"count": 1,
"self": 0.8152601319999997
},
"TrainerController.advance": {
"total": 89.07477324800081,
"count": 14848,
"self": 0.18365643900101247,
"children": {
"env_step": {
"total": 88.8911168089998,
"count": 14848,
"self": 77.19064573999994,
"children": {
"SubprocessEnvManager._take_step": {
"total": 11.51412037099989,
"count": 14848,
"self": 0.47601072899954566,
"children": {
"NNPolicy.evaluate": {
"total": 11.038109642000345,
"count": 13030,
"self": 11.038109642000345
}
}
},
"workers": {
"total": 0.18635069799996806,
"count": 14847,
"self": 0.0,
"children": {
"worker_root": {
"total": 97.55444978000044,
"count": 14847,
"is_parallel": true,
"self": 51.16557370200104,
"children": {
"run_training.setup": {
"total": 0.0,
"count": 0,
"is_parallel": true,
"self": 0.0,
"children": {
"steps_from_proto": {
"total": 0.004443772999999762,
"count": 1,
"is_parallel": true,
"self": 0.0003309589999993534,
"children": {
"_process_vector_observation": {
"total": 0.004112814000000409,
"count": 2,
"is_parallel": true,
"self": 0.004112814000000409
}
}
},
"UnityEnvironment.step": {
"total": 0.017111769000000443,
"count": 1,
"is_parallel": true,
"self": 0.0001352060000003874,
"children": {
"UnityEnvironment._generate_step_input": {
"total": 0.0002027519999998617,
"count": 1,
"is_parallel": true,
"self": 0.0002027519999998617
},
"communicator.exchange": {
"total": 0.016472735999999877,
"count": 1,
"is_parallel": true,
"self": 0.016472735999999877
},
"steps_from_proto": {
"total": 0.00030107500000031706,
"count": 1,
"is_parallel": true,
"self": 0.00010562699999994152,
"children": {
"_process_vector_observation": {
"total": 0.00019544800000037554,
"count": 2,
"is_parallel": true,
"self": 0.00019544800000037554
}
}
}
}
}
}
},
"UnityEnvironment.step": {
"total": 46.3888760779994,
"count": 14846,
"is_parallel": true,
"self": 1.6719174050012242,
"children": {
"UnityEnvironment._generate_step_input": {
"total": 1.6747899139994242,
"count": 14846,
"is_parallel": true,
"self": 1.6747899139994242
},
"communicator.exchange": {
"total": 39.38887078999902,
"count": 14846,
"is_parallel": true,
"self": 39.38887078999902
},
"steps_from_proto": {
"total": 3.653297968999733,
"count": 14846,
"is_parallel": true,
"self": 1.4095413739998186,
"children": {
"_process_vector_observation": {
"total": 2.2437565949999145,
"count": 29692,
"is_parallel": true,
"self": 2.2437565949999145
}
}
}
}
}
}
}
}
}
}
}
}
},
"trainer_threads": {
"total": 7.220700000232227e-05,
"count": 1,
"self": 7.220700000232227e-05,
"children": {
"thread_root": {
"total": 0.0,
"count": 0,
"is_parallel": true,
"self": 0.0,
"children": {
"trainer_advance": {
"total": 95.04654379600397,
"count": 206214,
"is_parallel": true,
"self": 3.6411367380041924,
"children": {
"process_trajectory": {
"total": 59.422005785999794,
"count": 206214,
"is_parallel": true,
"self": 59.422005785999794
},
"_update_policy": {
"total": 31.98340127199998,
"count": 12,
"is_parallel": true,
"self": 13.672576812999822,
"children": {
"PPOOptimizer.update": {
"total": 18.31082445900016,
"count": 6948,
"is_parallel": true,
"self": 18.31082445900016
}
}
}
}
}
}
}
}
},
"TrainerController._save_model": {
"total": 0.213942466000006,
"count": 1,
"self": 0.213942466000006
}
}
}
}
} |
| if t.ident is None: | ||
| # Thread hasn't started, shouldn't ever happen | ||
| return None | ||
| return _thread_timer_stacks.get(t.ident) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Could we use _thread_timer_stacks.get(t.ident, None) with the understanding that _thread_timers_stacks[None] should never work?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
In practice, it's fine. But mypy complained about the types - the dict key is int but Thread.ident can be None if the thread hasn't started yet.
ervteng
left a comment
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Changes LGTM
| except Exception: | ||
| pass | ||
|
|
||
| with hierarchical_timer("trainer_threads") as main_timer_node: |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
would prefer to have this happen behind-the-scenes but can't think of a good way to do it. Doing it in the main thread seems fine.
commit 2548035 Author: Chris Elion <chris.elion@unity3d.com> Date: Wed Apr 29 16:15:06 2020 -0700 set communication to 1.0.0 (#3896) commit aaa288a Author: andrewcoh <54679309+andrewcoh@users.noreply.github.com> Date: Wed Apr 29 15:37:19 2020 -0700 Update mlagents readme with limitations (#3902) commit bd9a394 Author: Chris Elion <chris.elion@unity3d.com> Date: Wed Apr 29 14:55:49 2020 -0700 Fix timers when using multithreading. (#3901) commit 9ab9203 Author: Ervin T <ervin@unity3d.com> Date: Wed Apr 29 13:49:47 2020 -0700 [docs] Edits to training configuration file (#3898)
Proposed change(s)
Fix handling of timers in threaded trainers.
Previously, we'd keep a single TimerStack, which would get completely garbled when threads switched context.
Now we keep one TimerStack per thread, and merge them when shutting down.
Useful links (Github issues, JIRA tickets, ML-Agents forum threads etc.)
Types of change(s)
Checklist
Other comments