diff --git a/example/gym_client.cpp b/example/gym_client.cpp
index 86e65bc..beef12c 100644
--- a/example/gym_client.cpp
+++ b/example/gym_client.cpp
@@ -19,15 +19,15 @@ using namespace cpprl;
 // Algorithm hyperparameters
 const int batch_size = 5;
 const float discount_factor = 0.99;
-const float entropy_coef = 1e-4;
+const float entropy_coef = 1e-3;
 const float learning_rate = 1e-3;
-const int reward_average_window_size = 100;
+const int reward_average_window_size = 10;
 const float value_loss_coef = 0.5;
 
 // Environment hyperparameters
 const std::string env_name = "LunarLander-v2";
 const int num_envs = 8;
-const float env_gamma = -1; // Set to -1 to disable
+const float env_gamma = discount_factor; // Set to -1 to disable
 
 // Model hyperparameters
 const int actions = 4;
@@ -83,64 +83,64 @@ int main(int argc, char *argv[])
 
     storage.set_first_observation(observation);
 
-    std::ifstream weights_file{"/home/px046/prog/pytorch-cpp-rl/build/weights.json"};
-    auto json = nlohmann::json::parse(weights_file);
-    for (const auto &parameter : json.items())
-    {
-        if (base->named_parameters().contains(parameter.key()))
-        {
-            std::vector<int64_t> tensor_size = parameter.value()[0];
-            std::vector<float> parameter_vec;
-            if (parameter.key().find("bias") == std::string::npos)
-            {
-                std::vector<std::vector<float>> parameter_2d_vec = parameter.value()[1].get<std::vector<std::vector<float>>>();
-                parameter_vec = flatten_2d_vector(parameter_2d_vec);
-            }
-            else
-            {
-                parameter_vec = parameter.value()[1].get<std::vector<float>>();
-            }
-            NoGradGuard guard;
-            auto json_weights = torch::from_blob(parameter_vec.data(), tensor_size).contiguous();
-            base->named_parameters()[parameter.key()].copy_(json_weights);
-            spdlog::info("Wrote {}", parameter.key());
-            if (parameter.key().find("bias") == std::string::npos)
-            {
-                spdlog::info("Json: {} - Memory: {}", parameter.value()[1][0][0], base->named_parameters()[parameter.key()][0][0].item().toFloat());
-            }
-        }
-        else if (policy->named_modules()["output"]->named_parameters().contains(parameter.key()))
-        {
-            std::vector<int64_t> tensor_size = parameter.value()[0];
-            std::vector<float> parameter_vec;
-            if (parameter.key().find("bias") == std::string::npos)
-            {
-                std::vector<std::vector<float>> parameter_2d_vec = parameter.value()[1].get<std::vector<std::vector<float>>>();
-                parameter_vec = flatten_2d_vector(parameter_2d_vec);
-            }
-            else
-            {
-                parameter_vec = parameter.value()[1].get<std::vector<float>>();
-            }
-            NoGradGuard guard;
-            auto json_weights = torch::from_blob(parameter_vec.data(), tensor_size).contiguous();
-            policy->named_modules()["output"]->named_parameters()[parameter.key()].copy_(json_weights);
-            spdlog::info("Wrote {}", parameter.key());
-            if (parameter.key().find("bias") == std::string::npos)
-            {
-                spdlog::info("Json: {} - Memory: {}",
-                             parameter.value()[1][0][0],
-                             policy->named_modules()["output"]->named_parameters()[parameter.key()][0][0].item().toFloat());
-            }
-        }
-    }
+    // std::ifstream weights_file{"/home/px046/prog/pytorch-cpp-rl/build/weights.json"};
+    // auto json = nlohmann::json::parse(weights_file);
+    // for (const auto &parameter : json.items())
+    // {
+    //     if (base->named_parameters().contains(parameter.key()))
+    //     {
+    //         std::vector<int64_t> tensor_size = parameter.value()[0];
+    //         std::vector<float> parameter_vec;
+    //         if (parameter.key().find("bias") == std::string::npos)
+    //         {
+    //             std::vector<std::vector<float>> parameter_2d_vec = parameter.value()[1].get<std::vector<std::vector<float>>>();
+    //             parameter_vec = flatten_2d_vector(parameter_2d_vec);
+    //         }
+    //         else
+    //         {
+    //             parameter_vec = parameter.value()[1].get<std::vector<float>>();
+    //         }
+    //         NoGradGuard guard;
+    //         auto json_weights = torch::from_blob(parameter_vec.data(), tensor_size).contiguous();
+    //         base->named_parameters()[parameter.key()].copy_(json_weights);
+    //         spdlog::info("Wrote {}", parameter.key());
+    //         if (parameter.key().find("bias") == std::string::npos)
+    //         {
+    //             spdlog::info("Json: {} - Memory: {}", parameter.value()[1][0][0], base->named_parameters()[parameter.key()][0][0].item().toFloat());
+    //         }
+    //     }
+    //     else if (policy->named_modules()["output"]->named_parameters().contains(parameter.key()))
+    //     {
+    //         std::vector<int64_t> tensor_size = parameter.value()[0];
+    //         std::vector<float> parameter_vec;
+    //         if (parameter.key().find("bias") == std::string::npos)
+    //         {
+    //             std::vector<std::vector<float>> parameter_2d_vec = parameter.value()[1].get<std::vector<std::vector<float>>>();
+    //             parameter_vec = flatten_2d_vector(parameter_2d_vec);
+    //         }
+    //         else
+    //         {
+    //             parameter_vec = parameter.value()[1].get<std::vector<float>>();
+    //         }
+    //         NoGradGuard guard;
+    //         auto json_weights = torch::from_blob(parameter_vec.data(), tensor_size).contiguous();
+    //         policy->named_modules()["output"]->named_parameters()[parameter.key()].copy_(json_weights);
+    //         spdlog::info("Wrote {}", parameter.key());
+    //         if (parameter.key().find("bias") == std::string::npos)
+    //         {
+    //             spdlog::info("Json: {} - Memory: {}",
+    //                          parameter.value()[1][0][0],
+    //                          policy->named_modules()["output"]->named_parameters()[parameter.key()][0][0].item().toFloat());
+    //         }
+    //     }
+    // }
 
     std::vector<float> running_rewards(num_envs);
     int episode_count = 0;
     std::vector<float> reward_history(reward_average_window_size);
 
     torch::manual_seed(0);
-    for (int update = 0; update < 1; ++update)
+    for (int update = 0; update < 100000; ++update)
     {
         for (int step = 0; step < batch_size; ++step)
         {
@@ -167,9 +167,10 @@ int main(int argc, char *argv[])
             observation_vec = flatten_2d_vector(step_result->observation);
             observation = torch::from_blob(observation_vec.data(), {num_envs, observation_size});
             auto rewards = flatten_2d_vector(step_result->reward);
+            auto real_rewards = flatten_2d_vector(step_result->real_reward);
             for (int i = 0; i < num_envs; ++i)
             {
-                running_rewards[i] += rewards[i];
+                running_rewards[i] += real_rewards[i];
                 if (step_result->done[i][0])
                 {
                     reward_history[episode_count % reward_average_window_size] = running_rewards[i];
diff --git a/example/requests.h b/example/requests.h
index 465f3ea..1384098 100644
--- a/example/requests.h
+++ b/example/requests.h
@@ -54,6 +54,7 @@ struct StepResponse
     std::vector<std::vector<float>> observation;
     std::vector<std::vector<float>> reward;
     std::vector<std::vector<bool>> done;
-    MSGPACK_DEFINE_MAP(observation, reward, done);
+    std::vector<std::vector<float>> real_reward;
+    MSGPACK_DEFINE_MAP(observation, reward, done, real_reward);
 };
 }
diff --git a/gym_server/envs.py b/gym_server/envs.py
index 9ee06f9..c4d9ba6 100644
--- a/gym_server/envs.py
+++ b/gym_server/envs.py
@@ -40,9 +40,7 @@ def _obfilt(self, obs):
             obs = np.clip((obs - self.ob_rms.mean) /
                           np.sqrt(self.ob_rms.var + self.epsilon),
                           -self.clipob, self.clipob)
-            return obs
-        else:
-            return obs
+        return obs
 
     def train(self):
         self.training = True
@@ -50,6 +48,19 @@ def train(self):
 
     def eval(self):
         self.training = False
 
+    def step_wait(self):
+        obs, rews, news, infos = self.venv.step_wait()
+        infos = {'reward': np.expand_dims(rews, -1)}
+        self.ret = self.ret * self.gamma + rews
+        obs = self._obfilt(obs)
+        if self.ret_rms:
+            self.ret_rms.update(self.ret)
+            rews = np.clip(rews / np.sqrt(self.ret_rms.var + self.epsilon),
+                           -self.cliprew,
+                           self.cliprew)
+        self.ret[news] = 0.
+        return obs, rews, news, infos
+
 
 def make_env(env_id, seed, rank):
     def _thunk():
diff --git a/gym_server/messages.py b/gym_server/messages.py
index 088f03e..6050751 100644
--- a/gym_server/messages.py
+++ b/gym_server/messages.py
@@ -53,15 +53,18 @@ class StepMessage(Message):
     def __init__(self,
                  observation: np.ndarray,
                  reward: np.ndarray,
-                 done: np.ndarray):
+                 done: np.ndarray,
+                 real_reward: np.ndarray):
         self.observation = observation
         self.reward = reward
         self.done = done
+        self.real_reward = real_reward
 
     def to_msg(self) -> bytes:
         request = {
             "observation": self.observation.tolist(),
             "reward": self.reward.tolist(),
-            "done": self.done.tolist()
+            "done": self.done.tolist(),
+            "real_reward": self.real_reward.tolist()
         }
         return msgpack.packb(request)
diff --git a/gym_server/server.py b/gym_server/server.py
index 5bb054f..9521fd2 100644
--- a/gym_server/server.py
+++ b/gym_server/server.py
@@ -38,26 +38,28 @@ def serve(self):
     def _serve(self):
         while True:
             request = self.zmq_client.receive()
-            method = request["method"]
-            param = request["param"]
+            method = request['method']
+            param = request['param']
 
-            if method == "make":
-                self.__make(param["env_name"], param["num_envs"],
-                            param["gamma"])
+            if method == 'make':
+                self.__make(param['env_name'], param['num_envs'],
+                            param['gamma'])
                 self.zmq_client.send(MakeMessage())
-            elif method == "reset":
+            elif method == 'reset':
                 observation = self.__reset()
                 self.zmq_client.send(ResetMessage(observation))
-            elif method == "step":
-                if "render" in param:
+            elif method == 'step':
+                if 'render' in param:
                     result = self.__step(
-                        np.array(param["actions"]), param["render"])
+                        np.array(param['actions']), param['render'])
                 else:
-                    result = self.__step(np.array(param["actions"]))
-                self.zmq_client.send(StepMessage(result[0], result[1],
-                                                 result[2]))
+                    result = self.__step(np.array(param['actions']))
+                self.zmq_client.send(StepMessage(result[0],
+                                                 result[1],
+                                                 result[2],
+                                                 result[3]['reward']))
 
     def make(self, env_name, num_envs, gamma):
         """
diff --git a/gym_server/zmq_client.py b/gym_server/zmq_client.py
index 3dd53b6..1940cb9 100644
--- a/gym_server/zmq_client.py
+++ b/gym_server/zmq_client.py
@@ -1,12 +1,9 @@
 """
 Pytorch-cpp-rl OpenAI gym server ZMQ client.
 """
-import logging
 import zmq
 import msgpack
 
-from gym_server.messages import Message
-
 
 class ZmqClient:
     """
diff --git a/gym_server/main.py b/launch_gym_server.py
similarity index 100%
rename from gym_server/main.py
rename to launch_gym_server.py