# OpenAI - Gym 指南

## 初始化环境
首先需要安装 gym package，可以通过 pip 或者 conda 等完成。

In [None]:
%pip install gym

调用gym.make()函数，传入一个现存的环境名，可以返回一个该环境的实例。

In [None]:
import numpy as np
from ray.rllib.env.multi_agent_env import MultiAgentEnv

from gym import spaces
from gym.spaces import Box

In [None]:
def make_multi_agent(env_name_or_creator):
    """Wrap a joint env in a ``MultiAgentEnv`` class with per-agent dicts.

    Agent IDs are int numbers starting from 0 (first agent).

    Args:
        env_name_or_creator: Callable that receives the config object and
            returns the underlying env instance. Despite the parameter
            name, this implementation calls the object directly, so a
            plain string specifier is not supported.

    Returns:
        Type[MultiAgentEnv]: New MultiAgentEnv class to be used as env.
            Its constructor takes a config object that is passed unchanged
            to ``env_name_or_creator``.

    The underlying env is expected to provide:
        * ``number_agents``: number of agents (used for the agent IDs).
        * ``observation_space``: indexable with three components; the
          second one must expose ``low`` and ``high`` arrays.
        * ``action_space``: exposed unchanged on the wrapper.
        * ``reset()``: returns a 3-component joint observation whose second
          component is indexable by agent ID.
        * ``step(action_list)``: returns
          ``(joint_obs, joint_reward, done, info)``.

    Examples:
         # >>> ma_env_cls = make_multi_agent(lambda config: MyJointEnv(config))
         # >>> ma_env = ma_env_cls({})
         # >>> obs = ma_env.reset()
         # >>> print(obs)
         # ... {0: (...), 1: (...)}
    """

    class MultiEnv(MultiAgentEnv):
        def __init__(self, config):
            super().__init__()
            self.env = env_name_or_creator(config)
            joint_space = self.env.observation_space
            # Every agent sees the first and third joint components as-is,
            # plus a 1-element integer Box for its own entry of the second.
            self.observation_space = {
                agent_id: spaces.Tuple((
                    joint_space[0],
                    Box(joint_space[1].low[0], joint_space[1].high[0],
                        shape=(1,), dtype=np.int64),
                    joint_space[2],
                ))
                for agent_id in range(self.env.number_agents)
            }

            self.action_space = self.env.action_space

        def _split_obs(self, joint_obs):
            """Turn a joint observation into a dict keyed by agent ID."""
            agents_state = joint_obs[1]
            return {
                agent_id: (
                    joint_obs[0],
                    np.array([agents_state[agent_id]]),
                    np.array(joint_obs[2]),
                )
                for agent_id in range(self.env.number_agents)
            }

        def reset(self):
            """Reset the underlying env and return per-agent observations."""
            return self._split_obs(self.env.reset())

        def step(self, action_dict):
            """Apply the agents' actions as one joint step.

            Args:
                action_dict: Mapping from agent ID to that agent's action.

            Returns:
                Tuple ``(obs, rew, dones, info)``. ``obs`` and ``rew`` are
                keyed by agent ID, every agent receives the same joint
                reward, ``dones`` only carries the ``"__all__"`` flag and
                ``info`` is an empty dict.
            """
            # Order by agent ID so that position i of the joint action
            # belongs to agent i regardless of the dict's insertion order.
            action_list = [action_dict[agent_id]
                           for agent_id in sorted(action_dict)]
            joint_obs_np, joint_reward_np, done, _ = self.env.step(action_list)
            dones = {"__all__": done}
            rew = {agent_id: joint_reward_np
                   for agent_id in range(self.env.number_agents)}
            obs = self._split_obs(joint_obs_np)
            return obs, rew, dones, {}

    return MultiEnv

In [None]:
# `gym` itself has to be imported here: the earlier cell only imports
# `spaces` and `Box` from it, so `gym.make` would raise NameError.
import gym

env = gym.make("CartPole-v0")
obs = env.observation_space
print(obs)

In [None]:
env.reset()  # reset the environment to begin a new episode
env.step(0)  # apply action 0 for a single step

In [None]:
# Take 100 steps with the constant action 0 and print every observation.
obs = env.reset()
for i in range(100):
    # Classic gym API: step returns (observation, reward, done, info).
    obs, reward, done, info = env.step(0)
    print(obs)
    if done:
        # The episode has ended; start a new one before stepping again.
        obs = env.reset()

## MountainCar example

1. **首先查看观测空间和动作空间**  
Box 代表连续空间，观测为浮点型、长度为 2 的一维 np.array（位置和速度）。  
Discrete 代表离散空间，动作为整型数值，可取 0、1、2 三个值。

In [1]:
import gym

# Build the MountainCar environment and report its spaces.
env = gym.make('MountainCar-v0')
observation_space = env.observation_space
action_space = env.action_space

print('观测空间 = {}'.format(observation_space))
print('动作空间 = {}'.format(action_space))
# Lower and upper bounds of the observation space.
print('观测范围 = {} ~ {}'.format(observation_space.low, observation_space.high))
# Number of discrete actions.
print('动作数 = {}'.format(action_space.n))

观测空间 = Box([-1.2  -0.07], [0.6  0.07], (2,), float32)
动作空间 = Discrete(3)
观测范围 = [-1.2  -0.07] ~ [0.6  0.07]
动作数 = 3


2. **接下来实现一个智能体类 - BespokeAgent类**  
智能体往往都是我们自己实现的。

In [2]:
class BespokeAgent:
    """Hand-written, rule-based agent that does not learn."""

    def __init__(self, env):
        """Accept the environment; nothing is read from or stored about it."""

    def decide(self, observation):
        """Choose an action from a ``(position, velocity)`` observation.

        Returns 2 when the velocity lies strictly between a
        position-dependent lower and upper bound, otherwise 0.
        """
        pos, vel = observation
        # The lower bound is the smaller of two polynomial curves in pos.
        quadratic_floor = -0.09 * (pos + 0.25) ** 2 + 0.03
        quartic_floor = 0.3 * (pos + 0.9) ** 4 - 0.008
        lower = min(quadratic_floor, quartic_floor)
        upper = -0.07 * (pos + 0.38) ** 2 + 0.07
        return 2 if lower < vel < upper else 0

    def learn(self, *args):
        """Learning hook; this agent ignores all arguments."""
    
# Create the agent; `env` is passed in, but BespokeAgent.__init__ ignores it.
agent = BespokeAgent(env)


In [3]:
def play_montecarlo(env, agent, render=False, train=False):
    """Play one episode and return the sum of its rewards.

    Args:
        env: Environment whose ``step`` returns
            ``(observation, reward, done, info)``.
        agent: Object providing ``decide(observation)`` and
            ``learn(observation, action, reward, done)``.
        render: If true, call ``env.render()`` before every step.
        train: If true, call ``agent.learn`` after every step.

    Returns:
        Total reward collected over the episode.
    """
    total_reward = 0.
    state = env.reset()  # start a new episode
    finished = False
    while not finished:
        if render:
            env.render()  # the window can be closed later with env.close()
        chosen = agent.decide(state)
        following_state, gain, finished, _ = env.step(chosen)
        total_reward += gain
        if train:
            agent.learn(state, chosen, gain, finished)
        state = following_state
    return total_reward


In [None]:
%pip install pyglet

In [5]:
env.seed(0)  # fix the random seed so the result is reproducible; can usually be removed
try:
    episode_reward = play_montecarlo(env, agent, render=True)
    print('回合奖励 = {}'.format(episode_reward))
finally:
    # Close the render window even if the episode raises part-way through.
    env.close()


回合奖励 = -105.0


In [None]:
%pip install numpy
import numpy as np

In [None]:
# Play 100 episodes without rendering or training and report the mean reward.
n_episodes = 100
episode_rewards = [play_montecarlo(env, agent) for _ in range(n_episodes)]
mean_reward = np.mean(episode_rewards)
print('平均回合奖励 = {}'.format(mean_reward))
