## 3.4 CartPoleをQ学習で制御

In [None]:
# 导入所需要的包
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import gym


In [2]:
# 声明动画的绘制函数
# 参考URL http://nbviewer.jupyter.org/github/patrickmineault
# /xcorr-notebooks/blob/master/Render%20OpenAI%20gym%20as%20GIF.ipynb
from JSAnimation.IPython_display import display_animation
from matplotlib import animation
from IPython.display import display


def display_frames_as_gif(frames):
    """Render a sequence of image frames as a looping animation.

    The animation is saved to ``movie_cartpole.mp4`` and then handed to
    JSAnimation's ``display_animation`` so it is shown with controls.

    Args:
        frames: Non-empty, indexable sequence of image arrays. The shape
            of the first frame determines the figure size (72 pixels per
            inch at dpi=72).
    """
    first_frame = frames[0]
    width_in = first_frame.shape[1] / 72.0
    height_in = first_frame.shape[0] / 72.0
    figure = plt.figure(figsize=(width_in, height_in), dpi=72)
    image = plt.imshow(first_frame)
    plt.axis('off')

    def animate(index):
        # Swap the displayed pixels for the frame at this index.
        image.set_data(frames[index])

    anim = animation.FuncAnimation(
        figure, animate, frames=len(frames), interval=50)

    anim.save('movie_cartpole.mp4')  # write the animation out as a video
    display(display_animation(anim, default_mode='loop'))
    

In [3]:
# Parameter settings
ENV = 'CartPole-v0'  # name of the gym task to run
NUM_DIZITIZED = 6  # number of discrete bins each state variable is split into
GAMMA = 0.99  # time discount rate used in the Q-learning update
ETA = 0.5  # learning rate used in the Q-learning update
MAX_STEPS = 200  # maximum number of steps in one episode
NUM_EPISODES = 1000  # maximum number of episodes


### Agent类在初始化函数`__init__`中接收CartPole状态变量的数量和动作类型的数量，并生成Brain类的实例作为自己的大脑。Agent类包括get_action和update_Q_function这两个方法。

In [4]:
class Agent:
    """CartPole agent: the cart carrying the pole.

    The agent owns a ``Brain`` and forwards both learning updates and
    action selection to it.
    """

    def __init__(self, num_states, num_actions):
        """Create the agent and the brain it uses to make decisions.

        Args:
            num_states: Number of state variables, passed on to ``Brain``.
            num_actions: Number of available actions, passed on to ``Brain``.
        """
        self.brain = Brain(num_states, num_actions)

    def update_Q_function(self, observation, action, reward, observation_next):
        """Update the Q-function with one observed transition.

        The four arguments are forwarded unchanged to the brain's
        ``update_Q_table``; nothing is returned.
        """
        brain = self.brain
        brain.update_Q_table(observation, action, reward, observation_next)

    def get_action(self, observation, step):
        """Return the action the brain chooses for ``observation``.

        Args:
            observation: Current observation of the environment.
            step: Counter forwarded as the second argument of the brain's
                ``decide_action``.
        """
        return self.brain.decide_action(observation, step)
    

### 定义Brain类，它是Agent的大脑，包含更新Q表与确定动作的方法。

In [5]:
class Brain:
    """The agent's brain: tabular Q-learning over discretized states."""

    def __init__(self, num_states, num_actions):
        """Create a randomly initialised Q-table.

        Args:
            num_states: Number of state variables in an observation.
            num_actions: Number of actions (CartPole has two: push left or
                push right).
        """
        self.num_actions = num_actions

        # One row per combination of discretized state variables, one column
        # per action; every entry starts as a uniform random value in [0, 1).
        table_shape = (NUM_DIZITIZED ** num_states, num_actions)
        self.q_table = np.random.uniform(low=0, high=1, size=table_shape)

    def bins(self, clip_min, clip_max, num):
        """Return the interior thresholds splitting a range into ``num`` bins.

        The range ``[clip_min, clip_max]`` is divided evenly and the two
        outer edges are dropped, leaving ``num - 1`` thresholds.
        """
        edges = np.linspace(clip_min, clip_max, num + 1)
        return edges[1:-1]

    def digitize_state(self, observation):
        """Convert a continuous observation into a single discrete index.

        Each of the four variables is binned into ``NUM_DIZITIZED`` levels
        and the levels are combined as digits of a base-``NUM_DIZITIZED``
        number, the first variable being the least significant digit.

        Args:
            observation: Four values, unpacked in the order cart position,
                cart velocity, pole angle, pole angular velocity.
        """
        cart_pos, cart_v, pole_angle, pole_v = observation
        # (value, lower clip, upper clip) for each variable, in digit order.
        clipped_ranges = (
            (cart_pos, -2.4, 2.4),
            (cart_v, -3.0, 3.0),
            (pole_angle, -0.5, 0.5),
            (pole_v, -2.0, 2.0),
        )
        index = 0
        for position, (value, low, high) in enumerate(clipped_ranges):
            thresholds = self.bins(low, high, NUM_DIZITIZED)
            level = np.digitize(value, bins=thresholds)
            index += level * (NUM_DIZITIZED ** position)
        return index

    def update_Q_table(self, observation, action, reward, observation_next):
        """Apply one Q-learning update to the Q-table.

        Q(s, a) <- Q(s, a) + ETA * (reward + GAMMA * max_a' Q(s', a') - Q(s, a))
        """
        state = self.digitize_state(observation)  # discretize current state
        state_next = self.digitize_state(observation_next)  # and the next one
        best_next_q = max(self.q_table[state_next])
        current_q = self.q_table[state, action]
        td_error = reward + GAMMA * best_next_q - current_q
        self.q_table[state, action] = current_q + ETA * td_error

    def decide_action(self, observation, episode):
        """Choose an action with an epsilon-greedy policy.

        Epsilon is ``0.5 / (episode + 1)``, so the greedy action is used
        more and more often as ``episode`` grows.
        """
        state = self.digitize_state(observation)
        exploration_rate = 0.5 * (1 / (episode + 1))

        if np.random.uniform(0, 1) < exploration_rate:
            # Explore: pick one of the actions (0 or 1 for CartPole) at random.
            return np.random.choice(self.num_actions)
        # Exploit: take the action with the largest Q-value for this state.
        return np.argmax(self.q_table[state])
    

### 最后定义Environment类。这一次，如果小车连续10次站立195步或更多，说明强化学习成功。成功之后，我们将再运行一次试验以保存动画。

In [6]:
class Environment:
    """Environment class that runs CartPole and trains the agent in it."""

    def __init__(self):
        """Create the gym task and an agent sized to its spaces."""
        env = gym.make(ENV)  # the task to execute
        self.env = env
        num_states = env.observation_space.shape[0]  # number of state variables
        num_actions = env.action_space.n  # number of actions (2 for CartPole)
        self.agent = Agent(num_states, num_actions)

    def run(self):
        """Train until 10 successes in a row, then record one more episode.

        An episode counts as a success when it ends at loop index 195 or
        later. After 10 consecutive successes the next episode is rendered
        frame by frame, passed to ``display_frames_as_gif`` and the loop
        stops. Training also stops after ``NUM_EPISODES`` episodes.
        """
        success_streak = 0  # consecutive successful episodes so far
        record_final_episode = False  # True while running the recorded episode
        recorded_frames = []  # rendered images of the recorded episode

        for episode in range(NUM_EPISODES):
            observation = self.env.reset()

            for step in range(MAX_STEPS):
                if record_final_episode:
                    # Capture the image of every time step of the final run.
                    recorded_frames.append(self.env.render(mode='rgb_array'))

                action = self.agent.get_action(observation, episode)

                # Execute the action; the returned reward and info are not used.
                observation_next, _, done, _ = self.env.step(action)

                # Hand-crafted reward, only non-zero when the episode ends.
                if not done:
                    reward = 0
                elif step < 195:
                    # Fell over part-way: penalise and reset the streak.
                    reward = -1
                    success_streak = 0
                else:
                    # Stayed up until the end: reward and extend the streak.
                    reward = 1
                    success_streak += 1

                # Learn from the transition, then move on to the next state.
                self.agent.update_Q_function(
                    observation, action, reward, observation_next)
                observation = observation_next

                if done:
                    print('{0} Episode: Finished after {1} time steps'.format(
                        episode, step + 1))
                    break

            if record_final_episode:
                # The recorded episode is over: save and show the animation.
                display_frames_as_gif(recorded_frames)
                break

            # After 10 successes in a row, the next episode is the recorded one.
            record_final_episode = success_streak >= 10
            if record_final_episode:
                print('10回合连续成功')
                

In [7]:
# main: build the CartPole environment (which creates its own agent) and run it
cartpole_env = Environment()
cartpole_env.run()


0 Episode: Finished after 11 time steps
1 Episode: Finished after 13 time steps
2 Episode: Finished after 18 time steps
3 Episode: Finished after 111 time steps
4 Episode: Finished after 66 time steps
5 Episode: Finished after 56 time steps
6 Episode: Finished after 32 time steps
7 Episode: Finished after 51 time steps
8 Episode: Finished after 100 time steps
9 Episode: Finished after 20 time steps
10 Episode: Finished after 27 time steps
11 Episode: Finished after 13 time steps
12 Episode: Finished after 30 time steps
13 Episode: Finished after 20 time steps
14 Episode: Finished after 13 time steps
15 Episode: Finished after 15 time steps
16 Episode: Finished after 11 time steps
17 Episode: Finished after 19 time steps
18 Episode: Finished after 12 time steps
19 Episode: Finished after 70 time steps
20 Episode: Finished after 19 time steps
21 Episode: Finished after 18 time steps
22 Episode: Finished after 194 time steps
23 Episode: Finished after 178 time steps
24 Episode: Finished a

196 Episode: Finished after 200 time steps
197 Episode: Finished after 200 time steps
198 Episode: Finished after 200 time steps
199 Episode: Finished after 200 time steps
200 Episode: Finished after 200 time steps
201 Episode: Finished after 200 time steps
10回合连续成功
202 Episode: Finished after 200 time steps


TypeError: a bytes-like object is required, not 'str'