# MountainCar-v0问题描述：
### 介绍：
A car is on a one-dimensional track, positioned between two "mountains". The goal is to drive up the mountain on the right; however, the car's engine is not strong enough to scale the mountain in a single pass. Therefore, the only way to succeed is to drive back and forth to build up momentum.
- reward: -1 for each time step, until the goal position of 0.5 is reached. As with MountainCarContinuous-v0, there is no penalty for climbing the left hill, which acts as a wall once it is reached.

### 动作空间：
- 0: push left
- 1: no push
- 2: push right

### 状态空间：
- position: [-1.2,0.6]
- velocity: [-0.07,0.07]
- final state: position >= 0.5 or 200 iterations
- starting state: Random position from -0.6 to -0.4 with no velocity

In [1]:
import gym
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

In [2]:
# Create the MountainCar-v0 environment and reset it to get an initial observation.
env = gym.make("MountainCar-v0")
env.reset()

array([-0.46205972,  0.        ])

## 状态最大值

In [3]:
# Display the upper bound of each observation component.
env.observation_space.high

array([0.6 , 0.07], dtype=float32)

## 状态最小值

In [4]:
# Display the lower bound of each observation component.
env.observation_space.low

array([-1.2 , -0.07], dtype=float32)

## 初始化Q表

In [5]:
# Number of buckets each observation dimension is split into for the Q-table.
Q_TABLE_LEN = 20

In [6]:
# One axis of Q_TABLE_LEN buckets for every component of the observation.
q_table_shape = [Q_TABLE_LEN for _ in env.observation_space.high]
q_table_shape

[20, 20]

In [7]:
# Append a trailing axis holding one Q-value per discrete action.
q_table_shape = [*q_table_shape, env.action_space.n]

In [8]:
# Start with every Q-value at zero; the last axis indexes the action.
q_table = np.zeros(shape=q_table_shape)
q_table.shape

(20, 20, 3)

## 根据state定位Q表索引

In [9]:
def get_Q_index_by_state(state):
    """Map a continuous observation to its bucket indices in the Q-table.

    Each observation dimension is split into ``Q_TABLE_LEN`` equally wide
    buckets spanning ``env.observation_space.low`` to
    ``env.observation_space.high``.

    Args:
        state: Observation array with one entry per observation dimension.

    Returns:
        A tuple of integer bucket indices, one per observation dimension,
        each within ``[0, Q_TABLE_LEN - 1]``.
    """
    low = env.observation_space.low
    bucket_width = (env.observation_space.high - low) / Q_TABLE_LEN
    raw_index = ((state - low) // bucket_width).astype(int)
    # A value sitting exactly on the upper bound floor-divides to
    # Q_TABLE_LEN, which is one past the last valid bucket. Clamp so
    # boundary observations fall into the edge buckets instead of raising
    # an IndexError when the Q-table is indexed.
    return tuple(np.clip(raw_index, 0, Q_TABLE_LEN - 1))

## 根据epsilon-greedy策略获取动作

In [10]:
def get_max_q_action(q_index):
    """Return the action with the highest Q-value for a discretized state.

    Args:
        q_index: Sequence whose first two entries are the position and
            velocity bucket indices into ``q_table``.

    Returns:
        Index of the largest entry in that state's action-value vector.
    """
    pos_bucket, vel_bucket = q_index[0], q_index[1]
    action_values = q_table[pos_bucket, vel_bucket]
    return np.argmax(action_values)

In [11]:
def get_action(state, epsilon):
    """Choose an action for ``state`` with an epsilon-greedy policy.

    Args:
        state: Current observation from the environment.
        epsilon: Probability threshold for taking a uniformly random action.

    Returns:
        A random action index when the drawn sample is below ``epsilon``,
        otherwise the action with the highest Q-value for ``state``.
    """
    q_index = get_Q_index_by_state(state)
    explore = np.random.random() < epsilon
    if not explore:
        # Exploit: follow the current Q-table.
        return get_max_q_action(q_index)
    # Explore: sample uniformly over all actions.
    return np.random.randint(0, env.action_space.n)

# 开始训练

## Q value更新：
$$
Q(S_t, A_t) \leftarrow Q(S_t, A_t) + \alpha \left(R + \gamma \max_{a'} Q(S_{t+1}, a') - Q(S_t, A_t)\right)
$$

## 定义训练参数

In [12]:
def get_max_q_value(q_index):
    """Return the largest Q-value stored for a discretized state.

    Args:
        q_index: Sequence whose first two entries are the position and
            velocity bucket indices into ``q_table``.

    Returns:
        The maximum over that state's action-value vector.
    """
    pos_bucket, vel_bucket = q_index[0], q_index[1]
    action_values = q_table[pos_bucket, vel_bucket]
    return np.max(action_values)

In [13]:
def save_q_table(q_table, episode):
    """Write a Q-table snapshot to ``./Q_Tabel_<episode>.csv``.

    The table is flattened to two dimensions: one row per discretized state
    and one column per action. Columns are named after the action index
    ("0", "1", ...).

    Args:
        q_table: Array whose last axis indexes the actions.
        episode: Value substituted into the output file name.
    """
    # Derive the layout from the array itself so the snapshot keeps working
    # when the bucket count or the number of actions changes.
    n_actions = q_table.shape[-1]
    df = pd.DataFrame(q_table.reshape(-1, n_actions))
    df.columns = [str(action) for action in range(n_actions)]
    df.to_csv("./Q_Tabel_{}.csv".format(episode), index=False, header=True, sep=",", encoding='utf-8_sig')

In [14]:
# Learning rate and discount factor of the Q-value update.
alpha = 0.02
gamma = 0.95

# Number of training episodes, and how often (in episodes) a snapshot is saved.
episodes = 50000
record_times = 2000

# Total reward collected in each episode, appended during training.
ep_rewards = []

# Exploration starts fully random and is reduced by a fixed step per episode,
# sized so that it reaches zero about halfway through training.
epsilon = 1
epsilon_step = epsilon / (episodes // 2 - 1)

In [15]:
---

SyntaxError: invalid syntax (<ipython-input-15-29e0c3615294>, line 1)

In [None]:
# Tabular Q-learning: run `episodes` episodes, updating q_table after each step.
for episode in range(episodes):
    ep_reward = 0
    state = env.reset()
    done = False
    while not done:
        # Discretize the current state once per step and reuse the index
        # for every Q-table access below.
        state_index = get_Q_index_by_state(state)
        action = get_action(state, epsilon)
        next_state, reward, done, _ = env.step(action)
        ep_reward += reward
        if not done:
            # Non-terminal step: move Q(s, a) toward the one-step TD target.
            td_target = reward + gamma * get_max_q_value(get_Q_index_by_state(next_state))
            q_table[state_index][action] += alpha * (td_target - q_table[state_index][action])
        elif next_state[0] >= 0.5:
            # Episode ended at the goal position: pin this state-action value to 0.
            q_table[state_index][action] = 0
        # An episode that ends without reaching the goal leaves its last
        # transition without an update.
        state = next_state
    print("[episode={},reward={}]".format(episode, ep_reward))
    if episode % record_times == 0:
        save_q_table(q_table, episode)
    # Linear decay, clamped so epsilon stays a valid probability once it
    # reaches zero instead of drifting negative for the rest of training.
    epsilon = max(0.0, epsilon - epsilon_step)
    ep_rewards.append(ep_reward)

## 保存最终Q表

In [None]:
# Persist the fully trained table as ./Q_Tabel_Final.csv via the shared helper.
save_q_table(q_table, "Final")

## 保存训练时每个episode的总reward

In [None]:
# Plot the total reward collected in each training episode.
plt.plot(ep_rewards)
plt.xlabel("episode")
plt.ylabel("total reward")
plt.show()

# 结果测试

In [16]:
# Load a saved snapshot and restore the (position, velocity, action) layout.
q_table_data = pd.read_csv("./Q_Tabel_48000.csv")
q_table_tmp = q_table_data.values.reshape(20, 20, 3)

In [17]:
# Run one rendered episode that always takes the greedy action from the
# loaded Q-table, recording the chosen actions.
done = False
state = env.reset()
actions = []
try:
    while not done:
        action = np.argmax(q_table_tmp[get_Q_index_by_state(state)])
        actions.append(action)
        next_state, _, done, _ = env.step(action)
        state = next_state
        env.render()
finally:
    # Close the environment even if the rollout raises part-way through.
    env.close()

In [None]:
import pandas as pd

In [None]:
# Persist the current table as ./Q_Tabel_Final.csv via the shared helper.
save_q_table(q_table, "Final")

In [None]:
# Display the actions recorded during the greedy rollout.
actions

In [None]:
# NOTE(review): `a` is not defined anywhere in the visible notebook, so this
# cell would raise NameError as written — confirm which array was intended.
a.reshape([20,20,3])