# MountainCar-v0问题描述：
### 介绍：
A car is on a one-dimensional track, positioned between two "mountains". The goal is to drive up the mountain on the right; however, the car's engine is not strong enough to scale the mountain in a single pass. Therefore, the only way to succeed is to drive back and forth to build up momentum.
- reward: -1 for each time step until the goal position of 0.5 is reached. As with MountainCarContinuous-v0, there is no penalty for climbing the left hill, which acts as a wall once reached.

### 动作空间：
- 0: push left
- 1: no push
- 2: push right

### 状态空间：
- position: [-1.2,0.6]
- velocity: [-0.07,0.07]
- final state: position >= 0.5 (goal reached) or 200 time steps elapsed
- starting state: Random position from -0.6 to -0.4 with no velocity

In [1]:
import gym
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

In [2]:
# Create the MountainCar-v0 environment and reset it; the reset call returns
# the initial observation, which the notebook displays as the cell output.
env = gym.make("MountainCar-v0")
env.reset()

array([-0.51629599,  0.        ])

## 状态最大值

In [3]:
# Upper bound of each observation component (position, velocity).
env.observation_space.high

array([0.6 , 0.07], dtype=float32)

## 状态最小值

In [4]:
# Lower bound of each observation component (position, velocity).
env.observation_space.low

array([-1.2 , -0.07], dtype=float32)

## 初始化Q表

In [5]:
# Number of discrete buckets used for each observation dimension of the Q-table.
Q_TABLE_LEN = 20

In [6]:
# One axis of Q_TABLE_LEN buckets per observation dimension.
q_table_shape = [Q_TABLE_LEN] * len(env.observation_space.high)
q_table_shape

[20, 20]

In [7]:
# Append a trailing axis holding one Q-value per discrete action.
# NOTE: re-running this cell appends the action axis again; run it once only.
q_table_shape = q_table_shape + [env.action_space.n]

In [8]:
# Allocate the Q-table with every state-action value initialised to zero.
q_table = np.zeros(q_table_shape)
q_table.shape

(20, 20, 3)

## 根据state定位Q表索引

In [9]:
def get_Q_index_by_state(state, low=None, high=None, bins=None):
    """Map a continuous observation to its discrete Q-table bucket index.

    Each observation dimension is split into ``bins`` equal-width buckets
    between ``low`` and ``high``.  The resulting indices are clipped to
    ``[0, bins - 1]`` so that an observation lying exactly on the upper
    bound (or slightly outside the bounds) still addresses a valid cell
    instead of raising ``IndexError`` or wrapping around via a negative index.

    Args:
        state: Observation vector, one entry per observation dimension.
        low: Per-dimension lower bounds. Defaults to
            ``env.observation_space.low``.
        high: Per-dimension upper bounds. Defaults to
            ``env.observation_space.high``.
        bins: Number of buckets per dimension. Defaults to ``Q_TABLE_LEN``.

    Returns:
        A tuple of integer bucket indices, one per observation dimension.
    """
    if low is None:
        low = env.observation_space.low
    if high is None:
        high = env.observation_space.high
    if bins is None:
        bins = Q_TABLE_LEN

    low = np.asarray(low)
    bucket_width = (np.asarray(high) - low) / bins
    raw_index = (np.asarray(state) - low) // bucket_width
    # state == high would otherwise land in bucket `bins`, one past the end.
    return tuple(np.clip(raw_index, 0, bins - 1).astype(int))

## 根据epsilon-greedy策略获取动作

In [10]:
def get_max_q_action(q_index):
    """Return the greedy action for a discretised state.

    Args:
        q_index: Bucket indices; element 0 is the position bucket and
            element 1 is the velocity bucket.

    Returns:
        The index of the largest Q-value stored for that state.
    """
    action_values = q_table[q_index[0]][q_index[1]]
    return np.argmax(action_values)

In [11]:
def get_action(state, epsilon):
    """Choose an action with an epsilon-greedy policy.

    Args:
        state: Current continuous observation.
        epsilon: Probability of picking a uniformly random action.

    Returns:
        The chosen action index.
    """
    q_index = get_Q_index_by_state(state)
    explore = np.random.random() < epsilon
    if not explore:
        # Exploit: take the action with the highest Q-value for this state.
        return get_max_q_action(q_index)
    # Explore: sample one of the available actions uniformly at random.
    return np.random.randint(0, env.action_space.n)

# 开始训练

## Q value更新：
$$
Q(S_t, A_t) \leftarrow Q(S_t, A_t) + \alpha \left( R + \gamma \max_{a'} Q(S_{t+1}, a') - Q(S_t, A_t) \right)
$$

## 定义训练参数

In [12]:
def get_max_q_value(q_index):
    """Return the largest Q-value stored for a discretised state.

    Args:
        q_index: Bucket indices; element 0 is the position bucket and
            element 1 is the velocity bucket.

    Returns:
        The maximum over all actions of the Q-values for that state.
    """
    action_values = q_table[q_index[0]][q_index[1]]
    return np.max(action_values)

In [1]:
def save_q_table(q_table, episode):
    """Write a snapshot of the Q-table to ``./Q_Tabel_<episode>.csv``.

    The table is flattened to two dimensions: one row per discretised state
    and one column per action.  The row count and the column names are derived
    from the array itself, so the function keeps working when the number of
    buckets or actions changes.

    Args:
        q_table: Array whose last axis indexes the actions.
        episode: Value inserted into the output file name.
    """
    n_actions = q_table.shape[-1]
    df = pd.DataFrame(q_table.reshape(-1, n_actions))
    # Column headers are the action indices as strings: "0", "1", "2", ...
    df.columns = [str(action) for action in range(n_actions)]
    df.to_csv("./Q_Tabel_{}.csv".format(episode), index=False, header=True, sep=",", encoding='utf-8_sig')

In [13]:
# Q-learning hyperparameters.
episodes = 50000                  # total number of training episodes
alpha = 0.02                      # learning rate
gamma = 0.95                      # discount factor
record_total_reward_times = 2000  # a Q-table snapshot is saved every this many episodes

# Exploration schedule: epsilon starts at 1 and is reduced by a fixed step
# after every episode, reaching zero about halfway through training.
epsilon = 1
epsilon_step = epsilon / (episodes // 2 - 1)

# Total reward collected in each episode, in order.
ep_rewards = []

In [14]:
# Tabular Q-learning training loop.
#
# For every step the agent picks an epsilon-greedy action, observes the
# transition and applies the one-step Q-learning update.  Reaching the goal
# (position >= 0.5) is the only true terminal state; an episode that merely
# hits the step limit is truncated, so its last transition still bootstraps
# from the next state instead of being silently skipped.
for episode in range(episodes):
    ep_reward = 0
    state = env.reset()
    done = False
    while not done:
        action = get_action(state, epsilon)
        next_state, reward, done, _ = env.step(action)
        ep_reward += reward

        # Discretise the current state once and reuse the index below.
        state_index = get_Q_index_by_state(state)
        reached_goal = done and next_state[0] >= 0.5
        if reached_goal:
            # Terminal transition: no future return, pin the value to 0.
            q_table[state_index][action] = 0
        else:
            td_target = reward + gamma * get_max_q_value(get_Q_index_by_state(next_state))
            q_table[state_index][action] += alpha * (td_target - q_table[state_index][action])
        state = next_state
    print("[episode={},reward={}]".format(episode, ep_reward))
    if episode % record_total_reward_times == 0:
        save_q_table(q_table, episode)

    # Linear decay, floored at zero so epsilon stays a valid probability.
    epsilon = max(0.0, epsilon - epsilon_step)
    ep_rewards.append(ep_reward)

[episode=0,reward=-200.0]
[episode=1,reward=-200.0]
[episode=2,reward=-200.0]
[episode=3,reward=-200.0]
[episode=4,reward=-200.0]
[episode=5,reward=-200.0]
[episode=6,reward=-200.0]
[episode=7,reward=-200.0]
[episode=8,reward=-200.0]
[episode=9,reward=-200.0]
[episode=10,reward=-200.0]
[episode=11,reward=-200.0]
[episode=12,reward=-200.0]
[episode=13,reward=-200.0]
[episode=14,reward=-200.0]
[episode=15,reward=-200.0]
[episode=16,reward=-200.0]
[episode=17,reward=-200.0]
[episode=18,reward=-200.0]
[episode=19,reward=-200.0]
[episode=20,reward=-200.0]
[episode=21,reward=-200.0]
[episode=22,reward=-200.0]
[episode=23,reward=-200.0]
[episode=24,reward=-200.0]
[episode=25,reward=-200.0]
[episode=26,reward=-200.0]
[episode=27,reward=-200.0]
[episode=28,reward=-200.0]
[episode=29,reward=-200.0]
[episode=30,reward=-200.0]
[episode=31,reward=-200.0]
[episode=32,reward=-200.0]
[episode=33,reward=-200.0]
[episode=34,reward=-200.0]
[episode=35,reward=-200.0]
[episode=36,reward=-200.0]
[episode=37

[episode=304,reward=-200.0]
[episode=305,reward=-200.0]
[episode=306,reward=-200.0]
[episode=307,reward=-200.0]
[episode=308,reward=-200.0]
[episode=309,reward=-200.0]
[episode=310,reward=-200.0]
[episode=311,reward=-200.0]
[episode=312,reward=-200.0]
[episode=313,reward=-200.0]
[episode=314,reward=-200.0]
[episode=315,reward=-200.0]
[episode=316,reward=-200.0]
[episode=317,reward=-200.0]
[episode=318,reward=-200.0]
[episode=319,reward=-200.0]
[episode=320,reward=-200.0]
[episode=321,reward=-200.0]
[episode=322,reward=-200.0]
[episode=323,reward=-200.0]
[episode=324,reward=-200.0]
[episode=325,reward=-200.0]
[episode=326,reward=-200.0]
[episode=327,reward=-200.0]
[episode=328,reward=-200.0]
[episode=329,reward=-200.0]
[episode=330,reward=-200.0]
[episode=331,reward=-200.0]
[episode=332,reward=-200.0]
[episode=333,reward=-200.0]
[episode=334,reward=-200.0]
[episode=335,reward=-200.0]
[episode=336,reward=-200.0]
[episode=337,reward=-200.0]
[episode=338,reward=-200.0]
[episode=339,reward=

[episode=598,reward=-200.0]
[episode=599,reward=-200.0]
[episode=600,reward=-200.0]
[episode=601,reward=-200.0]
[episode=602,reward=-200.0]
[episode=603,reward=-200.0]
[episode=604,reward=-200.0]
[episode=605,reward=-200.0]
[episode=606,reward=-200.0]
[episode=607,reward=-200.0]
[episode=608,reward=-200.0]
[episode=609,reward=-200.0]
[episode=610,reward=-200.0]
[episode=611,reward=-200.0]
[episode=612,reward=-200.0]
[episode=613,reward=-200.0]
[episode=614,reward=-200.0]
[episode=615,reward=-200.0]
[episode=616,reward=-200.0]
[episode=617,reward=-200.0]
[episode=618,reward=-200.0]
[episode=619,reward=-200.0]
[episode=620,reward=-200.0]
[episode=621,reward=-200.0]
[episode=622,reward=-200.0]
[episode=623,reward=-200.0]
[episode=624,reward=-200.0]
[episode=625,reward=-200.0]
[episode=626,reward=-200.0]
[episode=627,reward=-200.0]
[episode=628,reward=-200.0]
[episode=629,reward=-200.0]
[episode=630,reward=-200.0]
[episode=631,reward=-200.0]
[episode=632,reward=-200.0]
[episode=633,reward=

[episode=895,reward=-200.0]
[episode=896,reward=-200.0]
[episode=897,reward=-200.0]
[episode=898,reward=-200.0]
[episode=899,reward=-200.0]
[episode=900,reward=-200.0]
[episode=901,reward=-200.0]
[episode=902,reward=-200.0]
[episode=903,reward=-200.0]
[episode=904,reward=-200.0]
[episode=905,reward=-200.0]
[episode=906,reward=-200.0]
[episode=907,reward=-200.0]
[episode=908,reward=-200.0]
[episode=909,reward=-200.0]
[episode=910,reward=-200.0]
[episode=911,reward=-200.0]
[episode=912,reward=-200.0]
[episode=913,reward=-200.0]
[episode=914,reward=-200.0]
[episode=915,reward=-200.0]
[episode=916,reward=-200.0]
[episode=917,reward=-200.0]
[episode=918,reward=-200.0]
[episode=919,reward=-200.0]
[episode=920,reward=-200.0]
[episode=921,reward=-200.0]
[episode=922,reward=-200.0]
[episode=923,reward=-200.0]
[episode=924,reward=-200.0]
[episode=925,reward=-200.0]
[episode=926,reward=-200.0]
[episode=927,reward=-200.0]
[episode=928,reward=-200.0]
[episode=929,reward=-200.0]
[episode=930,reward=

[episode=1188,reward=-200.0]
[episode=1189,reward=-200.0]
[episode=1190,reward=-200.0]
[episode=1191,reward=-200.0]
[episode=1192,reward=-200.0]
[episode=1193,reward=-200.0]
[episode=1194,reward=-200.0]
[episode=1195,reward=-200.0]
[episode=1196,reward=-200.0]
[episode=1197,reward=-200.0]
[episode=1198,reward=-200.0]
[episode=1199,reward=-200.0]
[episode=1200,reward=-200.0]
[episode=1201,reward=-200.0]
[episode=1202,reward=-200.0]
[episode=1203,reward=-200.0]
[episode=1204,reward=-200.0]
[episode=1205,reward=-200.0]
[episode=1206,reward=-200.0]
[episode=1207,reward=-200.0]
[episode=1208,reward=-200.0]
[episode=1209,reward=-200.0]
[episode=1210,reward=-200.0]
[episode=1211,reward=-200.0]
[episode=1212,reward=-200.0]
[episode=1213,reward=-200.0]
[episode=1214,reward=-200.0]
[episode=1215,reward=-200.0]
[episode=1216,reward=-200.0]
[episode=1217,reward=-200.0]
[episode=1218,reward=-200.0]
[episode=1219,reward=-200.0]
[episode=1220,reward=-200.0]
[episode=1221,reward=-200.0]
[episode=1222,

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



[episode=1451,reward=-200.0]
[episode=1452,reward=-200.0]
[episode=1453,reward=-200.0]
[episode=1454,reward=-200.0]
[episode=1455,reward=-200.0]
[episode=1456,reward=-200.0]



KeyboardInterrupt



In [None]:
-----

In [None]:
# Plot the total reward of each training episode, in episode order.
plt.plot(ep_rewards)

In [None]:
# Load the saved Q-table from CSV and restore its multi-dimensional layout:
# one axis of Q_TABLE_LEN buckets per observation dimension plus a trailing
# action axis.  The shape is derived from the discretisation settings instead
# of being hard-coded, so it stays in sync when Q_TABLE_LEN changes.
q_table_data = pd.read_csv("./Q_Tabel_Final.csv")
q_table_tmp = q_table_data.values
q_table_tmp = q_table_tmp.reshape(
    [Q_TABLE_LEN] * len(env.observation_space.high) + [env.action_space.n]
)

In [None]:
# Run one episode with the loaded Q-table, always taking the greedy action,
# and render the environment after every step.
state = env.reset()
done = False
while not done:
    action = np.argmax(q_table_tmp[get_Q_index_by_state(state)])
    state, _, done, _ = env.step(action)
    env.render()
env.close()

In [None]:
import pandas as pd

In [None]:
# Persist the trained Q-table as ./Q_Tabel_Final.csv by reusing the snapshot
# helper, so the file layout always matches the per-episode snapshots.
save_q_table(q_table, "Final")

In [None]:
# IPython shell escape: list the working directory (presumably to check the
# exported CSV files).
!ls

In [None]:
# NOTE(review): `a` is not defined anywhere in the visible notebook; this looks
# like a leftover scratch cell -- confirm, then remove it or define `a`.
a.reshape([20,20,3])