In [2]:
import gym
import numpy as np
import matplotlib.pyplot as plt

In [23]:
# Shared setup for the experiments in this notebook.
n_games = 1000            # number of episodes to play
scores, win_pct = [], []  # per-episode scores / periodically sampled mean score
env = gym.make('FrozenLake-v1')

In [32]:
# List every action id, then draw one random action as the cell's displayed value.
print([*range(env.action_space.n)])
env.action_space.sample()

[0, 1, 2, 3]


1

In [None]:
# Baseline: play n_games episodes choosing every action at random, and record
# the mean score of the most recent episodes every 10th game.
for i in range(n_games):
    # Gym >= 0.26 returns (observation, info) from reset(), matching the
    # five-value step() API used below; unpack it so obs is not a tuple.
    obs, info = env.reset()
    done = False
    score = 0
    while not done:
        action = env.action_space.sample()
        obs, reward, terminated, truncated, info = env.step(action)
        score += reward
        # Leave the loop when the episode terminates or is truncated.
        done = terminated or truncated
    scores.append(score)
    if i % 10 == 0:
        # Mean over the last (up to) 10 episode scores.
        average = np.mean(scores[-10:])
        win_pct.append(average)

plt.plot(win_pct)
plt.xlabel('Checkpoint (every 10 games)')
plt.ylabel('Mean score over last 10 games')
plt.title('Random policy on FrozenLake-v1')
plt.show()

In [22]:
# Print the valid action ids on one line and the number of actions on the next.
print(list(range(env.action_space.n)), env.action_space.n, sep="\n")


[0, 1, 2, 3]
4


$Q(s, a; \theta) = \mathbb{E} \left[ r_t + \gamma \max_{a'} Q(s', a'; \theta') \middle| s, a \right]$

* Q 是状态 s 和动作 a 下的 Q 值，参数 θ 表示 Q 网络的参数。
* $r_t$ 是当前步的奖励。
* $\gamma$ 是折扣因子，让未来的收益打折
* $\max_{a'} Q(s', a')$ 是下一状态 $s'$ 下所有可选动作 $a'$ 中最大的 Q 值。

Q 度量的是：在当前状态下执行当前动作，且此后始终选择最优动作时，所能获得的折现回报。

学习更新的方法

$Q(s_t, a_t) \leftarrow Q(s_t, a_t) + \alpha \left( r_t + \gamma \max_{a'} Q(s_{t+1}, a') - Q(s_t, a_t) \right)$

In [None]:
class Agent():
    """Tabular Q-learning agent with an epsilon-greedy action policy.

    Q-values are stored in ``self.Q``, a dict keyed by ``(state, action)``
    tuples, with every entry initialised to 0.0.

    Args:
        lr: Learning rate (step size) applied to each Q-value update.
        gamma: Discount factor applied to the best next-state Q-value.
        n_actions: Number of actions; actions are ``0 .. n_actions - 1``.
        n_states: Number of states; states are ``0 .. n_states - 1``.
        eps_start: Initial exploration probability (epsilon).
        eps_end: Lower bound that epsilon is never allowed to go below.
        eps_dec: Amount subtracted from epsilon on each decrement.
    """

    def __init__(self, lr, gamma, n_actions, n_states, eps_start, eps_end, eps_dec):
        self.lr = lr
        self.gamma = gamma
        self.n_actions = n_actions
        self.n_states = n_states
        self.eps = eps_start
        self.eps_min = eps_end
        self.eps_dec = eps_dec
        self.Q = {}

        self.init_Q()

    def init_Q(self):
        """Set the Q-value of every (state, action) pair to 0.0."""
        for state in range(self.n_states):
            for action in range(self.n_actions):
                self.Q[(state, action)] = 0.0

    def choose_action(self, state):
        """Return an action for ``state`` using the epsilon-greedy rule.

        With probability ``self.eps`` a random action is drawn; otherwise the
        action with the highest Q-value in ``state`` is returned (``np.argmax``
        resolves ties in favour of the lowest action index).
        """
        if np.random.random() < self.eps:
            return np.random.choice(self.n_actions)
        q_values = np.array([self.Q[(state, a)] for a in range(self.n_actions)])
        return np.argmax(q_values)

    def decrement_epsilon(self):
        """Lower epsilon by ``eps_dec`` without ever going below ``eps_min``."""
        # Clamp after subtracting: the previous "subtract if above the floor"
        # form could step past eps_min for one call when eps - eps_dec < eps_min.
        self.eps = max(self.eps - self.eps_dec, self.eps_min)

    def learn(self, state, action, reward, state_):
        """Apply one Q-learning update for the transition, then decay epsilon.

        Args:
            state: State in which ``action`` was taken.
            action: Action that was taken.
            reward: Reward received for the transition.
            state_: State reached after taking ``action``.
        """
        best_next_q = max(self.Q[(state_, a)] for a in range(self.n_actions))
        td_target = reward + self.gamma * best_next_q
        self.Q[(state, action)] += self.lr * (td_target - self.Q[(state, action)])
        self.decrement_epsilon()

# Build the Q-learning agent, sizing its table from the environment's
# action and observation spaces.
agent = Agent(
    lr=0.001,
    gamma=0.9,
    n_actions=env.action_space.n,
    n_states=env.observation_space.n,
    eps_start=1.0,
    eps_end=0.01,
    eps_dec=0.001,
)