# FrozenLake-v0
- The agent controls the movement of a character in a grid world. Some tiles of the grid are walkable, and others lead to the agent falling into the water. Additionally, the movement direction of the agent is uncertain and only partially depends on the chosen direction. The agent is rewarded for finding a walkable path to a goal tile.

- Winter is here. You and your friends were tossing around a frisbee at the park when you made a wild throw that left the frisbee out in the middle of the lake. The water is mostly frozen, but there are a few holes where the ice has melted. If you step into one of those holes, you'll fall into the freezing water. At this time, there's an international frisbee shortage, so it's absolutely imperative that you navigate across the lake and retrieve the disc. However, the ice is slippery, so you won't always move in the direction you intend.

```
SFFF       (S: starting point, safe)
FHFH       (F: frozen surface, safe)
FFFH       (H: hole, fall to your doom)
HFFG       (G: goal, where the frisbee is located)
```
```
LEFT = 0
DOWN = 1
RIGHT = 2
UP = 3
```

In [1]:
import gym
import numpy as np
import random

In [2]:
def get_environment():
    """Create the FrozenLake-v0 environment and report its dimensions.

    Prints the number of states and then the number of actions, one per line.

    Returns:
        A tuple ``(env, state_space_n, action_space_n)`` holding the
        environment, the size of its observation space and the size of its
        action space.
    """
    lake = gym.make('FrozenLake-v0')
    sizes = (lake.observation_space.n, lake.action_space.n)
    for size in sizes:
        print(size)
    return (lake, *sizes)

# 1. 策略迭代[基于贪婪策略]
个体在处于任一状态时，将比较所有可能后续状态的价值，从中选择最大价值的状态，再选择能到该状态的行为。

In [3]:
# Build the environment once; get_environment() also prints the state and action counts.
env, state_space_n, action_space_n = get_environment()

16
4


In [4]:
# Transitions for state 0 under action 0 (LEFT); each entry unpacks as
# (probability, next_state, reward, done).
env.P[0][0]

[(0.3333333333333333, 0, 0.0, False),
 (0.3333333333333333, 0, 0.0, False),
 (0.3333333333333333, 4, 0.0, False)]

In [5]:
# Print the grid in its current state.
env.render()


[41mS[0mFFF
FHFH
FFFH
HFFG


In [6]:
def convergence_flag(value_table1, value_table2, threshold = 1e-1):
    """Tell whether two value tables are close enough to stop iterating.

    Args:
        value_table1: First value table (array supporting ``-``).
        value_table2: Second value table, same shape as the first.
        threshold: Upper bound (exclusive) on the summed absolute difference.

    Returns:
        True when the sum of element-wise absolute differences is strictly
        below ``threshold``, False otherwise.
    """
    total_change = np.fabs(value_table1 - value_table2).sum()
    return bool(total_change < threshold)

## 迭代函数：
$$
V_{k+1}(s) = \sum_{a \in A} \pi(a \mid s) \left( R_{s}^{a} + \gamma \sum_{s' \in S} P^{a}_{ss'} V_{k}(s') \right)
$$

In [7]:
# Value estimate for each of the 16 states of the 4x4 grid, initialised to zero.
value_table = np.zeros(16)
# Weight pi(a|s) applied to every action in the update (uniform over the 4 actions).
pi = 0.25
# Discount factor applied to successor-state values.
gamma = 1.0
# Upper bound on the number of sweeps over the state space.
iteration_num = 10000
# Action identifiers: LEFT = 0, DOWN = 1, RIGHT = 2, UP = 3.
action_list = [0, 1, 2, 3]

In [8]:
def get_max_state(value_table, actions, state):
    """Return the highest-valued state reachable from ``state``.

    Every transition listed in ``env.P[state][action]`` is examined for each
    action in ``actions``. Ties keep the first successor encountered.

    Args:
        value_table: Indexable collection of state values.
        actions: Iterable of action identifiers to consider.
        state: State whose successors are examined.

    Returns:
        The successor state with the largest value, or 0 when no successor
        has a value greater than -1000.
    """
    best_value, best_state = -1000, 0
    for action in actions:
        for _, successor, _, _ in env.P[state][action]:
            successor_value = value_table[successor]
            if successor_value > best_value:
                best_value, best_state = successor_value, successor
    return best_state

In [9]:
# Sweep over the states repeatedly, updating value_table in place, until two
# consecutive sweeps are close enough or iteration_num sweeps have run.
for k in range(iteration_num):
    # Snapshot of the previous sweep; every update below reads from this copy.
    value_table_tmp = np.copy(value_table)
    for s in range(state_space_n):
        # flag is only consumed by the commented-out early exit further down.
        flag = False
        # States 0 and 15 are never updated, so they keep their initial value.
        if s not in [0, 15]:
            value_s = []
            # Successor of s with the highest value in the snapshot.
            next_state_max = get_max_state(value_table_tmp, action_list, s)
            #print(next_state_max)
            for a in range(action_space_n):
                q_tmp = []
                rewards = 0
                for next_state_info in env.P[s][a]:
                    P, next_state, reward, done =  next_state_info
                    # Remap the environment reward: 0 becomes -1 and 1 becomes 0.
                    if reward == 0:
                        reward = -1
                    if reward == 1:
                        reward = 0
                    flag = done
                    #print(next_state_info)
                    # Only a transition that lands in the highest-valued successor
                    # contributes; the break means at most one transition per action
                    # is counted.
                    if next_state == next_state_max:
                        rewards += reward * P
                        q_tmp.append(P * value_table_tmp[next_state])
                        break
                #print("{}*({} + {} * {})".format(pi, rewards, gamma,np.sum(q_tmp)))
                # Contribution of action a, weighted by pi.
                value_s.append(pi * (rewards + gamma * np.sum(q_tmp)))
            # New value of s is the sum of the per-action contributions.
            value_table[s] = np.sum(value_s)
#         if flag:
#             break
    # 4x4 copy used only by the commented-out debug prints below.
    value_table_print = np.copy(value_table)
    value_table_print = value_table_print.reshape([4, 4])
    #print("[Time k={}]".format(k))
    #print(value_table_print)
    # Stop once this sweep barely changed the table.
    if convergence_flag(value_table_tmp, value_table):
        break

In [10]:
# Show the state values laid out as the 4x4 grid.
value_table.reshape([4, 4])

array([[ 0.00000e+00, -2.50000e-01, -3.12500e-01, -3.28125e-01],
       [-2.50000e-01, -9.91000e+02, -3.12500e-01, -9.91000e+02],
       [-3.12500e-01, -3.12500e-01, -3.12500e-01, -9.91000e+02],
       [-9.91000e+02, -3.12500e-01,  0.00000e+00,  0.00000e+00]])

In [11]:
def get_possible_state(state, action_space_n):
    """Collect every state that can follow ``state`` under any action.

    Args:
        state: State whose transitions are read from ``env.P``.
        action_space_n: Number of actions; actions ``0 .. action_space_n - 1``
            are considered.

    Returns:
        A set of the successor states listed in ``env.P[state][action]``
        across all considered actions.
    """
    return {
        successor
        for action in range(action_space_n)
        for _, successor, _, _ in env.P[state][action]
    }
def get_max_state(state_now, states, value_table):
    """Pick the highest-valued candidate state other than the current one.

    Args:
        state_now: Current state; it is never returned as the best candidate.
        states: Iterable of candidate states.
        value_table: Indexable collection of state values.

    Returns:
        The candidate with the largest value (first one wins on ties), or 0
        when no candidate other than ``state_now`` has a value greater than
        ``-0xffffff``.
    """
    best_state, best_value = 0, -0xffffff
    for candidate in states:
        candidate_value = value_table[candidate]
        # Skip the current state and anything that does not strictly improve.
        if candidate == state_now or not candidate_value > best_value:
            continue
        best_state, best_value = candidate, candidate_value
    return best_state
def get_state_action(state_now, state_next, action_space_n):
    """List the actions that can move the agent from one state to another.

    Args:
        state_now: State the agent is in.
        state_next: State the agent should end up in.
        action_space_n: Number of actions; actions ``0 .. action_space_n - 1``
            are considered.

    Returns:
        A list of the distinct actions for which ``env.P[state_now][action]``
        contains a transition into ``state_next``.
    """
    matching_actions = {
        action
        for action in range(action_space_n)
        for _, successor, _, _ in env.P[state_now][action]
        if successor == state_next
    }
    return list(matching_actions)
def get_action(state, value_table, action_space_n):
    """Choose an action that can lead to the best-valued neighbouring state.

    The best neighbour is the highest-valued state reachable from ``state``
    (excluding ``state`` itself); one of the actions able to reach it is then
    drawn uniformly at random.

    Args:
        state: Current state.
        value_table: Indexable collection of state values.
        action_space_n: Number of available actions.

    Returns:
        The chosen action identifier.
    """
    reachable = get_possible_state(state, action_space_n)
    target = get_max_state(state, reachable, value_table)
    candidates = get_state_action(state, target, action_space_n)
    return candidates[random.randrange(len(candidates))]

In [12]:
def get_success_rate(env, value_table, action_space_n, start_state = 0,
                     episodes = 10000, goal_state = 15):
    """Estimate how often following ``get_action`` reaches the goal.

    Each episode resets the environment and then repeatedly applies the
    action chosen by ``get_action`` until the environment reports ``done``.

    Args:
        env: Environment providing ``reset()`` and ``step(action)``, where
            ``step`` returns ``(next_state, reward, done, info)``.
        value_table: Indexable collection of state values used to pick actions.
        action_space_n: Number of available actions.
        start_state: State the agent is assumed to occupy after ``reset()``.
        episodes: Number of episodes to simulate.
        goal_state: State that counts as a success when entered.

    Returns:
        The number of times ``goal_state`` was entered divided by ``episodes``.

    Raises:
        ValueError: If ``episodes`` is not positive.
    """
    if episodes <= 0:
        raise ValueError("episodes must be a positive integer")
    success = 0
    for _ in range(episodes):
        env.reset()
        # NOTE(review): the state returned by env.reset() is ignored; the
        # agent is tracked from start_state instead -- confirm they agree.
        state_now = start_state
        done = False
        while not done:
            action = get_action(state_now, value_table, action_space_n)
            state_now, _reward, done, _info = env.step(action)
            if state_now == goal_state:
                success += 1
    return success / episodes

In [13]:
# Fraction of simulated episodes, started from state 0, that reach state 15.
get_success_rate(env, value_table, action_space_n, start_state = 0)

0.0195

In [14]:
def test(env, value_table, action_space_n, start_state = 0):
    """Play one episode with ``get_action`` and render the grid after each step.

    Args:
        env: Environment providing ``reset()``, ``step(action)`` and
            ``render()``.
        value_table: Indexable collection of state values used to pick actions.
        action_space_n: Number of available actions.
        start_state: State the agent is assumed to occupy after ``reset()``.
    """
    env.reset()
    current_state = start_state
    while True:
        chosen_action = get_action(current_state, value_table, action_space_n)
        current_state, _reward, finished, _info = env.step(chosen_action)
        env.render()
        if finished:
            break

In [15]:
# Play a single episode from state 0, printing the grid after every step.
test(env, value_table, action_space_n, start_state = 0)

  (Right)
SFFF
[41mF[0mHFH
FFFH
HFFG
  (Right)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Down)
S[41mF[0mFF
FHFH
FFFH
HFFG
  (Down)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Up)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Right)
S[41mF[0mFF
FHFH
FFFH
HFFG
  (Left)
SFFF
F[41mH[0mFH
FFFH
HFFG
