## Cliff Walking Environment

![avatar](fig/cliff_walking.png)



In [18]:
import time 
import numpy  


In [19]:
class Env():
    """Grid world for the cliff-walking task.

    The agent position is ``[x, y]``. ``x`` is clamped to
    ``[0, height - 1]`` and ``y`` is clamped to ``[0, length - 1]``.
    The row ``x == 0`` is the cliff line "SxxxxxT": the start sits at
    ``y == 0``, the target at ``y == length - 1`` and every cell between
    them is cliff.
    """

    def __init__(self, length, height):
        self.length = length
        self.height = height
        # The agent always begins on the start cell.
        self.x = 0
        self.y = 0

    def step(self, action):
        """Apply one action and return ``(reward, state, terminal)``.

        There are 4 legal actions, applied as position deltas:
        0 -> y + 1, 1 -> y - 1, 2 -> x - 1, 3 -> x + 1.
        A move that would leave the grid is clamped to the border.

        The reward is -1 per step, or -100 when the agent lands on a
        cliff cell. The episode terminates when the agent is on the
        cliff line anywhere except the start cell (cliff or target).
        """
        dx, dy = ((0, 1), (0, -1), (-1, 0), (1, 0))[action]
        self.x = min(self.height - 1, max(0, self.x + dx))
        self.y = min(self.length - 1, max(0, self.y + dy))

        on_cliff_line = self.x == 0
        away_from_start = self.y > 0
        at_target = self.y == self.length - 1

        # Leaving the start cell along the cliff line always ends the episode.
        terminal = on_cliff_line and away_from_start
        # Only a terminal cell that is not the target counts as a fall.
        reward = -100 if terminal and not at_target else -1

        return reward, [self.x, self.y], terminal

    def reset(self):
        """Put the agent back on the start cell ``[0, 0]``."""
        self.x = 0
        self.y = 0

## $\epsilon$-greedy action selection

每次选择行动时，给定一个探索概率 $\epsilon \in [0, 1]$：
- 以 $\epsilon$ 的概率随机选择一个行动（探索）
- 以 $1 - \epsilon$ 的概率贪心地选择当前最优的行动（利用）

通常可取 $\epsilon = 0.1$。

In [20]:
from numpy import random 
class Q_table():
    """Tabular action-value function Q([x, y], a) with a Q-learning update.

    The values are stored in one flat list of size
    ``actions * length * height``; ``_index`` maps ``(a, x, y)`` to a
    position in that list.

    Parameters:
        length: size of the ``y`` axis of the grid.
        height: size of the ``x`` axis of the grid.
        actions: number of discrete actions.
        alpha: learning rate used in ``update``.
        gamma: discount factor used in ``update``.
        epsilon: probability of picking a random action in ``take_action``.
    """

    def __init__(self, length, height, actions=4, alpha=0.1, gamma=0.9, epsilon=0.1):
        self.table = [0] * actions * length * height
        self.actions = actions
        self.length = length
        self.height = height
        self.alpha = alpha
        self.gamma = gamma
        self.epsilon = epsilon

    def _index(self, a, x, y):
        """Return the index of Q([x, y], a) in Q_table."""
        return a * self.height * self.length + x * self.length + y

    def _action_values(self, x, y):
        """Return the list of Q([x, y], a) for every action a."""
        return [self.table[self._index(a, x, y)] for a in range(self.actions)]

    def take_action(self, x, y, num_episode):
        """Select an action for state [x, y] with the epsilon-greedy rule.

        With probability ``epsilon`` a uniformly random action is returned;
        otherwise the action with the largest Q-value is returned (the
        lowest-numbered one on ties). ``num_episode`` is accepted for
        interface compatibility and is not used.
        """
        if random.random() < self.epsilon:
            # Sample over the configured action count, not a fixed 4, so
            # tables built with a different number of actions stay in range.
            return random.randint(self.actions)
        values = self._action_values(x, y)
        return values.index(max(values))

    def max_q(self, x, y):
        """Return the largest Q-value over all actions in state [x, y]."""
        return max(self._action_values(x, y))

    def update(self, a, s0, s1, r, is_terminated):
        """Apply one Q-learning update for the transition (s0, a, r, s1).

        Both ``s0`` and ``s1`` have the form ``[x, y]``. When the
        transition is terminal the target is just ``r``; otherwise it is
        ``r + gamma * max_a' Q(s1, a')``.
        """
        idx = self._index(a, s0[0], s0[1])
        q_predict = self.table[idx]
        if is_terminated:
            # No bootstrapping from a terminal state.
            q_target = r
        else:
            q_target = r + self.gamma * self.max_q(s1[0], s1[1])
        self.table[idx] += self.alpha * (q_target - q_predict)

In [21]:
def cliff_walk(length=6, height=4, episodes=1000, report_every=20):
    """Train a Q-learning agent on the cliff-walking grid and print progress.

    The environment and the Q-table are built from the same ``length`` and
    ``height`` so the table is exactly as large as the state space.

    Parameters:
        length: size of the grid's ``y`` axis; must be at least 2, because
            an episode only ends on the cliff line at ``y > 0``.
        height: size of the grid's ``x`` axis; must be at least 1.
        episodes: number of training episodes to run.
        report_every: the episode score is printed whenever the episode
            number is a multiple of this value; must be at least 1.

    Raises:
        ValueError: if any of the size or interval arguments is too small.
    """
    if length < 2 or height < 1:
        # With length 1 the agent can never leave the start cell along the
        # cliff line, so no episode would ever terminate.
        raise ValueError("length must be >= 2 and height must be >= 1")
    if report_every < 1:
        raise ValueError("report_every must be >= 1")

    env = Env(length=length, height=height)
    table = Q_table(length=length, height=height)
    for num_episode in range(episodes):
        # Every episode starts from the environment's start cell.
        env.reset()
        s0 = [env.x, env.y]
        episodic_reward = 0
        is_terminated = False
        while not is_terminated:
            action = table.take_action(s0[0], s0[1], num_episode)
            r, s1, is_terminated = env.step(action)
            table.update(action, s0, s1, r, is_terminated)
            episodic_reward += r
            s0 = s1
        if num_episode % report_every == 0:
            print("Episode: {}, Score: {}".format(num_episode, episodic_reward))

In [22]:
# Run the training loop; it prints the episode score every 20 episodes.
cliff_walk()

Episode: 0, Score: -100
Episode: 20, Score: -64
Episode: 40, Score: -46
Episode: 60, Score: -21
Episode: 80, Score: -34
Episode: 100, Score: -29
Episode: 120, Score: -15
Episode: 140, Score: -13
Episode: 160, Score: -15
Episode: 180, Score: -7
Episode: 200, Score: -7
Episode: 220, Score: -9
Episode: 240, Score: -11
Episode: 260, Score: -8
Episode: 280, Score: -7
Episode: 300, Score: -7
Episode: 320, Score: -105
Episode: 340, Score: -7
Episode: 360, Score: -9
Episode: 380, Score: -7
Episode: 400, Score: -7
Episode: 420, Score: -9
Episode: 440, Score: -9
Episode: 460, Score: -7
Episode: 480, Score: -7
Episode: 500, Score: -7
Episode: 520, Score: -7
Episode: 540, Score: -103
Episode: 560, Score: -7
Episode: 580, Score: -9
Episode: 600, Score: -7
Episode: 620, Score: -7
Episode: 640, Score: -9
Episode: 660, Score: -8
Episode: 680, Score: -7
Episode: 700, Score: -9
Episode: 720, Score: -7
Episode: 740, Score: -7
Episode: 760, Score: -7
Episode: 780, Score: -7
Episode: 800, Score: -7
Episode