In [31]:
import gym
import numpy as np
from time import sleep
import random
from IPython.display import clear_output

In [32]:
# Build the "CliffWalking-v0" gym environment; the cells below read its
# observation/action spaces, step it during training, and render it for replay.
env = gym.make("CliffWalking-v0")

In [19]:
# Sizes of the discrete state and action spaces reported by the environment.
state_size = env.observation_space.n
action_size = env.action_space.n

# Q-table starts at zero everywhere: one row per state, one column per action.
q_table = np.zeros(shape=(state_size, action_size))

In [21]:
# --- Training budget ---
EPISODES = 10_000   # number of training episodes
STEPS = 250         # hard cap on steps within a single episode

# --- Q-learning update ---
LR = 0.01           # learning rate: weight given to the new estimate in each update
DISCOUNT = 0.99     # multiplier on the best next-state Q-value

# --- Epsilon-greedy schedule ---
# After each episode the training loop sets
#   exploration = MIN + (MAX - MIN) * exp(-DECAY * episode)
MIN_EXPLORATION = 0.1
MAX_EXPLORATION = 1
DECAY = 0.001
exploration = MAX_EXPLORATION   # current exploration rate; starts at the maximum

In [28]:
# Tabular Q-learning with an epsilon-greedy behaviour policy.
# Reads:   env, q_table, EPISODES, STEPS, LR, DISCOUNT, MIN_EXPLORATION,
#          MAX_EXPLORATION, DECAY, exploration
# Writes:  q_table (updated in place), exploration (decayed per episode),
#          all_rewards (total reward collected in each episode)
all_rewards = []

for episode in range(EPISODES):

    state = env.reset()
    done = False
    rewards = 0  # running sum of rewards for this episode

    for step in range(STEPS):
        # Epsilon-greedy: draw a uniform number; exploit the table when the draw
        # exceeds the current exploration rate, otherwise take a random action.
        rate = random.uniform(0, 1)

        if exploration < rate:
            action = np.argmax(q_table[state, :])
        else:
            action = env.action_space.sample()

        new_state, reward, done, info = env.step(action)

        # Build the TD target BEFORE modifying the table. Scaling
        # q_table[state, action] first would let the max below read the
        # already-scaled entry whenever new_state == state, which distorts
        # the bootstrap term.
        td_target = reward + DISCOUNT * np.max(q_table[new_state, :])

        # Q(s, a) <- (1 - LR) * Q(s, a) + LR * td_target
        q_table[state, action] = (1 - LR) * q_table[state, action] + LR * td_target

        state = new_state
        rewards += reward

        if done:
            break

    # Exponentially decay the exploration rate towards MIN_EXPLORATION.
    exploration = MIN_EXPLORATION + (MAX_EXPLORATION - MIN_EXPLORATION) * np.exp(-DECAY * episode)
    all_rewards.append(rewards)

In [30]:
# Summarise training: mean episode reward over consecutive windows of 1000
# episodes, followed by the learned Q-table.
WINDOW = 1000
reward_history = np.asarray(all_rewards)

# Slice fixed-size windows from the rewards actually recorded. Unlike
# np.split with a float section count (EPISODES/1000), this also works when
# the number of episodes is not a multiple of the window; the final window
# is then simply shorter.
rewards_per_thousand = [
    reward_history[start:start + WINDOW]
    for start in range(0, len(reward_history), WINDOW)
]
count = 0  # number of episodes covered up to and including the current window

print("********Average reward per 1000 episodes********\n")
for r in rewards_per_thousand:
    count += len(r)
    # .mean() sums first and divides once, avoiding the float noise of
    # summing 1000 pre-divided terms.
    print(count, ": ", r.mean())

print("\n\n********Q-Table********\n")
print(q_table)

********Average reward per 1000 episodes********

1000 :  -1154.797999999998
2000 :  -146.33100000000024
3000 :  -76.75200000000035
4000 :  -54.70500000000006
5000 :  -51.987000000000066
6000 :  -50.4120000000001
7000 :  -52.460000000000065
8000 :  -49.31200000000015
9000 :  -51.99800000000006
10000 :  -52.87799999999996


********Q-Table********

[[ -10.93666721  -10.93734592  -10.94972192  -10.93940708]
 [ -10.51288798  -10.51507363  -10.52075897  -10.52439481]
 [  -9.91770359   -9.91507575   -9.92472504   -9.95329664]
 [  -9.23867054   -9.23997828   -9.24379202   -9.24624802]
 [  -8.51543658   -8.51332512   -8.52052492   -8.53061804]
 [  -7.77841369   -7.76486841   -7.76785394   -7.76512495]
 [  -7.00102526   -6.98873231   -6.99244472   -7.01150417]
 [  -6.20615118   -6.20635024   -6.2104534    -6.21336849]
 [  -5.41591066   -5.4107096    -5.41152545   -5.42540541]
 [  -4.60981538   -4.59961559   -4.60189602   -4.62521943]
 [  -3.78315581   -3.77190357   -3.77345781   -3.78667878]
 

In [34]:
# Replay three episodes acting greedily on the learned Q-table, redrawing the
# rendered environment in place after every step.
for episode in range(3):
    state = env.reset()
    done = False
    sleep(1)

    for step in range(STEPS):
        # Redraw the current frame and pause so the motion is visible.
        clear_output(wait=True)
        env.render()
        sleep(0.3)

        # Take the action with the highest Q-value for the current state.
        action = np.argmax(q_table[state, :])
        new_state, reward, done, info = env.step(action)

        if not done:
            state = new_state
            continue

        # Episode finished: show the final frame and report the outcome,
        # judged by the reward of the last step.
        clear_output(wait=True)
        env.render()
        print('You fell!' if reward == -100 else "You reached!")

        sleep(3)
        clear_output(wait=True)
        break

o  o  o  o  o  o  o  o  o  o  o  o
o  o  o  o  o  o  o  o  o  o  o  o
o  o  o  o  o  o  o  o  o  o  o  o
o  C  C  C  C  C  C  C  C  C  C  x

You reached!
