In [1]:
pip install gymnasium

Collecting gymnasium
  Downloading gymnasium-0.29.1-py3-none-any.whl (953 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/953.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━[0m [32m757.8/953.9 kB[0m [31m22.6 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m953.9/953.9 kB[0m [31m19.6 MB/s[0m eta [36m0:00:00[0m
Collecting farama-notifications>=0.0.1 (from gymnasium)
  Downloading Farama_Notifications-0.0.4-py3-none-any.whl (2.5 kB)
Installing collected packages: farama-notifications, gymnasium
Successfully installed farama-notifications-0.0.4 gymnasium-0.29.1


In [2]:
import gymnasium as gym
import numpy as np

In [3]:
# Build the CliffWalking environment; frames are requested as RGB arrays.
env = gym.make(
    'CliffWalking-v0',
    render_mode='rgb_array',
)

In [4]:
# Read the sizes of the discrete observation and action spaces in one go.
n_observations, n_actions = env.observation_space.n, env.action_space.n


In [5]:
# Report how many discrete states and actions the environment exposes.
print(
    "num of states",
    n_observations,
)
print(
    "num of actions",
    n_actions,
)

num of states 48
num of actions 4


In [6]:
# Action-value table: one row per state, one column per action, all zeros at start.
Q_table = np.zeros(shape=(n_observations, n_actions), dtype=np.float64)
Q_table

array([[0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],


In [7]:
# Sanity check: the table should have one row per state and one column per action.
Q_table.shape

(48, 4)

In [10]:
# --- Q-learning update parameters ---
gamma = 0.99   # discount factor applied to the value of the next state
alpha = 0.1    # learning rate (step size of each Q-table update)

# --- Training budget ---
num_episodes = 10000          # number of episodes to train for
max_steps_per_episode = 150   # hard cap on the steps taken in one episode

# --- Epsilon-greedy exploration schedule ---
max_exploration_rate = 1.0     # upper bound of the exploration rate
min_exploration_rate = 0.01    # lower bound of the exploration rate
exploration_decay_rate = 0.01  # rate constant of the exponential decay
exploration_rate = max_exploration_rate  # training starts at the upper bound

In [9]:
# Empty log that will receive one total-reward value per training episode.
rewards_per_episodes = []

In [12]:
# Train the Q-table with epsilon-greedy tabular Q-learning.
#
# Reads:   env, Q_table, num_episodes, max_steps_per_episode, alpha, gamma,
#          exploration_rate and the min/max/decay exploration settings.
# Updates: Q_table (in place), exploration_rate, rewards_per_episodes.
#
# The reward log is re-created here so that re-running this cell does not
# append a second batch of episodes to an already filled list.
rewards_per_episodes = []

for e in range(num_episodes):
  # Start every episode from the environment's initial state.
  state = env.reset()[0]
  done = False

  # Sum of the rewards collected during this episode.
  total_reward = 0

  for i in range(max_steps_per_episode):

    # Epsilon-greedy choice. The draw is taken from [0, 1) so that the
    # comparison is true with probability `exploration_rate`.
    if np.random.uniform(0, 1) < exploration_rate:
      action = env.action_space.sample()
    else:
      action = np.argmax(Q_table[state, :])

    # The environment runs the chosen action and returns the next state,
    # the reward, and separate "terminated" / "truncated" end-of-episode flags.
    next_state, reward, terminated, truncated, info = env.step(action)
    done = terminated or truncated

    # Q-learning update. A terminated transition has no future value, so the
    # bootstrap term is dropped there; a merely truncated one still bootstraps.
    future_value = 0.0 if terminated else np.max(Q_table[next_state, :])
    Q_table[state, action] = (1 - alpha) * Q_table[state, action] \
                       + alpha * (reward + gamma * future_value)
    total_reward = total_reward + reward

    state = next_state
    # Stop stepping as soon as the episode has ended for either reason.
    if done:
      break

  # Update the exploration rate with the exponential decay formula.
  # `epsilon` is kept as an alias of the new rate, as in the original cell.
  epsilon = exploration_rate = min_exploration_rate + \
    (max_exploration_rate - min_exploration_rate) * np.exp(-exploration_decay_rate * e)
  rewards_per_episodes.append(total_reward)

In [13]:
# The training loop appends one entry per episode, so after a single training
# run this is expected to equal num_episodes.
len(rewards_per_episodes)

10000

In [15]:
# Cut the per-episode rewards into num_episodes // 1000 equal, consecutive
# chunks (1000 episodes each when every episode was logged exactly once).
# np.split is documented to take an integer section count, hence floor division
# rather than true division, which would hand it a float.
rewards_per_thousand_episodes = np.split(
    np.array(rewards_per_episodes),
    num_episodes // 1000,
)

In [24]:
# Print the cumulative episode count next to the mean reward of each chunk.
print('Average reward per thousand episodes')
count = 0
for r in rewards_per_thousand_episodes:
  # Advance the label by the chunk's real length instead of a fixed 1000.
  count += len(r)
  # np.mean divides by the chunk's actual size, so the figure stays an
  # average even if the chunk length is ever changed.
  print(count, ':', str(np.mean(r)))

Average reward per thousand episodes
1000 : -43.6889999999988
2000 : -13.894999999999918
3000 : -14.120999999999917
4000 : -13.582999999999915
5000 : -14.211999999999914
6000 : -13.440999999999912
7000 : -13.696999999999916
8000 : -14.663999999999916
9000 : -14.094999999999914
10000 : -13.677999999999914


In [25]:
# Show the learned action values under a heading, one item per line.
print('updated Q-Table', Q_table, sep='\n')

updated Q-Table
[[ -10.49074437  -10.44982218  -10.44481755  -10.45857137]
 [ -10.09181311  -10.0574576   -10.0650409   -10.02555597]
 [  -9.4601066    -9.47497618   -9.45879948   -9.5448778 ]
 [  -8.85959795   -8.83299267   -8.82443267   -8.83651555]
 [  -8.20467091   -8.15862747   -8.14278795   -8.15948794]
 [  -7.42162454   -7.41174468   -7.49270876   -7.54101265]
 [  -6.72446737   -6.67871023   -6.69992711   -6.72719561]
 [  -6.00023812   -5.93149243   -5.96603301   -6.01602032]
 [  -5.22431241   -5.16923172   -5.16336959   -5.31689162]
 [  -4.46694135   -4.39316774   -4.41579651   -4.46272357]
 [  -3.67016553   -3.62295565   -3.6443644    -3.62317823]
 [  -2.89628602   -2.93483799   -2.87350376   -3.01554821]
 [ -10.83330948  -10.84025781  -10.88223638  -10.83229635]
 [ -10.32683129  -10.28858407  -10.34406456  -10.27697306]
 [  -9.63929431   -9.63729018   -9.68062675   -9.73289852]
 [  -8.90234254   -8.91394433   -8.96529585   -8.99616426]
 [  -8.16799682   -8.16142937   -8.21206

In [18]:
import time
from IPython.display import clear_output

In [22]:
# Install gymnasium's classic_control extra and comet_ml into the kernel's environment.
%pip install gymnasium[classic_control] comet_ml
import comet_ml
# Initialise Comet with a project named after the environment.
# NOTE(review): presumably this needs Comet credentials to be configured — confirm.
comet_ml.init(project_name="CliffWalking-v0")
# Wrap the existing env in gymnasium's RecordVideo wrapper; 'gameplay video' is
# the second positional argument (presumably the output folder — confirm).
# `env` is reassigned, so re-running this cell wraps the already wrapped env again.
env = gym.wrappers.RecordVideo(env, 'gameplay video')



  logger.warn(


In [23]:
# Replay three episodes with the greedy policy read from the trained Q-table.
#
# Reads: env, Q_table, max_steps_per_episode. Closes env when finished.
#
# NOTE(review): env is created with render_mode='rgb_array', so env.render()
# returns a pixel array and print() shows raw numbers rather than a picture.
for episode in range(3):
  state = env.reset()[0]
  done = False
  print('----------EPISODE:', episode + 1, '---------\n\n\n\n')
  time.sleep(1)

  for step in range(max_steps_per_episode):
    # Replace the previous frame with the current one.
    clear_output(wait=True)
    print(env.render())
    time.sleep(0.4)

    # Always exploit: take the highest-valued action for the current state.
    action = np.argmax(Q_table[state, :])
    new_state, reward, terminated, truncated, info = env.step(action)
    # The episode is over when the environment terminates it or cuts it short.
    done = terminated or truncated

    if done:
      clear_output(wait=True)
      print(env.render())
      if terminated and reward == -1:
        # A final reward of -1 is used as the marker for reaching the goal.
        print("****You reached the goal!****")
      elif terminated:
        print("****You fell!****")
      else:
        print("****Episode was truncated before reaching the goal****")
      time.sleep(3)
      clear_output(wait=True)
      break

    state = new_state
  else:
    # The step budget ran out without the environment ending the episode.
    print("****Step limit reached before the episode ended****")
    time.sleep(3)

env.close()

[[[ 91 150  97]
  [ 91 150  97]
  [ 91 150  97]
  ...
  [ 86 170  69]
  [ 86 170  69]
  [ 86 170  69]]

 [[ 91 150  97]
  [ 91 150  97]
  [ 91 150  97]
  ...
  [ 86 170  69]
  [ 86 170  69]
  [ 86 170  69]]

 [[ 91 150  97]
  [ 91 150  97]
  [ 98 166 105]
  ...
  [105 182  74]
  [105 182  74]
  [ 86 170  69]]

 ...

 [[ 86 170  69]
  [ 86 170  69]
  [ 93 188  74]
  ...
  [ 91 150  97]
  [ 91 150  97]
  [ 91 150  97]]

 [[ 86 170  69]
  [ 86 170  69]
  [ 93 188  74]
  ...
  [ 91 150  97]
  [ 91 150  97]
  [ 91 150  97]]

 [[ 86 170  69]
  [ 86 170  69]
  [ 86 170  69]
  ...
  [ 91 150  97]
  [ 91 150  97]
  [ 91 150  97]]]
****You reached the goal!****
