In [64]:
pip install gymnasium



In [65]:
import gymnasium as gym
import numpy as np


In [66]:
# Create the 4x4 slippery FrozenLake environment; frames are rendered as RGB arrays.
env = gym.make(
    'FrozenLake-v1',
    desc=None,
    map_name="4x4",
    is_slippery=True,
    render_mode='rgb_array',
)

# Sizes of the discrete observation and action spaces (used to shape the Q-table).
n_observations = env.observation_space.n
n_actions = env.action_space.n

In [67]:
# Report how many states and actions the environment exposes.
for label, value in (("num of states", n_observations),
                     ("num of actions", n_actions)):
    print(label, value)

num of states 16
num of actions 4


In [68]:
# Q-value table: one row per state, one column per action, all starting at zero.
q_shape = (n_observations, n_actions)
Q_table = np.zeros(q_shape)
Q_table

array([[0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.]])

In [69]:
# Sanity check: the table dimensions are (states, actions).
np.shape(Q_table)

(16, 4)

In [70]:
# Peek at the action values stored for state 9.
Q_table[9]

array([0., 0., 0., 0.])

In [71]:
# --- Training schedule ---
n_episodes = 10_000      # number of training episodes
steps_allowed = 100      # cap on steps taken within a single episode

# --- Exploration (epsilon-greedy) ---
max_epsilon = 1          # upper bound of the exploration probability
min_epsilon = 0.01       # lower bound of the exploration probability
decay_rate = 0.001       # rate used in the exponential epsilon decay
epsilon = 1              # current exploration probability

# --- Q-learning update ---
gamma = 0.99             # discount factor applied to the next state's best value
lr = 0.1                 # learning rate blending old estimate and new target


In [72]:
# Collects the total reward obtained in each training episode.
rewards_per_episodes = []


In [73]:
# Train the Q-table with epsilon-greedy Q-learning, one episode per iteration.
for e in range(n_episodes):
  # Start the episode from the state returned by reset().
  state = env.reset()[0]
  done = False

  # Sum of the rewards the agent collects during this episode.
  total_reward = 0

  for i in range(steps_allowed):
    # Epsilon-greedy strategy: draw a random number in [0, 1);
    # below epsilon the agent explores with a random action,
    # otherwise it exploits the best known action for this state.
    if np.random.uniform(0,1) < epsilon:
        action = env.action_space.sample()
    else:
        action = np.argmax(Q_table[state,:])

    # step() reports two separate end-of-episode flags: `terminated`
    # and `truncated`. Either one means the episode is over and the
    # environment must be reset before stepping again.
    next_state, reward, terminated, truncated, info = env.step(action)
    done = terminated or truncated

    # Q-learning update: blend the old estimate with the new target
    # (reward plus discounted best value of the next state).
    Q_table[state, action] = (1-lr) * Q_table[state, action] \
                       + lr*(reward + gamma* np.max(Q_table[next_state,:]))
    total_reward = total_reward + reward

    state = next_state
    # Leave the step loop as soon as the episode has ended for any reason.
    if done:
        break

  # Exponentially decay the exploration probability towards min_epsilon.
  epsilon = exploration_rate = min_epsilon + \
    (max_epsilon - min_epsilon) * np.exp(-decay_rate*e)
  rewards_per_episodes.append(total_reward)

In [74]:
# Number of episode rewards recorded so far.
len(rewards_per_episodes)

10000

In [75]:
# Split the reward history into equal consecutive chunks of 1000 episodes.
# Integer division keeps the section count an int, as np.split expects.
rewards_per_thousand_episodes = np.split(
    np.array(rewards_per_episodes), n_episodes // 1000
)


In [76]:
# Print the mean reward of each chunk, labelled by the last episode it covers.
count=1000
print('----------Average reward per thousand episodes-------------')
for r in rewards_per_thousand_episodes:
  # r.mean() avoids the rounding noise of summing r/1000 element by element
  # and stays correct whatever the chunk length is.
  print(count,':',str(r.mean()))
  count+=1000

----------Average reward per thousand episodes-------------
1000 : 0.04300000000000003
2000 : 0.23000000000000018
3000 : 0.3950000000000003
4000 : 0.5710000000000004
5000 : 0.6500000000000005
6000 : 0.6450000000000005
7000 : 0.6610000000000005
8000 : 0.6790000000000005
9000 : 0.6970000000000005
10000 : 0.6430000000000005


In [77]:
# Show the learned Q-values beneath a banner line.
print('--------------updated Q-Table-----------------', Q_table, sep='\n')

--------------updated Q-Table-----------------
[[0.51731326 0.47172897 0.46321928 0.46829528]
 [0.27685912 0.24860471 0.31647805 0.46443406]
 [0.38548215 0.28515498 0.26558353 0.26627473]
 [0.08561794 0.13819486 0.06736455 0.0724016 ]
 [0.53316377 0.32543117 0.36609726 0.41535073]
 [0.         0.         0.         0.        ]
 [0.12802121 0.10497156 0.23192171 0.10728969]
 [0.         0.         0.         0.        ]
 [0.21495406 0.2764691  0.35384117 0.56107238]
 [0.39257593 0.61386807 0.44029396 0.42261549]
 [0.63074117 0.2780046  0.24434348 0.35655583]
 [0.         0.         0.         0.        ]
 [0.         0.         0.         0.        ]
 [0.45840344 0.58619855 0.70112917 0.52446733]
 [0.73246291 0.84195931 0.73026793 0.74624733]
 [0.         0.         0.         0.        ]]


In [78]:
import time
from IPython.display import clear_output


In [79]:
%pip install gymnasium[classic_control] comet_ml
import comet_ml
comet_ml.init(project_name="frozen_lake")
env = gym.wrappers.RecordVideo(env, 'gameplay video')




In [81]:
# Visualising the game: play three episodes greedily with the learned Q-table.
for episode in range(3):
  state=env.reset()[0]
  done=False
  print('----------EPISODE:',episode+1,'---------\n\n\n\n')
  time.sleep(1)

  for step in range(steps_allowed):
    # Redraw the current frame, replacing the previous cell output.
    clear_output(wait=True)
    print(env.render())
    time.sleep(0.4)

    # Always exploit: pick the highest-valued action for the current state.
    action = np.argmax(Q_table[state,:])
    new_state, reward, terminated, truncated, info = env.step(action)
    # The episode is over on either flag; stepping further would need a reset.
    done = terminated or truncated

    if done:
      clear_output(wait=True)
      print(env.render())
      if reward == 1:
          print("****You reached the goal!****")
      elif terminated:
          print("****You fell through a hole!****")
      else:
          # Ended by truncation rather than by reaching the goal or a hole.
          print("****Time limit reached!****")
      # Leave the final message on screen briefly before moving on.
      time.sleep(3)
      clear_output(wait=True)
      break

    state=new_state

env.close()

[[[180 200 230]
  [180 200 230]
  [180 200 230]
  ...
  [180 200 230]
  [180 200 230]
  [180 200 230]]

 [[180 200 230]
  [204 230 255]
  [204 230 255]
  ...
  [204 230 255]
  [204 230 255]
  [180 200 230]]

 [[180 200 230]
  [235 245 249]
  [204 230 255]
  ...
  [204 230 255]
  [204 230 255]
  [180 200 230]]

 ...

 [[180 200 230]
  [235 245 249]
  [235 245 249]
  ...
  [204 230 255]
  [235 245 249]
  [180 200 230]]

 [[180 200 230]
  [235 245 249]
  [235 245 249]
  ...
  [204 230 255]
  [204 230 255]
  [180 200 230]]

 [[180 200 230]
  [180 200 230]
  [180 200 230]
  ...
  [180 200 230]
  [180 200 230]
  [180 200 230]]]
****You reached the goal!****
