In [None]:
pip install gym



In [None]:
pip install gymnasium

Collecting gymnasium
  Downloading gymnasium-0.29.0-py3-none-any.whl (953 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m953.8/953.8 kB[0m [31m9.3 MB/s[0m eta [36m0:00:00[0m
Collecting farama-notifications>=0.0.1 (from gymnasium)
  Downloading Farama_Notifications-0.0.4-py3-none-any.whl (2.5 kB)
Installing collected packages: farama-notifications, gymnasium
Successfully installed farama-notifications-0.0.4 gymnasium-0.29.0


In [None]:
pip install Ipython

Collecting jedi>=0.16 (from Ipython)
  Downloading jedi-0.19.0-py2.py3-none-any.whl (1.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: jedi
Successfully installed jedi-0.19.0


In [None]:
import numpy as np
import gymnasium as gym
import random
import time
from IPython.display import clear_output

In [None]:
# Build the Taxi-v3 environment with text ("ansi") rendering; the replay cell
# prints whatever env.render() returns.
# `.env` takes the environment held inside the outermost wrapper that gym.make
# returns (presumably this drops the library's own step limit -- TODO confirm
# against the installed gymnasium version).
env = gym.make(
    "Taxi-v3",
    render_mode="ansi",
).env


In [None]:
# Build the Q-table: one row per discrete state and one column per action.
# Entry [s, a] holds the current estimate of how good action a is in state s;
# every estimate starts at zero.
state_space_size = env.observation_space.n  # rows of the Q-table
action_space_size = env.action_space.n  # columns of the Q-table

q_table = np.zeros(shape=(state_space_size, action_space_size))
print(q_table)

[[0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 ...
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]]


In [None]:
# ---- Training budget ----
num_episodes = 10000  # number of training episodes
max_steps_per_episode = 100  # the step loop of an episode stops after this many steps

# ---- Q-learning update ----
learning_rate = 0.1  # alpha: weight given to the newly learned value
discount_rate = 0.99  # gamma: weight given to the best next-state value

# ---- Epsilon-greedy exploration schedule ----
# Epsilon starts at its maximum (always explore) and is recomputed after every
# episode as an exponential decay toward the minimum, so the agent gradually
# shifts from exploring the environment to exploiting what it has learned.
max_exploration_rate = 1
min_exploration_rate = 0.01
exploration_decay_rate = 0.001  # speed of the exponential decay
exploration_rate = 1  # epsilon: current probability of taking a random action

In [None]:
# Total reward collected in each training episode, in episode order.
rewards_all_episodes = []

# Tabular Q-learning. The outer loop runs one episode per iteration; the inner
# loop runs one environment step per iteration.
for episode in range(num_episodes):
    # Start the episode from a fresh state; reset() returns a tuple whose
    # first element is the state.
    state = env.reset()[0]
    done = False
    rewards_current_episode = 0

    for step in range(max_steps_per_episode):
        # Epsilon-greedy action selection: draw a number in [0, 1] and compare
        # it with the current exploration rate.
        exploration_rate_threshold = random.uniform(0, 1)

        if exploration_rate_threshold <= exploration_rate:
            # Explore: take a random action.
            action = env.action_space.sample()
        else:
            # Exploit: take the action with the largest Q-value in this state.
            action = np.argmax(q_table[state, :])

        # Apply the action and observe the next state and the reward.
        new_state, reward, done, truncated, info = env.step(action)

        # Q(s,a) := (1 - lr) * Q(s,a) + lr * [R(s,a) + gamma * max_a' Q(s',a')]
        # i.e. a weighted sum of the old estimate and the newly learned value;
        # the learning rate sets how quickly the old estimate is replaced.
        old_value = q_table[state, action]
        best_next_value = np.max(q_table[new_state, :])
        learned_value = reward + discount_rate * best_next_value
        q_table[state, action] = old_value * (1-learning_rate) + \
            learning_rate * learned_value

        # Move on to the next state and accumulate this episode's reward.
        state = new_state
        rewards_current_episode += reward

        # Stop stepping as soon as the environment reports the episode is over.
        if done:
            break

    # Decay epsilon exponentially with the episode index so that later
    # episodes explore less and exploit more.
    exploration_span = max_exploration_rate - min_exploration_rate
    exploration_rate = min_exploration_rate + \
        exploration_span * np.exp(-exploration_decay_rate*episode)

    rewards_all_episodes.append(rewards_current_episode)


# Report the average reward over consecutive windows of 1000 episodes, then
# print the learned Q-table.
#
# The windows are cut by slicing with a fixed size instead of
# np.split(..., num_episodes/1000): that call received a float section count
# and only produced correct averages when the episode count was an exact
# multiple of 1000. Slicing also copes with a shorter final window, which is
# averaged over its real length and labelled with the real episode number.
episodes_per_window = 1000
rewards_array = np.asarray(rewards_all_episodes, dtype=float)
rewards_per_thousand_episodes = [
    rewards_array[start:start + episodes_per_window]
    for start in range(0, len(rewards_array), episodes_per_window)
]

count = 0
print("*****Average reward per thousand episodes******\n")
for r in rewards_per_thousand_episodes:
    count += len(r)  # index of the last episode covered by this window
    print(count, ": ", str(r.mean()))
# print updated q table
print("\n\n*****Q-table*****")
print(q_table)

*****Average reward per thousand episodes******

1000 :  -245.41799999999992
2000 :  -36.40199999999995
3000 :  2.1549999999999936
4000 :  5.874999999999978
5000 :  6.813999999999961
6000 :  7.305999999999968
7000 :  7.256999999999966
8000 :  7.569999999999966
9000 :  7.4189999999999525
10000 :  7.672999999999968


*****Q-table*****
[[  0.           0.           0.           0.           0.
    0.        ]
 [  0.23293784   0.90927919  -0.8983074    0.51466107   9.6220697
   -9.06361301]
 [  5.24444611   9.62140261   3.71929389   5.86730715  14.11880599
   -1.55264263]
 ...
 [ -1.04890958  12.60355686  -0.87229527  -1.04239506  -6.93355943
   -8.52379732]
 [ -2.69626579  -2.74686908  -2.7665515    6.38256196 -10.35742999
  -10.59541701]
 [ -0.1999       1.61480328   0.2686028   18.28648957  -1.62029393
   -1.9       ]]


In [None]:
# Watch the trained agent play Taxi by always taking the action with the
# highest Q-value in the current state.
#
# An episode counts as a success only when the environment reports it finished
# (`done`). The earlier `reward == 1` check did not match this environment's
# rewards and both of its branches printed the same success message, so a run
# that used up the step budget was never reported; that case now gets its own
# message.

for episode in range(5):
    # initialize new episode params
    state = env.reset()[0]
    done = False
    reached_goal = False
    print("*****EPISODE ", episode+1, "*****\n\n\n\n")
    time.sleep(1)

    for step in range(max_steps_per_episode):
        # Redraw the grid in place so the taxi appears to move.
        clear_output(wait=True)
        print(env.render())
        time.sleep(0.3)

        # Greedy action for the current state, then advance the environment.
        action = np.argmax(q_table[state, :])
        new_state, reward, done, truncated, info = env.step(action)

        if done or truncated:
            # `done` marks a finished episode; a truncation alone is not a success.
            reached_goal = bool(done)
            break

        state = new_state

    # Show the final grid together with the outcome of the episode.
    clear_output(wait=True)
    print(env.render())
    if reached_goal:
        print("****You reached the goal!****")
    else:
        print("****Step limit reached before the goal!****")
    time.sleep(3)
    # Deferred clear: the output is wiped only when the next episode prints.
    clear_output(wait=True)


env.close()

+---------+
|R: | : :[35m[34;1m[43mG[0m[0m[0m|
| : | : : |
| : : : : |
| | : | : |
|Y| : |B: |
+---------+
  (Dropoff)

****You reached the goal!****
