In [1]:
!pip install cmake 'gym[atari]' scipy

Collecting cmake
  Downloading cmake-3.21.3-py2.py3-none-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (21.5 MB)
[K     |████████████████████████████████| 21.5 MB 5.7 MB/s eta 0:00:011
Collecting ale-py~=0.7.1
  Downloading ale_py-0.7.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.6 MB)
[K     |████████████████████████████████| 1.6 MB 6.7 MB/s eta 0:00:01
[?25hCollecting importlib-resources
  Downloading importlib_resources-5.2.2-py3-none-any.whl (27 kB)
Installing collected packages: importlib-resources, ale-py, cmake
Successfully installed ale-py-0.7.2 cmake-3.21.3 importlib-resources-5.2.2


In [2]:
import gym

# ========================================
## CREATE ENVIRONMENT
# ========================================

# Taking `.env` on the object returned by gym.make() is done here so that episodes are not
# cut off after 200 steps, which is the default limit in recent Gym versions according to
# the original author's note (TODO: confirm against the Gym documentation).
env = gym.make("Taxi-v3").env   

# Draw the current state of the taxi grid.
env.render()

+---------+
|R: | : :[34;1mG[0m|
| : | : : |
| : : : : |
| | : | : |
|Y| : |[35mB[0m:[43m [0m|
+---------+



In [None]:
# SOME INTERESTING COMMANDS

In [None]:
# Reset the environment to a random initial state (the call returns that state).
env.reset()

# Step the environment by one timestep.
# Returns the tuple (observation, reward, done, info).
action = 0  # example action so the cell runs on a fresh kernel; 0 = south
env.step(action)

# Render one frame of the environment (helpful in visualizing the environment).
env.render()

In [None]:
# env.step(action) returns four values:
# observation: The observation of the environment after the action
# reward: A number telling whether the action was beneficial or not
# done: True once the passenger has been picked up and dropped off successfully, which ends one episode
# info: Additional information, such as performance and latency, for debugging purposes

In [None]:
#===========================
##        RULES:
#===========================

# "There are 4 locations (labeled by different letters), and our job is to pick up the passenger 
# at one location and drop him off at another. We receive +20 points for a successful drop-off 
# and lose 1 point for every time-step it takes. There is also a 10 point penalty for illegal 
# pick-up and drop-off actions."

In [3]:
# ========================================
## RESET THE ENVIRONMENT TO A RANDOM STATE
# ========================================

env.reset()  # start over from a new, randomly drawn state
env.render()

# There are 6 possible actions; the state space is explained in the .odt file.
print(
    "Action Space {}".format(env.action_space),
    "State Space {}".format(env.observation_space),
    sep="\n",
)

+---------+
|[35mR[0m: | : :G|
| : | : : |
| : : : : |
| | : | : |
|[34;1mY[0m|[43m [0m: |B: |
+---------+

Action Space Discrete(6)
State Space Discrete(500)


In [None]:
# ========================================
## UNDERSTAND VISUAL ENVIRONMENT
# ========================================

# Taxi is the filled rectangle
# Vertical bars (|) are walls the taxi cannot cross; colons (:) only separate cells and can be crossed
# R,G,Y and B are the pick-up and dropoff locations
# The blue letter is the passenger pickup location
# The purple letter is the destination

# ========================================
## ACTION  NOTATIONS
# ========================================
# 0 = south
# 1 = north
# 2 = east
# 3 = west
# 4 = pickup
# 5 = dropoff

# ====================================
## STATE NOTATIONS
# ====================================
# (taxi_row,taxi_column) with taxi_row and taxi_column in {0,1,2,3,4} ==> Current location of the taxi

# Passenger locations:
#     - 0: R(ed)
#     - 1: G(reen)
#     - 2: Y(ellow)
#     - 3: B(lue)
#     - 4: in taxi
        
# Destinations:
#     - 0: R(ed)
#     - 1: G(reen)
#     - 2: Y(ellow)
#     - 3: B(lue)

In [4]:
# ========================================
## PUT THE ENVIRONMENT IN A CHOSEN STATE
# ========================================
# encode() turns (taxi row, taxi column, passenger index, destination index) into a single
# state id between 0 and 499 (remember that the state space holds 500 discrete states).
taxi_row, taxi_col = 3, 1
passenger_idx, destination_idx = 2, 0
state = env.encode(taxi_row, taxi_col, passenger_idx, destination_idx)
print("State:", state)

# Overwrite the environment's current state with the encoded one, then draw it.
env.s = state
env.render()

State: 328
+---------+
|[35mR[0m: | : :G|
| : | : : |
| : : : : |
| |[43m [0m: | : |
|[34;1mY[0m| : |B: |
+---------+



In [5]:
# Reward table in the form of a dictionary. We can think of it as a matrix that has
# one row per state and one column per action.

## For a given state, the entry has the structure
## {action: [(probability, nextstate, reward, done)]}

# All the movement actions have a -1 reward and the pickup/dropoff actions have a -10 reward
# in this particular state. If we were in a state where the taxi has a passenger and is on
# top of the right destination, we would see a reward of 20 at the dropoff action (5).

env.P[state]

{0: [(1.0, 428, -1, False)],
 1: [(1.0, 228, -1, False)],
 2: [(1.0, 348, -1, False)],
 3: [(1.0, 328, -1, False)],
 4: [(1.0, 328, -10, False)],
 5: [(1.0, 328, -10, False)]}

In [None]:
# In summary: there is a current state and a current action, which together lead to a next state.
# If that next state is not yet the destination, the reward is -1.
# If the passenger is in the taxi and the taxi has arrived at the destination, the reward is 20.
# If the action is illegal, the reward is -10. For example, the taxi tries to pick up the passenger
# somewhere other than at the stations provided for that purpose (meaning that in the previous state
# the passenger's position was somewhere other than in the taxi), or the taxi drops the passenger off
# somewhere other than at a station (i.e. the drop-off action is taken while the taxi's
# position is somewhere other than at a station).