In [None]:
!pip install swig
!pip install gymnasium[box2d]


Collecting swig
  Downloading swig-4.2.1-py2.py3-none-manylinux_2_5_x86_64.manylinux1_x86_64.whl (1.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: swig
Successfully installed swig-4.2.1
Collecting gymnasium[box2d]
  Downloading gymnasium-0.29.1-py3-none-any.whl (953 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m953.9/953.9 kB[0m [31m12.5 MB/s[0m eta [36m0:00:00[0m
Collecting farama-notifications>=0.0.1 (from gymnasium[box2d])
  Downloading Farama_Notifications-0.0.4-py3-none-any.whl (2.5 kB)
Collecting box2d-py==2.3.5 (from gymnasium[box2d])
  Downloading box2d-py-2.3.5.tar.gz (374 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m374.4/374.4 kB[0m [31m30.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: box2d-py
  Building wheel for box2d-py (setu

In [None]:
import gymnasium as gym
import numpy as np
from collections import defaultdict
import random

# Build the BipedalWalker environment shared by every later cell.
env = gym.make('BipedalWalker-v3')

# Short handles to both spaces; the discretisation helpers read their bounds.
obs, act = env.observation_space, env.action_space

# Show the bounds, shape and dtype of each space.
print(f"Observation Space: {obs}", f"Action Space: {act}", sep="\n")


Observation Space: Box([-3.1415927 -5.        -5.        -5.        -3.1415927 -5.
 -3.1415927 -5.        -0.        -3.1415927 -5.        -3.1415927
 -5.        -0.        -1.        -1.        -1.        -1.
 -1.        -1.        -1.        -1.        -1.        -1.       ], [3.1415927 5.        5.        5.        3.1415927 5.        3.1415927
 5.        5.        3.1415927 5.        3.1415927 5.        5.
 1.        1.        1.        1.        1.        1.        1.
 1.        1.        1.       ], (24,), float32)
Action Space: Box(-1.0, 1.0, (4,), float32)


In [None]:
# Discretisation resolution for the continuous observation and action spaces.
obs_buckets = 20  # number of buckets per observation dimension
act_buckets = 10  # number of discrete levels per action dimension

episodes = 10000  # number of training episodes

# Q-learning hyperparameters.
gamma = 0.99  # discount applied to the best next-state value
alpha = 0.01  # learning rate (weight of the new estimate in each update)

# Sparse Q-table: a state seen for the first time gets an all-zero array with
# one axis per action dimension and `act_buckets` entries along each axis.
# The rank is taken from the action space rather than hard-coded to 4, so it
# always matches the actions produced by the epsilon-greedy policy.
qtable = defaultdict(lambda: np.zeros((act_buckets,) * act.shape[0]))

# discretize each observation state component in range [0, obs_buckets)
def discretizeState(state, low=None, high=None, buckets=None):
    """Map a continuous observation onto a hashable tuple of bucket indices.

    Args:
        state: array-like observation, one value per dimension.
        low: per-dimension lower bounds; defaults to ``obs.low``.
        high: per-dimension upper bounds; defaults to ``obs.high``.
        buckets: number of buckets per dimension; defaults to ``obs_buckets``.

    Returns:
        Tuple of integer indices, each in ``[0, buckets - 1]``.
    """
    low = obs.low if low is None else low
    high = obs.high if high is None else high
    buckets = obs_buckets if buckets is None else buckets

    # Clamp first: a value outside [low, high] would otherwise produce an index
    # outside [0, buckets), breaking the documented range and adding spurious
    # keys to the sparse Q-table.
    bounded = np.clip(state, low, high)

    # Normalise to [0, 1], stretch to [0, buckets - 1], round to nearest bucket.
    unit = (bounded - low) / (high - low)
    return tuple(np.round(unit * (buckets - 1)).astype(int))

# discretize each action component in range [0, act_buckets)
def discretizeAction(action, low=None, high=None, buckets=None):
    """Map a continuous action onto a tuple of bucket indices.

    Args:
        action: array-like continuous action, one value per dimension.
        low: per-dimension lower bounds; defaults to ``act.low``.
        high: per-dimension upper bounds; defaults to ``act.high``.
        buckets: number of levels per dimension; defaults to ``act_buckets``.

    Returns:
        Tuple of integer indices, each in ``[0, buckets - 1]``, usable
        directly as an index into a per-state Q-value array.
    """
    low = act.low if low is None else low
    high = act.high if high is None else high
    buckets = act_buckets if buckets is None else buckets

    # Clamp first: an out-of-range component would give a negative index (which
    # NumPy silently wraps around) or one past the end (IndexError) when the
    # result is used to address a Q-value array.
    bounded = np.clip(action, low, high)

    unit = (bounded - low) / (high - low)
    return tuple(np.round(unit * (buckets - 1)).astype(int))

# recover the action corresponding to the bucket index
def undiscretizeAction(action, low=None, high=None, buckets=None):
    """Convert per-dimension bucket indices back into continuous action values.

    Index 0 maps to the lower bound and index ``buckets - 1`` to the upper
    bound, with the remaining indices evenly spaced in between.

    Args:
        action: sequence or array of bucket indices. Tuples and lists are
            accepted, so the output of ``discretizeAction`` can be passed
            straight back in.
        low: per-dimension lower bounds; defaults to ``act.low``.
        high: per-dimension upper bounds; defaults to ``act.high``.
        buckets: number of levels per dimension; defaults to ``act_buckets``.

    Returns:
        Tuple of continuous action values, one per dimension.
    """
    low = act.low if low is None else low
    high = act.high if high is None else high
    buckets = act_buckets if buckets is None else buckets

    # Coerce to an array: dividing a plain tuple/list by an int raises TypeError.
    indices = np.asarray(action)
    continuous = (indices / (buckets - 1)) * (high - low) + low
    return tuple(continuous)

def epsilonGreedyStrategy(qtable, epsilon, state):
    """Choose a discrete action for `state` with an epsilon-greedy policy.

    With probability `epsilon` a uniformly random bucket index is drawn for
    every action dimension; otherwise the multi-index of the largest Q-value
    stored for `state` is returned.

    Returns:
        NumPy integer array holding one bucket index per action dimension.
    """
    explore = random.random() < epsilon
    if explore:
        # Exploration: independent random bucket per action dimension.
        return np.random.randint(0, act_buckets, size=act.shape)

    # Exploitation: argmax works on the flattened array, so convert the flat
    # position back into one index per axis.
    q_values = qtable[state]
    best_flat_index = np.argmax(q_values)
    return np.array(np.unravel_index(best_flat_index, q_values.shape))

def updateQTable(qtable, state, action, reward, next_state, lr=None, discount=None):
    """Apply one Q-learning update to ``qtable[state][action]`` in place.

    Update rule::

        Q(s, a) <- (1 - lr) * Q(s, a) + lr * (reward + discount * max_a' Q(s', a'))

    Args:
        qtable: mapping from discrete state to an array of Q-values.
        state: discrete state in which the action was taken.
        action: bucket indices of the action taken (tuple, list or array).
        reward: reward received for the transition.
        next_state: discrete state reached after the action.
        lr: learning rate; defaults to the notebook-level ``alpha``.
        discount: discount factor; defaults to the notebook-level ``gamma``.
    """
    lr = alpha if lr is None else lr
    discount = gamma if discount is None else discount

    # Index with a tuple: a NumPy array used as an index triggers fancy
    # indexing along the first axis and would read/update whole slabs of the
    # Q-array instead of the single (state, action) cell.
    cell = tuple(action)
    q_sa = qtable[state][cell]

    # A next state that was never visited contributes 0. The membership test
    # (rather than indexing) avoids creating an entry for it as a side effect.
    max_next_value = np.max(qtable[next_state]) if next_state in qtable else 0

    qtable[state][cell] = q_sa * (1 - lr) + lr * (reward + discount * max_next_value)


#sample_state = np.random.uniform(low=obs.low, high=obs.high, size=obs.shape)
#discrete_observation = discretizeState(sample_state)
#print(sample_state)
#print(discrete_observation)



In [None]:
# Tabular Q-learning training loop. The environment is closed in `finally` so
# it is released even when the run is interrupted part-way through.
try:
    for episode in range(1, episodes + 1):
        total_reward = 0
        init = env.reset()[0]  # reset() returns (observation, info)
        state = discretizeState(init)  # discretise the initial observation

        # Exploration rate decays with the episode index (1, 1/2, 1/3, ...).
        # The previous `1.0 / episodes` was a loop-invariant constant (~1e-4),
        # which left the agent with essentially no exploration.
        epsilon = 1.0 / episode

        while True:
            # Pick bucket indices with the epsilon-greedy policy, then map them
            # back to the continuous values the environment expects.
            action = epsilonGreedyStrategy(qtable, epsilon, state)
            continuous_action = undiscretizeAction(action)

            next_state, reward, done, truncated, info = env.step(continuous_action)
            next_state = discretizeState(next_state)  # discretise the new observation
            action = tuple(action)  # tuple -> addresses a single Q-table cell
            updateQTable(qtable, state, action, reward, next_state)
            total_reward += reward
            state = next_state

            # An episode ends on termination OR truncation (e.g. a step limit);
            # checking only `done` would keep stepping past a truncated episode.
            if done or truncated:
                break

        # Report the total reward collected in this episode.
        print(f"Episode {episode}/{episodes}, Total Reward: {total_reward}")
finally:
    # Close the environment
    env.close()

Episode 1/10000, Total Reward: -101.58378074409089
Episode 2/10000, Total Reward: -144.49624309135288
Episode 3/10000, Total Reward: -101.08297441289864
Episode 4/10000, Total Reward: -101.33267605988102
Episode 5/10000, Total Reward: -101.08669024970776
Episode 6/10000, Total Reward: -101.11505787293986
Episode 7/10000, Total Reward: -101.30026276934002
Episode 8/10000, Total Reward: -104.95009796253468
Episode 9/10000, Total Reward: -101.4757148174002
Episode 10/10000, Total Reward: -106.86069052189963
Episode 11/10000, Total Reward: -101.24928716641003
Episode 12/10000, Total Reward: -108.69415430525237
Episode 13/10000, Total Reward: -108.53209653561046
Episode 14/10000, Total Reward: -106.80622283713707
Episode 15/10000, Total Reward: -106.19200884975412
Episode 16/10000, Total Reward: -108.80179164756339
Episode 17/10000, Total Reward: -107.89550263005931
Episode 18/10000, Total Reward: -107.4233123044262
Episode 19/10000, Total Reward: -106.77653317297232
Episode 20/10000, Total

KeyboardInterrupt: 

In [None]:
# Scratch check of the argmax -> multi-index lookup used by the greedy policy.
# It runs on a throw-away array: the previous version depended on a
# `discrete_observation` that is only defined in commented-out code (NameError
# on a fresh kernel) and overwrote a Q-table entry with an array whose shape
# differs from the one the Q-table's default factory creates.
demo_values = np.random.uniform(low=act.low, high=act.high, size=act.shape)
print(demo_values)
print(np.argmax(demo_values))
print(demo_values.shape)
print(np.unravel_index(np.argmax(demo_values), demo_values.shape))

In [None]:
# Size and shape of the Q-value array stored for the all-zero discrete state.
# The key length comes from the observation space instead of a hard-coded 24.
# Note: indexing the defaultdict creates the entry if it does not exist yet.
zero_state = tuple(np.zeros(obs.shape, dtype=int))
print(qtable[zero_state].size, qtable[zero_state].shape)

In [None]:
# Q-values stored for the all-zero discrete state. The key length comes from
# the observation space instead of a hard-coded 24.
# Note: indexing the defaultdict creates the entry if it does not exist yet.
print(qtable[tuple(np.zeros(obs.shape, dtype=int))])

[[[[0. 0. 0. 0. 0.]
   [0. 0. 0. 0. 0.]
   [0. 0. 0. 0. 0.]
   [0. 0. 0. 0. 0.]
   [0. 0. 0. 0. 0.]]

  [[0. 0. 0. 0. 0.]
   [0. 0. 0. 0. 0.]
   [0. 0. 0. 0. 0.]
   [0. 0. 0. 0. 0.]
   [0. 0. 0. 0. 0.]]

  [[0. 0. 0. 0. 0.]
   [0. 0. 0. 0. 0.]
   [0. 0. 0. 0. 0.]
   [0. 0. 0. 0. 0.]
   [0. 0. 0. 0. 0.]]

  [[0. 0. 0. 0. 0.]
   [0. 0. 0. 0. 0.]
   [0. 0. 0. 0. 0.]
   [0. 0. 0. 0. 0.]
   [0. 0. 0. 0. 0.]]

  [[0. 0. 0. 0. 0.]
   [0. 0. 0. 0. 0.]
   [0. 0. 0. 0. 0.]
   [0. 0. 0. 0. 0.]
   [0. 0. 0. 0. 0.]]]


 [[[0. 0. 0. 0. 0.]
   [0. 0. 0. 0. 0.]
   [0. 0. 0. 0. 0.]
   [0. 0. 0. 0. 0.]
   [0. 0. 0. 0. 0.]]

  [[0. 0. 0. 0. 0.]
   [0. 0. 0. 0. 0.]
   [0. 0. 0. 0. 0.]
   [0. 0. 0. 0. 0.]
   [0. 0. 0. 0. 0.]]

  [[0. 0. 0. 0. 0.]
   [0. 0. 0. 0. 0.]
   [0. 0. 0. 0. 0.]
   [0. 0. 0. 0. 0.]
   [0. 0. 0. 0. 0.]]

  [[0. 0. 0. 0. 0.]
   [0. 0. 0. 0. 0.]
   [0. 0. 0. 0. 0.]
   [0. 0. 0. 0. 0.]
   [0. 0. 0. 0. 0.]]

  [[0. 0. 0. 0. 0.]
   [0. 0. 0. 0. 0.]
   [0. 0. 0. 0. 0.]
   [0. 0. 0. 0. 0.]
