Esta notebook contiene bloques de código útiles para realizar Q-learning en el entorno "Pendulum"

In [43]:
import numpy as np
from pendulum_env_extended import PendulumEnvExtended
import random 

In [44]:
alpha = 0.1
gamma = 0.99
epsilon = 0.1
n_episodes = 100
epsilon_variability = 0.8

In [45]:
env = PendulumEnvExtended(render_mode='rgb_array')

Discretización de los estados

In [46]:
x_space = np.linspace(-1, 1, 10)
y_space = np.linspace(-1, 1, 10)
vel_space = np.linspace(-8, 8, 100)
x_space

array([-1.        , -0.77777778, -0.55555556, -0.33333333, -0.11111111,
        0.11111111,  0.33333333,  0.55555556,  0.77777778,  1.        ])

Obtener el estado a partir de la observación

In [47]:
def get_state(obs):
    x, y, vel = obs
    x_bin = np.digitize(x, x_space)
    y_bin = np.digitize(y, y_space)
    vel_bin = np.digitize(vel, vel_space)
    return x_bin, y_bin, vel_bin

In [48]:
state = get_state(np.array([-0.4, 0.2, 0.3])) #mapeo de estado. Nos dice el bin en el que estamos
state

(3, 6, 52)

Discretización de las acciones

In [49]:
actions = list(np.linspace(-2, 2, 10)) #minimo, maximo y cuantos bins
actions

[-2.0,
 -1.5555555555555556,
 -1.1111111111111112,
 -0.6666666666666667,
 -0.22222222222222232,
 0.22222222222222232,
 0.6666666666666665,
 1.1111111111111107,
 1.5555555555555554,
 2.0]

In [50]:
def get_sample_action():
    return np.array([random.choice(actions)], dtype=np.float32)

Inicilización de la tabla Q

In [51]:
Q = np.zeros((len(x_space) + 1, len(y_space) + 1, len(vel_space) + 1, len(actions)))
Q

array([[[[0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         ...,
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.]],

        [[0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         ...,
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.]],

        [[0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         ...,
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.]],

        ...,

        [[0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         ...,
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
    

Obtención de la acción a partir de la tabla Q

In [52]:
def optimal_policy(state, Q):
    action = np.argmax(Q[state])
    return action

Epsilon-Greedy Policy

In [53]:
def epsilon_greedy_policy(state, Q, epsilon=0.1):
    explore = np.random.binomial(1, epsilon)
    # explore
    if explore:
        action = get_sample_action()
        print('explore')
    # exploit
    else:
        action = np.array([np.argmax(Q[get_state(state)])], dtype=np.int32)
        print('exploit')
        
    return action

Ejemplo de episodio 

In [54]:
obs,_ = env.reset()
print(obs)
done = False
total_reward = 0
while not done:
    state = get_state(obs)
    print('state', state)
    action = epsilon_greedy_policy(state, Q, 0.5)
    obs, reward, done, _, _ = env.step(action)
    total_reward += reward
    print('->', state, action, reward, obs, done)
    env.render()
print('total_reward', total_reward)

[-0.76230913 -0.6472131   0.635138  ]
state (2, 2, 54)
explore
-> (2, 2, 54) [1.5555556] -5.984993757612033 [-0.7497739  -0.66169405  0.3830615 ] False
state (2, 2, 52)
exploit
-> (2, 2, 52) [0] -5.863896684194426 [-0.7535074  -0.6574394  -0.11320904] False
state (2, 2, 49)
exploit
-> (2, 2, 49) [0] -5.877916538034134 [-0.77308804 -0.63429874 -0.6062886 ] False
state (2, 2, 46)
explore
-> (2, 2, 46) [-2.] -6.06528756211143 [-0.8150386 -0.5794066 -1.3820126] False
state (1, 2, 41)
explore
-> (1, 2, 41) [1.1111112] -6.56074817764114 [-0.86001086 -0.5102757  -1.6499009 ] False
state (1, 3, 40)
exploit
-> (1, 3, 40) [0] -7.063908162662156 [-0.90734357 -0.4203899  -2.0326078 ] False
state (1, 3, 37)
exploit
-> (1, 3, 37) [0] -7.744884320561922 [-0.9503368 -0.3112233 -2.3479002] False
state (1, 4, 35)
explore
-> (1, 4, 35) [2.] -8.536525073670107 [-0.9795841  -0.20103474 -2.2813177 ] False
state (1, 4, 36)
explore
-> (1, 4, 36) [0.22222222] -9.159260912324315 [-0.99660075 -0.08238262 -2.3987

In [55]:
def train_policy(alpha, gamma, epsilon, epsilon_variability, episodes):

            total_rewards = []

            for episode in range(episodes):
                obs, _ = env.reset()
                done = False
                total_reward = 0
                step_count = 0

                while not done:
                    state = obs
                    epsilon = max(epsilon - epsilon_variability * epsilon, 0)  # Decay epsilon
                    action = epsilon_greedy_policy(state, Q, epsilon)
                    obs, reward, done, _, _ = env.step(action)
                    print(state)
                    print(reward)
                    print(action)
                    newState = get_state(state)
                    Q[newState[0],newState[1],newState[2],action] += alpha * (reward + gamma * np.max(Q[get_state(obs)]) - Q[newState[0],newState[1],newState[2],action])
                    total_reward += reward
                    step_count += 1

                total_rewards.append(total_reward)
                print(f" total reward: {total_reward}, steps: {step_count}")


In [56]:
train_policy(alpha, gamma, epsilon, epsilon_variability, n_episodes)

exploit
[-0.8276551  -0.5612371  -0.03291355]
-6.480762605257869
[0]
exploit
[-0.8401765  -0.542313   -0.45384136]
-6.617301386225504
[0]
exploit
[-0.86272675 -0.5056704  -0.8605761 ]
-6.89364558975864
[0]
exploit
[-0.89239675 -0.45125166 -1.239829  ]
-7.300920090697748
[0]
exploit
[-0.9251924  -0.37949836 -1.5782677 ]
-7.824460774648157
[0]
exploit
[-0.956479  -0.2918012 -1.8628914]
-8.443811051514066
[0]
exploit
[-0.9816203  -0.19084446 -2.0817423 ]
-9.13333012397252
[0]
exploit
[-0.9967392  -0.08069074 -2.2248757 ]
-9.863590086580569
[0]
exploit
[-0.99943924  0.03348486 -2.2853937 ]
-10.183597552488035
[1]
exploit
[-0.99035436  0.13855791 -2.11028   ]
-9.46086003862912
[0]
exploit
[-0.9714986   0.23704518 -2.0063617 ]
-8.825716618580854
[0]
exploit
[-0.94579834  0.32475445 -1.8285778 ]
-8.235194283802203
[0]
exploit
[-0.91711974  0.3986118  -1.585012  ]
-7.712809980839364
[0]
exploit
[-0.8896102  0.4567206 -1.2860531]
-7.27981482055796
[0]
exploit
[-0.86708236  0.4981648  -0.9435126

In [57]:
#alpha = 0.1  # Tasa de aprendizaje
#gamma = 0.99  # Factor de descuento
#epsilon = 0.1  # Tasa de exploración

#num_episodes = 1000

for episode in range(n_episodes):
    obs, _ = env.reset()
    done = False
    total_reward = 0
    total_reward_promedio = []

    while not done:
        state = get_state(obs)
        action = epsilon_greedy_policy(state, Q, epsilon) #Elije una acción
        next_obs, reward, done, _, _ = env.step(action) #Obtengo la recompensa
        total_reward_promedio.append(reward)
        next_state = get_state(next_obs)
        
        best_next_action = np.argmax(Q[next_state])
        td_target = reward + gamma * Q[next_state][best_next_action] #Calcula el target para la diferencia temporal
        td_error = td_target - Q[state][np.argmax(action)]
        Q[state][np.argmax(action)] += alpha * td_error
        
        obs = next_obs
        last_total_reward = total_reward_promedio[-10:]
        total_reward += reward

    last_rewards_mean = np.mean(last_total_reward)
    print(f'Episode {episode + 1}: Total: {last_rewards_mean}')
    print(f'Episode {episode + 1}: Total Reward: {total_reward}')



exploit
exploit
exploit
exploit
exploit
exploit
exploit
exploit
exploit
exploit
exploit
exploit
exploit
explore
exploit
exploit
exploit
exploit
exploit
exploit
exploit
exploit
exploit
exploit
explore
exploit
exploit
exploit
exploit
exploit
exploit
exploit
exploit
exploit
exploit
exploit
exploit
exploit
exploit
exploit
exploit
exploit
explore
exploit
exploit
exploit
exploit
exploit
exploit
explore
exploit
explore
exploit
exploit
exploit
exploit
exploit
exploit
exploit
exploit
exploit
explore
exploit
exploit
exploit
explore
exploit
exploit
exploit
exploit
exploit
exploit
exploit
exploit
exploit
exploit
exploit
exploit
exploit
explore
exploit
exploit
exploit
exploit
exploit
exploit
exploit
exploit
explore
exploit
exploit
exploit
exploit
exploit
exploit
exploit
exploit
exploit
exploit
exploit
exploit
exploit
exploit
exploit
exploit
explore
exploit
exploit
exploit
exploit
exploit
exploit
exploit
exploit
exploit
exploit
exploit
exploit
exploit
exploit
exploit
exploit
exploit
explore
exploit


exploit
exploit
exploit
exploit
exploit
exploit
exploit
exploit
exploit
exploit
exploit
exploit
explore
explore
exploit
exploit
exploit
exploit
exploit
exploit
exploit
exploit
exploit
exploit
exploit
exploit
exploit
exploit
exploit
exploit
exploit
exploit
exploit
exploit
exploit
exploit
exploit
exploit
exploit
exploit
exploit
explore
exploit
exploit
exploit
exploit
exploit
explore
exploit
exploit
exploit
exploit
exploit
exploit
exploit
exploit
exploit
exploit
exploit
exploit
exploit
exploit
explore
exploit
exploit
exploit
exploit
exploit
exploit
exploit
exploit
exploit
exploit
exploit
exploit
exploit
exploit
exploit
exploit
exploit
exploit
exploit
exploit
exploit
exploit
exploit
exploit
explore
exploit
exploit
explore
exploit
exploit
exploit
exploit
exploit
exploit
exploit
explore
exploit
exploit
exploit
exploit
exploit
exploit
exploit
exploit
exploit
exploit
explore
exploit
exploit
exploit
exploit
exploit
exploit
exploit
exploit
exploit
exploit
exploit
exploit
exploit
exploit
exploit
