Esta notebook contiene bloques de código útiles para realizar Q-learning en el entorno "Pendulum"

In [116]:
import numpy as np
from pendulum_env_extended import PendulumEnvExtended
import random 

In [117]:
alpha = 0.1
gamma = 0.99
epsilon = 0.1
n_episodes = 100
epsilon_variability = 0.8

In [118]:
env = PendulumEnvExtended(render_mode='rgb_array')

Discretización de los estados

In [119]:
x_space = np.linspace(-1, 1, 10)
y_space = np.linspace(-1, 1, 10)
vel_space = np.linspace(-8, 8, 100)
x_space

array([-1.        , -0.77777778, -0.55555556, -0.33333333, -0.11111111,
        0.11111111,  0.33333333,  0.55555556,  0.77777778,  1.        ])

Obtener el estado a partir de la observación

In [120]:
def get_state(obs):
    x, y, vel = obs
    x_bin = np.digitize(x, x_space)
    y_bin = np.digitize(y, y_space)
    vel_bin = np.digitize(vel, vel_space)
    return x_bin, y_bin, vel_bin

In [121]:
state = get_state(np.array([-0.4, 0.2, 0.3])) #mapeo de estado. Nos dice el bin en el que estamos
state

(3, 6, 52)

Discretización de las acciones

In [122]:
actions = list(np.linspace(-2, 2, 10)) #minimo, maximo y cuantos bins
actionBuckets=np.linspace(-2, 2, 10)
actions

[-2.0,
 -1.5555555555555556,
 -1.1111111111111112,
 -0.6666666666666667,
 -0.22222222222222232,
 0.22222222222222232,
 0.6666666666666665,
 1.1111111111111107,
 1.5555555555555554,
 2.0]

In [123]:
def getActions(action):
    return np.digitize(action,actionBuckets)

In [124]:
def get_sample_action():
    return random.choice(actions)

Inicilización de la tabla Q

In [125]:
Q = np.zeros((len(x_space) + 1, len(y_space) + 1, len(vel_space) + 1, len(actions)))
Q

array([[[[0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         ...,
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.]],

        [[0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         ...,
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.]],

        [[0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         ...,
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.]],

        ...,

        [[0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         ...,
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
    

Obtención de la acción a partir de la tabla Q

In [126]:
def optimal_policy(state, Q):
    action = actions[np.argmax(Q[state])]
    return action

Epsilon-Greedy Policy

In [127]:
def epsilon_greedy_policy(state, Q, epsilon=0.1):
    explore = np.random.binomial(1, epsilon)
    # explore
    if explore:
        action = get_sample_action()
        print('explore')
    # exploit
    else:
        action = optimal_policy(state, Q)
        print('exploit')
        
    return action

Ejemplo de episodio 

In [128]:
obs,_ = env.reset()
print(obs)
done = False
total_reward = 0
while not done:
    state = get_state(obs)
    print('state', state)
    action = epsilon_greedy_policy(state, Q, 0.5)
    action_idx = actions.index(action)
    # Acción del ambiente
    real_action = np.array([action])

    obs, reward, done, _, _ = env.step(real_action)
    next_state = get_state(obs)
    
    total_reward += reward
    print('->', state, action, reward, obs, done)
    env.render()
print('total_reward', total_reward)

[-0.99998915 -0.00465182  0.9482235 ]
state (1, 5, 56)
explore
-> (1, 5, 56) 1.5555555555555554 -9.932730193449272 [-0.9979811  -0.06351246  1.1780679 ] False
state (1, 5, 57)
explore
-> (1, 5, 57) 1.1111111111111107 -9.614333307823554 [-0.99176675 -0.12805764  1.2971003 ] False
state (1, 4, 58)
exploit
-> (1, 4, 58) -2.0 -9.251515098776812 [-0.984993  -0.1725945  0.901057 ] False
state (1, 4, 56)
exploit
-> (1, 4, 56) -2.0 -8.894983405681883 [-0.98064965 -0.19577105  0.47161114] False
state (1, 4, 53)
explore
-> (1, 4, 53) -1.5555555555555556 -8.6950302318199 [-0.9797442  -0.200253    0.09144953] False
state (1, 4, 51)
explore
-> (1, 4, 51) -1.1111111111111112 -8.645532787081919 [-0.9819389  -0.18919845 -0.22540689] False
state (1, 4, 49)
explore
-> (1, 4, 49) -0.22222222222222232 -8.71498824245785 [-0.9855316  -0.16949166 -0.40063906] False
state (1, 4, 48)
exploit
-> (1, 4, 48) -2.0 -8.848548954651983 [-0.99170053 -0.12856908 -0.8277578 ] False
state (1, 4, 45)
explore
-> (1, 4, 45)

In [129]:
def train_policy(alpha, gamma, epsilon, epsilon_variability, episodes):

            total_rewards = []

            for episode in range(episodes):
                obs, _ = env.reset()
                done = False
                total_reward = 0
                step_count = 0

                while not done:
                    state = obs
                    epsilon = max(epsilon - epsilon_variability * epsilon, 0)  # Decay epsilon
                    action = epsilon_greedy_policy(get_state(state), Q, epsilon)
                    print(action)
                    obs, reward, done, _, _ = env.step([action])
                    print(state)
                    print(reward)
                    print(action)
                    newState = get_state(state)
                    binAction = getActions(action)
                    Q[newState[0],newState[1],newState[2],binAction] += alpha * (reward + gamma * np.max(Q[get_state(obs)]) - Q[newState[0],newState[1],newState[2],binAction])
                    total_reward += reward
                    step_count += 1

                total_rewards.append(total_reward)
                print(f" total reward: {total_reward}, steps: {step_count}")


In [130]:
train_policy(alpha, gamma, epsilon, epsilon_variability, n_episodes)

exploit
-2.0
[ 0.32430583 -0.9459523   0.19981845]
-1.5468789468497728
-2.0
exploit
-2.0
[ 0.28575626 -0.9583023  -0.8096458 ]
-1.710515703732407
-2.0
exploit
-2.0
[ 0.19707832 -0.98038775 -1.8283725 ]
-2.22182971241561
-2.0
exploit
-2.0
[ 0.0551657 -0.9984772 -2.8636634]
-3.121108009599076
-2.0
exploit
-2.0
[-0.13997121 -0.9901556  -3.9125211 ]
-4.463085777736877
-2.0
exploit
-2.0
[-0.378513  -0.925596  -4.9551377]
-6.296963593342379
-2.0
exploit
-2.0
[-0.63318086 -0.77400386 -5.949335  ]
-8.635035566819415
-2.0
exploit
-2.0
[-0.85582715 -0.51726186 -6.829838  ]
-11.417982454275055
-2.0
exploit
-2.0
[-0.9859614  -0.16697347 -7.517784  ]
-14.499393035150264
-2.0
exploit
-2.0
[-0.97380495  0.22738497 -7.943014  ]
-14.794063388415323
-2.0
exploit
-2.0
[-0.80838585  0.5886529  -8.        ]
-12.715154811670276
-2.0
exploit
-2.0
[-0.52139026  0.85331833 -7.8585105 ]
-10.670947791320964
-2.0
exploit
-2.0
[-0.17169826  0.98514956 -7.5185213 ]
-8.696084412896738
-2.0
exploit
-2.0
[ 0.18043599 

exploit
-2.0
[-0.7344375 -0.6786763 -8.       ]
-12.143061521612148
-2.0
exploit
-2.0
[-0.9407508  -0.33909884 -8.        ]
-14.219568597996625
-2.0
exploit
-2.0
[-0.9985401   0.05401489 -8.        ]
-15.936974028639568
-2.0
exploit
-2.0
[-0.898682    0.43860084 -8.        ]
-13.626932859280384
-2.0
exploit
-2.0
[-0.65803236  0.75298965 -7.9710493 ]
-11.597279184498342
-2.0
exploit
-2.0
[-0.3267732  0.9451028 -7.706307 ]
-9.566728341689311
-2.0
exploit
-2.0
[ 0.03198124  0.9994885  -7.29748   ]
-7.697256578927587
-2.0
exploit
-2.0
[ 0.36569542  0.9307346  -6.8478637 ]
-6.12473602573872
-2.0
exploit
-2.0
[ 0.64182097  0.7668545  -6.449813  ]
-4.92775477745187
-2.0
exploit
-2.0
[ 0.84448564  0.53557813 -6.1746716 ]
-4.13609951163811
-2.0
exploit
-2.0
[ 0.9659922  0.2585712 -6.072988 ]
-3.760523269678145
-2.0
exploit
-2.0
[ 0.99887633 -0.04739242 -6.17906   ]
-3.824325789169888
-2.0
exploit
-2.0
[ 0.93118715 -0.36454153 -6.514604  ]
-4.387240501006019
-2.0
exploit
-2.0
[ 0.74681205 -0.665

In [131]:
#alpha = 0.1  # Tasa de aprendizaje
#gamma = 0.99  # Factor de descuento
#epsilon = 0.1  # Tasa de exploración

#num_episodes = 1000

for episode in range(n_episodes):
    obs, _ = env.reset()
    done = False
    total_reward = 0
    total_reward_promedio = []

    while not done:
        state = get_state(obs)
        action = epsilon_greedy_policy(state, Q, epsilon) #Elije una acción
        next_obs, reward, done, _, _ = env.step(action) #Obtengo la recompensa
        total_reward_promedio.append(reward)
        next_state = get_state(next_obs)
        
        best_next_action = np.argmax(Q[next_state])
        td_target = reward + gamma * Q[next_state][best_next_action] #Calcula el target para la diferencia temporal
        td_error = td_target - Q[state][np.argmax(action)]
        Q[state][np.argmax(action)] += alpha * td_error
        
        obs = next_obs
        last_total_reward = total_reward_promedio[-10:]
        total_reward += reward

    last_rewards_mean = np.mean(last_total_reward)
    print(f'Episode {episode + 1}: Total: {last_rewards_mean}')
    print(f'Episode {episode + 1}: Total Reward: {total_reward}')



exploit


IndexError: invalid index to scalar variable.