Actividad final: Entrenamiento de un agente con aprendizaje por refuerzo - Equipo Jacob, Rillo, Samuel Padilla y Dhali

In [None]:
!pip install gymnasium



In [None]:
#------------------------------------------------------------------------------------------------------------------
#   Acrobot example for reinforcement learning (Q-learning with a neural-network approximator)
#------------------------------------------------------------------------------------------------------------------
import gymnasium as gym
import numpy as np
import random
from time import sleep
from sklearn.neural_network import MLPRegressor

###### Q-Learning ######
# Create the training environment; render_mode=None disables rendering.
env = gym.make("Acrobot-v1", render_mode=None)

# Hyperparameters used by the training loop:
#   alpha   - weight given to the new target when blending it into the old Q-value
#   gamma   - discount applied to the best predicted Q-value of the next state
#   epsilon - probability of taking a random action instead of the greedy one
alpha = 0.1
gamma = 0.6
epsilon = 0.1

# Observation and action space dimensions
obs_space = env.observation_space.shape[0]  # Length of the observation vector (the logged Acrobot states have 6 entries)
num_actions = env.action_space.n  # Number of discrete actions (Acrobot-v1 documents 3 - confirm against the Gymnasium docs)

# Neural network that maps one observation to one Q-value per action
model = MLPRegressor(hidden_layer_sizes=(50, 50), activation='relu', solver='adam', max_iter=10)

# Fit once on random data shaped (10, obs_space) -> (10, num_actions);
# presumably this only serves to initialise the network so that predict()
# can be called before the first real training step.
X = np.random.rand(10, obs_space)
y = np.random.rand(10, num_actions)
model.fit(X, y)

# Epsilon-greedy action selection
def choose_action(state, epsilon):
    """Pick an action for ``state`` using an epsilon-greedy policy.

    One uniform random number is always drawn. If it falls below
    ``epsilon`` a random action is sampled from the environment's action
    space; otherwise the action whose predicted Q-value is largest is
    returned.

    Args:
        state: Observation vector to evaluate.
        epsilon: Exploration probability; 0.0 always yields the greedy action.

    Returns:
        The index of the chosen action.
    """
    explore = np.random.rand() < epsilon
    if not explore:
        # Greedy branch: the model predicts one row of Q-values per input row.
        predicted = model.predict([state])
        best_row = predicted[0]
        return np.argmax(best_row)

    # Exploration branch: uniformly random action.
    return env.action_space.sample()

# Height of the end effector (tip of the second link)
def calculate_height(state):
    """Return the vertical position of the tip of the second link.

    The joint angles are recovered from the first four observation entries,
    read as (cos(theta1), sin(theta1), cos(theta2), sin(theta2)). Both links
    contribute with weight 1, so the result lies in [-2, 2]; -2 corresponds
    to both links hanging straight down.

    Args:
        state: Observation vector with at least four entries.

    Returns:
        ``-cos(theta1) - cos(theta1 + theta2)``.
    """
    first_joint = np.arctan2(state[1], state[0])   # theta1 from (sin, cos)
    second_joint = np.arctan2(state[3], state[2])  # theta2, relative to link 1
    # Absolute angle of the second link is the sum of both joint angles.
    second_link = first_joint + second_joint
    return -np.cos(first_joint) - np.cos(second_link)

# Training phase
# Runs `episodes` episodes of epsilon-greedy Q-learning, updating the network
# after every environment step with a single partial_fit call.
episodes = 1000
for i in range(episodes):
    state = env.reset()[0]  # Reset the environment and get the initial state
    epochs = 0
    terminated = False
    truncated = False

    while not terminated and not truncated:

        # Select the next action (epsilon-greedy)
        action = choose_action(state, epsilon)

        # Perform the selected action
        next_state, reward, terminated, truncated, info = env.step(action)

        # Replace the environment reward with a shaped one.
        # The bonus is paid only when the goal was actually reached
        # (terminated). A truncated episode just ran out of time, so it must
        # not be rewarded as a success; it gets the ordinary shaped reward.
        if terminated:
            reward = 100
        else:
            current_height = calculate_height(state)
            next_height = calculate_height(next_state)

            # Reward based on change in height
            height_gain = next_height - current_height

            reward = height_gain * 10

        # Get current and next Q-values
        q_values = model.predict([state])[0]
        next_q_values = model.predict([next_state])[0]

        # Temporal-difference target. A terminal state has no future return,
        # so nothing is bootstrapped from it; after a truncation the state is
        # not terminal and bootstrapping remains valid.
        if terminated:
            target = reward
        else:
            target = reward + gamma * np.max(next_q_values)

        # Blend the target into the Q-value of the action that was taken
        q_values[action] = (1 - alpha) * q_values[action] + alpha * target

        # Train the model with the new Q-values
        model.partial_fit([state], [q_values])

        # Update state and count epochs
        state = next_state
        epochs += 1

    # Show training status
    print(f"Episode: {i}, epochs: {epochs}")

print("Training finished.\n")

# Execution phase
# Plays 10 greedy episodes with on-screen rendering and logs every step.

# Release the training environment before its name is rebound below.
env.close()

# Keep the environment exactly as gym.make returns it. Taking `.env` would
# strip the outermost wrapper, which enforces the episode step limit; without
# it `truncated` is never set and an agent that cannot reach the goal would
# loop forever.
env = gym.make("Acrobot-v1", render_mode='human')

try:
    for k in range(10):
        # With render_mode='human' the environment draws itself on reset()
        # and step(), so no explicit render() calls are needed.
        state = env.reset()[0]

        epochs = 0
        terminated = False
        truncated = False

        while not terminated and not truncated:

            # Select the action with the best Q-value (no exploration)
            action = choose_action(state, epsilon=0.0)

            # Perform the selected action
            state, reward, terminated, truncated, info = env.step(action)

            # Show current state
            print(f'State: {state}  Action: {action}  Reward: {reward}')

            # Count epochs
            epochs += 1

            sleep(.02)

        print(f"Timesteps taken: {epochs}")
finally:
    # Always close the render window, even if playback is interrupted.
    env.close()

#------------------------------------------------------------------------------------------------------------------
#   End of file
#------------------------------------------------------------------------------------------------------------------



[1;30;43mSe truncaron las últimas líneas 5000 del resultado de transmisión.[0m
State: [ 0.99994934  0.01006709  0.9996066   0.02804728 -0.01042526 -0.02900115]  Action: 1  Reward: -1.0
State: [ 0.9999732   0.00732514  0.99979144  0.02042247 -0.01667161 -0.04637649]  Action: 1  Reward: -1.0
State: [ 0.9999937   0.00355643  0.99995065  0.00993646 -0.02057347 -0.05726972]  Action: 1  Reward: -1.0
State: [ 9.9999976e-01 -7.0966710e-04  9.9999809e-01 -1.9438440e-03
 -2.1588493e-02 -6.0146395e-02]  Action: 1  Reward: -1.0
State: [ 0.99998814 -0.0048749   0.99990815 -0.01355334 -0.01957776 -0.05459777]  Action: 1  Reward: -1.0
State: [ 0.9999651  -0.00835543  0.99972934 -0.02326479 -0.01482257 -0.04140312]  Action: 1  Reward: -1.0
State: [ 0.99994314 -0.01066322  0.99955827 -0.02971908 -0.00798821 -0.02241412]  Action: 1  Reward: -1.0
State: [ 9.9993414e-01 -1.1474715e-02  9.9948740e-01 -3.2014322e-02
 -3.5207479e-05 -2.8614359e-04]  Action: 1  Reward: -1.0
State: [ 0.999943   -0.01067702  