In [1]:
import argparse
import logging
import os
import random
import time
import math

import gymnasium as gym
import numpy as np
import torch

from ddpg import DDPG
from utils.noise import OrnsteinUhlenbeckActionNoise
from utils.replay_memory import ReplayMemory, Transition

In [2]:
class NormalizedActions(gym.ActionWrapper):

    def action(self, action):
        """
        Normalizes the actions to be in between action_space.high and action_space.low.
        If action_space.low == -action_space.high, this is equals to action_space.high*action.

        :param action:
        :return: normalized action
        """
        action = (action + 1) / 2  # [-1, 1] => [0, 1]
        action *= (self.action_space.high - self.action_space.low)
        action += self.action_space.low
        return action

    def reverse_action(self, action):
        """
        Reverts the normalization

        :param action:
        :return:
        """
        action -= self.action_space.low
        action /= (self.action_space.high - self.action_space.low)
        action = action * 2 - 1
        return action

In [3]:
# Create logger
logger = logging.getLogger('train')
logger.setLevel(logging.INFO)

# Libdom raises an error if this is not set to true on Mac OSX
# see https://github.com/openai/spinningup/issues/16 for more information
os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'

# Parse given arguments
# gamma, tau, hidden_size, replay_size, batch_size, hidden_size are taken from the original paper
parser = argparse.ArgumentParser()
parser.add_argument("--env", default="Pendulum-v1", help="the environment on which the agent should be trained "
                         "(Default: Pendulum-v1")
parser.add_argument("--target_angle", default=0, type=int,
                    help="Target angle to control (default: 0 degrees)")
parser.add_argument("--render_train", default=False, type=bool,
                    help="Render the training steps (default: False)")
parser.add_argument("--render_eval", default=False, type=bool,
                    help="Render the evaluation steps (default: False)")
parser.add_argument("--load_model", default=False, type=bool,
                    help="Load a pretrained model (default: False)")
parser.add_argument("--save_dir", default="./models/",
                    help="Dir. path to save and load a model (default: ./models/)")
parser.add_argument("--seed", default=0, type=int,
                    help="Random seed (default: 0)")
parser.add_argument("--timesteps", default=1e6, type=int,
                    help="Num. of total timesteps of training (default: 1e6)")
parser.add_argument("--batch_size", default=64, type=int,
                    help="Batch size (default: 64; OpenAI: 128)")
parser.add_argument("--replay_size", default=1e6, type=int,
                    help="Size of the replay buffer (default: 1e6; OpenAI: 1e5)")
parser.add_argument("--gamma", default=0.99,
                    help="Discount factor (default: 0.99)")
parser.add_argument("--tau", default=0.001,
                    help="Update factor for the soft update of the target networks (default: 0.001)")
parser.add_argument("--noise_stddev", default=0.2, type=int,
                    help="Standard deviation of the OU-Noise (default: 0.2)")
parser.add_argument("--hidden_size", nargs=2, default=[400, 300], type=tuple,
                    help="Num. of units of the hidden layers (default: [400, 300]; OpenAI: [64, 64])")
parser.add_argument("--n_test_cycles", default=10, type=int,
                    help="Num. of episodes in the evaluation phases (default: 10; OpenAI: 20)")
args = parser.parse_args()

# if gpu is to be used
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
logger.info("Using {}".format(device))

args = argparse.Namespace(
    env="Pendulum-v1",
    target_angle=0, 
    render_train=True, 
    render_eval=False, 
    load_model=False, 
    save_dir="./models/", 
    seed=0, 
    timesteps=1_000, 
    batch_size=64, 
    replay_size=1_000_000, 
    gamma=0.99, 
    tau=0.001, 
    noise_stddev= 0.2, 
    hidden_size=[400, 300], 
    n_test_cycles=10)

usage: ipykernel_launcher.py [-h] [--env ENV] [--target_angle TARGET_ANGLE]
                             [--render_train RENDER_TRAIN]
                             [--render_eval RENDER_EVAL]
                             [--load_model LOAD_MODEL] [--save_dir SAVE_DIR]
                             [--seed SEED] [--timesteps TIMESTEPS]
                             [--batch_size BATCH_SIZE]
                             [--replay_size REPLAY_SIZE] [--gamma GAMMA]
                             [--tau TAU] [--noise_stddev NOISE_STDDEV]
                             [--hidden_size HIDDEN_SIZE HIDDEN_SIZE]
                             [--n_test_cycles N_TEST_CYCLES]
ipykernel_launcher.py: error: unrecognized arguments: --f=c:\Users\Kley2\AppData\Roaming\jupyter\runtime\kernel-v2-22336HWSz5qrTivdx.json


SystemExit: 2

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


In [None]:
def calculate_reward(observ, torque, target_angle): # Todos los valores estan en radianes
		theta = math.atan2(observ[1],observ[0])
		theta_dot = observ[2]
		
		theta_n = ((theta + np.pi) % (2*np.pi)) - np.pi

		theta_error = np.abs(theta_n - target_angle)
		
		#torque_castigo = (torque**2) - np.minimum(2-np.absolute(torque),0)
		torque_castigo = 0.001 * (torque**2)
		costs = (theta_error**2) + 0.1 * (theta_dot**2) + torque_castigo
		if theta_error <= 0.087: # ~ 5°
			reward_n = -costs + math.exp(-(8*theta_error)**2)
		else:
			reward_n = -costs
		# reward_n = -costs
		return reward_n

In [None]:
if __name__ == "__main__":

    target_angle = args.target_angle

    # Define the directory where to save and load models
    checkpoint_dir = args.save_dir + args.env
    writer = SummaryWriter('runs/run_1')

    # Create the env
    kwargs = dict()
    if args.env == 'RoboschoolInvertedPendulumSwingup-v1':
        # 'swingup=True' must be passed as an argument
        # See pull request 'https://github.com/openai/roboschool/pull/192'
        kwargs['swingup'] = True
    env = gym.make(args.env, **kwargs)
    env = NormalizedActions(env)

    # Define the reward threshold when the task is solved (if existing) for model saving
    reward_threshold = gym.spec(args.env).reward_threshold if gym.spec(
        args.env).reward_threshold is not None else np.inf

    # Set random seed for all used libraries where possible
    env.seed(args.seed)
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)
    random.seed(args.seed)

    if torch.cuda.is_available():
        torch.cuda.manual_seed(args.seed)
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False

    # Define and build DDPG agent
    hidden_size = tuple(args.hidden_size)
    agent = DDPG(args.gamma,
                 args.tau,
                 hidden_size,
                 env.observation_space.shape[0],
                 env.action_space,
                 checkpoint_dir=checkpoint_dir
                 )

    # Initialize replay memory
    memory = ReplayMemory(int(args.replay_size))

    # Initialize OU-Noise
    nb_actions = env.action_space.shape[-1]
    ou_noise = OrnsteinUhlenbeckActionNoise(mu=np.zeros(nb_actions),
                                            sigma=float(args.noise_stddev) * np.ones(nb_actions))

    # Define counters and other variables
    start_step = 0
    # timestep = start_step
    if args.load_model:
        # Load agent if necessary
        start_step, memory = agent.load_checkpoint()
    timestep = start_step // 10000 + 1
    rewards, policy_losses, value_losses, mean_test_rewards = [], [], [], []
    epoch = 0
    t = 0
    time_last_checkpoint = time.time()

    # Start training
    logger.info('Train agent on {} env'.format({env.unwrapped.spec.id}))
    logger.info('Doing {} timesteps'.format(args.timesteps))
    logger.info('Start at timestep {0} with t = {1}'.format(timestep, t))
    logger.info('Start training at {}'.format(time.strftime('%a, %d %b %Y %H:%M:%S GMT', time.localtime())))

    while timestep <= args.timesteps:
        ou_noise.reset()
        epoch_return = 0

        state = torch.Tensor([env.reset()]).to(device)
        while True:
            if args.render_train:
                env.render()

            action = agent.calc_action(state, ou_noise)
            #next_state, reward, done, _ = env.step(action.cpu().numpy()[0])
            next_state, reward, terminated, truncated, _ = env.step(action.cpu().numpy()[0])
            done = terminated or truncated
            timestep += 1

            reward = calculate_reward(next_state, action.item(), target_angle)

            epoch_return += reward

            mask = torch.Tensor([done]).to(device)
            reward = torch.Tensor([reward]).to(device)
            next_state = torch.Tensor([next_state]).to(device)

            memory.push(state, action, mask, next_state, reward)

            state = next_state

            epoch_value_loss = 0
            epoch_policy_loss = 0

            if len(memory) > args.batch_size:
                transitions = memory.sample(args.batch_size)
                # Transpose the batch
                # (see http://stackoverflow.com/a/19343/3343043 for detailed explanation).
                batch = Transition(*zip(*transitions))

                # Update actor and critic according to the batch
                value_loss, policy_loss = agent.update_params(batch)

                epoch_value_loss += value_loss
                epoch_policy_loss += policy_loss

            if done:
                break

        rewards.append(epoch_return)
        value_losses.append(epoch_value_loss)
        policy_losses.append(epoch_policy_loss)
        writer.add_scalar('epoch/return', epoch_return, epoch)

        # Test every 10th episode (== 1e4) steps for a number of test_epochs epochs
        if timestep >= 10000 * t:
            t += 1
            test_rewards = []
            for _ in range(args.n_test_cycles):
                state = torch.Tensor([env.reset()]).to(device)
                test_reward = 0
                while True:
                    if args.render_eval:
                        env.render()

                    action = agent.calc_action(state)  # Selection without noise

                    #next_state, reward, done, _ = env.step(action.cpu().numpy()[0])
                    next_state, reward, terminated, truncated, _ = env.step(action.cpu().numpy()[0])
                    done = terminated or truncated

                    reward = calculate_reward(next_state, action.item(), target_angle)

                    test_reward += reward

                    next_state = torch.Tensor([next_state]).to(device)

                    state = next_state
                    if done:
                        break
                test_rewards.append(test_reward)

            mean_test_rewards.append(np.mean(test_rewards))

            for name, param in agent.actor.named_parameters():
                writer.add_histogram(name, param.clone().cpu().data.numpy(), epoch)
            for name, param in agent.critic.named_parameters():
                writer.add_histogram(name, param.clone().cpu().data.numpy(), epoch)

            writer.add_scalar('test/mean_test_return', mean_test_rewards[-1], epoch)
            logger.info("Epoch: {}, current timestep: {}, last reward: {}, "
                        "mean reward: {}, mean test reward {}".format(epoch,
                                                                      timestep,
                                                                      rewards[-1],
                                                                      np.mean(rewards[-10:]),
                                                                      np.mean(test_rewards)))

            # Save if the mean of the last three averaged rewards while testing
            # is greater than the specified reward threshold
            # TODO: Option if no reward threshold is given
            if np.mean(mean_test_rewards[-3:]) >= reward_threshold:
                agent.save_checkpoint(timestep, memory)
                time_last_checkpoint = time.time()
                logger.info('Saved model at {}'.format(time.strftime('%a, %d %b %Y %H:%M:%S GMT', time.localtime())))

        epoch += 1

    agent.save_checkpoint(timestep, memory)
    logger.info('Saved model at endtime {}'.format(time.strftime('%a, %d %b %Y %H:%M:%S GMT', time.localtime())))
    logger.info('Stopping training at {}'.format(time.strftime('%a, %d %b %Y %H:%M:%S GMT', time.localtime())))
    env.close()