In [4]:
%load_ext autoreload
import race
import math
import random

import gym
import numpy as np

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.distributions import Normal

from IPython.display import clear_output
import matplotlib.pyplot as plt
import optuna
%matplotlib inline

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [5]:
use_cuda = torch.cuda.is_available()
device   = torch.device("cuda" if use_cuda else "cpu")
device

device(type='cuda')

In [21]:
def load_best_paramerters(study_name, storage):
    """
    This function retrun a dictionary of parameters of trial in the given study
    Parameters:
        study_name: Name of the study to load
        storage: Location of the study

    Return:
         Dictionary of best parameters
    """
    loaded_study = optuna.load_study(study_name=study_name, storage=storage)
    best_params = loaded_study.best_trial.params
    return best_params

params = load_best_paramerters(study_name='TD3_Hyperparameters', storage='sqlite:///DRL_Studienarbeit2.db')

In [7]:
env = race.CurvyRace()

In [8]:
class ReplayBuffer:
    def __init__(self, capacity):
        self.capacity = capacity
        self.buffer = []
        self.position = 0

    def push(self, state, action, next_state, reward, done):
        if len(self.buffer) < self.capacity:
            self.buffer.append(None)
        self.buffer[self.position] = (state, action, next_state, reward, done)
        self.position = (self.position + 1) % self.capacity

    def sample(self, batch_size):
        batch = random.sample(self.buffer, batch_size)
        state, action, next_state, reward, done = map(np.stack, zip(*batch))
        return state, action, next_state, reward, done

    def __len__(self):
        return len(self.buffer)

# Initialize replay buffer
replay_buffer = ReplayBuffer(capacity=10000)


In [9]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import gym
import math

# Define the Actor and Critic networks
class Actor(nn.Module):
    def __init__(self, state_dim, action_dim, max_action):
        super(Actor, self).__init__()
        self.layer1 = nn.Linear(state_dim, 400)
        self.layer2 = nn.Linear(400, 300)
        self.layer3 = nn.Linear(300, action_dim)
        self.max_action = max_action

    def forward(self, state):
        a = torch.relu(self.layer1(state))
        a = torch.relu(self.layer2(a))
        a = self.max_action * torch.tanh(self.layer3(a))
        return a

class Critic(nn.Module):
    def __init__(self, state_dim, action_dim):
        super(Critic, self).__init__()
        self.layer1 = nn.Linear(state_dim + action_dim, 400)
        self.layer2 = nn.Linear(400, 300)
        self.layer3 = nn.Linear(300, 1)

    def forward(self, state, action):
        q = torch.relu(self.layer1(torch.cat([state, action], 1)))
        q = torch.relu(self.layer2(q))
        q = self.layer3(q)
        return q

In [24]:
# Define the TD3 class
class TD3:
    def __init__(self, state_dim, action_dim, max_action):
        self.actor = Actor(state_dim, action_dim, max_action)
        self.actor_target = Actor(state_dim, action_dim, max_action)
        self.actor_target.load_state_dict(self.actor.state_dict())
        self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=0.001)

        self.critic1 = Critic(state_dim, action_dim)
        self.critic1_target = Critic(state_dim, action_dim)
        self.critic1_target.load_state_dict(self.critic1.state_dict())
        self.critic1_optimizer = optim.Adam(self.critic1.parameters(), lr=0.001)

        self.critic2 = Critic(state_dim, action_dim)
        self.critic2_target = Critic(state_dim, action_dim)
        self.critic2_target.load_state_dict(self.critic2.state_dict())
        self.critic2_optimizer = optim.Adam(self.critic2.parameters(), lr=0.001)

        self.max_action = max_action

    def select_action(self, state):
        state = torch.FloatTensor(state.reshape(1, -1))
        action = self.actor(state).cpu().data.numpy().flatten()
        return action

    def train(self, replay_buffer, batch_size=64, gamma=params['gamma'], soft_tau=params['soft_tau'], policy_noise=params['policy_noise'], noise_clip=params['noise_clip'], policy_freq=params['policy_freq']):
        #gamma = trial.suggest_float('gamma', 0.8, 0.999)
        #soft_tau = trial.suggest_float('soft_tau', 0.01, 0.1)
        #policy_noise = trial.suggest_float('policy_noise', 0.1, 0.5)
        #noise_clip = trial.suggest_float('noise_clip', 0.1, 0.5)
        #policy_freq = trial.suggest_int('policy_freq', 1, 10)
        if len(replay_buffer) < batch_size:
            return

        # Sample a batch from the replay buffer
        state, action, next_state, reward, not_done = replay_buffer.sample(batch_size)

        state = torch.FloatTensor(state)
        action = torch.FloatTensor(action)
        next_state = torch.FloatTensor(next_state)
        reward = torch.FloatTensor(reward)
        not_done = torch.FloatTensor(not_done)

        # Update Critic networks
        with torch.no_grad():
            noise = (torch.randn_like(action) * policy_noise).clamp(-noise_clip, noise_clip)
            next_action = (self.actor_target(next_state) + noise).clamp(-self.max_action, self.max_action)

            target_Q1 = self.critic1_target(next_state, next_action)
            target_Q2 = self.critic2_target(next_state, next_action)
            target_Q = torch.min(target_Q1, target_Q2)
            target_Q = reward + not_done * gamma * target_Q

        current_Q1 = self.critic1(state, action)
        current_Q2 = self.critic2(state, action)

        critic1_loss = nn.functional.mse_loss(current_Q1, target_Q)
        critic2_loss = nn.functional.mse_loss(current_Q2, target_Q)

        self.critic1_optimizer.zero_grad()
        critic1_loss.backward()
        self.critic1_optimizer.step()

        self.critic2_optimizer.zero_grad()
        critic2_loss.backward()
        self.critic2_optimizer.step()

        # Update Actor network
        if self.step % policy_freq == 0:
            actor_loss = -self.critic1(state, self.actor(state)).mean()
            self.actor_optimizer.zero_grad()
            actor_loss.backward()
            self.actor_optimizer.step()

            # Update target networks
            self.soft_update(self.actor, self.actor_target, soft_tau)
            self.soft_update(self.critic1, self.critic1_target, soft_tau)
            self.soft_update(self.critic2, self.critic2_target, soft_tau)

    def soft_update(self, model, target_model, soft_tau):
        for param, target_param in zip(model.parameters(), target_model.parameters()):
            target_param.data.copy_((1 - soft_tau) * target_param.data + soft_tau * param.data)

In [25]:
# Create the environment
env = race.CurvyRace()

# Set random seed for reproducibility
np.random.seed(0)
torch.manual_seed(0)

# Initialize the TD3 agent
state_dim = env.get_observation_dim()
action_dim = env.get_action_dim()
max_action = float(env.get_action_limits()[0])
td3_agent = TD3(state_dim, action_dim, max_action)

# Training parameters
max_episodes = 1000
max_steps_per_episode = 100
batch_size = 256
actions=[]

# Training loop
def objective():
    #scale_factor = trial.suggest_int('scale_factor', 1, 10)
    #reward_shape = trial.suggest_int('reward_shape', 1, 10)
    model_reward = 0
    for episode in range(max_episodes):
        state = env.reset()
        total_reward = 0
    
        for step in range(max_steps_per_episode):
            action = td3_agent.select_action(state)
            
            actions.append(action)
            next_state, reward, done = env.step(action)
    
            if reward == 1:            
                total_reward += reward + (params['reward_shape'] * env.get_gate_idx()/len(env.get_gates())) 
    
            td3_agent.train(replay_buffer)  # Update the TD3 agent
    
            state = next_state
    
            if done:
                break
        model_reward += total_reward
        
        #return model_reward
        print(f"Episode: {episode + 1}, Total Reward: {total_reward}")
        if total_reward > 20:
            env.plot()

# Test the trained agent
#state = env.reset()
#done = False
#while not done:
    #action = td3_agent.select_action(state)
    #next_state, reward, done = env.step(action)
    #env.plot()

#plt.show()


In [26]:
objective()

Episode: 1, Total Reward: 0
Episode: 2, Total Reward: 0
Episode: 3, Total Reward: 0
Episode: 4, Total Reward: 0
Episode: 5, Total Reward: 0
Episode: 6, Total Reward: 0
Episode: 7, Total Reward: 0
Episode: 8, Total Reward: 0
Episode: 9, Total Reward: 0
Episode: 10, Total Reward: 0
Episode: 11, Total Reward: 0
Episode: 12, Total Reward: 0
Episode: 13, Total Reward: 0
Episode: 14, Total Reward: 0
Episode: 15, Total Reward: 0
Episode: 16, Total Reward: 0
Episode: 17, Total Reward: 0
Episode: 18, Total Reward: 0
Episode: 19, Total Reward: 0
Episode: 20, Total Reward: 0
Episode: 21, Total Reward: 0
Episode: 22, Total Reward: 0
Episode: 23, Total Reward: 0
Episode: 24, Total Reward: 0
Episode: 25, Total Reward: 0
Episode: 26, Total Reward: 0
Episode: 27, Total Reward: 0
Episode: 28, Total Reward: 0
Episode: 29, Total Reward: 0
Episode: 30, Total Reward: 0
Episode: 31, Total Reward: 0
Episode: 32, Total Reward: 0
Episode: 33, Total Reward: 0
Episode: 34, Total Reward: 0
Episode: 35, Total Rewa

In [16]:
optuna.delete_study(study_name='TD3_Hyperparameters', storage='sqlite:///DRL_Studienarbeit2.db')

In [20]:
import optuna
study = optuna.create_study(study_name='TD3_Hyperparameters', storage='sqlite:///DRL_Studienarbeit2.db',load_if_exists=True, direction='maximize')
study.optimize(lambda trial: objective(trial), n_trials=20000)

[I 2023-12-20 00:16:39,424] Using an existing study with name 'TD3_Hyperparameters' instead of creating a new one.
[I 2023-12-20 00:17:06,365] Trial 3 finished with value: 2.125 and parameters: {'reward_shape': 1, 'gamma': 0.9389717322856364, 'soft_tau': 0.03331363991353948, 'policy_noise': 0.2471457077347401, 'noise_clip': 0.430337011231377, 'policy_freq': 9}. Best is trial 1 with value: 5.25.
[I 2023-12-20 00:17:32,824] Trial 4 finished with value: 1.375 and parameters: {'reward_shape': 6, 'gamma': 0.9319527191181636, 'soft_tau': 0.043917237348737585, 'policy_noise': 0.1550784968903429, 'noise_clip': 0.40469610941368783, 'policy_freq': 8}. Best is trial 1 with value: 5.25.
[I 2023-12-20 00:17:58,906] Trial 5 finished with value: 3.25 and parameters: {'reward_shape': 10, 'gamma': 0.8938880286824613, 'soft_tau': 0.09973505297029614, 'policy_noise': 0.12097250362768874, 'noise_clip': 0.34915289204501687, 'policy_freq': 5}. Best is trial 1 with value: 5.25.
[I 2023-12-20 00:18:25,240] Tr

KeyboardInterrupt: 