In [2]:
import gym
import torch
import torch.nn as nn
import numpy as np
from collections import deque
import random
from itertools import count
import torch.nn.functional as F
import matplotlib.pyplot as plt
import optuna

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")


In [2]:
class duelling_dqn_model(nn.Module):

    def __init__(self, state_size, action_size, seed, fc1_units=64,  vl1_units = 256, al1_units = 256, type_num = 1):
        super(duelling_dqn_model, self).__init__()
        self.type_num = type_num

        self.fc1 = nn.Linear(state_size, fc1_units)
        self.value_layer_1 = nn.Linear(fc1_units, vl1_units)
        self.advantage_layer_1 = nn.Linear(fc1_units, al1_units)

        self.value_layer_2 = nn.Linear(vl1_units, 1)
        self.advantage_layer_2 = nn.Linear(al1_units, action_size)
    
    def forward(self, state):
        a=F.relu(self.fc1(state))
        adv=F.relu(self.advantage_layer_1(a))
        adv=self.advantage_layer_2(adv)
        value=F.relu(self.value_layer_1(a))
        value=self.value_layer_2(value) 
        advAverage = torch.mean(adv, dim=1, keepdim=True)
        Q = value + adv - advAverage
        return Q
    
    def select_action(self, state):
        with torch.no_grad():
            Q = self.forward(state)
            action_index = torch.argmax(Q, dim=1)
        return action_index.item()


In [3]:
class Memory(object):
    def __init__(self, memory_size: int) -> None:
        self.memory_size = memory_size
        self.buffer = deque(maxlen=self.memory_size)

    def add(self, experience) -> None:
        self.buffer.append(experience)

    def size(self):
        return len(self.buffer)

    def sample(self, batch_size: int, continuous: bool = True):
        if batch_size > len(self.buffer):
            batch_size = len(self.buffer)
        if continuous:
            rand = random.randint(0, len(self.buffer) - batch_size)
            return [self.buffer[i] for i in range(rand, rand + batch_size)]
        else:
            indexes = np.random.choice(np.arange(len(self.buffer)), size=batch_size, replace=False)
            return [self.buffer[i] for i in indexes]

    def clear(self):
        self.buffer.clear()


In [4]:
def objective(trial,env,type_num):
    # Define the search space
    params= {
      'batch_size' : trial.suggest_categorical('batch_size', [32,64,128]),
      'lr'  : trial.suggest_loguniform('lr',1e-5,1e-4),
      'eps_start' : trial.suggest_loguniform('eps_start',0.1,0.2),
      'replay_size':trial.suggest_categorical('replay_size', [50000,75000,100000]),
    }

    # Train the model with the given hyperparameters
    seed=1
    rewards_episode= duel_dqn(env,seed = seed,params = params,type_num = type_num)
    return np.mean(rewards_episode)

def duel_dqn(env,seed,params,type_num):
    print('\n')
    print("For seed =",seed)
    env = env
    env.seed(seed)
    torch.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)

    qnetwork_local = duelling_dqn_model(state_shape, action_shape,seed,type_num=type_num).to(device)
    qnetwork_target = duelling_dqn_model(state_shape, action_shape,seed,type_num=type_num).to(device)
    
    qnetwork_target.load_state_dict(qnetwork_local.state_dict())
    optimizer = torch.optim.Adam(qnetwork_local.parameters(), lr=params['lr'])


    GAMMA = 0.99
    EXPLORE = 20000
    eps_start = params['eps_start']
    eps_end = 0.0001
    REPLAY_MEMORY = params['replay_size']
    BATCH = params['batch_size']
    max_episodes = 500
    UPDATE_STEPS = 4

    memory_replay = Memory(REPLAY_MEMORY)

    epsilon = eps_start
    learn_steps = 0
    begin_learn = False
    scores_window = deque(maxlen=100)
    episode_rewards = []

    for epoch in range(max_episodes):
        state = env.reset()
        episode_reward = 0
        done = False
        while not done:
            prob = random.random()
            if prob < epsilon:
                action = random.choice(np.arange(action_shape))
            else:
                tensor_state = torch.FloatTensor(state).unsqueeze(0).to(device)
                action = qnetwork_local.select_action(tensor_state)
            next_state, reward, done, _ = env.step(action)
            episode_reward += reward

            memory_replay.add((state, next_state, action, reward, done))
            # Cheks if the replay buffer has enough samples to sample from
            if memory_replay.size() > 128:
                if not begin_learn:
                    begin_learn = True
                learn_steps += 1
                if learn_steps % UPDATE_STEPS == 0:
                    qnetwork_target.load_state_dict(qnetwork_local.state_dict())
                # Sampling batch size number of samples for target network
                batch = memory_replay.sample(BATCH, False)
                batch_state, batch_next_state, batch_action, batch_reward, batch_done = zip(*batch)

                batch_state = torch.FloatTensor(batch_state).to(device)
                batch_next_state = torch.FloatTensor(batch_next_state).to(device)
                batch_action = torch.FloatTensor(batch_action).unsqueeze(1).to(device)
                batch_reward = torch.FloatTensor(batch_reward).unsqueeze(1).to(device)
                batch_done = torch.FloatTensor(batch_done).unsqueeze(1).to(device)

                with torch.no_grad():
                    localQ_next = qnetwork_local(batch_next_state)
                    targetQ_next = qnetwork_target(batch_next_state)
                    local_max_action = torch.argmax(localQ_next, dim=1, keepdim=True)
                    y = batch_reward + (1 - batch_done) * GAMMA * targetQ_next.gather(1, local_max_action.long())

                loss = F.mse_loss(qnetwork_local(batch_state).gather(1, batch_action.long()), y)
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

                if epsilon > eps_end:
                    epsilon -= (eps_start - eps_end) / EXPLORE
            if done:
                break
            state = next_state
        scores_window.append(episode_reward)
        episode_rewards.append(episode_reward)
        print('\rEpisode {}\tMoving Average Score: {:.2f}'.format(epoch, np.mean(scores_window)), end="")
        if epoch % 100 == 0:
            print('\rEpisode {}\tMoving Average Score: {:.2f}'.format(epoch, np.mean(scores_window)))
        if np.mean(scores_window)>=env.spec.reward_threshold and epoch >= 100:
            print('\nEnvironment solved in {:d} episodes!\tAverageScore: {:.2f}'.format(epoch, np.mean(scores_window)))
            break

    return episode_rewards

In [5]:
env = gym.make('Acrobot-v1')
state_shape = env.observation_space.shape[0]
action_shape = env.action_space.n

  deprecation(
  deprecation(


In [6]:
study = optuna.create_study(direction='maximize')

#Run the optimization
study.optimize(lambda trial: objective(trial, env, 1), n_trials=5)

# Print the best parameters found
print("Best trial:")
trial = study.best_trial
print("  Value: ", trial.value)
print("  Params: ")
for key, value in trial.params.items():
    print("    {}: {}".format(key, value))

[I 2024-04-07 04:52:12,015] A new study created in memory with name: no-name-3fe16fa7-2c22-4869-b8cd-48ac6ccfcc7c
  'lr'  : trial.suggest_loguniform('lr',1e-5,1e-4),
  'eps_start' : trial.suggest_loguniform('eps_start',0.1,0.2),
  deprecation(




For seed = 1


  if not isinstance(terminated, (bool, np.bool8)):
  batch_state = torch.FloatTensor(batch_state).to(device)


Episode 0	Moving Average Score: -500.00
Episode 100	Moving Average Score: -144.82
Episode 157	Moving Average Score: -100.03

[I 2024-04-07 04:55:34,570] Trial 0 finished with value: -130.03773584905662 and parameters: {'batch_size': 128, 'lr': 6.415318655621856e-05, 'eps_start': 0.11076012404423284, 'replay_size': 100000}. Best is trial 0 with value: -130.03773584905662.


Episode 158	Moving Average Score: -99.94
Environment solved in 158 episodes!	AverageScore: -99.94


For seed = 1


  'lr'  : trial.suggest_loguniform('lr',1e-5,1e-4),
  'eps_start' : trial.suggest_loguniform('eps_start',0.1,0.2),
  deprecation(


Episode 0	Moving Average Score: -500.00
Episode 100	Moving Average Score: -391.48
Episode 200	Moving Average Score: -335.92
Episode 300	Moving Average Score: -405.61
Episode 400	Moving Average Score: -224.31
Episode 498	Moving Average Score: -143.40

[I 2024-04-07 05:23:28,927] Trial 1 finished with value: -300.72 and parameters: {'batch_size': 128, 'lr': 1.1362130880884927e-05, 'eps_start': 0.12007214704516453, 'replay_size': 50000}. Best is trial 0 with value: -130.03773584905662.


Episode 499	Moving Average Score: -143.27

For seed = 1
Episode 0	Moving Average Score: -500.00
Episode 100	Moving Average Score: -180.01
Episode 200	Moving Average Score: -106.27
Episode 300	Moving Average Score: -103.50
Episode 325	Moving Average Score: -102.97

[I 2024-04-07 05:29:36,421] Trial 2 finished with value: -127.96636085626912 and parameters: {'batch_size': 64, 'lr': 4.578448201987797e-05, 'eps_start': 0.17365180632399785, 'replay_size': 50000}. Best is trial 2 with value: -127.96636085626912.


Episode 326	Moving Average Score: -99.13
Environment solved in 326 episodes!	AverageScore: -99.13


For seed = 1
Episode 0	Moving Average Score: -500.00
Episode 100	Moving Average Score: -154.27
Episode 200	Moving Average Score: -108.84
Episode 262	Moving Average Score: -100.04

[I 2024-04-07 05:34:02,378] Trial 3 finished with value: -124.48863636363636 and parameters: {'batch_size': 64, 'lr': 4.294928486327954e-05, 'eps_start': 0.12233771624701863, 'replay_size': 100000}. Best is trial 3 with value: -124.48863636363636.


Episode 263	Moving Average Score: -99.90
Environment solved in 263 episodes!	AverageScore: -99.90


For seed = 1
Episode 0	Moving Average Score: -500.00
Episode 100	Moving Average Score: -199.19
Episode 200	Moving Average Score: -161.57
Episode 300	Moving Average Score: -121.02
Episode 400	Moving Average Score: -118.66
Episode 491	Moving Average Score: -100.60

[I 2024-04-07 05:44:36,958] Trial 4 finished with value: -141.46653144016227 and parameters: {'batch_size': 64, 'lr': 2.0009680959452588e-05, 'eps_start': 0.10895730379224866, 'replay_size': 100000}. Best is trial 3 with value: -124.48863636363636.


Episode 492	Moving Average Score: -99.97
Environment solved in 492 episodes!	AverageScore: -99.97
Best trial:
  Value:  -124.48863636363636
  Params: 
    batch_size: 64
    lr: 4.294928486327954e-05
    eps_start: 0.12233771624701863
    replay_size: 100000


In [7]:
num_seeds = 5
all_episode_rewards = []

for seed in range(num_seeds):
    episode_rewards = duel_dqn(env,seed = seed,params = trial.params,type_num = 1)
    all_episode_rewards.append(episode_rewards)

# Calculate mean and variance across runs for each episode
max_length = max(len(v) for v in all_episode_rewards)
padded_rewards = [np.pad(v, (0, max_length - len(v)), mode='constant',constant_values = env.spec.reward_threshold) for v in all_episode_rewards]
mean_rewards_acro_1 = np.mean(padded_rewards, axis=0)
variance_rewards_acro_1 = np.var(padded_rewards, axis=0)   



For seed = 0
Episode 0	Moving Average Score: -500.00
Episode 100	Moving Average Score: -470.49
Episode 200	Moving Average Score: -132.06
Episode 300	Moving Average Score: -102.13
Episode 329	Moving Average Score: -99.529
Environment solved in 329 episodes!	AverageScore: -99.52


For seed = 1
Episode 0	Moving Average Score: -500.00
Episode 100	Moving Average Score: -154.27
Episode 200	Moving Average Score: -108.84
Episode 263	Moving Average Score: -99.904
Environment solved in 263 episodes!	AverageScore: -99.90


For seed = 2
Episode 0	Moving Average Score: -500.00
Episode 100	Moving Average Score: -415.58
Episode 200	Moving Average Score: -153.75
Episode 300	Moving Average Score: -128.76
Episode 400	Moving Average Score: -130.34
Episode 499	Moving Average Score: -104.10

For seed = 3
Episode 0	Moving Average Score: -142.00
Episode 100	Moving Average Score: -148.11
Episode 200	Moving Average Score: -109.14
Episode 230	Moving Average Score: -99.732
Environment solved in 230 episodes!	A

In [8]:
# Model for type 2
class duelling_dqn_model(nn.Module):

    def __init__(self, state_size, action_size, seed, fc1_units=64,  vl1_units = 256, al1_units = 256, type_num = 1):
        super(duelling_dqn_model, self).__init__()
        self.type_num = type_num

        self.fc1 = nn.Linear(state_size, fc1_units)
        self.value_layer_1 = nn.Linear(fc1_units, vl1_units)
        self.advantage_layer_1 = nn.Linear(fc1_units, al1_units)

        self.value_layer_2 = nn.Linear(vl1_units, 1)
        self.advantage_layer_2 = nn.Linear(al1_units, action_size)

    def forward(self, state):
        a=F.relu(self.fc1(state))
        adv=F.relu(self.advantage_layer_1(a))
        adv=self.advantage_layer_2(adv)
        value=F.relu(self.value_layer_1(a))
        value=self.value_layer_2(value) 
        advMax = torch.max(adv, dim=1, keepdim=True).values
        Q = value + adv - advMax
        return Q

    def select_action(self, state):
        with torch.no_grad():
            Q = self.forward(state)
            action_index = torch.argmax(Q, dim=1)
        return action_index.item()


In [9]:
study_2 = optuna.create_study(direction='maximize')

study_2.optimize(lambda trial: objective(trial,env,2), n_trials=5)

# Print the best parameters found
print("Best trial:")
trial_2 = study_2.best_trial
print("  Value: ", trial_2.value)
print("  Params: ")
for key, value in trial_2.params.items():
    print("    {}: {}".format(key, value))

[I 2024-04-07 06:36:04,078] A new study created in memory with name: no-name-3c8c12ec-b22a-4373-acde-af67384da5e3




For seed = 1


  'lr'  : trial.suggest_loguniform('lr',1e-5,1e-4),
  'eps_start' : trial.suggest_loguniform('eps_start',0.1,0.2),


Episode 0	Moving Average Score: -220.00
Episode 100	Moving Average Score: -183.68
Episode 200	Moving Average Score: -104.52
Episode 254	Moving Average Score: -100.47

[I 2024-04-07 06:39:22,329] Trial 0 finished with value: -134.8046875 and parameters: {'batch_size': 64, 'lr': 4.5330796683795955e-05, 'eps_start': 0.14905763894045493, 'replay_size': 50000}. Best is trial 0 with value: -134.8046875.


Episode 255	Moving Average Score: -98.99
Environment solved in 255 episodes!	AverageScore: -98.99


For seed = 1
Episode 0	Moving Average Score: -180.00
Episode 100	Moving Average Score: -349.82
Episode 200	Moving Average Score: -147.25
Episode 300	Moving Average Score: -181.57
Episode 400	Moving Average Score: -125.82
Episode 498	Moving Average Score: -114.97

[I 2024-04-07 06:49:19,038] Trial 1 finished with value: -184.102 and parameters: {'batch_size': 64, 'lr': 1.3325577486801331e-05, 'eps_start': 0.1040215299417645, 'replay_size': 50000}. Best is trial 0 with value: -134.8046875.


Episode 499	Moving Average Score: -115.18

For seed = 1
Episode 0	Moving Average Score: -182.00
Episode 100	Moving Average Score: -150.76
Episode 200	Moving Average Score: -111.48
Episode 275	Moving Average Score: -100.77

[I 2024-04-07 06:52:09,638] Trial 2 finished with value: -120.8158844765343 and parameters: {'batch_size': 32, 'lr': 9.748005267572618e-05, 'eps_start': 0.11779140743708955, 'replay_size': 75000}. Best is trial 2 with value: -120.8158844765343.


Episode 276	Moving Average Score: -99.84
Environment solved in 276 episodes!	AverageScore: -99.84


For seed = 1
Episode 0	Moving Average Score: -500.00
Episode 100	Moving Average Score: -201.54
Episode 200	Moving Average Score: -145.49
Episode 300	Moving Average Score: -103.43
Episode 400	Moving Average Score: -106.74
Episode 404	Moving Average Score: -101.41

[I 2024-04-07 06:57:19,174] Trial 3 finished with value: -139.45320197044336 and parameters: {'batch_size': 32, 'lr': 5.528522998056511e-05, 'eps_start': 0.10043594494743395, 'replay_size': 100000}. Best is trial 2 with value: -120.8158844765343.


Episode 405	Moving Average Score: -97.62
Environment solved in 405 episodes!	AverageScore: -97.62


For seed = 1
Episode 0	Moving Average Score: -192.00
Episode 100	Moving Average Score: -216.16
Episode 197	Moving Average Score: -100.02

[I 2024-04-07 07:00:16,759] Trial 4 finished with value: -158.8391959798995 and parameters: {'batch_size': 64, 'lr': 1.5369142177490045e-05, 'eps_start': 0.19310917391448384, 'replay_size': 50000}. Best is trial 2 with value: -120.8158844765343.


Episode 198	Moving Average Score: -99.71
Environment solved in 198 episodes!	AverageScore: -99.71
Best trial:
  Value:  -120.8158844765343
  Params: 
    batch_size: 32
    lr: 9.748005267572618e-05
    eps_start: 0.11779140743708955
    replay_size: 75000


In [10]:
num_seeds = 5
all_episode_rewards = []

for seed in range(num_seeds):
    episode_rewards = duel_dqn(env,seed = seed,params = trial_2.params,type_num = 2)
    all_episode_rewards.append(episode_rewards)

# Calculate mean and variance across runs for each episode
max_length = max(len(v) for v in all_episode_rewards)
padded_rewards = [np.pad(v, (0, max_length - len(v)), mode='constant',constant_values = env.spec.reward_threshold) for v in all_episode_rewards]
mean_rewards_acro_2 = np.mean(padded_rewards, axis=0)
variance_rewards_acro_2 = np.var(padded_rewards, axis=0)



For seed = 0
Episode 0	Moving Average Score: -500.00
Episode 100	Moving Average Score: -197.68
Episode 200	Moving Average Score: -112.98
Episode 300	Moving Average Score: -125.09
Episode 400	Moving Average Score: -103.78
Episode 428	Moving Average Score: -99.786
Environment solved in 428 episodes!	AverageScore: -99.78


For seed = 1
Episode 0	Moving Average Score: -182.00
Episode 100	Moving Average Score: -150.76
Episode 200	Moving Average Score: -111.48
Episode 276	Moving Average Score: -99.847
Environment solved in 276 episodes!	AverageScore: -99.84


For seed = 2
Episode 0	Moving Average Score: -500.00
Episode 100	Moving Average Score: -204.05
Episode 170	Moving Average Score: -99.798
Environment solved in 170 episodes!	AverageScore: -99.79


For seed = 3
Episode 0	Moving Average Score: -142.00
Episode 100	Moving Average Score: -159.19
Episode 200	Moving Average Score: -114.28
Episode 251	Moving Average Score: -99.917
Environment solved in 251 episodes!	AverageScore: -99.91


For 

In [None]:
plt.plot(range(1, len(mean_rewards_acro_1) + 1), mean_rewards_acro_1, label='Mean Return for type 1(using mean)', color='blue')
plt.fill_between(range(1, len(mean_rewards_acro_1) + 1), mean_rewards_acro_1 - np.sqrt(variance_rewards_acro_1),
                 mean_rewards_acro_1 + np.sqrt(variance_rewards_acro_1), color='blue', alpha=0.2, label='Variance for type 1(using mean)')
plt.plot(range(1, len(mean_rewards_acro_2) + 1), mean_rewards_acro_2, label='Mean Return for type 2(using max)', color='orange')
plt.fill_between(range(1, len(mean_rewards_acro_2) + 1), mean_rewards_acro_2 - np.sqrt(variance_rewards_acro_2),
                 mean_rewards_acro_2 + np.sqrt(variance_rewards_acro_2), color='orange', alpha=0.2, label='Variance for type 2(using max)')

plt.xlabel('Episode')
plt.ylabel('Episodic Return')
plt.title('Episodic Return vs. Episode Number (Mean and Variance across 5 seeds)')
plt.legend()
plt.grid(True)
plt.show()