In [None]:
# Importing needed libraries
import torch
import torch.nn as nn
import gym
import numpy as np
import random
import wandb
from copy import deepcopy

In [None]:
# Setting seeds
def make_deterministic(seed):
  """Seed every random number generator this notebook draws from.

  The same value is handed, in order, to PyTorch's default generator,
  Python's `random` module and NumPy's global generator.

  Args:
    seed: Seed value shared by all three generators.
  """
  for apply_seed in (torch.manual_seed, random.seed, np.random.seed):
    apply_seed(seed)

In [None]:
# Building a class for the neural network
class Neural_Network(nn.Module):
  """Dueling Q-network: a shared two-layer trunk feeding a value and an advantage stream.

  The forward pass combines both streams as
  `Q = V + (A - mean(A over the last dimension))`.

  Args:
    in_dim: Number of input features.
    out_dim: Number of outputs of the advantage stream (one Q-value each).
    hidden_dim: Sequence whose first two entries are the widths of the two
      trunk layers.
    pre_value: Width of the intermediate layer of the value stream.
    pre_action: Width of the intermediate layer of the advantage stream.
  """

  def __init__(self, in_dim, out_dim, hidden_dim, pre_value, pre_action):
    super(Neural_Network, self).__init__()
    # Keep the stream widths on the instance so the network describes itself.
    self.pre_value = pre_value
    self.pre_action = pre_action
    # Mirror the widths into a module-level `config` object only when one has
    # been defined; the class stays usable when no such global exists.
    run_config = globals().get("config")
    if run_config is not None:
      run_config.pre_value = pre_value
      run_config.pre_action = pre_action

    # Layers are created in a fixed order because creation order determines
    # both the seeded weight initialisation and the order of `parameters()`.
    self.input_layer = nn.Linear(in_dim, hidden_dim[0])
    self.hidden_layer = nn.Linear(hidden_dim[0], hidden_dim[1])
    # Layers for state-value function
    self.value_function_pre = nn.Linear(hidden_dim[1], pre_value)
    self.value_function = nn.Linear(pre_value, 1)
    # Layers for advantage function
    self.advantage_function_pre = nn.Linear(hidden_dim[1], pre_action)
    self.advantage_function = nn.Linear(pre_action, out_dim)

  def forward(self, x, activation_function = nn.ReLU()):
    """Return the Q-values for `x`.

    Args:
      x: Input tensor whose last dimension has `in_dim` features.
      activation_function: Callable applied after each of the two trunk layers.

    Returns:
      Tensor with the same leading dimensions as `x` and `out_dim` entries
      in the last dimension.
    """
    x = activation_function(self.input_layer(x))
    x = activation_function(self.hidden_layer(x))
    # NOTE: no activation is applied between the two Linear layers of either
    # stream, so each stream is an affine map of the trunk features.
    # Calculating advantage function
    a_pre = self.advantage_function_pre(x)
    a = self.advantage_function(a_pre)
    # Calculating state-value function
    v_pre = self.value_function_pre(x)
    v = self.value_function(v_pre)
    # Centre the advantages so that the mean of Q over actions equals V.
    a_mean = a.mean(-1, keepdim = True)
    # Calculating q-values
    output = v + (a - a_mean)
    return output

In [None]:
# Defining function for epsilon greedy actions
def eps_greedy(state, online_model, eps, n_actions=None):
  """Choose an action with an epsilon-greedy policy.

  Args:
    state: Tensor for a single observation. The model output for it must be
      1-D (one Q-value per action), because the greedy action is taken along
      dimension 0.
    online_model: Callable mapping `state` to Q-values.
    eps: Exploration threshold. A uniform draw from [0, 1) is compared against
      it, so `eps < 0` always acts greedily and `eps >= 1` always explores.
    n_actions: Number of actions to sample from when exploring. When omitted
      it is taken from the size of the model output, so no global environment
      is needed.

  Returns:
    The chosen action index as a Python int.
  """
  with torch.no_grad():
    # One uniform draw decides between exploiting and exploring.
    if np.random.rand() > eps:
      _, best_action = torch.max(online_model(state), dim=0)
      return best_action.item()

    if n_actions is None:
      # The network emits one Q-value per action.
      n_actions = online_model(state).shape[-1]
    return int(np.random.choice(n_actions))

In [None]:
# Defining function for playing during and after training to obtain performance of the agent
def play(episodes, step, end, envs, model=None):
  """Run greedy evaluation episodes and report their average return.

  Args:
    episodes: Number of episodes to play.
    step: Value logged under the "step" key; only used when `end` is False.
    end: When True, only print the result (final evaluation). When False,
      log the result to wandb as well as printing it.
    envs: Environment to play in. It must provide `reset()` returning an
      observation and `step(action)` returning a 4-tuple whose first three
      items are next observation, reward and done flag.
    model: Network used to pick actions. Defaults to the notebook-level
      `online_model` when omitted.

  Returns:
    The mean return over the episodes played (0.0 if `episodes` is 0).
  """
  if model is None:
    model = online_model

  returns = []
  for _ in range(episodes):
    # Each episode starts from exactly one reset.
    state = envs.reset()
    episode_return = 0
    done = False
    while not done:
      # eps = -1 makes the policy purely greedy.
      action = eps_greedy(state = torch.tensor(state).float(), online_model = model, eps = -1)
      state, reward, done, _ = envs.step(action)
      episode_return += reward
    returns.append(episode_return)

  # Average over the episodes actually played instead of a fixed constant.
  avg_return = sum(returns) / len(returns) if returns else 0.0

  if end:
    print(f"Average return over {episodes} episodes of trained agent:", avg_return)
  else:
    wandb.log({f"Average return over {episodes} episodes of playing": avg_return, "step": step})
    print(f"Average return over {episodes} episodes of playing:", avg_return)

  return avg_return

In [None]:
##### Dueling Double DQN Algorithm ###########

# Seeds for repeatable training results
SEEDS = (12, 34, 56)

# Creating two environments: one for training and one for playing (evaluation)
env = gym.make("CartPole-v1")
env2 = gym.make("CartPole-v1")

# Looping over different seeds for training different agents
for seed in SEEDS:
  make_deterministic(seed)
  print("SEED:", seed)
  wandb.init(project="DuelingDDQN-CartPole", entity="raisa")
  config = wandb.config

  # Defining hyperparameters
  gamma = 0.99 # discount factor
  eps_max = 1 # maximum value for epsilon
  eps_min = 0.1 # minimum value for epsilon
  eps_decay = (eps_max - eps_min) / 25000 # per-step decrement: epsilon reaches eps_min after 25000 steps
  learning_rate = 0.0025 # learning rate
  batch_size = 64 # batch size
  num_episodes = 1000 # maximum number of episodes
  min_memory = 100 # size of replay memory at start of training
  max_memory = 1000 # maximum size of replay memory
  hidden_dims = [128, 64] # size of hidden layers in the neural network
  hidden_dim_action = 10 # size of hidden layer for computing advantage function
  hidden_dim_state = 10 # size of hidden layer for computing state-value function
  act_period = 150 # update frequency of target neural network
  criterion = nn.MSELoss() # loss function
  tau = 0.9 # update factor for target net weights

  # Logging hyperparameters
  config.learning_rate = learning_rate
  config.gamma = gamma
  config.batch_size = batch_size
  config.num_episodes = num_episodes
  config.max_memory = max_memory
  config.min_memory = min_memory
  config.hidden_dim = hidden_dims
  config.act_period = act_period
  config.eps_max = eps_max
  config.eps_min = eps_min
  config.eps_decay = eps_decay
  config.hidden_dim_action = hidden_dim_action
  config.hidden_dim_state = hidden_dim_state
  config.tau = tau
  config.seed = seed

  # Setting the seed for the environments
  env.seed(seed)
  env.action_space.seed(seed)
  env2.seed(seed)
  env2.action_space.seed(seed)

  # Initializing replay memory (parallel lists, one entry per transition)
  memory_states = []
  memory_actions = []
  memory_rewards = []
  memory_terminal = []
  memory_states_next = []

  # Initializing list for returns of the training episodes
  returns_train = []
  # Initializing list for average returns during playing
  returns_play = []

  # Creating the online network and its optimizer; the target network starts
  # as an exact copy, so it does not need to be constructed separately.
  online_model = Neural_Network(in_dim = int(np.prod(env.observation_space.shape)), out_dim = env.action_space.n, hidden_dim = hidden_dims, pre_value = hidden_dim_state, pre_action = hidden_dim_action)
  optimizer = torch.optim.Adam(online_model.parameters(), lr=learning_rate)
  target_model = deepcopy(online_model)

  # Storing minimum experiences in replay memory using random actions
  state = env.reset()
  for step in range(0, min_memory):
    action = env.action_space.sample()
    state_next, reward, done, _ = env.step(action)
    memory_actions.append(action)
    memory_states.append(state)
    memory_states_next.append(state_next)
    memory_rewards.append(reward)
    memory_terminal.append(done)
    state = state_next

    if done:
      state = env.reset()

  epsi = eps_max
  step = 0

  # Training agent for number of episodes
  for episode in range(0, num_episodes):
    state = env.reset()
    reward_sum = 0
    while True:
      step += 1

      # Dropping the oldest experience before appending, so the replay
      # memory never holds more than max_memory entries
      if len(memory_states) >= max_memory:
        memory_actions.pop(0)
        memory_states.pop(0)
        memory_states_next.pop(0)
        memory_rewards.pop(0)
        memory_terminal.pop(0)

      # Taking epsilon greedy action and storing experience in replay memory
      epsi = max(epsi - eps_decay, eps_min)
      action = eps_greedy(state = torch.tensor(state).float(), online_model = online_model, eps = epsi)
      state_next, reward, done, _ = env.step(action)
      memory_actions.append(action)
      memory_states.append(state)
      memory_states_next.append(state_next)
      memory_rewards.append(reward)
      memory_terminal.append(done)
      state = state_next

      reward_sum += reward

      # Selecting random batch of experiences (without replacement)
      rand_index = np.random.choice(len(memory_states_next), size=batch_size, replace=False)
      batch_states = torch.tensor(np.array(memory_states)[rand_index], dtype=torch.float32)
      batch_states_next = torch.tensor(np.array(memory_states_next)[rand_index], dtype=torch.float32)
      batch_rewards = torch.tensor(np.array(memory_rewards)[rand_index], dtype=torch.float32)
      batch_actions = torch.tensor(np.array(memory_actions)[rand_index], dtype=torch.int64)
      batch_terminal = torch.tensor(np.array(memory_terminal)[rand_index], dtype=torch.int64)

      # Calculating target q-values; no gradients flow through the targets
      with torch.no_grad():
        # Calculating q-values for next possible states with the online model
        q_values_next = online_model(batch_states_next)
        # Calculating indices of max q-values (action selection)
        _, inds = torch.max(q_values_next, dim=1)
        inds = torch.reshape(input = inds, shape = (batch_size, 1))
        # Evaluating these actions with the help of target model
        max_q_values_next = torch.gather(target_model(batch_states_next), 1, inds)
        max_q_values_next = torch.reshape(max_q_values_next, (-1,))
        # Calculating the target q-values; (1 - terminal) removes the
        # bootstrap term for transitions that ended an episode
        q_target = batch_rewards + gamma * (1 - batch_terminal) * max_q_values_next

      # Calculating the expected q-values of the actions actually taken
      q_exp = online_model(batch_states)
      batch_actions = torch.reshape(input = batch_actions, shape = (batch_size, 1))
      q_exp = torch.gather(input = q_exp, dim = 1, index = batch_actions)
      q_exp = torch.reshape(q_exp, (-1,))

      # Calculating loss (prediction first, target second)
      loss = criterion(q_exp, q_target)

      optimizer.zero_grad()

      # Backpropagating the loss
      loss.backward()

      # Taking an optimization step
      optimizer.step()

      # Every act_period steps, blending the target network towards the
      # online network: target = (1 - tau) * target + tau * online
      if step % act_period == 0:
        for target, online in zip(target_model.parameters(), online_model.parameters()):
          target.data.mul_(1 - tau)
          target.data.add_(tau * online.data)

      # Playing every 500 steps to obtain the performance of the agent
      if step % 500 == 0:
        avg_return_play = play(episodes = 25, step = step, end = False, envs = env2)
        returns_play.append(avg_return_play)

      # When the episode has ended, print its return and leave the step loop
      if done:
        print(episode + 1, epsi, reward_sum)
        returns_train.append(reward_sum)

        # If episode > 100, then print the average reward for the last hundred episodes
        if episode + 1 > 100:
          x = sum(returns_train[-100:]) / 100
          print("Average return training:", x)
          # Logging the training average return
          wandb.log({"Average return over 100 episodes of training": x, "episode": episode + 1})
        break

    # If training steps >= 2500 and the agent has obtained an average return
    # of at least 475 over the last 5 evaluations, end training
    if step >= 2500 and len(returns_play) >= 5:
      if sum(returns_play[-5:]) / 5 >= 475:
        break

  # Playing after training to obtain end results
  play(episodes = 100, step = step, end = True, envs = env2)

  # Closing this seed's wandb run explicitly before the next seed starts one
  wandb.finish()

SEED: 12


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


1 0.9996039999999996 11.0
2 0.9991719999999992 12.0
3 0.9984159999999984 21.0
4 0.9971919999999972 34.0
5 0.9962559999999963 26.0
6 0.9957879999999958 13.0
7 0.993987999999994 50.0
8 0.9935919999999936 11.0
9 0.9931599999999932 12.0
10 0.9925119999999925 18.0
11 0.9918999999999919 17.0
12 0.9914679999999915 12.0
13 0.9907839999999908 19.0
14 0.9896319999999896 32.0
15 0.9888039999999888 23.0
16 0.9880839999999881 20.0
17 0.9871479999999871 26.0
18 0.9858879999999859 35.0
19 0.9852399999999852 18.0
20 0.9846279999999846 17.0
21 0.9842319999999842 11.0
22 0.9835479999999835 19.0
23 0.9820719999999821 41.0
Average return over 25 episodes of playing: 19.88
24 0.9815319999999815 15.0
25 0.980955999999981 16.0
26 0.9804159999999804 15.0
27 0.9797319999999797 19.0
28 0.9787599999999788 27.0
29 0.978003999999978 21.0
30 0.9775359999999775 13.0
31 0.976995999999977 15.0
32 0.9751239999999751 52.0
33 0.9733599999999734 49.0
34 0.971991999999972 38.0
35 0.971055999999971 26.0
36 0.970587999999970

VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
Average return over 100 episodes of training,▁▁▁▁▁▁▁▁▂▂▁▂▂▁▁▁▁▁▁▁▁▂▂▃▃▃▃▃▂▂▃▃▄▅▅▆▆▆▆█
Average return over 25 episodes of playing,▁▅▃▃▂▁▂▂▂▂▂▂▂▂▂▂▂▄▄▂▂▂▂▃▅▂▃▃██▂▂▂█▃▆████
episode,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
step,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇████

0,1
Average return over 100 episodes of training,70.0
Average return over 25 episodes of playing,500.0
episode,654.0
step,23000.0


1 0.9988839999999989 31.0
2 0.9980559999999981 23.0
3 0.9968679999999969 33.0
4 0.9963639999999964 14.0
5 0.9956799999999957 19.0
6 0.9948519999999949 23.0
7 0.9943119999999943 15.0
8 0.992007999999992 64.0
9 0.9912879999999913 20.0
10 0.9909279999999909 10.0
11 0.9895599999999896 38.0
12 0.9886239999999886 26.0
13 0.9877599999999878 24.0
14 0.9864999999999865 35.0
15 0.9855639999999856 26.0
16 0.984987999999985 16.0
17 0.9837639999999838 34.0
18 0.9826839999999827 30.0
Average return over 25 episodes of playing: 9.24
19 0.9816759999999817 28.0
20 0.9810999999999811 16.0
21 0.9805959999999806 14.0
22 0.9794439999999794 32.0
23 0.9782199999999782 34.0
24 0.9775359999999775 19.0
25 0.9751239999999751 67.0
26 0.9744399999999744 19.0
27 0.9728559999999729 44.0
28 0.9722799999999723 16.0
29 0.970983999999971 36.0
30 0.9701199999999701 24.0
31 0.9693279999999693 22.0
32 0.9686799999999687 18.0
33 0.9682479999999682 12.0
34 0.9668079999999668 40.0
35 0.9661239999999661 19.0
36 0.9654039999999

VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
Average return over 100 episodes of training,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▂▂▂▃▃▂▂▂▂▂▂▂▃▄▅▇██▇▇▇
Average return over 25 episodes of playing,▁▃▃▂▂▅▂▃▃▂▅▃▃▂▂▂▂▅▂▂▃▃▃▅▃▃▃▇▂▁▂▃▂▂█▆▂███
episode,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇███
step,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███

0,1
Average return over 100 episodes of training,110.26
Average return over 25 episodes of playing,500.0
episode,817.0
step,38500.0


1 0.9988119999999988 33.0
2 0.9977319999999977 30.0
3 0.9972639999999973 13.0
4 0.9966159999999966 18.0
5 0.996003999999996 17.0
6 0.9953199999999953 19.0
7 0.9944559999999945 24.0
8 0.9938799999999939 16.0
9 0.9921879999999922 47.0
10 0.9912879999999913 25.0
11 0.9903879999999904 25.0
12 0.9894159999999894 27.0
13 0.9880839999999881 37.0
14 0.9871119999999871 27.0
15 0.9865359999999865 16.0
16 0.9857439999999857 22.0
17 0.9846999999999847 29.0
18 0.983979999999984 20.0
19 0.9824319999999824 43.0
Average return over 25 episodes of playing: 15.76
20 0.981963999999982 13.0
21 0.9804159999999804 43.0
22 0.97994799999998 13.0
23 0.9778599999999779 58.0
24 0.9774639999999775 11.0
25 0.9765639999999766 25.0
26 0.975987999999976 16.0
27 0.9755919999999756 11.0
28 0.9751959999999752 11.0
29 0.9742239999999742 27.0
30 0.9737199999999737 14.0
31 0.9732519999999732 13.0
32 0.9727839999999728 13.0
33 0.9718479999999718 26.0
34 0.9712359999999712 17.0
35 0.9706599999999707 16.0
36 0.969615999999969