In [None]:
# Importing needed packages
import torch
import torch.nn as nn
import gym
import numpy as np
import random
import wandb
from copy import deepcopy

In [None]:
# Setting seeds
def make_deterministic(seed):
  """Seed the global random number generators used in this notebook.

  Seeds, in this order, PyTorch (`torch.manual_seed`), Python's `random`
  module and NumPy's legacy global generator (`np.random.seed`).

  Args:
    seed: Integer seed handed unchanged to every generator.
  """
  seeders = (torch.manual_seed, random.seed, np.random.seed)
  for apply_seed in seeders:
    apply_seed(seed)

In [None]:
# Building a class for the neural network
class Neural_Network(nn.Module):
  """Fully connected network with two hidden layers.

  Layout: Linear(in_dim, hidden_dim[0]) -> activation ->
  Linear(hidden_dim[0], hidden_dim[1]) -> activation ->
  Linear(hidden_dim[1], out_dim). No activation is applied to the output.
  """

  def __init__(self, in_dim, out_dim, hidden_dim):
    """Create the three linear layers.

    Args:
      in_dim: Number of input features.
      out_dim: Number of output units.
      hidden_dim: Indexable whose entries 0 and 1 give the widths of the
        first and second hidden layer.
    """
    super().__init__()
    # Attribute names are the state-dict keys; construction order fixes the
    # order in which the layers are initialised.
    self.input_layer = nn.Linear(in_features=in_dim, out_features=hidden_dim[0])
    self.hidden_layer = nn.Linear(in_features=hidden_dim[0], out_features=hidden_dim[1])
    self.output_layer = nn.Linear(in_features=hidden_dim[1], out_features=out_dim)

  def forward (self, x, activation_function = nn.ReLU()):
    """Run `x` through the network.

    Args:
      x: Input tensor whose last dimension has `in_dim` entries.
      activation_function: Callable applied after each of the two hidden
        layers (ReLU by default).

    Returns:
      The output layer's result for `x`.
    """
    features = x
    for layer in (self.input_layer, self.hidden_layer):
      features = activation_function(layer(features))
    return self.output_layer(features)

In [None]:
# Defining function for epsilon greedy actions
def eps_greedy(state, online_model, eps, action_space=None):
  """Choose an action epsilon-greedily from the model's output for `state`.

  A draw from `random.random()` is compared against `eps`: if the draw is
  larger, the index of the largest entry along dimension 0 of
  `online_model(state)` is returned; otherwise an action is sampled from the
  action space. Consequently `eps = -1` always acts greedily and `eps >= 1`
  always explores.

  Args:
    state: Tensor passed to `online_model`. The greedy branch takes the max
      over dimension 0, so a single (unbatched) state is expected.
    online_model: Callable mapping `state` to a tensor of per-action values.
    eps: Exploration threshold compared against `random.random()`.
    action_space: Optional object with a `sample()` method used for
      exploration. When omitted, the module-level `env.action_space` is used,
      which is what the original implementation always did.

  Returns:
    The greedy action as a Python int, or whatever `sample()` returns.
  """
  explore = not (random.random() > eps)
  if explore:
    if action_space is None:
      # Backward-compatible fallback: relies on the global training
      # environment `env` having been created before this call.
      action_space = env.action_space
    return action_space.sample()

  # No gradients are needed for action selection
  with torch.no_grad():
    _, best_index = torch.max(online_model(state), dim=0)
  return best_index.item()

In [None]:
# Defining function for playing during and after training to obtain performance of the agent
def play(episodes, step, end, envs):
  """Play greedy evaluation episodes and report the average episode return.

  Actions are chosen with `eps_greedy(..., eps = -1)`, i.e. always greedily,
  using the module-level `online_model`.

  Args:
    episodes: Number of episodes to play.
    step: Training step, logged to wandb alongside the average when `end` is
      False.
    end: If True, the result is only printed (final evaluation, window of 100
      episodes). If False, it is printed and logged to wandb (window of 25
      episodes).
    envs: Environment to play in. Its `reset()` must return the observation
      and its `step(action)` a 4-tuple `(obs, reward, done, info)`.

  Returns:
    The mean return over the last `window` episodes actually played
    (0.0 if no episode was played).
  """
  # Averaging window that the fixed log/print labels below refer to
  window = 100 if end else 25
  returns = []

  for _episode in range(episodes):
    episode_return = 0
    state = envs.reset()
    while True:
      action = eps_greedy(state = torch.tensor(state).float() , online_model = online_model, eps = -1)
      state, reward, done, _info = envs.step(action)
      episode_return += reward

      if done:
        returns.append(episode_return)
        # NOTE(review): this reset is redundant (the next episode resets
        # again). It is kept because dropping it would presumably shift the
        # environment's seeded random stream and change reproduced results.
        envs.reset()
        break

  # Divide by the number of episodes actually averaged instead of a fixed
  # 100 / 25, so the mean is also correct when fewer episodes were played.
  recent_returns = returns[-window:]
  avg_return = sum(recent_returns) / len(recent_returns) if recent_returns else 0.0

  if end:
    print("Average return over 100 episodes of trained agent:", avg_return)
  else:
    wandb.log({"Average return over 25 episodes of playing": avg_return,"step": step})
    print("Average return over 25 episodes of playing:", avg_return)

  return avg_return

In [None]:
##### Double DQN Algorithm ###########

# Seeds for repeatable training results
SEEDS = (12, 34, 56)

# Creating two environments: one for training and one for playing.
# The code below uses the gym API in which env.seed() exists, reset() returns
# only the observation and step() returns (obs, reward, done, info).
env = gym.make("CartPole-v1")
env2 = gym.make("CartPole-v1")

# Looping over different seeds for training different agents
for seed in SEEDS:
  make_deterministic(seed)
  print("SEED:", seed)
  wandb.init(project="DoubleDQN-CartPole", entity="raisa")
  config = wandb.config

  # Defining hyperparameters
  gamma = 0.99 # discount factor
  eps_max = 1 # maximum (initial) value for epsilon
  eps_min = 0.1 # minimum value for epsilon
  eps_decay = (eps_max - eps_min) / 25000 # amount subtracted from epsilon per step (reaches eps_min after 25000 steps)
  learning_rate = 0.0025 # learning rate
  batch_size = 64 # batch size
  num_episodes = 1000 # maximum number of episodes
  min_memory = 100 # size of replay memory at start of training
  max_memory = 1000 # maximum size of replay memory
  hidden_dims = [128, 64] # size of hidden layers in the neural network
  act_period = 150 # update frequency (in steps) of target neural network
  criterion = nn.MSELoss() # loss function

  # Logging hyperparameters
  config.learning_rate = learning_rate
  config.gamma = gamma
  config.batch_size = batch_size
  config.num_episodes = num_episodes
  config.max_memory = max_memory
  config.min_memory = min_memory
  config.hidden_dim = hidden_dims
  config.act_period = act_period
  config.eps_max = eps_max
  config.eps_min = eps_min
  config.eps_decay = eps_decay
  config.seed = seed

  # Setting the seed for the environments
  env.seed(seed)
  env.action_space.seed(seed)
  env2.seed(seed)
  env2.action_space.seed(seed)

  # Initializing replay memory (parallel lists; index i describes one transition)
  memory_states = []
  memory_actions = []
  memory_rewards = []
  memory_terminal = []
  memory_states_next = []

  # Initializing list for returns of the training episodes
  returns_train = []
  # Initializing list for average returns during playing
  returns_play = []

  # Creating the online network and its optimizer; the target network starts
  # as an exact copy of the online network.
  online_model = Neural_Network(in_dim = int(np.prod(env.observation_space.shape)), out_dim = env.action_space.n, hidden_dim=hidden_dims)
  optimizer = torch.optim.Adam(online_model.parameters(), lr=learning_rate)
  target_model = deepcopy(online_model)

  # Storing minimum experiences in replay memory using random actions
  state = env.reset()
  for _ in range(0, min_memory):
    action = env.action_space.sample()
    state_next, reward, done, _ = env.step(action)
    memory_actions.append(action)
    memory_states.append(state)
    memory_states_next.append(state_next)
    memory_rewards.append(reward)
    memory_terminal.append(done)
    state = state_next

    if done:
      state = env.reset()

  epsi = eps_max
  step = 0

  # Training the agent for number of episodes
  for episode in range(0, num_episodes):
    state = env.reset()
    reward_sum = 0
    while True:
      step += 1

      # Dropping the oldest experience before storing a new one, so that the
      # replay memory never holds more than max_memory experiences
      if len(memory_states) >= max_memory:
        memory_actions.pop(0)
        memory_states.pop(0)
        memory_states_next.pop(0)
        memory_rewards.pop(0)
        memory_terminal.pop(0)

      # Taking epsilon greedy action and storing experience in replay memory
      epsi = max(epsi - eps_decay, eps_min)
      action = eps_greedy(state = torch.tensor(state).float() , online_model = online_model, eps = epsi)
      state_next, reward, done, _ = env.step(action)
      memory_actions.append(action)
      memory_states.append(state)
      memory_states_next.append(state_next)
      memory_rewards.append(reward)
      memory_terminal.append(done)
      state = state_next

      reward_sum += reward

      # Selecting random batch of experiences (without replacement)
      rand_index = np.random.choice(len(memory_states_next), size=batch_size, replace=False)
      batch_states = torch.tensor(np.array(memory_states)[rand_index], dtype=torch.float32)
      batch_states_next = torch.tensor(np.array(memory_states_next)[rand_index], dtype=torch.float32)
      batch_rewards = torch.tensor(np.array(memory_rewards)[rand_index], dtype=torch.float32)
      batch_actions = torch.tensor(np.array(memory_actions)[rand_index], dtype=torch.int64)
      batch_terminal = torch.tensor(np.array(memory_terminal)[rand_index], dtype=torch.int64)

      # Calculating the target q-values (no gradients flow through the target)
      with torch.no_grad():
        # Online network selects the best action in each next state ...
        q_values_next = online_model(batch_states_next)
        _, inds = torch.max(q_values_next, dim=1)
        inds = torch.reshape(input = inds, shape = (batch_size, 1))
        # ... and the target network evaluates that action (Double DQN)
        max_q_values_next = torch.gather(target_model(batch_states_next), 1, inds)
        max_q_values_next = torch.reshape(max_q_values_next, (-1,))
        # (1 - batch_terminal) removes the bootstrap term for experiences stored with done = True
        q_target = batch_rewards + gamma * (1 - batch_terminal) * max_q_values_next

      # Calculating the q-values the online network predicts for the taken actions
      q_exp = online_model(batch_states)
      batch_actions = torch.reshape(input = batch_actions, shape = (batch_size, 1))
      q_exp = torch.gather(input = q_exp, dim = 1, index = batch_actions)
      q_exp = torch.reshape(q_exp, (-1,))

      # Calculating loss (prediction first, target second)
      loss = criterion(q_exp, q_target)

      optimizer.zero_grad()

      # Backpropagating the loss
      loss.backward()

      # Taking an optimization step
      optimizer.step()

      # Copying the weights of online network in the target network every act_period steps
      if step % act_period == 0:
        target_model.load_state_dict(online_model.state_dict())

      # Playing every 500 steps to obtain the performance of the DDQN agent
      if step % 500 == 0:
        avg_return_play = play(episodes = 25, step = step, end = False, envs = env2)
        returns_play.append(avg_return_play)

      # If the episode has come to an end, record and print its return
      if done:
        print(episode + 1, epsi, reward_sum)
        returns_train.append(reward_sum)

        # If episode > 100, then print the average reward for the last hundred episodes
        if episode + 1 > 100:
          x = sum(returns_train[-100:])/100
          print("Average return training:", x)
          # Logging the training average return
          wandb.log({"Average return over 100 episodes of training": x, "episode": episode + 1})
        break

    # If training steps >= 2500 (i.e. at least 5 evaluations exist) and the agent has obtained
    # an average return of 475 during the last 5 times of playing, end training
    if step >= 2500 and sum(returns_play[-5:]) / 5 >= 475:
      break

  # Playing after training to obtain end results
  play(episodes = 100, step = step, end = True, envs = env2)

  # Closing this seed's wandb run explicitly so the next seed starts a fresh run
  wandb.finish()

SEED: 12


[34m[1mwandb[0m: Currently logged in as: [33mraisa[0m (use `wandb login --relogin` to force relogin)


1 0.9995679999999996 12.0
2 0.9986679999999987 25.0
3 0.9973359999999973 37.0
4 0.9969039999999969 12.0
5 0.9964359999999964 13.0
6 0.9958959999999959 15.0
7 0.9952839999999953 17.0
8 0.9946719999999947 17.0
9 0.9939159999999939 21.0
10 0.9933039999999933 17.0
11 0.9924759999999925 23.0
12 0.9919359999999919 15.0
13 0.9907839999999908 32.0
14 0.9898479999999898 26.0
15 0.9888759999999889 27.0
16 0.9884799999999885 11.0
17 0.9877239999999877 21.0
18 0.9871119999999871 17.0
19 0.9864639999999865 18.0
20 0.9858159999999858 18.0
21 0.9847359999999847 30.0
22 0.9841599999999842 16.0
23 0.9826839999999827 41.0
24 0.9820719999999821 17.0
Average return over 25 episodes of playing: 37.96
25 0.9811359999999811 26.0
26 0.9805959999999806 15.0
27 0.98005599999998 15.0
28 0.9794799999999795 16.0
29 0.9783999999999784 30.0
30 0.9776079999999776 22.0
31 0.9763839999999764 34.0
32 0.9753759999999754 28.0
33 0.9749439999999749 12.0
34 0.9732879999999733 46.0
35 0.9711639999999712 59.0
36 0.97062399999

VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
Average return over 100 episodes of training,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▂▂▂▂▂▂▂▂▂▂▂▂▂▃▅▅▅▆▇██
Average return over 25 episodes of playing,▁▃▂▆▆▁▂▂▂▂▂▂▂▆▂▂▂▂▂█▆▃██▂▂▂▂▂▂▂▂▃▃▄▃▅▇██
episode,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇███

0,1
Average return over 100 episodes of training,125.57
Average return over 25 episodes of playing,500.0
episode,688.0
step,33000.0


1 0.9987039999999987 36.0
2 0.9977679999999978 26.0
3 0.9973359999999973 12.0
4 0.9968679999999969 13.0
5 0.9951759999999952 47.0
6 0.9946719999999947 14.0
7 0.9940959999999941 16.0
8 0.9934119999999934 19.0
9 0.9926559999999927 21.0
10 0.992043999999992 17.0
11 0.9914679999999915 16.0
12 0.990963999999991 14.0
13 0.9904959999999905 13.0
14 0.9898479999999898 18.0
15 0.9878679999999879 55.0
16 0.9873639999999874 14.0
17 0.9866799999999867 19.0
18 0.9850959999999851 44.0
19 0.9845919999999846 14.0
20 0.9842319999999842 10.0
21 0.9834759999999835 21.0
22 0.9827919999999828 19.0
23 0.9822159999999822 16.0
Average return over 25 episodes of playing: 23.44
24 0.9814239999999814 22.0
25 0.9795159999999795 53.0
26 0.9788679999999789 18.0
27 0.9782559999999783 17.0
28 0.9777519999999778 14.0
29 0.9771759999999772 16.0
30 0.9752679999999753 53.0
31 0.9744039999999744 24.0
32 0.9738999999999739 14.0
33 0.9734319999999734 13.0
34 0.9729279999999729 14.0
35 0.9723879999999724 15.0
36 0.97141599999

VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
Average return over 100 episodes of training,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▂▂▁▂▂▂▁▂▂▁▂▂▂▂▂▂▃▄▅▅▆█
Average return over 25 episodes of playing,▁▄▄▃██▂▂▂▂█▃▂▂▂▂▂▂▂▂▃▃▃▆▂▂▃▂▂▂▂▂██▂▄████
episode,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇███

0,1
Average return over 100 episodes of training,153.26
Average return over 25 episodes of playing,500.0
episode,768.0
step,35000.0


1 0.9994239999999994 16.0
2 0.9988839999999989 15.0
3 0.9977319999999977 32.0
4 0.9966159999999966 31.0
5 0.9944919999999945 59.0
6 0.9934479999999934 29.0
7 0.9927279999999927 20.0
8 0.9921519999999922 16.0
9 0.9914679999999915 19.0
10 0.9909279999999909 15.0
11 0.9886239999999886 64.0
12 0.988011999999988 17.0
13 0.9872559999999873 21.0
14 0.985959999999986 36.0
15 0.9852759999999853 19.0
16 0.984951999999985 9.0
17 0.9844839999999845 13.0
18 0.9828279999999828 46.0
19 0.9821079999999821 20.0
Average return over 25 episodes of playing: 34.12
20 0.9817479999999817 10.0
21 0.9801279999999801 45.0
22 0.9796599999999797 13.0
23 0.9786159999999786 29.0
24 0.9776799999999777 26.0
25 0.9772839999999773 11.0
26 0.9766719999999767 17.0
27 0.9761679999999762 14.0
28 0.9756999999999757 13.0
29 0.9751599999999752 15.0
30 0.9745119999999745 18.0
31 0.9739359999999739 16.0
32 0.9715959999999716 65.0
33 0.971019999999971 16.0
34 0.9702639999999703 21.0
35 0.9699399999999699 9.0
36 0.968031999999968