In [None]:
# Importing needed libraries
import torch
import torch.nn as nn
import gym
import numpy as np
import random
import wandb
from copy import deepcopy
import statistics

In [None]:
# Setting seeds
def make_deterministic(seed):
  """Seed the global torch, `random` and NumPy generators with one value.

  Args:
    seed: Value passed unchanged to each library's seeding function.
  """
  # Seeding order: torch, then the standard library, then NumPy.
  for seed_fn in (torch.manual_seed, random.seed, np.random.seed):
    seed_fn(seed)

In [None]:
# Building a class for the neural network
class Neural_Network(nn.Module):
  """Fully connected network with two hidden layers and a linear output layer.

  Args:
    in_dim: Size of the input vector.
    out_dim: Number of output units.
    hidden_dim: Indexable collection; elements 0 and 1 are the widths of the
      first and second hidden layer.
  """

  def __init__(self, in_dim, out_dim, hidden_dim):
    super().__init__()
    first_width = hidden_dim[0]
    second_width = hidden_dim[1]
    # Layers are registered in input -> hidden -> output order.
    self.input_layer = nn.Linear(in_dim, first_width)
    self.hidden_layer = nn.Linear(first_width, second_width)
    self.output_layer = nn.Linear(second_width, out_dim)

  def forward(self, x, activation_function = nn.ReLU()):
    """Map `x` through both hidden layers and the output layer.

    Args:
      x: Input tensor whose last dimension has size `in_dim`.
      activation_function: Callable applied after each of the two hidden
        layers; the output layer stays linear.

    Returns:
      The un-activated output of `output_layer`.
    """
    hidden = x
    for layer in (self.input_layer, self.hidden_layer):
      hidden = activation_function(layer(hidden))
    return self.output_layer(hidden)

In [None]:
# Defining function for epsilon greedy actions
def eps_greedy(state, online_model, eps, n_actions=None):
  """Pick an action epsilon-greedily from the Q-values of `online_model`.

  A uniform sample from [0, 1) is drawn first. If it exceeds `eps`, the action
  with the largest Q-value is returned; otherwise an action index is drawn
  uniformly at random. A negative `eps` therefore always yields the greedy
  action.

  Args:
    state: Tensor passed to `online_model`. The output is reduced along
      dim 0, so a single un-batched state is expected.
    online_model: Callable mapping `state` to one Q-value per action.
    eps: Exploration threshold compared against the uniform sample.
    n_actions: Number of discrete actions to explore over. When None, it is
      read from the last dimension of the model output, so the function does
      not depend on any environment object defined elsewhere.

  Returns:
    int: Index of the chosen action.
  """
  with torch.no_grad():
    if np.random.rand() > eps:
      _, greedy_action = torch.max(online_model(state), dim=0)
      return greedy_action.item()

    # Exploration: the action count comes from the argument or from the
    # width of the Q-value vector, never from a global environment.
    if n_actions is None:
      n_actions = online_model(state).shape[-1]
    return int(np.random.choice(n_actions))

In [None]:
 # Defining function for playing during and after training to obtain performance of the agent
def play(episodes, step, end, envs, model=None):
  """Run greedy evaluation episodes and report their average return.

  Every episode is played until `done` with `eps_greedy(..., eps=-1)`, i.e.
  without exploration. The average covers the most recent 100 episodes when
  `end` is true and the most recent 25 otherwise. It is divided by the number
  of episodes actually in that window, so it stays correct when `episodes` is
  smaller than the window.

  Args:
    episodes: Number of episodes to play.
    step: Training step logged to wandb next to the average (used only when
      `end` is false).
    end: True for the final evaluation (result is printed only); False for an
      intermediate evaluation (result is printed and logged to wandb).
    envs: Environment exposing `reset()` -> state and
      `step(action)` -> (state, reward, done, info).
    model: Q-network used to choose actions. When None, the module-level
      `online_model` is used and must exist.

  Returns:
    The average return, or 0.0 when no episode was played.
  """
  policy = online_model if model is None else model
  window = 100 if end else 25
  returns = []

  for _ in range(episodes):
    state = envs.reset()
    episode_return = 0
    done = False
    while not done:
      action = eps_greedy(state = torch.tensor(state).float(), online_model = policy, eps = -1)
      state, reward, done, _info = envs.step(action)
      episode_return += reward
    returns.append(episode_return)
    # NOTE(review): this extra reset looks redundant (the next episode resets
    # again). It is kept because dropping it would presumably shift which
    # start states a seeded environment produces - confirm before removing.
    envs.reset()

  recent = returns[-window:]
  avg_return = sum(recent) / len(recent) if recent else 0.0

  # The "100"/"25" in the messages and the wandb key are fixed labels.
  if end:
    print("Average return over 100 episodes of trained agent:", avg_return)
  else:
    wandb.log({"Average return over 25 episodes of playing": avg_return,"step": step})
    print("Average return over 25 episodes of playing:", avg_return)

  return avg_return

In [None]:
##### Prioritized Double DQN Algorithm ###########
#
# For every seed, a fresh online/target network pair is trained on CartPole-v1
# with Double-DQN targets and a proportionally prioritized replay memory.
# Every 500 training steps the greedy policy is evaluated with `play`; training
# for a seed stops once the last five evaluations average at least 475, and a
# final 100-episode evaluation is printed.

# Seeds for repeatable training results
SEEDS = (12, 34, 56)

# Creating two environments: one for training and one for playing (evaluation)
env = gym.make("CartPole-v1")
env2 = gym.make("CartPole-v1")

# Looping over different seeds for training different agents
for seed in SEEDS:
  make_deterministic(seed)
  print("SEED:", seed)
  wandb.init(project="Prior-Exp-Replay-CartPole", entity="raisa")
  config = wandb.config

  # Defining hyperparameters
  gamma = 0.99 # discount factor
  eps_max = 1 # maximum value for epsilon
  eps_min = 0.1 # minimum value for epsilon
  eps_decay = (eps_max - eps_min) / 25000 # amount subtracted from epsilon per training step
  learning_rate = 0.0025 # learning rate
  batch_size = 64 # batch size
  num_episodes = 1000 # maximum number of episodes
  min_memory = 100 # size of replay memory at start of training
  max_memory = 1000 # maximum size of replay memory
  hidden_dims = [128, 64] # size of hidden layers in the neural network
  act_period =  150 # update frequency (in steps) of the target neural network
  beta = 0.5 # exponent of the importance-sampling weights
  l = 0.02 # added to every TD error so experiences with TD error 0 can still be sampled
  alpha = 0.8 # specifies degree of prioritization

  # Logging hyperparameters
  config.learning_rate = learning_rate
  config.gamma = gamma
  config.batch_size = batch_size
  config.num_episodes = num_episodes
  config.max_memory = max_memory
  config.min_memory = min_memory
  config.hidden_dim = hidden_dims
  config.act_period = act_period
  config.eps_max = eps_max
  config.eps_min = eps_min
  config.eps_decay = eps_decay
  config.beta = beta
  config.l = l
  config.alpha = alpha
  config.seed = seed

  # Setting the seed for the environments
  env.seed(seed)
  env.action_space.seed(seed)
  env2.seed(seed)
  env2.action_space.seed(seed)

  # Initialising the replay memory (parallel lists, one entry per transition)
  memory_states = []
  memory_actions = []
  memory_rewards = []
  memory_terminal = []
  memory_td_error = []
  memory_states_next = []
  # Not used anywhere in this cell; kept in case other cells read them.
  reward_durch = []
  reward_werte = []

  # Initializing list for returns of finished training episodes
  returns_train = []
  # Initializing list for average returns during playing
  returns_play = []

  # Creating the online network, its optimizer, and the target network as a copy
  n_observations = int(np.prod(env.observation_space.shape))
  online_model = Neural_Network(in_dim = n_observations, out_dim = env.action_space.n, hidden_dim = hidden_dims)
  optimizer = torch.optim.Adam(online_model.parameters(), lr=learning_rate)
  target_model = deepcopy(online_model)

  # Storing minimum experiences in replay memory using random actions
  state = env.reset()
  for _ in range(min_memory):
    action = env.action_space.sample()
    state_next, reward, done, _ = env.step(action)
    memory_actions.append(action)
    memory_states.append(state)
    memory_states_next.append(state_next)
    memory_rewards.append(reward)
    memory_terminal.append(done)
    memory_td_error.append(0)
    state = state_next

    if done:
      state = env.reset()

  epsi = eps_max
  step = 0
  # Set once the early-stopping criterion is met so the episode loop ends too
  training_finished = False

  # Training the agent for number of episodes
  for episode in range(0, num_episodes):
    state = env.reset()
    reward_sum = 0
    while True:
      step += 1

      # Dropping the oldest experience first so the memory never exceeds max_memory
      if len(memory_states) >= max_memory:
        memory_actions.pop(0)
        memory_states.pop(0)
        memory_states_next.pop(0)
        memory_rewards.pop(0)
        memory_terminal.pop(0)
        memory_td_error.pop(0)

      # Taking epsilon greedy action and storing experience in replay memory
      epsi = max(epsi - eps_decay, eps_min)
      action = eps_greedy(state = torch.tensor(state).float() , online_model = online_model, eps = epsi)
      state_next, reward, done, _ = env.step(action)
      # A new experience receives the largest TD error currently stored
      td_error = max(memory_td_error)
      memory_actions.append(action)
      memory_states.append(state)
      memory_states_next.append(state_next)
      memory_rewards.append(reward)
      memory_terminal.append(done)
      memory_td_error.append(td_error)
      state = state_next

      reward_sum += reward

      # Sampling probabilities: P(i) = (|td_i| + l)^alpha / sum_k (|td_k| + l)^alpha
      priorities = (np.asarray(memory_td_error, dtype=np.float64) + l) ** alpha
      choose_probs = priorities / priorities.sum()

      # Selecting prioritized batch of experiences
      rand_index = np.random.choice(len(memory_states), size=batch_size, replace=False, p=choose_probs)
      batch_states = torch.tensor(np.array(memory_states)[rand_index], dtype=torch.float32)
      batch_states_next = torch.tensor(np.array(memory_states_next)[rand_index], dtype=torch.float32)
      batch_rewards = torch.tensor(np.array(memory_rewards)[rand_index], dtype=torch.float32)
      batch_actions = torch.tensor(np.array(memory_actions)[rand_index], dtype=torch.int64)
      batch_terminal = torch.tensor(np.array(memory_terminal)[rand_index], dtype=torch.int64)

      # Importance-sampling weights w_i = (N * P(i))^-beta, scaled by the largest
      # weight so that they only ever scale an update downwards
      weights = (len(memory_states) * choose_probs) ** -beta
      batch_weights = torch.tensor(weights[rand_index] / weights.max(), dtype=torch.float32)

      # Calculating q-values for training
      with torch.no_grad():
        # Double DQN: the online network chooses the next action ...
        q_values_next = online_model(batch_states_next)
        _, inds = torch.max(q_values_next, dim=1)
        inds = torch.reshape(input = inds, shape = (batch_size, 1))
        # ... and the target network evaluates it
        max_q_values_next = torch.gather(target_model(batch_states_next), 1, inds)
        max_q_values_next = torch.reshape(max_q_values_next, (-1,))
        # Calculating the target q-values (no bootstrapping from terminal transitions)
        q_target = batch_rewards + gamma * (1 - batch_terminal) * max_q_values_next

      # Calculating the q-values of the actions that were actually taken
      q_exp = online_model(batch_states)
      batch_actions = torch.reshape(input = batch_actions, shape = (batch_size, 1))
      q_exp = torch.gather(input = q_exp, dim = 1, index = batch_actions)
      q_exp = torch.reshape(q_exp, (-1,))

      # Calculating loss: each squared TD error is scaled by its weight once.
      # Multiplying the weight into the error before squaring would apply
      # w^2, i.e. double the effective beta.
      error = q_target - q_exp
      loss = (batch_weights * error.pow(2)).mean()

      # Backpropagating the loss and taking an optimization step
      optimizer.zero_grad()
      loss.backward()
      optimizer.step()

      # Updating the stored TD errors of the sampled experiences
      new_td_errors = error.detach().abs().tolist()
      for memory_index, new_td_error in zip(rand_index, new_td_errors):
        memory_td_error[memory_index] = new_td_error

      # Copying weights of the online network in the target network every act_period steps
      if step % act_period == 0:
        target_model = deepcopy(online_model)

      # Playing every 500 steps to obtain the performance of the DQN agent
      if step % 500 == 0:
        avg_return_play = play(episodes = 25, step = step, end = False, envs = env2)
        returns_play.append(avg_return_play)

      # If an episode has come to an end, print and record its return
      if done:
        print(episode + 1, epsi, reward_sum)
        returns_train.append(reward_sum)

        # If episode > 100, then print the average return for the last hundred episodes
        if episode + 1  > 100:
          x = sum(returns_train[-100:])/100
          print("Average return training:", x)
          # Logging the training average return
          wandb.log({"Average return over 100 episodes of training": x, "episode": episode + 1})
        break

      # If training steps >= 2500 and the agent has obtained an average return of 475
      # during the last 5 times of playing, end training
      if step >= 2500 and sum(returns_play[-5:]) / 5 >= 475:
        training_finished = True
        break

    # Leaving the episode loop as well; breaking only the inner loop would start
    # a new episode and keep training
    if training_finished:
      break

  # Playing after training to obtain end results
  play(episodes = 100, step = step, end = True, envs = env2)

SEED: 12


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


1 0.9994959999999995 14.0
2 0.9975519999999976 54.0
3 0.996975999999997 16.0
4 0.9963279999999963 18.0
5 0.9956079999999956 20.0
6 0.9952119999999952 11.0
7 0.9941679999999942 29.0
8 0.9935199999999935 18.0
9 0.9922959999999923 34.0
10 0.9916119999999916 19.0
11 0.9911439999999911 13.0
12 0.9903879999999904 21.0
13 0.9897759999999898 17.0
14 0.9888759999999889 25.0
15 0.988011999999988 24.0
16 0.9860679999999861 54.0
17 0.9857079999999857 10.0
18 0.9851679999999852 15.0
19 0.9844119999999844 21.0
20 0.9835839999999836 23.0
21 0.9829359999999829 18.0
22 0.9825399999999825 11.0
Average return over 25 episodes of playing: 9.4
23 0.9818559999999819 19.0
24 0.981027999999981 23.0
25 0.9804159999999804 17.0
26 0.9796599999999797 21.0
27 0.9791559999999792 14.0
28 0.9787239999999787 12.0
29 0.9782919999999783 12.0
30 0.9772839999999773 28.0
31 0.9768519999999768 12.0
32 0.9746559999999747 61.0
33 0.9742959999999743 10.0
34 0.9731439999999731 32.0
35 0.9726759999999727 13.0
36 0.97188399999997

VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
Average return over 100 episodes of training,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▂▂▂▂▂▂▂▂▁▂▂▂▂▂▂▂▂▃▄▅▅█
Average return over 25 episodes of playing,▁▂▅▃▄▄▃▃▃▅▃▃▂▃▃▃▂▂▂▂▂▂▂▃▃▃▄█▃▃▂▂▂▃▅▃▅███
episode,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇███
step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇███

0,1
Average return over 100 episodes of training,135.44
Average return over 25 episodes of playing,500.0
episode,742.0
step,33000.0


1 0.9986679999999987 37.0
2 0.9983079999999983 10.0
3 0.9978399999999978 13.0
4 0.9967959999999968 29.0
5 0.995967999999996 23.0
6 0.9954999999999955 13.0
7 0.9947079999999947 22.0
8 0.9943839999999944 9.0
9 0.9937359999999937 18.0
10 0.9927999999999928 26.0
11 0.9924039999999924 11.0
12 0.9908199999999908 44.0
13 0.9899199999999899 25.0
14 0.988947999999989 27.0
15 0.9884439999999884 14.0
16 0.988011999999988 12.0
17 0.9861399999999861 52.0
18 0.9858159999999858 9.0
19 0.9844839999999845 37.0
20 0.9841959999999842 8.0
21 0.9833319999999833 24.0
Average return over 25 episodes of playing: 15.4
22 0.9817119999999817 45.0
23 0.9811359999999811 16.0
24 0.9805959999999806 15.0
25 0.9801999999999802 11.0
26 0.9798039999999798 11.0
27 0.9789039999999789 25.0
28 0.9773559999999774 43.0
29 0.9758079999999758 43.0
30 0.9753399999999753 13.0
31 0.9748359999999748 14.0
32 0.9742599999999743 16.0
33 0.9730719999999731 33.0
34 0.9724239999999724 18.0
35 0.9715959999999716 23.0
36 0.9707679999999708

VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
Average return over 100 episodes of training,▁▁▁▁▁▁▁▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▂▂▂▂▂▃▄▅▅▅▆▇▇█
Average return over 25 episodes of playing,▁▁▃▃▄▄▄▄▃▂▂▂▂▂▂▂▂▂▄▂▂▂▂▂▃▆▄▃▃▃▄▂█▆▅▄▇███
episode,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
step,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇███

0,1
Average return over 100 episodes of training,84.2
Average return over 25 episodes of playing,500.0
episode,556.0
step,22500.0


1 0.9994959999999995 14.0
2 0.998991999999999 14.0
3 0.9984159999999984 16.0
4 0.9971919999999972 34.0
5 0.9966519999999967 15.0
6 0.9961479999999961 14.0
7 0.9955359999999955 17.0
8 0.9940959999999941 40.0
9 0.9930879999999931 28.0
10 0.9926199999999926 13.0
11 0.9918999999999919 20.0
12 0.9914679999999915 12.0
13 0.9906759999999907 22.0
14 0.99002799999999 18.0
15 0.9892719999999893 21.0
16 0.9889119999999889 10.0
17 0.9884079999999884 14.0
18 0.987003999999987 39.0
19 0.9864279999999864 16.0
20 0.9853119999999853 31.0
21 0.9844839999999845 23.0
22 0.9826479999999826 51.0
Average return over 25 episodes of playing: 9.08
23 0.9808839999999809 49.0
24 0.9794079999999794 41.0
25 0.979011999999979 11.0
26 0.9783279999999783 19.0
27 0.9778599999999779 13.0
28 0.9769239999999769 26.0
29 0.9763119999999763 17.0
30 0.9745479999999745 49.0
31 0.973971999999974 16.0
32 0.9732879999999733 19.0
33 0.9728559999999729 12.0
34 0.9714879999999715 38.0
35 0.970947999999971 15.0
36 0.9703359999999703 