In [None]:
# Importing needed libraries
import torch
import torch.nn as nn
import gym
import numpy as np
import random
import wandb
from copy import deepcopy

In [None]:
# Setting seeds
def make_deterministic(seed):
  """Seed every random number generator used in this notebook.

  The same value is handed, in this order, to PyTorch, Python's `random`
  module and NumPy's global generator.

  Args:
    seed: Seed value shared by all three generators.
  """
  for seed_fn in (torch.manual_seed, random.seed, np.random.seed):
    seed_fn(seed)

In [None]:
# Building a class for the neural network
class Neural_Network(nn.Module):
  """Feed-forward network made of three linear layers (two hidden layers).

  Args:
    in_dim: Size of the input vector.
    out_dim: Size of the output vector.
    hidden_dim: Indexable whose entries 0 and 1 are the widths of the first
      and second hidden layer.
  """

  def __init__(self, in_dim, out_dim, hidden_dim):
    super().__init__()
    first_width, second_width = hidden_dim[0], hidden_dim[1]
    # Layers are created input -> hidden -> output, in that order.
    self.input_layer = nn.Linear(in_dim, first_width)
    self.hidden_layer = nn.Linear(first_width, second_width)
    self.output_layer = nn.Linear(second_width, out_dim)

  def forward (self, x, activation_function = nn.ReLU()):
    """Compute the network output for `x`.

    Args:
      x: Input tensor whose last dimension has size `in_dim`.
      activation_function: Callable applied after the input layer and after
        the hidden layer; the output layer is left linear.

    Returns:
      The un-activated result of the output layer.
    """
    hidden = x
    for layer in (self.input_layer, self.hidden_layer):
      hidden = activation_function(layer(hidden))
    return self.output_layer(hidden)

In [None]:
# Defining function for epsilon greedy actions
def eps_greedy(state, online_model, eps, num_actions=None):
  """Choose an action with an epsilon-greedy policy.

  A uniform number in [0, 1) is drawn; if it is larger than `eps` the index
  of the largest entry of `online_model(state)` is returned, otherwise an
  action index is sampled uniformly at random.

  Args:
    state: Input handed to `online_model`. The maximum is taken over dim 0
      of the model output, so a single (un-batched) state is expected.
    online_model: Callable mapping `state` to a tensor of action values.
    eps: Exploration threshold. A negative value always yields the greedy
      action; a value of 1 or more always yields a random action.
    num_actions: Number of discrete actions to sample from when exploring.
      When omitted, the notebook-level `env.action_space.n` is used, which
      keeps existing call sites working unchanged.

  Returns:
    The selected action index.
  """
  if np.random.rand() > eps:
    # Greedy branch: no gradients are needed for action selection.
    with torch.no_grad():
      _, best_action = torch.max(online_model(state), dim=0)
    return best_action.item()

  if num_actions is None:
    # Backward-compatible fallback to the global training environment.
    num_actions = env.action_space.n
  return np.random.choice(num_actions)

In [None]:
 # Defining function for playing during and after training to obtain performance of the agent
def play(episodes, step, end, envs, model=None):
  """Run greedy evaluation episodes and report their average return.

  Each episode is played to completion with the greedy policy (`eps = -1`
  in `eps_greedy`, so no random actions are taken).

  Args:
    episodes: Number of episodes to play.
    step: Training step; only used as the "step" value logged to wandb when
      `end` is False.
    end: If True, this is the final evaluation: the average over the last
      (up to) 100 episodes is printed. If False, the average over the last
      (up to) 25 episodes is printed and logged to wandb.
    envs: Environment to play in. It must provide `reset()` returning a
      state and `step(action)` returning a 4-tuple whose first three
      entries are next state, reward and done flag.
    model: Network used to pick actions. Defaults to the notebook-level
      `online_model`, which keeps existing call sites working unchanged.

  Returns:
    The average return over the evaluated window, or 0.0 when no episode
    was played.
  """
  if model is None:
    model = online_model

  returns = []
  for _ in range(episodes):
    state = envs.reset()
    episode_return = 0
    done = False
    while not done:
      action = eps_greedy(state = torch.tensor(state).float(), online_model = model, eps = -1)
      state, reward, done, _info = envs.step(action)
      episode_return += reward
    returns.append(episode_return)

  # Average over the episodes actually collected instead of a fixed divisor,
  # so the result is also correct when `episodes` is smaller than the window.
  window = 100 if end else 25
  recent = returns[-window:]
  avg_return = sum(recent) / len(recent) if recent else 0.0

  if end:
    print("Average return over 100 episodes of trained agent:", avg_return)
  else:
    wandb.log({"Average return over 25 episodes of playing": avg_return,"step": step})
    print("Average return over 25 episodes of playing:", avg_return)

  return avg_return

In [None]:
##### DQN Algorithm ###########
#
# For every seed: fill a replay memory with random transitions, then train an
# online network with epsilon-greedy exploration, one-step targets from a
# periodically synchronised target network, and periodic greedy evaluation.

# Seeds for repeatable training results
SEEDS = (12, 34, 56)

# Creating two environments: one for training and one for playing
env = gym.make("CartPole-v1")
env2 = gym.make("CartPole-v1")

# Looping over different seeds for training different agents
for seed in SEEDS:
  make_deterministic(seed)
  print("SEED:", seed)
  wandb.init(project="DQN-CartPole", entity="raisa")
  config = wandb.config

  # Defining hyperparameters
  gamma = 0.99 # discount factor
  eps_max = 1 # maximum value for epsilon
  eps_min = 0.1 # minimum value for epsilon
  eps_decay_steps = 25000 # number of steps over which epsilon is annealed
  eps_decay = (eps_max - eps_min) / eps_decay_steps # per-step decrement of epsilon
  learning_rate = 0.0025 # learning rate
  batch_size = 64 # batch size
  num_episodes = 1000 # maximum number of episodes
  min_memory = 100 # size of replay memory at start of training
  max_memory = 1000 # maximum size of replay memory
  hidden_dims = [128, 64] # size of hidden layers in the neural network
  act_period = 150 # update frequency of target neural network
  play_period = 500 # evaluate the agent every this many training steps
  play_episodes = 25 # number of episodes per evaluation during training
  min_steps = 2500 # earliest training step at which early stopping is checked
  solved_window = 5 # number of most recent evaluations averaged for early stopping
  solved_return = 475 # average evaluation return that ends training early
  criterion = nn.MSELoss() # loss function

  # Logging hyperparameters
  config.learning_rate = learning_rate
  config.gamma = gamma
  config.batch_size = batch_size
  config.num_episodes = num_episodes
  config.max_memory = max_memory
  config.min_memory = min_memory
  config.hidden_dim = hidden_dims
  config.act_period = act_period
  config.eps_max = eps_max
  config.eps_min = eps_min
  config.eps_decay = eps_decay
  config.seed = seed

  # Setting the seed for the environments
  env.seed(seed)
  env.action_space.seed(seed)
  env2.seed(seed)
  env2.action_space.seed(seed)

  # Initializing replay memory (parallel lists, index i describes transition i)
  memory_states = []
  memory_actions = []
  memory_rewards = []
  memory_terminal = []
  memory_states_next = []

  # Initializing list for returns of the training episodes
  returns_train = []
  # Initializing list for average returns during playing
  returns_play = []

  # Creating the online network and its optimizer; the target network starts
  # as an exact copy, so it does not need to be constructed separately.
  obs_dim = int(np.prod(env.observation_space.shape))
  online_model = Neural_Network(in_dim = obs_dim, out_dim = env.action_space.n, hidden_dim = hidden_dims)
  optimizer = torch.optim.Adam(online_model.parameters(), lr=learning_rate)
  target_model = deepcopy(online_model)

  # Storing minimum experiences in replay memory using uniformly random actions
  state = env.reset()
  for _warmup in range(min_memory):
    action = np.random.choice(env.action_space.n)
    state_next, reward, done, _ = env.step(action)
    memory_actions.append(action)
    memory_states.append(state)
    memory_states_next.append(state_next)
    memory_rewards.append(reward)
    memory_terminal.append(done)
    state = state_next

    if done:
      state = env.reset()

  epsi = eps_max
  step = 0 # global training step counter across all episodes

  # Training the agent for number of episodes
  for episode in range(0, num_episodes):
    state = env.reset()
    reward_sum = 0
    while True:
      step += 1

      # Dropping the oldest transition before adding a new one, so the replay
      # memory never holds more than max_memory transitions
      if len(memory_states) >= max_memory:
        for memory in (memory_actions, memory_states, memory_states_next, memory_rewards, memory_terminal):
          memory.pop(0)

      # Taking epsilon greedy action and storing experience in replay memory
      epsi = max(epsi - eps_decay, eps_min)
      action = eps_greedy(state = torch.tensor(state).float() , online_model = online_model, eps = epsi)
      state_next, reward, done, _ = env.step(action)
      memory_actions.append(action)
      memory_states.append(state)
      memory_states_next.append(state_next)
      memory_rewards.append(reward)
      # NOTE(review): `done` is stored as the terminal flag as-is. If the
      # environment also sets `done` when its time limit is hit, those
      # transitions are treated as terminal in the target below - confirm
      # that this is intended.
      memory_terminal.append(done)
      state = state_next

      reward_sum += reward

      # Selecting random batch of experiences (without replacement)
      rand_index = np.random.choice(len(memory_states_next), size=batch_size, replace=False)
      batch_states = torch.tensor(np.array(memory_states)[rand_index], dtype=torch.float32)
      batch_states_next = torch.tensor(np.array(memory_states_next)[rand_index], dtype=torch.float32)
      batch_rewards = torch.tensor(np.array(memory_rewards)[rand_index], dtype=torch.float32)
      batch_actions = torch.tensor(np.array(memory_actions)[rand_index], dtype=torch.int64)
      batch_terminal = torch.tensor(np.array(memory_terminal)[rand_index], dtype=torch.int64)

      # Calculating the target q-values with the target network (no gradients)
      with torch.no_grad():
        q_values_next = target_model(batch_states_next)
        max_q_values_next, _ = torch.max(q_values_next, dim=1)
        # The bootstrap term is zeroed for transitions flagged as terminal
        q_target = (batch_rewards + gamma * (1 - batch_terminal) * max_q_values_next)

      # Calculating the expected q-values: the online network's value of the
      # action that was actually taken in each sampled transition
      q_all = online_model(batch_states)
      q_exp = torch.gather(input = q_all, dim = 1, index = batch_actions.unsqueeze(1)).squeeze(1)

      # Calculating loss (prediction first, target second)
      loss = criterion(q_exp, q_target)

      # Clearing old gradients, backpropagating the loss and taking an optimization step
      optimizer.zero_grad()
      loss.backward()
      optimizer.step()

      # Copying the weights of online network in the target network every act_period steps
      if step % act_period == 0:
        target_model = deepcopy(online_model)

      # Playing every play_period steps to obtain the performance of the DQN agent
      if step % play_period == 0:
        avg_return_play = play(episodes = play_episodes, step = step, end = False, envs = env2)
        returns_play.append(avg_return_play)

      # If the episode has come to an end, print and store its return
      if done:
        print(episode + 1, epsi, reward_sum)
        returns_train.append(reward_sum)

        # If episode > 100, then print the average reward for the last hundred episodes
        if episode + 1 > 100:
          x = sum(returns_train[-100:])/100
          print("Average return training:", x)
          # Logging the training average return
          wandb.log({"Average return over 100 episodes of training": x, "episode": episode + 1})
        break

    # Ending the training once at least min_steps steps were taken and the
    # last solved_window evaluations average at least solved_return
    if step >= min_steps and sum(returns_play[-solved_window:]) / solved_window >= solved_return:
      break

  # Playing after training to obtain end results
  play(episodes = 100, step = step, end = True, envs = env2)

  # Closing this seed's wandb run explicitly so the next seed starts a fresh run
  wandb.finish()

SEED: 12


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


1 0.9991359999999991 24.0
2 0.9983799999999984 21.0
3 0.998019999999998 10.0
4 0.9971559999999972 24.0
5 0.9964359999999964 20.0
6 0.9954999999999955 26.0
7 0.995031999999995 13.0
8 0.9944919999999945 15.0
9 0.9938799999999939 17.0
10 0.9931239999999931 21.0
11 0.9923319999999923 22.0
12 0.9898839999999899 68.0
13 0.9891639999999892 20.0
14 0.9886599999999887 14.0
15 0.9879399999999879 20.0
16 0.9854199999999854 70.0
17 0.985023999999985 11.0
18 0.9844119999999844 17.0
19 0.9830799999999831 37.0
20 0.9823599999999824 20.0
Average return over 25 episodes of playing: 75.04
21 0.9817119999999817 18.0
22 0.9808119999999808 25.0
23 0.9802719999999803 15.0
24 0.9797319999999797 15.0
25 0.9783279999999783 39.0
26 0.9775359999999775 22.0
27 0.9769239999999769 17.0
28 0.9761319999999761 22.0
29 0.9753039999999753 23.0
30 0.9747999999999748 14.0
31 0.9742599999999743 15.0
32 0.9737919999999738 13.0
33 0.9733599999999734 12.0
34 0.9723879999999724 27.0
35 0.9717399999999717 18.0
36 0.971235999999

VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
Average return over 100 episodes of training,▁▁▁▁▁▁▁▁▁▁▁▂▁▂▂▂▂▁▁▁▂▁▂▂▂▂▃▄▅▅▆▆▆▆▆▆▆▇▆█
Average return over 25 episodes of playing,▂▁▄▃▃▃▄▄▂▂▃▂▁▅▂▂▂▅▄▃▂▂▃▃▃▃▇▂▂▃▃▂▃▆▇▄▇███
episode,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
step,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇████

0,1
Average return over 100 episodes of training,73.75
Average return over 25 episodes of playing,500.0
episode,606.0
step,23000.0


1 0.9993159999999993 19.0
2 0.9987399999999987 16.0
3 0.9983439999999983 11.0
4 0.9976599999999977 19.0
5 0.9966519999999967 28.0
6 0.9958239999999958 23.0
7 0.9952479999999952 16.0
8 0.9935919999999936 46.0
9 0.9927999999999928 22.0
10 0.9918999999999919 25.0
11 0.9913599999999914 15.0
12 0.9909279999999909 12.0
13 0.9893799999999894 43.0
14 0.9879039999999879 41.0
15 0.9866439999999866 35.0
16 0.9857439999999857 25.0
17 0.9851319999999851 17.0
18 0.9843399999999843 22.0
19 0.9838719999999839 13.0
20 0.9831879999999832 19.0
21 0.9826479999999826 15.0
Average return over 25 episodes of playing: 23.68
22 0.9817479999999817 25.0
23 0.9812439999999812 14.0
24 0.9805959999999806 18.0
25 0.9801639999999802 12.0
26 0.9788319999999788 37.0
27 0.9777159999999777 31.0
28 0.9763119999999763 39.0
29 0.9758079999999758 14.0
30 0.9747639999999748 29.0
31 0.9739359999999739 23.0
32 0.9732879999999733 18.0
33 0.9728919999999729 11.0
34 0.9719199999999719 27.0
35 0.9708759999999709 29.0
36 0.970407999

VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
Average return over 100 episodes of training,▁▁▁▁▁▁▁▁▁▁▁▁▁▂▂▁▁▁▂▁▁▂▁▁▁▁▁▁▂▂▂▃▃▄▄▅▅▆▇█
Average return over 25 episodes of playing,▁▃▄▄▃▂▂▂▂▂▁▂▂▂▂▂▂▂▂▂▂▅▅▂▂▃▃▂▅▃▂▃▃▆██▇▇██
episode,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
step,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███

0,1
Average return over 100 episodes of training,154.54
Average return over 25 episodes of playing,500.0
episode,731.0
step,36500.0


1 0.9981279999999981 52.0
2 0.9974799999999975 18.0
3 0.9958959999999959 44.0
4 0.9953919999999954 14.0
5 0.9946719999999947 20.0
6 0.9941319999999941 15.0
7 0.9935919999999936 15.0
8 0.9926559999999927 26.0
9 0.992007999999992 18.0
10 0.9915039999999915 14.0
11 0.9909279999999909 16.0
12 0.9897759999999898 32.0
13 0.9893439999999893 12.0
14 0.9876159999999876 48.0
15 0.9871119999999871 14.0
16 0.9863199999999863 22.0
17 0.9858159999999858 14.0
18 0.9852039999999852 17.0
19 0.9836919999999837 42.0
20 0.9825039999999825 33.0
Average return over 25 episodes of playing: 306.08
21 0.9819279999999819 16.0
22 0.98005599999998 52.0
23 0.9794079999999794 18.0
24 0.978975999999979 12.0
25 0.9783999999999784 16.0
26 0.9777519999999778 18.0
27 0.9768519999999768 25.0
28 0.9763479999999763 14.0
29 0.9745479999999745 50.0
30 0.9735039999999735 29.0
31 0.972999999999973 14.0
32 0.9725319999999725 13.0
33 0.9721359999999721 11.0
34 0.9711999999999712 26.0
35 0.9703359999999703 24.0
36 0.9696159999999