In [None]:
# Importing the required libraries
import torch
import torch.distributions as dist
import torch.nn as nn
import gym
import numpy as np
import random
import wandb

In [None]:
# Setting seeds
def make_deterministic(seed):
  """Seed the global random number generators used in this notebook.

  Seeds, in this order, PyTorch, Python's `random` module and NumPy's
  legacy global generator with the same value.

  Args:
    seed: integer seed handed to each generator.
  """
  seeders = (torch.manual_seed, random.seed, np.random.seed)
  for apply_seed in seeders:
    apply_seed(seed)

In [None]:
# Building a class for the neural network
class Neural_Network(nn.Module):
  """Fully connected network: an input layer, two hidden layers and a linear output layer."""

  def __init__(self, in_dim, out_dim, hidden_dim):
    """Create the four linear layers.

    Args:
      in_dim: number of input features.
      out_dim: number of output units.
      hidden_dim: indexable collection whose first three entries are the
        widths of the three intermediate representations.
    """
    super().__init__()
    # Layers are created front to back; their names are the state_dict keys.
    self.input_layer = nn.Linear(in_features=in_dim, out_features=hidden_dim[0])
    self.hidden_layer_1 = nn.Linear(in_features=hidden_dim[0], out_features=hidden_dim[1])
    self.hidden_layer_2 = nn.Linear(in_features=hidden_dim[1], out_features=hidden_dim[2])
    self.output_layer = nn.Linear(in_features=hidden_dim[2], out_features=out_dim)

  def forward(self, x, activation_function = nn.ReLU()):
    """Run `x` through the network.

    The activation is applied after every layer except the output layer,
    whose raw result is returned.

    Args:
      x: input tensor whose last dimension equals `in_dim`.
      activation_function: callable applied to each non-output layer's result.

    Returns:
      The un-activated result of the output layer.
    """
    activated_layers = (self.input_layer, self.hidden_layer_1, self.hidden_layer_2)
    hidden = x
    for layer in activated_layers:
      hidden = activation_function(layer(hidden))
    return self.output_layer(hidden)

In [None]:
# Defining function for playing during and after training to obtain performance of the agent
def play(episodes, step, end, envs, model=None):
  """Run evaluation episodes with a policy and report the average return.

  Actions are sampled from a categorical distribution whose logits are the
  policy's output for the current state. No gradients are tracked.

  Args:
    episodes: number of episodes to play.
    step: training step count, logged to wandb next to the average when
      `end` is False.
    end: if True, average over (at most) the last 100 episodes and only
      print; otherwise average over (at most) the last 25 episodes, log the
      value to wandb and print it.
    envs: environment whose `reset()` returns the initial observation and
      whose `step(action)` returns `(observation, reward, done, info)`.
    model: callable mapping a float32 state tensor to action logits.
      Defaults to the notebook-level `policy_model`.

  Returns:
    The average undiscounted return over the averaged episodes (0.0 if no
    episode was played).
  """
  if model is None:
    # Fall back to the policy network defined at notebook level.
    model = policy_model

  returns = []

  # Evaluation only: skip building autograd graphs.
  with torch.no_grad():
    for _episode in range(episodes):
      episode_return = 0
      state = envs.reset()
      done = False

      while not done:
        state_tensor = torch.tensor(state, dtype=torch.float32)
        logits = model(state_tensor)
        action = dist.Categorical(logits=logits).sample()
        state, reward, done, _info = envs.step(action.item())
        episode_return += reward

      # The next loop iteration resets the environment, so no extra reset here.
      returns.append(episode_return)

  # Divide by the number of episodes actually averaged, not a fixed constant,
  # so the mean stays correct for any `episodes` value.
  window = returns[-100:] if end else returns[-25:]
  avg_return = sum(window) / len(window) if window else 0.0

  if end:
    print("Average return over 100 episodes of trained agent:", avg_return)
  else:
    wandb.log({"Average return over 25 episodes of playing": avg_return,"step": step})
    print("Average return over 25 episodes of playing:", avg_return)

  return avg_return

In [None]:
##### Reinforce Algorithm with Baseline ###########

def _discounted_returns(rewards, gamma):
  """Compute the discounted return following every step of one episode.

  Args:
    rewards: per-step rewards of a single episode, in the order received.
    gamma: discount factor.

  Returns:
    A list `g` of the same length as `rewards` with
    `g[t] = rewards[t] + gamma * g[t + 1]` (and 0 past the last step).
  """
  discounted = []
  running = 0.0
  # Walk backwards so each return reuses the one after it (linear time).
  for reward in reversed(rewards):
    running = reward + gamma * running
    discounted.append(running)
  discounted.reverse()
  return discounted

# Seeds for repeatable training results
SEEDS = (12, 34, 56)

# Creating two environments: one for training and one for playing
env = gym.make("CartPole-v1")
env2 = gym.make("CartPole-v1")

# Looping over different seeds for training different agents
for seed in SEEDS:
  make_deterministic(seed)
  wandb.init(project="Reinforce-Baseline-CartPole", entity="raisa")
  print("SEED:", seed)
  config = wandb.config

  # Defining hyperparameters
  gamma = 0.99 # gamma
  learning_rate_1 = 0.001 # learning rate for policy network
  learning_rate_2 = 0.0005 # learning rate for state-value network
  num_episodes = 10000 # maximum number of episodes
  hidden_dims_1 = [128, 64, 32] # size of hidden layers in the policy network
  hidden_dims_2 = [128, 64, 32] # size of hidden layers in the state-value network

  # Logging hyperparameters
  config.learning_rate_policy = learning_rate_1
  config.learning_rate_value = learning_rate_2
  config.gamma = gamma
  config.num_episodes = num_episodes
  config.hidden_dim_policy = hidden_dims_1
  config.hidden_dim_value = hidden_dims_2
  config.seed = seed

  # Setting the seed for the environments
  env.seed(seed)
  env.action_space.seed(seed)
  env2.seed(seed)
  env2.action_space.seed(seed)

  # Initializing list for average returns during training
  returns_train = []
  # Initializing list for average returns during playing
  returns_play = []

  # Creating neural networks and setting optimizers
  obs_dim = int(np.prod(env.observation_space.shape))
  policy_model = Neural_Network(in_dim = obs_dim, out_dim = env.action_space.n, hidden_dim = hidden_dims_1)
  # The state-value network uses its own layer sizes (the ones logged as hidden_dim_value).
  state_value_model = Neural_Network(in_dim = obs_dim, out_dim = 1, hidden_dim = hidden_dims_2)
  optimizer = torch.optim.Adam(policy_model.parameters(), lr = learning_rate_1)
  optimizer_2 = torch.optim.Adam(state_value_model.parameters(), lr = learning_rate_2)

  # Total number of environment steps taken during training with this seed
  steps = 0

  # Training the agent for number of episodes
  for episode in range(0, num_episodes):
    state = env.reset()
    reward_sum = 0
    # Per-episode buffers, created fresh so nothing leaks between episodes
    reward_list = []
    logs = []
    state_values = []
    done = False

    while not done:
      steps += 1

      # Choosing action and obtaining log value action probability
      state_tensor = torch.tensor(state, dtype=torch.float32)
      distribution = dist.Categorical(logits = policy_model(state_tensor))
      action = distribution.sample()
      logs.append(distribution.log_prob(action))
      state_values.append(state_value_model(state_tensor))

      state, reward, done, _ = env.step(action.item())
      reward_list.append(reward)
      reward_sum += reward

      # Playing every 500 steps to obtain the performance of the agent
      if steps % 500 == 0:
        avg_return_play = play(episodes = 25, step = steps, end = False, envs = env2)
        returns_play.append(avg_return_play)

    # Episode finished: build the training targets from this episode only
    returns_tensor = torch.tensor(_discounted_returns(reward_list, gamma), dtype=torch.float32)
    logs_tensor = torch.stack(logs)
    # The value network has a single output unit, so the stack carries a
    # trailing dimension of size 1. Drop it so the subtraction below is
    # element-wise instead of broadcasting to a (T, T) matrix.
    state_value_tensor = torch.stack(state_values).squeeze(-1)

    error = returns_tensor - state_value_tensor

    # Calculating loss for policy network (baseline-corrected return, no
    # gradient flowing into the state-value network)
    loss = -(error.detach() * logs_tensor).mean()

    # Optimizing policy network
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    # Calculating loss for state-value network
    losses = error.pow(2).mean()

    # Optimizing state-value network
    optimizer_2.zero_grad()
    losses.backward()
    optimizer_2.step()

    print(episode + 1, reward_sum)
    returns_train.append(reward_sum)

    # If episode > 100, then print the average reward for the last hundred episodes
    if episode + 1 > 100:
      x = sum(returns_train[-100:])/100
      print("Average return training:", x)
      # Log the training average return
      wandb.log({"Average return over 100 episodes of training": x, "episode": episode + 1})

    # If training steps >= 2500 and the agent has obtained an average reward of 475 during last 5 times of playing, end training
    if steps >= 2500 and sum(returns_play[-5:]) / 5 >= 475:
      break

  # Playing after training to obtain end results
  play(episodes = 100, step = steps, end = True, envs = env2)

  # Close this seed's run explicitly so the last run is also finished
  wandb.finish()

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


SEED: 12
1 10.0
2 37.0
3 16.0
4 9.0
5 12.0
6 20.0
7 12.0
8 15.0
9 13.0
10 23.0
11 17.0
12 10.0
13 15.0
14 9.0
15 29.0
16 20.0
17 28.0
18 21.0
19 16.0
20 63.0
21 10.0
22 13.0
23 31.0
24 19.0
Average return over 25 episodes of playing: 19.52
25 41.0
26 42.0
27 33.0
28 14.0
29 36.0
30 31.0
31 22.0
32 14.0
33 21.0
34 31.0
35 34.0
36 27.0
37 24.0
38 24.0
39 16.0
40 22.0
41 12.0
42 59.0
43 24.0
Average return over 25 episodes of playing: 28.36
44 20.0
45 12.0
46 21.0
47 41.0
48 42.0
49 18.0
50 19.0
51 21.0
52 27.0
53 49.0
54 10.0
55 20.0
56 29.0
57 39.0
58 13.0
59 16.0
60 29.0
61 36.0
62 18.0
63 11.0
Average return over 25 episodes of playing: 25.64
64 29.0
65 24.0
66 16.0
67 10.0
68 14.0
69 19.0
70 14.0
71 10.0
72 11.0
73 26.0
74 30.0
75 17.0
76 26.0
77 14.0
78 10.0
79 13.0
80 10.0
81 16.0
82 17.0
83 18.0
84 21.0
85 43.0
86 21.0
87 19.0
88 16.0
89 9.0
90 30.0
Average return over 25 episodes of playing: 24.04
91 14.0
92 75.0
93 15.0
94 25.0
95 67.0
96 49.0
97 16.0
98 43.0
99 13.0
100 22.0
10

VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
Average return over 100 episodes of training,▁▁▁▁▁▁▁▁▁▁▂▂▂▂▂▂▂▂▂▃▃▃▃▃▃▃▄▄▄▄▅▅▅▆▆▆▇▇▇█
Average return over 25 episodes of playing,▁▁▁▁▂▂▂▂▃▃▄▃▅▄▅▅▅▅▅▇▆▄▄▆█▆▄▅▅▇█▇▇▇█▇▇▇██
episode,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
step,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███

0,1
Average return over 100 episodes of training,319.52
Average return over 25 episodes of playing,479.64
episode,333.0
step,45000.0


SEED: 34
1 21.0
2 20.0
3 13.0
4 16.0
5 42.0
6 27.0
7 18.0
8 16.0
9 21.0
10 38.0
11 15.0
12 15.0
13 33.0
14 31.0
15 44.0
16 29.0
17 9.0
18 20.0
19 30.0
Average return over 25 episodes of playing: 24.8
20 90.0
21 27.0
22 54.0
23 12.0
24 21.0
25 14.0
26 12.0
27 27.0
28 19.0
29 20.0
30 13.0
31 41.0
32 25.0
33 10.0
34 10.0
35 15.0
36 17.0
37 25.0
38 19.0
39 26.0
40 19.0
41 15.0
Average return over 25 episodes of playing: 21.16
42 21.0
43 24.0
44 10.0
45 17.0
46 22.0
47 14.0
48 45.0
49 10.0
50 17.0
51 26.0
52 12.0
53 30.0
54 11.0
55 12.0
56 25.0
57 29.0
58 13.0
59 17.0
60 12.0
61 18.0
62 40.0
63 17.0
64 12.0
65 24.0
66 18.0
Average return over 25 episodes of playing: 21.68
67 35.0
68 67.0
69 17.0
70 30.0
71 34.0
72 33.0
73 68.0
74 35.0
75 34.0
76 18.0
77 28.0
78 22.0
79 19.0
80 22.0
81 28.0
82 16.0
Average return over 25 episodes of playing: 26.4
83 35.0
84 41.0
85 22.0
86 41.0
87 33.0
88 21.0
89 11.0
90 29.0
91 23.0
92 21.0
93 11.0
94 25.0
95 32.0
96 22.0
97 43.0
98 50.0
99 31.0
Average ret

VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
Average return over 100 episodes of training,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▂▂▂▂▂▂▃▃▃▃▄▄▅▅▆▆▇▇▇▇█
Average return over 25 episodes of playing,▁▁▁▁▁▁▁▂▄▃▃▅▆▅▄▃▅▇▇▆▆▆▆▅▅▅▄▆▆▇▇▆▃▄▅▅▆███
episode,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇███

0,1
Average return over 100 episodes of training,282.82
Average return over 25 episodes of playing,469.4
episode,297.0
step,35000.0


SEED: 56
1 15.0
2 21.0
3 38.0
4 14.0
5 14.0
6 66.0
7 10.0
8 9.0
9 20.0
10 10.0
11 11.0
12 13.0
13 16.0
14 23.0
15 17.0
16 30.0
17 13.0
18 45.0
19 14.0
20 29.0
21 25.0
22 11.0
23 15.0
Average return over 25 episodes of playing: 22.12
24 29.0
25 25.0
26 24.0
27 21.0
28 17.0
29 26.0
30 11.0
31 33.0
32 16.0
33 68.0
34 33.0
35 33.0
36 40.0
37 21.0
38 21.0
39 17.0
40 15.0
41 9.0
42 15.0
43 12.0
44 10.0
Average return over 25 episodes of playing: 23.24
45 36.0
46 22.0
47 15.0
48 16.0
49 25.0
50 24.0
51 12.0
52 13.0
53 15.0
54 19.0
55 25.0
56 11.0
57 18.0
58 16.0
59 28.0
60 26.0
61 21.0
62 13.0
63 12.0
64 64.0
65 21.0
66 20.0
67 14.0
Average return over 25 episodes of playing: 21.64
68 45.0
69 11.0
70 31.0
71 21.0
72 17.0
73 11.0
74 23.0
75 19.0
76 37.0
77 17.0
78 26.0
79 39.0
80 55.0
81 16.0
82 18.0
83 21.0
84 34.0
85 12.0
86 16.0
87 24.0
88 18.0
89 22.0
Average return over 25 episodes of playing: 24.6
90 19.0
91 32.0
92 22.0
93 13.0
94 28.0
95 25.0
96 19.0
97 17.0
98 23.0
99 32.0
100 12.0
10