In [None]:
# Importing needed libraries
import torch
import torch.distributions as dist
import torch.nn as nn
import gym
import numpy as np
import random
import wandb

In [None]:
# Setting seeds
def make_deterministic(seed):
  """Seed the global PyTorch, Python `random` and NumPy generators.

  Args:
    seed: Seed value handed unchanged to each of the three generators.
  """
  # Same order as before: torch first, then random, then numpy.
  for apply_seed in (torch.manual_seed, random.seed, np.random.seed):
    apply_seed(seed)

In [None]:
# Building a class for the neural network
class Neural_Network(nn.Module):
  """Feed-forward network made of four linear layers (three hidden stages).

  Args:
    in_dim: Number of input features.
    out_dim: Number of output units.
    hidden_dim: Indexable collection; entries 0, 1 and 2 are the widths of
      the three hidden stages. Any further entries are not read.
  """

  def __init__(self, in_dim, out_dim, hidden_dim):
    super().__init__()
    width_1, width_2, width_3 = hidden_dim[0], hidden_dim[1], hidden_dim[2]
    self.input_layer = nn.Linear(in_dim, width_1)
    self.hidden_layer_1 = nn.Linear(width_1, width_2)
    self.hidden_layer_2 = nn.Linear(width_2, width_3)
    self.output_layer = nn.Linear(width_3, out_dim)

  def forward (self, x, activation_function = nn.ReLU()):
    """Run `x` through the network.

    Args:
      x: Input tensor passed to the first linear layer.
      activation_function: Callable applied after each of the first three
        linear layers (ReLU by default).

    Returns:
      The output of the last linear layer; no activation is applied to it.
    """
    hidden = x
    for layer in (self.input_layer, self.hidden_layer_1, self.hidden_layer_2):
      hidden = activation_function(layer(hidden))
    return self.output_layer(hidden)

In [None]:
# Defining function for playing during and after training to obtain performance of the agent
def play(episodes, step, end, envs, model = None):
  """Roll out the policy for a number of episodes and report the average return.

  Actions are sampled from a categorical distribution whose logits are the
  model output for the current state.

  Args:
    episodes: Number of episodes to play.
    step: Training step counter; logged to wandb next to the average when
      `end` is False.
    end: If True, average the last (at most) 100 episode returns and only
      print the result. If False, average the last (at most) 25 episode
      returns, log the result to wandb and print it.
    envs: Environment whose `reset()` returns a state and whose
      `step(action)` returns `(state, reward, done, info)`.
    model: Callable mapping a float32 state tensor to action logits. When
      omitted, the module-level `policy_model` is used.

  Returns:
    The average return over the episodes considered, or 0.0 when no episode
    was played.
  """
  if model is None:
    model = policy_model

  returns = []

  # Evaluation never backpropagates, so skip building autograd graphs.
  with torch.no_grad():
    for _ in range(episodes):
      episode_return = 0
      state = envs.reset()
      done = False
      while not done:
        state_tensor = torch.tensor(state, dtype=torch.float32)
        logits = model(state_tensor)
        action = dist.Categorical(logits = logits).sample()
        state, reward, done, info = envs.step(action.item())
        episode_return += reward
      returns.append(episode_return)

  # Divide by the number of returns actually averaged, so the result is also
  # correct when `episodes` differs from the window size.
  window = returns[-100:] if end else returns[-25:]
  avg_return = sum(window) / len(window) if window else 0.0

  if end:
    print("Average return over 100 episodes of trained agent:", avg_return)
  else:
    wandb.log({"Average return over 25 episodes of playing:": avg_return,"step": step})
    print("Average return over 25 episodes of playing:", avg_return)

  return avg_return

In [None]:
##### REINFORCE Algorithm ###########

# Seeds for repeatable training results
SEEDS = (12, 34, 56)

# Creating two environments: One for training and for playing
env = gym.make("CartPole-v1")
env2 = gym.make("CartPole-v1")


def _discounted_returns(rewards, gamma):
  """Compute G_t = sum_k gamma**k * rewards[t + k] for every step t of one episode.

  Args:
    rewards: Rewards of a single episode, in the order they were received.
    gamma: Discount factor.

  Returns:
    A list with one discounted return per step, same length as `rewards`.
  """
  discounted = [0.0] * len(rewards)
  running = 0.0
  # Walk backwards so each return reuses the one after it: G_t = r_t + gamma * G_{t+1}.
  for t in reversed(range(len(rewards))):
    running = rewards[t] + gamma * running
    discounted[t] = running
  return discounted


# Looping over different seeds for training different agents
for seed in SEEDS:
  make_deterministic(seed)
  wandb.init(project="Reinforce-CartPole", entity="raisa")
  print("SEED:", seed)
  config = wandb.config

  # Defining hyperparameters
  gamma = 0.99 # gamma
  learning_rate = 0.001 # learning rate
  num_episodes = 10000 # maximum number of episodes
  hidden_dims = [128, 64, 32] # size of hidden layers in the neural network

  # Logging hyperparameters
  config.learning_rate = learning_rate
  config.gamma = gamma
  config.num_episodes = num_episodes
  config.hidden_dim = hidden_dims
  config.seed = seed

  # Setting the seed for the environments
  env.seed(seed)
  env.action_space.seed(seed)
  env2.seed(seed)
  env2.action_space.seed(seed)

  # Initializing list for average returns during training
  returns_train = []
  # Initializing list for average returns during playing
  returns_play = []

  # Creating neural network and setting optimizer
  policy_model = (Neural_Network(in_dim = int(np.prod(env.observation_space.shape)), out_dim = env.action_space.n, hidden_dim=hidden_dims))
  optimizer = torch.optim.Adam(policy_model.parameters(), lr=learning_rate)

  # Total number of environment steps taken during training of this agent
  steps = 0

  # Training the agent for number of episodes
  for episode in range(0, num_episodes):
    state = env.reset()
    reward_sum = 0
    # Per-episode buffers: created fresh every episode so nothing leaks
    # from one episode into the next.
    reward_list = []
    logs = []
    while True:
      steps += 1

      # Choosing action and obtaining log value of action probability
      state_tensor = torch.tensor(state, dtype=torch.float32)
      values = policy_model(state_tensor)
      distribution = dist.Categorical(logits = values)
      action = distribution.sample()
      log_value = distribution.log_prob(action)
      state_next, reward, done, _ = env.step(action.item())
      reward_list.append(reward)
      logs.append(log_value)
      reward_sum += reward
      state = state_next

      # Playing every 500 steps to obtain the performance of the agent
      if steps % 500 == 0:
        avg_return_play = play(episodes = 25, step = steps, end = False, envs = env2)
        returns_play.append(avg_return_play)

      # Following these steps, if episode has come to an end
      if done:
        # Discounted return for every step of this episode. The discount
        # exponent always counts from the step itself, independent of the
        # length of earlier episodes.
        returns_tensor = torch.tensor(_discounted_returns(reward_list, gamma), dtype=torch.float32)
        logs_tensor = torch.stack(logs)

        # Calculating loss
        loss = (- returns_tensor * logs_tensor).mean()

        optimizer.zero_grad()

        # Backpropagating the loss
        loss.backward()

        # Taking an optimization step
        optimizer.step()

        print(episode + 1, reward_sum)

        returns_train.append(reward_sum)
        break

    # If episode > 100, then print the average reward for the last hundred episodes
    if episode + 1  > 100:
      x = sum(returns_train[-100:])/100
      print("Average return training:", x)
      # Logging the training average return
      wandb.log({"Average return over 100 episodes of training:": x, "episode": episode + 1})

    # If training steps >= 2500 and the agent has obtained an average reward of 475 during last 5 times of playing, end the training
    if steps >= 2500 and sum(returns_play[-5:]) / 5 >= 475:
      break

  # Playing after training to obtain end results
  play(episodes = 100, step = steps, end = True, envs = env2)

[34m[1mwandb[0m: Currently logged in as: [33mraisa[0m (use `wandb login --relogin` to force relogin)


SEED: 12
1 13.0
2 12.0
3 22.0
4 17.0
5 23.0
6 12.0
7 35.0
8 18.0
9 21.0
10 12.0
11 14.0
12 14.0
13 11.0
14 31.0
15 21.0
16 17.0
17 17.0
18 34.0
19 26.0
20 40.0
21 29.0
22 10.0
23 19.0
24 20.0
Average return over 25 episodes of playing: 19.08
25 12.0
26 20.0
27 37.0
28 16.0
29 20.0
30 54.0
31 15.0
32 29.0
33 41.0
34 56.0
35 14.0
36 17.0
37 44.0
38 27.0
39 28.0
40 12.0
Average return over 25 episodes of playing: 26.04
41 83.0
42 40.0
43 19.0
44 13.0
45 21.0
46 15.0
47 19.0
48 17.0
49 36.0
50 25.0
51 18.0
52 25.0
53 66.0
54 20.0
55 25.0
56 23.0
57 19.0
58 40.0
59 11.0
Average return over 25 episodes of playing: 25.28
60 46.0
61 11.0
62 26.0
63 18.0
64 13.0
65 22.0
66 23.0
67 29.0
68 38.0
69 16.0
70 29.0
71 30.0
72 43.0
73 18.0
74 18.0
75 16.0
76 9.0
77 30.0
78 12.0
79 14.0
80 10.0
81 34.0
82 26.0
Average return over 25 episodes of playing: 26.52
83 15.0
84 19.0
85 23.0
86 56.0
87 34.0
88 14.0
89 74.0
90 22.0
91 48.0
92 37.0
93 15.0
94 26.0
95 28.0
96 11.0
97 15.0
98 29.0
99 32.0
Average r

VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
Average return over 100 episodes of training:,▁▁▁▁▁▁▁▁▁▁▁▁▁▂▂▂▂▃▃▃▄▆▇████▇▅▄▃▃▃▃▃▃▄▄▅▆
Average return over 25 episodes of playing:,▁▁▁▁▁▁▂▂▂▄▃▃▆▅▆▇▅▄▄▅▆▃▁▃▂▂▂▂▂▂▂▂▂▃▅▅▅▆██
episode,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇███

0,1
Average return over 100 episodes of training:,187.42
Average return over 25 episodes of playing:,494.52
episode,738.0
step,70500.0


SEED: 34
1 22.0
2 17.0
3 10.0
4 26.0
5 13.0
6 22.0
7 42.0
8 37.0
9 16.0
10 14.0
11 47.0
12 15.0
13 15.0
14 21.0
15 89.0
16 9.0
17 15.0
18 19.0
19 41.0
Average return over 25 episodes of playing: 24.36
20 21.0
21 15.0
22 13.0
23 17.0
24 23.0
25 57.0
26 18.0
27 12.0
28 14.0
29 13.0
30 10.0
31 13.0
32 17.0
33 62.0
34 10.0
35 12.0
36 23.0
37 22.0
38 11.0
39 12.0
40 13.0
Average return over 25 episodes of playing: 22.24
41 112.0
42 10.0
43 10.0
44 26.0
45 19.0
46 13.0
47 23.0
48 13.0
49 19.0
50 19.0
51 18.0
52 64.0
53 18.0
54 13.0
55 16.0
56 58.0
57 26.0
58 20.0
59 68.0
60 10.0
61 23.0
Average return over 25 episodes of playing: 22.08
62 24.0
63 30.0
64 16.0
65 11.0
66 15.0
67 8.0
68 17.0
69 29.0
70 17.0
71 81.0
72 17.0
73 19.0
74 39.0
75 47.0
76 14.0
77 21.0
78 21.0
79 16.0
80 19.0
81 16.0
Average return over 25 episodes of playing: 23.48
82 40.0
83 20.0
84 33.0
85 31.0
86 38.0
87 23.0
88 26.0
89 22.0
90 18.0
91 21.0
92 22.0
93 15.0
94 60.0
95 14.0
96 36.0
97 34.0
98 69.0
Average return ov

VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
Average return over 100 episodes of training:,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▂▂▂▂▂▃▃▄▄▅▆▆▆▆▇▇▇▇▇▆▆▆██▇█
Average return over 25 episodes of playing:,▁▁▁▁▁▁▁▁▂▂▂▂▂▂▂▄▄▅▂▂▃▃▃▄▄▂▂▂▂▄▅█▆▃▂▂▅▇██
episode,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
step,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇████

0,1
Average return over 100 episodes of training:,164.6
Average return over 25 episodes of playing:,500.0
episode,534.0
step,45500.0


SEED: 56
1 11.0
2 27.0
3 21.0
4 30.0
5 16.0
6 25.0
7 74.0
8 25.0
9 20.0
10 36.0
11 13.0
12 17.0
13 19.0
14 18.0
15 15.0
16 22.0
17 39.0
18 10.0
19 31.0
20 14.0
Average return over 25 episodes of playing: 22.24
21 27.0
22 19.0
23 30.0
24 10.0
25 41.0
26 11.0
27 11.0
28 87.0
29 14.0
30 19.0
31 10.0
32 35.0
33 18.0
34 42.0
35 19.0
36 23.0
37 38.0
38 24.0
39 9.0
Average return over 25 episodes of playing: 21.96
40 45.0
41 15.0
42 40.0
43 18.0
44 37.0
45 35.0
46 45.0
47 17.0
48 35.0
49 14.0
50 20.0
51 15.0
52 24.0
53 27.0
54 15.0
55 46.0
56 50.0
57 16.0
Average return over 25 episodes of playing: 26.68
58 20.0
59 75.0
60 13.0
61 27.0
62 23.0
63 35.0
64 20.0
65 20.0
66 25.0
67 47.0
68 28.0
69 31.0
70 49.0
71 41.0
72 31.0
73 16.0
Average return over 25 episodes of playing: 25.92
74 21.0
75 33.0
76 13.0
77 31.0
78 57.0
79 24.0
80 21.0
81 17.0
82 14.0
83 20.0
84 23.0
85 42.0
86 35.0
87 46.0
88 55.0
89 31.0
90 29.0
Average return over 25 episodes of playing: 35.8
91 51.0
92 70.0
93 30.0
94 39.0
