In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Categorical

import gym
from tqdm import tqdm_notebook
import numpy as np
from collections import deque

In [None]:
# --- Training hyperparameters ---------------------------------------------
DISCOUNT_FACTOR = 0.99  # gamma: weight applied to the bootstrapped next-state value
NUM_EPISODES = 1_000    # upper bound on episodes in one training run
MAX_STEPS = 1_000       # upper bound on environment steps in one episode

# Mean recent score at (or above) which training stops early.
SOLVED_SCORE = -90

# Device the networks and input tensors are moved to: GPU when one is available.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

  and should_run_async(code)


In [None]:

class Policy(nn.Module):
    """Two-layer softmax policy over a discrete set of actions.

    Args:
        s_size: Number of input (state) features.
        h_size: Width of the hidden layer.
        a_size: Number of discrete actions (size of the softmax output).
    """

    def __init__(self, s_size=6, h_size=16, a_size=3):
        super(Policy, self).__init__()
        self.fc1 = nn.Linear(s_size, h_size)
        self.fc2 = nn.Linear(h_size, a_size)

    def forward(self, x):
        """Return action probabilities for a batch of states.

        The softmax is taken over dim 1, so `x` must be batched as
        (batch, features).
        """
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        return F.softmax(x, dim=1)

    def act(self, state):
        """Sample one action for a single, un-batched state.

        Args:
            state: NumPy array holding one state; a batch dimension is added here.

        Returns:
            tuple: ``(action, log_prob)`` where ``action`` is the sampled index
            shifted by -1 (a Python int) and ``log_prob`` is the log-probability
            tensor of the sampled index, still attached to the autograd graph.
        """
        # Use the device the module's own parameters live on. The original code
        # referenced an undefined lowercase `device` and raised NameError.
        device = next(self.parameters()).device
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        probs = self.forward(state).cpu()
        m = Categorical(probs)
        action = m.sample()
        # NOTE(review): the sampled index is shifted by -1 before being returned.
        # Acrobot-v1 documents a Discrete(3) action space with indices 0..2, so
        # confirm this shift is intended before passing the result to env.step.
        return action.item() - 1, m.log_prob(action)

In [None]:
class StateValueNetwork(nn.Module):
    """State-value estimator: one 128-unit ReLU hidden layer and a scalar head."""

    def __init__(self, observation_space):
        """Build the layers; `observation_space` is the number of input features."""
        super().__init__()

        self.input_layer = nn.Linear(observation_space, 128)
        self.output_layer = nn.Linear(128, 1)

    def forward(self, x):
        """Map input states to their estimated value (last dimension of size 1)."""
        hidden = F.relu(self.input_layer(x))
        return self.output_layer(hidden)

In [None]:
def select_action(network, state):
    """Sample an action from the distribution produced by `network`.

    Args:
        network: Callable that maps a batched state tensor to action
            probabilities (the input to `Categorical`).
        state: NumPy array holding one un-batched state.

    Returns:
        tuple: ``(action, log_prob)`` where ``action`` is the sampled index as a
        Python int and ``log_prob`` is the log-probability tensor of that
        sample, still attached to the autograd graph of `network`.
    """
    # Add a batch dimension and move the input to the configured device.
    state_tensor = torch.from_numpy(state).float().unsqueeze(0).to(DEVICE)
    action_probs = network(state_tensor)

    # The original rebinding `state = state.detach()` was dead code (the value
    # was never read afterwards) and has been dropped.
    dist = Categorical(action_probs)
    action = dist.sample()

    return action.item(), dist.log_prob(action)

In [None]:

env = gym.make('Acrobot-v1')

# Init networks.
# The policy class defined in this notebook is `Policy`; the original call used
# an undefined name `PolicyNetwork`. Keyword arguments are required because
# Policy's second positional parameter is the hidden size, not the action count.
policy_network = Policy(
    s_size=env.observation_space.shape[0],
    a_size=int(env.action_space.n),
).to(DEVICE)
stateval_network = StateValueNetwork(env.observation_space.shape[0]).to(DEVICE)

# Init optimizers: one per network so each is stepped independently.
policy_optimizer = optim.SGD(policy_network.parameters(), lr=0.01)
stateval_optimizer = optim.SGD(stateval_network.parameters(), lr=0.01)

  deprecation(
  deprecation(


In [None]:
def reinforce():
    """Train the policy and state-value networks online, one update per step.

    Each environment step computes a one-step bootstrapped TD error
    (``reward + DISCOUNT_FACTOR * V(s') - V(s)``) and uses it both as the
    advantage for the policy-gradient loss and as the regression error for the
    value network. Both losses are scaled by ``I``, the running product of
    discount factors within the episode.

    Reads the module-level ``env``, ``policy_network``, ``stateval_network``,
    their optimizers, and the constants ``NUM_EPISODES``, ``MAX_STEPS``,
    ``DISCOUNT_FACTOR``, ``SOLVED_SCORE`` and ``DEVICE``.

    Returns:
        list: Total reward of every episode that was run, in order. Shorter
        than ``NUM_EPISODES`` when early stopping triggers.
    """
    scores = []

    # Sliding window of the most recent episode scores, used for early stopping.
    recent_scores = deque(maxlen=100)

    # run episodes
    for episode in tqdm_notebook(range(NUM_EPISODES)):

        # init per-episode variables
        state = env.reset()
        done = False
        score = 0
        I = 1

        # run episode, update online
        for step in range(MAX_STEPS):

            # get action and log probability
            action, lp = select_action(policy_network, state)

            # step with action
            new_state, reward, done, _ = env.step(action)

            # update episode score
            score += reward

            # get state value of current state
            state_tensor = torch.from_numpy(state).float().unsqueeze(0).to(DEVICE)
            state_val = stateval_network(state_tensor)

            # get state value of next state
            new_state_tensor = torch.from_numpy(new_state).float().unsqueeze(0).to(DEVICE)
            new_state_val = stateval_network(new_state_tensor)

            # if terminal state, next state val is 0
            if done:
                new_state_val = torch.tensor([0]).float().unsqueeze(0).to(DEVICE)

            # calculate value function loss with MSE
            # NOTE(review): the bootstrapped target is not detached, so gradients
            # also flow through new_state_val. Semi-gradient TD would treat the
            # target as a constant -- confirm which is intended.
            val_loss = F.mse_loss(reward + DISCOUNT_FACTOR * new_state_val, state_val)
            val_loss *= I

            # calculate policy loss; .item() makes the advantage a plain float,
            # so the policy loss carries no gradient into the value network
            advantage = reward + DISCOUNT_FACTOR * new_state_val.item() - state_val.item()
            policy_loss = -lp * advantage
            policy_loss *= I

            # Backpropagate policy. The policy and value graphs are disjoint,
            # so the policy graph does not need to be retained.
            policy_optimizer.zero_grad()
            policy_loss.backward()
            policy_optimizer.step()

            # Backpropagate value
            stateval_optimizer.zero_grad()
            val_loss.backward()
            stateval_optimizer.step()

            if done:
                break

            # move into new state, discount I
            state = new_state
            I *= DISCOUNT_FACTOR

        # append episode score
        scores.append(score)
        recent_scores.append(score)

        # early stopping if we meet solved score goal
        # NOTE(review): the mean is taken over however many scores are in the
        # window so far, so this can trigger before 100 episodes have run.
        if np.array(recent_scores).mean() >= SOLVED_SCORE:
            break

    # Return only after the episode loop has finished. The original returned
    # from inside the loop, ending training after the first episode.
    return scores

In [None]:
# Run one training session per seed and collect the per-episode scores.
# NOTE(review): `seeds` is not defined in the visible cells -- define it (e.g. in
# the configuration cell) before running this cell.
# NOTE(review): the networks and optimizers are created once, outside this loop,
# so every run after the first continues from the previous run's weights;
# re-create them per seed if the runs are meant to be independent.
all_scores = []
for seed in seeds:
    print("started training with seed: ", seed)
    # Apply the seed so the run is reproducible; previously it was only printed.
    # The environment's own RNG is not seeded here.
    torch.manual_seed(seed)
    np.random.seed(seed)
    # reinforce() returns a single list of scores, not a tuple.
    scores = reinforce()
    print("completed training with seed: ", seed)
    all_scores.append(scores)

In [None]:
# NOTE(review): `plt` is not imported in the visible cells; this cell needs
# `import matplotlib.pyplot as plt` in the imports cell.

# Runs can differ in length when early stopping triggers, so truncate every run
# to the shortest one before stacking them into a (runs, episodes) array.
min_len = min(len(run) for run in all_scores)
score_matrix = np.array([run[:min_len] for run in all_scores])

# Per-episode mean and variance across runs.
avg = score_matrix.mean(axis=0)
var = score_matrix.var(axis=0)
std = np.sqrt(var)

# 1-based episode index with exactly one entry per column of `avg`.
# The original used a misspelled `all_score` and an arange one element short.
c = np.arange(1, min_len + 1)

plt.figure(figsize=(10, 6))
plt.plot(c, avg, label="Average_rewards", color="blue")
# Shaded band: one standard deviation either side of the mean.
plt.fill_between(c, avg - std, avg + std, alpha=0.2, color="blue")
plt.title("REINFORCE (Acrobot)")
plt.xlabel("Episode Count")
plt.ylabel("Average Reward")
plt.grid(True)
plt.show()