# Importing dependancies

In [39]:
import gym

import numpy as np
import random
from collections import namedtuple, deque
from more_itertools import chunked

import torch
import torch.nn as nn
import torch.optim as optim
from torch.distributions import Categorical

import matplotlib.pyplot as plt

In [40]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


# Model definition

We can use the state variables (position of cart, angle of pole)

In [41]:
class ActorCritic(nn.Module):

    def __init__(self, output_size):
        super(ActorCritic, self).__init__()
        self.fc1 = nn.LazyLinear(64) 
        self.fc2 = nn.LazyLinear(64)
        self.a1 = nn.LazyLinear(32) 
        self.a2 = nn.LazyLinear(output_size) 
        self.c1 = nn.LazyLinear(32) 
        self.c2 = nn.LazyLinear(1) 

        self.act = nn.PReLU()
        self.softmax = nn.Softmax()

    def forward(self, input):
        x = self.act(self.fc1(input))
        x = self.act(self.fc2(x))
        a_output = self.act(self.a1(x))
        a_output = self.softmax(self.a2(x))
        c_output = self.act(self.c1(x))
        c_output = self.c2(x)
        return a_output, c_output

# Creating the agent

In [42]:
class ActorCriticAgent:
    def __init__(self, env):

        self.action_size = env.action_space.n # output size
        
        self.model = ActorCritic(self.action_size).to(device)

        self.critic_loss = nn.SmoothL1Loss()
        self.optimizer = optim.Adam(self.model.parameters(), lr=0.01)

        self.num_epochs = 250
        self.gamma = 0.99

        self.batch_size = 64

    def act(self, state):
        action_prob, critique = self.model(state)
        m = Categorical(action_prob)
        action = m.sample()
        return action.item(), critique, m.log_prob(action)
    
    def act_ideal(self, state):
        actions, critique = self.model(state)
        return actions.max(1)[1].item()
    
    def train_step(self, log_probs, critiques, rewards):
        
        discount_reward_sum = 0
        returns = []

        for reward in rewards[::-1]:
            discount_reward_sum = reward + self.gamma * discount_reward_sum
            returns.insert(0, discount_reward_sum)

        returns = np.array(returns)
        returns = (returns - np.mean(returns)) / (np.std(returns) + np.finfo(np.float32).eps.item())
        returns = returns.tolist()

        policy_losses = []
        critic_losses = []

        for log_prob, critique, r in zip(log_probs, critiques, returns):
            advantage = r - critique

            policy_losses.append(-log_prob * advantage)
            critic_losses.append(self.critic_loss(critique, torch.Tensor([r]).to(device)))


        loss = torch.stack(policy_losses).sum() + torch.stack(critic_losses).sum()
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

    
    def train(self, env, print_epochs=False):

        scores = []

        for epoch in range(self.num_epochs):
            done = False
            curr_state = env.reset()
            curr_state = torch.Tensor(curr_state).to(device)
            score = -10

            log_probs = []
            critiques = []
            rewards = []
            
            while not done:

                if score < curr_state[0].item():
                    score = curr_state[0].item()

                action, critique, log_prob = self.act(curr_state.unsqueeze(0))
                curr_state, reward, done, _ = env.step(action)
                curr_state = torch.Tensor(curr_state).to(device)

                log_probs.append(log_prob)
                critiques.append(critique)
                rewards.append(reward)

            
            self.train_step(log_probs, critiques, rewards)

            scores.append(score)

            if print_epochs:
                print("Epoch: " + str(epoch + 1) + ". Score is: " + str(score))
            
        return scores


# Training the agent

In [43]:
env = gym.make("MountainCar-v0")
torch.autograd.set_detect_anomaly(False)
agent = ActorCriticAgent(env)
env.reset()
scores = agent.train(env, print_epochs=True)
plt.plot(scores)
plt.show()

  a_output = self.softmax(self.a2(x))


RuntimeError: one of the variables needed for gradient computation has been modified by an inplace operation: [torch.cuda.FloatTensor [64, 1]], which is output 0 of AsStridedBackward0, is at version 2; expected version 1 instead. Hint: the backtrace further above shows the operation that failed to compute its gradient. The variable in question was changed in there or anywhere later. Good luck!