In [198]:
import numpy as np
import sys
import torch
import gym
from torch import nn
import torch.optim as optim
import torch.nn.functional as F
from torch.distributions import Categorical
from IPython.display import clear_output
import matplotlib.pyplot as plt

In [199]:
class ActorCritic(nn.Module):
    """Actor-critic network with shared trunk and per-episode rollout buffers.

    A single hidden layer (``common``) feeds two heads: ``action_layer``
    yields one logit per discrete action and ``critic_layer`` yields a scalar
    state-value estimate. ``forward`` samples an action and stores the
    quantities the loss needs; the caller is expected to append the reward of
    each step to ``rewards`` and, once the episode ends, call
    ``get_dat_loss`` followed by ``clear``.

    Args:
        state_dim: Length of the flat observation vector.
        n_actions: Number of discrete actions.
        gamma: Discount factor applied to future rewards.
    """

    def __init__(self, state_dim, n_actions, gamma):
        super().__init__()
        self.common = nn.Linear(state_dim, 128)
        self.action_layer = nn.Linear(128, n_actions)
        self.critic_layer = nn.Linear(128, 1)

        # Rollout buffers, one entry per environment step of the current episode.
        self.logprobs = []
        self.values = []
        self.rewards = []

        self.gamma = gamma

    def clear(self):
        """Empty all rollout buffers so a new episode starts from scratch."""
        self.logprobs.clear()
        self.values.clear()
        self.rewards.clear()

    def forward(self, x):
        """Sample an action for one observation and record log-prob and value.

        Args:
            x: A single 1-D observation (NumPy array, sequence of floats or
                tensor). It is converted to the dtype and device of the
                model's parameters.

        Returns:
            The sampled action as a Python ``int``.
        """
        reference = self.common.weight
        x = torch.as_tensor(x, dtype=reference.dtype, device=reference.device)
        hidden = F.relu(self.common(x))

        critic_value = self.critic_layer(hidden)

        # Building the distribution from logits normalises over the last
        # dimension and avoids the precision loss of an explicit softmax.
        dist = Categorical(logits=self.action_layer(hidden))
        action = dist.sample()

        self.logprobs.append(dist.log_prob(action))
        self.values.append(critic_value)

        return action.item()

    def get_dat_loss(self):
        """Compute the summed actor + critic loss for the buffered episode.

        Rewards are turned into discounted returns, standardised across the
        episode, and then used both as the critic target (smooth L1) and,
        minus the detached value estimate, as the advantage that weights the
        policy-gradient term.

        Returns:
            A one-element tensor holding the total loss; call ``backward()``
            on it.

        Raises:
            ValueError: If no rewards have been recorded.
        """
        if not self.rewards:
            raise ValueError("get_dat_loss() called with an empty reward buffer")

        # Discounted return for every step, accumulated back to front.
        discounted = []
        running_return = 0.0
        for reward in reversed(self.rewards):
            running_return = reward + self.gamma * running_return
            discounted.append(running_return)
        discounted.reverse()

        reference = self.common.weight
        returns = torch.tensor(
            discounted, dtype=reference.dtype, device=reference.device
        )

        # Standardise the returns. The sample std is undefined for a single
        # element and zero for constant returns; without the guard and the
        # epsilon both cases would turn the whole loss into NaN.
        eps = torch.finfo(returns.dtype).eps
        spread = returns.std() if returns.numel() > 1 else returns.new_zeros(())
        returns = ((returns - returns.mean()) / (spread + eps)).unsqueeze(1)

        cum_loss = 0
        for logprob, value, target in zip(self.logprobs, self.values, returns):
            # The advantage must not back-propagate into the critic head.
            advantage = target - value.detach()
            actor_loss = -logprob * advantage
            critic_loss = F.smooth_l1_loss(value, target)
            cum_loss += actor_loss + critic_loss

        return cum_loss

In [200]:
# Optimizer settings: learning rate and Adam moment-decay coefficients.
lr, betas = 0.02, (0.9, 0.999)

# Discount factor for future rewards.
gamma = 0.99

# Training budget: number of episodes and the step cap per episode.
episodes = 10_000
max_iterations = 10_000

In [201]:
# Pick the compute device. NOTE: `device` is only computed here; the model
# below is constructed without being moved to it.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# Environment and the sizes the network needs from it.
env = gym.make('LunarLander-v2')
state_dim, n_actions = env.observation_space.shape[0], env.action_space.n

# Agent and its optimizer.
A2C = ActorCritic(state_dim, n_actions, gamma)
optimizer = optim.Adam(params=A2C.parameters(), lr=lr, betas=betas)

In [202]:
# Train the agent: play one episode, then take a single gradient step on the
# loss accumulated over that episode.
cum_reward = 0
episode_rewards = []  # total reward of every finished episode, in order
for episode in range(episodes):
    # Start from empty rollout buffers. If a previous run of this cell was
    # interrupted mid-episode, stale transitions would otherwise be mixed
    # into this episode's loss.
    A2C.clear()

    obs = env.reset()
    total_reward = 0
    for step in range(max_iterations):
        action = A2C(obs)
        obs, reward, done, _ = env.step(action)
        A2C.rewards.append(reward)
        cum_reward += reward
        total_reward += reward
        if done:
            break

    optimizer.zero_grad()
    loss = A2C.get_dat_loss()
    loss.backward()
    optimizer.step()
    A2C.clear()

    episode_rewards.append(total_reward)

    print(f"[{episode+1}/{episodes}] episode reward: {round(total_reward)} cummulative reward: {round(cum_reward)}"," "*10,end='\r')

[127/10000] episode reward: 3 cummulative reward: -14554                               

KeyboardInterrupt: 