In [1]:
import gym
import torch as T
import numpy as np

In [2]:
class ActorCritic(T.nn.Module):
    def __init__(self):
        super(ActorCritic, self).__init__()
        self.transform = T.nn.Linear(8, 128)
        self.act_layer = T.nn.Linear(128, 4) # Action layer
        self.val_layer = T.nn.Linear(128, 1) # Value layer
        self.log_probs = []
        self.state_vals = []
        self.rewards = []

    def forward(self, state):
        state = T.from_numpy(state).float()
        state = T.nn.functional.relu(self.transform(state))
        state_value = self.val_layer(state)

        act_probs = T.nn.functional.softmax(self.act_layer(state))
        act_dist = T.distributions.Categorical(act_probs)
        action = act_dist.sample()

        self.log_probs.append(act_dist.log_prob(action))
        self.state_vals.append(state_value)

        return action.item()

    def computeLoss(self, gamma=0.99):
        rewards = []
        discounted_reward = 0
        for reward in self.rewards[::-1]:
            discounted_reward = reward + gamma * discounted_reward
            rewards.insert(0, discounted_reward)

        rewards = T.tensor(rewards)
        rewards = (rewards - rewards.mean()) / (rewards.std())

        loss = 0
        for log_probability, value, reward in zip(
            self.log_probs, self.state_vals, rewards):
            advantage = reward - value.item()
            act_loss = -log_probability * advantage
            val_loss = T.nn.functional.smooth_l1_loss(value, reward)
            loss += (act_loss + val_loss)

        return loss

    def clear(self):
        del self.log_probs[:]
        del self.state_vals[:]
        del self.rewards[:]

In [3]:
np.random.seed(0)

In [4]:
env = gym.make("LunarLander-v2")

policy = ActorCritic()
optimizer = T.optim.Adam(policy.parameters(), lr=0.02, betas=(0.9, 0.999))

In [5]:
render = True
np.random.seed(0)
running_reward = 0
for i in np.arange(0, 10): # Run it for at least 10000 episodes
    state = env.reset()
    for t in range(10000):
        action = policy(state)
        state, reward, done, _ = env.step(action)
        policy.rewards.append(reward)
        running_reward += reward
        if render and i > 1000:
            env.render()
        if done:
            break
    print("Episode {}\tReward: {}".format(i, running_reward))

    # Updating the policy
    optimizer.zero_grad()
    loss = policy.computeLoss(0.99)
    loss.backward()
    optimizer.step()
    policy.clear()

    if i % 20 == 0:
        running_reward = running_reward / 20
        running_reward = 0

  app.launch_new_instance()


Episode 0	Reward: -41.40409320391048
Episode 1	Reward: -161.92281385130275
Episode 2	Reward: -806.525529395525
Episode 3	Reward: -1271.564822535037
Episode 4	Reward: -1694.6124798537085
Episode 5	Reward: -1883.9349254123053
Episode 6	Reward: -2654.1571354590674
Episode 7	Reward: -3486.359976552412
Episode 8	Reward: -4032.661135668883
Episode 9	Reward: -4437.026080173242
