In [2]:
import os

import gym
import matplotlib.pyplot as plt
import numpy as np
import torch as T
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions.categorical import Categorical

In [17]:
class ActorCriticNetwork(nn.Module):
    """Two-headed MLP with a shared trunk for actor-critic learning.

    Two fully connected ReLU layers feed two linear heads:

    * ``pi`` -- one unnormalised score (logit) per action; no softmax is
      applied inside the network.
    * ``v``  -- a single state-value estimate.

    The network owns its Adam optimizer (``self.optimizer``) and the device
    it lives on (``self.device``).

    Args:
        lr: Learning rate handed to Adam.
        input_dims: Sequence that is unpacked into ``nn.Linear`` as the
            input feature size, e.g. ``[8]``.
        n_actions: Number of outputs of the policy head.
        fc1_dims: Width of the first hidden layer.
        fc2_dims: Width of the second hidden layer.
    """

    def __init__(self, lr, input_dims,
                 n_actions, fc1_dims=256, fc2_dims=256):
        super().__init__()
        self.fc1 = nn.Linear(*input_dims, fc1_dims)
        self.fc2 = nn.Linear(fc1_dims, fc2_dims)
        self.pi = nn.Linear(fc2_dims, n_actions)
        self.v = nn.Linear(fc2_dims, 1)

        # Move the parameters to their final device *before* building the
        # optimizer, as recommended by the torch.optim documentation, so the
        # optimizer is guaranteed to track the on-device parameters.
        self.device = T.device('cuda:0' if T.cuda.is_available() else 'cpu')
        self.to(self.device)

        self.optimizer = optim.Adam(params=self.parameters(), lr=lr)

    def forward(self, state):
        """Return ``(pi, v)`` for a batch of states.

        Args:
            state: Float tensor whose last dimension matches the input size
                given at construction.

        Returns:
            Tuple ``(pi, v)``: action logits with ``n_actions`` entries per
            row, and a value estimate with one entry per row.
        """
        x = F.relu(self.fc1(state))
        x = F.relu(self.fc2(x))
        pi = self.pi(x)
        v = self.v(x)
        return (pi, v)

In [21]:
class Agent():
    """One-step (TD(0)) actor-critic agent for discrete action spaces.

    The agent acts with :meth:`choose_action`, which remembers the
    log-probability of the sampled action, and then updates the shared
    actor-critic network from that single transition with :meth:`learn`.
    The two calls must alternate: every ``learn`` consumes the
    log-probability stored by the preceding ``choose_action``.

    Args:
        lr: Learning rate forwarded to the network's optimizer.
        input_dims: Observation size spec forwarded to the network.
        fc1_dims: Width of the first hidden layer.
        fc2_dims: Width of the second hidden layer.
        gamma: Discount factor used in the TD target.
        n_actions: Number of discrete actions.
    """

    def __init__(self, lr, input_dims, fc1_dims, fc2_dims,
                 gamma=0.99, n_actions=4):
        self.lr = lr
        self.gamma = gamma
        self.fc1_dims = fc1_dims
        self.fc2_dims = fc2_dims

        self.actor_critic = ActorCriticNetwork(lr, input_dims,
                                               n_actions,
                                               fc1_dims, fc2_dims)
        # Log-probability of the most recently sampled action; set by
        # choose_action() and consumed (reset to None) by learn().
        self.log_prob = None

    def _as_batch(self, array):
        """Convert one observation into a float32 tensor of batch size 1."""
        # Going through a single contiguous ndarray avoids the very slow
        # "list of ndarrays" tensor construction path; T.tensor copies, so the
        # result does not alias the environment's buffer.
        data = np.asarray(array, dtype=np.float32)
        return T.tensor(data, dtype=T.float,
                        device=self.actor_critic.device).unsqueeze(0)

    def choose_action(self, observation):
        """Sample an action from the current policy.

        Stores the log-probability of the sampled action in
        ``self.log_prob`` for the next call to :meth:`learn`.

        Args:
            observation: A single observation (array-like of floats).

        Returns:
            The sampled action as a plain Python ``int``.
        """
        state = self._as_batch(observation)

        # Only the policy head is needed here; the value is ignored.
        logits, _ = self.actor_critic(state)
        # Building the distribution from logits is equivalent to
        # softmax + Categorical(probs) but numerically more stable.
        action_dist = Categorical(logits=logits)
        action = action_dist.sample()

        self.log_prob = action_dist.log_prob(action)
        return action.item()

    def learn(self, state, reward, state_, done):
        """Run one gradient step on the transition ``(state, reward, state_)``.

        Args:
            state: Observation the last action was chosen in.
            reward: Scalar reward received for that action.
            state_: Observation reached after the action.
            done: Whether ``state_`` is terminal; a terminal next state
                contributes no bootstrap value.

        Raises:
            RuntimeError: If no log-probability is pending, i.e.
                :meth:`choose_action` was not called since the last update.
        """
        if self.log_prob is None:
            raise RuntimeError(
                'learn() called without a preceding choose_action()')

        self.actor_critic.optimizer.zero_grad()

        device = self.actor_critic.device
        state = self._as_batch(state)
        state_ = self._as_batch(state_)
        reward = T.tensor(reward, dtype=T.float, device=device)

        _, critic_value = self.actor_critic(state)
        _, critic_value_ = self.actor_critic(state_)

        # TD error. NOTE: the bootstrap term is intentionally left attached,
        # keeping the original residual-gradient form of the critic update.
        delta = reward + self.gamma * critic_value_ * (1 - int(done)) \
            - critic_value

        # The TD error is only a *weight* for the policy gradient, so it must
        # be detached here; otherwise the actor term would also push
        # gradients into the critic and distort the value estimates.
        actor_loss = -self.log_prob * delta.detach()
        critic_loss = delta ** 2

        # Both losses share the trunk, so back-propagate their sum once.
        (actor_loss + critic_loss).sum().backward()
        self.actor_critic.optimizer.step()

        # The graph behind log_prob has been freed by backward(); drop the
        # reference so a stale value can never be reused.
        self.log_prob = None
        

In [22]:
def plot_learning_curve(score, x, figure_file=None, window=100):
    """Plot the trailing running average of a sequence of scores.

    Point ``i`` of the curve is the mean of the last ``window`` scores up to
    and including index ``i`` (fewer at the start of the sequence).

    Args:
        score: Sequence of per-episode scores to smooth.
        x: X coordinates for the curve; must have the same length as
            ``score``.
        figure_file: If truthy, the figure is saved to this location and any
            missing parent directory of a path is created first; otherwise
            the figure is shown.
        window: Number of scores in each average (default 100).

    Raises:
        ValueError: If ``window`` is smaller than 1.
    """
    if window < 1:
        raise ValueError('window must be a positive integer')

    # Use the argument that was passed in, not a same-looking global.
    values = np.asarray(score, dtype=float)
    running_avg = np.zeros(len(values))
    for i in range(len(running_avg)):
        # i - window + 1 .. i inclusive is exactly `window` entries.
        running_avg[i] = np.mean(values[max(0, i - window + 1):(i + 1)])

    plt.plot(x, running_avg)
    plt.title(f'Running average of previous {window} scores')
    if figure_file:
        if isinstance(figure_file, (str, os.PathLike)):
            # savefig does not create directories; make sure the target exists.
            parent = os.path.dirname(os.fspath(figure_file))
            if parent:
                os.makedirs(parent, exist_ok=True)
        plt.savefig(figure_file)
    else:
        plt.show()

In [25]:
# Train a one-step actor-critic agent on LunarLander and plot its learning
# curve. The loop below uses the classic gym API as written here: reset()
# returns the observation and step() returns a 4-tuple.
env = gym.make('LunarLander-v2')
n_games = 100

agent = Agent(gamma=0.99, lr=5e-6, input_dims=[8],
              n_actions=4,
              fc1_dims=2048, fc2_dims=1536)

fname = 'ACTOR_CRITIC_' + 'lunar_lander_lr' + str(agent.lr) + \
        '_fc1_dims_' + str(agent.fc1_dims) + '_fc2_dims_' + str(agent.fc2_dims) \
        + '_' + str(n_games) + 'games'
figure_file = 'plots/' + fname + '.png'

scores = []
try:
    for i in range(n_games):
        done = False
        observation = env.reset()
        score = 0
        while not done:
            action = agent.choose_action(observation)
            observation_, reward, done, info = env.step(action)
            score += reward

            # The update needs the *next* observation as its bootstrap state;
            # passing the current one twice makes the TD target meaningless.
            agent.learn(observation, reward, observation_, done)
            observation = observation_

        scores.append(score)
        avg_score = np.mean(scores[-100:])
        print('episode ', i, 'score %.2f' % score,
              'average score %.2f' % avg_score)
finally:
    # Release the environment even if training is interrupted.
    env.close()

x = [i+1 for i in range(len(scores))]
plot_learning_curve(scores, x, figure_file)


episode  0 score -99.98 average score -99.98
episode  1 score -252.08 average score -176.03
episode  2 score -372.87 average score -241.64
episode  3 score -102.91 average score -206.96
episode  4 score -318.28 average score -229.22
episode  5 score -41.46 average score -197.93
episode  6 score -164.88 average score -193.21
episode  7 score -409.39 average score -220.23


KeyboardInterrupt: 