In [1]:
# GridBoard.py and Gridworld in the main repository have been directly copied from 
# Alexander Zai. “Deep Reinforcement Learning in Action MEAP V06”
# whereas the text in this notebook is my own implementation of the code in the book
# licenses are inherited from the ones used by A.Zai et al in the mentioned book
import numpy as np
import torch
import torch.nn as nn
from torch.nn import functional as F
from collections import OrderedDict
from torch.autograd import Variable
from Gridworld import *
from IPython.display import clear_output
import random
from matplotlib import pylab as plt

In [18]:
def softmax_fn(av, tau = 1.12):
# This function receives average rewards and outputs the softmax probabilities 
# Arguments:
#   - av: expected averages
#   - tau: temperature. High val exaggerates differences; low value promotes homogenity
# Output:
#   - Softmaxed values

    softm = np.exp(av / tau) / np.sum( np.exp(av[:] / tau) )
    return softm

In [52]:

class QNet():
    def __init__(self, n_in, n_out, n_hidden1, n_hidden2, action_set, gridSize = 4, gamma = 0.9):
        self.n_in = n_in
        self.n_out = n_out
        self.action_set = action_set
        self.gridSize = gridSize
        self.newGridGame()
        self.gamma = gamma
       # self.one_hot_reward = np.ones(arms)

        # Neural network model definition 
        self.model = nn.Sequential(OrderedDict([
            ('fc1', nn.Linear(n_in, n_hidden1)),
            ('ReLu1', nn.ReLU(inplace = True)),
            ('fc2', nn.Linear(n_hidden1, n_hidden2)),
            ('ReLu2', nn.ReLU(inplace = True)),
            ('fc3', nn.Linear(n_hidden2, n_out)),
            ('ReLu3', nn.ReLU(inplace = True))
        ])
        )
                                  
    def newGridGame(self):
        self.env = Gridworld(size=self.gridSize, mode='static') 
        self.refreshState()
                                  
    def refreshState(self):
        # Get state with some random noise
        state_ = self.env.board.render_np().reshape(1,64) + np.random.rand(1,64)/10.0 
        self.state = torch.from_numpy(state_).float()
                                  
    def forward(self):
        # Get reward prediction from the model
        y_pred = self.model(self.state).squeeze() 
        # Obtain the probability distribution of the reward prediction
        print("ypred: ", y_pred.data.numpy())
        av_softmax = softmax_fn(y_pred.data.numpy(), tau=2.0)  
        print("av_softmax: ", av_softmax)

        av_softmax /= av_softmax.sum() 

        return av_softmax, y_pred
    
    def getQReward(self):
        cur_reward = self.env.reward()
        # The reward of a Q_learning algorithm takes into account propect
        # So ,we need to get new state and the expected reward (maxQval)
        new_state_ = self.env.board.render_np().reshape(1,64) + np.random.rand(1,64)/10.0 
        new_state_ = torch.from_numpy(new_state_).float()
        with torch.no_grad(): # we do not want this pred to be considered in grads
            fut_qval = self.model(new_state_)
        
        # Are we in the endgame?
        if cur_reward == -1:
            QReward = cur_reward + self.gamma*torch.max(fut_qval)
        else:
            QReward = reward
        
        return QReward, cur_reward
        
    def actuate(self, av_softmax, y_pred, action_map = None):
        # This function returns the reward to be used for backpropagation and the cur_reward obtained 
        # The backprop reward is such that only modifies the nodes related with obtaining cur_reward
        # To do this, we copy the forward output and modify the value of the array corresponding with the 
        # choice made. The new value of this position is the cur_reward.
        
        # Probabilistically choose an action 
        print("choosing among: ", self.n_out)
        print("av_softmax: ", av_softmax)
        choice = np.random.choice(self.n_out, p=av_softmax) 
        # convert the choice to the action using the action set of the game
        action = self.action_set[choice]
        # Make the action
        self.env.makeMove(action)
        # Execute action and get the reward for it
        Qreward, status_ = self.getQReward()
        # Copy qvals
        backprop_reward = y_pred.data.numpy().copy()
        # Update val corresponding to choice so it matches the QReward
        backprop_reward[choice] = Qreward
                                          
        # Return reward
        return backprop_reward,status_
        

In [65]:
def train(Qmodel, loss_fn, optimizer, epochs = 5000):
    
    for epoch in range(epochs): # Episodic training, each epoch is a new game
        
        optimizer.zero_grad() # reset grads between epochs
        # Create a new game for each epoch
        Qmodel.newGridGame() # Also refreshes the model state
       
        # Track play status (has game ended?) 
        game_active = True
        while(game_active): # while current play is going on
            actions_prob, y_pred = Qmodel.forward() # get prob distribution and curr_reward
            # actuate based on prob - determines explore-exploit
            act_rewards, status = Qmodel.actuate(actions_prob, y_pred) 
            # Compute loss
            print('in loss. y_pred {}, rewards: {}'.format(len(y_pred.data.numpy()),len(act_rewards)))
            loss = loss_fn(y_pred.data.numpy(), act_rewards) # compute loss
            print('epoch {}. Loss = {}'.format(epoch, loss))
            losses.append(loss.item())
            loss.backward() # obtain backward propagation gradients
            optimizer.step() # apply backprop to network model
            
            if status == -1:
                game_active = True
            else:
                game_active = False
                
            
            


In [66]:
# Set params 
action_map = {
    0:'u',
    1:'d',
    2:'l',
    3:'r'
}
n_out = len(action_map) # 4
gridSize = 4
n_in = 4*gridSize*gridSize
learning_rate = 1e-4
gamma = 0.9 # decay ratio

# Create RL object
Qnet = QNet(n_in, n_out, 164, 150, action_map, gridSize, gamma)
print(Qnet.model)

# Loss metric and optimization criterion for training
loss_fn = nn.MSELoss(size_average=True)
criterion = torch.optim.Adam(Qnet.model.parameters(), lr=learning_rate)


Sequential(
  (fc1): Linear(in_features=64, out_features=164, bias=True)
  (ReLu1): ReLU(inplace)
  (fc2): Linear(in_features=164, out_features=150, bias=True)
  (ReLu2): ReLU(inplace)
  (fc3): Linear(in_features=150, out_features=4, bias=True)
  (ReLu3): ReLU(inplace)
)


In [67]:
losses =[]
train(Qnet, loss_fn, criterion, epochs = 100)

ypred:  [0.         0.02122179 0.08242083 0.08360937]
av_softmax:  [0.24417464 0.24677935 0.2544474  0.25459865]
choosing among:  4
av_softmax:  [0.24417464 0.24677935 0.2544474  0.25459865]
in loss. y_pred 4, rewards: 4


TypeError: 'int' object is not callable

In [None]:
plt.figure(figsize=(10,7))
plt.plot(losses)
plt.xlabel("Plays")
plt.ylabel("Loss")

In [10]:
len(action_map)

4

In [1]:


from Gridworld import Gridworld
game = Gridworld(size=4, mode='static')



In [2]:
game.display()


array([['+', '-', ' ', 'P'],
       [' ', 'W', ' ', ' '],
       [' ', ' ', ' ', ' '],
       [' ', ' ', ' ', ' ']], dtype='<U2')

In [3]:
game.makeMove('d')
game.makeMove('d')
game.makeMove('l')
game.display()

array([['+', '-', ' ', ' '],
       [' ', 'W', ' ', ' '],
       [' ', ' ', 'P', ' '],
       [' ', ' ', ' ', ' ']], dtype='<U2')

In [4]:
game.reward()

-1

In [5]:
game.board.render_np()

array([[[0, 0, 0, 0],
        [0, 0, 0, 0],
        [0, 0, 1, 0],
        [0, 0, 0, 0]],

       [[1, 0, 0, 0],
        [0, 0, 0, 0],
        [0, 0, 0, 0],
        [0, 0, 0, 0]],

       [[0, 1, 0, 0],
        [0, 0, 0, 0],
        [0, 0, 0, 0],
        [0, 0, 0, 0]],

       [[0, 0, 0, 0],
        [0, 1, 0, 0],
        [0, 0, 0, 0],
        [0, 0, 0, 0]]], dtype=uint8)

In [None]:
4