# Policy Gradient RL model 001

In [5]:
import copy

import random
import math
import numpy as np
from inspect import isfunction

import torch
print('Torch version:', torch.__version__)

import torch.nn as nn
import torch.nn.init as init
import torch.nn.functional as F

import matplotlib.pyplot as plt

from tqdm import tqdm

import sys

Torch version: 1.11.0


## Policy and agent classes.

In [140]:
# Integer indices of the 128 discrete states, and the same values as a float tensor.
state_space = list(range(128))
state_space_tensor = torch.Tensor(list(map(float, state_space)))

In [141]:
# Sanity check: display the float tensor of state indices built in the previous cell.
print(state_space_tensor)

tensor([  0.,   1.,   2.,   3.,   4.,   5.,   6.,   7.,   8.,   9.,  10.,  11.,
         12.,  13.,  14.,  15.,  16.,  17.,  18.,  19.,  20.,  21.,  22.,  23.,
         24.,  25.,  26.,  27.,  28.,  29.,  30.,  31.,  32.,  33.,  34.,  35.,
         36.,  37.,  38.,  39.,  40.,  41.,  42.,  43.,  44.,  45.,  46.,  47.,
         48.,  49.,  50.,  51.,  52.,  53.,  54.,  55.,  56.,  57.,  58.,  59.,
         60.,  61.,  62.,  63.,  64.,  65.,  66.,  67.,  68.,  69.,  70.,  71.,
         72.,  73.,  74.,  75.,  76.,  77.,  78.,  79.,  80.,  81.,  82.,  83.,
         84.,  85.,  86.,  87.,  88.,  89.,  90.,  91.,  92.,  93.,  94.,  95.,
         96.,  97.,  98.,  99., 100., 101., 102., 103., 104., 105., 106., 107.,
        108., 109., 110., 111., 112., 113., 114., 115., 116., 117., 118., 119.,
        120., 121., 122., 123., 124., 125., 126., 127.])


In [178]:
class Policy(nn.Module):
    """Feed-forward policy network mapping a (goal, state) pair to an action distribution.

    The state and goal vectors are concatenated along their last dimension and
    passed through a stack of ``Linear -> Dropout -> LeakyReLU`` blocks, followed
    by a final ``Linear`` layer and a softmax over the last dimension.

    Args:
        input_features: Length of each of the goal and state vectors; the first
            linear layer therefore takes ``2 * input_features`` inputs.
        out_features: Length of the output distribution (number of actions).
        layer_count: Total number of linear layers. The first and last linear
            layers are always created, so values below 2 still yield two.
        hidden_features: Width of every hidden layer.
        dropout: Dropout probability applied after each hidden linear layer.
    """

    def __init__(self, input_features=128, out_features=128, layer_count=4, hidden_features=600, dropout=.1):
        super().__init__()

        self.input_features = input_features
        self.out_features = out_features
        self.layer_count = layer_count
        self.hidden_features = hidden_features
        self.dropout = dropout

        self.layers = nn.ModuleList()

        # Input block: the concatenated (state, goal) vector -> hidden width.
        self.layers.append(nn.Linear(2*self.input_features, self.hidden_features))
        self.layers.append(torch.nn.Dropout(p=self.dropout))
        self.layers.append(nn.LeakyReLU())

        # Hidden blocks: everything except the first and last linear layers.
        for n in range(self.layer_count-2):
            self.layers.append(nn.Linear(self.hidden_features, self.hidden_features))
            self.layers.append(torch.nn.Dropout(p=self.dropout))
            self.layers.append(nn.LeakyReLU())

        # Output block: hidden width -> probabilities over the actions.
        self.layers.append(nn.Linear(self.hidden_features, self.out_features))
        self.layers.append(nn.Softmax(dim=-1))

    def forward(self, goal_dist, state_dist):
        """Compute the action distribution and its arg-max action.

        Args:
            goal_dist: Tensor whose last dimension has ``input_features`` entries.
            state_dist: Tensor of the same shape as ``goal_dist``.

        Returns:
            A tuple ``(output_dist, action_index)``. ``output_dist`` has
            ``out_features`` entries along its last dimension and sums to 1
            there. ``action_index`` holds the index of the largest probability,
            with the last dimension kept at size 1 (shape ``(1,)`` for
            unbatched inputs).
        """
        # Join along the feature axis so that inputs with leading batch
        # dimensions work as well as plain 1-D vectors.
        x = torch.cat((state_dist, goal_dist), dim=-1)
        for layer in self.layers:
            x = layer(x)
        output_dist = x
        # Arg-max per distribution rather than over the flattened tensor.
        action_index = torch.argmax(output_dist, dim=-1, keepdim=True)
        return output_dist, action_index

**Testing:**

In [179]:
policy = Policy()
print('\nLayer list:\n', policy.layers)

goal_dist = torch.rand(128)
state_dist = torch.rand(128)

# Run one forward pass and reuse its results. A freshly constructed module is in
# training mode, so dropout is active and separate calls would each report a
# different output. Keyword arguments keep each input matched to the parameter
# of the same name in Policy.forward(goal_dist, state_dist).
test_output_dist, test_action_index = policy(goal_dist=goal_dist, state_dist=state_dist)

print('\nLength of output distribution:', len(test_output_dist))
print('\nSum of output distribution:', test_output_dist.sum())
print('\nPolicy\'s action:', test_action_index)


Layer list:
 ModuleList(
  (0): Linear(in_features=256, out_features=600, bias=True)
  (1): Dropout(p=0.1, inplace=False)
  (2): LeakyReLU(negative_slope=0.01)
  (3): Linear(in_features=600, out_features=600, bias=True)
  (4): Dropout(p=0.1, inplace=False)
  (5): LeakyReLU(negative_slope=0.01)
  (6): Linear(in_features=600, out_features=600, bias=True)
  (7): Dropout(p=0.1, inplace=False)
  (8): LeakyReLU(negative_slope=0.01)
  (9): Linear(in_features=600, out_features=128, bias=True)
  (10): Softmax(dim=-1)
)

Length of output distribution: 128

Sum of output distribution: tensor(1.0000, grad_fn=<SumBackward0>)

Policy's action: tensor([91])


In [265]:
class Agent():
    """Epsilon-greedy agent that rolls out episodes over integer-indexed states.

    States, actions and the goal are all indices in ``[0, state_count)``; taking
    an action moves the agent to the state with that index. The states visited
    and the rewards collected during the current episode are stored as 1-D
    float tensors in ``episode_states`` and ``episode_rewards``.

    Args:
        policy: Callable invoked as ``policy(goal_dist, state_dist)`` with two
            one-hot float tensors of length ``state_count``; the second element
            of its return value is used as the action index (a 1-element
            tensor). When ``None`` a new ``Policy()`` is created for this agent.
        goal_reward: Bonus added to the reward when the action equals the goal.
        epsilon: Probability of picking a uniformly random action instead of
            asking the policy.
        state_count: Number of discrete states/actions.
    """

    def __init__(self, policy=None, goal_reward = 20, epsilon=.2, state_count=128):
        # Build the default network per instance: a ``Policy()`` default argument
        # would be created once at class-definition time and shared by every agent.
        self.policy = Policy() if policy is None else policy
        self.goal_reward = torch.Tensor([goal_reward])

        self.epsilon = epsilon
        self.state_count = state_count

        self.episode_states = torch.Tensor()
        self.episode_rewards = torch.Tensor()

        self.episode_goal = torch.Tensor()

        # Float index of every state; compared against an index to build one-hot vectors.
        self.state_space_tensor = torch.Tensor([float(i) for i in range(self.state_count)])

        self.explore_possibilities = ['explore', 'on_policy']
        self.explore_weights = [self.epsilon, 1.-self.epsilon]

    def reward(self, goal_index, state_index, action_index):
        """Return the reward for choosing ``action_index`` in ``state_index``.

        The direction term is ``sign(action - state) * sign(goal - state)``:
        +1 when the action lies on the same side of the state as the goal, -1
        when it lies on the opposite side, and 0 when the action equals the
        state or the state equals the goal. ``goal_reward`` is added when the
        action equals the goal.
        """
        action_direction = torch.sign(action_index - state_index)
        goal_direction = torch.sign(goal_index - state_index)

        direction_reward = action_direction * goal_direction

        goal_reward = self.goal_reward * torch.eq(action_index, goal_index)

        return direction_reward + goal_reward

    def initiate_episode(self, goal_index, start_index):
        """Reset the episode buffers to a single start state with reward 0."""
        self.episode_goal = torch.Tensor([goal_index])

        self.episode_states = torch.Tensor([start_index])
        self.episode_rewards = torch.Tensor([0.])

    def act(self):
        """Take one epsilon-greedy step and append the new state and its reward."""
        last_state_index = self.episode_states[-1]
        last_state_dist = torch.eq(self.state_space_tensor, last_state_index).float()

        goal_dist = torch.eq(self.state_space_tensor, self.episode_goal).float()

        # random.choices returns a list, so unwrap its single element before
        # comparing; comparing the list itself to a string is always False.
        explore_Q = random.choices(self.explore_possibilities, self.explore_weights)[0]
        if explore_Q == self.explore_possibilities[0]:
            new_action_index = torch.Tensor([random.randrange(0, self.state_count)])
        else:
            # Use this agent's own policy. Only the arg-max index is kept, so no
            # autograd graph is needed for this call.
            with torch.no_grad():
                new_action_index = self.policy(goal_dist, last_state_dist)[1]
            # The arg-max index is an integer tensor; match the float state history.
            new_action_index = new_action_index.to(self.episode_states.dtype)

        self.episode_states = torch.cat((self.episode_states, new_action_index))

        new_reward = self.reward(self.episode_goal, last_state_index, new_action_index)

        self.episode_rewards = torch.cat((self.episode_rewards, new_reward))

    def generate_episode(self, goal_index, start_index, time_step_count):
        """Run a fresh episode until the goal is reached or the step budget is spent.

        Args:
            goal_index: Index of the goal state.
            start_index: Index of the starting state.
            time_step_count: Maximum number of actions to take.
        """
        self.initiate_episode(goal_index, start_index)
        step_index = 0
        while step_index < time_step_count and self.episode_states[-1].item() != self.episode_goal.item():
            self.act()
            step_index += 1

**Testing:**

In [264]:
# Smoke test for Agent: check the empty buffers, the reward function, single
# steps via act(), and a full rollout via generate_episode().
agent = Agent()
#print('\nAgent\'s policy model:\n', agent.policy)

# Before initiate_episode() is called both buffers are empty tensors.
print('\nUn-nitialized episode state list:', agent.episode_states)
print('Un-nitialized episode reward list:', agent.episode_rewards)

# Random goal / state / action indices, each wrapped as a 1-element float tensor.
goal_index = torch.Tensor([random.randrange(0,128)])
state_index = torch.Tensor([random.randrange(0,128)])
action_index = torch.Tensor([random.randrange(0,128)])
print('\nGeneric reward:', agent.reward(goal_index, state_index, action_index))
# Action equal to the goal: the goal bonus is added on top of the direction term.
print('\nReward in case where action takes us to goal:', agent.reward(goal_index, state_index, goal_index))
# State, action and goal all equal: the direction term is 0, leaving only the bonus.
print('\nReward in case where state = action = goal:', agent.reward(goal_index, goal_index, goal_index))

print('\n___________________________________________________\n')

# Start an episode by hand and take two individual steps.
agent.initiate_episode(goal_index.item(), state_index.item())
print('\nInitialized episode goal:', agent.episode_goal)
print('Initialized episode state list:', agent.episode_states)
print('Initialized episode reward list:', agent.episode_rewards)

agent.act()
print('\nEpisode state list after one action:', agent.episode_states)
print('Episode reward list after one action:', agent.episode_rewards)

agent.act()
print('\nEpisode state list after two actions:', agent.episode_states)
print('Episode reward list after two actions:', agent.episode_rewards)

print('\n___________________________________________________\n')

# Reset only so the starting values can be printed; generate_episode() below
# calls initiate_episode() again itself.
agent.initiate_episode(goal_index.item(), state_index.item())
print('\nInitialized episode goal:', agent.episode_goal)
print('Initialized episode state list:', agent.episode_states)
print('Initialized episode reward list:', agent.episode_rewards)

# Full rollout: stops after step_count actions or as soon as the goal is reached.
step_count = 30
agent.generate_episode(goal_index.item(), state_index.item(), step_count)
print('\nEpisode state list after {} actions or goal:\n'.format(step_count), agent.episode_states)
print('Episode reward list after {} actions or goal:\n'.format(step_count), agent.episode_rewards)


Un-nitialized episode state list: tensor([])
Un-nitialized episode reward list: tensor([])

Generic reward: tensor([1.])

Reward in case where action takes us to goal: tensor([21.])

Reward in case where state = action = goal: tensor([20.])

___________________________________________________


Initialized episode goal: tensor([56.])
Initialized episode state list: tensor([90.])
Initialized episode reward list: tensor([0.])

Episode state list after one action: tensor([ 90., 110.])
Episode reward list after one action: tensor([ 0., -1.])

Episode state list after two actions: tensor([ 90., 110.,  91.])
Episode reward list after two actions: tensor([ 0., -1.,  1.])

___________________________________________________


Initialized episode goal: tensor([56.])
Initialized episode state list: tensor([90.])
Initialized episode reward list: tensor([0.])

Episode state list after 30 actions or goal:
 tensor([ 90.,  91.,  91.,  94.,  91.,  91.,  91.,  91., 110.,  91.,  53.,  91.,
         91.

## Training loop.