<h5>
A simple game to test the implementation of policy gradient.
<br>
In this game, an N-by-N game board is generated every turn. One cell is labelled 1; all other cells are labelled 0.
<br>
The correct answer is simply the column number of the cell labelled as 1.
</h5>

In [1]:
from n_by_n_squares import NByNSquares
from policy_gradient_nn import NeuralNetwork
import numpy as np
import random
import copy

In [2]:
def play_n_games(nn, num_games, grid_size=2, max_rnd=10, gamma=0, fp_prob_random_select=0.1, choose_best_output=False, verbose=False):
    """Play several independent games and pool their per-round advantages.

    Every game is delegated to ``play_single_game`` with the same settings.

    Args:
        nn: Policy network forwarded to ``play_single_game``.
        num_games: How many games to play.
        grid_size, max_rnd, gamma, fp_prob_random_select, choose_best_output,
        verbose: Forwarded unchanged to ``play_single_game``.

    Returns:
        A numpy array holding the advantages of all games, concatenated in
        the order the games were played.
    """
    pooled = []
    for _ in range(num_games):
        pooled += play_single_game(
            nn,
            grid_size=grid_size,
            max_rnd=max_rnd,
            gamma=gamma,
            fp_prob_random_select=fp_prob_random_select,
            choose_best_output=choose_best_output,
            verbose=verbose,
        )
    return np.array(pooled)

def play_single_game(nn, grid_size=2, max_rnd=10, gamma=0, fp_prob_random_select=0.1, choose_best_output=False, verbose=False):
    """Play one game to completion and return its per-round advantages.

    A fresh ``NByNSquares(grid_size, max_rnd, verbose=verbose)`` is created,
    rounds are played until the game reports that it is over, and the
    collected rewards are turned into advantages by ``calc_advantage``.

    Args:
        nn: Policy network used to pick an action each round.
        grid_size: Passed to the ``NByNSquares`` constructor.
        max_rnd: Passed to the ``NByNSquares`` constructor and used as the
            length of the returned advantage list.
        gamma: Discount factor handed to ``calc_advantage``.
        fp_prob_random_select: Forwarded to ``nn.fp`` via ``play_single_round``.
        choose_best_output: Forwarded to ``nn.fp`` via ``play_single_round``.
        verbose: Passed to the ``NByNSquares`` constructor.

    Returns:
        The list produced by ``calc_advantage`` for this game's rewards.
    """
    board = NByNSquares(grid_size, max_rnd, verbose=verbose)
    round_rewards = []
    # At least one round is always played; stop once the game says it is over.
    while True:
        finished, latest_reward = play_single_round(nn, board, fp_prob_random_select, choose_best_output)
        round_rewards.append(latest_reward)
        if finished:
            break

    return calc_advantage(round_rewards, gamma, max_rnd)
    
def play_single_round(nn, game, fp_prob_random_select, choose_best_output=False):
    """Play one round: show the board to the network and apply its action.

    The board vector from ``game.get_vector_repr()`` is reshaped with
    ``reshape(-1, 1)`` and handed to ``nn.fp``. Only the second value returned
    by ``nn.fp`` is used; its first element is passed to ``game.take_action``.

    Args:
        nn: Object exposing ``fp(vector, fp_prob_random_select, choose_best_output)``.
        game: Object exposing ``get_vector_repr()`` and ``take_action(action)``.
        fp_prob_random_select: Forwarded unchanged to ``nn.fp``.
        choose_best_output: Forwarded unchanged to ``nn.fp``.

    Returns:
        ``(is_game_over, reward)`` -- note the order is swapped relative to
        what ``game.take_action`` returns.
    """
    column_vector = game.get_vector_repr().reshape(-1, 1)
    _, chosen_actions = nn.fp(column_vector, fp_prob_random_select, choose_best_output)
    round_reward, finished = game.take_action(chosen_actions[0])
    return finished, round_reward
        
def calc_advantage(reward, gamma, max_rnd):
    """Turn a sequence of per-round rewards into discounted returns.

    Working backwards from the last round, each entry becomes
    ``reward[i] + gamma * advantage[i + 1]`` (with nothing beyond the last
    reward contributing).

    Args:
        reward: Indexable sequence of per-round rewards.
        gamma: Discount factor applied to the following round's value.
        max_rnd: Length of the returned list.

    Returns:
        A list of length ``max_rnd``. The first ``len(reward)`` entries are
        floats; any remaining entries keep their initial value ``0``.

    Raises:
        IndexError: If ``reward`` holds more than ``max_rnd`` entries.
    """
    discounted = [0] * max_rnd
    running_return = 0
    for step in reversed(range(len(reward))):
        running_return = float(reward[step] + (gamma * running_return))
        discounted[step] = running_return
    return discounted

def normalize_advantage(advantage):
    """Standardize advantages to zero mean and, where possible, unit variance.

    The input is copied into a float array, so the caller's data is never
    modified.

    Args:
        advantage: Array-like of numeric advantage values.

    Returns:
        A float numpy array with the same shape as the input. The values are
        mean-centred and divided by their standard deviation. When every value
        is (numerically) identical the division is skipped and the centred,
        all-zero array is returned instead of NaN. Empty input is returned as
        an empty array.
    """
    normalized_advantage = np.array(advantage, dtype='float')
    if normalized_advantage.size == 0:
        # mean/std of an empty array are NaN; there is nothing to normalize.
        return normalized_advantage

    # Magnitude of the raw values, used to decide whether the spread is real
    # or merely floating-point rounding noise left over from centring.
    magnitude = np.max(np.abs(normalized_advantage))

    normalized_advantage -= np.mean(normalized_advantage)
    spread = np.std(normalized_advantage)

    # Constant input has zero spread; dividing would produce 0/0 = NaN (or
    # blow rounding noise up to +/-1), which would then poison the gradients.
    if spread > 1e-12 * magnitude:
        normalized_advantage /= spread
    return normalized_advantage

In [81]:
# Build the policy network that is trained below and evaluated in the next cell.
# NOTE(review): the list is presumably the layer sizes -- 9 inputs for the
# flattened 3x3 board, two hidden layers of 28 units, 3 outputs (one per
# column) -- confirm against policy_gradient_nn.NeuralNetwork.
nn = NeuralNetwork([9,28,28,3])

# Training loop: 101 iterations (i = 0..100) so the last progress line is printed at epoch 100.
for i in range(101):
    # Each epoch plays 5 games of 5 rounds on a 3x3 board, i.e. 25 advantage values.
    advantage_all = play_n_games(nn, 5, grid_size=3, max_rnd=5, gamma=0.9, fp_prob_random_select=0.15)
    # Keep the raw mean for reporting; normalization re-centres the values around zero.
    mean_adv = np.mean(advantage_all)
    normalized_advantage_all = normalize_advantage(advantage_all)
    # NOTE(review): bp presumably pairs these values with the forward passes
    # cached by nn.fp while the games were played -- confirm; the
    # bp -> update_weights -> clear_caches order is kept as-is for that reason.
    nn.bp(normalized_advantage_all)
    # NOTE(review): 0.2 is presumably the learning rate -- confirm.
    nn.update_weights(0.2)
    nn.clear_caches()
    if i%20==0: 
        # NOTE(review): the value is averaged over 5 games x 5 rounds, although the label says "5 rounds".
        print('Mean advantage over 5 rounds at epoch {}: {}'.format(i, mean_adv))

Mean advantage over 5 rounds at epoch 0: -1.3300919999999998
Mean advantage over 5 rounds at epoch 20: -0.41495600000000005
Mean advantage over 5 rounds at epoch 40: 0.46096400000000004
Mean advantage over 5 rounds at epoch 60: 1.52834
Mean advantage over 5 rounds at epoch 80: 2.3537
Mean advantage over 5 rounds at epoch 100: 2.301212


In [82]:
# % rounds w/ correct answer, out of 1000 rounds (100 games x 10 rounds).
# gamma=0 makes calc_advantage return the raw per-round rewards, so `rewards`
# holds one reward per round played.
rewards = play_n_games(nn, 100, grid_size=3, max_rnd=10, fp_prob_random_select=0, gamma=0, choose_best_output=True, verbose=False)
nn.clear_caches()
# Count the rounds with a positive reward instead of averaging the reward
# values: the negative mean advantages seen early in training show that
# rewards can be negative, so their mean is not a percentage in general.
# NOTE(review): assumes a correct answer is exactly a round with reward > 0 --
# confirm against NByNSquares.take_action.
pct_correct = np.mean(rewards > 0) * 100
print('% rounds with correct answer: {}%'.format(pct_correct))

% rounds with correct answer: 100.0%
