In [3]:
import gym
from policy_gradient_nn import NeuralNetwork
import numpy as np
import random
import copy
import time
import pickle

In [4]:
def play_n_games(nn, num_games, gamma=0.0, fp_prob_random_select=0.1, choose_best_output=False):
    """Play several games and pool their per-step advantages.

    Args:
        nn: network object handed through to ``play_single_game``.
        num_games: number of games to play.
        gamma: discount factor forwarded to ``play_single_game``.
        fp_prob_random_select: forwarded to ``play_single_game`` (which
            passes it on to ``nn.fp``).
        choose_best_output: forwarded to ``play_single_game`` (which
            passes it on to ``nn.fp``).

    Returns:
        Tuple ``(advantage_all, time_survived)``: a numpy array holding the
        advantages of every game concatenated in play order, and a list with
        the number of steps each game lasted.
    """
    advantage_all = []
    time_survived = []
    for _ in range(num_games):
        # Pass by keyword: the fourth positional parameter of
        # play_single_game is `render`, so a positional choose_best_output
        # would switch rendering on instead of reaching nn.fp.
        advantage = play_single_game(
            nn,
            gamma=gamma,
            fp_prob_random_select=fp_prob_random_select,
            choose_best_output=choose_best_output,
        )
        advantage_all.extend(advantage)
        time_survived.append(len(advantage))
    return np.array(advantage_all), time_survived

def play_single_game(nn, gamma=0.0, fp_prob_random_select=0.1, render=False, choose_best_output=False):
    """Play one CartPole-v1 episode with ``nn`` choosing every action.

    The network input is the three most recent observations stacked into one
    column vector; at the start the initial observation is repeated three
    times.

    Args:
        nn: object exposing ``fp(state, fp_prob_random_select,
            choose_best_output)``. The second element of its return value is
            indexed with ``[0]`` and cast to ``int`` to obtain the action.
        gamma: discount factor passed to ``calc_advantage``.
        fp_prob_random_select: passed through to ``nn.fp``.
        render: when true, ``env.render()`` is called before every step.
        choose_best_output: passed through to ``nn.fp``.

    Returns:
        List with one discounted return per step played, as computed by
        ``calc_advantage``.
    """
    max_steps = 500         # CartPole-v1 ends the episode after 500 steps
    failure_penalty = -100  # reward substituted on the step where the game is lost early

    reward = []
    is_game_over = False

    env = gym.make('CartPole-v1')
    try:
        state_curr = env.reset()
        # Start with three copies of the first observation so the stacked
        # input already has its full length on the first step.
        state_recent_three = np.hstack([state_curr, state_curr, state_curr])
        while not is_game_over:
            if render:
                env.render()
            # Slide the window: drop the oldest observation (4 values) and
            # append the newest one.
            state_recent_three = np.hstack([state_recent_three[4:], state_curr])
            _, action_curr = nn.fp(state_recent_three.reshape(-1, 1), fp_prob_random_select, choose_best_output)
            action_curr = int(action_curr[0])
            state_curr, reward_curr, is_game_over, _ = env.step(action_curr)

            # `reward` does not yet contain the current step, hence the +1.
            # Only an episode that ends before the step limit counts as a
            # failure; surviving all max_steps steps is not penalised.
            if is_game_over and len(reward) + 1 < max_steps:
                reward_curr = failure_penalty
            reward.append(reward_curr)
    finally:
        # Release the environment (and any render window) even if the
        # network or the environment raises mid-episode.
        env.close()

    advantage = calc_advantage(reward, gamma, len(reward))
    return advantage

def calc_advantage(reward, gamma, max_rnd):
    """Compute the discounted return for every step of an episode.

    Walking backwards through ``reward``, each entry becomes
    ``reward[i] + gamma * (value computed for step i + 1)``.

    Args:
        reward: sequence of per-step rewards.
        gamma: discount factor applied to the following step's value.
        max_rnd: length of the returned list; positions beyond
            ``len(reward)`` keep their initial value of 0.

    Returns:
        List of length ``max_rnd`` whose first ``len(reward)`` entries are
        the discounted returns as floats.
    """
    advantage = [0] * max_rnd
    running_return = 0
    # Iterate from the last step to the first so each step can reuse the
    # already accumulated return of its successor.
    for step in reversed(range(len(reward))):
        running_return = float(reward[step] + (gamma * running_return))
        advantage[step] = running_return
    return advantage

def normalize_advantage(advantage):
    """Standardise advantages to zero mean and unit standard deviation.

    Args:
        advantage: sequence or array of numeric advantages. It is copied,
            so the caller's data is not modified.

    Returns:
        Float numpy array of the same length. When every input value is
        identical (standard deviation 0) the centred values are returned
        without scaling, so the result is all zeros rather than NaN/inf.
        An empty input yields an empty array.
    """
    normalized_advantage = np.array(advantage, dtype='float')
    if normalized_advantage.size == 0:
        # Nothing to normalise; avoids the NaN mean/std of an empty array.
        return normalized_advantage

    normalized_advantage -= np.mean(normalized_advantage)
    spread = np.std(normalized_advantage)
    # A zero spread would turn the division into NaN/inf values that then
    # flow into back-propagation; leave the centred zeros as they are.
    if spread > 0:
        normalized_advantage /= spread
    return normalized_advantage

In [None]:
# Policy network. The 12 inputs match the three stacked 4-value observations
# that play_single_game feeds to nn.fp; the remaining entries are presumably
# two hidden layers and the 2 action outputs -- confirm in policy_gradient_nn.
nn = NeuralNetwork([12, 128, 128, 2])

# Training hyper-parameters.
gamma = 0.99
fp_prob_random_select = 0.05
learning_rate_init = 0.10
learning_rate_decay = 0  # 0 keeps the learning rate constant across epochs

for epoch in range(1000):
    # One weight update per batch of 5 games. The batch size could instead
    # depend on how long each game lasted, but a fixed size works for now.
    advantage_all, time_survived = play_n_games(
        nn,
        5,
        gamma=gamma,
        fp_prob_random_select=fp_prob_random_select,
    )
    normalized_advantage_all = normalize_advantage(advantage_all)
    learning_rate = learning_rate_init / (1 + learning_rate_decay * epoch)

    nn.bp(normalized_advantage_all)
    nn.update_weights(learning_rate=learning_rate)
    nn.clear_caches()

    # Report progress only on every 50th epoch.
    if epoch % 50:
        continue
    print('Epoch={} lr={}'.format(epoch, learning_rate))
    print('Avg time survived over 5 rounds={}'.format(sum(time_survived)/len(time_survived)))

In [5]:
# To save the current network to disk instead, uncomment:
# with open('CartPole-v1_trained_NN.pickle', 'wb') as nn_file:
#     pickle.dump(nn, nn_file)

# Restore a previously saved network from disk.
# NOTE: pickle.load can execute arbitrary code while unpickling -- only load
# pickle files that come from a trusted source.
with open('CartPole-v1_trained_NN.pickle', 'rb') as nn_file:
    nn = pickle.load(nn_file)

In [6]:
# Average score over 100 games, with random action selection switched off
# (fp_prob_random_select=0.0) and choose_best_output enabled.
adv_len = []
for _ in range(100):
    adv = play_single_game(
        nn,
        gamma=0.0,
        fp_prob_random_select=0.0,
        render=False,
        choose_best_output=True,
    )
    # Clear the network's caches after every game, as the training loop does
    # after each update.
    nn.clear_caches()
    # One advantage entry per step, so the length is the game's score.
    adv_len.append(len(adv))

avg_score = sum(adv_len)/len(adv_len)
print('Avg score over 100 games: {}'.format(avg_score))

Avg score over 100 games: 500.0


In [8]:
# Render one game played by the trained network, with random action
# selection switched off and choose_best_output enabled.
adv = play_single_game(
    nn,
    gamma=0.0,
    fp_prob_random_select=0.0,
    render=True,
    choose_best_output=True,
)
nn.clear_caches()
# One advantage entry per step, so the length is the score of this game.
score = len(adv)
print('Score this round: {}'.format(score))

Score this round: 500
