In [None]:
import warnings
warnings.filterwarnings('ignore')
warnings.simplefilter(action='ignore', category=FutureWarning)
import gym
import pandas as pd
import numpy as np
import tensorflow as tf
import os
import os.path
import time
import pickle
from PIL import Image
import shutil
import matplotlib.pyplot as plt
import random
import cv2
from datetime import datetime

# Gym environment id passed to gym.make() by the training cell.
ENV_NAME = 'CartPole-v0'
# Root directory for everything this notebook writes to disk.
MODEL_LOG_DIR = '../log/'
# Sub-directory used for the policy-gradient run's summaries and checkpoint.
TRAIN_LOG_DIR = ''.join([MODEL_LOG_DIR, 'train/', 'pg/'])

In [None]:
def get_one_hot(y, n_class):
    """One-hot encode integer class labels.

    Args:
        y: integer label(s) as a scalar, list or array of any shape.
        n_class: number of classes, i.e. the length of each one-hot vector.

    Returns:
        Float array of shape ``y.shape + (n_class,)`` whose last axis holds
        a 1.0 at the label's index and 0.0 elsewhere.
    """
    labels = np.array(y)
    identity = np.eye(n_class)
    # Row i of the identity matrix is the one-hot vector for class i.
    flat_one_hot = identity[labels.ravel()]
    return flat_one_hot.reshape(labels.shape + (n_class,))

def get_discounted_values(values, gamma):
    """Compute standardized discounted returns for one episode.

    For each step ``i`` the return is ``values[i] + gamma * return[i + 1]``
    (accumulated from the last step backwards). The returns are then
    shifted to zero mean and, when possible, scaled to unit standard
    deviation.

    Args:
        values: sequence of per-step rewards (ints or floats).
        gamma: discount factor applied to the return of the following step.

    Returns:
        ``np.ndarray`` of float64 with the same length as ``values``.
        An empty input yields an empty array. When all returns are equal
        (standard deviation of zero) the centered values (all zeros) are
        returned instead of dividing by zero.
    """
    # Force a float dtype: np.zeros_like on integer rewards would create an
    # integer buffer and silently truncate every discounted return.
    rewards = np.asarray(values, dtype=np.float64)
    discounted_values = np.zeros_like(rewards)
    if rewards.size == 0:
        return discounted_values

    accumulative = 0.0
    for i in reversed(range(len(rewards))):
        accumulative = rewards[i] + gamma * accumulative
        discounted_values[i] = accumulative

    centered = discounted_values - np.mean(discounted_values)
    std = np.std(discounted_values)
    # Guard against 0/0 -> NaN, which would poison the network weights.
    if std > 0:
        return centered / std
    return centered

In [None]:
class Network:
    """Softmax policy network with two hidden ReLU layers (TF1 graph mode).

    The constructor builds the graph, the Adam training op and the
    TensorBoard summaries, then initializes all variables in ``session``.

    Attributes of interest:
        states / actions / discounted_rewards: placeholders fed by ``train``.
        logits: pre-softmax output of the policy layer.
        action_proba: softmax over ``logits`` (one probability per action).
        loss: mean of ``discounted_rewards * cross_entropy(actions, logits)``.
        global_step: number of ``train`` calls so far (Python int).
    """

    def __init__(self, session, n_in, n_out):
        """Build the graph and initialize variables.

        Args:
            session: ``tf.Session`` used for every ``run`` call.
            n_in: size of one observation vector.
            n_out: number of discrete actions.
        """
        self.session = session
        self.n_in = n_in
        self.n_out = n_out
        self.global_step = 0

        # Hidden layer widths.
        self.n1 = 10
        self.n2 = 2

        with tf.name_scope('PlaceHolders'):
            self.states = tf.placeholder(tf.float32, [None, self.n_in], name='states')
            # One-hot encoded actions taken in each state.
            self.actions = tf.placeholder(tf.float32, [None, self.n_out], name='actions')
            self.discounted_rewards = tf.placeholder(tf.float32, [None, ], name='dist_rewards')

            # Scalars that are only fed when writing reward summaries.
            self.reward = tf.placeholder(tf.float32, [], name='reward')
            self.rewards_mean = tf.placeholder(tf.float32, [], name='rewards')

        with tf.name_scope('FullyConnectedLayer_1'):
            self.W_fc1 = tf.get_variable('W_fc1', shape=[self.n_in, self.n1])
            self.b_fc1 = tf.get_variable('b_fc1', shape=[self.n1])
            self.h_fc1 = tf.nn.relu(tf.add(tf.matmul(self.states, self.W_fc1), self.b_fc1))

        with tf.name_scope('FullyConnectedLayer_2'):
            self.W_fc2 = tf.get_variable('W_fc2', shape=[self.n1, self.n2])
            self.b_fc2 = tf.get_variable('b_fc2', shape=[self.n2])
            self.h_fc2 = tf.nn.relu(tf.add(tf.matmul(self.h_fc1, self.W_fc2), self.b_fc2))

        with tf.name_scope('PolicyLayer'):
            # Variable names 'W_fc5'/'b_fc5' are left unchanged so that
            # checkpoints written by earlier runs remain loadable.
            self.W_fc3 = tf.get_variable('W_fc5', shape=[self.n2, self.n_out])
            self.b_fc3 = tf.get_variable('b_fc5', shape=[self.n_out])
            # Keep the unscaled logits: the loss below needs them.
            self.logits = tf.add(tf.matmul(self.h_fc2, self.W_fc3), self.b_fc3)
            self.action_proba = tf.nn.softmax(self.logits, name='Policy')

        with tf.name_scope('LearningRate'):
            self.lr = 0.01

        with tf.name_scope('Loss'):
            # softmax_cross_entropy_with_logits_v2 applies softmax itself, so
            # it must receive the raw logits. Feeding it action_proba (already
            # a softmax output) would apply softmax twice and the result would
            # no longer be -log pi(a|s).
            self.neg_log_proba = tf.nn.softmax_cross_entropy_with_logits_v2(
                logits=self.logits, labels=self.actions)
            self.loss = tf.reduce_mean(self.discounted_rewards * self.neg_log_proba)

        with tf.name_scope('TrainStep'):
            self.train_step = tf.train.AdamOptimizer(self.lr).minimize(self.loss)

        with tf.name_scope('SummaryWriter'):
            self.train_summary = tf.summary.merge([tf.summary.scalar("train_loss", self.loss)])
            self.reward_summary = tf.summary.merge([tf.summary.scalar("episode_reward", self.reward)])
            self.mean_reward_summary = tf.summary.merge([tf.summary.scalar("mean_episode_reward_50", self.rewards_mean)])
            self.writer = tf.summary.FileWriter(TRAIN_LOG_DIR, session.graph)

        self.init = tf.group(tf.global_variables_initializer(), tf.local_variables_initializer())
        self.session.run(self.init)

    def compute_action_proba(self, x):
        """Return action probabilities for observation(s) ``x``.

        ``x`` is reshaped to ``[-1, n_in]``, so a single observation yields
        an array of shape ``(1, n_out)``.
        """
        return self.session.run(self.action_proba, feed_dict={self.states: np.reshape(x, [-1, self.n_in])})

    def train(self, ep_states, ep_actions, ep_rewards):
        """Run one optimizer step on an episode and log the training loss.

        Args:
            ep_states: observations of the episode, one row per step.
            ep_actions: one-hot encoded actions, one row per step.
            ep_rewards: per-step weights for the loss (discounted returns).
        """
        self.global_step += 1
        self.summary, _ = self.session.run(
            [self.train_summary, self.train_step],
            feed_dict={self.states: ep_states, self.actions: ep_actions, self.discounted_rewards: ep_rewards})
        self.writer.add_summary(self.summary, self.global_step)

    def write_reward(self, ep, r, r_mean):
        """Write the episode reward ``r`` and running mean ``r_mean`` at step ``ep``."""
        self.summary_0, self.summary_1 = self.session.run(
            [self.reward_summary, self.mean_reward_summary],
            feed_dict={self.reward: r, self.rewards_mean: r_mean})
        self.writer.add_summary(self.summary_0, ep)
        self.writer.add_summary(self.summary_1, ep)

In [None]:
class Player:
    """Agent that interacts with a Gym-style environment and trains a policy.

    Collects the states, actions and rewards of the current episode and,
    on ``update_policy``, feeds them to the underlying ``Network``.
    """

    def __init__(self, tf_session, env):
        """Create the policy network sized from the environment's spaces.

        Args:
            tf_session: session handed to ``Network``.
            env: environment exposing ``observation_space``, ``action_space``,
                ``reset``, ``step`` and ``render``.
        """
        self.env = env
        self.obs_space = env.observation_space.shape[0]
        try:
            self.act_space = env.action_space.n
        except AttributeError:
            # Action spaces without an ``n`` attribute: fall back to the
            # first dimension of their shape. Only AttributeError is caught
            # so unrelated failures (and KeyboardInterrupt) still surface.
            self.act_space = env.action_space.shape[0]
        self.gamma = 0.95  # discount factor used by update_policy
        self.policy = Network(tf_session, self.obs_space, self.act_space)

        # Per-episode experience buffers; cleared by reset().
        self.ep_states = []
        self.ep_actions = []
        self.ep_rewards = []

    def gather_exp(self, observation, action, reward):
        """Append one (observation, action, reward) step to the episode buffers."""
        self.ep_states.append(observation)
        self.ep_actions.append(action)
        self.ep_rewards.append(reward)

    def choose_action(self, observation):
        """Sample an action index from the policy's distribution for ``observation``."""
        action_prob = self.policy.compute_action_proba(observation).ravel()
        return np.random.choice(list(range(self.act_space)), p=action_prob)

    def choose_random_action(self):
        """Sample an action index uniformly at random."""
        return np.random.choice(list(range(self.act_space)))

    def update_policy(self):
        """Train the policy on the buffered episode.

        Actions are one-hot encoded and rewards are converted with
        ``get_discounted_values`` using ``self.gamma`` before training.
        """
        self.one_hot_actions = get_one_hot(self.ep_actions, self.act_space)
        self.discounted_rewards = get_discounted_values(self.ep_rewards, self.gamma)
        self.policy.train(self.ep_states, self.one_hot_actions, self.discounted_rewards)

    def render(self):
        """Render the environment."""
        self.env.render()

    def take_action(self, action):
        """Step the environment with ``action`` and return ``env.step``'s result."""
        return self.env.step(action)

    def gather_reward(self, reward):
        """Append ``reward`` to the reward buffer only.

        Note: this does not touch the state/action buffers, so mixing it
        with ``gather_exp`` leaves the three buffers with different lengths.
        """
        self.ep_rewards.append(reward)

    def get_total_reward(self):
        """Return the undiscounted sum of the buffered rewards."""
        return sum(self.ep_rewards)

    def reset(self):
        """Clear the episode buffers and return ``env.reset()``'s result."""
        self.ep_states = []
        self.ep_actions = []
        self.ep_rewards = []
        return self.env.reset()

In [None]:
# Make sure the output directories exist before the FileWriter/Saver use them.
if not os.path.exists(MODEL_LOG_DIR):
    os.makedirs(MODEL_LOG_DIR)
if not os.path.exists(TRAIN_LOG_DIR):
    os.makedirs(TRAIN_LOG_DIR)

SEED = 1          # shared seed for the environment, NumPy and TensorFlow
N_EPISODES = 300  # number of training episodes

env = gym.make(ENV_NAME)
# NOTE(review): unwrapping presumably drops Gym's step-limit wrapper, so an
# episode only ends when the environment itself reports done -- confirm.
env = env.unwrapped
env.seed(SEED)
# Seeding only the environment is not enough for reproducible runs: actions
# are sampled with np.random.choice and the weights are initialized by
# TensorFlow. The graph-level seed must be set before the graph is built.
np.random.seed(SEED)
tf.set_random_seed(SEED)
sess = tf.Session()
player = Player(sess, env)
saver = tf.train.Saver()
ep_rewards = []

date_object = datetime.now()
current_time = date_object.strftime('%H:%M:%S')
print("Training starts -- {}".format(current_time))

for ep in range(N_EPISODES):

    observation = player.reset()
    done = False

    while not done:
        player.render()
        action = player.choose_action(observation)
        new_observation, reward, done, _ = player.take_action(action)
        # Store the observation the action was chosen from, not the new one.
        player.gather_exp(observation, action, reward)
        observation = new_observation

    # Episode finished: one policy update per episode, then log the rewards.
    player.update_policy()
    ep_rewards.append(player.get_total_reward())
    player.policy.write_reward(ep+1, ep_rewards[-1], np.mean(ep_rewards[-50:]))
    print('==========================\n')
    print('Episode {} with reward {}\n'.format(ep+1, ep_rewards[-1]))
    print('Mean reward {}\n'.format(np.mean(ep_rewards)))
    print('Max reward {}\n'.format(np.max(ep_rewards)))

# Push buffered summaries to disk so TensorBoard sees the complete run.
player.policy.writer.flush()

# NOTE(review): TRAIN_LOG_DIR ends with '/', so it is used as the checkpoint
# filename prefix and the files are named '-<step>.*' inside that directory.
# Left unchanged so code that loads this path keeps working.
saver.save(sess, TRAIN_LOG_DIR, global_step=ep+1)