In [1]:
import warnings
warnings.filterwarnings('ignore')
warnings.simplefilter(action='ignore', category=FutureWarning)
import gym
import pandas as pd
import numpy as np
import tensorflow as tf
import os
import os.path
import time
import pickle
from PIL import Image
import shutil
import matplotlib.pyplot as plt
import random
import cv2
from datetime import datetime

ENV_NAME = 'CartPole-v0'
MODEL_LOG_DIR = '../log/'
TRAIN_LOG_DIR = MODEL_LOG_DIR + 'train/' + 'pg/'

In [2]:
def get_one_hot(y, n_class):
    y = np.array(y)
    res = np.eye(n_class)[np.array(y).reshape(-1)]
    return res.reshape(list(y.shape)+[n_class])

def get_discounted_values(values, gamma):
    n = len(values)
    discounted_values = np.zeros_like(values)
    accumulative = 0.0
    index = list(range(n))
    index.reverse()
    for i in index:
        accumulative = values[i] + gamma * accumulative
        discounted_values[i] = accumulative
        
    discounted_values = (discounted_values - np.mean(discounted_values))/np.std(discounted_values)
    return discounted_values

In [3]:
class Network:
    def __init__(self, session, n_in, n_out):
        
        self.session = session
        self.n_in = n_in
        self.n_out = n_out
        self.global_step = 0
        
        self.n1 = 10
        self.n2 = 2
        
        with tf.name_scope('PlaceHolders'):
            self.states = tf.placeholder(tf.float32, [None, self.n_in], name='states')
            self.actions = tf.placeholder(tf.float32, [None, self.n_out], name='actions')
            self.discounted_rewards = tf.placeholder(tf.float32, [None, ], name='dist_rewards')
            
            self.reward = tf.placeholder(tf.float32, [], name='reward')
            self.rewards_mean = tf.placeholder(tf.float32, [], name='rewards')
            
        with tf.name_scope('FullyConnectedLayer_1'):
            self.W_fc1 = tf.get_variable('W_fc1', shape=[self.n_in, self.n1])
            self.b_fc1 = tf.get_variable('b_fc1', shape=[self.n1])
            self.h_fc1 = tf.nn.relu(tf.add(tf.matmul(self.states, self.W_fc1), self.b_fc1))
            
        with tf.name_scope('FullyConnectedLayer_2'):
            self.W_fc2 = tf.get_variable('W_fc2', shape=[self.n1,self.n2])
            self.b_fc2 = tf.get_variable('b_fc2', shape=[self.n2])
            self.h_fc2 = tf.nn.relu(tf.add(tf.matmul(self.h_fc1, self.W_fc2), self.b_fc2))
            
        with tf.name_scope('PolicyLayer'):
            self.W_fc3 = tf.get_variable('W_fc5', shape=[self.n2, self.n_out])
            self.b_fc3 = tf.get_variable('b_fc5', shape=[self.n_out])
            self.action_proba = tf.nn.softmax(tf.add(tf.matmul(self.h_fc2, self.W_fc3), self.b_fc3), name='Policy')
        
        with tf.name_scope('LearningRate'):
            self.lr = 0.01
            
        with tf.name_scope('Loss'):
            self.neg_log_proba = tf.nn.softmax_cross_entropy_with_logits_v2(logits = self.action_proba, labels = self.actions)
            self.loss = tf.reduce_mean(self.discounted_rewards * self.neg_log_proba)
            
        with tf.name_scope('TrainStep'):
            self.train_step = tf.train.AdamOptimizer(self.lr).minimize(self.loss)
        
        with tf.name_scope('SummaryWriter'):
            self.train_summary = tf.summary.merge([tf.summary.scalar("train_loss", self.loss)])
            self.reward_summary = tf.summary.merge([tf.summary.scalar("episode_reward", self.reward)])
            self.mean_reward_summary = tf.summary.merge([tf.summary.scalar("mean_episode_reward_50", self.rewards_mean)])
            self.writer = tf.summary.FileWriter(TRAIN_LOG_DIR, session.graph)
            
        self.init = tf.group(tf.global_variables_initializer(), tf.local_variables_initializer())
        self.session.run(self.init)

    def compute_action_proba(self, x):
        return self.session.run(self.action_proba, feed_dict={self.states:np.reshape(x,[-1, self.n_in])})
    
    def train(self, ep_states, ep_actions, ep_rewards):
        self.global_step += 1
        self.summary, _ = self.session.run([self.train_summary, self.train_step], feed_dict={self.states: ep_states, self.actions: ep_actions, self.discounted_rewards: ep_rewards})
        self.writer.add_summary(self.summary, self.global_step)
        
    def write_reward(self, ep, r, r_mean):
        self.summary_0, self.summary_1 = self.session.run([self.reward_summary, self.mean_reward_summary], feed_dict={self.reward: r, self.rewards_mean: r_mean})
        self.writer.add_summary(self.summary_0, ep)
        self.writer.add_summary(self.summary_1, ep)

In [4]:
class Player:
    def __init__(self, tf_session, env):
        self.env = env
        self.obs_space = env.observation_space.shape[0]
        try:
            self.act_space = env.action_space.n
        except:
            self.act_space = env.action_space.shape[0]
        self.gamma = 0.95
        self.policy = Network(tf_session, self.obs_space, self.act_space)
        
        self.ep_states = []
        self.ep_actions = []
        self.ep_rewards = []

    def gather_exp(self, observation, action, reward):
        self.ep_states.append(observation)
        self.ep_actions.append(action)
        self.ep_rewards.append(reward)

    def choose_action(self, observation):
        action_prob = self.policy.compute_action_proba(observation).ravel()
        return np.random.choice(list(range(self.act_space)),p = action_prob)
    
    def choose_random_action(self):
        return np.random.choice(list(range(self.act_space)))

    def update_policy(self):
        self.one_hot_actions = get_one_hot(self.ep_actions, self.act_space)
        self.discounted_rewards = get_discounted_values(self.ep_rewards, self.gamma)
        self.policy.train(self.ep_states, self.one_hot_actions,self.discounted_rewards)
        
    def render(self):
        self.env.render()
        
    def take_action(self, action):
        return self.env.step(action)

    def gather_reward(self, reward):
        self.ep_rewards.append(reward)

    def get_total_reward(self):
        return sum(self.ep_rewards)
        
    def reset(self):
        self.ep_states = []
        self.ep_actions = []
        self.ep_rewards = []
        return self.env.reset()

In [5]:
if not os.path.exists(MODEL_LOG_DIR):
    os.makedirs(MODEL_LOG_DIR)
if not os.path.exists(TRAIN_LOG_DIR):
    os.makedirs(TRAIN_LOG_DIR)

env = gym.make(ENV_NAME)
env = env.unwrapped
env.seed(1)
sess = tf.Session()
player = Player(sess, env)  
saver = tf.train.Saver()
ep_rewards = []

date_object = datetime.now()
current_time = date_object.strftime('%H:%M:%S')
print("Training starts -- {}".format(current_time))

for ep in range(300):
    
    observation = player.reset()
    
    while(True):
        player.render()
        action = player.choose_action(observation)
        new_observation, reward, done, _ = player.take_action(action)
        player.gather_exp(observation, action, reward)
        observation = new_observation

        if done:
            player.update_policy()
            ep_rewards.append(player.get_total_reward())
            player.policy.write_reward(ep+1, ep_rewards[-1], np.mean(ep_rewards[-50:]))
            print('==========================\n')
            print('Episode {} with reward {}\n'.format(ep+1, ep_rewards[-1]))
            print('Mean reward {}\n'.format(np.mean(ep_rewards)))
            print('Max reward {}\n'.format(np.max(ep_rewards)))
            break

saver.save(sess, TRAIN_LOG_DIR, global_step=ep+1)

Instructions for updating:
Colocations handled automatically by placer.
Training starts -- 20:50:24

Episode 1 with reward 44.0

Mean reward 44.0

Max reward 44.0


Episode 2 with reward 10.0

Mean reward 27.0

Max reward 44.0


Episode 3 with reward 14.0

Mean reward 22.666666666666668

Max reward 44.0


Episode 4 with reward 17.0

Mean reward 21.25

Max reward 44.0


Episode 5 with reward 17.0

Mean reward 20.4

Max reward 44.0


Episode 6 with reward 25.0

Mean reward 21.166666666666668

Max reward 44.0


Episode 7 with reward 40.0

Mean reward 23.857142857142858

Max reward 44.0


Episode 8 with reward 23.0

Mean reward 23.75

Max reward 44.0


Episode 9 with reward 14.0

Mean reward 22.666666666666668

Max reward 44.0


Episode 10 with reward 32.0

Mean reward 23.6

Max reward 44.0


Episode 11 with reward 19.0

Mean reward 23.181818181818183

Max reward 44.0


Episode 12 with reward 13.0

Mean reward 22.333333333333332

Max reward 44.0


Episode 13 with reward 11.0

Mean reward 2


Episode 78 with reward 43.0

Mean reward 19.102564102564102

Max reward 54.0


Episode 79 with reward 13.0

Mean reward 19.025316455696203

Max reward 54.0


Episode 80 with reward 12.0

Mean reward 18.9375

Max reward 54.0


Episode 81 with reward 19.0

Mean reward 18.938271604938272

Max reward 54.0


Episode 82 with reward 34.0

Mean reward 19.121951219512194

Max reward 54.0


Episode 83 with reward 15.0

Mean reward 19.072289156626507

Max reward 54.0


Episode 84 with reward 33.0

Mean reward 19.238095238095237

Max reward 54.0


Episode 85 with reward 14.0

Mean reward 19.176470588235293

Max reward 54.0


Episode 86 with reward 17.0

Mean reward 19.151162790697676

Max reward 54.0


Episode 87 with reward 16.0

Mean reward 19.114942528735632

Max reward 54.0


Episode 88 with reward 14.0

Mean reward 19.056818181818183

Max reward 54.0


Episode 89 with reward 13.0

Mean reward 18.98876404494382

Max reward 54.0


Episode 90 with reward 17.0

Mean reward 18.966666666666665

Ma


Episode 156 with reward 11.0

Mean reward 18.28846153846154

Max reward 54.0


Episode 157 with reward 16.0

Mean reward 18.273885350318473

Max reward 54.0


Episode 158 with reward 12.0

Mean reward 18.234177215189874

Max reward 54.0


Episode 159 with reward 41.0

Mean reward 18.37735849056604

Max reward 54.0


Episode 160 with reward 19.0

Mean reward 18.38125

Max reward 54.0


Episode 161 with reward 10.0

Mean reward 18.32919254658385

Max reward 54.0


Episode 162 with reward 28.0

Mean reward 18.38888888888889

Max reward 54.0


Episode 163 with reward 9.0

Mean reward 18.33128834355828

Max reward 54.0


Episode 164 with reward 10.0

Mean reward 18.28048780487805

Max reward 54.0


Episode 165 with reward 16.0

Mean reward 18.266666666666666

Max reward 54.0


Episode 166 with reward 14.0

Mean reward 18.240963855421686

Max reward 54.0


Episode 167 with reward 26.0

Mean reward 18.2874251497006

Max reward 54.0


Episode 168 with reward 10.0

Mean reward 18.2380952380952


Episode 234 with reward 15.0

Mean reward 18.495726495726494

Max reward 57.0


Episode 235 with reward 23.0

Mean reward 18.514893617021276

Max reward 57.0


Episode 236 with reward 19.0

Mean reward 18.516949152542374

Max reward 57.0


Episode 237 with reward 11.0

Mean reward 18.485232067510548

Max reward 57.0


Episode 238 with reward 19.0

Mean reward 18.48739495798319

Max reward 57.0


Episode 239 with reward 15.0

Mean reward 18.472803347280333

Max reward 57.0


Episode 240 with reward 9.0

Mean reward 18.433333333333334

Max reward 57.0


Episode 241 with reward 17.0

Mean reward 18.42738589211618

Max reward 57.0


Episode 242 with reward 11.0

Mean reward 18.39669421487603

Max reward 57.0


Episode 243 with reward 14.0

Mean reward 18.378600823045268

Max reward 57.0


Episode 244 with reward 20.0

Mean reward 18.385245901639344

Max reward 57.0


Episode 245 with reward 12.0

Mean reward 18.35918367346939

Max reward 57.0


Episode 246 with reward 24.0

Mean reward 18

'../log/train/pg/-300'

In [6]:
player.ep_actions

[0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0]

In [7]:
np.vstack(np.array(player.one_hot_actions))

array([[1., 0.],
       [0., 1.],
       [1., 0.],
       [0., 1.],
       [0., 1.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [0., 1.],
       [1., 0.],
       [1., 0.]])

In [8]:
player.ep_rewards

[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]

In [9]:
np.array(player.ep_states).shape

(15, 4)

In [10]:
action_prob = player.policy.compute_action_proba(observation).ravel()
action_prob

array([0.6459801 , 0.35401988], dtype=float32)

In [11]:
get_one_hot([1,2,3,1], 4)

array([[0., 1., 0., 0.],
       [0., 0., 1., 0.],
       [0., 0., 0., 1.],
       [0., 1., 0., 0.]])

In [12]:
player.obs_space

4