In [13]:
import gym
import tensorflow as tf
import numpy as np
import math

In [14]:
class Agent:
    '''
    Policy-gradient agent (reward-to-go weighting) bound to one gym environment.

    The policy is a small dense network that maps an observation vector to
    logits over a discrete action set; actions are sampled from those logits.
    The TensorFlow 1.x graph, the session and the summary writer are created
    in __init__; call close() to release them.
    '''
    def __init__(self, env = None, learning_rate = 1e-2, seed = 0, sizes = (32,), activation = tf.tanh):
        '''
        Build the environment, the policy network, the loss and the optimizer.

        env           -- gym environment id, e.g. "CartPole-v0" (required)
        learning_rate -- Adam learning rate
        seed          -- seed applied to the environment and to the TensorFlow
                         graph; pass None to leave both unseeded
        sizes         -- units of each hidden layer, in order (may be empty)
        activation    -- activation used by every hidden layer
        '''

        assert env, "Value env is required"

        self.env = gym.make(env)
        # Keep the caller's seed (it used to be overwritten with 0) and apply it.
        # The graph-level seed has to be set before any random op is created.
        self.seed = seed
        if seed is not None:
            self.env.seed(seed)
            tf.set_random_seed(seed)
        self.obs = self.env.reset()
        self.activation = activation
        # Private copy, so a list passed by the caller is never shared.
        self.sizes = list(sizes)
        self.obs_dim = self.env.observation_space.shape[0]
        self.n_acts = self.env.action_space.n

        self.learning_rate = learning_rate

        self.obs_ph, self.logits, self.action = self.create_model()
        self.weights_ph, self.act_ph, self.loss = self.loss_func()
        #optimizer
        self.train_op = tf.train.AdamOptimizer(learning_rate = self.learning_rate).minimize(self.loss)

        self.sess = tf.InteractiveSession()
        self.sess.run(tf.global_variables_initializer())
        self.writer = tf.summary.FileWriter('./Agent', self.sess.graph)


    def create_model(self):
        '''
        Build the policy network.

        Input is a (None, obs_dim) float placeholder, followed by one dense
        layer per entry of self.sizes and a linear layer with n_acts units.
        Returns (obs_ph, logits, action), where action is one sample per row
        drawn from the categorical distribution defined by the logits.
        '''
        assert self.obs_dim #maybe obs_dim == 0, check this!
        obs_ph = tf.placeholder(dtype = tf.float32, shape=(None, self.obs_dim))
        # Every entry of self.sizes is a hidden layer: the output layer is added
        # separately below, so nothing must be sliced off the end of the list.
        hidden = obs_ph
        for units in self.sizes:
            hidden = tf.layers.dense(hidden, units = units, activation = self.activation)
        logits = tf.layers.dense(hidden, units = self.n_acts)
        action = tf.squeeze(tf.multinomial(logits=logits, num_samples=1), axis=1)

        return obs_ph, logits, action


    def act(self, obs):
        '''Sample one action from the current policy for a single observation.'''
        action = self.sess.run(self.action, feed_dict = {
            self.obs_ph : [obs]
        })
        return action[0]


    def play_episode(self, rendering = False):
        '''
        Play one episode with the current policy.

        Returns the total reward when rendering is truthy; otherwise returns
        (rewards, observations, actions, trajectory), where trajectory is a
        list of (observation, action) pairs.
        '''
        episode_obs_batch = list()
        episode_action_batch = list()
        episode_trajectory = list()
        rewards = list()

        self.obs = self.env.reset()
        while True:
            episode_obs_batch.append(self.obs.copy())
            if rendering:
                self.env.render()
            action = self.act(self.obs)
            episode_trajectory.append((self.obs, action))
            self.obs, reward, is_done, _ = self.env.step(action)

            episode_action_batch.append(action)
            rewards.append(reward)
            if is_done:
                break
        if rendering:
            return sum(rewards)
        return rewards, episode_obs_batch, episode_action_batch, episode_trajectory


    def play_epoch(self, n):
        '''
        Collect at least n environment steps with the current policy.

        Only whole episodes are stored, so the batch can be longer than n.
        Returns (observations, actions, per-episode trajectories,
        reward-to-go weights, mean episode reward).
        '''
        epoch_obs_batch = list()
        epoch_action_batch = list()
        epoch_rewards = list()
        epoch_trajectory = list()
        epoch_reward_to_go = list()

        while len(epoch_obs_batch) < n:
            reward, obs_batch, action_batch, trajectory_batch = self.play_episode()

            epoch_obs_batch += obs_batch
            epoch_action_batch += action_batch
            epoch_rewards.append(sum(reward))
            epoch_reward_to_go += list(self.reward_to_go(reward))
            epoch_trajectory.append(trajectory_batch)
        return epoch_obs_batch, epoch_action_batch, epoch_trajectory, epoch_reward_to_go, sum(epoch_rewards)/len(epoch_rewards)


    def reward_to_go(self, rews):
        '''
        Reward-to-go of every step: element i is sum(rews[i:]).

        Example: [1., 1., 1.] -> [3., 2., 1.]
        '''
        rews = np.asarray(rews)
        # Suffix sums: cumulative sum of the reversed rewards, reversed back.
        return np.cumsum(rews[::-1])[::-1]


    def train_epoch(self, batch_size = 5000, render = True):
        '''
        Run one policy-gradient update.

        batch_size -- minimum number of environment steps collected first
        render     -- when true, one extra rendered episode is played before
                      the update (needs a display)
        Returns (loss, mean episode reward of the collected batch).
        '''
        obs_batches, action_batches, _, reward_to_go, mean_reward = self.play_epoch(batch_size)
        if render:
            self.play_episode(True)
        loss, _ = self.optimize(obs_batches, action_batches, reward_to_go)
        return loss, mean_reward

    def can_solve(self, episodes = 100, target = 200):
        '''
        Check whether the policy reaches target total reward in every one of
        `episodes` consecutive episodes; stops at the first failure.
        '''
        for i in range(episodes):
            reward, _, _, _ = self.play_episode()
            total = sum(reward)
            if total < target:
                print("{} Cannot solve yet {}".format(i, total))
                return False
        return True


    def train_n_epochs(self, n, target = 200):
        '''
        Train for at most n epochs.

        Returns "SOLVED" as soon as an epoch's mean reward reaches target and
        can_solve() confirms it; returns None if that never happens.
        '''
        for i in range(n):
            loss, reward = self.train_epoch()
            if reward >= target:
                if self.can_solve(target = target):
                    return "SOLVED"

            print("#i:{} loss: {}, mean reward: {}".format(i, loss, reward))


    def loss_func(self):
        '''
        Build the policy-gradient loss: -mean(weight * log pi(action | obs)).

        Returns (weights_ph, act_ph, loss); both placeholders are 1-D with one
        entry per collected step.
        '''
        weights_ph = tf.placeholder(shape=(None,), dtype = tf.float32)
        act_ph = tf.placeholder(shape=(None,), dtype = tf.int32)

        # One-hot mask selects the log-probability of the action actually taken.
        action_masks = tf.one_hot(act_ph, self.n_acts)
        log_probs = tf.reduce_sum(action_masks * tf.nn.log_softmax(self.logits), axis=1)
        loss = -tf.reduce_mean(weights_ph * log_probs)
        return weights_ph, act_ph, loss


    def optimize(self, obs, act, weights):
        '''Run one optimizer step on the batch; returns [loss, train_op result].'''
        return self.sess.run([self.loss, self.train_op], feed_dict = {
            self.obs_ph : obs,
            self.act_ph: act,
            self.weights_ph: weights
        })


    def close(self):
        '''Release the summary writer, the TensorFlow session and the environment.'''
        self.writer.close()
        self.sess.close()
        self.env.close()

In [15]:
# Build the policy-gradient agent for the CartPole balancing task.
agent = Agent(env = "CartPole-v0")

UnregisteredEnv: No registered env with id: InvertedPendulumPyBulletEnv-v0

In [4]:
# Train for at most 500 epochs; the call returns "SOLVED" early on success.
agent.train_n_epochs(n = 500)

#i:0 loss: 17.38859748840332, mean reward: 37.65413533834587
#i:1 loss: 18.03504180908203, mean reward: 39.39370078740158
#i:2 loss: 16.7275390625, mean reward: 42.10084033613445
#i:3 loss: 19.99949836730957, mean reward: 49.65346534653465
#i:4 loss: 19.825490951538086, mean reward: 53.340425531914896
#i:5 loss: 21.259174346923828, mean reward: 59.11764705882353
#i:6 loss: 22.01289176940918, mean reward: 59.28235294117647
#i:7 loss: 23.762725830078125, mean reward: 63.2375
#i:8 loss: 28.84112548828125, mean reward: 77.3076923076923
#i:9 loss: 28.648921966552734, mean reward: 79.5
#i:10 loss: 28.27061653137207, mean reward: 81.29032258064517
#i:11 loss: 34.772422790527344, mean reward: 101.06
#i:12 loss: 34.323673248291016, mean reward: 104.6875
#i:13 loss: 41.642452239990234, mean reward: 123.14634146341463
#i:14 loss: 45.572906494140625, mean reward: 150.14705882352942
#i:15 loss: 48.93723678588867, mean reward: 158.71875
#i:16 loss: 50.05317687988281, mean reward: 171.3
#i:17 loss: 5

'SOLVED'

In [5]:
# Watch one rendered episode; with rendering on, the total reward is returned.
agent.play_episode(rendering = True)

200.0

In [6]:
# Close the training environment; agent.sess is left open and is used again below.
agent.env.close()

In [7]:
class RemoveRewardLimitation(gym.ActionWrapper):
    '''
    CartPole wrapper whose episodes end only on failure, never on a step cap.

    step(), reset(), render() and close() are re-implemented here instead of
    being delegated to the wrapped env, so `done` depends only on the cart
    position and pole angle thresholds checked in step().
    The code follows gym's classic_control/cartpole.py (link in reset()).

    NOTE(review): the physics constants (gravity, force_mag, tau, ...),
    np_random and the initial viewer are not defined in this class; presumably
    they are resolved on the wrapped CartPole env through gym's wrapper
    attribute forwarding -- confirm for the installed gym version.
    NOTE(review): state, steps_beyond_done and viewer are assigned on this
    wrapper object, so they likely do not update the wrapped env -- confirm.
    '''
    def __init__(self, env):
        '''Wrap env; no extra state is set up here.'''
        super().__init__(env)
        
    def step(self, action):
        '''
        Advance the cart-pole simulation by one step of size self.tau.

        action == 1 applies +force_mag to the cart, any other valid action
        applies -force_mag.
        Returns (state, reward, done, {}), with state = np.array of
        (x, x_dot, theta, theta_dot). The reward is 1.0 on every step up to
        and including the one that ends the episode, 0.0 on later steps.
        '''
        assert self.action_space.contains(action), "%r (%s) invalid"%(action, type(action))
        state = self.state
        x, x_dot, theta, theta_dot = state
        force = self.force_mag if action==1 else -self.force_mag
        costheta = math.cos(theta)
        sintheta = math.sin(theta)
        # Angular and linear accelerations of the pole and the cart.
        temp = (force + self.polemass_length * theta_dot * theta_dot * sintheta) / self.total_mass
        thetaacc = (self.gravity * sintheta - costheta* temp) / (self.length * (4.0/3.0 - self.masspole * costheta * costheta / self.total_mass))
        xacc  = temp - self.polemass_length * thetaacc * costheta / self.total_mass
        # Integrate: explicit Euler updates positions from the old velocities,
        # the other branch updates velocities first and then uses the new ones.
        if self.kinematics_integrator == 'euler':
            x  = x + self.tau * x_dot
            x_dot = x_dot + self.tau * xacc
            theta = theta + self.tau * theta_dot
            theta_dot = theta_dot + self.tau * thetaacc
        else: # semi-implicit euler
            x_dot = x_dot + self.tau * xacc
            x  = x + self.tau * x_dot
            theta_dot = theta_dot + self.tau * thetaacc
            theta = theta + self.tau * theta_dot
        self.state = (x,x_dot,theta,theta_dot)
        # The episode ends only when the cart leaves the track or the pole
        # tilts past the angle threshold; no step counter is involved.
        done =  x < -self.x_threshold \
                or x > self.x_threshold \
                or theta < -self.theta_threshold_radians \
                or theta > self.theta_threshold_radians
        done = bool(done)
        if not done:
            reward = 1.0
        elif self.steps_beyond_done is None:
            # Pole just fell!
            self.steps_beyond_done = 0
            reward = 1.0
        else:
            # step() was called again after done: warn once, then pay nothing.
            if self.steps_beyond_done == 0:
                print("You are calling 'step()' even though this environment has already returned done = True. You should always call 'reset()' once you receive 'done = True' -- any further steps are undefined behavior.")
            self.steps_beyond_done += 1
            reward = 0.0
        return np.array(self.state), reward, done, {}
    
    def reset(self):
        '''
        Start a new episode: draw the four state values uniformly from
        [-0.05, 0.05), clear steps_beyond_done and return the state as an array.

        Adapted from
        @https://github.com/openai/gym/blob/master/gym/envs/classic_control/cartpole.py
        '''
        self.state = self.np_random.uniform(low=-0.05, high=0.05, size=(4,))
        self.steps_beyond_done = None
        return np.array(self.state)

    def render(self, mode='human'):
        '''
        Draw the current state with gym's classic_control rendering Viewer.

        The viewer and its geometry (cart, pole, axle, track) are created on
        the first call, while self.viewer is None. Returns None when
        self.state is None, otherwise the result of viewer.render(), which is
        asked for an RGB array only when mode == 'rgb_array'.
        '''
        screen_width = 600
        screen_height = 400

        # Map world coordinates (track width = 2 * x_threshold) to pixels.
        world_width = self.x_threshold*2
        scale = screen_width/world_width
        carty = 100 # TOP OF CART
        polewidth = 10.0
        polelen = scale * (2 * self.length)
        cartwidth = 50.0
        cartheight = 30.0

        if self.viewer is None:
            # Imported lazily so a display is only needed when rendering.
            from gym.envs.classic_control import rendering
            self.viewer = rendering.Viewer(screen_width, screen_height)
            l,r,t,b = -cartwidth/2, cartwidth/2, cartheight/2, -cartheight/2
            axleoffset =cartheight/4.0
            cart = rendering.FilledPolygon([(l,b), (l,t), (r,t), (r,b)])
            self.carttrans = rendering.Transform()
            cart.add_attr(self.carttrans)
            self.viewer.add_geom(cart)
            l,r,t,b = -polewidth/2,polewidth/2,polelen-polewidth/2,-polewidth/2
            pole = rendering.FilledPolygon([(l,b), (l,t), (r,t), (r,b)])
            pole.set_color(.8,.6,.4)
            # The pole gets its own rotation transform plus the cart's
            # translation, so it turns about the axle and moves with the cart.
            self.poletrans = rendering.Transform(translation=(0, axleoffset))
            pole.add_attr(self.poletrans)
            pole.add_attr(self.carttrans)
            self.viewer.add_geom(pole)
            self.axle = rendering.make_circle(polewidth/2)
            self.axle.add_attr(self.poletrans)
            self.axle.add_attr(self.carttrans)
            self.axle.set_color(.5,.5,.8)
            self.viewer.add_geom(self.axle)
            self.track = rendering.Line((0,carty), (screen_width,carty))
            self.track.set_color(0,0,0)
            self.viewer.add_geom(self.track)

            self._pole_geom = pole

        if self.state is None: return None

        # Edit the pole polygon vertex
        pole = self._pole_geom
        l,r,t,b = -polewidth/2,polewidth/2,polelen-polewidth/2,-polewidth/2
        pole.v = [(l,b), (l,t), (r,t), (r,b)]

        # Position the cart from state[0] and rotate the pole by -state[2].
        x = self.state
        cartx = x[0]*scale+screen_width/2.0 # MIDDLE OF CART
        self.carttrans.set_translation(cartx, carty)
        self.poletrans.set_rotation(-x[2])

        return self.viewer.render(return_rgb_array = mode=='rgb_array')

    def close(self):
        '''Close the viewer if one is set and reset self.viewer to None.'''
        if self.viewer:
            self.viewer.close()
            self.viewer = None

In [8]:
# CartPole with the episode step cap bypassed by the custom wrapper.
env = RemoveRewardLimitation(gym.make("CartPole-v0"))

In [9]:
# Roll out the trained policy on the uncapped environment until the episode
# ends, accumulating the total reward, then report the final state and total.
rewards = 0
obs = env.reset()
is_done = False
try:
    while not is_done:
        env.render()
        # Agent.act samples one action for a single observation.
        action = agent.act(obs)
        obs, reward, is_done, _ = env.step(action)
        rewards += reward
    print(obs, rewards)
finally:
    # Release the render window even if the rollout raises.
    env.close()

[-2.4061323  -0.89856719 -0.07917451 -0.10530031] 329.0
