In [9]:
%pip install ribs[all] gym~=0.17.0 Box2D~=2.3.10 tqdm



In [10]:
import gym
import time
import numpy as np
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
import pylab

In [11]:
env = gym.make("LunarLander-v2")
seed = 1339
action_dim = env.action_space.n
obs_dim = env.observation_space.shape[0]

In [12]:
import keras 
import numpy as np
from keras.layers import Input,Dense
from keras import Model , Sequential
from tensorflow.keras.optimizers import Adam,RMSprop
import tensorflow as tf
from tensorboardX import SummaryWriter
import tensorflow_probability as tfp
from keras import backend as k
import os
import copy
import gym

In [13]:
class Actor_Model:
  def __init__(self,state_size,action_size ,lr):
    self.action_size = action_size
    x_input = Input(state_size)
    x1 = Dense(756,activation= "relu" ,  kernel_initializer=tf.random_normal_initializer(stddev=0.01))(x_input)
    x2 = Dense(378,activation = "relu" ,  kernel_initializer=tf.random_normal_initializer(stddev=0.01))(x1)
    x3 = Dense(63,activation="relu" , kernel_initializer=tf.random_normal_initializer(stddev=0.01))(x2)
    values = Dense(action_size , activation = "softmax" , kernel_initializer=tf.random_normal_initializer(stddev=0.01))(x3)
    self.Actor = Model(inputs = x_input , outputs = values)
    self.Actor.compile(loss = self.Actor_loss , optimizer = Adam(lr = lr))

  def Actor_loss(self,y_true,y_pred):
    advantages, predictions, actions = y_true[:, :1], y_true[:, 1:1+self.action_size], y_true[:, 1+self.action_size:]
    prob = actions * y_pred
    old_prob = actions*predictions
    cliping = 0.2
    entropy_loss  = 0.001
    beta = 3
    dtar = 0.003
    prob = k.clip(prob ,1e-8,1 )
    old_prob = k.clip(old_prob ,1e-8 , 1)
    ratio = k.exp(k.log(prob) - k.log(old_prob))
    clip = k.clip(ratio , min_value = 1-cliping , max_value = 1+cliping)
    s1 = ratio * advantages
    s2 = clip *  advantages
    
    #KL = k.mean(s1 - beta.kl(prob,old_prob))

    actor_loss = -k.mean(k.minimum(s1,s2))
    entropy = -(y_pred * k.log(y_pred))
    entropy = entropy_loss*k.mean(entropy)
    total_loss = actor_loss - entropy
    return total_loss
  def predict(self, state):
        return self.Actor.predict(state)

In [14]:
class Critic:
  def __init__(self,state_size,action_size , lr):
    x_input = Input(state_size)
    old_values = Input(shape = (1,))
    x1 = Dense(512,activation= 'relu' ,kernel_initializer=tf.random_normal_initializer(stddev=0.01))(x_input)
    x2 = Dense(256 ,activation= "relu" , kernel_initializer=tf.random_normal_initializer(stddev=0.01))(x1)
    x3 = Dense(32 ,activation="relu" , kernel_initializer=tf.random_normal_initializer(stddev=0.01))(x2)
    value = Dense(1 ,activation=None ,  kernel_initializer=tf.random_normal_initializer(stddev=0.01))(x3)
    self.Critic = Model(inputs = [x_input,old_values] , outputs = value)
    self.Critic.compile(loss = self.critic_loss(old_values) , optimizer = Adam(lr  = lr))
  def critic_loss(self, values):
        def loss(y_true, y_pred):
            LOSS_CLIPPING = 0.2
            clipped_value_loss = values + k.clip(y_pred - values, -LOSS_CLIPPING, LOSS_CLIPPING)
            v_loss1 = (y_true - clipped_value_loss) ** 2
            v_loss2 = (y_true - y_pred) ** 2
            
            value_loss = 0.5 * k.mean(k.maximum(v_loss1, v_loss2))
            #value_loss = K.mean((y_true - y_pred) ** 2) # standard PPO loss
            return value_loss
        return loss
  def predict(self, state):
        return self.Critic.predict([state, np.zeros((state.shape[0], 1))])

In [17]:
class Agent:
  def __init__(self,env_name):
    self.env = gym.make(env_name)
    self.state_size = self.env.observation_space.shape
    self.action_size = self.env.action_space.n
    self.lr1 = 0.001
    self.lr2 = 0.005
    self.gamma = 0.99
    self.lamda= 0.95
    self.Actor = Actor_Model(self.state_size , self.action_size ,self.lr1)
    self.Critic = Critic(self.state_size ,self.action_size , self.lr2)
    self.optimizer = Adam
    self.writer = SummaryWriter(comment = "_"+env_name+"_"+self.optimizer.__name__+"_"+str(self.lr1)+"_"+str(self.lr2))
    self.actor_name = "Actor.h5"
    self.critic_name = "critic.h5"
    self.episode = 0
    self.EPISODE = 100
  def act(self,state):
    predict = self.Actor.predict(state)[0]
    action = np.random.choice(self.action_size , p = predict)
    action_onehot = np.zeros([self.action_size])
    action_onehot[action] = 1
    return predict , action ,action_onehot
  def Padvantages(self,rewards,dones,values ,next_values ,normalize = True):
    deltas = [r+self.gamma*(1-d)*nv - v for r , d,nv,v in zip(rewards,dones , next_values,values)]
    deltas = np.stack(deltas)
    advantages = copy.deepcopy(deltas)
    for t in reversed(range(len(deltas)-1)):
      advantages[t]  = advantages[t] + self.gamma*self.lamda*(1-dones[t]) *advantages[t+1]
    target = advantages+values
    if normalize:
      advantages = (advantages - advantages.mean())/advantages.std()
    return np.vstack(advantages) ,np.vstack(target)
  def replay(self,rewards,dones,next_states ,states,actions,predictions):
    states = np.vstack(states)
    next_states = np.vstack(next_states)
    actions = np.vstack(actions)
    predictions = np.vstack(predictions)

    values = self.Critic.predict(states)
    next_values =self.Critic.predict(next_states)

    advantages , target = self.Padvantages(rewards,dones , np.squeeze(values) ,np.squeeze(next_values))
 
    y_true = np.hstack([advantages ,predictions,actions])

    self.Actor.Actor.fit(states , y_true , epochs = 10,verbose = 0)
    self.Critic.Critic.fit([states,values] ,target ,epochs = 10 ,verbose = 0)
  def load(self):
    self.Actor.Actor.load_weights(self.actor_name)
    self.Critic.Critic.load_weights(self.actor_name)
  def load(self):
    self.Actor.Actor.save_weights(self.actor_name)
    self.Critic.Critic.save_weights(self.actor_name)
  pylab.figure(figsize=(18, 9))
  pylab.subplots_adjust(left=0.05, right=0.98, top=0.96, bottom=0.06)
  def PlotModel(self, score, episode):
        self.scores_.append(score)
        self.episodes_.append(episode)
        self.average_.append(sum(self.scores_[-50:]) / len(self.scores_[-50:]))
        if str(episode)[-2:] == "00":# much faster than episode % 100
            pylab.plot(self.episodes_, self.scores_, 'b')
            pylab.plot(self.episodes_, self.average_, 'r')
            pylab.title(self.env_name+" PPO training cycle", fontsize=18)
            pylab.ylabel('Score', fontsize=18)
            pylab.xlabel('Steps', fontsize=18)
            try:
                pylab.grid(True)
                pylab.savefig(self.env_name+".png")
            except OSError:
                pass
        # saving best models
        if self.average_[-1] >= self.max_average:
            self.max_average = self.average_[-1]
            self.save()
            SAVING = "SAVING"
            # decreaate learning rate every saved model
            self.lr *= 0.95
            k.set_value(self.Actor.Actor.optimizer.learning_rate, self.lr1)
            k.set_value(self.Critic.Critic.optimizer.learning_rate, self.lr2)
      
        return self.average_[-1], SAVING
  def train(self):
    
      state = self.env.reset()
      state = np.reshape(state , [1,self.state_size[0]])
      done = False
      score = 0
      while True:
        rewards,dones,next_states ,states,actions,predictions = [],[],[],[],[],[]
        episodes = []
        scores =[]
        while not done:
          predict , action,action_onehot = self.act(state)
          next_state,reward,done,info = self.env.step(action)
          next_state = np.reshape(next_state , [1,self.state_size[0]])
          states.append(state)
          next_states.append(next_state)
          actions.append(action_onehot)
          rewards.append(reward)
          dones.append(done)
          predictions.append(predict)       
          state = next_state
          score += reward

          if done:
            self.episode += 1
            episodes.append(self.episode)
            scores.append(score)
            plt.plot(episodes , scores)
            print("==================================================================================================================================================")
            print("episodes:" , self.episode, "rewards:" , score )
            self.writer.add_scalar(f'Workers:{1}/score_per_episode', score, self.episode)
            self.writer.add_scalar(f'Workers:{1}/learning_rate', self.lr1,self.lr2, self.episode)
                   
            #self.replay(rewards,dones,next_states ,states,actions,predictions)
            state, done, score = self.env.reset(), False, 0
            state = np.reshape(state, [1, self.state_size[0]])
        if self.episode >= self.EPISODE:
          break
      self.env.close()

<Figure size 1296x648 with 0 Axes>

In [None]:
if __name__ =="__main__":
  name = "LunarLander-v2"
  agent = Agent(name)
  agent.train()

  super(Adam, self).__init__(name, **kwargs)


episodes: 1 rewards: -113.88709883972695
episodes: 2 rewards: -91.50327922177419
episodes: 3 rewards: -88.80597555672675
episodes: 4 rewards: -511.5873430179027
episodes: 5 rewards: -147.1628886230149
episodes: 6 rewards: -212.96445131817006
episodes: 7 rewards: -79.59444099225732
episodes: 8 rewards: -166.16284063863452
episodes: 9 rewards: -115.50747475531782
episodes: 10 rewards: -166.45719665382524
episodes: 11 rewards: -337.4851026634143
episodes: 12 rewards: -240.97288626635267
episodes: 13 rewards: -138.8128823706997
episodes: 14 rewards: -247.8924921954924
episodes: 15 rewards: -143.47465369795322
episodes: 16 rewards: -131.75538026133762
episodes: 17 rewards: -277.94417533031435
episodes: 18 rewards: -379.58242202888135
episodes: 19 rewards: -101.3142293012564
episodes: 20 rewards: -360.5792638432061
episodes: 21 rewards: -211.6291080107072
episodes: 22 rewards: -204.15299120415426
episodes: 23 rewards: -113.11590050769601
episodes: 24 rewards: -127.90516304802574
episodes: 25