In [15]:
#Imports
from dm_control import mujoco

# Access to enums and MuJoCo library functions.
from dm_control.mujoco.wrapper.mjbindings import enums
from dm_control.mujoco.wrapper.mjbindings import mjlib

# PyMJCF
from dm_control import mjcf

# Composer high level imports
from dm_control import composer
from dm_control.composer.observation import observable
from dm_control.composer import variation

# Imports for Composer tutorial example
from dm_control.composer.variation import distributions
from dm_control.composer.variation import noises
from dm_control.locomotion.arenas import floors

# Control Suite
from dm_control import suite

# Run through corridor example
from dm_control.locomotion.walkers import cmu_humanoid
from dm_control.locomotion.arenas import corridors as corridor_arenas
from dm_control.locomotion.tasks import corridors as corridor_tasks

# Soccer
from dm_control.locomotion import soccer

# Manipulation
from dm_control import manipulation
import tensorflow as tf
import gym
from keras.models import Sequential
from keras.layers import Dense, Activation, Input, Dropout, Normalization
from tensorflow.keras.optimizers import Adam
import numpy as np
import cv2
import os
from collections import deque
from keras.callbacks import TensorBoard
import time
import random
#HELPER FUNCTION

# General
import copy
import os
import itertools
from IPython.display import clear_output
import numpy as np

# Graphics-related
import matplotlib
import matplotlib.animation as animation
import matplotlib.pyplot as plt
from IPython.display import HTML
import PIL.Image

In [16]:
#Helper functions

SMALL_SIZE = 8
MEDIUM_SIZE = 10
BIGGER_SIZE = 12
plt.rc('font', size=SMALL_SIZE)          # controls default text sizes
plt.rc('axes', titlesize=SMALL_SIZE)     # fontsize of the axes title
plt.rc('axes', labelsize=MEDIUM_SIZE)    # fontsize of the x and y labels
plt.rc('xtick', labelsize=SMALL_SIZE)    # fontsize of the tick labels
plt.rc('ytick', labelsize=SMALL_SIZE)    # fontsize of the tick labels
plt.rc('legend', fontsize=SMALL_SIZE)    # legend fontsize
plt.rc('figure', titlesize=BIGGER_SIZE)  # fontsize of the figure title
%config InlineBackend.figure_format = 'svg'
# Inline video helper function
if os.environ.get('COLAB_NOTEBOOK_TEST', False):
  # We skip video generation during tests, as it is quite expensive.
  display_video = lambda *args, **kwargs: None
else:
  def display_video(frames, framerate=30):
    """Encode `frames` as an H.264 video and write it to 'cartpole.mp4'.

    Args:
      frames: Non-empty sequence of images; each must expose a three-element
        `.shape` of (height, width, channels). The first frame fixes the
        figure size.
      framerate: Playback rate in frames per second. It drives both the
        animation interval and the fps of the saved file.

    Returns:
      None. The video is written to 'cartpole.mp4' in the working directory.
    """
    height, width, _ = frames[0].shape
    dpi = 70
    orig_backend = matplotlib.get_backend()
    matplotlib.use('Agg')  # Switch to headless 'Agg' to inhibit figure rendering.
    try:
      fig, ax = plt.subplots(1, 1, figsize=(width / dpi, height / dpi), dpi=dpi)
    finally:
      # Switch back to the original backend even if figure creation fails.
      matplotlib.use(orig_backend)
    try:
      ax.set_axis_off()
      ax.set_aspect('equal')
      ax.set_position([0, 0, 1, 1])
      im = ax.imshow(frames[0])
      def update(frame):
        im.set_data(frame)
        return [im]
      interval = 1000/framerate
      anim = animation.FuncAnimation(fig=fig, func=update, frames=frames,
                                     interval=interval, blit=True, repeat=False)
      # Save at the requested rate; a fixed fps would play the clip at the
      # wrong speed whenever framerate != 30.
      anim.save('cartpole.mp4', fps=framerate, extra_args=['-vcodec', 'libx264'])
    finally:
      # Release the figure so repeated calls do not accumulate open figures.
      plt.close(fig)
    
class ModifiedTensorBoard(TensorBoard):
    """TensorBoard callback that logs scalars through one writer at a caller-set step.

    All stats are written with the single file writer created in `__init__`,
    tagged with `self.step`. The callback never advances `self.step` itself;
    the owner is expected to assign it.
    """

    def __init__(self, **kwargs):
        """Forward `kwargs` to `TensorBoard` and open one writer on `log_dir`."""
        super().__init__(**kwargs)
        self.step = 1
        self.writer = tf.summary.create_file_writer(self.log_dir)
        self._log_write_dir = self.log_dir

    def set_model(self, model):
        """Attach `model` without delegating to the parent's `set_model`.

        Only the private attributes below are populated and graph writing is
        switched off.
        NOTE(review): presumably these are the attributes the parent callback
        reads during `fit` -- confirm against the installed Keras version.
        """
        self.model = model

        self._train_dir = os.path.join(self._log_write_dir, 'train')
        self._train_step = self.model._train_counter

        self._val_dir = os.path.join(self._log_write_dir, 'validation')
        self._val_step = self.model._test_counter

        self._should_write_train_graph = False

    def on_epoch_end(self, epoch, logs=None):
        """Write the epoch's `logs` as scalars; a missing `logs` writes nothing."""
        # `logs` defaults to None, and `**None` would raise TypeError.
        self.update_stats(**(logs or {}))

    def on_batch_end(self, batch, logs=None):
        """Intentionally a no-op: nothing is logged per batch."""
        pass

    def on_train_end(self, _):
        """Intentionally a no-op: the writer stays usable after `fit` returns."""
        pass

    def update_stats(self, **stats):
        """Write each keyword argument as a scalar at `self.step`, then flush once."""
        with self.writer.as_default():
            for key, value in stats.items():
                tf.summary.scalar(key, value, step = self.step)
            # One flush per call instead of one per scalar.
            self.writer.flush()


In [17]:
#DQN agent

class DQN_agent():
    """Deep Q-learning agent with an online network, a target network and a replay buffer.

    Transitions are stored as `[state, action, reward, new_state, done]`.

    Args:
        replay_memory_size: Maximum number of transitions kept (oldest dropped first).
        min_replay_memory_size: `train` does nothing until at least this many
            transitions are stored. The default equals `replay_memory_size`,
            so training only starts once the buffer is completely full.
        minibatch_size: Number of transitions sampled per `train` call; also
            the `batch_size` passed to `fit`.
        discount: Factor applied to the best target-network Q-value of the next state.
        update_target_every: The target network is synchronised once the count
            of terminal-state `train` calls *exceeds* this value.

    Raises:
        ValueError: If the sizes are inconsistent, so that training could never
            start or a minibatch could never be sampled.
    """

    def __init__(self, replay_memory_size=10000, min_replay_memory_size=10000,
                 minibatch_size=400, discount=0.99, update_target_every=5):
        if minibatch_size > min_replay_memory_size:
            raise ValueError("minibatch_size must not exceed min_replay_memory_size")
        if replay_memory_size is not None and min_replay_memory_size > replay_memory_size:
            raise ValueError("min_replay_memory_size must not exceed replay_memory_size")

        self.min_replay_memory_size = min_replay_memory_size
        self.minibatch_size = minibatch_size
        self.discount = discount
        self.update_target_every = update_target_every

        self.model = self.create_model()

        # The target network starts as an exact copy of the online network.
        self.target_model = self.create_model()
        self.target_model.set_weights(self.model.get_weights())

        self.replay_memory = deque(maxlen=replay_memory_size)
        self.tensorboard = ModifiedTensorBoard(log_dir="logs/{}-{}".format('cartpole', int(time.time())))
        self.target_update_counter = 0

    def create_model(self):
        """Build and compile the Q-network: 4 inputs -> 100 -> 100 -> 2 linear outputs."""
        model = Sequential()
        model.add(Input(shape=(4,)))
        model.add(Dense(100))
        model.add(Activation('relu'))
        # NOTE(review): these Normalization layers are never adapt()-ed and are
        # given no mean/variance here -- confirm whether BatchNormalization was
        # intended instead.
        model.add(Normalization())
        model.add(Dropout(0.2))
        model.add(Dense(100))
        model.add(Activation('relu'))
        model.add(Normalization())
        model.add(Dense(2))
        model.add(Activation('linear'))

        model.compile(loss = 'mse' ,optimizer=Adam(learning_rate=  0.0005))

        return model

    def update_replay_memory(self,transition):
        """Append one `[state, action, reward, new_state, done]` transition."""
        self.replay_memory.append(transition)

    def get_qs(self,state):
        """Return the online network's Q-values for the first row of `state`.

        `state` is passed to `predict` unchanged, so it must already carry a
        leading batch dimension.
        """
        return self.model.predict(state, verbose=0)[0]

    def train(self, terminal_state, step):
        """Fit the online network on one random minibatch of stored transitions.

        Args:
            terminal_state: True when the latest environment step ended the
                episode. Enables TensorBoard logging for this fit and advances
                the target-network update counter.
            step: Unused; kept so existing callers keep working.
        """
        # Start training only once enough samples have been collected.
        if len(self.replay_memory) < self.min_replay_memory_size:
            return

        minibatch = random.sample(self.replay_memory, self.minibatch_size)

        current_states = np.array([transition[0] for transition in minibatch])
        actions = np.array([transition[1] for transition in minibatch], dtype=int)
        rewards = np.array([transition[2] for transition in minibatch], dtype=float)
        new_current_states = np.array([transition[3] for transition in minibatch])
        dones = np.array([transition[4] for transition in minibatch], dtype=bool)

        # Online network supplies the values to correct; the target network
        # supplies the bootstrap estimate for the next state.
        target_qs = np.array(self.model.predict(current_states, verbose=0))
        future_qs = self.target_model.predict(new_current_states, verbose=0)

        # Terminal transitions get the bare reward, the rest bootstrap.
        max_future_q = np.max(future_qs, axis=1)
        new_qs = np.where(dones, rewards, rewards + self.discount * max_future_q)

        # Only the Q-value of the action actually taken is overwritten.
        target_qs[np.arange(len(minibatch)), actions] = new_qs

        # Fit on all samples as one batch, log only on terminal state.
        self.model.fit(current_states, target_qs, batch_size=self.minibatch_size,
                       verbose=0, shuffle=False,
                       callbacks=[self.tensorboard] if terminal_state else None)

        if terminal_state:
            self.target_update_counter += 1

        # Once the counter exceeds the threshold, copy the online weights across.
        if self.target_update_counter > self.update_target_every:
            self.target_model.set_weights(self.model.get_weights())
            self.target_update_counter = 0


In [18]:
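#EPISODE (MuJoCo) -- disabled: the whole cell body is a triple-quoted string, so run_episode is never defined.
# NOTE(review): the disabled code reshapes states to (-1,5) and handles three actions, while
# DQN_agent.create_model takes 4 inputs and emits 2 Q-values; reconcile before re-enabling.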
'''
# Load the environment
def run_episode(epsilon,film,step,rewards):
  env = suite.load('cartpole', 'balance')

  duration = 4 # Seconds
  frames = []
  ticks = []
  
  observations = []
  actions = []
  time_step = env.reset()
  
  while env.physics.data.time < duration:
    transition = []
    observation = time_step.observation
    current_pos = np.array(list((observation.items()))[0][1])
    current_vel = np.array(list((observation.items()))[1][1])
    current_state = np.concatenate(( current_pos/np.linalg.norm(current_pos),current_vel/np.linalg.norm(current_vel)))
    transition.append(current_state)
    attempt = agent.get_qs((np.array(current_state)).reshape(-1,5))
    attempt_max = np.argmax(attempt)

    transition.append(attempt_max)
    if np.random.random()>epsilon:
      if attempt_max == 0:
        action =1
      elif attempt_max ==1:
        action =-1
      elif attempt_max==2:
        action = 0 
    else:
      rand = np.random.random()
      if rand >0.8:
        action = 1
      elif 0.8>rand>0.6:
        action =-1
      elif 0.6>rand>0.4:
        action =0
      elif 0.4>rand>0.2:
        action =1
      else:
        action = -1
        
    actions.append(action)
    time_step = env.step(action)
    observation = time_step.observation
 
    new_current_pos = np.array(list((observation.items()))[0][1])
    new_current_vel = np.array(list((observation.items()))[1][1])
    new_current_state = np.concatenate(( new_current_pos/np.linalg.norm(new_current_pos),new_current_vel/np.linalg.norm(new_current_vel)))
    
    reward =time_step.reward
    rewards.append(reward)
    transition.append(reward)
    transition.append(new_current_state)
    done = False
    transition.append(done)
    agent.update_replay_memory(transition)
    
    agent.train(done,step)
    current_state = new_current_state
    if film == True:
      camera0 = env.physics.render(camera_id=0, height=200, width=200)
      camera1 = env.physics.render(camera_id=1, height=200, width=200)
      frames.append(np.hstack((camera0, camera1)))
      rewards.append(time_step.reward)
      observations.append(copy.deepcopy(time_step.observation))
      ticks.append(env.physics.data.time)
      
    
  
  
  done = True
  transition=[current_state,attempt_max,reward,new_current_state,done]
  agent.update_replay_memory(transition)
  agent.train(done,step)
  
  
  if film == True:
    print("FILMING")
    html_video = display_video(frames, framerate=1./env.control_timestep())
    html_video
    return actions
'''

'\n# Load the environment\ndef run_episode(epsilon,film,step,rewards):\n  env = suite.load(\'cartpole\', \'balance\')\n\n  duration = 4 # Seconds\n  frames = []\n  ticks = []\n  \n  observations = []\n  actions = []\n  time_step = env.reset()\n  \n  while env.physics.data.time < duration:\n    transition = []\n    observation = time_step.observation\n    current_pos = np.array(list((observation.items()))[0][1])\n    current_vel = np.array(list((observation.items()))[1][1])\n    current_state = np.concatenate(( current_pos/np.linalg.norm(current_pos),current_vel/np.linalg.norm(current_vel)))\n    transition.append(current_state)\n    attempt = agent.get_qs((np.array(current_state)).reshape(-1,5))\n    attempt_max = np.argmax(attempt)\n\n    transition.append(attempt_max)\n    if np.random.random()>epsilon:\n      if attempt_max == 0:\n        action =1\n      elif attempt_max ==1:\n        action =-1\n      elif attempt_max==2:\n        action = 0 \n    else:\n      rand = np.random.ran

In [31]:
#Episode openai gym
def run_episode_2(epsilon,step,render,rewards,max_steps=100):
    """Play one epsilon-greedy CartPole-v1 episode, training the global `agent` after every step.

    Args:
        epsilon: Exploration rate. A uniform draw greater than `epsilon`
            selects the greedy action; otherwise the action is random.
        step: Passed through unchanged to `agent.train`.
        render: When truthy, `env.render()` is called before every step.
        rewards: List that receives the reward of every step (mutated in place).
        max_steps: Upper bound on the number of environment steps.

    The episode stops as soon as the environment reports `done`. The
    environment is closed even if a step or training call raises.
    """
    env = gym.make('CartPole-v1')
    try:
        observation = env.reset()
        for _ in range(max_steps):
            if render:
                env.render()

            if np.random.random() > epsilon:
                # Exploit: take the action with the highest predicted Q-value.
                action = int(np.argmax(agent.get_qs(observation.reshape(-1, 4))))
            else:
                # Explore: the if/else form always assigns an action, even at
                # exactly 0.5.
                action = 0 if np.random.random() > 0.5 else 1

            new_observation, reward, done, _info = env.step(action)
            rewards.append(reward)
            agent.update_replay_memory([observation, action, reward, new_observation, done])
            agent.train(done, step)
            observation = new_observation

            # Stepping a finished environment would store meaningless
            # transitions and repeatedly signal a terminal state to the agent.
            if done:
                break
    finally:
        env.close()

In [33]:
# Module-level agent; run_episode_2 looks this name up as a global.
# Re-running this cell builds fresh networks and an empty replay memory.
agent = DQN_agent()

In [35]:
#training loop

# Exploration schedule and bookkeeping counters for the run.
num_episodes = 500
epsilon = 1
decay = 0.99
min_epsilon = 0.001
step = 1
ep_num = 1

for episode in range(1, num_episodes + 1):
    rewards = []
    done = False
    # TensorBoard scalars for this episode are tagged with the episode index.
    agent.tensorboard.step = episode
    print(f"Episode {ep_num}")
    run_episode_2(epsilon, step, False, rewards)
    step += 1
    #plt.plot(np.linspace(0,len(rewards),len(rewards)),rewards)
    # Number of steps that yielded a reward of exactly 1.
    total = sum(1 for step_reward in rewards if step_reward == 1)
    print(total)
    plt.show()
    # Decay epsilon multiplicatively, but never to min_epsilon or below.
    decayed_epsilon = epsilon * decay
    if decayed_epsilon > min_epsilon:
        epsilon = decayed_epsilon
        print(epsilon)
    ep_num += 1
# One final rendered episode with the last epsilon and rewards list.
run_episode_2(epsilon, step, True, rewards)



Episode 1


  logger.warn(


13
0.99
Episode 2
17
0.9801
Episode 3
81
0.9702989999999999
Episode 4
13
0.96059601
Episode 5
31
0.9509900498999999
Episode 6
12
0.9414801494009999
Episode 7
26
0.9320653479069899
Episode 8
18
0.92274469442792
Episode 9
18
0.9135172474836407
Episode 10
18
0.9043820750088043
Episode 11
21
0.8953382542587163
Episode 12
12
0.8863848717161291
Episode 13
11
0.8775210229989678
Episode 14
25
0.8687458127689781
Episode 15
26
0.8600583546412883
Episode 16
14
0.8514577710948754
Episode 17
28
0.8429431933839266
Episode 18
21
0.8345137614500874
Episode 19
13
0.8261686238355865
Episode 20
18
0.8179069375972307
Episode 21
15
0.8097278682212583
Episode 22
10
0.8016305895390458
Episode 23
11
0.7936142836436553
Episode 24
12
0.7856781408072188
Episode 25
16
0.7778213593991465
Episode 26
13
0.7700431458051551
Episode 27
22
0.7623427143471035
Episode 28
23
0.7547192872036325
Episode 29
17
0.7471720943315961
Episode 30
19
0.7397003733882802
Episode 31
11
0.7323033696543974
Episode 32
19
0.7249803359578534

KeyboardInterrupt: 

In [None]:
# Play one more episode with rendering enabled (third argument True), reusing
# the epsilon, step and rewards globals left behind by the training-loop cell.
run_episode_2(epsilon,step,True,rewards)