In [122]:
from dm_control import suite
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.models import clone_model
import random
import time

In [123]:
# Swimmer6 snake , task: get to the goal 
env = suite.load(domain_name="swimmer", task_name="swimmer6")

In [124]:
def save_models(agent):
    agent.critic_model.save('Swimmer6/critic_model.h5')
    agent.actor_model.save('Swimmer6/actor_model.h5')
    agent.target_critic_model.save('Swimmer6/target_critic_model.h5')
    agent.target_actor_model.save('Swimmer6/target_actor_model.h5')
    
def save_scores(scores,last_rewards,first_rewards):
    np.savetxt('Swimmer6/scores.csv', scores)
    np.savetxt('Swimmer6/last_rewards.csv', last_rewards)
    np.savetxt('Swimmer6/first_rewards.csv', first_rewards) 

In [125]:
print(env.observation_spec())

OrderedDict([('joints', Array(shape=(5,), dtype=dtype('float64'), name='joints')), ('to_target', Array(shape=(2,), dtype=dtype('float64'), name='to_target')), ('body_velocities', Array(shape=(18,), dtype=dtype('float64'), name='body_velocities'))])


In [126]:
# to change all layers to have dtype float64 by default
tf.keras.backend.set_floatx('float64')

In [127]:
def convert_state(observation):
    return np.hstack(
        [observation['joints'], observation['to_target'], observation['body_velocities']]
    ).reshape((1,25))  # all of the components: joints, to_target, body_velocities sum up to 25

In [128]:
class DDPG(object):
    
    def __init__(self, state_dim, action_spec, discount_rate,tau, batch_size):
        self.action_spec = action_spec
        self.state_dim = state_dim
        self.action_dim = self.action_spec.shape[0]
        self.action_min = self.action_spec.minimum
        self.action_max = self.action_spec.maximum
        self.actor_opt = tf.optimizers.Adam(1e-3)   
        self.discount_rate = discount_rate
        self.batch_size = batch_size
        self.noise_scale = 1.0
        self.tau = tau
        
        self.memory = {
            "state":np.array([]), 
            "action":np.array([], dtype = int), 
            "reward":np.array([]),
            "new_state":np.array([]), 
            "done":np.array([])
            }

        # Actor Network
        self.actor_model = self.create_actor_model()

        #Create target actor as clone of actor
        self.target_actor_model = clone_model(self.actor_model)
        self.target_actor_model.compile(optimizer='sgd', loss='mse')
               
        # Critic Network
        self.critic_model = self.create_critic_model()
        
        #Create target critic as clone of critic
        self.target_critic_model = clone_model(self.critic_model)
        self.target_critic_model.compile(optimizer='sgd', loss='mse')
        

    def create_actor_model(self):
        #Create actor
        actor_model = tf.keras.Sequential([
        # Add 400 layer with relu on observations input
        layers.Dense(400, activation='relu', input_shape=(self.state_dim,)),
        # Add 300 hidden layer
        layers.Dense(300, activation='relu'),
        # Add output action layer with tanh as need 5 size array with -1 to 1 values
        layers.Dense(self.action_dim, activation='tanh')])

        # config the model with losses and metrics
        actor_model.compile(optimizer=tf.keras.optimizers.Adam(1e-3), loss='mse')
        return actor_model
    
    def create_critic_model(self):
        #Create critic 
        #Add inputs observations and actions
        action_input = tf.keras.Input(shape=(self.action_dim,), name='action')
        observation_input = tf.keras.Input(shape=self.state_dim, name='state')
        flattened_observation = layers.Flatten()(observation_input)
        #Add layer on observation input only
        l=layers.Dense(400, activation='relu', input_shape=(self.state_dim,))(flattened_observation)
        #Later add actions
        l=layers.Concatenate()([l, action_input])
        l=layers.Dense(300, activation='relu')(l)
        #Output q value
        l=layers.Dense(1, activation='linear')(l)
        critic_model = tf.keras.Model(inputs=[action_input, observation_input], outputs=l)

        # config the model with losses and metrics
        critic_model.compile(optimizer=tf.keras.optimizers.Adam(1e-3), loss='mse')
        return critic_model
    
    def get_action_with_noise(self, obs):
        #Make action based on determenistic trained policy, but add Gaussian noise to explore env
        action = self.actor_model.predict(obs)[0]
        action = action + self.noise_scale*np.random.normal(0,1,size = self.action_dim)
        action = np.clip(action, -1.0, 1.0)  # between action values
        return action
    
    def update_buffer(self, obs, new_obs, action, reward, done):
        if len(self.memory["state"])>0:
            if self.memory["done"].shape[0]>1000000:
                for key in self.memory.keys():
                    self.memory[key] = self.memory[key][-900000:] #leave only part of buffer
            self.memory["state"] = np.vstack((self.memory["state"], obs))
            self.memory["new_state"] = np.vstack((self.memory["new_state"], new_obs))
            self.memory["action"] = np.vstack((self.memory["action"], action))
        else: 
            # if not initialized new
            self.memory["state"] = np.array(obs)
            self.memory["new_state"] = np.array(new_obs)
            self.memory["action"] = np.array(action)
        self.memory["reward"] = np.append(self.memory["reward"], reward)  # not cleaning, always not empty
        self.memory["done"] = np.append(self.memory["done"], done)   
        
    def ddpg_batch_update(self):
        if self.memory["done"].shape[0]>=self.batch_size: #there is enough for sampling
            indexes = random.sample(range(0, self.memory["done"].shape[0]), self.batch_size)
            batch = {
                    "state": np.squeeze(self.memory["state"][indexes]), 
                    "action": np.squeeze(self.memory["action"][indexes]), 
                    "reward": self.memory["reward"][indexes], 
                    "new_state": np.squeeze(self.memory["new_state"][indexes]), 
                    "done": self.memory["done"][indexes]
                    }
            
            # Q' in DDPG pseudocode to count target critic value using only target networks,
            #based on new state choose action by target actor deterministic 
            #and count q value based on new state and chosen action by target critic 
            target_q = self.target_critic_model({"state":batch["new_state"], 
                                            "action": self.target_actor_model(batch["new_state"])})
            # y_i in DDPG  , target value to count MSE: ri+ discount_rate*Q_target 
            # if done do not add future reward, so added  *(1-batch["done"])
            y = batch["reward"].reshape(self.batch_size,1) + \
                (self.discount_rate*(1-batch["done"])).reshape(self.batch_size,1)*target_q
           
            # update critic by minimizing MSE loss between y and usual critic, actor output
            self.critic_model.train_on_batch({"state": batch["state"], "action": batch["action"]}, y)

            # update actor policy using sampled gradient
            with tf.GradientTape() as tape:
                q = self.critic_model({"state": batch["state"], #q vector from critic
                                       "action": self.actor_model(batch["state"])})  # get action from actor 
                actor_loss = - tf.reduce_mean(q) # 1 value =mean q value across all q vector values, ignore structure
            # compute gradients (loss, variables)
            actor_grads = tape.gradient(actor_loss, self.actor_model.trainable_weights)
            # update optimizer applying counted gradients
            self.actor_opt.apply_gradients(zip(actor_grads, self.actor_model.trainable_weights))
#             self.actor_opt.minimize(actor_loss, self.actor_model.trainable_weights)

            # update target networks target=tau*net + (1-tau)*target
            self.target_critic_model.set_weights((1.0-self.tau) * np.array(self.target_critic_model.get_weights()) + \
                                          self.tau*np.array(self.critic_model.get_weights()))
            self.target_actor_model.set_weights((1.0-self.tau) * np.array(self.target_actor_model.get_weights()) + \
                                         self.tau*np.array(self.actor_model.get_weights()))
            
            self.noise_scale*=0.99  # lower noise as exploration/exploitation
    

In [129]:
action_spec = env.action_spec()
action_dim = action_spec.shape[0]
action_low=action_spec.minimum  # [-1. -1. -1. -1. -1.]
action_high=action_spec.maximum  # [1. 1. 1. 1. 1.]
state_dim = 25 # np.hstack all of the components: joints, to_target, body_velocities
# agent = DDPG(state_dim, action_spec,0.99,0.01, 64)

In [130]:
def train_model(env, num_games, state_dim, action_spec):
    #initalize empty
    scores = np.array([])
    last_rewards = np.array([])
    first_rewards = np.array([])
    action_spec = env.action_spec()
    
    # initialize with network models + target and empty memory
    agent = DDPG(state_dim, action_spec, 0.99,0.01, 64)
    start = time.time()

    for game in range(1, num_games+1):       
        if (game % 25) == 0:  # after each ... iterarion number store data
            #temporary save all 4 models if disconnected helpful
            save_models(agent)
            save_scores(scores,last_rewards,first_rewards)       
            print(f"{iteration} number iterations left")
            print(f"{(time.time()-start)/60} minutes remaining")
            
        #just plot for demo     
        if (game % 100) == 0:
            x=list(range(len(scores)))
            plt.plot(x, scores)
            plt.xlabel("Iteration number")
            plt.ylabel("Reward")
            plt.show()

        time_step = env.reset()  # reset env once in game
        obs = convert_state(time_step.observation) # to np array based on env obs 

        game_score = 0
        #for iterations in game, fixed 1000 for env, automatically returns last
        while not time_step.last():       
            # get action with added noise
            action = agent.get_action_with_noise(obs)
            # make action 
            time_step = env.step(action)
            #convert got from env obs
            new_obs = convert_state(time_step.observation)            
            # update info
            game_score += time_step.reward 
            #last is done flag
            agent.update_buffer(obs, new_obs, action, time_step.reward, 0)
            obs = new_obs
            # update networks weights
            agent.ddpg_batch_update()
            #store info if memory full then will clean first rewards, need for stats
            if len(first_rewards)==len(last_rewards):
                first_rewards = np.append(first_rewards, time_step.reward)
        #gane ended, store last step
        new_obs = convert_state(time_step.observation)   
        # done flag =1
        agent.update_buffer(obs, new_obs, action, time_step.reward, 1)
        # update networks weights
        agent.ddpg_batch_update()
        #last reward added
        game_score += time_step.reward 
        
        #across all games scores array
        scores = np.append(scores, game_score)
        last_rewards = np.append(last_rewards, time_step.reward)
        #after each game
        print(f"Iteration: {game};   score: {game_score}; game reward: {time_step.reward}; previous game reward: {first_rewards[-1]}")
    
    #after all games complete save trained networks
    save_models(agent)
    save_scores(scores,last_rewards,first_rewards)
    return agent, scores, last_rewards, first_rewards

In [None]:
num_games = 1000

agent,scores, last_rewards, first_rewards = train_model(env, num_games, state_dim, action_spec)

Iteration: 1;   score: 122.17612617358806; game reward: 0.12426098087077711; previous game reward: 0.08716617710255155


In [84]:
from dm_control import suite
from dm_control import viewer
import numpy as np

env = suite.load(domain_name="swimmer", task_name="swimmer6",visualize_reward = True)
action_spec = env.action_spec()
obs_spec = env.observation_spec()

test_games_num = 100
test_scores = []

for game in range(test_games_num):
    score = 0
    step_data = env.reset()  # get initial state
    
    while not step_data.last():
        action = np.random.uniform(low=action_spec.minimum,high=action_spec.maximum,size=action_spec.shape)
        step_data = env.step(action)
        score += step_data.reward
    print(score)   
    test_scores.append(score)

print("Average reward on test 100 games: ", np.mean(test_scores))
    
#     for step in range(max_steps_in_game):
# #         env.render()  # show gif
# #         print(observation)  # print state vector 8, fixed for environment
#         obs = np.reshape(obs, (1, state_size))
#         rewards = trained_model.predict(obs)
#         action = int(np.argmax(rewards[0]))
# #         action = env.action_space.sample()
#         obs, reward, done, info = env.step(action)  # step returns 4 parameters
#         score +=reward
#         if done:  # game over need reset
# #             print("Episode finished after {} timesteps".format(step+1))
#             print(score)
#             break
#     test_scores.append(score)

11.84270279515664
174.37853653624165
990.3867739230217
76.40498741416086
11.85914494957288
7.8378229237979244
379.93543975807563
998.9540249895206
9.279951886452674
186.781442120145
29.917483871047192
18.841813762040342
719.792014475365
805.2113464955687
6.066806933490635
471.60333387103304
7.894880668466012
10.260713640206554
441.04042191833634
17.162429921387908
9.984490586754552
497.2597006810599
10.402056148713594
9.167138935953089
18.75408767159369
14.042000761098931
18.27572731557042
11.661311304978614
29.211435875778346
8.682749542307647
11.11649064771654
14.991746371904334
11.35004667664715
7.167194815978092
442.93244975917196
13.443347133179362
20.974426910764954
8.369623629784828
10.276102603329674
4.007666418266852
38.43273675369839
10.065484949903052
665.3867561239139
46.10872632572045
985.6128515787769
46.709100646778026
17.2076953475195
10.766706531839262
14.512184854906188
17.205131684222888
13.909490875054967
42.98485235124309
493.99112416622785
4.727411009707003
8.0525

In [None]:
#Viewer
from dm_control import suite
from dm_control import viewer
import numpy as np

# env = suite.load(domain_name="humanoid", task_name="stand")
env = suite.load(domain_name="swimmer", task_name="swimmer6")
action_spec = env.action_spec()
print(action_spec)

# Define a uniform random policy.
def random_policy(time_step):
#     del time_step  # Unused.
    print(time_step)
    
    action = np.random.uniform(low=action_spec.minimum,
                           high=action_spec.maximum,
                           size=action_spec.shape)
    print(action)
    return action
# time_step = env.reset()

# Launch the viewer application.
viewer.launch(env, policy=random_policy)