In [1]:
import tensorflow as tf     
import numpy as np          
import os

  from ._conv import register_converters as _register_converters


DDPG

In [2]:
# Train online while playing the game; no pre-collected dataset is needed.
def train_jump_zero_envnoise(env, start, end, noise_sigma, init_memory, model_dir, experiment_dir,
                         actor, critic, memory,
                         actor_lr, critic_lr, batch_size,
                         gamma, tau=0.01, env_noise=0.15):
    """Train a DDPG agent online against ``env``, with exploration and environment noise.

    The run has two phases: ``init_memory`` warm-up episodes that only fill the
    replay memory, followed by training episodes numbered ``start`` to ``end``
    (inclusive) in which the agent is trained after every environment step.

    Args:
        env: Environment exposing ``reset()``, ``step(action)`` returning
            ``(obs, reward, done, score)``, ``observation_shape`` and ``action_shape``.
        start: Number of the first training episode.
        end: Number of the last training episode (inclusive).
        noise_sigma: Standard deviation of the Gaussian exploration noise added
            to the actor output.
        init_memory: Number of warm-up episodes played before training starts.
        model_dir: Checkpoint path to restore from; any falsy value builds a
            new model via ``agent.initialize``.
        experiment_dir: Directory under which ``checkpoints`` and ``summaries``
            sub-directories are created.
        actor, critic, memory: Forwarded unchanged to ``DDPG``.
        actor_lr, critic_lr, batch_size, gamma, tau: Forwarded to ``DDPG`` as
            keyword arguments of the same name.
        env_noise: Half-width of the uniform noise added to the executed action
            only (not to the stored one).

    Side effects:
        Writes summaries on every training step and episode, and saves a
        checkpoint every 100 episodes (for episode numbers > 0).
    """
    # Build agent; DDPG's action_range / reward_scale are left at their defaults.
    agent = DDPG(actor, critic, memory, env.observation_shape, env.action_shape,
                 actor_lr=actor_lr, critic_lr=critic_lr, batch_size=batch_size,
                 gamma=gamma, tau=tau)

    saver = tf.train.Saver(max_to_keep=20)

    # Output locations; exist_ok avoids the check-then-create race.
    checkpoint_dir = os.path.join(experiment_dir, "checkpoints")
    checkpoint_path = os.path.join(checkpoint_dir, "model")
    summary_dir = os.path.join(experiment_dir, "summaries")
    os.makedirs(checkpoint_dir, exist_ok=True)
    os.makedirs(summary_dir, exist_ok=True)

    summary_writer = tf.summary.FileWriter(summary_dir)
    # Pre-bind so the final message works even when range(start, end+1) is empty.
    episode = None
    try:
        with tf.Session() as sess:
            # Restore an existing model if a checkpoint path was given.
            if model_dir:
                print("Loading model checkpoint {}...\n".format(model_dir))
                saver.restore(sess, model_dir)
                agent.sess = sess
            else:
                print('Building new model...')
                agent.initialize(sess)

            # ---- Phase 1: fill the replay memory without training ----
            print('Generating ',init_memory,' memory... Please reset game!')
            for _ in range(init_memory):
                obs0 = env.reset()
                while True:
                    record_action, do_action = _explore_action(
                        sess, agent, obs0, noise_sigma, env_noise)
                    obs1, reward, done, score = env.step(do_action)
                    # Store the pre-env-noise action, not the one actually executed.
                    agent.store_transition(obs0, record_action, reward, obs1, done)
                    if done:
                        break
                    obs0 = obs1

            # ---- Phase 2: play and train ----
            print('Training...')
            for episode in range(start, end+1):
                obs0 = env.reset()
                episode_reward = 0
                episode_step = 0
                episode_score = 0

                while True:
                    record_action, do_action = _explore_action(
                        sess, agent, obs0, noise_sigma, env_noise)
                    obs1, reward, done, score = env.step(do_action)
                    episode_reward += reward
                    episode_step += 1

                    agent.store_transition(obs0, record_action, reward, obs1, done)
                    obs0 = obs1

                    # One optimisation step per environment step.
                    cl, al = agent.train()
                    global_step = sess.run(agent.global_step)

                    # A fresh Summary per write: reusing one proto would append to
                    # its repeated `value` field and re-log all earlier losses.
                    step_summary = tf.Summary()
                    step_summary.value.add(simple_value=cl, tag="critic_loss")
                    step_summary.value.add(simple_value=al, tag="actor_loss")
                    summary_writer.add_summary(step_summary, global_step)
                    summary_writer.flush()

                    agent.update_target_net()

                    if done:
                        episode_summary = tf.Summary()
                        episode_summary.value.add(simple_value=episode_reward, tag="episode_reward")
                        episode_summary.value.add(simple_value=episode_step, tag="episode_step")
                        episode_summary.value.add(simple_value=episode_score, tag="episode_score")
                        summary_writer.add_summary(episode_summary, episode)
                        summary_writer.flush()
                        break
                    # Only non-terminal scores are kept, so the logged value is the
                    # score from the last step before death.
                    # NOTE(review): presumably the score read at death is unreliable - confirm.
                    episode_score = score

                # Save model every 100 episodes.
                if episode>0 and episode%100 == 0:
                    saver.save(sess, checkpoint_path+str(episode))
                    print('model saved at', episode,'episode, path:', checkpoint_path+str(episode))
    finally:
        # Release the event-file handle even if training is interrupted.
        summary_writer.close()

    print('Training completed at episode', episode)


def _explore_action(sess, agent, obs0, noise_sigma, env_noise):
    """Query the actor for ``obs0`` and return ``(record_action, do_action)``.

    ``record_action`` is the actor output plus Gaussian exploration noise;
    ``do_action`` additionally carries uniform noise in ``[-env_noise, env_noise]``.
    Both are clipped to ``[-1, 1]``.
    """
    action = sess.run(agent.actor_tf, feed_dict={agent.obs0: [obs0]}).flatten()
    record_action = np.clip(action + np.random.normal(0, noise_sigma), -1, 1)
    do_action = np.clip(record_action + np.random.uniform(-env_noise, env_noise), -1, 1)
    return record_action, do_action



In [3]:
from models_update import Actor, Critic

In [4]:
from memory import Memory

In [5]:
from ddpg import DDPG

hyper-parameters

In [6]:
# Learning rates passed to DDPG for the actor and the critic.
actor_lr, critic_lr = 1e-4, 1e-3
# Forwarded to DDPG as `tau`.
# NOTE(review): presumably the soft target-update rate - confirm in ddpg.py.
tau = 0.01
# First argument of Actor(...).
nb_actions = 1

In [7]:
# Batch size handed to DDPG.
batch_size = 128
# First argument of Memory(...).
# NOTE(review): presumably the replay-buffer capacity - confirm in memory.py.
limit = 5000
# Standard deviation of the Gaussian exploration noise added to actions.
noise_sigma = 0.1
# Forwarded to DDPG as `gamma`.
# NOTE(review): presumably the discount factor, so 0 would mean only the
# immediate reward is used - confirm this is intended.
gamma = 0

experiment setup

In [8]:
# Number of warm-up episodes played before training starts.
init_memory = 5
# Checkpoint to resume from; None builds a new model.
model_dir = None
# Training covers episodes start..end, both inclusive.
start, end = 0, 2000

In [9]:
# Absolute output root for this run, resolved against the current working
# directory; the training function creates its sub-directories below it.
experiment_dir = os.path.abspath(
    "./ddpg-zero-model-envnoise-inputdim4/experiments/"
)

In [10]:
# Create the game environment from the digit and "play again" template images.
import cv2
from jump_env_update import Jump_Env


def _load_templet(path):
    """Read a template image from ``path``, raising if it cannot be loaded."""
    image = cv2.imread(path)
    if image is None:
        # cv2.imread reports a missing or unreadable file by returning None
        # instead of raising, so fail here with a clear message.
        raise FileNotFoundError("Could not read template image: {}".format(path))
    return image


# One template per digit 0-9.
number_templet = [_load_templet('templet/{}.jpg'.format(i)) for i in range(10)]
restart_templet = _load_templet('templet/again.jpg')
env = Jump_Env(number_templet=number_templet, restart_templet=restart_templet)

In [11]:
# Actor and critic networks, both built with layer_norm enabled.
actor = Actor(nb_actions, layer_norm=True)
critic = Critic(layer_norm=True)
# Memory sized by `limit`, with shapes taken from the environment.
memory = Memory(
    limit,
    action_shape=env.action_shape,
    observation_shape=env.observation_shape,
)

In [12]:
# Launch training with the configuration defined in the cells above.
train_jump_zero_envnoise(
    env=env,
    start=start,
    end=end,
    noise_sigma=noise_sigma,
    init_memory=init_memory,
    model_dir=model_dir,
    experiment_dir=experiment_dir,
    actor=actor,
    critic=critic,
    memory=memory,
    actor_lr=actor_lr,
    critic_lr=critic_lr,
    batch_size=batch_size,
    gamma=gamma,
    tau=tau,
)

setting up target updates ...
len 16 = 16
{ target_actor/Conv/weights:0 } <- { actor/Conv/weights:0 }
{ target_actor/Conv/biases:0 } <- { actor/Conv/biases:0 }
{ target_actor/Conv_1/weights:0 } <- { actor/Conv_1/weights:0 }
{ target_actor/Conv_1/biases:0 } <- { actor/Conv_1/biases:0 }
{ target_actor/Conv_2/weights:0 } <- { actor/Conv_2/weights:0 }
{ target_actor/Conv_2/biases:0 } <- { actor/Conv_2/biases:0 }
{ target_actor/dense/kernel:0 } <- { actor/dense/kernel:0 }
{ target_actor/dense/bias:0 } <- { actor/dense/bias:0 }
{ target_actor/LayerNorm/beta:0 } <- { actor/LayerNorm/beta:0 }
{ target_actor/LayerNorm/gamma:0 } <- { actor/LayerNorm/gamma:0 }
{ target_actor/dense_1/kernel:0 } <- { actor/dense_1/kernel:0 }
{ target_actor/dense_1/bias:0 } <- { actor/dense_1/bias:0 }
{ target_actor/LayerNorm_1/beta:0 } <- { actor/LayerNorm_1/beta:0 }
{ target_actor/LayerNorm_1/gamma:0 } <- { actor/LayerNorm_1/gamma:0 }
{ target_actor/dense_2/kernel:0 } <- { actor/dense_2/kernel:0 }
{ target_actor/d

  warn("The default mode, 'constant', will be changed to 'reflect' in "


Training...
model saved at 100 episode, path: /Users/qrdai/project-ddpg/ddpg-zero-model-envnoise-inputdim4/experiments/checkpoints/model100
model saved at 200 episode, path: /Users/qrdai/project-ddpg/ddpg-zero-model-envnoise-inputdim4/experiments/checkpoints/model200
model saved at 300 episode, path: /Users/qrdai/project-ddpg/ddpg-zero-model-envnoise-inputdim4/experiments/checkpoints/model300
model saved at 400 episode, path: /Users/qrdai/project-ddpg/ddpg-zero-model-envnoise-inputdim4/experiments/checkpoints/model400
model saved at 500 episode, path: /Users/qrdai/project-ddpg/ddpg-zero-model-envnoise-inputdim4/experiments/checkpoints/model500
model saved at 600 episode, path: /Users/qrdai/project-ddpg/ddpg-zero-model-envnoise-inputdim4/experiments/checkpoints/model600
model saved at 700 episode, path: /Users/qrdai/project-ddpg/ddpg-zero-model-envnoise-inputdim4/experiments/checkpoints/model700
model saved at 800 episode, path: /Users/qrdai/project-ddpg/ddpg-zero-model-envnoise-inputdi