In [1]:
import numpy as np
import matplotlib.pyplot as plt
import moviepy.editor as mpy
import skimage.transform
from IPython.display import Image, display

import tensorflow as tf
import tensorflow_probability as tfp
import tensorflow.keras.losses as kls

In [2]:
cd PyGame-Learning-Environment

C:\Users\Yan-Ru\Desktop\PyGame-Learning-Environment


In [4]:
pip install tensorflow_probability==0.12.2

Collecting tensorflow_probability==0.12.2
  Downloading tensorflow_probability-0.12.2-py2.py3-none-any.whl (4.8 MB)
     ---------------------------------------- 4.8/4.8 MB 7.2 MB/s eta 0:00:00
Installing collected packages: tensorflow_probability
  Attempting uninstall: tensorflow_probability
    Found existing installation: tensorflow-probability 0.19.0
    Uninstalling tensorflow-probability-0.19.0:
      Successfully uninstalled tensorflow-probability-0.19.0
Successfully installed tensorflow_probability-0.12.2
Note: you may need to restart the kernel to use updated packages.




In [29]:
# Pick the GPU TensorFlow may use and switch on on-demand memory allocation.
# NOTE: this has to run before TensorFlow initializes its devices. The output
# recorded for this cell ("Physical devices cannot be modified after being
# initialized") shows it was executed too late in that session, so the
# settings below were not applied there.
gpus = tf.config.list_physical_devices("GPU") 
if gpus:
    try:
        # Restrict TensorFlow to only use the first physical GPU (index 0)
        tf.config.set_visible_devices(gpus[0], 'GPU')

        # Currently, memory growth needs to be the same across GPUs
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        
        logical_gpus = tf.config.list_logical_devices('GPU')
        print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
    except RuntimeError as e:
        # Memory growth must be set before GPUs have been initialized;
        # the error is only printed, so the notebook keeps running.
        print(e)

Physical devices cannot be modified after being initialized


In [4]:
import os
# Use SDL's "dummy" video driver so that no game window pops up.
# It is set before the ple import below -- presumably so the game library
# picks it up when it initializes (TODO confirm).
os.environ["SDL_VIDEODRIVER"] = "dummy"
from ple.games.flappybird import FlappyBird
from ple import PLE

# Training environment: rollouts for the PPO updates are collected here.
game = FlappyBird()
env = PLE(game, fps=30, display_screen=False)  # environment interface to game
env.reset_game()

# Separate environment passed to test_reward() for evaluation, so evaluating
# the agent does not disturb the game state of the training rollout.
test_game = FlappyBird()
test_env = PLE(test_game, fps=30, display_screen=False)
test_env.reset_game()

In [5]:
# Directory that receives the demo videos written by the training loop.
path = './movie_f'
# exist_ok=True makes the call idempotent and avoids the check-then-create
# race of a separate os.path.exists() test.
os.makedirs(path, exist_ok=True)

In [6]:
# Hyper-parameters shared by the preprocessing, the network, the PPO update
# and the training loop.
hparas = {
    'image_size': 84,  # preprocess_screen resizes every frame to image_size x image_size
    'num_stack': 4,  # not read in the visible code; frames_to_state hard-codes 4 frames
    'action_dim': len(env.getActionSet()),  # width of the actor's softmax output
    'hidden_size': 256,  # units of the dense embedding layer after the conv stack
    'lr': 0.0001,  # Adam learning rate
    'gamma': 0.99,  # discount factor passed to compute_gae
    'lambda': 0.95,  # GAE lambda passed to compute_gae
    'clip_val': 0.2,  # PPO ratio clip range: [1 - clip_val, 1 + clip_val]
    'ppo_epochs': 8,  # passes over each rollout in Agent.ppo_update
    'test_epochs': 1,  # number of test_reward() runs averaged per evaluation
    'num_steps': 512,  # environment steps collected per training iteration
    'mini_batch_size': 64,  # samples per gradient step in Agent.ppo_update
    'target_reward': 200,  # not read in the visible code
    'max_episode': 30000,  # upper bound on training iterations
}

In [7]:
# Please do not modify this method
def make_anim(images, fps=60, true_image=False):
    """Wrap a sequence of frames in a moviepy ``VideoClip``.

    Args:
        images: indexable sequence of frames (each must support ``.astype``).
        fps: playback rate; the clip lasts ``len(images) / fps`` seconds.
        true_image: if True, frames are only cast to ``uint8``; otherwise
            they are first mapped through ``(x + 1) / 2 * 255``
            (presumably frames in [-1, 1] -- confirm with the caller).

    Returns:
        An ``mpy.VideoClip`` with its ``fps`` attribute set to ``fps``.
    """
    duration = len(images) / fps

    def make_frame(t):
        # Map the timestamp t (seconds) to a frame index.
        try:
            x = images[int(len(images) / duration * t)]
        except:
            # Bare except: on any failure of the lookup above (e.g. an index
            # past the end) the last frame is shown instead.
            x = images[-1]

        if true_image:
            return x.astype(np.uint8)
        else:
            return ((x + 1) / 2 * 255).astype(np.uint8)

    clip = mpy.VideoClip(make_frame, duration=duration)
    clip.fps = fps
    
    return clip

In [8]:
def preprocess_screen(screen):
    """Turn a raw game screen into one network input frame.

    The screen is rotated by -90 degrees (with the canvas resized to fit),
    cut down to its first 400 rows, resized to
    ``image_size x image_size x 1`` (``image_size`` comes from ``hparas``)
    and returned as ``float32``.
    """
    rotated = skimage.transform.rotate(screen, -90, resize=True)
    cropped = rotated[:400, :]
    side = hparas['image_size']
    frame = skimage.transform.resize(cropped, [side, side, 1])
    return frame.astype(np.float32)

def frames_to_state(input_frames):
    """Stack four frames along the last axis to form one state.

    ``input_frames`` is a list of frames in chronological order. When fewer
    than four frames exist, frames are repeated to fill the stack:

    * 1 frame  -> ``[f0, f0, f0, f0]``
    * 2 frames -> ``[f0, f0, f1, f1]``
    * 3 frames -> ``[f0, f1, f2, f2]``
    * otherwise the last four frames are used.
    """
    count = len(input_frames)
    if count == 1:
        stack = input_frames * 4
    elif count == 2:
        stack = input_frames[0:1] * 2 + input_frames[1:] * 2
    elif count == 3:
        stack = input_frames + input_frames[2:]
    else:
        stack = input_frames[-4:]

    return np.concatenate(stack, axis=-1)

In [9]:
class ActorCriticNetwork(tf.keras.Model):
    """Shared-trunk actor-critic model.

    A convolutional feature extractor feeds two dense heads: an actor head
    with a softmax over ``hparas['action_dim']`` actions and a critic head
    producing a single unactivated value.
    """

    def __init__(self, hparas):
        super().__init__()

        # (filters, kernel_size, strides) for each convolution, in order.
        conv_specs = [(32, 8, 4), (64, 4, 2), (64, 3, 1)]

        trunk_layers = []
        for n_filters, kernel, stride in conv_specs:
            trunk_layers.append(
                tf.keras.layers.Conv2D(filters=n_filters, kernel_size=kernel, strides=stride)
            )
            trunk_layers.append(tf.keras.layers.ReLU())

        # Embedding: flatten the feature maps and project to hidden_size.
        trunk_layers.append(tf.keras.layers.Flatten())
        trunk_layers.append(tf.keras.layers.Dense(hparas['hidden_size']))
        trunk_layers.append(tf.keras.layers.ReLU())

        self.feature_extractor = tf.keras.Sequential(trunk_layers)

        # Actor head: softmax activation, so it emits probabilities.
        self.actor = tf.keras.layers.Dense(hparas['action_dim'], activation='softmax')
        # Critic head: one linear unit.
        self.critic = tf.keras.layers.Dense(1, activation = None)

    def call(self, input):
        """Return ``(action_probs, value)`` for a batch of states.

        In this notebook the callers pass stacked frames built by
        ``frames_to_state`` with a leading batch axis. The first output is
        already softmax-normalised (probabilities, not logits).
        """
        features = self.feature_extractor(input)
        action_probs = self.actor(features)
        state_value = self.critic(features)
        return action_probs, state_value

In [10]:
class Agent():
    """PPO agent: owns the actor-critic network, its optimizer and the update rule."""

    def __init__(self, hparas):
        self.gamma = hparas['gamma']
        self.optimizer = tf.keras.optimizers.Adam(learning_rate=hparas['lr'])
        self.actor_critic = ActorCriticNetwork(hparas)
        self.clip_pram = hparas['clip_val']

    def ppo_iter(self, mini_batch_size, states, actions, log_probs, returns, advantage):
        """Yield random mini-batches drawn from one rollout.

        ``states.shape[0] // mini_batch_size`` batches are produced. Indices
        are drawn with ``np.random.randint``, i.e. with replacement, so a
        sample may appear more than once or not at all.
        Each item is ``(states, actions, log_probs, returns, advantage)``
        gathered at the same indices.
        """
        n_samples = states.shape[0]
        n_batches = n_samples // mini_batch_size
        for _ in range(n_batches):
            picks = np.random.randint(0, n_samples, mini_batch_size)
            idx = tf.convert_to_tensor(picks, dtype=tf.int32)
            yield (
                tf.gather(states, idx),
                tf.gather(actions, idx),
                tf.gather(log_probs, idx),
                tf.gather(returns, idx),
                tf.gather(advantage, idx),
            )

    def ppo_update(self, ppo_epochs, mini_batch_size, states, actions, log_probs, discount_rewards, advantages):
        """Run ``ppo_epochs`` passes of clipped-surrogate PPO over one rollout.

        Returns:
            ``(total_actor_loss, total_critic_loss)``: the actor and critic
            losses summed (not averaged) over every mini-batch of every epoch.
        """
        actor_loss_sum = 0
        critic_loss_sum = 0

        for _ in range(ppo_epochs):
            batches = self.ppo_iter(
                mini_batch_size, states, actions, log_probs, discount_rewards, advantages
            )
            for mb_states, mb_actions, mb_old_log_probs, mb_returns, mb_adv in batches:
                # Give the targets a trailing axis, like the critic output.
                mb_returns = tf.expand_dims(mb_returns, axis=-1)

                with tf.GradientTape() as tape:
                    probs, value_pred = self.actor_critic(mb_states, training=True)
                    policy = tfp.distributions.Categorical(probs=probs, dtype=tf.float32)
                    entropy = tf.math.reduce_mean(policy.entropy())
                    new_log_probs = policy.log_prob(mb_actions)

                    # Probability ratio between the current and the rollout policy.
                    ratio = tf.math.exp(new_log_probs - mb_old_log_probs)
                    clipped_ratio = tf.clip_by_value(
                        ratio, 1.0 - self.clip_pram, 1.0 + self.clip_pram
                    )
                    unclipped_obj = ratio * mb_adv
                    clipped_obj = clipped_ratio * mb_adv
                    surrogate = tf.math.reduce_mean(tf.math.minimum(unclipped_obj, clipped_obj))

                    # Maximise the surrogate plus an entropy bonus (weight 0.1).
                    actor_loss = tf.math.negative(surrogate) - 0.1 * entropy
                    critic_loss = 0.5 * tf.math.reduce_mean(
                        kls.mean_squared_error(mb_returns, value_pred)
                    )

                    total_loss = actor_loss + critic_loss

                # One optimizer updates trunk, actor head and critic head together.
                variables = self.actor_critic.trainable_variables
                grads = tape.gradient(total_loss, variables)
                self.optimizer.apply_gradients(zip(grads, self.actor_critic.trainable_variables))

                actor_loss_sum += actor_loss
                critic_loss_sum += critic_loss

        return actor_loss_sum, critic_loss_sum

In [11]:
# https://arxiv.org/pdf/1506.02438.pdf
# Equation 16
def compute_gae(rewards, masks, values, gamma, LAMBDA):
    """Compute GAE-based return targets for one rollout.

    Args:
        rewards: per-step rewards, length T.
        masks: per-step continuation flags (0 where the step ended an
            episode, 1 otherwise), length T.
        values: value estimates, length T + 1 (the extra entry is the
            bootstrap value for the state after the last step).
        gamma: discount factor.
        LAMBDA: GAE lambda.

    Returns:
        A list of length T whose entry t is ``advantage_t + values[t]``.
    """
    horizon = len(rewards)
    targets = [None] * horizon
    running_adv = 0
    # Walk backwards so each step can reuse the advantage of its successor.
    for t in range(horizon - 1, -1, -1):
        td_error = rewards[t] + gamma * values[t + 1] * masks[t] - values[t]
        running_adv = td_error + gamma * LAMBDA * masks[t] * running_adv
        targets[t] = running_adv + values[t]

    return targets

In [12]:
def test_reward(test_env, agent):
    """Play one episode greedily and return its total reward.

    The environment is reset, then at every step the action with the
    highest actor probability (argmax, no sampling) is played until
    ``test_env.game_over()`` is true.

    Args:
        test_env: PLE environment used for evaluation.
        agent: object exposing ``actor_critic``, a callable returning
            ``(action_probs, value)`` for a batched state.

    Returns:
        The sum of the rewards returned by ``test_env.act`` over the episode.
    """
    total_reward = 0
    # Reset the environment
    test_env.reset_game()
    input_frames = [preprocess_screen(test_env.getScreenGrayscale())]

    while not test_env.game_over():

        state = frames_to_state(input_frames)
        state = tf.expand_dims(state, axis=0)
        prob, value = agent.actor_critic(state)

        # Greedy policy for evaluation.
        action = np.argmax(prob[0].numpy())
        reward = test_env.act(test_env.getActionSet()[action])
        total_reward += reward

        input_frames.append(preprocess_screen(test_env.getScreenGrayscale()))
        # frames_to_state only reads the last four frames once four exist,
        # so older frames are dropped to keep memory bounded on long
        # episodes. This is a no-op while four or fewer frames are held.
        del input_frames[:-4]

    return total_reward

In [13]:
# Build the agent and the schedule constants used by the training loop.
agent = Agent(hparas)
max_episode = hparas['max_episode']
test_per_n_episode = 10  # evaluate with test_reward() every N training iterations
force_save_per_n_episode = 1000  # save model, checkpoint and a demo video every N iterations
early_stop_reward = 10  # training stops once the best test reward reaches this value

start_s = 0  # not read in the visible code
best_reward = -5.0  # best average test reward so far; a save needs a strictly higher value

# Checkpoint covers both the network weights and the optimizer state.
checkpoint = tf.train.Checkpoint(
    actor_critic = agent.actor_critic,
    optimizer = agent.optimizer,
)

# Load from old checkpoint
# checkpoint.restore('ckpt_dir/ckpt-?')

In [14]:
# PPO training loop.
#
# Each iteration `s` (printed as "Episode") collects a fixed-length rollout of
# hparas['num_steps'] environment steps -- which may span several game
# episodes, since the game is reset in place when it ends -- then computes
# GAE return targets and runs the PPO update on that rollout.
ep_reward = []  # not filled by this cell
total_avgr = []  # not filled by this cell
early_stop = False
avg_rewards_list = []  # one entry per evaluation (every test_per_n_episode iterations)

env.reset_game()

for s in range(0, max_episode):
    if early_stop == True:
        break

    # Rollout buffers, one entry per environment step.
    rewards = []
    states = []
    actions = []
    log_probs = []
    masks = []
    values = []

    display_frames = [env.getScreenRGB()]
    input_frames = [preprocess_screen(env.getScreenGrayscale())]

    for step in range(hparas['num_steps']):

        state = frames_to_state(input_frames)
        state = tf.expand_dims(state, axis=0)
        prob, value = agent.actor_critic(state)

        # Sample from the policy (exploration); keep the log-prob for the PPO ratio.
        dist = tfp.distributions.Categorical(probs=prob[0], dtype=tf.float32)
        action = dist.sample(1)
        log_prob = dist.log_prob(action)

        # `action` has shape (1,); index it explicitly instead of relying on
        # the implicit array-to-scalar conversion of int(ndarray).
        reward = env.act(env.getActionSet()[int(action.numpy()[0])])

        done = env.game_over()

        states.append(state)
        actions.append(action)
        values.append(value[0])
        log_probs.append(log_prob)
        rewards.append(tf.convert_to_tensor(reward, dtype=tf.float32))
        # mask is 0 on terminal steps so GAE does not bootstrap across episodes.
        masks.append(tf.convert_to_tensor(1-int(done), dtype=tf.float32))

        display_frames.append(env.getScreenRGB())
        input_frames.append(preprocess_screen(env.getScreenGrayscale()))

        if done:
            env.reset_game()
            input_frames = [preprocess_screen(env.getScreenGrayscale())]

    # Bootstrap value for the state *after* the last collected step.
    # (Previously the pre-action `state` of the last step was reused, so the
    # bootstrap value just duplicated values[-1] instead of estimating the
    # successor state.)
    next_state = tf.expand_dims(frames_to_state(input_frames), axis=0)
    _, next_value = agent.actor_critic(next_state)
    values.append(next_value[0])

    returns = compute_gae(rewards, masks, values, hparas['gamma'], hparas['lambda'])

    # Flatten the per-step lists into batch tensors.
    returns = tf.concat(returns, axis=0)
    log_probs = tf.concat(log_probs, axis=0)
    values = tf.concat(values, axis=0)
    states = tf.concat(states, axis=0)
    actions = tf.concat(actions, axis=0)
    # values holds one extra (bootstrap) entry, hence the [:-1].
    advantage = returns - values[:-1]

    a_loss, c_loss = agent.ppo_update(hparas['ppo_epochs'], hparas['mini_batch_size'], states, actions, log_probs, returns, advantage)
    print('[Episode %d]  Actor loss: %.5f, Critic loss: %.5f' % (s, a_loss, c_loss))

    if s % test_per_n_episode == 0:
        # test agent hparas['test_epochs'] times to get the average reward
        avg_reward = np.mean([test_reward(test_env, agent) for _ in range(hparas['test_epochs'])])
        print("Test average reward is %.1f, Current best average reward is %.1f\n" % (avg_reward, best_reward))
        avg_rewards_list.append(avg_reward)

        # Save only on a strict improvement over the best test reward so far.
        if avg_reward > best_reward:
            best_reward = avg_reward
            agent.actor_critic.save('./save/Actor/model_actor_{}_{}'.format(s, avg_reward), save_format="tf")
            checkpoint.save(file_prefix = './save/checkpoints/ckpt')

    # Periodic unconditional save plus a demo video of the latest rollout.
    # avg_reward is defined here because force_save_per_n_episode is a
    # multiple of test_per_n_episode with the constants set above.
    if s % force_save_per_n_episode == 0:
        agent.actor_critic.save('./save/Actor/model_actor_{}_{}'.format(s, avg_reward), save_format="tf")
        checkpoint.save(file_prefix = './save/checkpoints/ckpt')
        clip = make_anim(display_frames, fps=60, true_image=True).rotate(-90)
        clip.write_videofile("movie_f/{}_demo-{}.webm".format('Lab15', s), fps=60)
        display(clip.ipython_display(fps=60, autoplay=1, loop=1, maxduration=120))

    # The flag is checked at the top of the next iteration.
    if best_reward >= early_stop_reward:
        early_stop = True

[Episode 0]  Actor loss: 94.67840, Critic loss: 65.12277
Test average reward is -5.0, Current best average reward is -5.0





INFO:tensorflow:Assets written to: ./save/Actor/model_actor_0_-5.0\assets


INFO:tensorflow:Assets written to: ./save/Actor/model_actor_0_-5.0\assets


Moviepy - Building video movie_f/Lab15_demo-0.webm.
Moviepy - Writing video movie_f/Lab15_demo-0.webm



                                                                                                                       

Moviepy - Done !
Moviepy - video ready movie_f/Lab15_demo-0.webm
Moviepy - Building video __temp__.mp4.
Moviepy - Writing video __temp__.mp4



                                                                                                                       

Moviepy - Done !
Moviepy - video ready __temp__.mp4




[Episode 1]  Actor loss: 32.98872, Critic loss: 21.68518
[Episode 2]  Actor loss: 35.83522, Critic loss: 19.86360
[Episode 3]  Actor loss: 12.07397, Critic loss: 7.82365
[Episode 4]  Actor loss: 7.40627, Critic loss: 4.81992
[Episode 5]  Actor loss: 4.15539, Critic loss: 4.57111
[Episode 6]  Actor loss: -1.34668, Critic loss: 3.63459
[Episode 7]  Actor loss: -3.97504, Critic loss: 3.22701
[Episode 8]  Actor loss: -4.87517, Critic loss: 3.69831
[Episode 9]  Actor loss: -8.81595, Critic loss: 2.09124
[Episode 10]  Actor loss: -3.83865, Critic loss: 2.45828
Test average reward is -5.0, Current best average reward is -5.0

[Episode 11]  Actor loss: -7.31574, Critic loss: 1.67733
[Episode 12]  Actor loss: -0.45947, Critic loss: 2.14636
[Episode 13]  Actor loss: -3.69978, Critic loss: 1.66016
[Episode 14]  Actor loss: -7.31441, Critic loss: 1.32048
[Episode 15]  Actor loss: -7.45264, Critic loss: 1.58836
[Episode 16]  Actor loss: -11.64316, Critic loss: 1.19266
[Episode 17]  Actor loss: -9.6



INFO:tensorflow:Assets written to: ./save/Actor/model_actor_90_-4.0\assets


INFO:tensorflow:Assets written to: ./save/Actor/model_actor_90_-4.0\assets


[Episode 91]  Actor loss: -7.99886, Critic loss: 0.48598
[Episode 92]  Actor loss: -8.46013, Critic loss: 0.37339
[Episode 93]  Actor loss: -4.74121, Critic loss: 0.33433
[Episode 94]  Actor loss: -7.67050, Critic loss: 0.28731
[Episode 95]  Actor loss: -6.85786, Critic loss: 0.22725
[Episode 96]  Actor loss: -6.65518, Critic loss: 0.38575
[Episode 97]  Actor loss: -4.95143, Critic loss: 0.32928
[Episode 98]  Actor loss: -3.08475, Critic loss: 0.63244
[Episode 99]  Actor loss: -6.25681, Critic loss: 0.51594
[Episode 100]  Actor loss: -3.63418, Critic loss: 0.55015
Test average reward is -5.0, Current best average reward is -4.0

[Episode 101]  Actor loss: -4.08141, Critic loss: 0.48200
[Episode 102]  Actor loss: -5.04587, Critic loss: 0.30074
[Episode 103]  Actor loss: -6.24307, Critic loss: 0.49465
[Episode 104]  Actor loss: -5.37818, Critic loss: 2.13698
[Episode 105]  Actor loss: 1.02396, Critic loss: 0.90330
[Episode 106]  Actor loss: -1.77666, Critic loss: 0.40221
[Episode 107]  A

[Episode 219]  Actor loss: -5.13424, Critic loss: 0.47687
[Episode 220]  Actor loss: -8.32610, Critic loss: 0.39448
Test average reward is -5.0, Current best average reward is -4.0

[Episode 221]  Actor loss: -8.85167, Critic loss: 0.38028
[Episode 222]  Actor loss: -7.67402, Critic loss: 0.36631
[Episode 223]  Actor loss: -7.55357, Critic loss: 0.83648
[Episode 224]  Actor loss: -5.66486, Critic loss: 0.39168
[Episode 225]  Actor loss: -5.60605, Critic loss: 0.38444
[Episode 226]  Actor loss: -5.46558, Critic loss: 0.33424
[Episode 227]  Actor loss: -2.67597, Critic loss: 0.33137
[Episode 228]  Actor loss: -5.11485, Critic loss: 0.42076
[Episode 229]  Actor loss: -6.03346, Critic loss: 0.34598
[Episode 230]  Actor loss: -4.15324, Critic loss: 0.34080
Test average reward is -5.0, Current best average reward is -4.0

[Episode 231]  Actor loss: -5.49099, Critic loss: 0.33335
[Episode 232]  Actor loss: -5.27692, Critic loss: 0.46039
[Episode 233]  Actor loss: -2.43478, Critic loss: 0.4198

[Episode 346]  Actor loss: -4.47656, Critic loss: 0.31102
[Episode 347]  Actor loss: -1.40074, Critic loss: 0.22173
[Episode 348]  Actor loss: -7.77583, Critic loss: 0.27196
[Episode 349]  Actor loss: -2.75523, Critic loss: 0.20881
[Episode 350]  Actor loss: -3.44931, Critic loss: 0.39874
Test average reward is -5.0, Current best average reward is -4.0

[Episode 351]  Actor loss: -4.43487, Critic loss: 0.31513
[Episode 352]  Actor loss: -5.88391, Critic loss: 0.31384
[Episode 353]  Actor loss: -4.24840, Critic loss: 0.23185
[Episode 354]  Actor loss: -6.64609, Critic loss: 0.20171
[Episode 355]  Actor loss: -1.82619, Critic loss: 0.35564
[Episode 356]  Actor loss: -3.61007, Critic loss: 0.23899
[Episode 357]  Actor loss: -3.79361, Critic loss: 0.16017
[Episode 358]  Actor loss: -6.77597, Critic loss: 0.31695
[Episode 359]  Actor loss: -6.10283, Critic loss: 0.27641
[Episode 360]  Actor loss: -4.52239, Critic loss: 0.26361
Test average reward is -4.0, Current best average reward is -4.0

[Episode 473]  Actor loss: -5.74353, Critic loss: 0.41042
[Episode 474]  Actor loss: -5.80931, Critic loss: 0.27098
[Episode 475]  Actor loss: -7.44216, Critic loss: 0.28088
[Episode 476]  Actor loss: -5.19220, Critic loss: 0.29239
[Episode 477]  Actor loss: -4.28224, Critic loss: 0.35653
[Episode 478]  Actor loss: -3.65245, Critic loss: 0.25217
[Episode 479]  Actor loss: -0.00293, Critic loss: 0.34356
[Episode 480]  Actor loss: -8.62961, Critic loss: 0.87133
Test average reward is -5.0, Current best average reward is -4.0

[Episode 481]  Actor loss: -3.47750, Critic loss: 0.47815
[Episode 482]  Actor loss: -4.40202, Critic loss: 0.46270
[Episode 483]  Actor loss: -5.01383, Critic loss: 0.22849
[Episode 484]  Actor loss: -3.37304, Critic loss: 0.31743
[Episode 485]  Actor loss: -4.53437, Critic loss: 0.32808
[Episode 486]  Actor loss: -6.35739, Critic loss: 0.28128
[Episode 487]  Actor loss: -6.43975, Critic loss: 0.23553
[Episode 488]  Actor loss: -0.78283, Critic loss: 0.18553
[Episo

Test average reward is -5.0, Current best average reward is -4.0

[Episode 601]  Actor loss: -11.56389, Critic loss: 0.71343
[Episode 602]  Actor loss: -6.96742, Critic loss: 0.97811
[Episode 603]  Actor loss: 2.01004, Critic loss: 0.48492
[Episode 604]  Actor loss: -1.83400, Critic loss: 0.43462
[Episode 605]  Actor loss: -2.55180, Critic loss: 0.36529
[Episode 606]  Actor loss: -4.38539, Critic loss: 0.17092
[Episode 607]  Actor loss: -6.49936, Critic loss: 0.41008
[Episode 608]  Actor loss: -5.29586, Critic loss: 0.45726
[Episode 609]  Actor loss: -3.14097, Critic loss: 0.25510
[Episode 610]  Actor loss: -5.28397, Critic loss: 0.77335
Test average reward is -5.0, Current best average reward is -4.0

[Episode 611]  Actor loss: -3.43857, Critic loss: 0.62832
[Episode 612]  Actor loss: -4.08997, Critic loss: 0.54845
[Episode 613]  Actor loss: -1.80915, Critic loss: 0.58988
[Episode 614]  Actor loss: -7.70469, Critic loss: 0.66100
[Episode 615]  Actor loss: -7.38689, Critic loss: 1.1841

[Episode 728]  Actor loss: -9.07067, Critic loss: 0.99788
[Episode 729]  Actor loss: -4.96579, Critic loss: 0.68058
[Episode 730]  Actor loss: -9.31461, Critic loss: 1.54329
Test average reward is -4.0, Current best average reward is -4.0

[Episode 731]  Actor loss: -7.69683, Critic loss: 1.18630
[Episode 732]  Actor loss: -4.35263, Critic loss: 0.86368
[Episode 733]  Actor loss: -1.40499, Critic loss: 0.54231
[Episode 734]  Actor loss: -8.30675, Critic loss: 1.06629
[Episode 735]  Actor loss: -6.93330, Critic loss: 0.38666
[Episode 736]  Actor loss: -4.15421, Critic loss: 0.63086
[Episode 737]  Actor loss: -8.22035, Critic loss: 0.50672
[Episode 738]  Actor loss: -3.79515, Critic loss: 0.86869
[Episode 739]  Actor loss: -1.62471, Critic loss: 0.43941
[Episode 740]  Actor loss: -7.15994, Critic loss: 1.13637
Test average reward is -5.0, Current best average reward is -4.0

[Episode 741]  Actor loss: -4.34898, Critic loss: 0.70600
[Episode 742]  Actor loss: -7.16639, Critic loss: 0.8610

[Episode 855]  Actor loss: -2.61615, Critic loss: 0.46071
[Episode 856]  Actor loss: -1.79248, Critic loss: 0.36359
[Episode 857]  Actor loss: -2.21137, Critic loss: 0.30176
[Episode 858]  Actor loss: -2.89989, Critic loss: 0.42092
[Episode 859]  Actor loss: -8.95839, Critic loss: 1.06413
[Episode 860]  Actor loss: -5.30527, Critic loss: 0.32734
Test average reward is -5.0, Current best average reward is -4.0

[Episode 861]  Actor loss: -6.63855, Critic loss: 0.31301
[Episode 862]  Actor loss: -3.15294, Critic loss: 0.44062
[Episode 863]  Actor loss: -5.35821, Critic loss: 0.36034
[Episode 864]  Actor loss: -6.75719, Critic loss: 0.64723
[Episode 865]  Actor loss: -11.99011, Critic loss: 0.57239
[Episode 866]  Actor loss: -6.45115, Critic loss: 0.56137
[Episode 867]  Actor loss: -5.38660, Critic loss: 0.81802
[Episode 868]  Actor loss: -3.18380, Critic loss: 0.57371
[Episode 869]  Actor loss: -2.17594, Critic loss: 0.43935
[Episode 870]  Actor loss: -9.30163, Critic loss: 0.79921
Test 



INFO:tensorflow:Assets written to: ./save/Actor/model_actor_910_-3.0\assets


INFO:tensorflow:Assets written to: ./save/Actor/model_actor_910_-3.0\assets


[Episode 911]  Actor loss: -2.34086, Critic loss: 0.33114
[Episode 912]  Actor loss: -5.69000, Critic loss: 0.89172
[Episode 913]  Actor loss: -15.98423, Critic loss: 1.47534
[Episode 914]  Actor loss: -5.95322, Critic loss: 1.33859
[Episode 915]  Actor loss: -2.72551, Critic loss: 0.43575
[Episode 916]  Actor loss: -4.61292, Critic loss: 1.08796
[Episode 917]  Actor loss: -5.30669, Critic loss: 0.57773
[Episode 918]  Actor loss: -8.74681, Critic loss: 0.72364
[Episode 919]  Actor loss: -5.96366, Critic loss: 1.06227
[Episode 920]  Actor loss: 4.07408, Critic loss: 0.90408
Test average reward is -5.0, Current best average reward is -3.0

[Episode 921]  Actor loss: -3.76757, Critic loss: 0.55860
[Episode 922]  Actor loss: -6.73592, Critic loss: 1.22084
[Episode 923]  Actor loss: -8.76172, Critic loss: 0.83325
[Episode 924]  Actor loss: -4.39397, Critic loss: 0.79059
[Episode 925]  Actor loss: -4.95692, Critic loss: 0.83617
[Episode 926]  Actor loss: -6.80055, Critic loss: 1.00028
[Episo



INFO:tensorflow:Assets written to: ./save/Actor/model_actor_1000_-5.0\assets


INFO:tensorflow:Assets written to: ./save/Actor/model_actor_1000_-5.0\assets


Moviepy - Building video movie_f/Lab15_demo-1000.webm.
Moviepy - Writing video movie_f/Lab15_demo-1000.webm



                                                                                                                       

Moviepy - Done !
Moviepy - video ready movie_f/Lab15_demo-1000.webm
Moviepy - Building video __temp__.mp4.
Moviepy - Writing video __temp__.mp4



                                                                                                                       

Moviepy - Done !
Moviepy - video ready __temp__.mp4




[Episode 1001]  Actor loss: -4.98598, Critic loss: 2.16844
[Episode 1002]  Actor loss: -4.28454, Critic loss: 3.64413
[Episode 1003]  Actor loss: 0.01601, Critic loss: 1.78265
[Episode 1004]  Actor loss: -8.91062, Critic loss: 1.61705
[Episode 1005]  Actor loss: -15.45654, Critic loss: 1.99856
[Episode 1006]  Actor loss: -1.60536, Critic loss: 1.77466
[Episode 1007]  Actor loss: -2.33018, Critic loss: 1.34710
[Episode 1008]  Actor loss: 4.25358, Critic loss: 1.01364
[Episode 1009]  Actor loss: -9.24465, Critic loss: 1.31169
[Episode 1010]  Actor loss: 2.16850, Critic loss: 1.38379
Test average reward is -5.0, Current best average reward is -3.0

[Episode 1011]  Actor loss: -6.18377, Critic loss: 1.19105
[Episode 1012]  Actor loss: -0.51461, Critic loss: 1.34172
[Episode 1013]  Actor loss: -0.31729, Critic loss: 1.74125
[Episode 1014]  Actor loss: -7.04686, Critic loss: 2.21823
[Episode 1015]  Actor loss: -0.72328, Critic loss: 0.82488
[Episode 1016]  Actor loss: -13.53034, Critic loss:

[Episode 1127]  Actor loss: -6.42335, Critic loss: 1.39078
[Episode 1128]  Actor loss: -8.42373, Critic loss: 2.96340
[Episode 1129]  Actor loss: -0.11847, Critic loss: 1.50556
[Episode 1130]  Actor loss: -2.71202, Critic loss: 2.35296
Test average reward is -5.0, Current best average reward is -3.0

[Episode 1131]  Actor loss: -6.84213, Critic loss: 1.30599
[Episode 1132]  Actor loss: 0.90342, Critic loss: 0.85908
[Episode 1133]  Actor loss: -1.85345, Critic loss: 1.18408
[Episode 1134]  Actor loss: -5.16709, Critic loss: 1.13531
[Episode 1135]  Actor loss: -7.24588, Critic loss: 1.25987
[Episode 1136]  Actor loss: -9.30048, Critic loss: 0.65848
[Episode 1137]  Actor loss: -7.59364, Critic loss: 0.82379
[Episode 1138]  Actor loss: -3.52049, Critic loss: 0.92177
[Episode 1139]  Actor loss: -13.17284, Critic loss: 1.64151
[Episode 1140]  Actor loss: -6.47198, Critic loss: 1.07455
Test average reward is -5.0, Current best average reward is -3.0

[Episode 1141]  Actor loss: -7.57073, Crit

[Episode 1252]  Actor loss: -2.69637, Critic loss: 1.56472
[Episode 1253]  Actor loss: -1.47921, Critic loss: 1.36859
[Episode 1254]  Actor loss: -2.58930, Critic loss: 1.73412
[Episode 1255]  Actor loss: 2.48362, Critic loss: 0.75442
[Episode 1256]  Actor loss: 1.45875, Critic loss: 1.21950
[Episode 1257]  Actor loss: -8.86996, Critic loss: 1.00030
[Episode 1258]  Actor loss: -5.63327, Critic loss: 1.09598
[Episode 1259]  Actor loss: -10.13929, Critic loss: 1.35763
[Episode 1260]  Actor loss: -1.47790, Critic loss: 0.51340
Test average reward is -5.0, Current best average reward is -3.0

[Episode 1261]  Actor loss: -11.66943, Critic loss: 1.63864
[Episode 1262]  Actor loss: -11.50525, Critic loss: 1.84339
[Episode 1263]  Actor loss: -6.31841, Critic loss: 1.82439
[Episode 1264]  Actor loss: -4.73123, Critic loss: 1.59461
[Episode 1265]  Actor loss: -9.97525, Critic loss: 1.02713
[Episode 1266]  Actor loss: -3.06449, Critic loss: 1.56787
[Episode 1267]  Actor loss: -10.22964, Critic lo

[Episode 1378]  Actor loss: -14.45516, Critic loss: 2.50429
[Episode 1379]  Actor loss: -2.61476, Critic loss: 1.62131
[Episode 1380]  Actor loss: -6.82640, Critic loss: 1.77932
Test average reward is -5.0, Current best average reward is -3.0

[Episode 1381]  Actor loss: -6.08150, Critic loss: 1.17390
[Episode 1382]  Actor loss: -6.25513, Critic loss: 1.68010
[Episode 1383]  Actor loss: -2.06839, Critic loss: 2.18177
[Episode 1384]  Actor loss: 1.39970, Critic loss: 1.75017
[Episode 1385]  Actor loss: -4.69396, Critic loss: 3.78498
[Episode 1386]  Actor loss: -2.67665, Critic loss: 2.22635
[Episode 1387]  Actor loss: -19.08742, Critic loss: 2.15890
[Episode 1388]  Actor loss: -5.53817, Critic loss: 3.48051
[Episode 1389]  Actor loss: -6.34379, Critic loss: 2.49567
[Episode 1390]  Actor loss: 2.52299, Critic loss: 0.92482
Test average reward is -5.0, Current best average reward is -3.0

[Episode 1391]  Actor loss: -4.02952, Critic loss: 1.34383
[Episode 1392]  Actor loss: -1.10060, Crit

[Episode 1503]  Actor loss: 2.41312, Critic loss: 1.40486
[Episode 1504]  Actor loss: -7.51434, Critic loss: 2.54548
[Episode 1505]  Actor loss: -14.96119, Critic loss: 1.98971
[Episode 1506]  Actor loss: -7.21076, Critic loss: 3.14419
[Episode 1507]  Actor loss: 4.89859, Critic loss: 2.77615
[Episode 1508]  Actor loss: -7.16571, Critic loss: 1.10748
[Episode 1509]  Actor loss: -11.53598, Critic loss: 1.37422
[Episode 1510]  Actor loss: -4.21221, Critic loss: 1.26180
Test average reward is -5.0, Current best average reward is -3.0

[Episode 1511]  Actor loss: -34.96690, Critic loss: 3.36240
[Episode 1512]  Actor loss: -8.16105, Critic loss: 3.46076
[Episode 1513]  Actor loss: -6.00466, Critic loss: 4.51407
[Episode 1514]  Actor loss: -4.36567, Critic loss: 2.76560
[Episode 1515]  Actor loss: -11.31701, Critic loss: 3.11840
[Episode 1516]  Actor loss: -4.96987, Critic loss: 3.02314
[Episode 1517]  Actor loss: -5.39529, Critic loss: 3.94369
[Episode 1518]  Actor loss: -1.22623, Critic lo



INFO:tensorflow:Assets written to: ./save/Actor/model_actor_1600_-2.0\assets


INFO:tensorflow:Assets written to: ./save/Actor/model_actor_1600_-2.0\assets


[Episode 1601]  Actor loss: -7.55066, Critic loss: 2.62271
[Episode 1602]  Actor loss: -10.57434, Critic loss: 3.28414
[Episode 1603]  Actor loss: 1.29114, Critic loss: 3.73448
[Episode 1604]  Actor loss: -17.24921, Critic loss: 3.10024
[Episode 1605]  Actor loss: -15.24380, Critic loss: 3.20332
[Episode 1606]  Actor loss: -2.57473, Critic loss: 4.20650
[Episode 1607]  Actor loss: 0.87902, Critic loss: 1.36100
[Episode 1608]  Actor loss: -0.94844, Critic loss: 1.61877
[Episode 1609]  Actor loss: -3.95923, Critic loss: 1.96987
[Episode 1610]  Actor loss: -2.87118, Critic loss: 1.48536
Test average reward is -5.0, Current best average reward is -2.0

[Episode 1611]  Actor loss: -24.57329, Critic loss: 2.81798
[Episode 1612]  Actor loss: -18.02882, Critic loss: 3.80719
[Episode 1613]  Actor loss: -1.58466, Critic loss: 2.74564
[Episode 1614]  Actor loss: -11.43993, Critic loss: 2.97437
[Episode 1615]  Actor loss: 2.31436, Critic loss: 2.91941
[Episode 1616]  Actor loss: -16.20860, Critic 



INFO:tensorflow:Assets written to: ./save/Actor/model_actor_1640_0.0\assets


INFO:tensorflow:Assets written to: ./save/Actor/model_actor_1640_0.0\assets


[Episode 1641]  Actor loss: -15.11164, Critic loss: 2.83704
[Episode 1642]  Actor loss: -5.03544, Critic loss: 2.10548
[Episode 1643]  Actor loss: -7.72856, Critic loss: 2.32771
[Episode 1644]  Actor loss: 8.60543, Critic loss: 1.77778
[Episode 1645]  Actor loss: 6.03542, Critic loss: 1.21872
[Episode 1646]  Actor loss: 1.20390, Critic loss: 1.49417
[Episode 1647]  Actor loss: -5.25363, Critic loss: 1.30104
[Episode 1648]  Actor loss: -6.30585, Critic loss: 0.91320
[Episode 1649]  Actor loss: 1.33630, Critic loss: 1.43417
[Episode 1650]  Actor loss: -5.81173, Critic loss: 3.63967
Test average reward is -5.0, Current best average reward is 0.0

[Episode 1651]  Actor loss: -13.25003, Critic loss: 1.97510
[Episode 1652]  Actor loss: -17.43698, Critic loss: 3.34608
[Episode 1653]  Actor loss: -15.61042, Critic loss: 1.93697
[Episode 1654]  Actor loss: -12.29452, Critic loss: 1.68685
[Episode 1655]  Actor loss: -11.99123, Critic loss: 2.89500
[Episode 1656]  Actor loss: -17.37018, Critic lo

[Episode 1767]  Actor loss: 10.53566, Critic loss: 3.67860
[Episode 1768]  Actor loss: 3.53845, Critic loss: 4.86998
[Episode 1769]  Actor loss: 15.15968, Critic loss: 2.88467
[Episode 1770]  Actor loss: -3.20791, Critic loss: 1.16149
Test average reward is -4.0, Current best average reward is 0.0

[Episode 1771]  Actor loss: 3.23616, Critic loss: 1.12749
[Episode 1772]  Actor loss: -11.10459, Critic loss: 1.63291
[Episode 1773]  Actor loss: -1.45238, Critic loss: 1.13854
[Episode 1774]  Actor loss: -8.05743, Critic loss: 2.47237
[Episode 1775]  Actor loss: -2.48195, Critic loss: 1.80750
[Episode 1776]  Actor loss: -14.94473, Critic loss: 3.09931
[Episode 1777]  Actor loss: -15.08047, Critic loss: 1.86006
[Episode 1778]  Actor loss: -1.86830, Critic loss: 1.49012
[Episode 1779]  Actor loss: -3.22805, Critic loss: 1.74910
[Episode 1780]  Actor loss: -12.82859, Critic loss: 3.83508
Test average reward is -4.0, Current best average reward is 0.0

[Episode 1781]  Actor loss: -29.86009, Cri

[Episode 1892]  Actor loss: -7.79085, Critic loss: 5.14716
[Episode 1893]  Actor loss: 2.64200, Critic loss: 2.10516
[Episode 1894]  Actor loss: -1.96280, Critic loss: 5.77578
[Episode 1895]  Actor loss: 20.80234, Critic loss: 4.96874
[Episode 1896]  Actor loss: -12.08239, Critic loss: 4.57710
[Episode 1897]  Actor loss: -2.06632, Critic loss: 5.23174
[Episode 1898]  Actor loss: 1.65209, Critic loss: 3.38223
[Episode 1899]  Actor loss: -4.36672, Critic loss: 4.80535
[Episode 1900]  Actor loss: 6.44704, Critic loss: 2.29690
Test average reward is -5.0, Current best average reward is 0.0

[Episode 1901]  Actor loss: -1.20777, Critic loss: 2.24144
[Episode 1902]  Actor loss: -8.73768, Critic loss: 4.55144
[Episode 1903]  Actor loss: -3.80607, Critic loss: 2.95950
[Episode 1904]  Actor loss: -4.81110, Critic loss: 2.81156
[Episode 1905]  Actor loss: 2.59165, Critic loss: 3.04113
[Episode 1906]  Actor loss: -6.82477, Critic loss: 2.42862
[Episode 1907]  Actor loss: 5.82563, Critic loss: 1.4



INFO:tensorflow:Assets written to: ./save/Actor/model_actor_2000_-5.0\assets


INFO:tensorflow:Assets written to: ./save/Actor/model_actor_2000_-5.0\assets


Moviepy - Building video movie_f/Lab15_demo-2000.webm.
Moviepy - Writing video movie_f/Lab15_demo-2000.webm



                                                                                                                       

Moviepy - Done !
Moviepy - video ready movie_f/Lab15_demo-2000.webm
Moviepy - Building video __temp__.mp4.
Moviepy - Writing video __temp__.mp4



                                                                                                                       

Moviepy - Done !
Moviepy - video ready __temp__.mp4




[Episode 2001]  Actor loss: 0.08830, Critic loss: 5.01123
[Episode 2002]  Actor loss: 16.02975, Critic loss: 7.43025
[Episode 2003]  Actor loss: 3.57175, Critic loss: 6.21530
[Episode 2004]  Actor loss: 8.23753, Critic loss: 2.48994
[Episode 2005]  Actor loss: -0.47141, Critic loss: 2.99520
[Episode 2006]  Actor loss: 3.95189, Critic loss: 3.37002
[Episode 2007]  Actor loss: -4.50093, Critic loss: 2.81173
[Episode 2008]  Actor loss: 0.13119, Critic loss: 2.09710
[Episode 2009]  Actor loss: -8.12102, Critic loss: 2.28042
[Episode 2010]  Actor loss: -21.25922, Critic loss: 6.03298
Test average reward is -5.0, Current best average reward is 0.0

[Episode 2011]  Actor loss: -13.96286, Critic loss: 3.81421
[Episode 2012]  Actor loss: -6.20129, Critic loss: 2.45231
[Episode 2013]  Actor loss: -17.84280, Critic loss: 6.04045
[Episode 2014]  Actor loss: -23.80975, Critic loss: 5.12166
[Episode 2015]  Actor loss: 2.36033, Critic loss: 3.51217
[Episode 2016]  Actor loss: -55.85118, Critic loss: 

[Episode 2127]  Actor loss: -9.96944, Critic loss: 2.68855
[Episode 2128]  Actor loss: 0.87669, Critic loss: 2.73248
[Episode 2129]  Actor loss: -1.86447, Critic loss: 7.10920
[Episode 2130]  Actor loss: 6.72758, Critic loss: 2.93088
Test average reward is -5.0, Current best average reward is 0.0

[Episode 2131]  Actor loss: -8.97363, Critic loss: 2.44714
[Episode 2132]  Actor loss: -27.84427, Critic loss: 4.17139
[Episode 2133]  Actor loss: -3.47320, Critic loss: 4.55895
[Episode 2134]  Actor loss: -1.55318, Critic loss: 2.66731
[Episode 2135]  Actor loss: -0.20943, Critic loss: 3.42248
[Episode 2136]  Actor loss: -9.06575, Critic loss: 1.67102
[Episode 2137]  Actor loss: 0.94408, Critic loss: 2.16617
[Episode 2138]  Actor loss: -9.21794, Critic loss: 1.49339
[Episode 2139]  Actor loss: -16.59902, Critic loss: 1.92706
[Episode 2140]  Actor loss: -4.95092, Critic loss: 1.06786
Test average reward is -5.0, Current best average reward is 0.0

[Episode 2141]  Actor loss: -32.85485, Critic



INFO:tensorflow:Assets written to: ./save/Actor/model_actor_2200_1.0\assets


INFO:tensorflow:Assets written to: ./save/Actor/model_actor_2200_1.0\assets


[Episode 2201]  Actor loss: -7.39522, Critic loss: 5.78091
[Episode 2202]  Actor loss: -1.74446, Critic loss: 4.53209
[Episode 2203]  Actor loss: -26.29055, Critic loss: 4.48716
[Episode 2204]  Actor loss: -7.57052, Critic loss: 3.35850
[Episode 2205]  Actor loss: 6.59997, Critic loss: 1.36867
[Episode 2206]  Actor loss: -4.42847, Critic loss: 4.14619
[Episode 2207]  Actor loss: -7.84197, Critic loss: 3.48125
[Episode 2208]  Actor loss: -0.72132, Critic loss: 2.83338
[Episode 2209]  Actor loss: -11.02530, Critic loss: 2.72838
[Episode 2210]  Actor loss: -22.15742, Critic loss: 3.97658
Test average reward is 2.0, Current best average reward is 1.0





INFO:tensorflow:Assets written to: ./save/Actor/model_actor_2210_2.0\assets


INFO:tensorflow:Assets written to: ./save/Actor/model_actor_2210_2.0\assets


[Episode 2211]  Actor loss: -17.05330, Critic loss: 2.59872
[Episode 2212]  Actor loss: -9.84278, Critic loss: 2.42526
[Episode 2213]  Actor loss: -27.44706, Critic loss: 4.33677
[Episode 2214]  Actor loss: -4.05270, Critic loss: 5.77903
[Episode 2215]  Actor loss: -13.55562, Critic loss: 4.59322
[Episode 2216]  Actor loss: -24.63937, Critic loss: 3.77568
[Episode 2217]  Actor loss: -1.78240, Critic loss: 3.61648
[Episode 2218]  Actor loss: -9.47397, Critic loss: 3.77297
[Episode 2219]  Actor loss: -5.51622, Critic loss: 3.23609
[Episode 2220]  Actor loss: 13.52051, Critic loss: 6.08999
Test average reward is -5.0, Current best average reward is 2.0

[Episode 2221]  Actor loss: -14.07643, Critic loss: 8.26912
[Episode 2222]  Actor loss: -19.64329, Critic loss: 4.45961
[Episode 2223]  Actor loss: 9.86185, Critic loss: 6.73281
[Episode 2224]  Actor loss: -25.65510, Critic loss: 4.35687
[Episode 2225]  Actor loss: 0.04484, Critic loss: 7.97845
[Episode 2226]  Actor loss: -2.54851, Critic 

[Episode 2336]  Actor loss: 5.11732, Critic loss: 4.75334
[Episode 2337]  Actor loss: 8.87764, Critic loss: 9.15912
[Episode 2338]  Actor loss: -37.17192, Critic loss: 7.72914
[Episode 2339]  Actor loss: -8.50717, Critic loss: 4.73133
[Episode 2340]  Actor loss: -27.15245, Critic loss: 5.32989
Test average reward is -5.0, Current best average reward is 2.0

[Episode 2341]  Actor loss: -6.56972, Critic loss: 9.06900
[Episode 2342]  Actor loss: -10.32051, Critic loss: 6.65586
[Episode 2343]  Actor loss: -13.41212, Critic loss: 6.95742
[Episode 2344]  Actor loss: -28.64540, Critic loss: 5.33665
[Episode 2345]  Actor loss: 19.46738, Critic loss: 5.93425
[Episode 2346]  Actor loss: -15.34567, Critic loss: 8.56853
[Episode 2347]  Actor loss: -32.37254, Critic loss: 5.08787
[Episode 2348]  Actor loss: 1.69169, Critic loss: 8.94539
[Episode 2349]  Actor loss: -25.13832, Critic loss: 5.67467
[Episode 2350]  Actor loss: -4.29905, Critic loss: 4.91514
Test average reward is -5.0, Current best ave

KeyboardInterrupt: 

# Brief report

訓練總共跑了 2423 個 episodes，測試時最好的平均 reward 為 2.0；大約訓練到第 920 個 episode，bird 才第一次飛過第一個 pipe。 <br>
從跑最後的影片中，發現 bird 都是在最後快撞到 pipe 時才移動。 <br>
bird 最後大約可以飛過 6 個 pipes 。 <br>
跑到這樣大約花了 8 個小時。

如果在產生或顯示影片（moviepy）時出錯，請先執行下面這個 cell 重新安裝 moviepy 與 ffmpeg 相關套件，然後重新啟動 kernel 再重跑。

In [43]:
pip install moviepy --upgrade
pip install ffmpeg --upgrade

Collecting decorator<5.0,>=4.0.2
  Using cached decorator-4.4.2-py2.py3-none-any.whl (9.2 kB)
Installing collected packages: decorator
  Attempting uninstall: decorator
    Found existing installation: decorator 5.1.1
    Uninstalling decorator-5.1.1:
      Successfully uninstalled decorator-5.1.1
Successfully installed decorator-4.4.2
Note: you may need to restart the kernel to use updated packages.


