In [1]:
# Fetch the PyGame-Learning-Environment sources.
!git clone https://github.com/ntasfi/PyGame-Learning-Environment
# Switch the working directory into the clone; relative paths used by later
# cells (e.g. ./movie_f, ./save) therefore resolve inside this directory.
%cd PyGame-Learning-Environment
# Install the cloned package in editable mode.
!pip install -e .
# Install pygame.
!pip install pygame

Cloning into 'PyGame-Learning-Environment'...
remote: Enumerating objects: 1118, done.[K
remote: Total 1118 (delta 0), reused 0 (delta 0), pack-reused 1118 (from 1)[K
Receiving objects: 100% (1118/1118), 8.06 MiB | 61.10 MiB/s, done.
Resolving deltas: 100% (592/592), done.
/content/PyGame-Learning-Environment
Obtaining file:///content/PyGame-Learning-Environment
  Preparing metadata (setup.py) ... [?25l[?25hdone
Installing collected packages: ple
  Running setup.py develop for ple
Successfully installed ple-0.0.1


In [2]:
import numpy as np
import matplotlib.pyplot as plt
import moviepy.editor as mpy
import skimage.transform
from IPython.display import Image, display

import tensorflow as tf
import tensorflow_probability as tfp
import tensorflow.keras.losses as kls

  if event.key is 'enter':



In [3]:
# Configure GPU visibility and memory growth; skipped entirely when TensorFlow
# reports no physical GPU.
gpus = tf.config.list_physical_devices("GPU")
if gpus:
    try:
        # Restrict TensorFlow to only use the first GPU (gpus[0])
        tf.config.set_visible_devices(gpus[0], 'GPU')

        # Currently, memory growth needs to be the same across GPUs
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)

        logical_gpus = tf.config.list_logical_devices('GPU')
        print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
    except RuntimeError as e:
        # Memory growth must be set before GPUs have been initialized
        print(e)

In [4]:
import os
# NOTE(review): presumably this has to be set before pygame is initialised by
# the ple imports below -- keep the statement order.
os.environ["SDL_VIDEODRIVER"] = "dummy"  # dummy SDL video driver so that no pop-up game window appears
from ple.games.flappybird import FlappyBird
from ple import PLE

# Environment used to collect training rollouts.
game = FlappyBird()
env = PLE(game, fps=30, display_screen=False)  # environment interface to game
env.reset_game()

# Separate game/environment instance, passed to test_reward() for evaluation
# so that evaluating does not disturb the training environment.
test_game = FlappyBird()
test_env = PLE(test_game, fps=30, display_screen=False)
test_env.reset_game()

couldn't import doomish
Couldn't import doom


In [5]:
# Directory that receives the rendered demo videos.
path = './movie_f'
# exist_ok=True keeps the cell idempotent on re-run and avoids the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(path, exist_ok=True)

In [6]:
# Hyperparameters shared by the network, the PPO update and the training loop.
hparas = {
    'image_size': 84,  # side length (pixels) of the square frame produced by preprocess_screen
    'num_stack': 4,  # NOTE: not read by the code in this notebook; frames_to_state hard-codes 4 frames
    'action_dim': len(env.getActionSet()),  # size of the actor's output layer
    'hidden_size': 256,  # units of the dense embedding layer after the convolutions
    'lr': 0.0001,  # Adam learning rate
    'gamma': 0.99,  # discount factor passed to compute_gae
    'lambda': 0.95,  # GAE lambda passed to compute_gae
    'clip_val': 0.2,  # PPO clipping range: ratio is clipped to [1 - clip_val, 1 + clip_val]
    'ppo_epochs': 8,  # passes over the collected rollout per PPO update
    'test_epochs': 1,  # evaluation games averaged per test round
    'num_steps': 512,  # environment steps collected per training iteration
    'mini_batch_size': 64,  # samples per gradient step in the PPO update
    'target_reward': 200,  # NOTE: not read by the code in this notebook; early_stop_reward is used instead
    'max_episode': 30000,  # maximum number of training iterations
}

In [7]:
# Please do not modify this method
def make_anim(images, fps=60, true_image=False):
    """Build a moviepy VideoClip that plays ``images`` at ``fps`` frames per second.

    Args:
        images: Indexable sequence of frames (arrays supporting ``astype``).
        fps: Playback rate; the clip duration is ``len(images) / fps``.
        true_image: If True, frames are only cast to uint8. Otherwise each
            frame is mapped through ``(x + 1) / 2 * 255`` before the cast
            (presumably for frames scaled to [-1, 1] -- confirm with callers).

    Returns:
        The ``mpy.VideoClip`` with its ``fps`` attribute set.
    """
    duration = len(images) / fps

    def make_frame(t):
        """Return the uint8 frame shown at time ``t`` (seconds)."""
        try:
            x = images[int(len(images) / duration * t)]
        except:
            # Any failure of the lookup above (e.g. an index past the end)
            # falls back to the last frame.
            x = images[-1]

        if true_image:
            return x.astype(np.uint8)
        else:
            return ((x + 1) / 2 * 255).astype(np.uint8)

    clip = mpy.VideoClip(make_frame, duration=duration)
    clip.fps = fps

    return clip

In [8]:
def preprocess_screen(screen):
    """Turn a raw game screen into a square float32 network input frame.

    The screen is rotated by -90 degrees (output resized to fit the rotated
    image), cropped to its first 400 rows, and resized to
    ``(image_size, image_size, 1)`` using ``hparas['image_size']``.

    Args:
        screen: Screen array as returned by the environment.

    Returns:
        float32 array of shape ``(image_size, image_size, 1)``.
    """
    rotated = skimage.transform.rotate(screen, -90, resize=True)
    cropped = rotated[:400, :]
    side = hparas['image_size']
    resized = skimage.transform.resize(cropped, [side, side, 1])
    return resized.astype(np.float32)

def frames_to_state(input_frames):
    """Concatenate four frames along the last axis to form one state.

    With four or more frames the last four are used. With fewer, frames are
    repeated to fill the four slots:

    * 1 frame:  f0, f0, f0, f0
    * 2 frames: f0, f0, f1, f1
    * 3 frames: f0, f1, f2, f2

    Args:
        input_frames: List of frame arrays, oldest first.

    Returns:
        The frames joined with ``np.concatenate(..., axis=-1)``.
    """
    # Which frame fills each of the four slots while the history is short.
    warmup_patterns = {
        1: (0, 0, 0, 0),
        2: (0, 0, 1, 1),
        3: (0, 1, 2, 2),
    }

    pattern = warmup_patterns.get(len(input_frames))
    if pattern is None:
        chosen = input_frames[-4:]
    else:
        chosen = [input_frames[slot] for slot in pattern]

    return np.concatenate(chosen, axis=-1)

In [9]:
class ActorCriticNetwork(tf.keras.Model):
    """Convolutional feature extractor shared by a policy head and a value head.

    The extractor is three Conv2D+ReLU stages, a Flatten, and a Dense+ReLU
    embedding of ``hparas['hidden_size']`` units. The actor head is a softmax
    Dense layer of ``hparas['action_dim']`` units; the critic head is a single
    linear unit.
    """

    def __init__(self, hparas):
        super().__init__()

        # (filters, kernel_size, strides) of each convolution, in order.
        conv_specs = [(32, 8, 4), (64, 4, 2), (64, 3, 1)]

        trunk = []
        # Convolutional Layers
        for n_filters, k_size, stride in conv_specs:
            trunk.append(tf.keras.layers.Conv2D(filters=n_filters, kernel_size=k_size, strides=stride))
            trunk.append(tf.keras.layers.ReLU())
        # Embedding Layers
        trunk.append(tf.keras.layers.Flatten())
        trunk.append(tf.keras.layers.Dense(hparas['hidden_size']))
        trunk.append(tf.keras.layers.ReLU())

        self.feature_extractor = tf.keras.Sequential(trunk)

        # Actor Network
        self.actor = tf.keras.layers.Dense(hparas['action_dim'], activation='softmax')
        # Critic Network
        self.critic = tf.keras.layers.Dense(1, activation = None)

    def call(self, input):
        """Return ``(action_probs, value)`` for a batch of stacked-frame states.

        The first output has already passed through the actor's softmax, so it
        holds probabilities rather than raw logits.
        """
        features = self.feature_extractor(input)
        action_probs = self.actor(features)
        state_value = self.critic(features)
        return action_probs, state_value

In [15]:
class Agent():
    """PPO agent holding the actor-critic network and a single Adam optimizer."""

    def __init__(self, hparas):
        self.gamma = hparas['gamma']
        self.optimizer = tf.keras.optimizers.Adam(learning_rate=hparas['lr'])
        self.actor_critic = ActorCriticNetwork(hparas)
        self.clip_pram = hparas['clip_val']

    def ppo_iter(self, mini_batch_size, states, actions, log_probs, returns, advantage):
        """Yield random mini-batches gathered from the rollout tensors.

        ``states.shape[0] // mini_batch_size`` batches are produced. Indices
        are drawn with ``np.random.randint``, i.e. with replacement, so a
        sample may appear more than once or not at all.

        Yields:
            Tuple ``(states, actions, log_probs, returns, advantage)``, each
            gathered at the same random indices.
        """
        batch_size = states.shape[0]
        rollout = (states, actions, log_probs, returns, advantage)
        n_batches = batch_size // mini_batch_size

        for _ in range(n_batches):
            picks = np.random.randint(0, batch_size, mini_batch_size)
            rand_ids = tf.convert_to_tensor(picks, dtype=tf.int32)
            yield tuple(tf.gather(field, rand_ids) for field in rollout)

    def ppo_update(self, ppo_epochs, mini_batch_size, states, actions, log_probs, discount_rewards, advantages):
        """Run ``ppo_epochs`` passes of clipped-surrogate PPO updates.

        Per mini-batch the actor loss is the negated mean clipped surrogate
        minus ``0.1`` times the mean policy entropy; the critic loss is
        ``0.5`` times the mean squared difference between the return targets
        and the predicted values. Both are summed and applied with the one
        shared optimizer.

        Returns:
            ``(total_actor_loss, total_critic_loss)``: the losses summed (not
            averaged) over every mini-batch step taken.
        """
        clip_low = 1.0 - self.clip_pram
        clip_high = 1.0 + self.clip_pram
        network = self.actor_critic

        actor_loss_sum = 0
        critic_loss_sum = 0

        for _ in range(ppo_epochs):
            batches = self.ppo_iter(mini_batch_size, states, actions, log_probs, discount_rewards, advantages)
            for obs, taken, old_log_probs, target, adv in batches:
                # Give the targets a trailing axis so they line up with the
                # critic output.
                target = tf.expand_dims(target, axis=-1)

                with tf.GradientTape() as tape:
                    probs, value = network(obs, training=True)
                    policy = tfp.distributions.Categorical(probs=probs, dtype=tf.float32)
                    entropy = tf.math.reduce_mean(policy.entropy())

                    # PPO ratio between the current and the behaviour policy
                    ratio = tf.math.exp(policy.log_prob(taken) - old_log_probs)
                    unclipped = ratio * adv
                    clipped = tf.clip_by_value(ratio, clip_low, clip_high) * adv
                    surrogate = tf.math.reduce_mean(tf.math.minimum(unclipped, clipped))

                    actor_loss = tf.math.negative(surrogate) - 0.1 * entropy
                    critic_loss = 0.5 * tf.math.reduce_mean(tf.math.squared_difference(target, value))

                    total_loss = actor_loss + critic_loss

                # single optimizer for both heads and the shared extractor
                variables = network.trainable_variables
                grads = tape.gradient(total_loss, variables)
                self.optimizer.apply_gradients(zip(grads, variables))

                actor_loss_sum += actor_loss
                critic_loss_sum += critic_loss

        return actor_loss_sum, critic_loss_sum

In [16]:
# https://arxiv.org/pdf/1506.02438.pdf
# Equation 16
def compute_gae(rewards, masks, values, gamma, LAMBDA):
    """Compute GAE-based return targets for one rollout.

    Walking backwards through time, the TD error
    ``r_t + gamma * V_{t+1} * mask_t - V_t`` is accumulated into a running
    advantage discounted by ``gamma * LAMBDA * mask_t``; the target for step
    ``t`` is that advantage plus ``V_t``.

    Args:
        rewards: Per-step rewards.
        masks: Per-step continuation flags (0 cuts the bootstrap and the
            accumulated advantage at that step).
        values: Value estimates; indexed up to ``len(rewards)``, so it needs
            one more entry than ``rewards``.
        gamma: Discount factor.
        LAMBDA: GAE lambda.

    Returns:
        List of return targets, one per step, in time order.
    """
    horizon = len(rewards)
    returns = [None] * horizon
    running_gae = 0

    for t in range(horizon - 1, -1, -1):
        td_error = rewards[t] + gamma * values[t + 1] * masks[t] - values[t]
        running_gae = td_error + gamma * LAMBDA * masks[t] * running_gae
        returns[t] = running_gae + values[t]

    return returns

# Testing Environment

In [17]:
def test_reward(test_env, agent):
    """Play one evaluation game greedily and return its total reward.

    At every step the action with the highest actor probability is taken
    (no sampling). The game runs until ``test_env.game_over()`` is true.

    Args:
        test_env: PLE environment used for evaluation; it is reset first.
        agent: Agent whose ``actor_critic`` network selects the actions.

    Returns:
        Sum of the rewards returned by ``test_env.act`` over the game.
    """
    total_reward = 0
    # Reset the environment
    test_env.reset_game()
    input_frames = [preprocess_screen(test_env.getScreenGrayscale())]

    while not test_env.game_over():

        state = frames_to_state(input_frames)
        state = tf.expand_dims(state, axis=0)
        prob, _ = agent.actor_critic(state)

        # Greedy action: index of the largest probability.
        action = np.argmax(prob[0].numpy())
        reward = test_env.act(test_env.getActionSet()[action])
        total_reward += reward

        input_frames.append(preprocess_screen(test_env.getScreenGrayscale()))
        # frames_to_state only reads the last four frames once four exist, so
        # drop older ones; otherwise the history grows for as long as the
        # game lasts. Shorter histories are left untouched by this slice.
        del input_frames[:-4]

    return total_reward

# Training

In [18]:
# Build the agent and the schedule constants used by the training loop.
agent = Agent(hparas)
max_episode = hparas['max_episode']
test_per_n_episode = 10  # evaluate with test_reward every this many iterations
force_save_per_n_episode = 1000  # unconditionally save model, checkpoint and demo video at this interval
early_stop_reward = 10  # training stops once best_reward reaches this (hparas['target_reward'] is not used)

start_s = 0  # NOTE: not read by the training loop below
best_reward = -5.0  # best evaluation reward so far; a test must exceed it to trigger a save

# Checkpoint tracking both the network weights and the optimizer state.
checkpoint = tf.train.Checkpoint(
    actor_critic = agent.actor_critic,
    optimizer = agent.optimizer,
)

# Load from old checkpoint
# checkpoint.restore('ckpt_dir/ckpt-?')

In [19]:
# Bookkeeping; ep_reward and total_avgr are not written to in this cell.
ep_reward = []
total_avgr = []
early_stop = False
avg_rewards_list = []  # one entry per evaluation round

env.reset_game()

# NOTE: each "episode" s is one fixed-length rollout of hparas['num_steps']
# environment steps followed by a PPO update. The game is reset inside the
# rollout whenever it ends, so one rollout may span several games.
for s in range(0, max_episode):
    if early_stop == True:
        break

    # Rollout buffers, rebuilt for every iteration.
    rewards = []
    states = []
    actions = []
    log_probs = []
    masks = []
    values = []

    display_frames = [env.getScreenRGB()]
    input_frames = [preprocess_screen(env.getScreenGrayscale())]

    for step in range(hparas['num_steps']):

        state = frames_to_state(input_frames)
        state = tf.expand_dims(state, axis=0)
        prob, value = agent.actor_critic(state)

        # Sample an action from the current policy and keep its log-probability
        # as the "old" log-prob for the PPO ratio.
        dist = tfp.distributions.Categorical(probs=prob[0], dtype=tf.float32)
        action = dist.sample(1)
        log_prob = dist.log_prob(action)

        # sample(1) yields a one-element tensor; index it explicitly, because
        # calling int() on an array with ndim > 0 is deprecated in NumPy.
        action_index = int(action.numpy()[0])
        reward = env.act(env.getActionSet()[action_index])

        done = env.game_over()

        states.append(state)
        actions.append(action)
        values.append(value[0])
        log_probs.append(log_prob)
        rewards.append(tf.convert_to_tensor(reward, dtype=tf.float32))
        # mask is 0 on the step that ended the game, 1 otherwise
        masks.append(tf.convert_to_tensor(1-int(done), dtype=tf.float32))

        display_frames.append(env.getScreenRGB())
        input_frames.append(preprocess_screen(env.getScreenGrayscale()))

        if done:
            env.reset_game()
            input_frames = [preprocess_screen(env.getScreenGrayscale())]

    # Bootstrap value for GAE: evaluate the state reached AFTER the last
    # action. `state` still holds the observation that was fed to the policy
    # before that action, so it must be rebuilt from the updated frame list.
    # (If the last step ended the game, its mask of 0 discards this value.)
    next_state = tf.expand_dims(frames_to_state(input_frames), axis=0)
    _, next_value = agent.actor_critic(next_state)
    values.append(next_value[0])

    returns = compute_gae(rewards, masks, values, hparas['gamma'], hparas['lambda'])

    returns = tf.concat(returns, axis=0)
    log_probs = tf.concat(log_probs, axis=0)
    values = tf.concat(values, axis=0)
    states = tf.concat(states, axis=0)
    actions = tf.concat(actions, axis=0)
    # values holds one extra (bootstrap) entry, hence the [:-1]
    advantage = returns - values[:-1]

    a_loss, c_loss = agent.ppo_update(hparas['ppo_epochs'], hparas['mini_batch_size'], states, actions, log_probs, returns, advantage)
    print('[Episode %d]  Actor loss: %.5f, Critic loss: %.5f' % (s, a_loss, c_loss))

    if s % test_per_n_episode == 0:
        # test agent hparas['test_epochs'] times to get the average reward
        avg_reward = np.mean([test_reward(test_env, agent) for _ in range(hparas['test_epochs'])])
        # best_reward is printed before it is updated below
        print("Test average reward is %.1f, Current best average reward is %.1f\n" % (avg_reward, best_reward))
        avg_rewards_list.append(avg_reward)

        if avg_reward > best_reward:
            best_reward = avg_reward
            # Save in .h5 format
            agent.actor_critic.save('./save/Actor/model_actor_{}_{}.h5'.format(s, avg_reward))
            checkpoint.save(file_prefix = './save/checkpoints/ckpt')

    # avg_reward is always defined here as long as force_save_per_n_episode is
    # a multiple of test_per_n_episode (the test branch above ran first).
    if s % force_save_per_n_episode == 0:
        # Save in .h5 format
        agent.actor_critic.save('./save/Actor/model_actor_{}_{}.h5'.format(s, avg_reward))
        checkpoint.save(file_prefix = './save/checkpoints/ckpt')
        clip = make_anim(display_frames, fps=60, true_image=True).rotate(-90)
        clip.write_videofile("movie_f/{}_demo-{}.webm".format('Lab15', s), fps=60)
        display(clip.ipython_display(fps=60, autoplay=1, loop=1, maxduration=120))

    if best_reward >= early_stop_reward:
        early_stop = True

  reward = env.act(env.getActionSet()[int(action.numpy())])



[Episode 0]  Actor loss: 88.59031, Critic loss: 63.71158




Test average reward is -5.0, Current best average reward is -5.0

Moviepy - Building video movie_f/Lab15_demo-0.webm.
Moviepy - Writing video movie_f/Lab15_demo-0.webm





Moviepy - Done !
Moviepy - video ready movie_f/Lab15_demo-0.webm
Moviepy - Building video __temp__.mp4.
Moviepy - Writing video __temp__.mp4



                                                               

Moviepy - Done !
Moviepy - video ready __temp__.mp4




[Episode 1]  Actor loss: 54.54134, Critic loss: 37.22117
[Episode 2]  Actor loss: 36.37086, Critic loss: 21.74653
[Episode 3]  Actor loss: 29.47977, Critic loss: 12.18717
[Episode 4]  Actor loss: 24.71905, Critic loss: 9.44314
[Episode 5]  Actor loss: 3.43527, Critic loss: 5.31724
[Episode 6]  Actor loss: 8.17182, Critic loss: 5.84671
[Episode 7]  Actor loss: -3.89057, Critic loss: 3.43496
[Episode 8]  Actor loss: -4.60115, Critic loss: 3.10960
[Episode 9]  Actor loss: -8.16456, Critic loss: 2.98535
[Episode 10]  Actor loss: -9.01003, Critic loss: 2.50381
Test average reward is -5.0, Current best average reward is -5.0

[Episode 11]  Actor loss: 4.32775, Critic loss: 3.18325
[Episode 12]  Actor loss: 0.60741, Critic loss: 3.34672
[Episode 13]  Actor loss: -2.53377, Critic loss: 2.61472
[Episode 14]  Actor loss: -3.77342, Critic loss: 2.39350
[Episode 15]  Actor loss: 0.75823, Critic loss: 3.32172
[Episode 16]  Actor loss: 1.58531, Critic loss: 2.88921
[Episode 17]  Actor loss: -3.43866



Test average reward is -4.0, Current best average reward is -5.0

[Episode 221]  Actor loss: -4.41581, Critic loss: 0.36187
[Episode 222]  Actor loss: -5.83302, Critic loss: 0.36537
[Episode 223]  Actor loss: -1.82721, Critic loss: 0.32618
[Episode 224]  Actor loss: -7.40614, Critic loss: 0.44454
[Episode 225]  Actor loss: -4.84624, Critic loss: 0.37156
[Episode 226]  Actor loss: 2.01165, Critic loss: 0.51657
[Episode 227]  Actor loss: -4.87960, Critic loss: 0.31853
[Episode 228]  Actor loss: -4.06621, Critic loss: 0.36971
[Episode 229]  Actor loss: -1.47981, Critic loss: 0.39603
[Episode 230]  Actor loss: -4.85555, Critic loss: 0.26750
Test average reward is -5.0, Current best average reward is -4.0

[Episode 231]  Actor loss: -7.29669, Critic loss: 0.26893
[Episode 232]  Actor loss: -7.70916, Critic loss: 0.36920
[Episode 233]  Actor loss: -2.15687, Critic loss: 0.39655
[Episode 234]  Actor loss: -4.65200, Critic loss: 0.35460
[Episode 235]  Actor loss: -5.37668, Critic loss: 0.43368



Test average reward is -5.0, Current best average reward is -4.0

Moviepy - Building video movie_f/Lab15_demo-1000.webm.
Moviepy - Writing video movie_f/Lab15_demo-1000.webm





Moviepy - Done !
Moviepy - video ready movie_f/Lab15_demo-1000.webm
Moviepy - Building video __temp__.mp4.
Moviepy - Writing video __temp__.mp4



                                                               

Moviepy - Done !
Moviepy - video ready __temp__.mp4




[Episode 1001]  Actor loss: -10.08454, Critic loss: 0.96257
[Episode 1002]  Actor loss: 1.49562, Critic loss: 0.48414
[Episode 1003]  Actor loss: -5.71235, Critic loss: 0.38421
[Episode 1004]  Actor loss: -1.77563, Critic loss: 0.48882
[Episode 1005]  Actor loss: -10.88589, Critic loss: 0.90417
[Episode 1006]  Actor loss: -4.62421, Critic loss: 0.68384
[Episode 1007]  Actor loss: -12.19605, Critic loss: 0.53510
[Episode 1008]  Actor loss: -5.48314, Critic loss: 1.06121
[Episode 1009]  Actor loss: -5.08428, Critic loss: 0.72133
[Episode 1010]  Actor loss: -4.96493, Critic loss: 0.64110
Test average reward is -5.0, Current best average reward is -4.0

[Episode 1011]  Actor loss: 0.62171, Critic loss: 0.84987
[Episode 1012]  Actor loss: -2.43919, Critic loss: 0.74993
[Episode 1013]  Actor loss: -23.57061, Critic loss: 1.58566
[Episode 1014]  Actor loss: -1.85260, Critic loss: 0.53457
[Episode 1015]  Actor loss: -5.03672, Critic loss: 0.75351
[Episode 1016]  Actor loss: -9.21341, Critic lo



Test average reward is -3.0, Current best average reward is -4.0

[Episode 1621]  Actor loss: -18.84017, Critic loss: 1.86936
[Episode 1622]  Actor loss: -7.62591, Critic loss: 1.69861
[Episode 1623]  Actor loss: -15.06760, Critic loss: 1.82158
[Episode 1624]  Actor loss: 1.49162, Critic loss: 2.98510
[Episode 1625]  Actor loss: -1.23780, Critic loss: 1.38625
[Episode 1626]  Actor loss: 4.22358, Critic loss: 1.43078
[Episode 1627]  Actor loss: -1.99085, Critic loss: 1.32902
[Episode 1628]  Actor loss: -4.46984, Critic loss: 1.46639
[Episode 1629]  Actor loss: -13.95982, Critic loss: 1.94789
[Episode 1630]  Actor loss: -13.02701, Critic loss: 1.51164
Test average reward is -5.0, Current best average reward is -3.0

[Episode 1631]  Actor loss: -13.37899, Critic loss: 4.40960
[Episode 1632]  Actor loss: -11.55783, Critic loss: 1.55902
[Episode 1633]  Actor loss: -14.01550, Critic loss: 2.57836
[Episode 1634]  Actor loss: -28.01480, Critic loss: 2.91211
[Episode 1635]  Actor loss: -11.1453



Test average reward is -5.0, Current best average reward is -3.0

Moviepy - Building video movie_f/Lab15_demo-2000.webm.
Moviepy - Writing video movie_f/Lab15_demo-2000.webm





Moviepy - Done !
Moviepy - video ready movie_f/Lab15_demo-2000.webm
Moviepy - Building video __temp__.mp4.
Moviepy - Writing video __temp__.mp4



                                                               

Moviepy - Done !
Moviepy - video ready __temp__.mp4




[Episode 2001]  Actor loss: -30.77071, Critic loss: 6.25019
[Episode 2002]  Actor loss: -25.57653, Critic loss: 3.48777
[Episode 2003]  Actor loss: -14.60817, Critic loss: 3.92268
[Episode 2004]  Actor loss: -3.51185, Critic loss: 5.86628
[Episode 2005]  Actor loss: -3.24352, Critic loss: 3.97975
[Episode 2006]  Actor loss: -17.46392, Critic loss: 2.07407
[Episode 2007]  Actor loss: -15.80979, Critic loss: 5.58154
[Episode 2008]  Actor loss: 0.58981, Critic loss: 3.14569
[Episode 2009]  Actor loss: -10.11289, Critic loss: 5.80809
[Episode 2010]  Actor loss: -1.31197, Critic loss: 3.09429




Test average reward is 9.0, Current best average reward is -3.0

[Episode 2011]  Actor loss: -6.57395, Critic loss: 2.35256
[Episode 2012]  Actor loss: -0.51636, Critic loss: 2.88561
[Episode 2013]  Actor loss: -11.07585, Critic loss: 2.60630
[Episode 2014]  Actor loss: -16.50791, Critic loss: 2.95171
[Episode 2015]  Actor loss: -10.42778, Critic loss: 2.65472
[Episode 2016]  Actor loss: 5.63255, Critic loss: 3.60852
[Episode 2017]  Actor loss: -9.09197, Critic loss: 3.05903
[Episode 2018]  Actor loss: -7.41296, Critic loss: 5.08758
[Episode 2019]  Actor loss: -9.31607, Critic loss: 9.63489
[Episode 2020]  Actor loss: -6.93498, Critic loss: 9.52299
Test average reward is -5.0, Current best average reward is 9.0

[Episode 2021]  Actor loss: 0.44549, Critic loss: 5.64021
[Episode 2022]  Actor loss: -8.96248, Critic loss: 4.04467
[Episode 2023]  Actor loss: -3.41969, Critic loss: 2.29340
[Episode 2024]  Actor loss: -19.29981, Critic loss: 6.13802
[Episode 2025]  Actor loss: -29.90871, Cri



Test average reward is -5.0, Current best average reward is 9.0

Moviepy - Building video movie_f/Lab15_demo-3000.webm.
Moviepy - Writing video movie_f/Lab15_demo-3000.webm





Moviepy - Done !
Moviepy - video ready movie_f/Lab15_demo-3000.webm
Moviepy - Building video __temp__.mp4.
Moviepy - Writing video __temp__.mp4



                                                               

Moviepy - Done !
Moviepy - video ready __temp__.mp4




[Episode 3001]  Actor loss: -5.72690, Critic loss: 3.88176
[Episode 3002]  Actor loss: -46.03850, Critic loss: 5.91017
[Episode 3003]  Actor loss: -24.23717, Critic loss: 12.56866
[Episode 3004]  Actor loss: -3.22769, Critic loss: 6.88348
[Episode 3005]  Actor loss: 6.47982, Critic loss: 8.99111
[Episode 3006]  Actor loss: -13.44020, Critic loss: 5.74513
[Episode 3007]  Actor loss: -15.36291, Critic loss: 3.67840
[Episode 3008]  Actor loss: -18.51128, Critic loss: 5.43515
[Episode 3009]  Actor loss: -14.77660, Critic loss: 7.40957
[Episode 3010]  Actor loss: -2.94855, Critic loss: 9.13127
Test average reward is -4.0, Current best average reward is 9.0

[Episode 3011]  Actor loss: 7.28199, Critic loss: 8.20149
[Episode 3012]  Actor loss: 5.21091, Critic loss: 7.13374
[Episode 3013]  Actor loss: -5.41874, Critic loss: 13.25968
[Episode 3014]  Actor loss: -24.20827, Critic loss: 6.43921
[Episode 3015]  Actor loss: -19.47849, Critic loss: 10.12021
[Episode 3016]  Actor loss: -33.50911, Cri



Test average reward is 13.0, Current best average reward is 9.0



# Report:

In this implementation, the design of the Actor-Critic network effectively integrates convolutional neural network (CNN) feature extraction with PPO (Proximal Policy Optimization) policy optimization. In the network architecture, high-level features are extracted from input data through multiple convolutional layers and nonlinear activation functions. These features are then further mapped using fully connected layers, finally splitting into two branches: the **Actor** outputs the action probability distribution, while the **Critic** outputs the state value function. This design effectively separates policy learning (Actor) from value evaluation (Critic), allowing the network to learn both action policies and state values simultaneously, thereby improving learning efficiency.

The PPO update process utilizes the **importance sampling ratio** $r$ to measure the difference between the new and old policies. Through a "clipping" operation, the update magnitude of the policy is controlled. Specifically, with `tf.clip_by_value`, the ratio $r$ is constrained between $1 - \epsilon$ and $1 + \epsilon$ (here $\epsilon = 0.2$, the `clip_val` hyperparameter), preventing the policy from changing too drastically in a single update. This is the core idea of PPO, which stabilizes the training process and accelerates convergence.

Regarding the loss function, the Actor loss incorporates a combination of policy loss and weighted policy entropy. The policy entropy encourages exploratory behavior, preventing the policy from falling into local optima. The Critic loss, on the other hand, is based on the **Mean Squared Error (MSE)**, measuring the difference between the estimated value function and the actual return. This helps guide the value network to learn more accurate evaluations.

I used the CPU to run a total of 3580 episodes to reach the 10-point threshold, which took about 20 hours.