In [1]:
import gym
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from keras.layers import LeakyReLU


In [2]:
# --- Configuration -----------------------------------------------------------
seed = 42                     # single seed shared by every RNG used in the notebook
gamma = 0.99                  # discount factor applied to future rewards
max_steps_per_episode = 1000  # upper bound on agent steps per episode

# Seed every source of randomness, not only the environment: NumPy drives the
# action sampling done with np.random.choice and TensorFlow drives the weight
# initialisation of the network.
np.random.seed(seed)
tf.random.set_seed(seed)

# --- Environment -------------------------------------------------------------
# The preprocessing wrapper performs the frame skipping (frame_skip=4) and
# returns 84x84 single-channel observations scaled to [0, 1].
env = gym.make("AssaultNoFrameskip-v0")
env = gym.wrappers.AtariPreprocessing(
    env,
    noop_max=30,
    frame_skip=4,
    screen_size=84,
    terminal_on_life_loss=False,
    grayscale_obs=True,
    grayscale_newaxis=True,
    scale_obs=True,
)

env.seed(seed)
# The action space keeps its own RNG for env.action_space.sample().
env.action_space.seed(seed)

# Smallest float32 increment; used to avoid dividing by zero when normalising.
eps = np.finfo(np.float32).eps.item()

In [3]:
# Report the action and observation spaces of the wrapped environment.
for template, space in (
    ("Action Space: {}", env.action_space),
    ("State space: {}", env.observation_space),
):
    print(template.format(space))

Action Space: Discrete(7)
State space: Box([[[0.]
  [0.]
  [0.]
  ...
  [0.]
  [0.]
  [0.]]

 [[0.]
  [0.]
  [0.]
  ...
  [0.]
  [0.]
  [0.]]

 [[0.]
  [0.]
  [0.]
  ...
  [0.]
  [0.]
  [0.]]

 ...

 [[0.]
  [0.]
  [0.]
  ...
  [0.]
  [0.]
  [0.]]

 [[0.]
  [0.]
  [0.]
  ...
  [0.]
  [0.]
  [0.]]

 [[0.]
  [0.]
  [0.]
  ...
  [0.]
  [0.]
  [0.]]], [[[1.]
  [1.]
  [1.]
  ...
  [1.]
  [1.]
  [1.]]

 [[1.]
  [1.]
  [1.]
  ...
  [1.]
  [1.]
  [1.]]

 [[1.]
  [1.]
  [1.]
  ...
  [1.]
  [1.]
  [1.]]

 ...

 [[1.]
  [1.]
  [1.]
  ...
  [1.]
  [1.]
  [1.]]

 [[1.]
  [1.]
  [1.]
  ...
  [1.]
  [1.]
  [1.]]

 [[1.]
  [1.]
  [1.]
  ...
  [1.]
  [1.]
  [1.]]], (84, 84, 1), float32)


In [4]:
# Number of discrete actions; this also sets the width of the policy head.
env.action_space.n


7

In [5]:
# Shape of one preprocessed observation; used as the network's input shape.
env.observation_space.shape

(84, 84, 1)

In [6]:
from time import sleep

# Baseline: play one rendered episode with randomly sampled actions and report
# the total (undiscounted) reward that was collected.
state, done = env.reset(), False
rewards = epochs = 0

while True:
    env.render()
    action = env.action_space.sample()
    state, reward, done, info = env.step(action)
    epochs += 1
    rewards += reward
    if done:
        break

env.close()
print(f"Total reward: { rewards}")



Total reward: 189.0


In [7]:
from tensorflow.keras.layers import Input, Flatten, MaxPooling2D,BatchNormalization

# Actor-critic network: a shared convolutional trunk followed by two dense
# layers, ending in two heads:
#   * action - softmax over the env.action_space.n discrete actions (policy)
#   * critic - single linear unit (state-value estimate)
inputs = layers.Input(shape=env.observation_space.shape)

# Each convolution gets a ReLU: without a non-linearity between them, two
# stacked convolutions are equivalent to one linear filter.
convd = layers.Conv2D(16, kernel_size=(3, 3), activation="relu")(inputs)
convd1 = layers.Conv2D(32, kernel_size=(3, 3), activation="relu")(convd)
pool = layers.MaxPooling2D(pool_size=(2, 2))(convd1)
batch = layers.BatchNormalization()(pool)
flatten = layers.Flatten()(batch)

common = layers.Dense(128, activation="relu")(flatten)
common2 = layers.Dense(128, activation="relu")(common)

action = layers.Dense(env.action_space.n, activation="softmax")(common2)
critic = layers.Dense(1)(common2)

model = keras.Model(inputs=inputs, outputs=[action, critic])
model.summary()

# A step size of 0.1 is two orders of magnitude above Adam's default and makes
# the updates of this network unstable; use the Keras default of 1e-3 instead.
optimizer = keras.optimizers.Adam(learning_rate=1e-3)
huber_loss = keras.losses.Huber()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 84, 84, 1)]  0                                            
__________________________________________________________________________________________________
conv2d (Conv2D)                 (None, 82, 82, 16)   160         input_1[0][0]                    
__________________________________________________________________________________________________
conv2d_1 (Conv2D)               (None, 80, 80, 32)   4640        conv2d[0][0]                     
__________________________________________________________________________________________________
max_pooling2d (MaxPooling2D)    (None, 40, 40, 32)   0           conv2d_1[0][0]                   
______________________________________________________________________________________________

In [8]:
# Play one episode by sampling actions from the policy head of the model, with
# no training involved, and report the total reward.
state, done = env.reset(), False
rewards = epochs = 0

while not done:
    # The model is called on a batch, so add a leading axis to the observation.
    batched_state = tf.expand_dims(tf.convert_to_tensor(state), 0)
    action_probs, critic_value = model(batched_state)

    # Draw an action index according to the predicted probabilities.
    policy = np.squeeze(action_probs)
    action = np.random.choice(env.action_space.n, p=policy)

    state, reward, done, info = env.step(action)
    epochs += 1
    rewards += reward

env.close()
print(f"Total reward: { rewards}")

Total reward: 336.0


In [9]:
def discounted_returns(rewards, gamma):
    """Compute the discounted return of every step of one episode.

    For each index ``t`` the result holds
    ``rewards[t] + gamma * rewards[t + 1] + gamma**2 * rewards[t + 2] + ...``.
    The values are accumulated in one backward pass, so the cost is linear in
    the episode length.

    Args:
        rewards: sequence of per-step rewards, in the order they were received.
        gamma: discount factor.

    Returns:
        A float64 NumPy array with the same length as ``rewards``.
    """
    returns = np.zeros(len(rewards), dtype=np.float64)
    running = 0.0
    for t in reversed(range(len(rewards))):
        running = rewards[t] + gamma * running
        returns[t] = running
    return returns


# Per-episode buffers; they are emptied after every gradient update.
action_probs_history = []  # log-probability of each action that was taken
critic_value_history = []  # critic estimate for each visited state
rewards_history = []       # reward received at each step
score_history = []         # total reward of every finished episode

average_reward = 0
episode_count = 0
log_interval = 1    # print the running average every `log_interval` episodes
solved_score = 700  # stop once the 100-episode average exceeds this
                    # (original note: "500 for now, could maybe go higher?")

while True:
    state = env.reset()
    episode_reward = 0

    # Only the forward passes and the loss construction need to be recorded.
    with tf.GradientTape() as tape:
        # Allow exactly max_steps_per_episode steps per episode.
        for timestep in range(max_steps_per_episode):
            # env.render()
            state = tf.convert_to_tensor(state)
            state = tf.expand_dims(state, 0)  # add the batch axis

            action_probs, critic_value = model(state)

            # Sample an action from the policy and remember its log-probability
            # together with the critic's estimate for this state.
            action = np.random.choice(env.action_space.n, p=np.squeeze(action_probs))
            action_probs_history.append(tf.math.log(action_probs[0, action]))
            critic_value_history.append(critic_value[0, 0])

            state, reward, done, info = env.step(action)
            rewards_history.append(reward)
            episode_reward += reward

            if done:
                break

        # Standardise the returns; eps keeps the division finite when every
        # return is identical.
        G = discounted_returns(rewards_history, gamma)
        G = (G - np.mean(G)) / (np.std(G) + eps)
        G = G.tolist()

        actor_losses = []
        critic_losses = []
        for log_prob, value, ret in zip(action_probs_history, critic_value_history, G):
            # Actor: scale the log-probability by how much better the outcome
            # was than the critic expected.
            diff = ret - value
            actor_losses.append(-log_prob * diff)
            # Critic: regress the value estimate towards the observed return.
            critic_losses.append(
                huber_loss(tf.expand_dims(value, 0), tf.expand_dims(ret, 0))
            )

        loss_value = sum(actor_losses) + sum(critic_losses)

    # Differentiate and update outside the tape context so that the backward
    # pass and the optimizer step are not recorded themselves.
    grads = tape.gradient(loss_value, model.trainable_variables)
    optimizer.apply_gradients(zip(grads, model.trainable_variables))

    action_probs_history.clear()
    critic_value_history.clear()
    rewards_history.clear()

    score_history.append(episode_reward)
    episode_count += 1

    # Running average over (at most) the last 100 episodes.
    avg_score = np.mean(score_history[-100:])
    if episode_count % log_interval == 0:
        template = "average score: {:.2f} at episode {}"
        print(template.format(avg_score, episode_count))
    if avg_score > solved_score:
        print("Solved at episode {}!".format(episode_count))
        break

average score: 273.00 at episode 1
average score: 357.00 at episode 2
average score: 378.00 at episode 3
average score: 420.00 at episode 4


KeyboardInterrupt: 