In [4]:
import gym
import tensorflow as tf
import numpy as np

env = gym.make('Breakout-v0')

epsilon = 1.0
epsilon_min = 0.1
epsilon_decay = 0.999
gamma = 0.99
learning_rate = 0.001
memory_size = 1000000
batch_size = 32
update_target_network_freq = 10000
num_episodes = 2

input_shape = env.observation_space.shape
output_shape = env.action_space.n
model = tf.keras.Sequential([
    tf.keras.layers.Conv2D(32, 8, strides=4, activation='relu', input_shape=input_shape),
    tf.keras.layers.Conv2D(64, 4, strides=2, activation='relu'),
    tf.keras.layers.Conv2D(64, 3, strides=1, activation='relu'),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(512, activation='relu'),
    tf.keras.layers.Dense(output_shape, activation='linear')
])
model.compile(optimizer=tf.keras.optimizers.Adam(lr=learning_rate), loss='mse')

target_model = tf.keras.models.clone_model(model)
target_model.set_weights(model.get_weights())

memory = []

total_reward = 0
steps = 0
episode = 0

while episode < num_episodes:
    state = env.reset()
    done = False
    while not done:
        if np.random.rand() < epsilon:
            action = env.action_space.sample()
        else:
            q_values = model.predict(np.expand_dims(state, axis=0))
            action = np.argmax(q_values)

        next_state, reward, done, _ = env.step(action)

        memory.append((state, action, reward, next_state, done))
        if len(memory) > memory_size:
            memory.pop(0)

        if len(memory) >= batch_size:
            minibatch = np.array(memory)[np.random.choice(len(memory), batch_size, replace=False)]
            states = np.array([transition[0] for transition in minibatch])
            actions = np.array([transition[1] for transition in minibatch])
            rewards = np.array([transition[2] for transition in minibatch])
            next_states = np.array([transition[3] for transition in minibatch])
            terminals = np.array([transition[4] for transition in minibatch])
            q_values = model.predict(states)
            next_q_values = target_model.predict(next_states)
            q_values[np.arange(len(actions)), actions] = rewards + gamma * np.max(next_q_values, axis=1) * (1 - terminals)
            model.train_on_batch(states, q_values)

        if steps % update_target_network_freq == 0:
            target_model.set_weights(model.get_weights())

        total_reward += reward
        state = next_state
        steps += 1

    if epsilon > epsilon_min:
        epsilon *= epsilon_decay

    episode += 1
    print(f'Episode {episode}, total reward: {total_reward}, epsilon: {epsilon}')

    total_reward = 0
    steps = 0

env.close()





  minibatch = np.array(memory)[np.random.choice(len(memory), batch_size, replace=False)]








Episode 1, total reward: 2.0, epsilon: 0.999




Episode 2, total reward: 0.0, epsilon: 0.998001
