In [1]:
!pip install gymnasium
!pip install Ale

Collecting Ale
  Downloading Ale-0.8.4.tar.gz (53 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m53.4/53.4 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
[?25hBuilding wheels for collected packages: Ale
  Building wheel for Ale (setup.py) ... [?25ldone
[?25h  Created wheel for Ale: filename=Ale-0.8.4-py3-none-any.whl size=70154 sha256=e814034ac6ad6c245a6a413ec4cbedbdf84c3e0dd532968be283bd69b62dc3c3
  Stored in directory: /root/.cache/pip/wheels/4f/12/a2/6bc0ac816f390c106c73ce33b95abbf51653fb1b61c13fff0b
Successfully built Ale
Installing collected packages: Ale
Successfully installed Ale-0.8.4


In [None]:
!pip install ale-py tensorflow-gpu
import pandas as pd
import numpy as np
import random
import time
import gymnasium as gym
import ale_py
from collections import deque
from tensorflow.keras.layers import Dense, Activation, Flatten, Conv2D, MaxPooling2D
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import MeanSquaredError
import matplotlib.pyplot as plt
import matplotlib as mpl
import tensorflow as tf

# Configure TensorFlow to use GPU
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    try:
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
    except RuntimeError as e:
        print(e)

class DQN_Agent:
    def __init__(self, env, num_episodes=3000, gamma=0.99, alpha=0.001, epsilon=1.0, epsilon_min=0.1, epsilon_decay=0.995):
        self.env = env
        self.num_episodes = num_episodes
        self.gamma = gamma
        self.alpha = alpha
        self.epsilon = epsilon
        self.epsilon_min = epsilon_min
        self.epsilon_decay = epsilon_decay
        self.memory = deque(maxlen=2000)
        self.model = self._build_model()
        self.target_model = self._build_model()
        self.update_target_model()

    def _build_model(self):
        model = Sequential()
        model.add(Conv2D(32, (3, 3), activation='relu', input_shape=self.env.observation_space.shape))
        model.add(MaxPooling2D((2, 2)))
        model.add(Flatten())
        model.add(Dense(64, activation='relu'))
        model.add(Dense(self.env.action_space.n, activation='linear'))
        model.compile(optimizer=Adam(learning_rate=self.alpha), loss=MeanSquaredError())
        return model

    def update_target_model(self):
        self.target_model.set_weights(self.model.get_weights())

    def act(self, state):
        if np.random.rand() <= self.epsilon:
            return self.env.action_space.sample()
        act_values = self.model.predict(state)
        return np.argmax(act_values[0])

    def remember(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))

    def replay(self, batch_size):
        minibatch = random.sample(self.memory, batch_size)
        for state, action, reward, next_state, done in minibatch:
            target = reward
            if not done:
                target += self.gamma * np.amax(self.target_model.predict(next_state)[0])
            target_f = self.model.predict(state)
            target_f[0][action] = target
            self.model.fit(state, target_f, epochs=1, verbose=0)
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

    def load(self, name):
        self.model.load_weights(name)

    def save(self, name):
        self.model.save_weights(name)

def plot_results(episode_rewards, avg_rewards):
    plt.figure(figsize=(12, 5))
    plt.plot(range(len(episode_rewards)), episode_rewards, label='Episode Reward')
    plt.plot(range(len(avg_rewards)), avg_rewards, label='Average Reward')
    plt.xlabel('Episode')
    plt.ylabel('Reward')
    plt.title('DQN Agent Performance on MsPacMan')
    plt.legend()
    plt.show()

def main():
    env = gym.make('ALE/MsPacman-v5', frameskip=4, render_mode='rgb_array')
    agent = DQN_Agent(env, num_episodes=3000, gamma=0.99, alpha=0.001, epsilon=1.0, epsilon_min=0.1, epsilon_decay=0.995)
    episode_rewards = []
    avg_rewards = []

    for e in range(agent.num_episodes):
        state = env.reset()
        state = np.reshape(state[0], [1, 210, 160, 3])
        total_reward = 0
        done = False
        while not done:
            action = agent.act(state)
            next_state, reward, done, truncated, _ = env.step(action)
            next_state = np.reshape(next_state, [1, 210, 160, 3])
            agent.remember(state, action, reward, next_state, done)
            state = next_state
            total_reward += reward
            if done:
                agent.update_target_model()
                break

        episode_rewards.append(total_reward)
        avg_rewards.append(np.mean(episode_rewards[-100:]))
        print(f"Episode: {e+1}/{agent.num_episodes}, Score: {total_reward}, Epsilon: {agent.epsilon:.2f}")

        if len(agent.memory) > 32:
            agent.replay(32)

    plot_results(episode_rewards, avg_rewards)
    agent.save("dqn_mspacman.h5")

if __name__ == "__main__":
    main()

Collecting tensorflow-gpu
  Downloading tensorflow-gpu-2.12.0.tar.gz (2.6 kB)
  Preparing metadata (setup.py) ... [?25lerror
  [1;31merror[0m: [1msubprocess-exited-with-error[0m
  
  [31m×[0m [32mpython setup.py egg_info[0m did not run successfully.
  [31m│[0m exit code: [1;36m1[0m
  [31m╰─>[0m [31m[39 lines of output][0m
  [31m   [0m Traceback (most recent call last):
  [31m   [0m   File "/opt/conda/lib/python3.10/site-packages/setuptools/_vendor/packaging/requirements.py", line 35, in __init__
  [31m   [0m     parsed = _parse_requirement(requirement_string)
  [31m   [0m   File "/opt/conda/lib/python3.10/site-packages/setuptools/_vendor/packaging/_parser.py", line 64, in parse_requirement
  [31m   [0m     return _parse_requirement(Tokenizer(source, rules=DEFAULT_RULES))
  [31m   [0m   File "/opt/conda/lib/python3.10/site-packages/setuptools/_vendor/packaging/_parser.py", line 82, in _parse_requirement
  [31m   [0m     url, specifier, marker = _parse_requ

  File "/opt/conda/lib/python3.10/site-packages/gymnasium/envs/registration.py", line 594, in load_plugin_envs
    fn()
  File "/opt/conda/lib/python3.10/site-packages/shimmy/registration.py", line 304, in register_gymnasium_envs
    _register_atari_envs()
  File "/opt/conda/lib/python3.10/site-packages/shimmy/registration.py", line 205, in _register_atari_envs
    import ale_py
  File "/opt/conda/lib/python3.10/site-packages/ale_py/__init__.py", line 66, in <module>
    register_v0_v4_envs()
  File "/opt/conda/lib/python3.10/site-packages/ale_py/registration.py", line 176, in register_v0_v4_envs
    _register_rom_configs(legacy_games, obs_types, versions)
  File "/opt/conda/lib/python3.10/site-packages/ale_py/registration.py", line 62, in _register_rom_configs
    gymnasium.register(
AttributeError: partially initialized module 'gymnasium' has no attribute 'register' (most likely due to a circular import)
[0m
  logger.warn(f"plugin: {plugin.value} raised {traceback.format_exc()}")
20

Episode: 1/3000, Score: 170.0, Epsilon: 1.00


2024-08-07 00:44:01.612417: E external/local_xla/xla/service/gpu/buffer_comparator.cc:1137] Difference at 67128: 4.30602, expected 3.69871
2024-08-07 00:44:01.612491: E external/local_xla/xla/service/gpu/buffer_comparator.cc:1137] Difference at 68238: 4.76332, expected 4.15601
2024-08-07 00:44:01.612513: E external/local_xla/xla/service/gpu/buffer_comparator.cc:1137] Difference at 70437: 5.06801, expected 4.4607
2024-08-07 00:44:01.612523: E external/local_xla/xla/service/gpu/buffer_comparator.cc:1137] Difference at 70438: 4.85415, expected 4.24684
2024-08-07 00:44:01.612538: E external/local_xla/xla/service/gpu/buffer_comparator.cc:1137] Difference at 71778: 4.94568, expected 4.33838
2024-08-07 00:44:01.612556: E external/local_xla/xla/service/gpu/buffer_comparator.cc:1137] Difference at 73658: 5.06337, expected 4.45607
2024-08-07 00:44:01.612566: E external/local_xla/xla/service/gpu/buffer_comparator.cc:1137] Difference at 73785: 4.81123, expected 4.20392
2024-08-07 00:44:01.612590: 

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2s/step


I0000 00:00:1722991442.204894     108 device_compiler.h:186] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 220ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2