In [None]:

!pip install gymnasium[atari] ale-py opencv-python tensorflow shimmy

import os
import argparse
import time
import random
import numpy as np
import tensorflow as tf
import gymnasium as gym
from collections import deque
import cv2
import ale_py

# Register the Atari (ALE) environments with Gymnasium
gym.register_envs(ale_py)

# --- Demo settings (chosen for a quick run) ---
env_id = 'PongNoFrameskip-v4'
seed = 42
lr = 0.0001                # learning rate passed to the Adam optimizer
buffer_size = 50000        # maximum number of transitions kept in the replay buffer
batch_size = 32
warm_start = 500           # quick start: training begins once the step index reaches 500
train_freq = 4             # a training update is run every `train_freq` steps
target_q_update_freq = 200 # faster target-network update for the demo
reward_gamma = 0.99        # discount factor used in the TD target
number_timesteps = 2000    # the whole run is only 2000 frames (about 3 minutes)
clipnorm = 10.0            # gradient-norm clipping value passed to the optimizer
epsilon_start = 1.0
epsilon_end = 0.01
epsilon_decay_steps = 2000 # fast epsilon decay for the demo

# ==========================================

class FireResetEnv(gym.Wrapper):
    """Kick off each episode by pressing action 1 (FIRE) and then action 2.

    The wrapped environment must expose ``get_action_meanings()`` on its
    unwrapped env, with ``'FIRE'`` at index 1 and at least three actions.
    """

    def __init__(self, env):
        super().__init__(env)
        action_meanings = env.unwrapped.get_action_meanings()
        assert action_meanings[1] == 'FIRE'
        assert len(action_meanings) >= 3

    def reset(self, **kwargs):
        """Reset the env, then step actions 1 and 2 before handing control back.

        Returns:
            ``(obs, info)`` describing the state the agent will act from.
            If one of the kick-off steps ends the episode, the environment is
            reset again and the observation/info of that fresh reset is used,
            so the caller never receives a frame from an already-finished
            episode.
        """
        obs, info = self.env.reset(**kwargs)
        for kickoff_action in (1, 2):
            obs, _, terminated, truncated, info = self.env.step(kickoff_action)
            if terminated or truncated:
                # Keep the post-reset observation rather than the stale one
                # returned by the step that ended the episode.
                obs, info = self.env.reset(**kwargs)
        return obs, info

class ProcessFrame84(gym.ObservationWrapper):
    """Observation wrapper that turns RGB frames into 84x84x1 uint8 grayscale."""

    def __init__(self, env):
        super().__init__(env)
        self.observation_space = gym.spaces.Box(
            low=0, high=255, shape=(84, 84, 1), dtype=np.uint8
        )

    def observation(self, obs):
        """Apply :meth:`process` to every observation coming out of the env."""
        return ProcessFrame84.process(obs)

    @staticmethod
    def process(frame):
        """Convert a 210x160x3 or 250x160x3 frame to an (84, 84, 1) uint8 image.

        Frames whose element count matches neither layout are returned
        unchanged.
        """
        # Map the total element count of a supported frame to its height.
        height_by_size = {210 * 160 * 3: 210, 250 * 160 * 3: 250}
        height = height_by_size.get(frame.size)
        if height is None:
            return frame

        rgb = np.reshape(frame, [height, 160, 3]).astype(np.float32)
        red, green, blue = rgb[:, :, 0], rgb[:, :, 1], rgb[:, :, 2]
        # Weighted sum of the colour channels gives the grayscale image.
        gray = red * 0.299 + green * 0.587 + blue * 0.114
        shrunk = cv2.resize(gray, (84, 84), interpolation=cv2.INTER_AREA)
        return np.reshape(shrunk, [84, 84, 1]).astype(np.uint8)

class FrameStack(gym.Wrapper):
    """Stack the last ``k`` observations along the channel (third) axis."""

    def __init__(self, env, k):
        super().__init__(env)
        self.k = k
        # Bounded deque: appending a new frame drops the oldest one.
        self.frames = deque([], maxlen=k)
        base_space = env.observation_space
        base_shape = base_space.shape
        stacked_shape = (*base_shape[:2], base_shape[2] * k)
        self.observation_space = gym.spaces.Box(
            low=0,
            high=255,
            shape=stacked_shape,
            dtype=base_space.dtype,
        )

    def reset(self, **kwargs):
        """Reset the env and fill the whole stack with the first observation."""
        first_ob, info = self.env.reset(**kwargs)
        self.frames.extend([first_ob] * self.k)
        return self._get_ob(), info

    def step(self, action):
        """Step the env, push the new frame and return the stacked observation."""
        new_ob, reward, terminated, truncated, info = self.env.step(action)
        self.frames.append(new_ob)
        stacked = self._get_ob()
        return stacked, reward, terminated, truncated, info

    def _get_ob(self):
        """Concatenate the stored frames along axis 2 into one array."""
        assert len(self.frames) == self.k
        return np.concatenate(tuple(self.frames), axis=2)

# Report whether TensorFlow can see a GPU, framed by two separator lines.
print("=" * 40)
gpus = tf.config.list_physical_devices('GPU')
print(
    f"✅ GPU DETECTED: {gpus[0].name}"
    if gpus
    else "❌ GPU NOT FOUND. Please enable T4 GPU in Runtime settings."
)
print("=" * 40)

def build_env(env_id, seed=0):
    """Create the environment and wrap it for DQN training.

    Wrapper order (innermost first): episode statistics recording,
    FIRE-on-reset, 84x84 grayscale preprocessing, 4-frame stacking.

    Args:
        env_id: Gymnasium environment id passed to ``gym.make``.
        seed: Seed applied to both the action space and the environment's
            own random number generator.

    Returns:
        The fully wrapped environment. It has already been reset once (to
        apply the seed); callers should still call ``reset()`` to obtain
        their first observation.
    """
    env = gym.make(env_id, render_mode='rgb_array')
    env = gym.wrappers.RecordEpisodeStatistics(env)
    env = FireResetEnv(env)
    env = ProcessFrame84(env)
    env = FrameStack(env, 4)
    env.action_space.seed(seed)
    # Seeding the action space alone leaves the environment dynamics
    # unseeded; Gymnasium seeds the env RNG through reset(seed=...), and later
    # reset() calls without a seed keep using that generator.
    env.reset(seed=seed)
    return env

class ReplayBuffer:
    """Fixed-capacity FIFO store of (obs, act, rew, next_obs, done) transitions."""

    def __init__(self, size):
        # A bounded deque evicts the oldest transition once `size` is reached.
        self.buffer = deque(maxlen=size)

    def add(self, obs, act, rew, next_obs, done):
        """Append one transition; both observations are copied as uint8 arrays."""
        transition = (
            np.array(obs, dtype=np.uint8),
            act,
            rew,
            np.array(next_obs, dtype=np.uint8),
            done,
        )
        self.buffer.append(transition)

    def sample(self, batch_size):
        """Draw `batch_size` distinct transitions and return them column-wise.

        Returns:
            Tuple ``(obs, act, rew, next_obs, done)`` of numpy arrays whose
            first axis is the batch; ``rew`` and ``done`` are float32.
        """
        picked = random.sample(self.buffer, batch_size)
        states, actions, rewards, next_states, dones = zip(*picked)
        return (
            np.array(states),
            np.array(actions),
            np.array(rewards, dtype=np.float32),
            np.array(next_states),
            np.array(dones, dtype=np.float32),
        )

def sync(model, target_model):
    """Overwrite the weights of `target_model` with those of `model`."""
    source_weights = model.get_weights()
    target_model.set_weights(source_weights)

def huber_loss(x):
    """Return the Keras Huber loss (default settings) of `x` measured against zeros."""
    criterion = tf.keras.losses.Huber()
    zero_target = tf.zeros_like(x)
    return criterion(zero_target, x)

def epsilon(step):
    """Linearly anneal from `epsilon_start` to `epsilon_end` over `epsilon_decay_steps`.

    Any `step` beyond `epsilon_decay_steps` yields `epsilon_end`.
    """
    if step > epsilon_decay_steps:
        return epsilon_end
    span = epsilon_start - epsilon_end
    progress = step / epsilon_decay_steps
    return epsilon_start - span * progress

# Seed every random number generator used by this script (Python, NumPy,
# TensorFlow) before anything that draws random numbers is created.
random.seed(seed)
np.random.seed(seed)
tf.random.set_seed(seed)

print(f"Creating environment {env_id}...")
try:
    env = build_env(env_id, seed=seed)
    print("Environment created successfully.")
except Exception as e:
    # Any failure while building the configured env falls back to 'Pong-v4';
    # an error in the fallback itself is not caught and propagates.
    print(f"Error creating env: {e}")
    print("Trying generic Pong-v4...")
    env = build_env('Pong-v4', seed=seed)

# Number of discrete actions; read as a global by QFunc and DQN below.
action_dim = env.action_space.n

class QFunc(tf.keras.Model):
    """Convolutional Q-network: three conv layers, a 512-unit dense layer and
    one linear output per action (`action_dim`)."""

    def __init__(self, name):
        super().__init__(name=name)
        layers = tf.keras.layers
        conv_opts = dict(padding='valid', activation='relu')
        self.conv1 = layers.Conv2D(32, (8, 8), strides=(4, 4), **conv_opts)
        self.conv2 = layers.Conv2D(64, (4, 4), strides=(2, 2), **conv_opts)
        self.conv3 = layers.Conv2D(64, (3, 3), strides=(1, 1), **conv_opts)
        self.flat = layers.Flatten()
        self.fc1 = layers.Dense(512, activation='relu')
        self.fc2 = layers.Dense(action_dim, activation='linear')

    @tf.function
    def call(self, pixels, **kwargs):
        """Map a batch of pixel observations to per-action Q-values.

        Inputs are cast to float32 and divided by 255. A rank-4 input whose
        second dimension is 4 is treated as NCHW and transposed to NHWC.
        """
        scaled = tf.divide(tf.cast(pixels, tf.float32), tf.constant(255.0))
        if len(scaled.shape) == 4 and scaled.shape[1] == 4:  # NCHW
            scaled = tf.transpose(scaled, perm=[0, 2, 3, 1])  # -> NHWC

        hidden = self.conv1(scaled)
        hidden = self.conv2(hidden)
        hidden = self.conv3(hidden)
        hidden = self.flat(hidden)
        hidden = self.fc1(hidden)
        return self.fc2(hidden)

class DQN(object):
    """Double-DQN agent with an online network and a periodically synced target.

    Attributes:
        qnet: Online Q-network that is trained and used for acting.
        targetqnet: Target Q-network, overwritten with the online weights
            every `target_q_update_freq` training iterations.
        niter: Number of training iterations performed so far.
        nstep: Number of actions requested so far (environment steps); this
            drives the epsilon-greedy exploration schedule.
    """

    def __init__(self):
        self.qnet = QFunc('q')
        self.targetqnet = QFunc('targetq')
        # Run both networks once on a dummy batch so their weights exist
        # before they are copied.
        dummy_obs = tf.zeros((1, 84, 84, 4))
        self.qnet(dummy_obs)
        self.targetqnet(dummy_obs)
        sync(self.qnet, self.targetqnet)
        self.niter = 0
        self.nstep = 0
        self.optimizer = tf.optimizers.Adam(learning_rate=lr, epsilon=1e-5, clipnorm=clipnorm)

    def get_action(self, obv):
        """Pick an epsilon-greedy action for a single observation.

        Epsilon follows the number of calls to this method (one per
        environment step) rather than the number of training iterations, so
        exploration decays over `epsilon_decay_steps` environment steps and
        matches the value the training loop logs as ``epsilon(i)``.

        Args:
            obv: One stacked observation without a batch axis.

        Returns:
            The chosen action index as a Python int.
        """
        self.nstep += 1
        eps = epsilon(self.nstep)
        if random.random() < eps:
            return int(random.random() * action_dim)
        obv = np.expand_dims(obv, 0).astype('float32')
        return int(self._qvalues_func(obv).numpy().argmax(1)[0])

    @tf.function
    def _qvalues_func(self, obv):
        """Return the online network's Q-values for a batch of observations."""
        return self.qnet(obv)

    def train(self, b_o, b_a, b_r, b_o_, b_d):
        """Run one gradient step on a batch and sync the target net when due.

        Args:
            b_o: Batch of observations.
            b_a: Batch of action indices.
            b_r: Batch of rewards.
            b_o_: Batch of next observations.
            b_d: Batch of done flags (1.0 disables bootstrapping).
        """
        self._train_func(b_o, b_a, b_r, b_o_, b_d)
        self.niter += 1
        if self.niter % target_q_update_freq == 0:
            sync(self.qnet, self.targetqnet)

    @tf.function
    def _train_func(self, b_o, b_a, b_r, b_o_, b_d):
        """Minimise the mean Huber loss of the TD errors; return the TD errors."""
        with tf.GradientTape() as tape:
            td_errors = self._tderror_func(b_o, b_a, b_r, b_o_, b_d)
            loss = tf.reduce_mean(huber_loss(td_errors))
        grad = tape.gradient(loss, self.qnet.trainable_weights)
        self.optimizer.apply_gradients(zip(grad, self.qnet.trainable_weights))
        return td_errors

    @tf.function
    def _tderror_func(self, b_o, b_a, b_r, b_o_, b_d):
        """Compute Double-DQN TD errors: Q(s, a) - (r + gamma * Q_target(s', a*)).

        The online network selects the greedy next action a*, and the target
        network evaluates it; `b_d` masks out the bootstrap term.
        """
        b_a_ = tf.one_hot(tf.argmax(self.qnet(b_o_), 1), action_dim)
        b_q_ = (1 - b_d) * tf.reduce_sum(self.targetqnet(b_o_) * b_a_, 1)
        b_q = tf.reduce_sum(self.qnet(b_o) * tf.one_hot(b_a, action_dim), 1)
        return b_q - (b_r + reward_gamma * b_q_)

if __name__ == '__main__':
    # Training entry point: interact with the env for `number_timesteps`
    # steps, store transitions, and train the agent every `train_freq` steps
    # once `warm_start` steps have elapsed.
    print("Starting training loop...")
    dqn = DQN()
    buffer = ReplayBuffer(buffer_size)

    o, _ = env.reset()
    nepisode = 0
    start_time = time.time()

    try:
        for i in range(1, number_timesteps + 1):
            a = dqn.get_action(o)
            o_, r, terminated, truncated, info = env.step(a)
            done = terminated or truncated
            # Only a true terminal state may switch off bootstrapping in the
            # TD target; a time-limit truncation still has a valid next-state
            # value, so the stored flag is `terminated`, not `done`.
            buffer.add(o, a, r, o_, terminated)

            # === Added section: progress log while the game is running ===
            if i % 100 == 0:
                print(f"Step: {i} / {number_timesteps} - Training in progress... Epsilon: {epsilon(i):.3f}")
            # =============================================================

            if i >= warm_start and i % train_freq == 0:
                transitions = buffer.sample(batch_size)
                dqn.train(*transitions)

            if done:
                o, _ = env.reset()
                nepisode += 1
                # `info` is still the one returned by the final step of the
                # episode; it may carry the recorded episode statistics.
                if 'episode' in info:
                    reward = info['episode']['r']
                    length = info['episode']['l']
                    if hasattr(reward, 'numpy'):
                        reward = reward.numpy()
                    if hasattr(length, 'numpy'):
                        length = length.numpy()

                    elapsed = time.time() - start_time
                    print(f'*** EPISODE DONE *** Step: {i}, Episode: {nepisode}, Reward: {reward:.1f}, Len: {length}, Time: {elapsed:.1f}s')
                else:
                    print(f'Episode {nepisode} finished at step {i}')
            else:
                o = o_
    finally:
        # Release the emulator even if training is interrupted or fails.
        env.close()

    print("Training finished successfully.")


Collecting shimmy
  Downloading Shimmy-2.0.0-py3-none-any.whl.metadata (3.5 kB)
Downloading Shimmy-2.0.0-py3-none-any.whl (30 kB)
Installing collected packages: shimmy
Successfully installed shimmy-2.0.0
✅ GPU DETECTED: /physical_device:GPU:0
Creating environment PongNoFrameskip-v4...
Environment created successfully.
Starting training loop...
Step: 100 / 2000 - Training in progress... Epsilon: 0.951
Step: 200 / 2000 - Training in progress... Epsilon: 0.901
Step: 300 / 2000 - Training in progress... Epsilon: 0.852
Step: 400 / 2000 - Training in progress... Epsilon: 0.802
Step: 500 / 2000 - Training in progress... Epsilon: 0.752
Step: 600 / 2000 - Training in progress... Epsilon: 0.703
Step: 700 / 2000 - Training in progress... Epsilon: 0.653
Step: 800 / 2000 - Training in progress... Epsilon: 0.604
Step: 900 / 2000 - Training in progress... Epsilon: 0.554
Step: 1000 / 2000 - Training in progress... Epsilon: 0.505
Step: 1100 / 2000 - Training in progress... Epsilon: 0.456
Step: 1200 / 2