# Chrome Dino RL Training - DQN with CNN (PyTorch + GPU)

**Hardware Configuration:**
- GPU: RTX 3050 Mobile (4GB VRAM) ‚úÖ CUDA Enabled
- Drivers: NVIDIA Studio Drivers
- Framework: PyTorch 2.5.1 with CUDA 12.1

**Model Architecture:**
- Deep Q-Network (DQN) with Convolutional Neural Networks
- Input: 80x80 grayscale game screenshots
- Output: Q-values for 3 actions (run, jump, duck)

**Training Strategy:**
- Experience Replay Buffer (10,000 transitions)
- Target Network (updated every 1,000 steps)
- Epsilon-greedy exploration (1.0 ‚Üí 0.1 over 10,000 steps)
- Batch size: 32 (optimized for 4GB VRAM)

In [30]:
import os
import sys
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import matplotlib.pyplot as plt
from collections import deque
import random
from datetime import datetime
import json

# Add src directory to path
sys.path.append(os.path.join(os.getcwd(), 'src'))
from browser_dino_env import BrowserDinoEnv

# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"GPU device: {torch.cuda.get_device_name(0)}")
    print(f"CUDA version: {torch.version.cuda}")
    print(f"GPU memory: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.2f} GB")
else:
    print("‚ö† No GPU detected - training will use CPU")
print(f"\nUsing device: {device}")

PyTorch version: 2.5.1+cu121
CUDA available: True
GPU device: NVIDIA GeForce RTX 3050 Laptop GPU
CUDA version: 12.1
GPU memory: 4.00 GB

Using device: cuda


## 2. GPU Configuration for RTX 3050 Mobile (4GB VRAM)

PyTorch with CUDA 12.1 provides native GPU acceleration on Windows.
Your RTX 3050 Mobile GPU is ready for training!

In [31]:
# GPU memory management
if torch.cuda.is_available():
    # Clear cache
    torch.cuda.empty_cache()
    
    # Set memory fraction (use ~90% of VRAM, leave some for Chrome)
    torch.cuda.set_per_process_memory_fraction(0.9, 0)
    
    # Enable cuDNN benchmarking for optimal performance
    torch.backends.cudnn.benchmark = True
    
    # Print memory info
    props = torch.cuda.get_device_properties(0)
    print(f"‚úì GPU: {props.name}")
    print(f"‚úì Total VRAM: {props.total_memory / 1024**3:.2f} GB")
    print(f"‚úì Allocated: {torch.cuda.memory_allocated(0) / 1024**3:.2f} GB")
    print(f"‚úì Cached: {torch.cuda.memory_reserved(0) / 1024**3:.2f} GB")
    print(f"‚úì cuDNN benchmark: {torch.backends.cudnn.benchmark}")
else:
    print("‚ö† No GPU - training will be slow on CPU")

‚úì GPU: NVIDIA GeForce RTX 3050 Laptop GPU
‚úì Total VRAM: 4.00 GB
‚úì Allocated: 0.05 GB
‚úì Cached: 0.07 GB
‚úì cuDNN benchmark: True


## 3. DQN Network with CNN Architecture (PyTorch)

In [32]:
class DQN(nn.Module):
    """
    Deep Q-Network with CNN for processing game screenshots.
    
    Architecture:
    - Conv2D(32, 8x8, stride 4) ‚Üí ReLU
    - Conv2D(64, 4x4, stride 2) ‚Üí ReLU
    - Conv2D(64, 3x3, stride 1) ‚Üí ReLU
    - Flatten
    - Dense(512) ‚Üí ReLU
    - Dense(action_size) ‚Üí Linear (Q-values)
    """
    
    def __init__(self, input_shape=(1, 80, 80), action_size=3):
        super(DQN, self).__init__()
        
        # Convolutional layers
        self.conv1 = nn.Conv2d(input_shape[0], 32, kernel_size=8, stride=4)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=4, stride=2)
        self.conv3 = nn.Conv2d(64, 64, kernel_size=3, stride=1)
        
        # Calculate flattened size
        def conv2d_size_out(size, kernel_size, stride):
            return (size - (kernel_size - 1) - 1) // stride + 1
        
        convw = conv2d_size_out(conv2d_size_out(conv2d_size_out(80, 8, 4), 4, 2), 3, 1)
        convh = conv2d_size_out(conv2d_size_out(conv2d_size_out(80, 8, 4), 4, 2), 3, 1)
        linear_input_size = convw * convh * 64
        
        # Fully connected layers
        self.fc1 = nn.Linear(linear_input_size, 512)
        self.fc2 = nn.Linear(512, action_size)
        
    def forward(self, x):
        # Normalize to [0, 1]
        x = x.float() / 255.0
        
        # Conv layers with ReLU
        x = F.relu(self.conv1(x))
        x = F.relu(self.conv2(x))
        x = F.relu(self.conv3(x))
        
        # Flatten
        x = x.view(x.size(0), -1)
        
        # FC layers
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        
        return x


class DQNAgent:
    """
    DQN Agent with experience replay and target network.
    Optimized for 4GB VRAM with efficient memory usage.
    """
    
    def __init__(
        self,
        state_shape=(1, 80, 80),  # (channels, height, width)
        action_size=3,             # run, jump, duck
        learning_rate=0.00025,
        gamma=0.99,                # Discount factor
        epsilon_start=1.0,
        epsilon_end=0.1,
        epsilon_decay_steps=10000,
        batch_size=32,
        memory_size=10000,
        target_update_freq=1000
    ):
        self.state_shape = state_shape
        self.action_size = action_size
        self.gamma = gamma
        self.device = device
        
        # Exploration parameters
        self.epsilon = epsilon_start
        self.epsilon_start = epsilon_start
        self.epsilon_end = epsilon_end
        self.epsilon_decay_steps = epsilon_decay_steps
        self.epsilon_decay_rate = (epsilon_start - epsilon_end) / epsilon_decay_steps
        
        # Training parameters
        self.batch_size = batch_size
        self.target_update_freq = target_update_freq
        self.steps = 0
        
        # Experience replay buffer
        self.memory = deque(maxlen=memory_size)
        
        # Create Q-network and target network
        self.q_network = DQN(state_shape, action_size).to(self.device)
        self.target_network = DQN(state_shape, action_size).to(self.device)
        self.update_target_network()
        
        # Optimizer
        self.optimizer = optim.Adam(self.q_network.parameters(), lr=learning_rate)
        self.loss_fn = nn.MSELoss()
        
        print("‚úì DQN Agent initialized")
        print(f"  - State shape: {state_shape}")
        print(f"  - Action size: {action_size}")
        print(f"  - Batch size: {batch_size}")
        print(f"  - Memory size: {memory_size}")
        print(f"  - Device: {self.device}")
    
    def update_target_network(self):
        """Copy weights from Q-network to target network"""
        self.target_network.load_state_dict(self.q_network.state_dict())
    
    def remember(self, state, action, reward, next_state, done):
        """Store experience in replay buffer"""
        self.memory.append((state, action, reward, next_state, done))
    
    def act(self, state, training=True):
        """
        Choose action using epsilon-greedy policy.
        
        Args:
            state: Current game state (80x80 grayscale image)
            training: If True, use epsilon-greedy; if False, use greedy
        
        Returns:
            action: Integer action (0=run, 1=jump, 2=duck)
        """
        # Exploration: random action
        if training and np.random.rand() <= self.epsilon:
            return np.random.randint(self.action_size)
        
        # Exploitation: best action from Q-network
        with torch.no_grad():
            state_tensor = torch.FloatTensor(state).unsqueeze(0).to(self.device)
            q_values = self.q_network(state_tensor)
            return q_values.argmax().item()
    
    def replay(self):
        """
        Train on a batch of experiences from replay buffer.
        
        Returns:
            loss: Training loss for monitoring
        """
        if len(self.memory) < self.batch_size:
            return 0.0
        
        # Sample random batch
        batch = random.sample(self.memory, self.batch_size)
        states = torch.FloatTensor(np.array([exp[0] for exp in batch])).to(self.device)
        actions = torch.LongTensor([exp[1] for exp in batch]).to(self.device)
        rewards = torch.FloatTensor([exp[2] for exp in batch]).to(self.device)
        next_states = torch.FloatTensor(np.array([exp[3] for exp in batch])).to(self.device)
        dones = torch.FloatTensor([exp[4] for exp in batch]).to(self.device)
        
        # Get current Q-values
        current_q = self.q_network(states).gather(1, actions.unsqueeze(1)).squeeze(1)
        
        # Compute target Q-values
        with torch.no_grad():
            next_q = self.target_network(next_states).max(1)[0]
            target_q = rewards + (1 - dones) * self.gamma * next_q
        
        # Compute loss and backpropagate
        loss = self.loss_fn(current_q, target_q)
        self.optimizer.zero_grad()
        loss.backward()
        
        # Gradient clipping for stability
        torch.nn.utils.clip_grad_norm_(self.q_network.parameters(), 10.0)
        
        self.optimizer.step()
        
        return loss.item()
    
    def update_epsilon(self):
        """Decay epsilon for exploration"""
        if self.epsilon > self.epsilon_end:
            self.epsilon -= self.epsilon_decay_rate
            self.epsilon = max(self.epsilon_end, self.epsilon)
    
    def save(self, filepath):
        """Save model weights"""
        torch.save({
            'q_network': self.q_network.state_dict(),
            'target_network': self.target_network.state_dict(),
            'optimizer': self.optimizer.state_dict(),
            'steps': self.steps,
            'epsilon': self.epsilon
        }, filepath)
        print(f"‚úì Model saved to {filepath}")
    
    def load(self, filepath):
        """Load model weights"""
        checkpoint = torch.load(filepath, map_location=self.device)
        self.q_network.load_state_dict(checkpoint['q_network'])
        self.target_network.load_state_dict(checkpoint['target_network'])
        self.optimizer.load_state_dict(checkpoint['optimizer'])
        self.steps = checkpoint.get('steps', 0)
        self.epsilon = checkpoint.get('epsilon', self.epsilon_end)
        print(f"‚úì Model loaded from {filepath}")

## 4. Create Model Summary

In [33]:
# Create a dummy agent to visualize architecture
dummy_agent = DQNAgent()
print("\nüìä Network Architecture:")
print(dummy_agent.q_network)

# Calculate parameters
total_params = sum(p.numel() for p in dummy_agent.q_network.parameters())
trainable_params = sum(p.numel() for p in dummy_agent.q_network.parameters() if p.requires_grad)
param_size_mb = (total_params * 4) / (1024 ** 2)  # 4 bytes per float32

print(f"\nüìä Model Statistics:")
print(f"  Total parameters: {total_params:,}")
print(f"  Trainable parameters: {trainable_params:,}")
print(f"  Approximate size: {param_size_mb:.2f} MB")
print(f"  Estimated VRAM usage (with batch): ~{param_size_mb * 3:.2f} MB")
print(f"  Safe for RTX 3050 4GB: {'‚úì Yes' if param_size_mb * 3 < 3000 else '‚úó No'}")

# Check GPU memory if available
if torch.cuda.is_available():
    dummy_input = torch.randn(32, 1, 80, 80).to(device)
    _ = dummy_agent.q_network(dummy_input)
    print(f"\nüíæ GPU Memory after forward pass:")
    print(f"  Allocated: {torch.cuda.memory_allocated(0) / 1024**2:.2f} MB")
    print(f"  Reserved: {torch.cuda.memory_reserved(0) / 1024**2:.2f} MB")
    torch.cuda.empty_cache()

del dummy_agent  # Clean up

‚úì DQN Agent initialized
  - State shape: (1, 80, 80)
  - Action size: 3
  - Batch size: 32
  - Memory size: 10000
  - Device: cuda

üìä Network Architecture:
DQN(
  (conv1): Conv2d(1, 32, kernel_size=(8, 8), stride=(4, 4))
  (conv2): Conv2d(32, 64, kernel_size=(4, 4), stride=(2, 2))
  (conv3): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1))
  (fc1): Linear(in_features=2304, out_features=512, bias=True)
  (fc2): Linear(in_features=512, out_features=3, bias=True)
)

üìä Model Statistics:
  Total parameters: 1,253,539
  Trainable parameters: 1,253,539
  Approximate size: 4.78 MB
  Estimated VRAM usage (with batch): ~14.35 MB
  Safe for RTX 3050 4GB: ‚úì Yes

üíæ GPU Memory after forward pass:
  Allocated: 54.20 MB
  Reserved: 70.00 MB


## 5. Training Configuration

In [34]:
# Training hyperparameters
CONFIG = {
    # Training
    'num_episodes': 1000,
    'max_steps_per_episode': 5000,
    
    # DQN parameters
    'learning_rate': 0.00025,
    'gamma': 0.99,
    'batch_size': 32,
    'memory_size': 10000,
    'target_update_freq': 1000,
    
    # Exploration
    'epsilon_start': 1.0,
    'epsilon_end': 0.1,
    'epsilon_decay_steps': 10000,
    
    # Logging
    'log_interval': 10,        # Print stats every N episodes
    'save_interval': 50,       # Save model every N episodes
    'plot_interval': 50,       # Update plots every N episodes
}

# Create directories for saving
os.makedirs('models', exist_ok=True)
os.makedirs('logs', exist_ok=True)
os.makedirs('plots', exist_ok=True)

print("üìã Training Configuration:")
for key, value in CONFIG.items():
    print(f"  {key}: {value}")

üìã Training Configuration:
  num_episodes: 1000
  max_steps_per_episode: 5000
  learning_rate: 0.00025
  gamma: 0.99
  batch_size: 32
  memory_size: 10000
  target_update_freq: 1000
  epsilon_start: 1.0
  epsilon_end: 0.1
  epsilon_decay_steps: 10000
  log_interval: 10
  save_interval: 50
  plot_interval: 50


## 7. Initialize Environment and Agent

In [35]:
class ParallelDinoEnvs:
    """
    Wrapper for running multiple Chrome Dino environments in parallel.
    Each environment runs in its own browser window.
    """
    
    def __init__(self, num_envs=4):
        self.num_envs = num_envs
        self.envs = []
        
        print(f"üöÄ Initializing {num_envs} parallel environments...")
        for i in range(num_envs):
            try:
                env = BrowserDinoEnv()
                self.envs.append(env)
                print(f"  ‚úì Environment {i+1}/{num_envs} ready")
            except Exception as e:
                print(f"  ‚úó Environment {i+1} failed: {e}")
                # Clean up on failure
                for env in self.envs:
                    env.close()
                raise
        
        print(f"‚úÖ All {num_envs} environments initialized!")
        
        # Store observation/action space from first env
        self.observation_space = self.envs[0].observation_space
        self.action_space = self.envs[0].action_space
    
    def reset(self):
        """Reset all environments and return initial states"""
        states = []
        infos = []
        
        for env in self.envs:
            state, info = env.reset()
            states.append(state)
            infos.append(info)
        
        return np.array(states), infos
    
    def step(self, actions):
        """
        Take actions in all environments.
        
        Args:
            actions: List of actions (one per environment)
        
        Returns:
            states: Array of next states
            rewards: Array of rewards
            terminateds: Array of terminated flags
            truncateds: Array of truncated flags
            infos: List of info dicts
        """
        states = []
        rewards = []
        terminateds = []
        truncateds = []
        infos = []
        
        for env, action in zip(self.envs, actions):
            state, reward, terminated, truncated, info = env.step(action)
            states.append(state)
            rewards.append(reward)
            terminateds.append(terminated)
            truncateds.append(truncated)
            infos.append(info)
        
        return (
            np.array(states),
            np.array(rewards),
            np.array(terminateds),
            np.array(truncateds),
            infos
        )
    
    def close(self):
        """Close all environments"""
        print(f"\nüîí Closing {self.num_envs} environments...")
        for i, env in enumerate(self.envs):
            try:
                env.close()
                print(f"  ‚úì Environment {i+1} closed")
            except Exception as e:
                print(f"  ‚úó Environment {i+1} error: {e}")
        print("‚úì All environments closed")
    
    def __len__(self):
        return self.num_envs

## 6. Parallel Environment Wrapper

Training on 4 browser windows simultaneously will:
- 4x faster experience collection
- Better sample diversity
- More stable training
- ~2-4 hours for 1000 episodes (vs 8-15 hours single env)

In [36]:
# Choose number of parallel environments (1-4 recommended for 4GB VRAM)
NUM_PARALLEL_ENVS = 4

# Initialize parallel environments
print("üéÆ Initializing Chrome Dino environments...")
if NUM_PARALLEL_ENVS > 1:
    env = ParallelDinoEnvs(num_envs=NUM_PARALLEL_ENVS)
else:
    env = BrowserDinoEnv()
    
print(f"‚úì Environment(s) initialized")
print(f"  Observation space: {env.observation_space}")
print(f"  Action space: {env.action_space}")
print(f"  Parallel envs: {NUM_PARALLEL_ENVS}")

# Initialize agent
print("\nü§ñ Initializing DQN agent...")
agent = DQNAgent(
    state_shape=(1, 80, 80),  # PyTorch uses (C, H, W) format
    action_size=3,
    learning_rate=CONFIG['learning_rate'],
    gamma=CONFIG['gamma'],
    epsilon_start=CONFIG['epsilon_start'],
    epsilon_end=CONFIG['epsilon_end'],
    epsilon_decay_steps=CONFIG['epsilon_decay_steps'],
    batch_size=CONFIG['batch_size'],
    memory_size=CONFIG['memory_size'],
    target_update_freq=CONFIG['target_update_freq']
)

print(f"\n‚úÖ Ready to train on GPU with {NUM_PARALLEL_ENVS}x parallel collection!")

üéÆ Initializing Chrome Dino environments...
üöÄ Initializing 4 parallel environments...
‚úì Ads and unnecessary content hidden
  ‚úì Environment 1/4 ready
‚úì Ads and unnecessary content hidden
  ‚úì Environment 1/4 ready
‚úì Ads and unnecessary content hidden
  ‚úì Environment 2/4 ready
‚úì Ads and unnecessary content hidden
  ‚úì Environment 2/4 ready
‚úì Ads and unnecessary content hidden
  ‚úì Environment 3/4 ready
‚úì Ads and unnecessary content hidden
  ‚úì Environment 3/4 ready
‚úì Ads and unnecessary content hidden
  ‚úì Environment 4/4 ready
‚úÖ All 4 environments initialized!
‚úì Ads and unnecessary content hidden
  ‚úì Environment 4/4 ready
‚úÖ All 4 environments initialized!
‚úì Environment(s) initialized
  Observation space: Box(0, 255, (80, 80), uint8)
  Action space: Discrete(2)
  Parallel envs: 4

ü§ñ Initializing DQN agent...
‚úì DQN Agent initialized
  - State shape: (1, 80, 80)
  - Action size: 3
  - Batch size: 32
  - Memory size: 10000
  - Device: cuda

‚úÖ Rea

In [37]:
# Resume training configuration
# ‚ö†Ô∏è IMPORTANT: Score retrieval was fixed! Old checkpoints have inflated scores (40x too high)
# Set RESUME_TRAINING = False to start fresh with corrected scores
RESUME_TRAINING = False  # Set to True to resume from checkpoint (only if checkpoint uses correct scores)
CHECKPOINT_PATH = 'models/best_model.pth'  # or 'models/checkpoint_ep50.pth'

# Initialize metric variables
episode_rewards = []
episode_scores = []
episode_losses = []
episode_lengths = []
epsilon_history = []
best_score = 0
best_reward = float('-inf')

if RESUME_TRAINING and os.path.exists(CHECKPOINT_PATH):
    print(f"üîÑ Resuming training from {CHECKPOINT_PATH}")
    agent.load(CHECKPOINT_PATH)
    
    # Try to load previous metrics to continue plotting
    try:
        # Find latest metrics file
        metric_files = sorted([f for f in os.listdir('logs') if f.startswith('metrics_ep') and f.endswith('.json')])
        if metric_files:
            latest_metrics = metric_files[-1]
            with open(f'logs/{latest_metrics}', 'r') as f:
                prev_metrics = json.load(f)
                
                # Load previous metrics
                episode_rewards = prev_metrics.get('episode_rewards', [])
                episode_scores = prev_metrics.get('episode_scores', [])
                episode_losses = prev_metrics.get('episode_losses', [])
                episode_lengths = prev_metrics.get('episode_lengths', [])
                epsilon_history = prev_metrics.get('epsilon_history', [])
                best_score = prev_metrics.get('best_score', 0)
                best_reward = prev_metrics.get('best_reward', float('-inf'))
                
                # Adjust episode count to continue from where we left off
                completed_episodes = prev_metrics.get('episode', 0)
                print(f"‚úì Loaded metrics from episode {completed_episodes}")
                print(f"  Previous best score: {best_score}")
                print(f"  Previous best reward: {best_reward:.2f}")
                print(f"  Current epsilon: {agent.epsilon:.3f}")
                print(f"  Total steps: {agent.steps}")
                print(f"  Memory buffer: {len(agent.memory)} experiences")
                
                # Update CONFIG to train for additional episodes
                print(f"\nüìã Training will continue for {CONFIG['num_episodes']} more episodes")
                print(f"   (Total episodes will be: {completed_episodes + CONFIG['num_episodes']})")
        else:
            print("‚ö† No previous metrics found, starting with loaded weights only")
    except Exception as e:
        print(f"‚ö† Could not load previous metrics: {e}")
        print("  Will continue with loaded model weights only")
else:
    if RESUME_TRAINING:
        print(f"‚ö† Checkpoint not found: {CHECKPOINT_PATH}")
        print("üÜï Starting fresh training instead")
    else:
        print("üÜï Starting fresh training with corrected score retrieval")
        print("   (Score now matches displayed value - expect 30-50 range initially)")

print(f"\n‚úÖ Agent ready with epsilon={agent.epsilon:.3f}, steps={agent.steps}")

üÜï Starting fresh training with corrected score retrieval
   (Score now matches displayed value - expect 30-50 range initially)

‚úÖ Agent ready with epsilon=1.000, steps=0


## 7.5. Optional: Resume Training from Checkpoint

Set `RESUME_TRAINING = True` to continue from a saved model instead of starting from scratch.

## 7.6. IMPORTANT: Restart Environments After Code Changes

‚ö†Ô∏è **If you just updated the score retrieval in `browser_dino_env.py`, you MUST restart the environments!**

The old browser instances are still running with the old code. Run the cell below to close and reinitialize them.

In [None]:
# Close old environments and reinitialize with updated code
print("üîÑ Restarting environments to load updated code...")

# Close existing environments
try:
    env.close()
    print("‚úì Old environments closed")
except:
    pass

# Reimport the module to get latest code changes
import importlib
import sys
if 'browser_dino_env' in sys.modules:
    importlib.reload(sys.modules['browser_dino_env'])
    from browser_dino_env import BrowserDinoEnv
    print("‚úì Module reloaded with latest changes")

# Reinitialize environments
print(f"\nüéÆ Reinitializing {NUM_PARALLEL_ENVS} environments...")
if NUM_PARALLEL_ENVS > 1:
    env = ParallelDinoEnvs(num_envs=NUM_PARALLEL_ENVS)
else:
    env = BrowserDinoEnv()

print(f"\n‚úÖ Environments restarted with corrected score retrieval!")
print("   Scores should now show realistic values (30-50 range initially)")

## 8. Training Loop (Parallel)

Training will continue from loaded checkpoint if `RESUME_TRAINING = True` was set above.

In [38]:
# Parallel environment tracking
num_envs = NUM_PARALLEL_ENVS if NUM_PARALLEL_ENVS > 1 else 1

# Calculate starting episode (for resume)
start_episode = len(episode_rewards)

# Progress tracking file
progress_file = 'logs/training_progress.txt'
with open(progress_file, 'a') as f:
    f.write(f"\n{'='*80}\n")
    f.write(f"Training session started: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
    f.write(f"Starting from episode: {start_episode + 1}\n")
    f.write(f"Parallel environments: {num_envs}\n")
    f.write(f"{'='*80}\n\n")

print(f"üöÄ Starting training on GPU with {num_envs}x parallel environments...")
if start_episode > 0:
    print(f"üìç Resuming from episode {start_episode + 1}")
print(f"üìù Progress will be saved to {progress_file}")
print("=" * 80)

try:
    for episode in range(start_episode, start_episode + CONFIG['num_episodes']):
        # Reset environment(s)
        if num_envs > 1:
            states, infos = env.reset()
            # Convert to PyTorch format: (N, H, W) -> (N, C, H, W)
            states = np.expand_dims(states, axis=1)
        else:
            state, info = env.reset()
            states = np.expand_dims(np.expand_dims(state, axis=0), axis=0)
            infos = [info]
        
        episode_rewards_parallel = [0] * num_envs
        episode_losses_parallel = []
        dones = [False] * num_envs
        steps = [0] * num_envs
        
        # Real-time progress tracking
        step_count = 0
        
        # Episode loop
        while not all(dones):
            # Select actions for all environments
            actions = []
            for i in range(num_envs):
                if not dones[i]:
                    action = agent.act(states[i], training=True)
                    actions.append(action)
                else:
                    actions.append(0)  # Dummy action for done envs
            
            # Take steps in all environments
            if num_envs > 1:
                next_states, rewards, terminateds, truncateds, new_infos = env.step(actions)
                next_states = np.expand_dims(next_states, axis=1)
            else:
                next_state, reward, terminated, truncated, new_info = env.step(actions[0])
                next_states = np.expand_dims(np.expand_dims(next_state, axis=0), axis=0)
                rewards = np.array([reward])
                terminateds = np.array([terminated])
                truncateds = np.array([truncated])
                new_infos = [new_info]
            
            # Process each environment
            for i in range(num_envs):
                if not dones[i]:
                    done = terminateds[i] or truncateds[i]
                    
                    # Store experience
                    agent.remember(states[i], actions[i], rewards[i], next_states[i], done)
                    
                    # Train agent
                    loss = agent.replay()
                    if loss > 0:
                        episode_losses_parallel.append(loss)
                    
                    # Update metrics
                    episode_rewards_parallel[i] += rewards[i]
                    steps[i] += 1
                    agent.steps += 1
                    
                    # Update target network
                    if agent.steps % agent.target_update_freq == 0:
                        agent.update_target_network()
                    
                    # Decay epsilon
                    agent.update_epsilon()
                    
                    # Mark as done
                    if done:
                        dones[i] = True
                        infos[i] = new_infos[i]
            
            # Update states
            states = next_states
            step_count += 1
            
            # Real-time progress update (every 50 steps)
            if step_count % 50 == 0:
                active_envs = sum(1 for d in dones if not d)
                avg_steps = np.mean([s for i, s in enumerate(steps) if not dones[i]]) if active_envs > 0 else np.mean(steps)
                current_scores = [info.get('score', 0) for i, info in enumerate(infos) if not dones[i]]
                avg_score = np.mean(current_scores) if current_scores else 0
                
                print(f"  Episode {episode + 1} | Step {step_count} | "
                      f"Active: {active_envs}/{num_envs} | "
                      f"Avg Steps: {avg_steps:.0f} | "
                      f"Current Score: {avg_score:.0f} | "
                      f"Œµ: {agent.epsilon:.3f}", end='\r')
            
            # Break if max steps reached for all envs
            if all(s >= CONFIG['max_steps_per_episode'] for s in steps):
                break
        
        # Clear the progress line
        print(" " * 120, end='\r')
        
        # Episode complete - record metrics (average across parallel envs)
        avg_episode_reward = np.mean(episode_rewards_parallel)
        avg_episode_score = np.mean([info.get('score', 0) for info in infos])
        avg_episode_length = np.mean(steps)
        avg_episode_loss = np.mean(episode_losses_parallel) if episode_losses_parallel else 0
        
        episode_rewards.append(avg_episode_reward)
        episode_scores.append(avg_episode_score)
        episode_lengths.append(avg_episode_length)
        episode_losses.append(avg_episode_loss)
        epsilon_history.append(agent.epsilon)
        
        # Calculate improvement metrics
        recent_window = 10
        if len(episode_rewards) >= recent_window:
            recent_avg_reward = np.mean(episode_rewards[-recent_window:])
            prev_avg_reward = np.mean(episode_rewards[-recent_window*2:-recent_window]) if len(episode_rewards) >= recent_window*2 else np.mean(episode_rewards[:recent_window])
            reward_improvement = recent_avg_reward - prev_avg_reward
            
            recent_avg_score = np.mean(episode_scores[-recent_window:])
            prev_avg_score = np.mean(episode_scores[-recent_window*2:-recent_window]) if len(episode_scores) >= recent_window*2 else np.mean(episode_scores[:recent_window])
            score_improvement = recent_avg_score - prev_avg_score
        else:
            reward_improvement = 0
            score_improvement = 0
        
        # Update best performance
        max_score = max(info.get('score', 0) for info in infos)
        is_new_best = False
        if max_score > best_score:
            best_score = max_score
            agent.save(f'models/best_model.pth')
            is_new_best = True
            print(f"üèÜ New best score: {best_score:.0f}")
        
        if avg_episode_reward > best_reward:
            best_reward = avg_episode_reward
        
        # Save detailed progress to file
        with open(progress_file, 'a') as f:
            f.write(f"Episode {episode + 1}/{start_episode + CONFIG['num_episodes']} - {datetime.now().strftime('%H:%M:%S')}\n")
            f.write(f"  Score: {avg_episode_score:.2f} (Best: {best_score:.0f}{'  üèÜ NEW BEST!' if is_new_best else ''})\n")
            f.write(f"  Reward: {avg_episode_reward:.2f} (Best: {best_reward:.2f})\n")
            f.write(f"  Steps: {avg_episode_length:.0f}\n")
            f.write(f"  Loss: {avg_episode_loss:.6f}\n")
            f.write(f"  Epsilon: {agent.epsilon:.4f}\n")
            f.write(f"  Total training steps: {agent.steps}\n")
            f.write(f"  Memory buffer: {len(agent.memory)}/{CONFIG['memory_size']}\n")
            if len(episode_rewards) >= recent_window:
                f.write(f"  Improvement (last {recent_window} eps):\n")
                f.write(f"    Reward: {reward_improvement:+.2f}\n")
                f.write(f"    Score: {score_improvement:+.2f}\n")
            f.write(f"\n")
        
        # Logging
        if (episode + 1) % CONFIG['log_interval'] == 0:
            avg_reward = np.mean(episode_rewards[-CONFIG['log_interval']:])
            avg_score = np.mean(episode_scores[-CONFIG['log_interval']:])
            avg_loss = np.mean(episode_losses[-CONFIG['log_interval']:])
            avg_length = np.mean(episode_lengths[-CONFIG['log_interval']:])
            
            # GPU memory info
            gpu_mem = ""
            if torch.cuda.is_available():
                gpu_mem = f" | GPU: {torch.cuda.memory_allocated(0) / 1024**2:.0f}MB"
            
            print(f"\nEpisode {episode + 1}/{start_episode + CONFIG['num_episodes']} ({num_envs}x parallel)")
            print(f"  Avg Reward: {avg_reward:.2f} | Avg Score: {avg_score:.0f} | "
                  f"Avg Steps: {avg_length:.0f}")
            print(f"  Loss: {avg_loss:.4f} | Epsilon: {agent.epsilon:.3f} | "
                  f"Memory: {len(agent.memory)}/{CONFIG['memory_size']}{gpu_mem}")
            print(f"  Best Score: {best_score:.0f} | Best Reward: {best_reward:.2f}")
            
            if len(episode_rewards) >= recent_window:
                print(f"  Improvement (last {recent_window} eps): "
                      f"Reward {reward_improvement:+.2f} | Score {score_improvement:+.2f}")
            print("-" * 80)
        else:
            # Brief update for non-logged episodes
            improvement_str = ""
            if len(episode_rewards) >= recent_window and (episode + 1) % 5 == 0:
                improvement_str = f" | Œî Reward: {reward_improvement:+.2f}, Œî Score: {score_improvement:+.2f}"
            
            print(f"Episode {episode + 1} complete | Score: {avg_episode_score:.0f} | "
                  f"Steps: {avg_episode_length:.0f} | "
                  f"Reward: {avg_episode_reward:.2f}{improvement_str}")
        
        # Save checkpoint
        if (episode + 1) % CONFIG['save_interval'] == 0:
            agent.save(f'models/checkpoint_ep{episode + 1}.pth')
            
            # Save metrics with improvement tracking
            metrics = {
                'episode': episode + 1,
                'timestamp': datetime.now().isoformat(),
                'num_parallel_envs': num_envs,
                'episode_rewards': episode_rewards,
                'episode_scores': episode_scores,
                'episode_losses': episode_losses,
                'episode_lengths': episode_lengths,
                'epsilon_history': epsilon_history,
                'best_score': best_score,
                'best_reward': best_reward,
                'total_steps': agent.steps,
                'memory_size': len(agent.memory),
                'config': CONFIG
            }
            
            # Add improvement metrics
            if len(episode_rewards) >= recent_window:
                metrics['recent_improvement'] = {
                    'window_size': recent_window,
                    'reward_improvement': float(reward_improvement),
                    'score_improvement': float(score_improvement),
                    'recent_avg_reward': float(recent_avg_reward),
                    'recent_avg_score': float(recent_avg_score)
                }
            
            with open(f'logs/metrics_ep{episode + 1}.json', 'w') as f:
                json.dump(metrics, f, indent=2)
            
            print(f"üíæ Checkpoint saved: models/checkpoint_ep{episode + 1}.pth")

except KeyboardInterrupt:
    print("\n‚ö† Training interrupted by user")
    print("Saving current progress...")
    agent.save(f'models/interrupted_ep{episode + 1}.pth')
    
    with open(progress_file, 'a') as f:
        f.write(f"\n‚ö† Training interrupted at episode {episode + 1}\n")
        f.write(f"Time: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")

finally:
    print("\nüèÅ Training complete!")
    print(f"  Total episodes: {len(episode_rewards)}")
    print(f"  Best score: {best_score}")
    print(f"  Best reward: {best_reward:.2f}")
    print(f"  Parallel speedup: ~{num_envs}x")
    
    # Save final summary to progress file
    with open(progress_file, 'a') as f:
        f.write(f"\n{'='*80}\n")
        f.write(f"Training session ended: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
        f.write(f"Total episodes completed: {len(episode_rewards)}\n")
        f.write(f"Best score achieved: {best_score:.0f}\n")
        f.write(f"Best reward achieved: {best_reward:.2f}\n")
        f.write(f"Total training steps: {agent.steps}\n")
        f.write(f"Final epsilon: {agent.epsilon:.4f}\n")
        f.write(f"{'='*80}\n")
    
    print(f"üìù Full training progress saved to {progress_file}")
    
    # GPU memory cleanup
    if torch.cuda.is_available():
        print(f"  Final GPU memory: {torch.cuda.memory_allocated(0) / 1024**2:.0f}MB")
        torch.cuda.empty_cache()
    
    # Close environment(s)
    env.close()
    print("‚úì Browser(s) closed")

üöÄ Starting training on GPU with 4x parallel environments...
üìù Progress will be saved to logs/training_progress.txt
‚úì Model saved to models/best_model.pth                                                                                  
üèÜ New best score: 1927
Episode 1 complete | Score: 1906 | Steps: 6 | Reward: 78.10
‚úì Model saved to models/best_model.pth                                                                                  
üèÜ New best score: 1927
Episode 1 complete | Score: 1906 | Steps: 6 | Reward: 78.10


  dones = torch.FloatTensor([exp[4] for exp in batch]).to(self.device)


‚úì Model saved to models/best_model.pth                                                                                  
üèÜ New best score: 2050
Episode 2 complete | Score: 1955 | Steps: 5 | Reward: 126.30
Episode 3 complete | Score: 1815 | Steps: 6 | Reward: 109.15                                                            
Episode 3 complete | Score: 1815 | Steps: 6 | Reward: 109.15                                                            
Episode 4 complete | Score: 1856 | Steps: 5 | Reward: 125.88                                                            
Episode 4 complete | Score: 1856 | Steps: 5 | Reward: 125.88                                                            
Episode 5 complete | Score: 1902 | Steps: 4 | Reward: 78.58                                                             
Episode 5 complete | Score: 1902 | Steps: 4 | Reward: 78.58                                                             
‚úì Model saved to models/best_model.pth                        

## 9. Training Visualization

In [None]:
# Plot training metrics
fig, axes = plt.subplots(2, 2, figsize=(15, 10))
fig.suptitle('DQN Training Metrics', fontsize=16, fontweight='bold')

# Rewards
axes[0, 0].plot(episode_rewards, alpha=0.3, label='Episode Reward')
axes[0, 0].plot(np.convolve(episode_rewards, np.ones(50)/50, mode='valid'), 
                label='Moving Avg (50)', linewidth=2)
axes[0, 0].set_xlabel('Episode')
axes[0, 0].set_ylabel('Total Reward')
axes[0, 0].set_title('Episode Rewards')
axes[0, 0].legend()
axes[0, 0].grid(True, alpha=0.3)

# Scores
axes[0, 1].plot(episode_scores, alpha=0.3, label='Episode Score')
axes[0, 1].plot(np.convolve(episode_scores, np.ones(50)/50, mode='valid'),
                label='Moving Avg (50)', linewidth=2)
axes[0, 1].set_xlabel('Episode')
axes[0, 1].set_ylabel('Score (Distance)')
axes[0, 1].set_title('Game Scores')
axes[0, 1].legend()
axes[0, 1].grid(True, alpha=0.3)

# Loss
axes[1, 0].plot(episode_losses, alpha=0.6)
axes[1, 0].set_xlabel('Episode')
axes[1, 0].set_ylabel('Loss')
axes[1, 0].set_title('Training Loss')
axes[1, 0].grid(True, alpha=0.3)

# Epsilon
axes[1, 1].plot(epsilon_history, color='orange')
axes[1, 1].set_xlabel('Episode')
axes[1, 1].set_ylabel('Epsilon')
axes[1, 1].set_title('Exploration Rate (Epsilon)')
axes[1, 1].grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig('plots/training_metrics.png', dpi=150, bbox_inches='tight')
plt.show()

print("‚úì Training plots saved to plots/training_metrics.png")

## 10. Test Trained Agent

In [None]:
# Test the best model
print("üéÆ Testing trained agent...\n")

# Load best model
agent.load('models/best_model.pth')
agent.epsilon = 0.0  # No exploration during testing

# Run test episodes
num_test_episodes = 5
test_scores = []
test_rewards = []

env = BrowserDinoEnv()

for episode in range(num_test_episodes):
    state, info = env.reset()
    state = np.expand_dims(state, axis=0)  # Add channel dimension
    
    episode_reward = 0
    done = False
    step = 0
    
    while not done and step < 5000:
        action = agent.act(state, training=False)
        next_state, reward, terminated, truncated, info = env.step(action)
        done = terminated or truncated
        
        state = np.expand_dims(next_state, axis=0)
        episode_reward += reward
        step += 1
    
    test_scores.append(info.get('score', 0))
    test_rewards.append(episode_reward)
    
    print(f"Test Episode {episode + 1}: Score={info.get('score', 0):.0f}, "
          f"Reward={episode_reward:.2f}, Steps={step}")

env.close()

print("\nüìä Test Results:")
print(f"  Average Score: {np.mean(test_scores):.0f} ¬± {np.std(test_scores):.0f}")
print(f"  Average Reward: {np.mean(test_rewards):.2f} ¬± {np.std(test_rewards):.2f}")
print(f"  Best Score: {max(test_scores):.0f}")
print(f"  Worst Score: {min(test_scores):.0f}")

## 11. Save Final Model and Summary

In [None]:
# Save final summary
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

summary = {
    'timestamp': timestamp,
    'hardware': {
        'gpu': torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'CPU',
        'vram': f"{torch.cuda.get_device_properties(0).total_memory / 1024**3:.2f}GB" if torch.cuda.is_available() else 'N/A',
        'pytorch_version': torch.__version__,
        'cuda_version': torch.version.cuda if torch.cuda.is_available() else 'N/A'
    },
    'training': {
        'total_episodes': len(episode_rewards),
        'total_steps': agent.steps,
        'best_score': best_score,
        'best_reward': best_reward,
        'final_epsilon': agent.epsilon
    },
    'test_results': {
        'num_episodes': num_test_episodes,
        'avg_score': float(np.mean(test_scores)),
        'avg_reward': float(np.mean(test_rewards)),
        'best_test_score': float(max(test_scores))
    },
    'config': CONFIG
}

with open(f'logs/training_summary_{timestamp}.json', 'w') as f:
    json.dump(summary, f, indent=2)

print(f"‚úì Training summary saved to logs/training_summary_{timestamp}.json")
print("\nüéâ Training pipeline complete!")
print(f"üöÄ GPU-accelerated training using {summary['hardware']['gpu']}")