## 1. Setup and Imports

In [None]:
import os
import sys
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import matplotlib.pyplot as plt
from collections import deque
import random
from datetime import datetime
import json

# Add src directory to path
sys.path.append(os.path.join(os.getcwd(), 'src'))
from browser_dino_env import BrowserDinoEnv

# Report the TensorFlow build and the GPUs it can see, one fact per line.
print(
    f"TensorFlow version: {tf.__version__}",
    f"GPU Available: {tf.config.list_physical_devices('GPU')}",
    f"Num GPUs: {len(tf.config.list_physical_devices('GPU'))}",
    sep="\n",
)

## 2. GPU Configuration for RTX 3050 Mobile (4GB VRAM)

In [None]:
# Configure GPU memory growth to prevent VRAM overflow
gpus = tf.config.list_physical_devices('GPU')
if gpus:
    try:
        # Enable memory growth - allocate only as needed
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)

        # Optional: Set memory limit if needed (e.g., 3.5GB out of 4GB)
        # tf.config.set_logical_device_configuration(
        #     gpus[0],
        #     [tf.config.LogicalDeviceConfiguration(memory_limit=3584)]
        # )

        print("‚úì GPU memory growth enabled")
        print(f"‚úì Using GPU: {gpus[0].name}")
    except RuntimeError as e:
        # Reported instead of re-raised so the notebook can continue with
        # TensorFlow's default memory behaviour.
        print(f"GPU configuration error: {e}")
else:
    print("‚ö† No GPU detected - training will use CPU (slower)")

# Mixed precision is only a win on a GPU with float16 support; on a CPU-only
# machine float16 compute is slower, so keep the default float32 policy there.
policy = tf.keras.mixed_precision.Policy('mixed_float16' if gpus else 'float32')
tf.keras.mixed_precision.set_global_policy(policy)
print(f"‚úì Mixed precision policy: {policy.name}")

## 3. DQN Agent with CNN Architecture

In [None]:
class DQNAgent:
    """
    Deep Q-Network agent with a CNN for processing game screenshots.

    The agent owns an online Q-network, a target network that is synchronized
    on demand, a bounded replay buffer and a linearly decaying epsilon-greedy
    exploration schedule. Defaults use a small batch and replay buffer to
    stay within 4GB of VRAM.
    """

    def __init__(
        self,
        state_shape=(80, 80, 1),  # Grayscale 80x80 images
        action_size=3,             # run, jump, duck
        learning_rate=0.00025,
        gamma=0.99,                # Discount factor
        epsilon_start=1.0,
        epsilon_end=0.1,
        epsilon_decay_steps=10000,
        batch_size=32,             # Small batch for 4GB VRAM
        memory_size=10000,         # Replay buffer size
        target_update_freq=1000    # Update target network every N steps
    ):
        """
        Build the networks, replay buffer and optimizer.

        Args:
            state_shape: Shape of a single observation fed to the network.
            action_size: Number of discrete actions (size of the Q output).
            learning_rate: Adam learning rate.
            gamma: Discount factor used in the Q-learning target.
            epsilon_start: Initial exploration probability.
            epsilon_end: Floor for the exploration probability.
            epsilon_decay_steps: Number of update_epsilon() calls needed to
                go from epsilon_start to epsilon_end. Values <= 0 are treated
                as a single step instead of raising ZeroDivisionError.
            batch_size: Number of transitions sampled per replay() call.
            memory_size: Maximum number of transitions kept in the buffer.
            target_update_freq: Stored for the caller; the agent itself never
                triggers a target update.
        """
        self.state_shape = state_shape
        self.action_size = action_size
        self.learning_rate = learning_rate
        self.gamma = gamma

        # Exploration parameters
        self.epsilon = epsilon_start
        self.epsilon_start = epsilon_start
        self.epsilon_end = epsilon_end
        self.epsilon_decay_steps = epsilon_decay_steps
        self.epsilon_decay_rate = self._linear_decay_rate(
            epsilon_start, epsilon_end, epsilon_decay_steps
        )

        # Training parameters
        self.batch_size = batch_size
        self.target_update_freq = target_update_freq
        # Step counter; not advanced by any method of this class, so the
        # caller is expected to increment it.
        self.steps = 0

        # Experience replay buffer (oldest transitions are evicted first)
        self.memory = deque(maxlen=memory_size)

        # Create Q-network and target network, starting from equal weights
        self.q_network = self._build_model()
        self.target_network = self._build_model()
        self.update_target_network()

        # Optimizer
        self.optimizer = keras.optimizers.Adam(learning_rate=learning_rate)

        print("‚úì DQN Agent initialized")
        print(f"  - State shape: {state_shape}")
        print(f"  - Action size: {action_size}")
        print(f"  - Batch size: {batch_size}")
        print(f"  - Memory size: {memory_size}")

    @staticmethod
    def _linear_decay_rate(epsilon_start, epsilon_end, epsilon_decay_steps):
        """Return the per-step epsilon decrement for a linear schedule.

        A non-positive step count is clamped to 1 so the schedule reaches
        epsilon_end after a single update instead of dividing by zero.
        """
        return (epsilon_start - epsilon_end) / max(1, epsilon_decay_steps)

    def _build_model(self):
        """
        Build CNN architecture for processing game screenshots.

        Architecture:
        - Conv2D(32, 8x8, stride 4) -> ReLU
        - Conv2D(64, 4x4, stride 2) -> ReLU
        - Conv2D(64, 3x3, stride 1) -> ReLU
        - Flatten
        - Dense(512) -> ReLU
        - Dense(action_size) -> Linear (Q-values)

        Returns:
            An uncompiled keras.Model mapping a batch of states to Q-values.
        """
        inputs = layers.Input(shape=self.state_shape)

        # Scale inputs by 1/255 (assumes raw pixel values in [0, 255])
        x = layers.Lambda(lambda x: x / 255.0)(inputs)

        # Convolutional layers
        x = layers.Conv2D(32, (8, 8), strides=4, activation='relu',
                         kernel_initializer='he_normal')(x)
        x = layers.Conv2D(64, (4, 4), strides=2, activation='relu',
                         kernel_initializer='he_normal')(x)
        x = layers.Conv2D(64, (3, 3), strides=1, activation='relu',
                         kernel_initializer='he_normal')(x)

        # Flatten and dense layers
        x = layers.Flatten()(x)
        x = layers.Dense(512, activation='relu',
                        kernel_initializer='he_normal')(x)

        # Output layer - Q-values for each action
        outputs = layers.Dense(self.action_size, activation='linear',
                              kernel_initializer='he_normal',
                              dtype='float32')(x)  # Force float32 for stability

        model = keras.Model(inputs=inputs, outputs=outputs)
        return model

    def update_target_network(self):
        """Copy weights from Q-network to target network"""
        self.target_network.set_weights(self.q_network.get_weights())

    def remember(self, state, action, reward, next_state, done):
        """Store one (state, action, reward, next_state, done) transition."""
        self.memory.append((state, action, reward, next_state, done))

    def act(self, state, training=True):
        """
        Choose action using epsilon-greedy policy.

        Args:
            state: Current game state (a single observation, no batch axis)
            training: If True, use epsilon-greedy; if False, use greedy

        Returns:
            action: Integer action index in [0, action_size)
        """
        # Exploration: random action
        if training and np.random.rand() <= self.epsilon:
            return np.random.randint(self.action_size)

        # Exploitation: best action from Q-network.
        # Calling the model directly avoids the per-call overhead of
        # Model.predict(), which is meant for large batched inference.
        state_batch = np.expand_dims(np.asarray(state, dtype=np.float32), axis=0)
        q_values = np.asarray(self.q_network(state_batch, training=False))
        return int(np.argmax(q_values[0]))

    def replay(self):
        """
        Train on a batch of experiences from replay buffer.

        Returns:
            loss: Training loss as a Python float, or 0.0 when the buffer
                holds fewer than batch_size transitions and no update ran.
        """
        if len(self.memory) < self.batch_size:
            return 0.0

        # Sample random batch and unpack it column-wise
        batch = random.sample(self.memory, self.batch_size)
        states, actions, rewards, next_states, dones = zip(*batch)

        # Explicit dtypes keep the targets in float32, matching the float32
        # Q-value output, instead of silently promoting to float64.
        states = np.asarray(states, dtype=np.float32)
        actions = np.asarray(actions, dtype=np.int32)
        rewards = np.asarray(rewards, dtype=np.float32)
        next_states = np.asarray(next_states, dtype=np.float32)
        dones = np.asarray(dones, dtype=np.float32)

        # Compute target Q-values using target network (single batch, so a
        # direct call is cheaper than Model.predict()).
        next_q_values = np.asarray(self.target_network(next_states, training=False))
        max_next_q = next_q_values.max(axis=1)

        # Q-learning target: r + gamma * max(Q(s', a')) if not done, else r
        targets = rewards + (1.0 - dones) * self.gamma * max_next_q
        targets = tf.convert_to_tensor(targets, dtype=tf.float32)

        # NOTE(review): this custom loop applies no loss scaling. If a
        # mixed_float16 global policy is active, TensorFlow's mixed-precision
        # guide recommends a LossScaleOptimizer - confirm and add if needed.
        with tf.GradientTape() as tape:
            # Get current Q-values
            q_values = self.q_network(states, training=True)

            # Select Q-values for actions taken
            action_masks = tf.one_hot(actions, self.action_size)
            q_action = tf.reduce_sum(q_values * action_masks, axis=1)

            # Compute loss (MSE between predicted and target Q-values)
            loss = tf.reduce_mean(tf.square(targets - q_action))

        # Backpropagation
        gradients = tape.gradient(loss, self.q_network.trainable_variables)
        self.optimizer.apply_gradients(zip(gradients, self.q_network.trainable_variables))

        return float(loss)

    def update_epsilon(self):
        """Decay epsilon by one linear step, never going below epsilon_end."""
        if self.epsilon > self.epsilon_end:
            self.epsilon -= self.epsilon_decay_rate
            self.epsilon = max(self.epsilon_end, self.epsilon)

    def save(self, filepath):
        """Save the Q-network weights (the target network is not saved)."""
        self.q_network.save_weights(filepath)
        print(f"‚úì Model saved to {filepath}")

    def load(self, filepath):
        """Load Q-network weights and mirror them into the target network."""
        self.q_network.load_weights(filepath)
        self.update_target_network()
        print(f"‚úì Model loaded from {filepath}")

## 4. Create Model Summary

In [None]:
# Build a throwaway agent purely to inspect the network architecture
dummy_agent = DQNAgent()
dummy_agent.q_network.summary()

# Rough memory footprint: every parameter is counted as one 4-byte float32,
# and the "with batch" figure is simply three times the parameter size.
total_params = dummy_agent.q_network.count_params()
param_size_mb = (total_params * 4) / (1024 ** 2)
print(
    "\nüìä Model Statistics:",
    f"  Total parameters: {total_params:,}",
    f"  Approximate size: {param_size_mb:.2f} MB",
    f"  Estimated VRAM usage (with batch): ~{param_size_mb * 3:.2f} MB",
    f"  Safe for RTX 3050 4GB: {'‚úì Yes' if param_size_mb * 3 < 3000 else '‚úó No'}",
    sep="\n",
)

# Drop the reference to the throwaway agent
del dummy_agent

## 5. Training Configuration

In [None]:
# Hyperparameters and bookkeeping intervals shared by the training cells.
# Key order is preserved because the dict is printed below in order.
CONFIG = dict(
    # Training
    num_episodes=1000,
    max_steps_per_episode=5000,

    # DQN parameters
    learning_rate=0.00025,
    gamma=0.99,
    batch_size=32,
    memory_size=10000,
    target_update_freq=1000,

    # Exploration
    epsilon_start=1.0,
    epsilon_end=0.1,
    epsilon_decay_steps=10000,

    # Logging (all intervals are measured in episodes)
    log_interval=10,
    save_interval=50,
    plot_interval=50,
)

# Make sure the output folders for weights, metrics and figures exist
for output_dir in ('models', 'logs', 'plots'):
    os.makedirs(output_dir, exist_ok=True)

print("üìã Training Configuration:")
print("\n".join(f"  {key}: {value}" for key, value in CONFIG.items()))

## 6. Initialize Environment and Agent

In [None]:
# Bring up the browser-backed game environment and report its spaces
print("üéÆ Initializing Chrome Dino environment...")
env = BrowserDinoEnv()
print("‚úì Environment initialized")
print(f"  Observation space: {env.observation_space}")
print(f"  Action space: {env.action_space}")

# Build the agent; every tunable hyperparameter is forwarded from CONFIG,
# while the observation shape and action count are fixed here.
print("\nü§ñ Initializing DQN agent...")
agent_hyperparams = (
    'learning_rate',
    'gamma',
    'epsilon_start',
    'epsilon_end',
    'epsilon_decay_steps',
    'batch_size',
    'memory_size',
    'target_update_freq',
)
agent = DQNAgent(
    state_shape=(80, 80, 1),
    action_size=3,
    **{name: CONFIG[name] for name in agent_hyperparams}
)

print("\n‚úÖ Ready to train!")

## 7. Training Loop

In [None]:
def _json_default(value):
    """Convert NumPy scalars/arrays to built-in types for json.dump.

    Rewards, scores and losses may be NumPy scalars (e.g. np.float32), which
    the json module cannot serialize on its own.
    """
    if isinstance(value, np.generic):
        return value.item()
    if isinstance(value, np.ndarray):
        return value.tolist()
    raise TypeError(f"Object of type {type(value).__name__} is not JSON serializable")


# Training metrics (one entry per completed episode)
episode_rewards = []
episode_scores = []
episode_losses = []
episode_lengths = []
epsilon_history = []

# Best performance tracking
best_score = 0
best_reward = float('-inf')

print("üöÄ Starting training...\n")
print("=" * 80)

try:
    for episode in range(CONFIG['num_episodes']):
        # Reset environment
        state, info = env.reset()
        state = np.expand_dims(state, axis=-1)  # Add channel dimension

        episode_reward = 0
        episode_loss = []
        done = False
        step = 0

        # Episode loop
        while not done and step < CONFIG['max_steps_per_episode']:
            # Select action
            action = agent.act(state, training=True)

            # Take step in environment
            next_state, reward, terminated, truncated, info = env.step(action)
            done = terminated or truncated
            next_state = np.expand_dims(next_state, axis=-1)

            # Store experience. Only a true termination marks the transition
            # as terminal: a truncated episode did not reach an end state, so
            # the Q-target should still bootstrap from next_state.
            agent.remember(state, action, reward, next_state, terminated)

            # Train agent (replay() returns 0.0 while the buffer is filling)
            loss = agent.replay()
            if loss > 0:
                episode_loss.append(loss)

            # Update state
            state = next_state
            episode_reward += reward
            step += 1
            agent.steps += 1

            # Update target network
            if agent.steps % agent.target_update_freq == 0:
                agent.update_target_network()

            # Decay epsilon
            agent.update_epsilon()

        # Episode complete - record metrics
        score = info.get('score', 0)
        episode_rewards.append(episode_reward)
        episode_scores.append(score)
        episode_lengths.append(step)
        # float() keeps the stored mean a built-in type (np.mean of float32
        # losses yields np.float32, which json cannot serialize).
        episode_losses.append(float(np.mean(episode_loss)) if episode_loss else 0.0)
        epsilon_history.append(agent.epsilon)

        # Update best performance
        if score > best_score:
            best_score = score
            agent.save('models/best_model.weights.h5')

        if episode_reward > best_reward:
            best_reward = episode_reward

        # Logging
        if (episode + 1) % CONFIG['log_interval'] == 0:
            window = CONFIG['log_interval']
            avg_reward = np.mean(episode_rewards[-window:])
            avg_score = np.mean(episode_scores[-window:])
            avg_loss = np.mean(episode_losses[-window:])
            avg_length = np.mean(episode_lengths[-window:])

            print(f"Episode {episode + 1}/{CONFIG['num_episodes']}")
            print(f"  Avg Reward: {avg_reward:.2f} | Avg Score: {avg_score:.0f} | "
                  f"Avg Steps: {avg_length:.0f}")
            print(f"  Loss: {avg_loss:.4f} | Epsilon: {agent.epsilon:.3f} | "
                  f"Memory: {len(agent.memory)}/{CONFIG['memory_size']}")
            print(f"  Best Score: {best_score:.0f} | Best Reward: {best_reward:.2f}")
            print("-" * 80)

        # Save checkpoint
        if (episode + 1) % CONFIG['save_interval'] == 0:
            agent.save(f'models/checkpoint_ep{episode + 1}.weights.h5')

            # Save metrics
            metrics = {
                'episode': episode + 1,
                'episode_rewards': episode_rewards,
                'episode_scores': episode_scores,
                'episode_losses': episode_losses,
                'episode_lengths': episode_lengths,
                'epsilon_history': epsilon_history,
                'best_score': best_score,
                'best_reward': best_reward,
                'config': CONFIG
            }

            with open(f'logs/metrics_ep{episode + 1}.json', 'w') as f:
                json.dump(metrics, f, indent=2, default=_json_default)

except KeyboardInterrupt:
    print("\n‚ö† Training interrupted by user")
    print("Saving current progress...")
    # Name the file after the number of completed episodes rather than the
    # loop variable, which is unbound if the interrupt lands before the
    # first iteration starts.
    agent.save(f'models/interrupted_ep{len(episode_rewards)}.weights.h5')

finally:
    # Runs on normal completion, interrupt and error alike so the browser
    # is always released.
    print("\nüèÅ Training complete!")
    print(f"  Total episodes: {len(episode_rewards)}")
    print(f"  Best score: {best_score}")
    print(f"  Best reward: {best_reward:.2f}")

    # Close environment
    env.close()
    print("‚úì Browser closed")

## 8. Training Visualization

In [None]:
def _plot_with_moving_average(ax, values, raw_label, window=50):
    """Plot raw per-episode values plus a trailing moving average.

    The window shrinks to len(values) when fewer episodes are available, and
    the average is skipped for empty input (np.convolve rejects empty
    arrays). Each averaged point is drawn at the episode where its window
    ends, so the smoothed curve lines up with the raw one.
    """
    ax.plot(values, alpha=0.3, label=raw_label)

    effective_window = min(window, len(values))
    if effective_window > 0:
        kernel = np.ones(effective_window) / effective_window
        smoothed = np.convolve(values, kernel, mode='valid')
        positions = np.arange(effective_window - 1, len(values))
        ax.plot(positions, smoothed,
                label=f'Moving Avg ({effective_window})', linewidth=2)


# The figure is written into plots/, so make sure the folder exists
os.makedirs('plots', exist_ok=True)

# Plot training metrics
fig, axes = plt.subplots(2, 2, figsize=(15, 10))
fig.suptitle('DQN Training Metrics', fontsize=16, fontweight='bold')

# Rewards
_plot_with_moving_average(axes[0, 0], episode_rewards, 'Episode Reward')
axes[0, 0].set_xlabel('Episode')
axes[0, 0].set_ylabel('Total Reward')
axes[0, 0].set_title('Episode Rewards')
axes[0, 0].legend()
axes[0, 0].grid(True, alpha=0.3)

# Scores
_plot_with_moving_average(axes[0, 1], episode_scores, 'Episode Score')
axes[0, 1].set_xlabel('Episode')
axes[0, 1].set_ylabel('Score (Distance)')
axes[0, 1].set_title('Game Scores')
axes[0, 1].legend()
axes[0, 1].grid(True, alpha=0.3)

# Loss
axes[1, 0].plot(episode_losses, alpha=0.6)
axes[1, 0].set_xlabel('Episode')
axes[1, 0].set_ylabel('Loss')
axes[1, 0].set_title('Training Loss')
axes[1, 0].grid(True, alpha=0.3)

# Epsilon
axes[1, 1].plot(epsilon_history, color='orange')
axes[1, 1].set_xlabel('Episode')
axes[1, 1].set_ylabel('Epsilon')
axes[1, 1].set_title('Exploration Rate (Epsilon)')
axes[1, 1].grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig('plots/training_metrics.png', dpi=150, bbox_inches='tight')
plt.show()

print("‚úì Training plots saved to plots/training_metrics.png")

## 9. Test Trained Agent

In [None]:
# Test the best model
print("üéÆ Testing trained agent...\n")

# Load the best checkpoint if one was written; it only exists when some
# training episode scored above zero, so fall back to the current weights.
best_model_path = 'models/best_model.weights.h5'
if os.path.exists(best_model_path):
    agent.load(best_model_path)
else:
    print(f"No best model found at {best_model_path} - testing with current weights")
agent.epsilon = 0.0  # No exploration during testing

# Run test episodes
num_test_episodes = 5
max_test_steps = 5000  # Per-episode step cap
test_scores = []
test_rewards = []

env = BrowserDinoEnv()

# try/finally guarantees the browser is closed even if an episode raises
# or the user interrupts the run.
try:
    for episode in range(num_test_episodes):
        state, info = env.reset()
        state = np.expand_dims(state, axis=-1)  # Add channel dimension

        episode_reward = 0
        done = False
        step = 0

        while not done and step < max_test_steps:
            # Greedy action: training=False bypasses epsilon entirely
            action = agent.act(state, training=False)
            next_state, reward, terminated, truncated, info = env.step(action)
            done = terminated or truncated

            state = np.expand_dims(next_state, axis=-1)
            episode_reward += reward
            step += 1

        test_scores.append(info.get('score', 0))
        test_rewards.append(episode_reward)

        print(f"Test Episode {episode + 1}: Score={info.get('score', 0):.0f}, "
              f"Reward={episode_reward:.2f}, Steps={step}")
finally:
    env.close()

print("\nüìä Test Results:")
print(f"  Average Score: {np.mean(test_scores):.0f} ¬± {np.std(test_scores):.0f}")
print(f"  Average Reward: {np.mean(test_rewards):.2f} ¬± {np.std(test_rewards):.2f}")
print(f"  Best Score: {max(test_scores):.0f}")
print(f"  Worst Score: {min(test_scores):.0f}")

## 10. Save Final Model and Summary

In [None]:
# Assemble a run summary and write it to a timestamped JSON file in logs/
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
summary_path = f'logs/training_summary_{timestamp}.json'

# Hardware description (the GPU name and VRAM size are fixed labels,
# not values queried from the machine)
hardware_info = {
    'gpu': 'RTX 3050 Mobile',
    'vram': '4GB',
    'tensorflow_version': tf.__version__,
}

# Aggregate results of the training phase
training_info = {
    'total_episodes': len(episode_rewards),
    'total_steps': agent.steps,
    'best_score': best_score,
    'best_reward': best_reward,
    'final_epsilon': agent.epsilon,
}

# Aggregate results of the evaluation phase, cast to built-in floats
test_info = {
    'num_episodes': num_test_episodes,
    'avg_score': float(np.mean(test_scores)),
    'avg_reward': float(np.mean(test_rewards)),
    'best_test_score': float(max(test_scores)),
}

summary = {
    'timestamp': timestamp,
    'hardware': hardware_info,
    'training': training_info,
    'test_results': test_info,
    'config': CONFIG,
}

with open(summary_path, 'w') as f:
    json.dump(summary, f, indent=2)

print(f"‚úì Training summary saved to {summary_path}")
print("\nüéâ Training pipeline complete!")