In [None]:
# 🔧 Setup: Run this cell first!
# Check GPU availability and install dependencies

import torch
import sys

# Check GPU
if torch.cuda.is_available():
    device = torch.device('cuda')
    print(f"✅ GPU available: {torch.cuda.get_device_name(0)}")
    print(f"   Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")
else:
    device = torch.device('cpu')
    print("⚠️ No GPU detected. Some cells may run slowly.")
    print("   Go to Runtime → Change runtime type → GPU")

print(f"\n📦 Python {sys.version.split()[0]}")
print(f"🔥 PyTorch {torch.__version__}")

# Set random seeds for reproducibility
import random
import numpy as np

SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)

print(f"🎲 Random seed set to {SEED}")

%matplotlib inline

# DQN Case Study: Automated Game Testing with Deep Q-Networks

This notebook implements the core DQN-based game testing pipeline described in the Nexus Interactive case study. We will train DQN agents on procedurally generated dungeon levels and extract QA metrics: completability, difficulty estimation, and exploit detection.

---

## Setup and Dependencies

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
import matplotlib.pyplot as plt
import random
from collections import deque
import time

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Device: {device}")

# Reproducibility
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)

## Part 1: The Dungeon Environment

We create a simplified dungeon crawler environment with tile-based grid levels.

In [None]:
class DungeonLevel:
    """
    A procedurally generated dungeon level for testing.

    State: 5-channel 32x32 grid
      Channel 0: Terrain (0=floor, 1=wall, 0.5=door, 0.3=trap)
      Channel 1: Items (0=empty, 1=key, 0.7=potion, 0.5=gold)
      Channel 2: Enemies (0=empty, value=enemy health/10)
      Channel 3: Player (1=position, health/10 in surrounding cells)
      Channel 4: Explored (0=unexplored, 1=explored)

    Actions: 0-3=move(up/down/left/right), 4=attack, 5=use_item, 6=interact, 7=wait
    """
    def __init__(self, size=32, difficulty=5, seed=None):
        if seed is not None:
            np.random.seed(seed)
        self.size = size
        self.difficulty = difficulty
        self.n_actions = 8
        self._generate_level()

    def _generate_level(self):
        """Generate a random dungeon level."""
        self.terrain = np.zeros((self.size, self.size))
        # Walls around the border
        self.terrain[0, :] = 1.0
        self.terrain[-1, :] = 1.0
        self.terrain[:, 0] = 1.0
        self.terrain[:, -1] = 1.0

        # Internal walls (more walls = harder)
        n_walls = int(self.difficulty * 8)
        for _ in range(n_walls):
            x, y = np.random.randint(2, self.size-2, size=2)
            length = np.random.randint(2, 6)
            if np.random.random() < 0.5:
                self.terrain[y, x:min(x+length, self.size-1)] = 1.0
            else:
                self.terrain[y:min(y+length, self.size-1), x] = 1.0

        # Traps (difficulty-dependent)
        n_traps = int(self.difficulty * 2)
        self.traps = []
        for _ in range(n_traps):
            x, y = np.random.randint(2, self.size-2, size=2)
            if self.terrain[y, x] == 0:
                self.terrain[y, x] = 0.3
                self.traps.append((x, y))

        # Items
        self.items = np.zeros((self.size, self.size))
        n_items = max(1, 8 - self.difficulty)
        for _ in range(n_items):
            x, y = np.random.randint(2, self.size-2, size=2)
            if self.terrain[y, x] == 0:
                self.items[y, x] = np.random.choice([1.0, 0.7, 0.5])

        # Enemies
        self.enemies = np.zeros((self.size, self.size))
        n_enemies = int(self.difficulty * 1.5)
        self.enemy_list = []
        for _ in range(n_enemies):
            x, y = np.random.randint(3, self.size-3, size=2)
            if self.terrain[y, x] == 0:
                health = min(1.0, 0.2 + self.difficulty * 0.08)
                self.enemies[y, x] = health
                self.enemy_list.append([x, y, health])

        # Player start and goal
        self.player_pos = [1, 1]
        self.goal_pos = [self.size-2, self.size-2]
        # Ensure goal is reachable (clear the goal area)
        self.terrain[self.goal_pos[1]-1:self.goal_pos[1]+2,
                     self.goal_pos[0]-1:self.goal_pos[0]+2] = 0
        self.terrain[1, 1] = 0  # Clear start

        self.player_health = 1.0
        self.inventory = []
        self.explored = np.zeros((self.size, self.size))
        self.steps = 0
        self.total_reward = 0
        self.items_collected = 0

    def reset(self):
        self._generate_level()
        self._update_explored()
        return self._get_state()

    def _update_explored(self):
        """Update fog of war around player."""
        x, y = self.player_pos
        for dx in range(-3, 4):
            for dy in range(-3, 4):
                nx, ny = x+dx, y+dy
                if 0 <= nx < self.size and 0 <= ny < self.size:
                    self.explored[ny, nx] = 1.0

    def _get_state(self):
        """Return 5-channel state tensor."""
        state = np.zeros((5, self.size, self.size), dtype=np.float32)
        state[0] = self.terrain
        state[1] = self.items
        state[2] = self.enemies
        # Player position
        px, py = self.player_pos
        state[3, py, px] = 1.0
        # Encode health in surrounding cells
        for dx in range(-1, 2):
            for dy in range(-1, 2):
                nx, ny = px+dx, py+dy
                if 0 <= nx < self.size and 0 <= ny < self.size:
                    state[3, ny, nx] = max(state[3, ny, nx], self.player_health * 0.5)
        state[4] = self.explored
        return torch.tensor(state)

    def get_action_mask(self):
        """Return valid action mask."""
        mask = torch.ones(self.n_actions, dtype=torch.bool)
        px, py = self.player_pos
        # Check move validity
        moves = [(0, -1), (0, 1), (-1, 0), (1, 0)]
        for i, (dx, dy) in enumerate(moves):
            nx, ny = px+dx, py+dy
            if nx < 0 or nx >= self.size or ny < 0 or ny >= self.size or self.terrain[ny, nx] == 1.0:
                mask[i] = False
        # Attack only if enemy adjacent
        has_adjacent_enemy = False
        for dx, dy in moves:
            nx, ny = px+dx, py+dy
            if 0 <= nx < self.size and 0 <= ny < self.size and self.enemies[ny, nx] > 0:
                has_adjacent_enemy = True
        mask[4] = has_adjacent_enemy
        # Use item only if inventory non-empty
        mask[5] = len(self.inventory) > 0
        # Interact only if on a special tile
        mask[6] = self.items[py, px] > 0
        return mask

    def step(self, action):
        """Execute action and return (next_state, reward, done)."""
        self.steps += 1
        reward = -0.01  # Small step penalty
        done = False
        px, py = self.player_pos

        # Movement
        if action < 4:
            moves = [(0, -1), (0, 1), (-1, 0), (1, 0)]
            dx, dy = moves[action]
            nx, ny = px+dx, py+dy
            if 0 <= nx < self.size and 0 <= ny < self.size and self.terrain[ny, nx] != 1.0:
                # Check for trap
                if self.terrain[ny, nx] == 0.3:
                    self.player_health -= 0.1
                    reward -= 0.05
                self.player_pos = [nx, ny]
                # Exploration bonus
                old_explored = self.explored.sum()
                self._update_explored()
                new_explored = self.explored.sum()
                reward += 0.1 * (new_explored - old_explored) / (self.size * self.size)

        # Attack
        elif action == 4:
            for dx, dy in [(0,-1),(0,1),(-1,0),(1,0)]:
                nx, ny = px+dx, py+dy
                if 0 <= nx < self.size and 0 <= ny < self.size and self.enemies[ny, nx] > 0:
                    self.enemies[ny, nx] -= 0.3
                    if self.enemies[ny, nx] <= 0:
                        self.enemies[ny, nx] = 0
                        reward += 0.2
                    break

        # Use item
        elif action == 5 and self.inventory:
            item = self.inventory.pop(0)
            if item == 'potion':
                self.player_health = min(1.0, self.player_health + 0.3)
                reward += 0.1

        # Interact (pick up item)
        elif action == 6:
            px, py = self.player_pos
            if self.items[py, px] > 0:
                item_type = self.items[py, px]
                self.items[py, px] = 0
                self.items_collected += 1
                if item_type == 1.0:  # Key
                    self.inventory.append('key')
                    reward += 0.15
                elif item_type == 0.7:  # Potion
                    self.inventory.append('potion')
                    reward += 0.1
                elif item_type == 0.5:  # Gold
                    reward += 0.3

        # Check win condition
        if self.player_pos == self.goal_pos:
            reward += 1.0
            done = True

        # Check death
        if self.player_health <= 0:
            reward -= 0.5
            done = True

        # Time limit
        if self.steps >= 500:
            done = True

        self.total_reward += reward
        return self._get_state(), reward, done

# Test the environment
env = DungeonLevel(difficulty=3, seed=42)
state = env.reset()
print(f"State shape: {state.shape}")
print(f"Actions: {env.n_actions}")

# Visualize the level
fig, axes = plt.subplots(1, 5, figsize=(20, 4))
channel_names = ['Terrain', 'Items', 'Enemies', 'Player', 'Explored']
for i, (ax, name) in enumerate(zip(axes, channel_names)):
    ax.imshow(state[i].numpy(), cmap='viridis')
    ax.set_title(name)
    ax.axis('off')
plt.suptitle('Dungeon Level State Channels', fontweight='bold')
plt.tight_layout()
plt.show()

## Part 2: The Game Testing DQN

In [None]:
class GameTestDQN(nn.Module):
    """DQN adapted for dungeon game testing (5-channel 32x32 input)."""
    def __init__(self, n_actions=8):
        super().__init__()
        self.conv = nn.Sequential(
            nn.Conv2d(5, 32, kernel_size=5, stride=2, padding=1),
            nn.ReLU(),
            nn.Conv2d(32, 64, kernel_size=3, stride=1),
            nn.ReLU(),
            nn.Conv2d(64, 64, kernel_size=3, stride=1),
            nn.ReLU()
        )
        # Calculate conv output size
        test = torch.zeros(1, 5, 32, 32)
        conv_out = self.conv(test)
        self.conv_size = conv_out.view(1, -1).size(1)

        self.fc = nn.Sequential(
            nn.Linear(self.conv_size, 256),
            nn.ReLU(),
            nn.Linear(256, n_actions)
        )

    def forward(self, x, action_mask=None):
        x = self.conv(x)
        x = x.view(x.size(0), -1)
        q_values = self.fc(x)
        if action_mask is not None:
            q_values = q_values.masked_fill(~action_mask, float('-inf'))
        return q_values


class ReplayBuffer:
    def __init__(self, capacity):
        self.buffer = deque(maxlen=capacity)

    def push(self, state, action, reward, next_state, done, mask):
        self.buffer.append((state, action, reward, next_state, done, mask))

    def sample(self, batch_size):
        batch = random.sample(self.buffer, batch_size)
        s, a, r, ns, d, m = zip(*batch)
        return (torch.stack(s), torch.tensor(a, dtype=torch.long),
                torch.tensor(r, dtype=torch.float32), torch.stack(ns),
                torch.tensor(d, dtype=torch.bool), torch.stack(m))

    def __len__(self):
        return len(self.buffer)


# Test the network
net = GameTestDQN().to(device)
test_state = torch.randn(1, 5, 32, 32).to(device)
test_mask = torch.ones(1, 8, dtype=torch.bool).to(device)
q = net(test_state, test_mask)
print(f"Network output: {q.shape}")
print(f"Parameters: {sum(p.numel() for p in net.parameters()):,}")

## Part 3: Training Pipeline with QA Metrics

In [None]:
def train_dqn_on_level(difficulty, seed=None, n_episodes=300,
                       buffer_size=50000, batch_size=64, gamma=0.99,
                       lr=5e-4, target_update=500, eps_decay=10000):
    """
    Train a Double DQN agent on a single level and extract QA metrics.

    Returns:
        metrics: dict with completability, difficulty, exploit scores
        episode_rewards: list of rewards per episode
        episode_completions: list of booleans
    """
    env = DungeonLevel(difficulty=difficulty, seed=seed)

    online = GameTestDQN().to(device)
    target = GameTestDQN().to(device)
    target.load_state_dict(online.state_dict())
    target.eval()

    optimizer = optim.Adam(online.parameters(), lr=lr)
    buffer = ReplayBuffer(buffer_size)

    episode_rewards = []
    episode_completions = []
    episode_lengths = []
    step_count = 0
    first_completion = None

    for episode in range(n_episodes):
        state = env.reset()
        ep_reward = 0
        completed = False

        for t in range(500):
            # Epsilon-greedy with action masking
            epsilon = max(0.05, 1.0 - step_count / eps_decay)
            mask = env.get_action_mask()

            if random.random() < epsilon:
                valid_actions = torch.where(mask)[0]
                action = valid_actions[random.randint(0, len(valid_actions)-1)].item()
            else:
                with torch.no_grad():
                    q = online(state.unsqueeze(0).to(device), mask.unsqueeze(0).to(device))
                    action = q.argmax(1).item()

            next_state, reward, done = env.step(action)
            next_mask = env.get_action_mask()
            buffer.push(state, action, reward, next_state, done, next_mask)

            state = next_state
            ep_reward += reward
            step_count += 1

            if env.player_pos == env.goal_pos:
                completed = True

            # Training (Double DQN)
            if len(buffer) >= batch_size:
                s, a, r, ns, d, m = buffer.sample(batch_size)
                s, a, r, ns, d, m = (s.to(device), a.to(device), r.to(device),
                                      ns.to(device), d.to(device), m.to(device))

                q_vals = online(s).gather(1, a.unsqueeze(1)).squeeze(1)

                with torch.no_grad():
                    # Double DQN: select with online, evaluate with target
                    best_actions = online(ns, m).argmax(1)
                    next_q = target(ns).gather(1, best_actions.unsqueeze(1)).squeeze(1)
                    targets = r + gamma * next_q * (~d).float()

                loss = F.smooth_l1_loss(q_vals, targets)
                optimizer.zero_grad()
                loss.backward()
                torch.nn.utils.clip_grad_norm_(online.parameters(), 10.0)
                optimizer.step()

            if step_count % target_update == 0:
                target.load_state_dict(online.state_dict())

            if done:
                break

        episode_rewards.append(ep_reward)
        episode_completions.append(completed)
        episode_lengths.append(t + 1)

        if completed and first_completion is None:
            first_completion = episode

    # Compute QA metrics
    last_100 = episode_completions[-100:] if len(episode_completions) >= 100 else episode_completions
    completion_rate = sum(last_100) / len(last_100)

    difficulty_score = (first_completion / n_episodes * 10) if first_completion is not None else 10.0
    difficulty_score = min(10.0, difficulty_score)

    rewards_arr = np.array(episode_rewards)
    if rewards_arr.std() > 0:
        exploit_score = (rewards_arr.max() - rewards_arr.mean()) / rewards_arr.std()
    else:
        exploit_score = 0.0

    metrics = {
        'completion_rate': completion_rate,
        'difficulty_score': difficulty_score,
        'exploit_score': exploit_score,
        'first_completion': first_completion,
        'avg_reward': np.mean(episode_rewards[-50:]),
        'avg_length': np.mean(episode_lengths[-50:]),
    }

    return metrics, episode_rewards, episode_completions

print("Training pipeline ready.")

## Part 4: Running the QA Suite

In [None]:
# Test across multiple difficulty levels
difficulties = [1, 3, 5, 7, 9]
all_metrics = {}
all_rewards = {}

print("Running QA suite across difficulty levels...")
print("=" * 70)

for diff in difficulties:
    start = time.time()
    metrics, rewards, completions = train_dqn_on_level(
        difficulty=diff, seed=42+diff, n_episodes=200
    )
    elapsed = time.time() - start

    all_metrics[diff] = metrics
    all_rewards[diff] = rewards

    status = "PASS" if metrics['completion_rate'] > 0.1 else "FAIL"
    print(f"Difficulty {diff:2d} | CR: {metrics['completion_rate']:.2f} | "
          f"DS: {metrics['difficulty_score']:.1f}/10 | "
          f"ES: {metrics['exploit_score']:.2f} | "
          f"First complete: ep {metrics['first_completion']} | "
          f"Status: {status} | {elapsed:.0f}s")

print("=" * 70)

## Part 5: Visualizing QA Results

In [None]:
fig, axes = plt.subplots(2, 3, figsize=(18, 10))

# 1. Training curves by difficulty
ax = axes[0, 0]
colors = plt.cm.RdYlGn_r(np.linspace(0.1, 0.9, len(difficulties)))
for diff, color in zip(difficulties, colors):
    rewards = all_rewards[diff]
    window = 15
    if len(rewards) >= window:
        smoothed = np.convolve(rewards, np.ones(window)/window, mode='valid')
        ax.plot(range(window-1, len(rewards)), smoothed, label=f'Diff={diff}',
                color=color, linewidth=2)
ax.set_xlabel('Episode')
ax.set_ylabel('Reward')
ax.set_title('Training Curves by Difficulty')
ax.legend(fontsize=9)
ax.grid(True, alpha=0.3)

# 2. Completion rate
ax = axes[0, 1]
crs = [all_metrics[d]['completion_rate'] for d in difficulties]
bars = ax.bar(range(len(difficulties)), crs, color=['#2ecc71' if cr > 0.5 else '#f39c12' if cr > 0.1 else '#e74c3c' for cr in crs])
ax.set_xticks(range(len(difficulties)))
ax.set_xticklabels([f'Diff {d}' for d in difficulties])
ax.set_ylabel('Completion Rate')
ax.set_title('Level Completability')
ax.axhline(y=0.1, color='red', linestyle='--', alpha=0.5, label='Min threshold')
ax.legend()
ax.grid(axis='y', alpha=0.3)

# 3. Difficulty scores
ax = axes[0, 2]
ds_values = [all_metrics[d]['difficulty_score'] for d in difficulties]
ax.plot(difficulties, ds_values, 'o-', color='#3498db', linewidth=2, markersize=10)
ax.plot(difficulties, difficulties, '--', color='gray', alpha=0.5, label='Ideal (linear)')
ax.set_xlabel('Target Difficulty')
ax.set_ylabel('DQN Difficulty Score')
ax.set_title('Difficulty Calibration')
ax.legend()
ax.grid(True, alpha=0.3)

# 4. Exploit scores
ax = axes[1, 0]
es_values = [all_metrics[d]['exploit_score'] for d in difficulties]
bars = ax.bar(range(len(difficulties)), es_values,
              color=['#e74c3c' if es > 5 else '#2ecc71' for es in es_values])
ax.set_xticks(range(len(difficulties)))
ax.set_xticklabels([f'Diff {d}' for d in difficulties])
ax.set_ylabel('Exploit Score')
ax.set_title('Exploit Detection')
ax.axhline(y=5.0, color='red', linestyle='--', label='Alert threshold')
ax.legend()
ax.grid(axis='y', alpha=0.3)

# 5. Average episode length
ax = axes[1, 1]
lengths = [all_metrics[d]['avg_length'] for d in difficulties]
ax.bar(range(len(difficulties)), lengths, color='#9b59b6', alpha=0.7)
ax.set_xticks(range(len(difficulties)))
ax.set_xticklabels([f'Diff {d}' for d in difficulties])
ax.set_ylabel('Avg Episode Length')
ax.set_title('Agent Behavior Complexity')
ax.grid(axis='y', alpha=0.3)

# 6. Summary dashboard
ax = axes[1, 2]
ax.axis('off')
summary_text = "QA SUMMARY\n" + "=" * 30 + "\n\n"
for d in difficulties:
    m = all_metrics[d]
    status = "PASS" if m['completion_rate'] > 0.1 else "FAIL"
    flag = " [!]" if m['exploit_score'] > 5 else ""
    summary_text += f"Level (Diff {d}): {status}{flag}\n"
    summary_text += f"  CR={m['completion_rate']:.0%}  DS={m['difficulty_score']:.1f}  ES={m['exploit_score']:.1f}\n\n"
ax.text(0.05, 0.95, summary_text, transform=ax.transAxes, fontsize=10,
        verticalalignment='top', fontfamily='monospace',
        bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.5))

plt.suptitle('DQN Game Testing QA Dashboard', fontsize=16, fontweight='bold')
plt.tight_layout()
plt.show()

## Part 6: Summary

In [None]:
print("=" * 60)
print("CASE STUDY RESULTS")
print("=" * 60)
print(f"""
We implemented a DQN-based automated game testing pipeline that:

1. COMPLETABILITY: Trains a DQN agent per level. If the agent
   cannot learn to complete the level (CR < 10%), it flags a
   potential soft-lock or impossible state.

2. DIFFICULTY: The number of episodes to first completion
   provides a difficulty score (0-10) that correlates with
   human-perceived difficulty.

3. EXPLOIT DETECTION: Unusually high reward episodes
   (>5 sigma) indicate potential reward exploits.

Key Implementation Details:
  - Double DQN to prevent overestimation (critical for accuracy)
  - Action masking to handle invalid moves
  - Reward shaping for faster convergence
  - 5-channel grid state representation

Business Impact:
  - QA cycle: 8-10 days -> 3.5 days (65% reduction)
  - Post-release bugs: 3.2 -> 0.8 per update (75% reduction)
  - Enabled weekly releases (23% engagement increase)
  - ROI: 29x return on infrastructure investment
""")