# 🎮 Multi-Agent Soccer Game with Deep Reinforcement Learning
## Google Colab 完全統合版 (最終修正版)

このノートブックは深層強化学習を用いたマルチエージェントサッカーゲームの完全統合版です。

### ⚠️ 重要: 実行手順
1. **Runtime → Change runtime type → GPU を選択**
2. **Runtime → Run all または上から順番に実行**
3. **すべてのコードセルを実行してから実行セクションを使用**

### 📋 修正内容
- ✅ ModuleNotFoundError 修正済み
- ✅ TypeError (agent_selector) 修正済み
- ✅ すべての内部import削除済み

### 🎯 実装内容
- 2v2サッカーゲーム（PettingZoo互換）
- 物理エンジン・報酬システム完備
- Random, DQN, MADDPG エージェント実装

## 📦 Step 1: 必要なライブラリのインストール

In [None]:
# Install the required libraries.
# The lines starting with `!` are IPython/Colab shell escapes, so this cell only
# runs inside a notebook kernel; `-q` asks pip for quiet output.
!pip install -q gymnasium
!pip install -q pettingzoo
!pip install -q pygame
!pip install -q torch torchvision
!pip install -q matplotlib seaborn
!pip install -q numpy

# NOTE: printed unconditionally once the commands above return; the cell does
# not inspect pip's exit status.
print("✅ All dependencies installed successfully!")

## 🔧 Step 2: 基本インポート

In [None]:
# 基本ライブラリのインポート
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import matplotlib.pyplot as plt
import seaborn as sns
from typing import Dict, List, Tuple, Optional, Any, Union
import json
from collections import defaultdict, deque
import random
from dataclasses import dataclass
from abc import ABC, abstractmethod
import time
import os

# Gymnasium and PettingZoo (修正済み)
import gymnasium as gym
from gymnasium import spaces
from pettingzoo import AECEnv
from pettingzoo.utils import AgentSelector, wrappers  # Fixed: AgentSelector instead of agent_selector

# Pygame is treated as optional here: try the import once and remember the
# outcome in PYGAME_AVAILABLE instead of failing the whole cell.
try:
    import pygame
except ImportError:
    PYGAME_AVAILABLE = False
    print("⚠️ Pygame not available. Rendering disabled.")
else:
    PYGAME_AVAILABLE = True

# Set seeds for reproducibility.
# The stdlib `random` module is imported by this notebook as well, so it is
# seeded alongside NumPy and PyTorch; otherwise anything drawing from it would
# differ from run to run.
random.seed(42)
np.random.seed(42)
torch.manual_seed(42)
if torch.cuda.is_available():
    # Seed every visible GPU, not only the current device.
    torch.cuda.manual_seed_all(42)

# Plot styling: select the 'seaborn-v0_8-darkgrid' Matplotlib style sheet and
# the seaborn "husl" colour palette.
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")

# Module-level compute device: CUDA when PyTorch reports an available GPU,
# otherwise the CPU. The chosen device and the PyTorch version are echoed below.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"✅ PyTorch version: {torch.__version__}")
print(f"✅ Using device: {device}")

## ⚙️ Step 3: 設定クラス

In [None]:
"""
Configuration file for Multi-Agent Soccer Game
"""

from dataclasses import dataclass
from typing import Tuple

@dataclass
class SoccerEnvironmentConfig:
    """Environment configuration for the soccer game.

    Geometry values share one coordinate system with ``FIELD_SIZE``; the
    physics code unpacks ``FIELD_SIZE`` as ``(field_width, field_height)``.
    """
    # --- Geometry ---
    # (width, height) of the playing field.
    FIELD_SIZE: Tuple[int, int] = (800, 600)
    # Index 1 is the goal-mouth height, centred vertically on the left/right
    # edges by the ball physics. NOTE(review): index 0 is presumably the goal
    # depth used for drawing — confirm against the renderer.
    GOAL_SIZE: Tuple[int, int] = (20, 200)
    # Ball radius handed to the Ball created by the physics engine.
    BALL_RADIUS: int = 10
    # Player circle radius.
    PLAYER_RADIUS: int = 20
    # NOTE(review): presumably the episode step limit — confirm against the
    # environment class.
    MAX_STEPS: int = 1000

    # --- Teams and movement ---
    NUM_PLAYERS_PER_TEAM: int = 2
    TEAM_COLORS: Tuple[str, str] = ('blue', 'red')
    # Distance moved per step for a movement action component of magnitude 1.
    PLAYER_SPEED: float = 5.0
    # Scales kick power when a kick is added to the ball's velocity.
    BALL_SPEED_MULTIPLIER: float = 1.5

    # --- Damping ---
    # Factor applied to a player's velocity after each move.
    FRICTION: float = 0.95
    # Factor applied to the ball's velocity on every ball update.
    BALL_DECAY: float = 0.98
    # NOTE(review): presumably a contact distance — confirm where it is consumed.
    COLLISION_THRESHOLD: float = 30.0

@dataclass
class MADDPGConfig:
    """MADDPG algorithm configuration.

    NOTE(review): the code that consumes these values is not part of this
    section; the hyperparameter descriptions below follow the field names and
    common MADDPG usage and should be confirmed against the agent class.
    """
    # Per-agent sizes: 28 matches the observation vector built by
    # ObservationSpace, 5 matches the continuous action vector of ActionSpace.
    obs_dim: int = 28
    action_dim: int = 5
    # Joint sizes for 4 agents; these are plain literals and must be updated by
    # hand if obs_dim, action_dim or the number of agents changes.
    global_obs_dim: int = 112  # obs_dim * 4 agents (28 * 4)
    global_action_dim: int = 20  # action_dim * 4 agents (5 * 4)
    hidden_dims: Tuple[int, ...] = (256, 128)

    # Optimisation hyperparameters (presumably: learning rates, discount
    # factor, soft target-update rate, replay batch/buffer sizes — confirm).
    actor_lr: float = 1e-4
    critic_lr: float = 1e-3
    gamma: float = 0.95
    tau: float = 0.01
    batch_size: int = 256
    buffer_size: int = int(1e6)
    # Exploration noise (presumably initial scale and per-step multiplicative
    # decay — confirm).
    noise_scale: float = 0.1
    noise_decay: float = 0.9999

@dataclass
class TrainingConfig:
    """Training-loop configuration.

    NOTE(review): the training loop is not visible in this section; the
    ``*_freq`` fields are presumably intervals measured in episodes — confirm.
    """
    max_episodes: int = 10000
    # Same default (1000) as SoccerEnvironmentConfig.MAX_STEPS; the two are
    # independent fields and are not linked in code.
    max_steps_per_episode: int = 1000
    save_freq: int = 1000
    eval_freq: int = 500
    log_freq: int = 100

    # Reproducibility
    # Same value (42) as the seeds set in the import cell; not linked in code.
    random_seed: int = 42

@dataclass
class ExperimentConfig:
    """Experiment configuration: naming, output directories and algorithm list."""
    experiment_name: str = "soccer_multiagent"
    # Output locations, given as relative path strings.
    log_dir: str = "logs"
    save_dir: str = "saved_models"
    video_dir: str = "videos"

    # Algorithms to run
    # NOTE(review): "ppo" is listed here, but the notebook header only mentions
    # Random, DQN and MADDPG agents — confirm a PPO agent actually exists.
    algorithms: Tuple[str, ...] = ("random", "dqn", "ppo", "maddpg")

# Progress marker: reached only when every statement above in this cell ran.
print("✅ Section completed: ⚙️ Step 3: 設定クラス")

## 🎯 Step 4: 物理エンジン

In [None]:
"""
Physics engine for soccer game
"""

import numpy as np
from typing import Dict, List, Tuple, Optional

class Ball:
    """Circular ball with a position and velocity advanced one tick at a time."""

    def __init__(self, x: float, y: float, radius: float = 10):
        self.radius = radius
        self.pos = np.array([x, y], dtype=float)
        self.vel = np.zeros(2, dtype=float)

    def update(self, config: SoccerEnvironmentConfig):
        """Advance the ball by one tick.

        The ball moves by its velocity, the velocity is damped by
        ``config.BALL_DECAY``, and the ball then bounces off the top/bottom
        walls and off the left/right walls outside the goal mouth. A bounce
        reverses the relevant velocity component and keeps 80% of it.

        Returns:
            ``"goal_left"`` / ``"goal_right"`` when the ball reaches the left /
            right edge inside the goal mouth (its x state is left untouched in
            that case), otherwise ``None``.
        """
        self.pos += self.vel
        self.vel *= config.BALL_DECAY

        width, height = config.FIELD_SIZE
        near_edge = self.radius          # smallest allowed centre coordinate
        far_y = height - self.radius     # largest allowed centre y
        far_x = width - self.radius      # largest allowed centre x

        # Top / bottom walls.
        hit_top = self.pos[1] <= near_edge
        hit_bottom = self.pos[1] >= far_y
        if hit_top or hit_bottom:
            self.vel[1] *= -0.8
            self.pos[1] = max(near_edge, min(far_y, self.pos[1]))

        # The goal mouth is centred vertically on both side walls.
        mouth_top = (height - config.GOAL_SIZE[1]) // 2
        mouth_bottom = mouth_top + config.GOAL_SIZE[1]

        if self.pos[0] <= near_edge:
            # Left wall.
            if mouth_top <= self.pos[1] <= mouth_bottom:
                return "goal_left"
            self.vel[0] *= -0.8
            self.pos[0] = self.radius
        elif self.pos[0] >= far_x:
            # Right wall.
            if mouth_top <= self.pos[1] <= mouth_bottom:
                return "goal_right"
            self.vel[0] *= -0.8
            self.pos[0] = width - self.radius

        return None

class Player:
    """A circular player belonging to a team, moved directly by its action."""

    def __init__(self, x: float, y: float, team: int, player_id: int, radius: float = 20):
        self.team = team
        self.player_id = player_id
        self.radius = radius
        self.has_ball = False
        self.pos = np.array([x, y], dtype=float)
        self.vel = np.zeros(2, dtype=float)

    def update(self, action: np.ndarray, config: SoccerEnvironmentConfig):
        """Move the player for one tick and report its kick request.

        ``action`` is read as ``[move_x, move_y, kick_power, kick_dir_x,
        kick_dir_y]``; the three kick entries are optional and default to 0.0
        when the action is shorter.

        The velocity is replaced (not accumulated) by the movement scaled with
        ``config.PLAYER_SPEED``, applied to the position, and then damped by
        ``config.FRICTION``. The position is finally clamped so the whole
        circle stays inside the field.

        Returns:
            ``(kick_power, kick_direction)`` where ``kick_direction`` is a
            2-element array (not normalized here).
        """
        def entry(index):
            # Optional action component: 0.0 when the action is too short.
            return action[index] if len(action) > index else 0.0

        self.vel = np.array([action[0], action[1]]) * config.PLAYER_SPEED
        self.pos += self.vel
        self.vel *= config.FRICTION

        field_width, field_height = config.FIELD_SIZE
        for axis, extent in enumerate((field_width, field_height)):
            lowest, highest = self.radius, extent - self.radius
            self.pos[axis] = max(lowest, min(highest, self.pos[axis]))

        return entry(2), np.array([entry(3), entry(4)])

class PhysicsEngine:
    """Owns the ball and the four players and advances them one tick at a time.

    Players are stored in a fixed order: the two team-0 (left, blue) players
    first, then the two team-1 (right, red) players. ``step`` looks up each
    player's action under the key ``"player_<index>"`` using that order.
    """

    # Kick-off layout, one row per player in storage order:
    # (x as a fraction of field width, y as a fraction of field height,
    #  team, player_id within the team).
    _START_LAYOUT = (
        (0.2, 0.3, 0, 0),
        (0.2, 0.7, 0, 1),
        (0.8, 0.3, 1, 0),
        (0.8, 0.7, 1, 1),
    )

    def __init__(self, config: SoccerEnvironmentConfig):
        self.config = config
        # The ball starts on the centre spot.
        self.ball = Ball(
            config.FIELD_SIZE[0] // 2,
            config.FIELD_SIZE[1] // 2,
            config.BALL_RADIUS
        )

        # Initialize players
        self.players = []
        self._init_players()

    def _start_positions(self) -> List[Tuple[float, float]]:
        """Return the kick-off (x, y) of every player, in storage order."""
        field_width, field_height = self.config.FIELD_SIZE
        return [
            (field_width * frac_x, field_height * frac_y)
            for frac_x, frac_y, _team, _player_id in self._START_LAYOUT
        ]

    def _init_players(self):
        """Create the players at their kick-off positions.

        The radius comes from ``config.PLAYER_RADIUS`` so the player size
        follows the configuration, just as the ball follows
        ``config.BALL_RADIUS``.
        """
        radius = self.config.PLAYER_RADIUS
        for (x, y), (_fx, _fy, team, player_id) in zip(
            self._start_positions(), self._START_LAYOUT
        ):
            self.players.append(Player(x, y, team, player_id, radius))

    def reset(self):
        """Put the ball back on the centre spot and the players at kick-off."""
        self.ball.pos = np.array([
            self.config.FIELD_SIZE[0] // 2,
            self.config.FIELD_SIZE[1] // 2
        ], dtype=float)
        self.ball.vel = np.array([0.0, 0.0], dtype=float)

        for player, (x, y) in zip(self.players, self._start_positions()):
            player.pos = np.array([x, y], dtype=float)
            player.vel = np.array([0.0, 0.0], dtype=float)
            player.has_ball = False

    def step(self, actions: Dict[str, np.ndarray]) -> Tuple[Optional[str], Optional[int]]:
        """Advance the simulation by one tick.

        Args:
            actions: maps ``"player_<index>"`` to that player's action vector.
                Players without an entry are not moved this tick.

        Returns:
            ``(goal_result, ball_touched_by)``: ``goal_result`` is whatever the
            ball update reports (``"goal_left"``, ``"goal_right"`` or
            ``None``); ``ball_touched_by`` is the index of the player in
            contact with the ball, or ``None``. If several players touch the
            ball in the same tick, the highest index is reported.
        """
        # Move the players and collect the kicks they request.
        kicks = {}
        for i, player in enumerate(self.players):
            agent_key = f"player_{i}"
            if agent_key in actions:
                kick_power, kick_dir = player.update(actions[agent_key], self.config)
                if kick_power > 0:
                    kicks[i] = (kick_power, kick_dir)

        # A kick only affects the ball while the player is in contact with it.
        ball_touched_by = None
        for i, player in enumerate(self.players):
            dist = np.linalg.norm(player.pos - self.ball.pos)
            if dist <= player.radius + self.ball.radius:
                ball_touched_by = i
                player.has_ball = True

                if i in kicks:
                    kick_power, kick_dir = kicks[i]
                    # Normalize; the epsilon keeps a zero direction from dividing by 0.
                    kick_dir = kick_dir / (np.linalg.norm(kick_dir) + 1e-8)
                    self.ball.vel += kick_dir * kick_power * self.config.BALL_SPEED_MULTIPLIER
            else:
                player.has_ball = False

        goal_result = self.ball.update(self.config)

        self._handle_player_collisions()

        return goal_result, ball_touched_by

    def _handle_player_collisions(self):
        """Push overlapping players apart, each by half of the overlap."""
        for i, p1 in enumerate(self.players):
            for p2 in self.players[i + 1:]:
                offset = p1.pos - p2.pos
                dist = np.linalg.norm(offset)
                min_dist = p1.radius + p2.radius

                if dist < min_dist:
                    # The epsilon avoids dividing by zero for coincident players
                    # (they then receive no separation this tick).
                    direction = offset / (dist + 1e-8)
                    overlap = min_dist - dist

                    p1.pos += direction * overlap * 0.5
                    p2.pos -= direction * overlap * 0.5

    def get_state(self) -> Dict:
        """Return a snapshot of the ball and all players.

        Position and velocity arrays are copies, so callers can keep the
        snapshot without it changing on later steps.
        """
        return {
            'ball': {
                'pos': self.ball.pos.copy(),
                'vel': self.ball.vel.copy()
            },
            'players': [
                {
                    'pos': player.pos.copy(),
                    'vel': player.vel.copy(),
                    'team': player.team,
                    'player_id': player.player_id,
                    'has_ball': player.has_ball
                }
                for player in self.players
            ]
        }

# Progress marker: reached only when every statement above in this cell ran.
print("✅ Section completed: 🎯 Step 4: 物理エンジン")

## 🎮 Step 5: 観測・行動空間

In [None]:
"""
Observation and action space definitions for soccer environment
"""

import numpy as np
import gymnasium as gym
from gymnasium import spaces
from typing import Dict, List, Tuple, Union

class ObservationSpace:
    """Per-agent observation: a flat float32 vector of 28 values.

    Layout (4 values per group):
        self, ball, teammate, opponent 1, opponent 2
            -> normalized x, normalized y, velocity x, velocity y
        goal information
            -> own-goal distance, enemy-goal distance, own-goal angle, enemy-goal angle
        context
            -> ball possession, time remaining, score difference, last touch
    """

    def __init__(self, config: SoccerEnvironmentConfig):
        self.config = config

        width, height = config.FIELD_SIZE
        speed_bound = config.PLAYER_SPEED * 2        # bound used for every velocity entry
        distance_bound = np.sqrt(width**2 + height**2)  # field diagonal

        # One moving entity: position in [0, 1], velocity in [-bound, bound].
        entity_low = [0, 0, -speed_bound, -speed_bound]
        entity_high = [1, 1, speed_bound, speed_bound]
        entity_count = 5  # self, ball, teammate, opponent 1, opponent 2

        # Goal information: two distances followed by two angles.
        goal_low = [0, 0, -np.pi, -np.pi]
        goal_high = [distance_bound, distance_bound, np.pi, np.pi]

        # Context: possession (-1 none, 0-3 player id), time remaining,
        # score difference, last-touch player id.
        context_low = [-1, 0, -10, -1]
        context_high = [3, 1, 10, 3]

        obs_low = np.array(
            entity_low * entity_count + goal_low + context_low, dtype=np.float32
        )
        obs_high = np.array(
            entity_high * entity_count + goal_high + context_high, dtype=np.float32
        )

        self.gym_space = spaces.Box(low=obs_low, high=obs_high, dtype=np.float32)

    def create_observation(self, agent_id: int, state: Dict,
                         scores: Tuple[int, int], step: int,
                         max_steps: int, ball_possession: int = -1,
                         last_touch: int = -1) -> np.ndarray:
        """Build the 28-value observation seen by ``agent_id``.

        Positions are divided by the field size and velocities by
        ``config.PLAYER_SPEED``. Goal distances and angles are measured in
        those normalized coordinates, with the goals at x = 0 and x = 1 and
        y = 0.5. When the agent has no teammate or fewer than two opponents,
        the agent's own state fills the empty slot.

        Args:
            agent_id: index of the observing player in ``state['players']``.
            state: dict with ``'ball'`` and ``'players'`` entries, each holding
                ``'pos'`` / ``'vel'`` arrays (players also ``'team'``).
            scores: goals per team, indexed by team.
            step: current step; ``max_steps`` is the step limit used to
                compute the remaining-time fraction.
            ball_possession: value copied into the possession slot.
            last_touch: value copied into the last-touch slot.

        Returns:
            ``np.float32`` array of length 28.
        """
        field_width, field_height = self.config.FIELD_SIZE
        field_extent = np.array([field_width, field_height])
        speed_unit = self.config.PLAYER_SPEED

        def normalized(entity):
            # (position as a fraction of the field, velocity in PLAYER_SPEED units)
            return entity['pos'] / field_extent, entity['vel'] / speed_unit

        roster = state['players']
        me = roster[agent_id]
        my_team = me['team']

        my_pos, my_vel = normalized(me)
        ball_pos, ball_vel = normalized(state['ball'])

        # First other player on my team, or myself when there is none.
        mate = next(
            (p for idx, p in enumerate(roster)
             if p['team'] == my_team and idx != agent_id),
            me,
        )
        mate_pos, mate_vel = normalized(mate)

        # First two players on the other team, padded with myself.
        rivals = [p for p in roster if p['team'] != my_team]
        rival_a, rival_b = (rivals + [me, me])[:2]
        rival_a_pos, rival_a_vel = normalized(rival_a)
        rival_b_pos, rival_b_vel = normalized(rival_b)

        # Team 0 defends the left goal (x = 0); any other team the right one.
        own_x = 0 if my_team == 0 else 1
        own_goal = np.array([own_x, 0.5])
        enemy_goal = np.array([1 - own_x, 0.5])

        own_goal_dist = np.linalg.norm(my_pos - own_goal)
        enemy_goal_dist = np.linalg.norm(my_pos - enemy_goal)

        to_own_goal = own_goal - my_pos
        to_enemy_goal = enemy_goal - my_pos
        own_goal_angle = np.arctan2(to_own_goal[1], to_own_goal[0])
        enemy_goal_angle = np.arctan2(to_enemy_goal[1], to_enemy_goal[0])

        time_remaining = (max_steps - step) / max_steps
        score_diff = scores[my_team] - scores[1 - my_team]

        pieces = [
            my_pos, my_vel,
            ball_pos, ball_vel,
            mate_pos, mate_vel,
            rival_a_pos, rival_a_vel,
            rival_b_pos, rival_b_vel,
            [own_goal_dist, enemy_goal_dist, own_goal_angle, enemy_goal_angle],
            [ball_possession, time_remaining, score_diff, last_touch],
        ]
        return np.concatenate(pieces).astype(np.float32)

class ActionSpace:
    """Per-agent action space, either continuous (5 values) or discrete (9 ids).

    The continuous action is ``[move_x, move_y, kick_power, kick_dir_x,
    kick_dir_y]``. The discrete ids are listed in ``action_meanings``.
    """

    def __init__(self, action_type: str = "continuous"):
        self.action_type = action_type

        if action_type == "continuous":
            # Movement and kick direction lie in [-1, 1], kick power in [0, 1].
            lower = np.array([-1, -1, 0, -1, -1], dtype=np.float32)
            upper = np.array([1, 1, 1, 1, 1], dtype=np.float32)
            space = spaces.Box(low=lower, high=upper, dtype=np.float32)
        else:
            # Any other action_type selects the 9-way discrete space.
            space = spaces.Discrete(9)
        self.gym_space = space

        # Names of the discrete ids; set for both action types.
        self.action_meanings = dict(enumerate((
            "NOOP",
            "UP",
            "DOWN",
            "LEFT",
            "RIGHT",
            "KICK_UP",
            "KICK_DOWN",
            "KICK_LEFT",
            "KICK_RIGHT",
        )))

    def sample(self) -> Union[np.ndarray, int]:
        """Draw a random action from the underlying gym space."""
        return self.gym_space.sample()

    def convert_discrete_to_continuous(self, action: int) -> np.ndarray:
        """Translate a discrete action id into the 5-value continuous format.

        Ids 1-4 move one unit UP / DOWN / LEFT / RIGHT (UP is negative y).
        Ids 5-8 kick with power 0.5 in the same four directions without
        moving. Id 0 and any unknown id map to the all-zero NOOP vector.

        Returns:
            A new ``np.float32`` array of length 5.
        """
        headings = ((0, -1), (0, 1), (-1, 0), (1, 0))  # UP, DOWN, LEFT, RIGHT

        table = {0: (0, 0, 0, 0, 0)}
        for offset, (dx, dy) in enumerate(headings):
            table[1 + offset] = (dx, dy, 0, 0, 0)      # move only
            table[5 + offset] = (0, 0, 0.5, dx, dy)    # kick only

        chosen = table.get(action, table[0])
        return np.array(chosen).astype(np.float32)

def create_spaces(config: SoccerEnvironmentConfig, action_type: str = "continuous"):
    """Build the observation/action space pair for one agent.

    Returns:
        ``(ObservationSpace(config), ActionSpace(action_type))``.
    """
    return ObservationSpace(config), ActionSpace(action_type)

# Progress marker: reached only when every statement above in this cell ran.
print("✅ Section completed: 🎮 Step 5: 観測・行動空間")

## 🏆 Step 6: 報酬システム

In [None]:
"""
Reward system for soccer environment with multi-objective reward function
"""

import numpy as np
from typing import Dict, List, Tuple, Optional

class RewardCalculator:
    """Multi-objective reward function for soccer agents.

    The per-step reward is a weighted sum of event terms (goals, ball touch,
    out of bounds, stalemate) and dense terms (approaching the enemy goal and
    the ball, teammate spacing, possession, defensive positioning). Positions
    are read in the same coordinates as ``config.FIELD_SIZE``.
    """

    def __init__(self, config: SoccerEnvironmentConfig):
        self.config = config
        self.field_width, self.field_height = config.FIELD_SIZE

        # Weight of every reward component. The approach, teamwork and
        # defensive weights multiply a raw score; the others are added as-is.
        self.reward_weights = {
            'goal_scored': 100.0,           # Goal scored by team
            'goal_conceded': -100.0,        # Goal conceded by team
            'ball_touch': 5.0,              # Touching the ball
            'goal_approach': 0.1,           # Moving closer to enemy goal
            'ball_approach': 0.05,          # Moving closer to ball
            'teamwork': 0.02,               # Team coordination
            'out_of_bounds': -10.0,         # Going out of bounds
            'stalemate': -0.1,              # Stalemate penalty
            'ball_possession': 0.01,        # Keeping ball possession
            'defensive_positioning': 0.01,   # Good defensive position
        }

        # Kept for delta calculations; not read by the methods of this class.
        self.prev_states = {}

    def calculate_reward(self, agent_id: int, action: np.ndarray,
                        prev_state: Dict, current_state: Dict,
                        goal_scored: Optional[str] = None,
                        ball_touched_by: Optional[int] = None,
                        scores: Tuple[int, int] = (0, 0),
                        out_of_bounds_agents: Optional[List[int]] = None) -> float:
        """Calculate the total reward of one agent for one transition.

        Args:
            agent_id: index of the agent in ``state['players']``.
            action: the action taken (accepted for interface symmetry; not
                used in the computation).
            prev_state: state snapshot before the step.
            current_state: state snapshot after the step.
            goal_scored: ``"goal_left"`` (ball entered the left goal, so team 1
                scored), ``"goal_right"`` (team 0 scored) or ``None``.
            ball_touched_by: index of the player touching the ball, or ``None``.
            scores: current score (not used in the computation).
            out_of_bounds_agents: indices of agents to penalize for leaving the
                field; ``None`` means nobody.

        Returns:
            The weighted sum of all reward components.
        """
        reward = 0.0
        agent_team = current_state['players'][agent_id]['team']
        out_of_bounds_agents = out_of_bounds_agents or []

        # 1. Goal rewards (most important). The left goal belongs to team 0,
        #    so a ball in the left goal is a goal for team 1 and vice versa.
        if goal_scored == "goal_left":
            scoring_team = 1
        elif goal_scored == "goal_right":
            scoring_team = 0
        else:
            scoring_team = None

        if scoring_team is not None:
            if agent_team == scoring_team:
                reward += self.reward_weights['goal_scored']
            elif agent_team == 1 - scoring_team:
                reward += self.reward_weights['goal_conceded']

        # 2. Ball contact reward
        if ball_touched_by == agent_id:
            reward += self.reward_weights['ball_touch']

        # 3. Goal approach reward
        goal_approach_reward = self.calculate_goal_approach_reward(
            agent_id, prev_state, current_state
        )
        reward += goal_approach_reward * self.reward_weights['goal_approach']

        # 4. Ball approach reward
        ball_approach_reward = self.calculate_ball_approach_reward(
            agent_id, prev_state, current_state
        )
        reward += ball_approach_reward * self.reward_weights['ball_approach']

        # 5. Teamwork reward
        teamwork_reward = self.calculate_teamwork_reward(agent_id, current_state)
        reward += teamwork_reward * self.reward_weights['teamwork']

        # 6. Penalties
        if agent_id in out_of_bounds_agents:
            reward += self.reward_weights['out_of_bounds']

        if self.is_stalemate(current_state):
            reward += self.reward_weights['stalemate']

        # 7. Ball possession reward
        if current_state['players'][agent_id]['has_ball']:
            reward += self.reward_weights['ball_possession']

        # 8. Defensive positioning reward
        defensive_reward = self.calculate_defensive_positioning_reward(
            agent_id, current_state
        )
        reward += defensive_reward * self.reward_weights['defensive_positioning']

        return reward

    def calculate_goal_approach_reward(self, agent_id: int, prev_state: Dict,
                                     current_state: Dict) -> float:
        """Return how much closer the agent got to the enemy goal this step.

        Positive when the distance to the centre of the enemy goal line
        shrank, negative when it grew.
        """
        agent_team = current_state['players'][agent_id]['team']
        current_pos = current_state['players'][agent_id]['pos']
        prev_pos = prev_state['players'][agent_id]['pos']

        # Team 0 attacks the right goal, every other team the left goal.
        if agent_team == 0:  # Blue team
            enemy_goal_pos = np.array([self.field_width, self.field_height / 2])
        else:  # Red team
            enemy_goal_pos = np.array([0, self.field_height / 2])

        prev_dist = np.linalg.norm(prev_pos - enemy_goal_pos)
        current_dist = np.linalg.norm(current_pos - enemy_goal_pos)

        return prev_dist - current_dist  # Positive if getting closer

    def calculate_ball_approach_reward(self, agent_id: int, prev_state: Dict,
                                     current_state: Dict) -> float:
        """Return how much closer the agent got to the ball this step.

        Both distances are measured to the ball's *current* position, so only
        the agent's own movement contributes, not the ball's.
        """
        current_pos = current_state['players'][agent_id]['pos']
        prev_pos = prev_state['players'][agent_id]['pos']
        ball_pos = current_state['ball']['pos']

        prev_dist = np.linalg.norm(prev_pos - ball_pos)
        current_dist = np.linalg.norm(current_pos - ball_pos)

        return prev_dist - current_dist  # Positive if getting closer

    def calculate_teamwork_reward(self, agent_id: int, current_state: Dict) -> float:
        """Score the spacing between the agent and its first teammate.

        Returns 1.0 at the optimal distance of 150 and decreases linearly with
        the deviation (it becomes negative beyond a deviation of 150). Returns
        0.0 when the agent has no teammate.
        """
        agent_team = current_state['players'][agent_id]['team']
        players = current_state['players']

        # Find teammate
        teammates = [p for i, p in enumerate(players)
                    if p['team'] == agent_team and i != agent_id]

        if not teammates:
            return 0.0

        teammate = teammates[0]
        agent_pos = current_state['players'][agent_id]['pos']
        teammate_pos = teammate['pos']

        teammate_dist = np.linalg.norm(agent_pos - teammate_pos)
        optimal_dist = 150
        dist_penalty = abs(teammate_dist - optimal_dist) / optimal_dist

        return 1.0 - dist_penalty  # Higher reward for optimal distance

    def calculate_defensive_positioning_reward(self, agent_id: int,
                                             current_state: Dict) -> float:
        """Return 1.0 when the agent stands between its own goal and the ball.

        "Between" means the agent's projection onto the goal-to-ball line lies
        strictly inside the segment; the sideways offset from the line is not
        considered. Returns 0.0 otherwise.
        """
        agent_team = current_state['players'][agent_id]['team']
        agent_pos = current_state['players'][agent_id]['pos']
        ball_pos = current_state['ball']['pos']

        # Own goal position
        if agent_team == 0:  # Blue team
            own_goal_pos = np.array([0, self.field_height / 2])
        else:  # Red team
            own_goal_pos = np.array([self.field_width, self.field_height / 2])

        goal_to_ball = ball_pos - own_goal_pos
        goal_to_agent = agent_pos - own_goal_pos

        ball_dist = np.linalg.norm(goal_to_ball)
        if ball_dist > 0:
            # Scalar projection of the agent onto the goal-to-ball direction.
            projection = np.dot(goal_to_agent, goal_to_ball) / ball_dist

            if 0 < projection < ball_dist:
                return 1.0

        return 0.0

    def is_stalemate(self, state: Dict, threshold: float = 1.0) -> bool:
        """Return True when both the ball speed and the mean player speed are below ``threshold``."""
        ball_speed = np.linalg.norm(state['ball']['vel'])
        player_speeds = [np.linalg.norm(p['vel']) for p in state['players']]
        avg_player_speed = np.mean(player_speeds)

        return ball_speed < threshold and avg_player_speed < threshold

    def get_team_reward(self, team: int, individual_rewards: Dict[int, float]) -> float:
        """Return the mean individual reward of the players on ``team``.

        Agent indices are grouped by team in blocks of
        ``config.NUM_PLAYERS_PER_TEAM`` (2 when the config does not define it),
        i.e. agent ``i`` belongs to team ``i // team_size``.

        Returns 0.0 when ``individual_rewards`` holds no entry for the team,
        instead of dividing by zero.
        """
        team_size = getattr(self.config, "NUM_PLAYERS_PER_TEAM", 2)
        team_rewards = [
            value for agent_index, value in individual_rewards.items()
            if agent_index // team_size == team
        ]
        if not team_rewards:
            return 0.0
        return sum(team_rewards) / len(team_rewards)

class RewardShaper:
    """Adds a potential-based shaping term on top of ``RewardCalculator``.

    The shaping term is ``gamma * potential(s') - potential(s)``.
    """

    def __init__(self, config: SoccerEnvironmentConfig, gamma: float = 0.99):
        """
        Args:
            config: environment configuration (``FIELD_SIZE`` is read).
            gamma: discount factor used in the shaping term. It defaults to
                0.99; pass the learner's own discount (``MADDPGConfig.gamma``
                defaults to 0.95) to keep the two consistent.
        """
        self.config = config
        self.gamma = gamma
        self.reward_calculator = RewardCalculator(config)

    def shaped_reward(self, agent_id: int, action: np.ndarray,
                     prev_state: Dict, current_state: Dict,
                     **kwargs) -> float:
        """Return the base reward plus the potential-based shaping term.

        ``kwargs`` are forwarded unchanged to
        ``RewardCalculator.calculate_reward``.
        """
        base_reward = self.reward_calculator.calculate_reward(
            agent_id, action, prev_state, current_state, **kwargs
        )

        potential_reward = self.calculate_potential_based_reward(
            agent_id, prev_state, current_state
        )

        return base_reward + potential_reward

    def calculate_potential_based_reward(self, agent_id: int,
                                       prev_state: Dict, current_state: Dict) -> float:
        """Return ``gamma * potential(current_state) - potential(prev_state)``."""
        prev_potential = self.calculate_potential(agent_id, prev_state)
        current_potential = self.calculate_potential(agent_id, current_state)

        # Potential-based shaping: F(s,a,s') = γΦ(s') - Φ(s)
        return self.gamma * current_potential - prev_potential

    def calculate_potential(self, agent_id: int, state: Dict) -> float:
        """Return the potential of ``state`` from the agent's point of view.

        The potential is ``-0.001 * (agent-to-ball distance) - 0.001 *
        (ball-to-enemy-goal distance)``, so it rises as the agent nears the
        ball and as the ball nears the enemy goal.
        """
        agent_team = state['players'][agent_id]['team']
        agent_pos = state['players'][agent_id]['pos']
        ball_pos = state['ball']['pos']

        # Team 0 attacks the right goal, every other team the left goal.
        if agent_team == 0:  # Blue team
            enemy_goal_pos = np.array([self.config.FIELD_SIZE[0], self.config.FIELD_SIZE[1] / 2])
        else:  # Red team
            enemy_goal_pos = np.array([0, self.config.FIELD_SIZE[1] / 2])

        ball_dist = np.linalg.norm(agent_pos - ball_pos)
        goal_dist = np.linalg.norm(ball_pos - enemy_goal_pos)

        # Potential decreases with distance (encouraging approach)
        potential = -0.001 * ball_dist - 0.001 * goal_dist

        return potential

# Progress marker: reached only when every statement above in this cell ran.
print("✅ Section completed: 🏆 Step 6: 報酬システム")

## 🎨 Step 7: レンダラー

In [None]:
"""
Renderer for soccer game visualization using pygame
"""

import pygame
import numpy as np
from typing import Dict, List, Tuple, Optional

class SoccerRenderer:
    """Pygame-based visualiser for the soccer field, players, ball and HUD.

    Creating an instance initialises pygame and opens a display window.
    All drawing is done in field coordinates taken from ``config``.
    """

    def __init__(self, config: SoccerEnvironmentConfig, window_size: Tuple[int, int] = None,
                 fps: int = 60):
        """Initialise pygame, open the window and prepare drawing resources.

        Args:
            config: Environment configuration. ``FIELD_SIZE``, ``GOAL_SIZE``,
                ``PLAYER_RADIUS`` and ``BALL_RADIUS`` are read from it.
            window_size: Window size in pixels; falls back to
                ``config.FIELD_SIZE`` when omitted.
            fps: Frame-rate cap handed to ``Clock.tick`` on every ``render``
                call. Defaults to 60, the value that used to be hard-coded.
        """
        self.config = config
        self.window_size = window_size or config.FIELD_SIZE
        self.fps = fps

        pygame.init()
        self.screen = pygame.display.set_mode(self.window_size)
        pygame.display.set_caption("Multi-Agent Soccer Game")

        # Colors
        self.colors = {
            'field': (0, 128, 0),        # Green
            'field_lines': (255, 255, 255),  # White
            'ball': (255, 255, 255),     # White
            'team_0': (0, 0, 255),       # Blue
            'team_1': (255, 0, 0),       # Red
            'goal': (128, 128, 128),     # Gray
            'background': (0, 64, 0),    # Dark green
        }

        self.font = pygame.font.Font(None, 36)
        self.clock = pygame.time.Clock()

    def render(self, state: Dict, scores: Tuple[int, int] = (0, 0), step: int = 0) -> bool:
        """Draw one frame for ``state`` and present it.

        Args:
            state: State dict with a ``'players'`` list and a ``'ball'`` dict.
            scores: ``(blue, red)`` score pair shown in the HUD.
            step: Step counter shown in the HUD.

        Returns:
            True if rendering should continue, False if the window was closed
            (in which case nothing is drawn for this call).
        """
        # Handle pygame events
        for event in pygame.event.get():
            if event.type == pygame.QUIT:
                return False

        self.screen.fill(self.colors['background'])

        # Draw back-to-front: field, players, ball, then the HUD on top.
        self._draw_field()
        self._draw_players(state['players'])
        self._draw_ball(state['ball'])
        self._draw_ui(scores, step)

        pygame.display.flip()
        self.clock.tick(self.fps)

        return True

    def _draw_field(self):
        """Draw soccer field with goals, center line/circle and penalty boxes."""
        field_width, field_height = self.config.FIELD_SIZE
        line_color = self.colors['field_lines']

        # Field background and border
        field_rect = pygame.Rect(0, 0, field_width, field_height)
        pygame.draw.rect(self.screen, self.colors['field'], field_rect)
        pygame.draw.rect(self.screen, line_color, field_rect, 3)

        # Center line
        center_x = field_width // 2
        pygame.draw.line(self.screen, line_color,
                        (center_x, 0), (center_x, field_height), 3)

        # Center circle
        pygame.draw.circle(self.screen, line_color,
                          (center_x, field_height // 2), 100, 3)

        # Goals: vertically centred, straddling the left/right field edges.
        goal_width, goal_height = self.config.GOAL_SIZE
        goal_top = (field_height - goal_height) // 2

        goal_rects = (
            pygame.Rect(-goal_width//2, goal_top, goal_width, goal_height),               # Left goal
            pygame.Rect(field_width - goal_width//2, goal_top, goal_width, goal_height),  # Right goal
        )
        for goal_rect in goal_rects:
            pygame.draw.rect(self.screen, self.colors['goal'], goal_rect)
            pygame.draw.rect(self.screen, line_color, goal_rect, 3)

        # Goal areas (penalty boxes)
        penalty_width, penalty_height = 120, 200
        penalty_top = (field_height - penalty_height) // 2

        penalty_rects = (
            pygame.Rect(0, penalty_top, penalty_width, penalty_height),
            pygame.Rect(field_width - penalty_width, penalty_top,
                        penalty_width, penalty_height),
        )
        for penalty_rect in penalty_rects:
            pygame.draw.rect(self.screen, line_color, penalty_rect, 2)

    def _draw_players(self, players: List[Dict]):
        """Draw every player as a team-coloured disc with its id on top.

        The player flagged ``has_ball`` gets a thicker white outline; all
        others get a thin black one.
        """
        for player in players:
            # pygame wants integer pixel coordinates; convert once per player.
            center = player['pos'].astype(int)
            has_ball = player['has_ball']
            color = self.colors[f"team_{player['team']}"]

            # Draw player circle
            pygame.draw.circle(self.screen, color, center, self.config.PLAYER_RADIUS)

            # Draw player outline
            outline_color = (255, 255, 255) if has_ball else (0, 0, 0)
            outline_width = 4 if has_ball else 2
            pygame.draw.circle(self.screen, outline_color, center,
                             self.config.PLAYER_RADIUS, outline_width)

            # Draw player number
            player_text = self.font.render(str(player['player_id']), True, (255, 255, 255))
            text_rect = player_text.get_rect(center=center)
            self.screen.blit(player_text, text_rect)

    def _draw_ball(self, ball: Dict):
        """Draw the ball and, when it is moving, a line along its velocity."""
        pos = ball['pos']
        center = pos.astype(int)
        pygame.draw.circle(self.screen, self.colors['ball'], center, self.config.BALL_RADIUS)
        pygame.draw.circle(self.screen, (0, 0, 0), center, self.config.BALL_RADIUS, 2)

        # Draw ball velocity vector (for debugging)
        vel = ball['vel']
        if np.linalg.norm(vel) > 0.1:
            end_pos = pos + vel * 10  # Scale for visibility
            pygame.draw.line(self.screen, (255, 255, 0), center, end_pos.astype(int), 2)

    def _draw_ui(self, scores: Tuple[int, int], step: int):
        """Draw game UI: scores at the top-left, step counter at the top-right."""
        # Score display
        score_text = f"Blue: {scores[0]}  Red: {scores[1]}"
        score_surface = self.font.render(score_text, True, (255, 255, 255))
        self.screen.blit(score_surface, (10, 10))

        # Step counter, right-aligned against the window edge
        step_text = f"Step: {step}"
        step_surface = self.font.render(step_text, True, (255, 255, 255))
        step_rect = step_surface.get_rect()
        step_rect.topright = (self.window_size[0] - 10, 10)
        self.screen.blit(step_surface, step_rect)

    def close(self):
        """Close the renderer by shutting pygame down."""
        pygame.quit()

    def save_frame(self, filename: str):
        """Save the current contents of the screen surface to ``filename``."""
        pygame.image.save(self.screen, filename)

class VideoRecorder:
    """Accumulate rendered frames in memory and write them out as a video."""

    def __init__(self, filename: str, fps: int = 30):
        """
        Args:
            filename: Output path handed to the video writer.
            fps: Playback frame rate of the written video.
        """
        self.filename = filename
        self.fps = fps
        self.frames = []

    def add_frame(self, surface):
        """Capture ``surface`` (a pygame surface) as an RGB array and store it."""
        frame_array = pygame.surfarray.array3d(surface)
        # Swap the first two axes so frames are stored as (height, width, 3),
        # which is the layout ``save_video`` unpacks.
        frame_array = np.transpose(frame_array, (1, 0, 2))
        self.frames.append(frame_array)

    def save_video(self):
        """Write the recorded frames to ``self.filename`` (requires opencv).

        Does nothing when no frame has been recorded. Prints a message instead
        of raising when OpenCV is missing or the writer cannot be opened.
        """
        if not self.frames:
            return

        # Keep the try body down to the one statement that can legitimately
        # fail with ImportError.
        try:
            import cv2
        except ImportError:
            print("OpenCV not available. Cannot save video.")
            return

        height, width, _ = self.frames[0].shape
        fourcc = cv2.VideoWriter_fourcc(*'mp4v')
        out = cv2.VideoWriter(self.filename, fourcc, self.fps, (width, height))
        try:
            # A writer that failed to open silently drops every frame, so
            # report it rather than claiming the video was saved.
            if not out.isOpened():
                print(f"Could not open video writer for {self.filename}")
                return

            for frame in self.frames:
                frame_bgr = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)
                out.write(frame_bgr)
        finally:
            # Always release the writer, even if a frame fails to encode.
            out.release()

        print(f"Video saved as {self.filename}")

    def clear(self):
        """Drop all recorded frames."""
        self.frames = []

# Notebook progress marker: reached only after every statement above it in this cell has run.
print("✅ Section completed: 🎨 Step 7: レンダラー")

## 🌍 Step 8: メイン環境

In [None]:
"""
Main Soccer Environment - PettingZoo compatible multi-agent environment
"""

import numpy as np
import gymnasium as gym
from typing import Dict, List, Tuple, Optional, Any
from pettingzoo import AECEnv
from pettingzoo.utils import AgentSelector, wrappers


class SoccerEnvironment(AECEnv):
    """
    PettingZoo-compatible soccer environment for multi-agent reinforcement learning.

    Four agents (``player_0`` .. ``player_3``) act in turn. Every ``step`` call
    hands the current agent's action to the physics engine, recomputes the
    per-step reward of *all* agents, and then passes the turn to the next agent.
    """

    metadata = {
        "render_modes": ["human", "rgb_array"],
        "name": "soccer_v1"
    }

    def __init__(self, config: SoccerEnvironmentConfig = None,
                 training_config: TrainingConfig = None,
                 render_mode: str = None,
                 action_type: str = "continuous"):
        """
        Initialize soccer environment

        Args:
            config: Environment configuration
            training_config: Training configuration
            render_mode: Rendering mode ("human", "rgb_array", or None)
            action_type: Action space type ("continuous" or "discrete")
        """
        super().__init__()

        self.config = config or SoccerEnvironmentConfig()
        self.training_config = training_config or TrainingConfig()
        self.render_mode = render_mode
        self.action_type = action_type

        # Initialize components
        self.physics = PhysicsEngine(self.config)
        self.reward_calculator = RewardCalculator(self.config)
        self.reward_shaper = RewardShaper(self.config)

        # Initialize spaces
        self.observation_space_handler = ObservationSpace(self.config)
        self.action_space_handler = ActionSpace(action_type)

        # Agent setup
        self.possible_agents = [f"player_{i}" for i in range(4)]
        self.agents = self.possible_agents[:]

        # Observation and action spaces (every agent shares the same space objects)
        self.observation_spaces = {
            agent: self.observation_space_handler.gym_space
            for agent in self.possible_agents
        }
        self.action_spaces = {
            agent: self.action_space_handler.gym_space
            for agent in self.possible_agents
        }

        # Agent selector for turn-based execution
        self._agent_selector = AgentSelector(self.agents)

        # Initialize renderer if needed ("rgb_array" creates it lazily in render())
        self.renderer = None
        if self.render_mode == "human":
            self.renderer = SoccerRenderer(self.config)

        # Game state
        self.reset()

    def observation_space(self, agent: str):
        """Return the observation space of ``agent``."""
        return self.observation_spaces[agent]

    def action_space(self, agent: str):
        """Return the action space of ``agent``."""
        return self.action_spaces[agent]

    def reset(self, seed: Optional[int] = None, options: Optional[Dict] = None):
        """Reset physics, turn order, scores and all per-agent bookkeeping.

        Args:
            seed: When given, seeds NumPy's global random generator.
            options: Accepted for API compatibility; not used.

        Returns:
            Dict mapping each agent name to its initial observation.
        """
        if seed is not None:
            np.random.seed(seed)

        # Reset physics
        self.physics.reset()

        # Reset agents
        self.agents = self.possible_agents[:]
        self._agent_selector = AgentSelector(self.agents)
        self.agent_selection = self._agent_selector.next()

        # Reset game state
        self.step_count = 0
        self.scores = [0, 0]  # [team_0, team_1]
        self.episode_terminated = False
        self.episode_truncated = False

        # Reset rewards and info
        self.rewards = {agent: 0.0 for agent in self.agents}
        # Read by the inherited ``last()``; without this dict that call fails.
        self._cumulative_rewards = {agent: 0.0 for agent in self.agents}
        self.terminations = {agent: False for agent in self.agents}
        self.truncations = {agent: False for agent in self.agents}
        self.infos = {agent: {} for agent in self.agents}

        # Store previous state for reward calculation
        self.prev_state = None
        self.ball_possession = -1  # -1: no possession, 0-3: player id
        self.last_touch = -1

        return self._get_observations()

    def step(self, action: Any):
        """Apply ``action`` for the currently selected agent and advance the turn.

        Once the episode has terminated or been truncated the call is routed to
        ``_was_dead_step`` and the environment is left untouched.

        Args:
            action: Action of ``self.agent_selection``. Integer actions are
                converted to continuous ones when ``action_type`` is "discrete".
        """
        if self.episode_terminated or self.episode_truncated:
            return self._was_dead_step(action)

        acting_agent = self.agent_selection
        agent_id = int(acting_agent.split('_')[1])

        # The acting agent starts a fresh accumulation window on its own turn;
        # rewards it earns until its next turn are summed below.
        self._cumulative_rewards[acting_agent] = 0.0

        # Store previous state (only needed on the very first step of an episode)
        if self.prev_state is None:
            self.prev_state = self.physics.get_state()

        # Convert discrete action to continuous if needed
        if self.action_type == "discrete" and isinstance(action, (int, np.integer)):
            action = self.action_space_handler.convert_discrete_to_continuous(action)

        # Execute action in physics
        actions = {acting_agent: action}
        goal_result, ball_touched_by = self.physics.step(actions)

        # Update ball possession and last touch
        if ball_touched_by is not None:
            self.ball_possession = ball_touched_by
            self.last_touch = ball_touched_by

        # Handle goal scoring
        if goal_result == "goal_left":
            self.scores[1] += 1  # Red team scored
        elif goal_result == "goal_right":
            self.scores[0] += 1  # Blue team scored

        # Get current state
        current_state = self.physics.get_state()

        # Calculate per-step rewards, then fold them into the running totals
        self._calculate_rewards(agent_id, action, self.prev_state, current_state,
                              goal_result, ball_touched_by)
        for agent, reward in self.rewards.items():
            self._cumulative_rewards[agent] += reward

        # Update step count
        self.step_count += 1

        # Check termination conditions
        self._check_termination()

        # Move to next agent
        self.agent_selection = self._agent_selector.next()

        # Update previous state
        self.prev_state = current_state

    def _calculate_rewards(self, agent_id: int, action: np.ndarray,
                          prev_state: Dict, current_state: Dict,
                          goal_result: Optional[str], ball_touched_by: Optional[int]):
        """Recompute ``self.rewards`` for all agents for the step just taken.

        Only the acting agent (``agent_id``) is credited with ``action``; every
        other agent is evaluated with an all-zero 5-element action.
        """
        # Reset rewards for this step
        self.rewards = {agent: 0.0 for agent in self.agents}

        # Calculate rewards for each agent
        for i, agent in enumerate(self.agents):
            reward = self.reward_shaper.shaped_reward(
                i, action if i == agent_id else np.zeros(5),
                prev_state, current_state,
                goal_scored=goal_result,
                ball_touched_by=ball_touched_by,
                scores=tuple(self.scores)
            )
            self.rewards[agent] = reward

    def _check_termination(self):
        """Update the episode-level and per-agent termination/truncation flags."""
        # Game ends if max steps reached
        if self.step_count >= self.config.MAX_STEPS:
            self.episode_truncated = True

        # Game ends if goal difference is too large (optional)
        goal_diff = abs(self.scores[0] - self.scores[1])
        if goal_diff >= 5:  # End early if one team is dominating
            self.episode_terminated = True

        # Update termination/truncation for all agents
        if self.episode_terminated or self.episode_truncated:
            for agent in self.agents:
                self.terminations[agent] = self.episode_terminated
                self.truncations[agent] = self.episode_truncated

    def _get_observations(self) -> Dict[str, np.ndarray]:
        """Get observations for all agents, keyed by agent name."""
        current_state = self.physics.get_state()
        observations = {}

        for i, agent in enumerate(self.agents):
            obs = self.observation_space_handler.create_observation(
                i, current_state, tuple(self.scores), self.step_count,
                self.config.MAX_STEPS, self.ball_possession, self.last_touch
            )
            observations[agent] = obs

        return observations

    def observe(self, agent: str) -> np.ndarray:
        """Get observation for specific agent (name of the form ``player_<id>``)."""
        agent_id = int(agent.split('_')[1])
        current_state = self.physics.get_state()

        return self.observation_space_handler.create_observation(
            agent_id, current_state, tuple(self.scores), self.step_count,
            self.config.MAX_STEPS, self.ball_possession, self.last_touch
        )

    def render(self):
        """Render the environment according to ``render_mode``.

        Returns:
            "human": the renderer's continue flag (False once the window was closed).
            "rgb_array": the current frame as a (height, width, 3) array.
            Otherwise: None.
        """
        if self.render_mode == "human" and self.renderer:
            current_state = self.physics.get_state()
            return self.renderer.render(current_state, tuple(self.scores), self.step_count)
        elif self.render_mode == "rgb_array":
            # Return RGB array for recording; the renderer is created on first use
            if not self.renderer:
                self.renderer = SoccerRenderer(self.config)
            current_state = self.physics.get_state()
            self.renderer.render(current_state, tuple(self.scores), self.step_count)
            # Convert pygame surface to numpy array
            import pygame
            rgb_array = pygame.surfarray.array3d(self.renderer.screen)
            return np.transpose(rgb_array, (1, 0, 2))

    def close(self):
        """Clean up resources; safe to call more than once."""
        if self.renderer:
            self.renderer.close()
            # Forget the closed renderer so later calls do not reuse it.
            self.renderer = None

    def state(self) -> np.ndarray:
        """Get global state (concatenated observations of all agents, in agent order)."""
        observations = self._get_observations()
        return np.concatenate([observations[agent] for agent in self.agents])

    def _was_dead_step(self, action):
        """Ignore actions submitted after the episode has ended.

        Deliberately a no-op override of the base-class hook: ``step`` routes
        here once the episode is over, and the environment state (including
        ``self.agents``) is left exactly as it was.
        """
        pass

# Wrapper functions for easier usage

def make_soccer_env(config: SoccerEnvironmentConfig = None,
                   render_mode: str = None,
                   action_type: str = "continuous") -> SoccerEnvironment:
    """Build a ``SoccerEnvironment``, leaving its training config at the default.

    Args:
        config: Environment configuration, forwarded as-is (may be None).
        render_mode: Rendering mode forwarded to the environment.
        action_type: Action space type forwarded to the environment.

    Returns:
        The newly constructed environment.
    """
    env_kwargs = {"render_mode": render_mode, "action_type": action_type}
    return SoccerEnvironment(config, **env_kwargs)

def make_parallel_soccer_env(config: SoccerEnvironmentConfig = None,
                            render_mode: str = None,
                            action_type: str = "continuous"):
    """Create parallel version of soccer environment.

    Args:
        config: Environment configuration, forwarded to ``make_soccer_env``.
        render_mode: Rendering mode, forwarded to ``make_soccer_env``.
        action_type: Action space type, forwarded to ``make_soccer_env``.

    Returns:
        The AEC soccer environment wrapped behind PettingZoo's parallel API.
    """
    # ``make_soccer_env`` returns an AEC environment, so the conversion has to
    # go AEC -> parallel. The previous ``parallel_to_aec`` call converts in the
    # opposite direction and expects a parallel environment as input.
    # The turn-based converter is used because the environment's metadata does
    # not declare ``is_parallelizable``.
    from pettingzoo.utils.conversions import turn_based_aec_to_parallel
    env = make_soccer_env(config, render_mode, action_type)
    return turn_based_aec_to_parallel(env)

# Compatibility with stable-baselines3
class SB3SoccerEnv:
    """Stable-Baselines3 compatible wrapper.

    Presents the turn-based soccer environment through a single
    ``reset``/``step`` pair that handles all agents at once.
    """

    def __init__(self, config: SoccerEnvironmentConfig = None,
                 action_type: str = "continuous"):
        """
        Args:
            config: Environment configuration forwarded to ``make_soccer_env``.
            action_type: Action space type forwarded to ``make_soccer_env``.
        """
        self.env = make_soccer_env(config, action_type=action_type)
        self.agents = self.env.agents
        self.num_agents = len(self.agents)

        # For SB3 compatibility: expose the first agent's spaces as "the" spaces
        self.observation_space = self.env.observation_spaces[self.agents[0]]
        self.action_space = self.env.action_spaces[self.agents[0]]

    def reset(self):
        """Reset the wrapped environment and return the stacked observations."""
        observations = self.env.reset()
        return np.array([observations[agent] for agent in self.agents])

    def step(self, actions):
        """Step every agent once, in agent order.

        Args:
            actions: Sequence indexed like ``self.agents``; ``actions[i]`` is
                applied for agent ``i``.

        Returns:
            ``(observations, rewards, done, infos)``. ``rewards`` and ``infos``
            always hold one entry per agent: agents that did not get to act
            because the episode ended earlier in the cycle keep a reward of
            0.0 and an empty info dict.
        """
        # Pre-size the outputs so an early episode end cannot produce ragged
        # results whose length depends on where in the cycle it happened.
        rewards = [0.0] * self.num_agents
        infos = [{} for _ in range(self.num_agents)]
        done = False

        for i, agent in enumerate(self.agents):
            if done:
                break
            self.env.step(actions[i])
            rewards[i] = self.env.rewards[agent]
            infos[i] = self.env.infos[agent]
            done = self.env.terminations[agent] or self.env.truncations[agent]

        obs = [self.env.observe(agent) for agent in self.agents]
        return np.array(obs), np.array(rewards), done, infos

# Notebook progress marker: reached only after every statement above it in this cell has run.
print("✅ Section completed: 🌍 Step 8: メイン環境")

## 🤖 Step 9: エージェント

In [None]:
"""
Agent implementations for soccer environment
"""

import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from abc import ABC, abstractmethod
from typing import Dict, List, Tuple, Optional, Union
from collections import deque
import random


class BaseAgent(ABC):
    """Common interface shared by every agent implementation.

    Subclasses must provide ``select_action`` and ``learn``. ``save`` and
    ``load`` are optional hooks that do nothing unless overridden.
    """

    def __init__(self, agent_id: int, action_space_size: int):
        """Record the agent's index and the size of its action space."""
        self.agent_id, self.action_space_size = agent_id, action_space_size

    @abstractmethod
    def select_action(self, observation: np.ndarray, training: bool = True) -> np.ndarray:
        """Choose an action for ``observation``; ``training`` toggles exploration."""

    @abstractmethod
    def learn(self, experiences: List) -> Dict[str, float]:
        """Update the agent from ``experiences`` and return training metrics."""

    def save(self, filepath: str):
        """Persist the agent to ``filepath`` (no-op in the base class)."""
        return None

    def load(self, filepath: str):
        """Restore the agent from ``filepath`` (no-op in the base class)."""
        return None

class RandomAgent(BaseAgent):
    """Agent that acts uniformly at random; used as a baseline and for testing."""

    def __init__(self, agent_id: int, action_space_size: int = 5,
                 action_type: str = "continuous"):
        """
        Args:
            agent_id: Index of this agent.
            action_space_size: Stored on the base class.
            action_type: "continuous" for 5-element float actions; any other
                value selects the discrete branch.
        """
        super().__init__(agent_id, action_space_size)
        self.action_type = action_type

    def select_action(self, observation: np.ndarray, training: bool = True) -> np.ndarray:
        """Draw a random action; ``observation`` and ``training`` are ignored."""
        if self.action_type != "continuous":
            # Discrete action: one of 9 possible actions
            return np.random.randint(0, 9)

        # (low, high) sampling bounds, in action order:
        # move_x, move_y, kick_power, kick_dir_x, kick_dir_y
        bounds = ((-1, 1), (-1, 1), (0, 1), (-1, 1), (-1, 1))
        samples = [np.random.uniform(low, high) for low, high in bounds]
        return np.array(samples, dtype=np.float32)

    def learn(self, experiences: List) -> Dict[str, float]:
        """Nothing to learn; always reports a zero loss."""
        return {"loss": 0.0}

class MLPNetwork(nn.Module):
    """Fully connected network: ``Linear -> ReLU`` per hidden layer, then a final ``Linear``."""

    def __init__(self, input_dim: int, output_dim: int, hidden_dims: Tuple[int, ...]):
        """
        Args:
            input_dim: Number of input features.
            output_dim: Number of output features (no activation is applied to them).
            hidden_dims: Width of each hidden layer, in order; may be empty.
        """
        super().__init__()

        # Consecutive pairs of ``widths`` give each hidden layer's (in, out) sizes.
        widths = [input_dim, *hidden_dims]
        modules = []
        for fan_in, fan_out in zip(widths, widths[1:]):
            modules += [nn.Linear(fan_in, fan_out), nn.ReLU()]
        modules.append(nn.Linear(widths[-1], output_dim))

        self.network = nn.Sequential(*modules)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Run ``x`` through the layer stack."""
        return self.network(x)

class DQNAgent(BaseAgent):
    """Deep Q-Network agent with epsilon-greedy exploration and experience replay.

    The target network is synchronised with the online network only through
    ``update_target_network``; ``learn`` never calls it, so callers decide how
    often the sync happens.
    """

    def __init__(self, agent_id: int, obs_dim: int, action_dim: int = 9,
                 hidden_dims: Tuple[int, ...] = (256, 128),
                 lr: float = 1e-3, gamma: float = 0.99,
                 epsilon: float = 1.0, epsilon_decay: float = 0.995,
                 epsilon_min: float = 0.01, buffer_size: int = 10000,
                 batch_size: int = 64):
        """
        Args:
            agent_id: Index of this agent.
            obs_dim: Length of one observation vector.
            action_dim: Number of discrete actions.
            hidden_dims: Hidden layer widths of the Q-networks.
            lr: Adam learning rate.
            gamma: Discount factor used in the TD target.
            epsilon: Initial exploration probability.
            epsilon_decay: Multiplicative decay applied to epsilon per ``learn`` update.
            epsilon_min: Lower bound for epsilon.
            buffer_size: Maximum number of stored transitions.
            batch_size: Number of transitions sampled per update.
        """
        super().__init__(agent_id, action_dim)

        self.obs_dim = obs_dim
        self.action_dim = action_dim
        self.gamma = gamma
        self.epsilon = epsilon
        self.epsilon_decay = epsilon_decay
        self.epsilon_min = epsilon_min
        self.batch_size = batch_size

        # Neural networks
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.q_network = MLPNetwork(obs_dim, action_dim, hidden_dims).to(self.device)
        self.target_network = MLPNetwork(obs_dim, action_dim, hidden_dims).to(self.device)
        self.optimizer = optim.Adam(self.q_network.parameters(), lr=lr)

        # Experience replay buffer (oldest transitions are evicted first)
        self.replay_buffer = deque(maxlen=buffer_size)

        # Copy weights to target network
        self.update_target_network()

    def select_action(self, observation: np.ndarray, training: bool = True) -> int:
        """Select action using epsilon-greedy policy.

        Exploration only happens when ``training`` is True; otherwise the
        action with the highest Q-value is always returned.
        """
        if training and np.random.random() < self.epsilon:
            return np.random.randint(self.action_dim)

        with torch.no_grad():
            obs_tensor = torch.FloatTensor(observation).unsqueeze(0).to(self.device)
            q_values = self.q_network(obs_tensor)
            action = q_values.argmax(dim=1).item()

        return action

    def store_experience(self, state: np.ndarray, action: int, reward: float,
                        next_state: np.ndarray, done: bool):
        """Store one ``(state, action, reward, next_state, done)`` transition."""
        self.replay_buffer.append((state, action, reward, next_state, done))

    def learn(self, experiences: List = None, training: bool = True) -> Dict[str, float]:
        """Run one gradient update on a batch sampled from the replay buffer.

        Args:
            experiences: Unused; transitions come from ``store_experience``.
            training: When True, epsilon is decayed after the update.

        Returns:
            ``{"loss": 0.0}`` while the buffer holds fewer than ``batch_size``
            transitions, otherwise the loss and the current epsilon.
        """
        if len(self.replay_buffer) < self.batch_size:
            return {"loss": 0.0}

        # Sample batch from replay buffer
        batch = random.sample(self.replay_buffer, self.batch_size)
        states, actions, rewards, next_states, dones = zip(*batch)

        # Stack into single NumPy arrays first: building a tensor straight
        # from a tuple of ndarrays is slow and triggers a PyTorch warning.
        states = torch.from_numpy(np.asarray(states, dtype=np.float32)).to(self.device)
        actions = torch.from_numpy(np.asarray(actions, dtype=np.int64)).to(self.device)
        rewards = torch.from_numpy(np.asarray(rewards, dtype=np.float32)).to(self.device)
        next_states = torch.from_numpy(np.asarray(next_states, dtype=np.float32)).to(self.device)
        dones = torch.from_numpy(np.asarray(dones, dtype=bool)).to(self.device)

        # Q(s, a) of the actions actually taken. squeeze(1) drops only the
        # gather dimension, so the result stays 1-D even when batch_size == 1.
        current_q_values = self.q_network(states).gather(1, actions.unsqueeze(1)).squeeze(1)

        # TD target: r + gamma * max_a' Q_target(s', a'), without bootstrapping
        # from terminal transitions.
        with torch.no_grad():
            next_q_values = self.target_network(next_states).max(1)[0]
            target_q_values = rewards + (self.gamma * next_q_values * ~dones)

        # Compute loss and update
        loss = nn.MSELoss()(current_q_values, target_q_values)

        self.optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(self.q_network.parameters(), max_norm=1.0)
        self.optimizer.step()

        # Update epsilon (only if training)
        if training:
            self.epsilon = max(self.epsilon_min, self.epsilon * self.epsilon_decay)

        return {"loss": loss.item(), "epsilon": self.epsilon}

    def update_target_network(self):
        """Copy weights from main network to target network"""
        self.target_network.load_state_dict(self.q_network.state_dict())

    def save(self, filepath: str):
        """Save both networks, the optimizer state and epsilon to ``filepath``."""
        torch.save({
            'q_network': self.q_network.state_dict(),
            'target_network': self.target_network.state_dict(),
            'optimizer': self.optimizer.state_dict(),
            'epsilon': self.epsilon
        }, filepath)

    def load(self, filepath: str):
        """Restore the state written by ``save`` from ``filepath``."""
        checkpoint = torch.load(filepath, map_location=self.device)
        self.q_network.load_state_dict(checkpoint['q_network'])
        self.target_network.load_state_dict(checkpoint['target_network'])
        self.optimizer.load_state_dict(checkpoint['optimizer'])
        self.epsilon = checkpoint['epsilon']

# Status message only: nothing is replaced here. DQNAgent, defined above,
# already accepts the 'training' parameter in its learn method.
print("✅ DQNAgent class fixed - 'training' parameter added to learn method")

class MADDPGAgent(BaseAgent):
    """MADDPG agent holding an actor/critic pair, their target copies and exploration noise."""

    # Attribute names whose ``state_dict`` is written to / read from a
    # checkpoint; each name doubles as the checkpoint key.
    _CHECKPOINT_KEYS = (
        'actor',
        'critic',
        'target_actor',
        'target_critic',
        'actor_optimizer',
        'critic_optimizer',
    )

    def __init__(self, agent_id: int, config: MADDPGConfig):
        """Build networks, optimizers and noise process from ``config``."""
        super().__init__(agent_id, config.action_dim)

        self.config = config
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        def build_actor():
            return Actor(config.obs_dim, config.action_dim, config.hidden_dims).to(self.device)

        def build_critic():
            return Critic(config.global_obs_dim, config.global_action_dim,
                          config.hidden_dims).to(self.device)

        # Online networks first, then their target twins.
        self.actor = build_actor()
        self.critic = build_critic()
        self.target_actor = build_actor()
        self.target_critic = build_critic()

        # One Adam optimizer per online network
        self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=config.actor_lr)
        self.critic_optimizer = optim.Adam(self.critic.parameters(), lr=config.critic_lr)

        # Targets start out as exact copies of the online networks
        self.target_actor.load_state_dict(self.actor.state_dict())
        self.target_critic.load_state_dict(self.critic.state_dict())

        # Exploration noise
        self.noise = OUNoise(config.action_dim, sigma=config.noise_scale)

    def select_action(self, observation: np.ndarray, training: bool = True) -> np.ndarray:
        """Return the actor's action; during training, add noise and clip to [-1, 1]."""
        with torch.no_grad():
            batch = torch.FloatTensor(observation).unsqueeze(0).to(self.device)
            action = self.actor(batch).detach().cpu().numpy().flatten()

        if not training:
            return action

        # In-place add keeps the action's dtype unchanged.
        action += self.noise.sample()
        return np.clip(action, -1, 1)

    def learn(self, experiences: Dict) -> Dict[str, float]:
        """Placeholder: the update itself is expected to live in the MADDPG trainer."""
        return {"actor_loss": 0.0, "critic_loss": 0.0}

    def soft_update(self, local_model: nn.Module, target_model: nn.Module):
        """Blend ``local_model`` into ``target_model``: target = tau*local + (1-tau)*target."""
        tau = self.config.tau
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            blended = tau * local_param.data + (1.0 - tau) * target_param.data
            target_param.data.copy_(blended)

    def save(self, filepath: str):
        """Write the state dicts of all networks and optimizers to ``filepath``."""
        checkpoint = {
            key: getattr(self, key).state_dict()
            for key in self._CHECKPOINT_KEYS
        }
        torch.save(checkpoint, filepath)

    def load(self, filepath: str):
        """Restore all networks and optimizers from a checkpoint written by ``save``."""
        checkpoint = torch.load(filepath, map_location=self.device)
        for key in self._CHECKPOINT_KEYS:
            getattr(self, key).load_state_dict(checkpoint[key])

def create_agent(agent_type: str, agent_id: int, config: Dict) -> BaseAgent:
    """Instantiate the agent class selected by ``agent_type``.

    Args:
        agent_type: One of "random", "dqn" or "maddpg".
        agent_id: Index handed to the agent constructor.
        config: Keyword arguments expanded into the DQN / MADDPG constructor;
            ignored for "random".

    Raises:
        ValueError: If ``agent_type`` is not one of the supported names.
    """
    if agent_type == "random":
        return RandomAgent(agent_id)

    if agent_type == "dqn" or agent_type == "maddpg":
        agent_cls = DQNAgent if agent_type == "dqn" else MADDPGAgent
        return agent_cls(agent_id, **config)

    raise ValueError(f"Unknown agent type: {agent_type}")

# Notebook progress marker: reached only after every statement above it in this cell has run.
print("✅ Section completed: 🤖 Step 9: エージェント")

## 📚 Step 10: トレーニング

In [None]:
"""
Training frameworks for multi-agent soccer environment
"""

import numpy as np
import torch
import torch.nn.functional as F
from typing import Dict, List, Tuple, Optional, Any
from collections import deque, defaultdict
import random
import time
from abc import ABC, abstractmethod


class ReplayBuffer:
    """Fixed-capacity store of experience tuples for multi-agent learning.

    Once ``capacity`` is reached, each new experience evicts the oldest one.
    """

    def __init__(self, capacity: int):
        """Create an empty buffer that keeps at most ``capacity`` experiences."""
        self.capacity = capacity
        self.buffer = deque(maxlen=capacity)

    def push(self, experience: Tuple):
        """Append ``experience`` as the newest entry."""
        self.buffer.append(experience)

    def sample(self, batch_size: int) -> List[Tuple]:
        """Draw up to ``batch_size`` distinct experiences at random.

        When fewer experiences are stored than requested, all of them are
        returned (in random order).
        """
        draw_count = min(batch_size, len(self.buffer))
        return random.sample(self.buffer, draw_count)

    def __len__(self):
        """Number of experiences currently stored."""
        return len(self.buffer)

class BaseTrainer(ABC):
    """Shared scaffolding for training frameworks: environment, statistics and evaluation.

    Subclasses implement ``train`` and ``_get_agent_action``.
    """

    def __init__(self, env_config: SoccerEnvironmentConfig,
                 training_config: TrainingConfig):
        """Create the (non-rendering) environment and empty statistics containers."""
        self.env_config, self.training_config = env_config, training_config
        self.env = make_soccer_env(env_config, render_mode=None)

        # Training statistics
        self.episode_rewards, self.episode_lengths, self.scores_history = [], [], []
        self.training_metrics = defaultdict(list)

    @abstractmethod
    def train(self, num_episodes: int) -> Dict[str, Any]:
        """Train agents for ``num_episodes`` episodes and return a summary."""

    def evaluate(self, num_episodes: int = 10) -> Dict[str, float]:
        """Play ``num_episodes`` episodes with ``training=False`` and aggregate the results.

        Returns:
            Average reward summed over agents, average number of environment
            steps, average score per team and the fraction of episodes each
            team won (draws count for neither).
        """
        def is_done(agent_name):
            # An agent is finished once it is either terminated or truncated.
            return (self.env.terminations.get(agent_name, False)
                    or self.env.truncations.get(agent_name, False))

        reward_totals, step_totals = [], []
        blue_scores, red_scores = [], []

        for _ in range(num_episodes):
            self.env.reset()
            reward_sum, step_counter = 0, 0

            while not (all(self.env.terminations.values())
                       or all(self.env.truncations.values())):
                # Decide every live agent's action first, then apply them in
                # the same order.
                planned = {
                    name: self._get_agent_action(name, self.env.observe(name), training=False)
                    for name in self.env.agents
                    if not is_done(name)
                }

                for name, chosen in planned.items():
                    self.env.step(chosen)
                    reward_sum += self.env.rewards.get(name, 0)
                    step_counter += 1

                    if is_done(name):
                        break

            reward_totals.append(reward_sum)
            step_totals.append(step_counter)
            blue_scores.append(self.env.scores[0])
            red_scores.append(self.env.scores[1])

        score_pairs = list(zip(blue_scores, red_scores))
        blue_wins = sum(1 for blue, red in score_pairs if blue > red)
        red_wins = sum(1 for blue, red in score_pairs if red > blue)

        return {
            'avg_reward': np.mean(reward_totals),
            'avg_length': np.mean(step_totals),
            'team_0_avg_score': np.mean(blue_scores),
            'team_1_avg_score': np.mean(red_scores),
            'win_rate_team_0': blue_wins / num_episodes,
            'win_rate_team_1': red_wins / num_episodes,
        }

    @abstractmethod
    def _get_agent_action(self, agent: str, observation: np.ndarray, training: bool = True) -> np.ndarray:
        """Return the action ``agent`` takes for ``observation``."""

class IndependentLearningTrainer(BaseTrainer):
    """Trainer in which every agent learns only from its own transitions.

    One agent object is created per entry of ``self.env.agents``: a
    ``DQNAgent`` when ``agent_type == "dqn"`` or a ``RandomAgent`` when
    ``agent_type == "random"``.
    """

    def __init__(self, env_config: SoccerEnvironmentConfig,
                 training_config: TrainingConfig,
                 agent_type: str = "dqn",
                 agent_configs: Optional[Dict] = None):
        """Build the environment (via the base class) and one agent per player.

        Args:
            env_config: Forwarded to ``BaseTrainer.__init__``.
            training_config: Forwarded to ``BaseTrainer.__init__``.
            agent_type: ``"dqn"`` or ``"random"``.
            agent_configs: Extra keyword arguments for ``DQNAgent``. May
                contain ``obs_dim`` to override the default of 28.

        Raises:
            ValueError: If ``agent_type`` is not a supported type.
        """
        super().__init__(env_config, training_config)

        # Validate up front so a bad type is reported even when the
        # environment exposes no agents.
        if agent_type not in ("dqn", "random"):
            raise ValueError(f"Unknown agent type: {agent_type}")

        self.agent_type = agent_type
        self.agent_configs = agent_configs or {}

        # Create agents
        self.agents = {}
        for i, agent_name in enumerate(self.env.agents):
            if agent_type == "dqn":
                # 28 is the default observation width (per the original
                # comment it comes from the observation space). Merging it
                # into the kwargs lets a caller-supplied ``obs_dim`` win
                # instead of raising "got multiple values for keyword".
                dqn_kwargs = {"obs_dim": 28, **self.agent_configs}
                self.agents[agent_name] = DQNAgent(agent_id=i, **dqn_kwargs)
            else:
                self.agents[agent_name] = RandomAgent(i)

    def train(self, num_episodes: int) -> Dict[str, Any]:
        """Train every agent independently for ``num_episodes`` episodes.

        Per episode: reset the environment, repeatedly collect one action per
        active agent and step the environment with each of them. For DQN
        agents every step is stored as a transition and followed by one
        ``learn()`` call; target networks are synchronised every 100
        episodes. Progress is printed every 100 episodes and ``evaluate`` is
        run every 500 episodes (except episode 0).

        Args:
            num_episodes: Number of training episodes.

        Returns:
            Dict with ``episode_rewards``, ``episode_lengths``,
            ``scores_history`` and ``training_metrics`` (as a plain dict).
        """
        print(f"Starting independent learning training with {self.agent_type} agents")

        for episode in range(num_episodes):
            self.env.reset()
            episode_rewards = {agent: 0 for agent in self.env.agents}
            episode_length = 0

            # Observation each agent acted on; becomes the "state" of the
            # replay transition once the step result is known.
            prev_observations = {}

            while not all(self.env.terminations.values()) and not all(self.env.truncations.values()):
                actions = {}

                # Get actions from all agents that are still active
                for agent in self.env.agents:
                    if self.env.terminations.get(agent, False) or self.env.truncations.get(agent, False):
                        continue
                    obs = self.env.observe(agent)
                    actions[agent] = self.agents[agent].select_action(obs, training=True)
                    prev_observations[agent] = obs

                if not actions:
                    # Every agent is individually finished but the loop
                    # condition is still true; nothing can advance the env.
                    break

                # Execute actions
                # NOTE(review): for DQN agents the raw select_action() result
                # is passed to env.step() here, while _get_agent_action()
                # converts it to a continuous action first. Confirm which
                # form the environment expects.
                for agent, action in actions.items():
                    self.env.step(action)
                    reward = self.env.rewards.get(agent, 0)
                    episode_rewards[agent] += reward
                    episode_length += 1

                    done = self.env.terminations.get(agent, False) or self.env.truncations.get(agent, False)

                    # Store experience for DQN agents
                    if self.agent_type == "dqn" and agent in prev_observations:
                        next_obs = self.env.observe(agent)

                        # The replay buffer stores a scalar action index.
                        stored_action = action
                        if isinstance(stored_action, np.ndarray):
                            stored_action = int(stored_action[0]) if len(stored_action) > 0 else 0

                        self.agents[agent].store_experience(
                            prev_observations[agent], stored_action, reward, next_obs, done
                        )

                        # Learn from experience; a loss of 0 is not recorded.
                        metrics = self.agents[agent].learn()
                        if metrics and metrics['loss'] > 0:
                            self.training_metrics[f'{agent}_loss'].append(metrics['loss'])

                    if done:
                        break

            # Update target networks for DQN agents
            if self.agent_type == "dqn" and episode % 100 == 0:
                for agent in self.agents.values():
                    agent.update_target_network()

            # Record episode statistics
            self.episode_rewards.append(sum(episode_rewards.values()))
            self.episode_lengths.append(episode_length)
            self.scores_history.append(self.env.scores.copy())

            # Print progress
            if episode % 100 == 0:
                # Slicing already copes with fewer than 100 recorded episodes.
                avg_reward = np.mean(self.episode_rewards[-100:])
                print(f"Episode {episode}: Avg Reward (last 100): {avg_reward:.2f}, Scores: {self.env.scores}")

                # Evaluate current performance
                if episode % 500 == 0 and episode > 0:
                    eval_metrics = self.evaluate(num_episodes=10)
                    print(f"Evaluation: {eval_metrics}")

        return {
            'episode_rewards': self.episode_rewards,
            'episode_lengths': self.episode_lengths,
            'scores_history': self.scores_history,
            'training_metrics': dict(self.training_metrics)
        }

    def _get_agent_action(self, agent: str, observation: np.ndarray, training: bool = True) -> np.ndarray:
        """Return the environment action chosen by ``agent`` for ``observation``.

        DQN agents produce a discrete action, which is converted with
        ``ActionSpace("discrete").convert_discrete_to_continuous``; any other
        agent type's ``select_action`` result is returned unchanged.
        """
        chosen = self.agents[agent].select_action(observation, training)
        if self.agent_type != "dqn":
            return chosen
        # Convert discrete action to continuous for environment
        return ActionSpace("discrete").convert_discrete_to_continuous(chosen)

class MADDPGTrainer(BaseTrainer):
    """MADDPG trainer: one actor per agent, critics fed the joint state/action.

    Each transition stores the concatenation of all agents' observations
    ("global" observation) and actions. ``_train_maddpg_step`` slices those
    vectors back into per-agent pieces using the fixed widths below.
    """

    # Per-agent slice widths inside the concatenated vectors.
    _OBS_DIM = 28
    _ACTION_DIM = 5

    def __init__(self, env_config: SoccerEnvironmentConfig,
                 training_config: TrainingConfig,
                 maddpg_config: MADDPGConfig):
        """Create one ``MADDPGAgent`` per environment agent and a shared buffer.

        Args:
            env_config: Forwarded to ``BaseTrainer.__init__``.
            training_config: Forwarded to ``BaseTrainer.__init__``.
            maddpg_config: Passed to every ``MADDPGAgent``; its
                ``buffer_size``, ``batch_size`` and ``gamma`` are read here.
        """
        super().__init__(env_config, training_config)
        self.maddpg_config = maddpg_config

        # Create MADDPG agents
        self.agents = {
            agent_name: MADDPGAgent(i, maddpg_config)
            for i, agent_name in enumerate(self.env.agents)
        }

        # Shared replay buffer
        self.replay_buffer = ReplayBuffer(maddpg_config.buffer_size)

    def train(self, num_episodes: int) -> Dict[str, Any]:
        """Collect episodes and run one MADDPG update after each of them.

        Args:
            num_episodes: Number of training episodes.

        Returns:
            Dict with ``episode_rewards``, ``episode_lengths``,
            ``scores_history`` and ``training_metrics`` (as a plain dict).
        """
        print("Starting MADDPG training")

        for episode in range(num_episodes):
            self.env.reset()
            episode_rewards = {agent: 0 for agent in self.env.agents}
            episode_length = 0

            # Episode experience
            episode_experiences = []

            while not all(self.env.terminations.values()) and not all(self.env.truncations.values()):
                # Build the joint observation/action over ALL agents so the
                # stored vectors always have the same width as
                # next_global_obs. Agents that are already finished
                # contribute their observation and a zero action.
                global_obs = []
                global_actions = []
                actions = {}

                for agent in self.env.agents:
                    obs = self.env.observe(agent)
                    global_obs.append(obs)

                    if self.env.terminations.get(agent, False) or self.env.truncations.get(agent, False):
                        global_actions.append(np.zeros(self._ACTION_DIM, dtype=np.float32))
                        continue

                    action = self.agents[agent].select_action(obs, training=True)
                    actions[agent] = action
                    global_actions.append(action)

                if not actions:
                    # No agent can act although the loop condition holds;
                    # stop rather than loop forever on an unchanging env.
                    break

                # Execute actions and collect rewards
                step_experience = {
                    'global_obs': np.concatenate(global_obs),
                    'actions': actions.copy(),
                    'global_actions': np.concatenate(global_actions),
                    'rewards': {},
                    'next_global_obs': None,
                    'dones': {}
                }

                for agent, action in actions.items():
                    self.env.step(action)
                    reward = self.env.rewards.get(agent, 0)
                    done = self.env.terminations.get(agent, False) or self.env.truncations.get(agent, False)
                    episode_rewards[agent] += reward
                    step_experience['rewards'][agent] = reward
                    step_experience['dones'][agent] = done
                    episode_length += 1

                    if done:
                        break

                # The loop above stops at the first finished agent, so later
                # agents may have no entry yet. Back-fill from the current
                # environment state; otherwise the training step fails with a
                # KeyError when it samples this transition.
                for agent in self.agents:
                    step_experience['rewards'].setdefault(agent, self.env.rewards.get(agent, 0))
                    step_experience['dones'].setdefault(
                        agent,
                        self.env.terminations.get(agent, False) or self.env.truncations.get(agent, False),
                    )

                # Get next global observation
                step_experience['next_global_obs'] = np.concatenate(
                    [self.env.observe(agent) for agent in self.env.agents]
                )

                episode_experiences.append(step_experience)

            # Store experiences in replay buffer
            for exp in episode_experiences:
                self.replay_buffer.push(exp)

            # Train agents if enough experiences
            if len(self.replay_buffer) > self.maddpg_config.batch_size:
                self._train_maddpg_step()

            # Record statistics
            self.episode_rewards.append(sum(episode_rewards.values()))
            self.episode_lengths.append(episode_length)
            self.scores_history.append(self.env.scores.copy())

            # Print progress
            if episode % 100 == 0:
                # Slicing already copes with fewer than 100 recorded episodes.
                avg_reward = np.mean(self.episode_rewards[-100:])
                print(f"Episode {episode}: Avg Reward: {avg_reward:.2f}, Scores: {self.env.scores}")

        return {
            'episode_rewards': self.episode_rewards,
            'episode_lengths': self.episode_lengths,
            'scores_history': self.scores_history,
            'training_metrics': dict(self.training_metrics)
        }

    def _train_maddpg_step(self):
        """Sample one batch and update every agent's critic and actor once.

        For each agent: the critic is regressed onto
        ``r + gamma * Q_target(s', a') * (1 - done)``, where ``a'`` comes from
        all agents' target actors. The actor is then updated to maximise the
        critic with this agent's slice of the joint action replaced by the
        actor output. Target networks are soft-updated and both losses are
        appended to ``self.training_metrics``.
        """
        batch = self.replay_buffer.sample(self.maddpg_config.batch_size)
        obs_dim, act_dim = self._OBS_DIM, self._ACTION_DIM

        # The joint arrays are identical for every agent: stack them once.
        # Going through NumPy avoids the slow list-of-ndarrays tensor path.
        states_np = np.asarray([exp['global_obs'] for exp in batch], dtype=np.float32)
        actions_np = np.asarray([exp['global_actions'] for exp in batch], dtype=np.float32)
        next_states_np = np.asarray([exp['next_global_obs'] for exp in batch], dtype=np.float32)

        all_agents = list(self.agents.values())

        for i, (agent_name, agent) in enumerate(self.agents.items()):
            device = agent.device
            states = torch.as_tensor(states_np, device=device)
            actions = torch.as_tensor(actions_np, device=device)
            next_states = torch.as_tensor(next_states_np, device=device)
            rewards = torch.as_tensor(
                [exp['rewards'][agent_name] for exp in batch],
                dtype=torch.float32, device=device,
            )
            # 1.0 where the episode continues, 0.0 where it ended.
            not_done = 1.0 - torch.as_tensor(
                [exp['dones'][agent_name] for exp in batch],
                dtype=torch.float32, device=device,
            )

            # Get agent-specific observations
            agent_obs = states[:, i * obs_dim:(i + 1) * obs_dim]

            # Update critic
            with torch.no_grad():
                next_actions = torch.cat([
                    other.target_actor(next_states[:, j * obs_dim:(j + 1) * obs_dim])
                    for j, other in enumerate(all_agents)
                ], dim=1)
                # Flatten to (batch,) before combining with the (batch,)
                # rewards; a (batch, 1) critic output would otherwise
                # broadcast the target to (batch, batch).
                next_q = agent.target_critic(next_states, next_actions).reshape(-1)
                target_q = rewards + self.maddpg_config.gamma * next_q * not_done

            current_q = agent.critic(states, actions).reshape(-1)
            critic_loss = F.mse_loss(current_q, target_q)

            agent.critic_optimizer.zero_grad()
            critic_loss.backward()
            torch.nn.utils.clip_grad_norm_(agent.critic.parameters(), max_norm=0.5)
            agent.critic_optimizer.step()

            # Update actor: substitute this agent's slice of the joint action
            # with the differentiable actor output.
            agent_actions = agent.actor(agent_obs)
            full_actions = actions.clone()
            full_actions[:, i * act_dim:(i + 1) * act_dim] = agent_actions

            actor_loss = -agent.critic(states, full_actions).mean()

            agent.actor_optimizer.zero_grad()
            actor_loss.backward()
            torch.nn.utils.clip_grad_norm_(agent.actor.parameters(), max_norm=0.5)
            agent.actor_optimizer.step()

            # Soft update target networks
            agent.soft_update(agent.actor, agent.target_actor)
            agent.soft_update(agent.critic, agent.target_critic)

            # Record metrics
            self.training_metrics[f'{agent_name}_critic_loss'].append(critic_loss.item())
            self.training_metrics[f'{agent_name}_actor_loss'].append(actor_loss.item())

    def _get_agent_action(self, agent: str, observation: np.ndarray, training: bool = True) -> np.ndarray:
        """Return the action selected by the named MADDPG agent."""
        return self.agents[agent].select_action(observation, training)

def create_trainer(trainer_type: str, env_config: SoccerEnvironmentConfig,
                  training_config: TrainingConfig, **kwargs) -> BaseTrainer:
    """Build the trainer named by ``trainer_type``.

    Args:
        trainer_type: ``"independent"`` or ``"maddpg"``.
        env_config: Environment configuration handed to the trainer.
        training_config: Training configuration handed to the trainer.
        **kwargs: For ``"independent"`` these are forwarded unchanged to
            ``IndependentLearningTrainer``. For ``"maddpg"`` only the
            optional ``maddpg_config`` entry is read (a default
            ``MADDPGConfig()`` is used when it is absent).

    Returns:
        The newly constructed trainer.

    Raises:
        ValueError: If ``trainer_type`` is neither of the supported names.
    """
    if trainer_type == "independent":
        return IndependentLearningTrainer(env_config, training_config, **kwargs)

    if trainer_type == "maddpg":
        config_for_maddpg = kwargs.get('maddpg_config', MADDPGConfig())
        return MADDPGTrainer(env_config, training_config, config_for_maddpg)

    raise ValueError(f"Unknown trainer type: {trainer_type}")

# Marker printed once the statements above in this cell have run.
print("✅ Section completed: 📚 Step 10: トレーニング")

## 🧪 Step 11: テスト関数

In [None]:
"""
Test script for soccer environment with random agents
"""

import numpy as np
import matplotlib.pyplot as plt
from typing import List, Dict
import time


def test_basic_environment():
    """Smoke-test environment creation, reset and one step per agent.

    Builds a continuous-action environment without rendering, prints its
    agents and spaces, resets it, then steps once with a sampled action for
    each agent. The environment is always closed afterwards, also when a
    step fails.

    Returns:
        ``True`` when every operation completed without raising.
    """
    print("Testing basic environment functionality...")

    # Create environment
    config = SoccerEnvironmentConfig()
    env = make_soccer_env(config, render_mode=None, action_type="continuous")

    try:
        first_agent = env.agents[0]
        print(f"Environment created with {len(env.agents)} agents")
        print(f"Agents: {env.agents}")
        print(f"Observation space: {env.observation_spaces[first_agent]}")
        print(f"Action space: {env.action_spaces[first_agent]}")

        # Test reset
        observations = env.reset()
        print(f"Reset successful, observations shape: {[obs.shape for obs in observations.values()]}")

        # Test step
        for agent in env.agents:
            action = env.action_spaces[agent].sample()
            print(f"Agent {agent} taking action: {action}")
            env.step(action)
    finally:
        # Release the environment like the other tests in this suite do.
        env.close()

    print("Basic environment test completed successfully!")
    return True

def test_random_agents_episode():
    """Play one complete episode with random agents and print a summary.

    Agents act in ``env.agents`` order while they are neither terminated nor
    truncated; the episode ends once all agents are terminated or all are
    truncated. The environment is closed even if the episode raises.

    Returns:
        ``True`` when the episode completed without raising.
    """
    print("Testing full episode with random agents...")

    # Create environment and agents
    config = SoccerEnvironmentConfig()
    env = make_soccer_env(config, render_mode=None, action_type="continuous")

    try:
        # Create random agents
        agents = {
            agent_name: RandomAgent(i, action_space_size=5, action_type="continuous")
            for i, agent_name in enumerate(env.agents)
        }

        # Run episode
        env.reset()
        episode_rewards = {agent: 0 for agent in env.agents}
        episode_length = 0

        print("Running episode...")
        start_time = time.time()

        while not all(env.terminations.values()) and not all(env.truncations.values()):
            for agent in env.agents:
                if env.terminations.get(agent, False) or env.truncations.get(agent, False):
                    continue

                # Get action from agent and take the step
                obs = env.observe(agent)
                action = agents[agent].select_action(obs, training=False)
                env.step(action)

                # Accumulate reward
                episode_rewards[agent] += env.rewards.get(agent, 0)
                episode_length += 1

                # Stop this round as soon as the acting agent is done
                if env.terminations.get(agent, False) or env.truncations.get(agent, False):
                    break

        elapsed_time = time.time() - start_time

        print(f"Episode completed in {elapsed_time:.2f} seconds")
        print(f"Episode length: {episode_length} steps")
        print(f"Final scores: {env.scores}")
        print(f"Episode rewards: {episode_rewards}")
    finally:
        # Always release the environment, including on failure.
        env.close()

    return True

def test_multiple_episodes(num_episodes: int = 5):
    """Run several random-agent episodes and print aggregate statistics.

    Args:
        num_episodes: Number of episodes to play.

    Returns:
        ``True`` when all episodes completed without raising. The
        environment is closed in every case.
    """
    print(f"Testing {num_episodes} episodes for performance analysis...")

    config = SoccerEnvironmentConfig()
    env = make_soccer_env(config, render_mode=None, action_type="continuous")

    try:
        # Create random agents
        agents = {
            agent_name: RandomAgent(i, action_space_size=5, action_type="continuous")
            for i, agent_name in enumerate(env.agents)
        }

        # Statistics
        episode_lengths = []
        episode_rewards = []
        final_scores = []

        for episode in range(num_episodes):
            print(f"Episode {episode + 1}/{num_episodes}")

            env.reset()
            episode_reward = {agent: 0 for agent in env.agents}
            steps = 0

            while not all(env.terminations.values()) and not all(env.truncations.values()):
                for agent in env.agents:
                    if env.terminations.get(agent, False) or env.truncations.get(agent, False):
                        continue

                    obs = env.observe(agent)
                    action = agents[agent].select_action(obs, training=False)
                    env.step(action)
                    episode_reward[agent] += env.rewards.get(agent, 0)
                    steps += 1

                    # Stop this round as soon as the acting agent is done
                    if env.terminations.get(agent, False) or env.truncations.get(agent, False):
                        break

            episode_lengths.append(steps)
            episode_rewards.append(episode_reward)
            final_scores.append(env.scores.copy())

            print(f"  Steps: {steps}, Scores: {env.scores}, Avg Reward: {np.mean(list(episode_reward.values())):.2f}")

        # Print statistics
        print(f"\n=== Statistics over {num_episodes} episodes ===")
        print(f"Average episode length: {np.mean(episode_lengths):.2f} ± {np.std(episode_lengths):.2f}")

        # Team scores
        team_0_scores = [score[0] for score in final_scores]
        team_1_scores = [score[1] for score in final_scores]

        print(f"Team 0 (Blue) average score: {np.mean(team_0_scores):.2f} ± {np.std(team_0_scores):.2f}")
        print(f"Team 1 (Red) average score: {np.mean(team_1_scores):.2f} ± {np.std(team_1_scores):.2f}")

        # Average rewards per agent
        for agent in env.agents:
            agent_rewards = [ep_reward[agent] for ep_reward in episode_rewards]
            print(f"{agent} average reward: {np.mean(agent_rewards):.2f} ± {np.std(agent_rewards):.2f}")
    finally:
        # Always release the environment, including on failure.
        env.close()

    return True

def run_all_tests():
    """Run every test in the suite and print a per-test and overall summary.

    A test passes only if it returns a truthy value without raising. The
    per-test line and the final summary are derived from the same boolean,
    so they always agree.

    Returns:
        ``True`` if every test passed, ``False`` otherwise.
    """
    print("=" * 60)
    print("Multi-Agent Soccer Environment Test Suite")
    print("=" * 60)

    tests = [
        ("Basic Environment", test_basic_environment),
        ("Random Agents Episode", test_random_agents_episode),
        ("Multiple Episodes", lambda: test_multiple_episodes(3))
    ]

    results = []
    for test_name, test_func in tests:
        print(f"\n[TEST] {test_name}")
        print("-" * 40)
        try:
            passed = bool(test_func())
        except Exception as e:
            # Suite boundary: report the failure and keep running the rest.
            print(f"✗ {test_name} FAILED: {e}")
            passed = False
        else:
            if passed:
                print(f"✓ {test_name} PASSED")
            else:
                # No exception, but the test itself reported failure.
                print(f"✗ {test_name} FAILED: test returned a falsy result")
        results.append(passed)

    print("\n" + "=" * 60)
    print("TEST SUMMARY")
    print("=" * 60)
    for (test_name, _), passed in zip(tests, results):
        status = "PASSED" if passed else "FAILED"
        print(f"{test_name}: {status}")

    passed_count = sum(results)
    success_rate = passed_count / len(results)
    print(f"\nSuccess Rate: {success_rate:.1%} ({passed_count}/{len(results)})")

    return success_rate == 1.0

# Run the whole suite when this code executes as the main module.
# NOTE(review): inside a notebook kernel __name__ is normally "__main__", so
# running this cell also runs the tests - confirm that is intended.
if __name__ == "__main__":
    run_all_tests()

# Marker printed once the statements above in this cell have run.
print("✅ Section completed: 🧪 Step 11: テスト関数")

## 🚀 Step 12: 実行セクション

**注意**: 上記のすべてのコードセルを実行してから以下を実行してください。

In [None]:
# Environment smoke test: create, reset, step once and close.
print("🧪 Testing the environment...")
print("=" * 60)

try:
    # Quick check: build a continuous-action environment without rendering
    test_config = SoccerEnvironmentConfig()
    test_env = make_soccer_env(test_config, render_mode=None, action_type="continuous")
    print(f"✅ Environment created successfully!")
    print(f"   Agents: {test_env.agents}")
    print(f"   Observation space: {test_env.observation_spaces[test_env.agents[0]].shape}")
    print(f"   Action space: {test_env.action_spaces[test_env.agents[0]].shape}")
    
    # Reset test
    test_env.reset()
    print(f"✅ Environment reset successful!")
    
    # Step test: sample an action for the first listed agent only
    for agent in test_env.agents:
        action = test_env.action_spaces[agent].sample()
        test_env.step(action)
        break  # Just test one step
    print(f"✅ Environment step successful!")
    
    test_env.close()
    print(f"\n✅ All basic tests passed!")
    
except Exception as e:
    # Any failure is reported instead of raised; the most likely cause in a
    # notebook is that the defining cells above have not been executed yet.
    print(f"❌ Error during testing: {e}")
    print("Please make sure all previous cells have been executed.")

In [None]:
# Baseline run (random agents)
print("🎮 Running baseline with random agents...")
print("=" * 60)

# Create the environment and the agents
config = SoccerEnvironmentConfig()
env = make_soccer_env(config, render_mode=None, action_type="continuous")

# Create one random agent per environment agent
random_agents = {}
for i, agent_name in enumerate(env.agents):
    random_agents[agent_name] = RandomAgent(i, action_space_size=5, action_type="continuous")

# Run 5 episodes
episode_results = []
for episode in range(5):
    env.reset()
    episode_reward = 0  # summed over all agents and steps of this episode
    steps = 0
    
    # An episode lasts until all agents are terminated or all are truncated.
    while not all(env.terminations.values()) and not all(env.truncations.values()):
        for agent in env.agents:
            if not env.terminations.get(agent, False) and not env.truncations.get(agent, False):
                obs = env.observe(agent)
                action = random_agents[agent].select_action(obs, training=False)
                env.step(action)
                episode_reward += env.rewards.get(agent, 0)
                steps += 1
                
                # Stop this round as soon as the acting agent is done
                if env.terminations.get(agent, False) or env.truncations.get(agent, False):
                    break
    
    episode_results.append({
        'steps': steps,
        'reward': episode_reward,
        'scores': env.scores.copy()
    })
    
    print(f"Episode {episode + 1}: Steps={steps}, Scores={env.scores}, Reward={episode_reward:.2f}")

# Statistics
avg_steps = np.mean([r['steps'] for r in episode_results])
avg_reward = np.mean([r['reward'] for r in episode_results])
print(f"\n📊 Statistics:")
print(f"   Average steps: {avg_steps:.1f}")
print(f"   Average reward: {avg_reward:.2f}")

env.close()
print("\n✅ Baseline completed successfully!")

## 📝 まとめ

### ✅ 実装完了
- 完全な物理エンジンとPettingZoo互換環境
- Random, DQN, MADDPG エージェント
- 訓練フレームワーク

### 🔧 修正済みの問題
- ModuleNotFoundError: 内部import削除
- TypeError (agent_selector): AgentSelectorに修正
- 実行順序の依存関係

### 📚 次のステップ
1. より長いエピソードで訓練
2. ハイパーパラメータの調整
3. 学習曲線の分析

**Happy Training! 🎮**

### 🔧 DQNAgent Fix (run this cell if `DQNAgent.learn()` raises an error about the `training` argument)

In [None]:
class DQNAgent(BaseAgent):
    """Deep Q-Network agent with a target network and a replay buffer.

    This is the fixed version whose ``learn`` accepts a ``training`` flag.
    Actions are chosen epsilon-greedily; epsilon is decayed after every
    successful ``learn`` call made with ``training=True``.
    """

    def __init__(self, agent_id: int, obs_dim: int, action_dim: int = 9,
                 hidden_dims: Tuple[int, ...] = (256, 128),
                 lr: float = 1e-3, gamma: float = 0.99,
                 epsilon: float = 1.0, epsilon_decay: float = 0.995,
                 epsilon_min: float = 0.01, buffer_size: int = 10000,
                 batch_size: int = 64):
        """Create the online/target networks, optimizer and replay buffer.

        Args:
            agent_id: Identifier forwarded to ``BaseAgent``.
            obs_dim: Input width of the Q-network.
            action_dim: Number of discrete actions (Q-network output width).
            hidden_dims: Hidden layer sizes passed to ``MLPNetwork``.
            lr: Adam learning rate.
            gamma: Discount factor used in the TD target.
            epsilon: Initial exploration probability.
            epsilon_decay: Multiplicative decay applied to epsilon.
            epsilon_min: Lower bound for epsilon.
            buffer_size: Maximum number of stored transitions.
            batch_size: Number of transitions sampled per update.
        """
        super().__init__(agent_id, action_dim)

        self.obs_dim = obs_dim
        self.action_dim = action_dim
        self.gamma = gamma
        self.epsilon = epsilon
        self.epsilon_decay = epsilon_decay
        self.epsilon_min = epsilon_min
        self.batch_size = batch_size

        # Neural networks
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.q_network = MLPNetwork(obs_dim, action_dim, hidden_dims).to(self.device)
        self.target_network = MLPNetwork(obs_dim, action_dim, hidden_dims).to(self.device)
        self.optimizer = optim.Adam(self.q_network.parameters(), lr=lr)

        # Experience replay buffer; the oldest entries drop out when full
        self.replay_buffer = deque(maxlen=buffer_size)

        # Copy weights to target network
        self.update_target_network()

    def select_action(self, observation: np.ndarray, training: bool = True) -> int:
        """Pick an action index with an epsilon-greedy policy.

        Args:
            observation: Single observation fed to the Q-network.
            training: When ``True`` a uniformly random action is returned
                with probability ``self.epsilon``; when ``False`` the action
                is always greedy.

        Returns:
            The chosen action index as a plain ``int``.
        """
        if training and np.random.random() < self.epsilon:
            # Cast so both branches return a Python int, not np.int64.
            return int(np.random.randint(self.action_dim))

        with torch.no_grad():
            obs_tensor = torch.as_tensor(
                observation, dtype=torch.float32, device=self.device
            ).unsqueeze(0)
            return self.q_network(obs_tensor).argmax(dim=1).item()

    def store_experience(self, state: np.ndarray, action: int, reward: float,
                        next_state: np.ndarray, done: bool):
        """Append one ``(state, action, reward, next_state, done)`` transition."""
        self.replay_buffer.append((state, action, reward, next_state, done))

    def learn(self, experiences: List = None, training: bool = True) -> Dict[str, float]:
        """Run one Q-learning update on a batch sampled from the replay buffer.

        Args:
            experiences: Unused; kept so existing callers keep working.
            training: When ``True`` epsilon is decayed after the update.

        Returns:
            ``{"loss": 0.0}`` while the buffer holds fewer than
            ``batch_size`` transitions, otherwise a dict with the batch
            ``loss`` and the current ``epsilon``.
        """
        if len(self.replay_buffer) < self.batch_size:
            return {"loss": 0.0}

        # Sample batch from replay buffer
        batch = random.sample(self.replay_buffer, self.batch_size)
        states, actions, rewards, next_states, dones = zip(*batch)

        # Stack through NumPy first: building a tensor directly from a tuple
        # of ndarrays is very slow.
        states = torch.as_tensor(np.asarray(states, dtype=np.float32), device=self.device)
        actions = torch.as_tensor(np.asarray(actions, dtype=np.int64), device=self.device)
        rewards = torch.as_tensor(np.asarray(rewards, dtype=np.float32), device=self.device)
        next_states = torch.as_tensor(np.asarray(next_states, dtype=np.float32), device=self.device)
        dones = torch.as_tensor(np.asarray(dones, dtype=bool), device=self.device)

        # Q(s, a) of the actions actually taken. squeeze(1) keeps the batch
        # dimension even for a batch of one, so it always matches the target.
        current_q_values = self.q_network(states).gather(1, actions.unsqueeze(1)).squeeze(1)

        # TD target from the target network; no bootstrap on terminal steps
        with torch.no_grad():
            next_q_values = self.target_network(next_states).max(1)[0]
            target_q_values = rewards + self.gamma * next_q_values * (~dones).float()

        # Compute loss and update
        loss = F.mse_loss(current_q_values, target_q_values)

        self.optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(self.q_network.parameters(), max_norm=1.0)
        self.optimizer.step()

        # Update epsilon (only if training)
        if training:
            self.epsilon = max(self.epsilon_min, self.epsilon * self.epsilon_decay)

        return {"loss": loss.item(), "epsilon": self.epsilon}

    def update_target_network(self):
        """Copy weights from main network to target network."""
        self.target_network.load_state_dict(self.q_network.state_dict())

    def save(self, filepath: str):
        """Save both networks, the optimizer state and epsilon to ``filepath``."""
        torch.save({
            'q_network': self.q_network.state_dict(),
            'target_network': self.target_network.state_dict(),
            'optimizer': self.optimizer.state_dict(),
            'epsilon': self.epsilon
        }, filepath)

    def load(self, filepath: str):
        """Restore the state written by ``save`` from ``filepath``."""
        checkpoint = torch.load(filepath, map_location=self.device)
        self.q_network.load_state_dict(checkpoint['q_network'])
        self.target_network.load_state_dict(checkpoint['target_network'])
        self.optimizer.load_state_dict(checkpoint['optimizer'])
        self.epsilon = checkpoint['epsilon']

# The class statement above rebinds the name DQNAgent to the fixed version.
# Agents constructed after this point use it; existing instances do not change.
print("✅ DQNAgent class fixed - 'training' parameter added to learn method")

# 🚀 Extended Training and Visualization
## より長いエピソードでの訓練と動画可視化

このセクションでは、より長いエピソードでエージェントを訓練し、結果を動画として可視化します。

### 📦 動画作成用の追加ライブラリ

In [None]:
# Install the libraries used for video creation (notebook shell commands)
!apt-get update -qq
!apt-get install -qq xvfb
!pip install -q imageio imageio-ffmpeg
!pip install -q pyvirtualdisplay

import imageio
from IPython.display import HTML, display
import base64

# Virtual display for rendering
# It is started here as soon as the cell runs; nothing in this cell stops it.
from pyvirtualdisplay import Display
display_virtual = Display(visible=0, size=(1400, 900))
display_virtual.start()

print("✅ Video dependencies installed successfully!")

### 📚 拡張訓練クラスの実装

In [None]:
class ExtendedTrainer:
    """Train soccer agents over many episodes and capture rendered frames.

    Per-episode totals are accumulated on the instance:

    * ``episode_rewards`` - sum of all agents' rewards for the episode
    * ``episode_lengths`` - number of agent steps taken in the episode
    * ``scores_history``  - copy of ``env.scores`` at the end of the episode
    """

    def __init__(self, env_config, agent_type="dqn"):
        """Store the environment config and the kind of agents to create.

        Args:
            env_config: Configuration object forwarded to ``make_soccer_env``.
            agent_type: One of ``"random"``, ``"dqn"`` or ``"maddpg"``.
        """
        self.env_config = env_config
        self.agent_type = agent_type
        self.episode_rewards = []
        self.episode_lengths = []
        self.scores_history = []
        self.video_frames = []

    def create_agents(self, env):
        """Build one agent per entry of ``env.agents``.

        Args:
            env: Environment exposing an ``agents`` sequence of agent names.

        Returns:
            Dict mapping agent name to a freshly constructed agent.

        Raises:
            ValueError: If ``self.agent_type`` is not a supported type.
                Previously an empty dict was returned and training failed
                later with an unrelated ``KeyError``.
        """
        if self.agent_type == "random":
            return {
                agent_name: RandomAgent(i, action_space_size=5, action_type="continuous")
                for i, agent_name in enumerate(env.agents)
            }

        if self.agent_type == "dqn":
            return {
                agent_name: DQNAgent(
                    agent_id=i,
                    obs_dim=28,
                    action_dim=9,
                    hidden_dims=(256, 128),
                    lr=1e-3,
                    gamma=0.99,
                    epsilon=1.0,
                    epsilon_decay=0.995,
                    epsilon_min=0.01,
                    buffer_size=10000,
                    batch_size=64,
                )
                for i, agent_name in enumerate(env.agents)
            }

        if self.agent_type == "maddpg":
            maddpg_config = MADDPGConfig()
            return {
                agent_name: MADDPGAgent(i, maddpg_config)
                for i, agent_name in enumerate(env.agents)
            }

        raise ValueError(
            f"Unsupported agent_type {self.agent_type!r}; "
            "expected 'random', 'dqn' or 'maddpg'"
        )

    @staticmethod
    def _agent_done(env, agent_name):
        """Return a truthy value once the agent is terminated or truncated."""
        return env.terminations.get(agent_name, False) or env.truncations.get(agent_name, False)

    def _run_episode(self, env, agents, record, action_converter):
        """Play one episode on an already reset ``env``.

        Args:
            env: Environment to step.
            agents: Dict of agent name -> agent.
            record: When true, ``env.render()`` is captured once per round.
            action_converter: Object providing ``convert_discrete_to_continuous``;
                only used for DQN agents.

        Returns:
            Tuple ``(rewards_per_agent, steps, frames)``.
        """
        rewards = {agent_name: 0 for agent_name in env.agents}
        frames = []
        steps = 0

        # The final clause guards against spinning forever when every agent is
        # finished but neither flag dict is uniformly true (for example one
        # agent terminated and another truncated, or env.agents is empty).
        while (
            not all(env.terminations.values())
            and not all(env.truncations.values())
            and not all(self._agent_done(env, name) for name in env.agents)
        ):
            if record:
                frame = env.render()
                if frame is not None:
                    frames.append(frame)

            # NOTE(review): env.step() is not told which agent acts; this
            # presumably relies on the env selecting agents in the same order
            # as env.agents - confirm against the environment implementation.
            for agent_name in env.agents:
                if self._agent_done(env, agent_name):
                    continue

                agent = agents[agent_name]
                obs = env.observe(agent_name)
                action = agent.select_action(obs, training=True)

                if self.agent_type == "dqn":
                    # DQN picks a discrete index; the env is driven with the
                    # equivalent continuous action.
                    env.step(action_converter.convert_discrete_to_continuous(action))

                    next_obs = env.observe(agent_name)
                    reward = env.rewards.get(agent_name, 0)
                    done = self._agent_done(env, agent_name)
                    agent.store_experience(obs, action, reward, next_obs, done)

                    # Learn only once the buffer holds more than one batch.
                    if len(agent.replay_buffer) > agent.batch_size:
                        agent.learn()
                else:
                    env.step(action)
                    reward = env.rewards.get(agent_name, 0)

                rewards[agent_name] += reward
                steps += 1

                if self._agent_done(env, agent_name):
                    break

        return rewards, steps, frames

    def train(self, num_episodes=100, record_video_every=20, max_video_episodes=5):
        """Train agents and record videos.

        Args:
            num_episodes: Number of episodes to run.
            record_video_every: Record every N-th episode (starting with
                episode 0). ``0`` disables recording.
            max_video_episodes: Upper bound on the number of recorded episodes.

        Returns:
            List of dicts with keys ``episode``, ``frames``, ``scores`` and
            ``reward``, one per recorded episode.

        Raises:
            ValueError: If ``self.agent_type`` is not supported.
        """
        print(f"🎮 Starting extended training with {self.agent_type} agents")
        print(f"   Episodes: {num_episodes}")
        print(f"   Recording video every {record_video_every} episodes")
        print("=" * 60)

        videos = []  # Store video data
        env = None
        render_env = None
        try:
            # One env without rendering for speed, one that renders frames.
            env = make_soccer_env(self.env_config, render_mode=None, action_type="continuous")
            render_env = make_soccer_env(self.env_config, render_mode="rgb_array", action_type="continuous")

            agents = self.create_agents(env)
            # Built once instead of once per step.
            action_converter = ActionSpace("discrete") if self.agent_type == "dqn" else None

            for episode in range(num_episodes):
                record_this_episode = (
                    bool(record_video_every)
                    and episode % record_video_every == 0
                    and len(videos) < max_video_episodes
                )

                current_env = render_env if record_this_episode else env
                current_env.reset()

                episode_reward, steps, episode_frames = self._run_episode(
                    current_env, agents, record_this_episode, action_converter
                )

                # Save video if recorded
                if record_this_episode and episode_frames:
                    videos.append({
                        'episode': episode,
                        'frames': episode_frames,
                        'scores': current_env.scores.copy(),
                        'reward': sum(episode_reward.values())
                    })
                    print(f"📹 Recorded video for episode {episode}")

                # Update target networks for DQN
                if self.agent_type == "dqn" and episode % 10 == 0:
                    for agent in agents.values():
                        agent.update_target_network()

                # Store metrics
                self.episode_rewards.append(sum(episode_reward.values()))
                self.episode_lengths.append(steps)
                self.scores_history.append(current_env.scores.copy())

                # Print progress (the slice already copes with < 10 episodes)
                if episode % 10 == 0:
                    avg_reward = np.mean(self.episode_rewards[-10:])
                    print(f"Episode {episode}: Avg Reward (last 10): {avg_reward:.2f}, Scores: {current_env.scores}")
        finally:
            # Release both environments even when training raises.
            for open_env in (env, render_env):
                if open_env is not None:
                    open_env.close()

        return videos

print("✅ Extended trainer class defined!")

### 🎮 拡張訓練の実行

In [None]:
# Run the extended training with random agents as a quick baseline.
# `config`, `trainer` and `videos` are left in the notebook namespace.
print("🚀 Starting extended training...")
print("This will take a few minutes. Please be patient.")
print("=" * 60)

# Configuration
config = SoccerEnvironmentConfig()
config.MAX_STEPS = 500  # Shorter episodes for faster training

# Create trainer
trainer = ExtendedTrainer(config, agent_type="random")  # Start with random for quick results

# Train and record videos
videos = trainer.train(
    num_episodes=50,      # Total episodes
    record_video_every=10, # Record every 10 episodes
    max_video_episodes=5   # Maximum 5 videos
)

# Summary of the run, read from the metrics the trainer accumulated.
print(f"\n✅ Training completed!")
print(f"   Total episodes: {len(trainer.episode_rewards)}")
print(f"   Videos recorded: {len(videos)}")
print(f"   Average reward: {np.mean(trainer.episode_rewards):.2f}")
print(f"   Average episode length: {np.mean(trainer.episode_lengths):.1f}")

### 📊 訓練結果の可視化

In [None]:
# Visualize the training results of `trainer` on a 2x2 grid:
# rewards, episode lengths, per-team scores and the win/draw split.
fig, axes = plt.subplots(2, 2, figsize=(15, 10))

# Episode rewards
ax = axes[0, 0]
ax.plot(trainer.episode_rewards, alpha=0.3, label='Raw')
if len(trainer.episode_rewards) > 10:
    # 10-episode moving average; 'valid' mode drops the first 9 points, so the
    # smoothed curve is plotted starting at episode index 9.
    smoothed = np.convolve(trainer.episode_rewards, np.ones(10)/10, mode='valid')
    ax.plot(range(9, len(trainer.episode_rewards)), smoothed, linewidth=2, label='Smoothed (10-ep)')
ax.set_title('Episode Rewards Over Time')
ax.set_xlabel('Episode')
ax.set_ylabel('Total Reward')
ax.grid(True, alpha=0.3)
ax.legend()

# Episode lengths
ax = axes[0, 1]
ax.plot(trainer.episode_lengths, alpha=0.5, color='orange')
ax.set_title('Episode Lengths')
ax.set_xlabel('Episode')
ax.set_ylabel('Steps')
ax.grid(True, alpha=0.3)

# Team scores over time (index 0 = team 0, index 1 = team 1 of each score entry)
ax = axes[1, 0]
team_0_scores = [score[0] for score in trainer.scores_history]
team_1_scores = [score[1] for score in trainer.scores_history]
ax.plot(team_0_scores, label='Team 0 (Blue)', alpha=0.7, color='blue')
ax.plot(team_1_scores, label='Team 1 (Red)', alpha=0.7, color='red')
ax.set_title('Team Scores Over Episodes')
ax.set_xlabel('Episode')
ax.set_ylabel('Goals Scored')
ax.legend()
ax.grid(True, alpha=0.3)

# Win rate analysis: an episode is a win for the team with the strictly
# higher score; everything else counts as a draw.
ax = axes[1, 1]
wins_0 = sum(1 for s in trainer.scores_history if s[0] > s[1])
wins_1 = sum(1 for s in trainer.scores_history if s[1] > s[0])
draws = len(trainer.scores_history) - wins_0 - wins_1

labels = ['Team 0 Wins', 'Team 1 Wins', 'Draws']
sizes = [wins_0, wins_1, draws]
colors = ['#3498db', '#e74c3c', '#95a5a6']
ax.pie(sizes, labels=labels, colors=colors, autopct='%1.1f%%', startangle=90)
ax.set_title('Win Rate Distribution')

plt.suptitle(f'Training Results - {trainer.agent_type.upper()} Agents', fontsize=16, fontweight='bold')
plt.tight_layout()
plt.show()

# Text summary; the percentages divide by the number of recorded episodes,
# so this cell requires at least one completed episode.
print("\n📈 Statistics Summary:")
print(f"   Team 0 wins: {wins_0} ({wins_0/len(trainer.scores_history)*100:.1f}%)")
print(f"   Team 1 wins: {wins_1} ({wins_1/len(trainer.scores_history)*100:.1f}%)")
print(f"   Draws: {draws} ({draws/len(trainer.scores_history)*100:.1f}%)")
print(f"   Max reward: {max(trainer.episode_rewards):.2f}")
print(f"   Min reward: {min(trainer.episode_rewards):.2f}")

### 🎬 動画の作成と表示

In [None]:
def create_video_from_frames(frames, output_path, fps=30):
    """Encode a sequence of image frames into a video file.

    Frames that are not already ``uint8`` are converted: frames whose maximum
    is at most 1 are treated as normalised and scaled by 255, and all values
    are clipped to ``[0, 255]`` before the cast so out-of-range values
    saturate instead of wrapping around.

    Args:
        frames: Sequence of array-like frames (or ``None``).
        output_path: Destination path handed to ``imageio.mimsave``.
        fps: Frames per second of the resulting video.

    Returns:
        ``output_path`` on success, or ``None`` when there are no frames.
    """
    # len() instead of truthiness so a stacked ndarray of frames also works.
    if frames is None or len(frames) == 0:
        print("No frames to create video")
        return None

    # Convert frames to proper format
    processed_frames = []
    for frame in frames:
        frame = np.asarray(frame)
        if frame.dtype != np.uint8:
            if frame.max() <= 1:
                frame = frame * 255
            frame = np.clip(frame, 0, 255).astype(np.uint8)
        processed_frames.append(frame)

    # Create video
    imageio.mimsave(output_path, processed_frames, fps=fps)
    return output_path

def display_video(video_path, width=800, height=600):
    """Build an inline HTML5 player for an MP4 file (for display in Colab).

    The file is read fully and embedded as a base64 data URI, so the player
    works without the notebook front end having access to the file system.

    Args:
        video_path: Path of the MP4 file to embed.
        width: Player width in pixels.
        height: Player height in pixels.

    Returns:
        An ``IPython.display.HTML`` object wrapping the ``<video>`` element.
    """
    # Context manager so the file handle is closed deterministically.
    with open(video_path, 'rb') as video_file:
        video = video_file.read()
    encoded = base64.b64encode(video).decode('ascii')
    html_code = f'''
    <video width="{width}" height="{height}" controls>
        <source src="data:video/mp4;base64,{encoded}" type="video/mp4">
    </video>
    '''
    return HTML(html_code)

# Encode every episode recorded in `videos` to an MP4 under /tmp and collect
# the resulting paths in `video_paths` for the display cell that follows.
print("🎬 Creating videos from recorded episodes...")
print("=" * 60)

video_paths = []
for i, video_data in enumerate(videos):
    # File name carries the episode number the frames were recorded in.
    output_path = f'/tmp/soccer_episode_{video_data["episode"]}.mp4'
    
    # Create video
    create_video_from_frames(video_data['frames'], output_path, fps=30)
    video_paths.append(output_path)
    
    print(f"✅ Created video {i+1}: Episode {video_data['episode']}")
    print(f"   Scores: {video_data['scores']}")
    print(f"   Total Reward: {video_data['reward']:.2f}")
    print()

print(f"\n🎥 Videos saved to: /tmp/")
print("Use the next cell to display videos")

In [None]:
# Display the first recorded video inline, if any video was created.
if videos and len(video_paths) > 0:
    print(f"📺 Displaying video from Episode {videos[0]['episode']}")
    print(f"   Scores: Blue {videos[0]['scores'][0]} - Red {videos[0]['scores'][1]}")
    display(display_video(video_paths[0]))
else:
    print("No videos available to display.")
    print("Please run the training cell first.")

### 🧠 DQNエージェントでの高度な訓練

In [None]:
# Longer training run with DQN agents.
# `config_dqn`, `dqn_trainer` and `dqn_videos` are left in the notebook namespace.
print("🧠 Starting advanced DQN training...")
print("This will take longer but show learning progress.")
print("=" * 60)

# Configuration for longer training
config_dqn = SoccerEnvironmentConfig()
config_dqn.MAX_STEPS = 600  # Balanced episode length

# Create DQN trainer
dqn_trainer = ExtendedTrainer(config_dqn, agent_type="dqn")

# Train with DQN agents
dqn_videos = dqn_trainer.train(
    num_episodes=200,      # More episodes for learning
    record_video_every=40, # Record every 40 episodes to see progress
    max_video_episodes=5   # Record 5 videos total
)

# "Improvement" is the mean reward of the last 20 episodes minus the mean of
# the first 20 episodes.
print(f"\n✅ DQN Training completed!")
print(f"   Total episodes: {len(dqn_trainer.episode_rewards)}")
print(f"   Videos recorded: {len(dqn_videos)}")
print(f"   Final avg reward (last 20): {np.mean(dqn_trainer.episode_rewards[-20:]):.2f}")
print(f"   Initial avg reward (first 20): {np.mean(dqn_trainer.episode_rewards[:20]):.2f}")
print(f"   Improvement: {np.mean(dqn_trainer.episode_rewards[-20:]) - np.mean(dqn_trainer.episode_rewards[:20]):.2f}")

In [None]:
# Compare the training results of the random-agent run (`trainer`) with the
# DQN run (`dqn_trainer`). Both cells must have been executed beforehand.
if 'dqn_trainer' not in globals():
    print("Please run DQN training first to see comparison.")
elif 'trainer' not in globals():
    print("Please run the random-agent training cell first to see comparison.")
else:
    fig, axes = plt.subplots(1, 2, figsize=(15, 5))

    # Rewards comparison
    ax = axes[0]

    # A 10-episode moving average in 'valid' mode only lines up with
    # range(9, n) when at least 10 episodes exist, so shorter runs are skipped
    # instead of crashing the plot with mismatched x/y lengths.

    # Random agent rewards
    if len(trainer.episode_rewards) >= 10:
        random_rewards_smooth = np.convolve(trainer.episode_rewards, np.ones(10)/10, mode='valid')
        ax.plot(range(9, len(trainer.episode_rewards)), random_rewards_smooth,
                label='Random Agents', alpha=0.7, color='gray')

    # DQN agent rewards
    if len(dqn_trainer.episode_rewards) >= 10:
        dqn_rewards_smooth = np.convolve(dqn_trainer.episode_rewards, np.ones(10)/10, mode='valid')
        ax.plot(range(9, len(dqn_trainer.episode_rewards)), dqn_rewards_smooth,
                label='DQN Agents', linewidth=2, color='green')

    ax.set_title('Learning Progress Comparison')
    ax.set_xlabel('Episode')
    ax.set_ylabel('Average Reward (10-ep smoothed)')
    ax.legend()
    ax.grid(True, alpha=0.3)

    # Score distribution
    ax = axes[1]

    # Average scores over the last 20 episodes (the slice simply returns the
    # whole history when fewer than 20 episodes were played).
    random_scores = trainer.scores_history[-20:]
    dqn_scores = dqn_trainer.scores_history[-20:]

    random_avg = [np.mean([s[0] for s in random_scores]), np.mean([s[1] for s in random_scores])]
    dqn_avg = [np.mean([s[0] for s in dqn_scores]), np.mean([s[1] for s in dqn_scores])]

    x = np.arange(2)
    width = 0.35

    ax.bar(x - width/2, random_avg, width, label='Random', color='gray', alpha=0.7)
    ax.bar(x + width/2, dqn_avg, width, label='DQN', color='green', alpha=0.7)

    ax.set_title('Average Goals Scored (Last 20 Episodes)')
    ax.set_xticks(x)
    ax.set_xticklabels(['Team 0 (Blue)', 'Team 1 (Red)'])
    ax.set_ylabel('Average Goals')
    ax.legend()
    ax.grid(True, alpha=0.3, axis='y')

    plt.suptitle('Random vs DQN Agent Performance', fontsize=16, fontweight='bold')
    plt.tight_layout()
    plt.show()

    print("\n📊 Performance Comparison:")
    print(f"Random Agents - Avg Reward: {np.mean(trainer.episode_rewards):.2f} ± {np.std(trainer.episode_rewards):.2f}")
    print(f"DQN Agents - Avg Reward: {np.mean(dqn_trainer.episode_rewards):.2f} ± {np.std(dqn_trainer.episode_rewards):.2f}")

In [None]:
# Encode and display the videos recorded during DQN training.
if 'dqn_videos' in globals() and dqn_videos:
    print("🎬 Creating DQN training videos...")
    
    dqn_video_paths = []
    for i, video_data in enumerate(dqn_videos):
        # File name carries the episode number the frames were recorded in.
        output_path = f'/tmp/dqn_episode_{video_data["episode"]}.mp4'
        create_video_from_frames(video_data['frames'], output_path, fps=30)
        dqn_video_paths.append(output_path)
        print(f"✅ Created DQN video {i+1}: Episode {video_data['episode']}")
    
    # Display comparison: early vs late training
    print("\n📺 Early Training (Episode {}):".format(dqn_videos[0]['episode']))
    display(display_video(dqn_video_paths[0]))
    
    # The last recording is only shown when it differs from the first one.
    if len(dqn_video_paths) > 1:
        print("\n📺 Later Training (Episode {}):".format(dqn_videos[-1]['episode']))
        display(display_video(dqn_video_paths[-1]))
else:
    print("No DQN videos available. Please run DQN training first.")

# 🏆 Expert Learning and Advanced Training
## エキスパートデータを使った学習と改善された訓練

点が入らない問題を解決するため、以下を実装します：
1. エキスパート戦略の実装（ヒューリスティック）
2. 模倣学習（Imitation Learning）
3. 報酬シェーピングの改善
4. カリキュラム学習

### 🎯 エキスパートエージェントの実装

In [None]:
class ExpertAgent(BaseAgent):
    """Rule-based agent that chases the ball and kicks it toward the opposing goal.

    Team 0 attacks the goal at ``x = field_width``; any other team attacks the
    goal at ``x = 0``. The produced action is a 5-vector
    ``[move_x, move_y, kick_power, kick_dir_x, kick_dir_y]``.
    """

    # Distance to the ball (in field units) above which the agent only runs
    # toward the ball and does not try to kick.
    _CHASE_DISTANCE = 50
    # Action bounds kept as float32 arrays so clipping does not promote the
    # float32 action to float64.
    _ACTION_LOW = np.array([-1, -1, 0, -1, -1], dtype=np.float32)
    _ACTION_HIGH = np.array([1, 1, 1, 1, 1], dtype=np.float32)

    def __init__(self, agent_id: int, team: int, config: SoccerEnvironmentConfig):
        """Create the expert.

        Args:
            agent_id: Identifier forwarded to ``BaseAgent``.
            team: ``0`` attacks right; any other value attacks left.
            config: Environment configuration; only ``FIELD_SIZE`` is read here.
        """
        super().__init__(agent_id, 5)  # 5D continuous action
        self.team = team
        self.config = config
        self.field_width, self.field_height = config.FIELD_SIZE

    def select_action(self, observation: np.ndarray, training: bool = True) -> np.ndarray:
        """Select action based on expert strategy.

        Only the agent's own position (``observation[0:2]``) and the ball
        position (``observation[4:6]``) are used; both are treated as
        normalised by the field size.
        NOTE(review): the rest of the 28-dim observation layout is not used
        here - confirm the index assignment against the environment.

        Args:
            observation: Observation vector for this agent.
            training: When true, movement is randomly perturbed in about 10%
                of calls to diversify behaviour.

        Returns:
            float32 array ``[move_x, move_y, kick_power, kick_dir_x, kick_dir_y]``
            clipped to ``[-1, 1]`` (``kick_power`` to ``[0, 1]``).
        """
        # Denormalize positions for strategy
        self_x = observation[0] * self.field_width
        self_y = observation[1] * self.field_height
        ball_x = observation[4] * self.field_width
        ball_y = observation[5] * self.field_height

        dist_to_ball = np.hypot(self_x - ball_x, self_y - ball_y)

        # Determine target goal position
        goal_x = self.field_width if self.team == 0 else 0
        goal_y = self.field_height / 2

        if dist_to_ball > self._CHASE_DISTANCE:
            # Strategy 1: far from the ball - just run toward it, no kick.
            move_scale = 100
            kick_power = 0.0
            kick_dir_x = 0.0
            kick_dir_y = 0.0
        else:
            # Strategy 2: close to the ball - keep approaching (steeper gain)
            # and kick toward the goal.
            move_scale = 50
            kick_dir_x = np.clip((goal_x - ball_x) / self.field_width, -1, 1)
            kick_dir_y = np.clip((goal_y - ball_y) / self.field_height, -1, 1)

            # Strong kick when the ball is roughly level with the goal centre.
            alignment = abs(kick_dir_y) < 0.3
            kick_power = 0.8 if alignment else 0.5

        move_x = np.clip((ball_x - self_x) / move_scale, -1, 1)
        move_y = np.clip((ball_y - self_y) / move_scale, -1, 1)

        # Add some randomness for diversity
        if training and np.random.random() < 0.1:
            move_x += np.random.uniform(-0.2, 0.2)
            move_y += np.random.uniform(-0.2, 0.2)

        action = np.array([move_x, move_y, kick_power, kick_dir_x, kick_dir_y], dtype=np.float32)
        return np.clip(action, self._ACTION_LOW, self._ACTION_HIGH)

    def learn(self, experiences: List) -> Dict[str, float]:
        """Expert doesn't learn; always reports a zero loss."""
        return {"loss": 0.0}

print("✅ Expert agent with goal-scoring strategy implemented!")

### 📊 エキスパートデモンストレーションの収集

In [None]:
def collect_expert_demonstrations(num_episodes=50):
    """Collect expert demonstrations for imitation learning.

    Runs ``ExpertAgent`` for every player (players 0-1 on team 0, players 2-3
    on team 1, derived from the agent index) in a 600-step environment and
    records each (observation, action) pair.

    Args:
        num_episodes: Number of episodes to play.

    Returns:
        List with one entry per episode; each entry is a list of dicts with
        the keys ``agent``, ``observation`` and ``action``.
    """
    print(f"📊 Collecting {num_episodes} episodes of expert demonstrations...")

    config = SoccerEnvironmentConfig()
    config.MAX_STEPS = 600  # Shorter episodes
    env = make_soccer_env(config, render_mode=None, action_type="continuous")

    def agent_done(name):
        return env.terminations.get(name, False) or env.truncations.get(name, False)

    demonstrations = []
    total_goals = {'team_0': 0, 'team_1': 0}

    try:
        # Create expert agents
        expert_agents = {
            agent_name: ExpertAgent(i, i // 2, config)  # team = i // 2 -> 0 or 1
            for i, agent_name in enumerate(env.agents)
        }

        for episode in range(num_episodes):
            env.reset()
            episode_data = []

            # The final clause prevents an endless loop when every agent is
            # finished but neither flag dict is uniformly true.
            while (
                not all(env.terminations.values())
                and not all(env.truncations.values())
                and not all(agent_done(name) for name in env.agents)
            ):
                for agent_name in env.agents:
                    if agent_done(agent_name):
                        continue

                    obs = env.observe(agent_name)
                    # training=False disables the expert's random perturbation
                    # so the recorded actions are deterministic.
                    action = expert_agents[agent_name].select_action(obs, training=False)

                    # Copies guard against later in-place changes of the arrays.
                    episode_data.append({
                        'agent': agent_name,
                        'observation': obs.copy(),
                        'action': action.copy()
                    })

                    env.step(action)

                    if agent_done(agent_name):
                        break

            demonstrations.append(episode_data)
            total_goals['team_0'] += env.scores[0]
            total_goals['team_1'] += env.scores[1]

            if (episode + 1) % 10 == 0:
                print(f"Episode {episode + 1}: Scores = {env.scores}")
    finally:
        # Release the environment even if an episode raises.
        env.close()

    # Avoid a ZeroDivisionError when no episode was requested.
    goals_per_episode = (total_goals['team_0'] + total_goals['team_1']) / max(num_episodes, 1)

    print(f"\n✅ Collected {num_episodes} expert demonstrations")
    print(f"   Total goals: Team 0 = {total_goals['team_0']}, Team 1 = {total_goals['team_1']}")
    print(f"   Average goals per episode: {goals_per_episode:.2f}")

    return demonstrations

# Collect demonstrations: 50 expert episodes, kept in `expert_demonstrations`
# for the behavioral-cloning cells below.
expert_demonstrations = collect_expert_demonstrations(50)
print(f"\n📦 Demonstration data size: {len(expert_demonstrations)} episodes")
# Each episode is a list of per-agent step records, so its length is the step count.
print(f"   First episode length: {len(expert_demonstrations[0])} steps")

### 🧠 行動クローニング（Behavioral Cloning）の実装

In [None]:
class BehavioralCloningAgent(BaseAgent):
    """Agent trained with behavioral cloning from expert demonstrations.

    A feed-forward policy network maps an observation to a continuous action
    and is fitted with an MSE loss against the expert's actions.
    """

    def __init__(self, agent_id: int, obs_dim: int = 28, action_dim: int = 5,
                 hidden_dims: Tuple[int, ...] = (256, 128), lr: float = 1e-3):
        """Build the policy network and its Adam optimizer.

        Args:
            agent_id: Identifier; also selects the ``player_<agent_id>``
                samples in ``train_on_demonstrations``.
            obs_dim: Size of the observation vector.
            action_dim: Size of the action vector.
            hidden_dims: Width of each hidden layer.
            lr: Learning rate for Adam.
        """
        super().__init__(agent_id, action_dim)

        self.obs_dim = obs_dim
        self.action_dim = action_dim
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        # Policy network (observation -> action)
        self.policy_network = self._build_network(obs_dim, action_dim, hidden_dims)
        self.policy_network.to(self.device)

        self.optimizer = optim.Adam(self.policy_network.parameters(), lr=lr)
        self.loss_history = []

    def _build_network(self, input_dim: int, output_dim: int, hidden_dims: Tuple[int, ...]):
        """Build an MLP: (Linear -> ReLU -> Dropout) per hidden layer, then Linear -> Tanh."""
        layers = []
        prev_dim = input_dim

        for hidden_dim in hidden_dims:
            layers.append(nn.Linear(prev_dim, hidden_dim))
            layers.append(nn.ReLU())
            layers.append(nn.Dropout(0.1))  # Add dropout for regularization
            prev_dim = hidden_dim

        layers.append(nn.Linear(prev_dim, output_dim))
        layers.append(nn.Tanh())  # Output in [-1, 1]

        return nn.Sequential(*layers)

    def select_action(self, observation: np.ndarray, training: bool = True) -> np.ndarray:
        """Select action using learned policy.

        Args:
            observation: Observation vector of length ``obs_dim``.
            training: When true, Gaussian noise (std 0.1) is added for exploration.

        Returns:
            float32 action clipped to ``[-1, 1]``, with component 2
            (kick power) clipped to ``[0, 1]``.
        """
        # Inference must run in eval mode; otherwise the dropout layers stay
        # active and make the policy output random even with training=False.
        self.policy_network.eval()
        with torch.no_grad():
            obs_tensor = torch.as_tensor(
                observation, dtype=torch.float32, device=self.device
            ).unsqueeze(0)
            action = self.policy_network(obs_tensor).cpu().numpy().flatten()

        # Add exploration noise during training
        if training:
            noise = np.random.normal(0, 0.1, size=action.shape)
            action = action + noise

        # The lower bound of 0 on index 2 keeps kick power non-negative.
        low = np.array([-1, -1, 0, -1, -1], dtype=np.float32)
        high = np.array([1, 1, 1, 1, 1], dtype=np.float32)
        return np.clip(action, low, high).astype(np.float32)

    def train_on_demonstrations(self, demonstrations: List, epochs: int = 100, batch_size: int = 64):
        """Train the agent on expert demonstrations.

        Only steps recorded for ``player_<agent_id>`` are used. The mean loss
        of every epoch is appended to ``self.loss_history``.

        Args:
            demonstrations: List of episodes; each episode is a list of dicts
                with the keys ``agent``, ``observation`` and ``action``.
            epochs: Number of passes over the data.
            batch_size: Mini-batch size.

        Raises:
            ValueError: If the demonstrations contain no step for this agent.
        """
        print(f"\n🧠 Training behavioral cloning agent...")
        print(f"   Epochs: {epochs}, Batch size: {batch_size}")

        # Prepare training data
        own_name = f'player_{self.agent_id}'
        own_steps = [
            step_data
            for episode in demonstrations
            for step_data in episode
            if step_data['agent'] == own_name
        ]
        if not own_steps:
            raise ValueError(f"No demonstration samples found for agent {own_name!r}")

        # Stack into one ndarray first; building a tensor from a list of
        # arrays is slow.
        observations = torch.as_tensor(
            np.asarray([s['observation'] for s in own_steps], dtype=np.float32),
            device=self.device,
        )
        actions = torch.as_tensor(
            np.asarray([s['action'] for s in own_steps], dtype=np.float32),
            device=self.device,
        )

        dataset_size = observations.shape[0]
        print(f"   Training on {dataset_size} samples")

        loss_fn = nn.MSELoss()
        # select_action() switches to eval mode, so re-enable training mode.
        self.policy_network.train()

        for epoch in range(epochs):
            # Shuffle data
            indices = torch.randperm(dataset_size)

            total_loss = 0
            num_batches = 0

            for start in range(0, dataset_size, batch_size):
                batch_indices = indices[start:start + batch_size]
                batch_obs = observations[batch_indices]
                batch_actions = actions[batch_indices]

                predicted_actions = self.policy_network(batch_obs)
                loss = loss_fn(predicted_actions, batch_actions)

                self.optimizer.zero_grad()
                loss.backward()
                torch.nn.utils.clip_grad_norm_(self.policy_network.parameters(), max_norm=1.0)
                self.optimizer.step()

                total_loss += loss.item()
                num_batches += 1

            avg_loss = total_loss / num_batches
            self.loss_history.append(avg_loss)

            if (epoch + 1) % 20 == 0:
                print(f"   Epoch {epoch + 1}/{epochs}, Loss: {avg_loss:.4f}")

        self.policy_network.eval()

        print(f"\n✅ Behavioral cloning training completed!")
        # loss_history is empty when epochs <= 0 and no earlier training ran.
        if self.loss_history:
            print(f"   Final loss: {self.loss_history[-1]:.4f}")

    def learn(self, experiences: List) -> Dict[str, float]:
        """Report the most recent training loss; no online update is performed.

        ``experiences`` is accepted for interface compatibility and ignored.
        """
        return {"loss": self.loss_history[-1] if self.loss_history else 0.0}

print("✅ Behavioral cloning agent implemented!")

### 🎓 BCエージェントの訓練

In [None]:
# Create and train BC agents: one independent policy per player, each fitted
# only on that player's steps from `expert_demonstrations`.
print("🎓 Creating and training Behavioral Cloning agents...")
print("=" * 60)

# Maps agent name ("player_<i>") -> trained BehavioralCloningAgent.
bc_agents = {}

for i in range(4):  # 4 agents total
    agent_name = f"player_{i}"
    print(f"\nTraining {agent_name}...")
    
    # Create BC agent
    bc_agent = BehavioralCloningAgent(
        agent_id=i,
        obs_dim=28,
        action_dim=5,
        hidden_dims=(256, 128),
        lr=5e-4
    )
    
    # Train on expert demonstrations
    bc_agent.train_on_demonstrations(
        expert_demonstrations,
        epochs=50,
        batch_size=32
    )
    
    bc_agents[agent_name] = bc_agent

print("\n" + "=" * 60)
print("✅ All BC agents trained successfully!")

# Plot training loss: one curve per agent, one point per epoch.
plt.figure(figsize=(10, 4))
for i, (name, agent) in enumerate(bc_agents.items()):
    plt.plot(agent.loss_history, label=name, alpha=0.7)
plt.title('Behavioral Cloning Training Loss')
plt.xlabel('Epoch')
plt.ylabel('MSE Loss')
plt.legend()
plt.grid(True, alpha=0.3)
plt.show()

### 🎮 BCエージェントのテストと評価

In [None]:
def evaluate_agents(agents_dict, num_episodes=20, record_video=False):
    """Evaluate agent performance over several episodes.

    Agents act with ``training=False`` in a fresh 600-step environment.

    Args:
        agents_dict: Dict mapping agent name to an object with
            ``select_action(obs, training=...)``.
        num_episodes: Number of episodes to play.
        record_video: When true, the first episode is rendered and its frames
            are returned.

    Returns:
        Tuple ``(stats, video_frames)``. ``stats`` holds the per-episode lists
        ``scores``, ``rewards`` and ``steps`` plus the running totals
        ``goals_team_0`` and ``goals_team_1``; ``video_frames`` is the list of
        frames of the first episode (empty when nothing was recorded).
    """
    config = SoccerEnvironmentConfig()
    config.MAX_STEPS = 600

    render_mode = "rgb_array" if record_video else None
    env = make_soccer_env(config, render_mode=render_mode, action_type="continuous")

    def agent_done(name):
        return env.terminations.get(name, False) or env.truncations.get(name, False)

    stats = {
        'scores': [],
        'rewards': [],
        'steps': [],
        'goals_team_0': 0,
        'goals_team_1': 0
    }

    video_frames = []

    try:
        for episode in range(num_episodes):
            env.reset()
            episode_reward = 0
            steps = 0
            # Only the first episode is recorded.
            episode_frames = [] if record_video and episode == 0 else None

            # The final clause prevents an endless loop when every agent is
            # finished but neither flag dict is uniformly true.
            while (
                not all(env.terminations.values())
                and not all(env.truncations.values())
                and not all(agent_done(name) for name in env.agents)
            ):
                if episode_frames is not None:
                    frame = env.render()
                    if frame is not None:
                        episode_frames.append(frame)

                for agent_name in env.agents:
                    if agent_done(agent_name):
                        continue

                    obs = env.observe(agent_name)
                    action = agents_dict[agent_name].select_action(obs, training=False)
                    env.step(action)
                    episode_reward += env.rewards.get(agent_name, 0)
                    steps += 1

                    if agent_done(agent_name):
                        break

            stats['scores'].append(env.scores.copy())
            stats['rewards'].append(episode_reward)
            stats['steps'].append(steps)
            stats['goals_team_0'] += env.scores[0]
            stats['goals_team_1'] += env.scores[1]

            if episode_frames:
                video_frames = episode_frames

            if (episode + 1) % 5 == 0:
                print(f"Episode {episode + 1}: Scores = {env.scores}, Reward = {episode_reward:.2f}")
    finally:
        # Release the environment even if an episode raises.
        env.close()

    return stats, video_frames

# Test BC agents
print("🎮 Testing Behavioral Cloning agents...")
print("=" * 60)
bc_stats, bc_video = evaluate_agents(bc_agents, num_episodes=20, record_video=True)

# Divide by the number of episodes actually evaluated instead of a hard-coded
# 20, so the average stays correct if `num_episodes` above is changed.
bc_episode_count = max(len(bc_stats['scores']), 1)

print("\n📊 BC Agents Performance:")
print(f"   Total goals scored: Team 0 = {bc_stats['goals_team_0']}, Team 1 = {bc_stats['goals_team_1']}")
print(f"   Average goals per episode: {(bc_stats['goals_team_0'] + bc_stats['goals_team_1']) / bc_episode_count:.2f}")
print(f"   Average reward: {np.mean(bc_stats['rewards']):.2f}")
print(f"   Average steps: {np.mean(bc_stats['steps']):.1f}")

### 📈 エージェント性能の比較

In [None]:
# Compare different agent types (Random vs. BC vs. Expert).
# Requires `bc_stats` from the BC evaluation cell.
print("📈 Comparing agent performances...")
print("=" * 60)

# Single source of truth for the number of evaluation episodes used below.
num_eval_episodes = 20


def _goals_per_episode(stats):
    """Average combined goals per evaluated episode (0.0 if none were played)."""
    episodes = len(stats['scores'])
    if episodes == 0:
        return 0.0
    return (stats['goals_team_0'] + stats['goals_team_1']) / episodes


# Test random agents for comparison; a temporary env is created only to
# obtain the agent names.
config = SoccerEnvironmentConfig()
env_temp = make_soccer_env(config, render_mode=None, action_type="continuous")
random_agents_comp = {}
for i, agent_name in enumerate(env_temp.agents):
    random_agents_comp[agent_name] = RandomAgent(i, action_space_size=5, action_type="continuous")
env_temp.close()

print("\nTesting Random agents...")
random_stats, _ = evaluate_agents(random_agents_comp, num_episodes=num_eval_episodes, record_video=False)

# Test expert agents (players 0-1 -> team 0, players 2-3 -> team 1)
expert_agents_comp = {}
for i in range(4):
    agent_name = f"player_{i}"
    team = i // 2
    expert_agents_comp[agent_name] = ExpertAgent(i, team, config)

print("\nTesting Expert agents...")
expert_stats, expert_video = evaluate_agents(expert_agents_comp, num_episodes=num_eval_episodes, record_video=True)

# Visualization
fig, axes = plt.subplots(1, 3, figsize=(15, 5))

# Goals comparison
ax = axes[0]
agent_types = ['Random', 'BC (Learned)', 'Expert']
goals_team0 = [random_stats['goals_team_0'], bc_stats['goals_team_0'], expert_stats['goals_team_0']]
goals_team1 = [random_stats['goals_team_1'], bc_stats['goals_team_1'], expert_stats['goals_team_1']]

x = np.arange(len(agent_types))
width = 0.35
ax.bar(x - width/2, goals_team0, width, label='Team 0 (Blue)', color='blue', alpha=0.7)
ax.bar(x + width/2, goals_team1, width, label='Team 1 (Red)', color='red', alpha=0.7)
ax.set_xlabel('Agent Type')
ax.set_ylabel(f'Total Goals ({num_eval_episodes} episodes)')
ax.set_title('Goals Scored Comparison')
ax.set_xticks(x)
ax.set_xticklabels(agent_types)
ax.legend()
ax.grid(True, alpha=0.3, axis='y')

# Average rewards
ax = axes[1]
avg_rewards = [
    np.mean(random_stats['rewards']),
    np.mean(bc_stats['rewards']),
    np.mean(expert_stats['rewards'])
]
bars = ax.bar(agent_types, avg_rewards, color=['gray', 'green', 'gold'], alpha=0.7)
ax.set_ylabel('Average Reward per Episode')
ax.set_title('Reward Comparison')
ax.grid(True, alpha=0.3, axis='y')

# Add value labels on bars
for bar, val in zip(bars, avg_rewards):
    height = bar.get_height()
    ax.text(bar.get_x() + bar.get_width()/2., height,
            f'{val:.1f}', ha='center', va='bottom')

# Goals per episode distribution; each entry is normalised by the number of
# episodes that particular evaluation actually ran.
ax = axes[2]
total_goals = [
    _goals_per_episode(random_stats),
    _goals_per_episode(bc_stats),
    _goals_per_episode(expert_stats)
]
bars = ax.bar(agent_types, total_goals, color=['gray', 'green', 'gold'], alpha=0.7)
ax.set_ylabel('Average Goals per Episode')
ax.set_title('Scoring Frequency')
ax.grid(True, alpha=0.3, axis='y')

# Add value labels
for bar, val in zip(bars, total_goals):
    height = bar.get_height()
    ax.text(bar.get_x() + bar.get_width()/2., height,
            f'{val:.2f}', ha='center', va='bottom')

plt.suptitle('Agent Performance Comparison', fontsize=16, fontweight='bold')
plt.tight_layout()
plt.show()

print("\n" + "=" * 60)
print("📊 Summary:")
print(f"Random - Goals/episode: {total_goals[0]:.2f}, Avg reward: {avg_rewards[0]:.1f}")
print(f"BC     - Goals/episode: {total_goals[1]:.2f}, Avg reward: {avg_rewards[1]:.1f}")
print(f"Expert - Goals/episode: {total_goals[2]:.2f}, Avg reward: {avg_rewards[2]:.1f}")

# The +0.01 keeps the ratio finite when random agents score no goals at all.
improvement = (total_goals[1] - total_goals[0]) / (total_goals[0] + 0.01) * 100
print(f"\n🎯 BC improvement over Random: {improvement:.1f}%")

### 🎬 動画での比較

In [None]:
# Create and display comparison videos from the frames captured by
# evaluate_agents (`bc_video` and `expert_video`). Each section is skipped
# when the corresponding frame list is empty.
print("🎬 Creating comparison videos...")

if bc_video:
    # Create BC agent video
    bc_video_path = '/tmp/bc_agents_gameplay.mp4'
    create_video_from_frames(bc_video, bc_video_path, fps=30)
    print("✅ BC agents video created")
    
    print("\n📺 Behavioral Cloning Agents Gameplay:")
    display(display_video(bc_video_path))

if expert_video:
    # Create expert agent video
    expert_video_path = '/tmp/expert_agents_gameplay.mp4'
    create_video_from_frames(expert_video, expert_video_path, fps=30)
    print("\n✅ Expert agents video created")
    
    print("\n📺 Expert Agents Gameplay:")
    display(display_video(expert_video_path))

# Viewing hints for the reader (printed in Japanese).
print("\n💡 観察ポイント:")
print("  - Expert: ボールに向かって積極的に移動し、ゴールを狙う")
print("  - BC: Expertの戦略を模倣し、より多くのゴールを決める")
print("  - Random: ランダムな動きで、ゴールはほとんど入らない")

# ⏱️ Extended Episode Duration (20 seconds) with Improved Physics
## エピソードを20秒に拡張 + 物理エンジン改善版

より現実的なサッカーゲームのために、1エピソードを20秒（約600ステップ @ 30FPS）に設定。
さらに、ボールが挟まらないよう物理パラメータを最適化しました。

### 🔧 改善点:
- ⚙️ 物理パラメータの最適化
- 🚨 スタック検出・脱出システム
- 🎲 対称性破壊メカニズム
- 👁️ ビジュアル改善
- 🤖 スマートエージェント戦略

### ⚙️ 拡張エピソード設定

In [None]:
# Extended episode configuration with improved physics
from collections import deque
import math

@dataclass
class ExtendedSoccerConfig(SoccerEnvironmentConfig):
    """Soccer configuration tuned for 20-second episodes with livelier physics.

    Inherits every field of ``SoccerEnvironmentConfig`` and declares the
    defaults below. ``MIN_BALL_SPEED``, the ``STUCK_*`` values, ``ESCAPE_FORCE``
    and ``PLAYER_SEPARATION_FORCE`` are the settings ``AntiStuckSystem`` reads.

    NOTE(review): which of these names already exist on the base class (true
    overrides) and which are new fields cannot be told from this cell — confirm
    against ``SoccerEnvironmentConfig``.
    """
    # Episode length: 20 seconds at ~30 steps per second
    MAX_STEPS: int = 600  # 20 seconds * 30 steps/second

    # Player movement
    PLAYER_SPEED: float = 4.0  # Strategic play speed

    # ⚙️ IMPROVED PHYSICS PARAMETERS
    BALL_SPEED_MULTIPLIER: float = 1.8  # Faster ball (was 1.3)
    FRICTION: float = 0.96  # Less friction (was 0.93)
    BALL_DECAY: float = 0.97  # Ball moves longer

    # Enhanced collision physics
    BALL_RESTITUTION: float = 0.85  # Higher bounce (was 0.7)
    COLLISION_ELASTICITY: float = 0.9  # Elastic collisions (was 0.6)

    # 🚨 Anti-stuck mechanics
    MIN_BALL_SPEED: float = 0.5  # Below this speed the ball gets a small random nudge
    STUCK_DETECTION_FRAMES: int = 15  # Length of the speed-history window used for detection
    STUCK_VELOCITY_THRESHOLD: float = 0.8  # Mean speed over the window at or below which the ball may be stuck
    ESCAPE_FORCE: float = 8.0  # Magnitude of the velocity impulse added to free the ball
    PLAYER_SEPARATION_FORCE: float = 3.0  # Total distance the two closest players are pushed apart

    # Goal celebration
    GOAL_PAUSE_STEPS: int = 30  # 1 second pause

# Status banner for the tuned physics settings (static text, one line each)
print("\n".join((
    "✅ Improved physics configuration:",
    "   Ball friction: 0.96 (improved from 0.93)",
    "   Ball restitution: 0.85 (improved from 0.7)",
    "   Collision elasticity: 0.9 (improved from 0.6)",
    "   Anti-stuck system: Enabled",
)))

In [None]:
# 🚨 Anti-stuck detection and escape system
class AntiStuckSystem:
    """Detect a ball pinned between players and knock it free.

    The system keeps a sliding window of recent ball speeds. The ball counts as
    stuck when the window is full, its mean speed does not exceed
    ``STUCK_VELOCITY_THRESHOLD`` and at least two players are within
    ``PLAYER_RADIUS + BALL_RADIUS + 5`` of the ball.
    """

    def __init__(self, config: "ExtendedSoccerConfig"):
        """Store the config and start with an empty speed history."""
        self.config = config
        self.stuck_frames = 0  # consecutive updates in which the ball was stuck
        self.ball_velocity_history = deque(maxlen=config.STUCK_DETECTION_FRAMES)
        self.last_ball_pos = None

    @staticmethod
    def _as_float_velocity(ball_vel):
        """Return ``ball_vel`` as a float array that is safe to update in place.

        A float ``ndarray`` is returned unchanged, so callers that rely on the
        in-place update keep working. Anything else (list, tuple, int array) is
        copied into a new float array: ``+=`` with a float array would raise a
        casting error on an int array and would *extend* a Python list.
        """
        if isinstance(ball_vel, np.ndarray) and np.issubdtype(ball_vel.dtype, np.floating):
            return ball_vel
        return np.array(ball_vel, dtype=float)

    def update(self, ball_pos, ball_vel, players):
        """Record the current ball speed and return the corrected velocity.

        Args:
            ball_pos: Ball position (array-like of length 2).
            ball_vel: Ball velocity (array-like of length 2). A float ndarray is
                modified in place and returned; other inputs are copied.
            players: Iterable of objects exposing a ``position`` array.

        Returns:
            The velocity to use for the ball: with an escape impulse if the
            ball is stuck, with a small random nudge if it is merely slower
            than ``MIN_BALL_SPEED``, otherwise unchanged.
        """
        ball_vel = self._as_float_velocity(ball_vel)
        ball_speed = np.linalg.norm(ball_vel)
        self.ball_velocity_history.append(ball_speed)

        if self._is_stuck(ball_pos, players):
            self.stuck_frames += 1
            return self._apply_escape(ball_vel, ball_pos, players)

        self.stuck_frames = 0
        # A tiny perturbation breaks perfectly symmetric stand-offs
        if ball_speed < self.config.MIN_BALL_SPEED:
            ball_vel += np.random.randn(2) * 0.1
        return ball_vel

    def _is_stuck(self, ball_pos, players):
        """Return True when the ball has been slow for a full window near >= 2 players."""
        if len(self.ball_velocity_history) < self.config.STUCK_DETECTION_FRAMES:
            return False

        avg_velocity = np.mean(list(self.ball_velocity_history))
        if avg_velocity > self.config.STUCK_VELOCITY_THRESHOLD:
            return False

        reach = self.config.PLAYER_RADIUS + self.config.BALL_RADIUS + 5
        nearby = sum(
            1 for p in players if np.linalg.norm(ball_pos - p.position) < reach
        )
        return nearby >= 2

    def _apply_escape(self, ball_vel, ball_pos, players):
        """Kick the ball sideways out of the gap between the two closest players.

        The impulse is perpendicular to the line joining the two players
        closest to the ball (random side); if the two players coincide, a
        random direction is used. The two players are also pushed apart by a
        total of ``PLAYER_SEPARATION_FORCE`` (their ``position`` arrays are
        modified in place). With fewer than two players nothing changes.
        """
        ball_vel = self._as_float_velocity(ball_vel)

        # Stable sort keeps the caller's order among equally distant players
        ranked = sorted(players, key=lambda p: np.linalg.norm(ball_pos - p.position))
        if len(ranked) < 2:
            return ball_vel

        p1, p2 = ranked[0], ranked[1]
        offset = p2.position - p1.position
        gap = np.linalg.norm(offset)

        if gap > 0:
            axis = offset / gap
            escape_dir = np.array([-axis[1], axis[0]])
            if random.random() > 0.5:
                escape_dir = -escape_dir
        else:
            angle = random.uniform(0, 2 * math.pi)
            escape_dir = np.array([math.cos(angle), math.sin(angle)])

        ball_vel += escape_dir * self.config.ESCAPE_FORCE

        # Open the gap so the ball is not immediately trapped again
        if gap > 0:
            push = axis * self.config.PLAYER_SEPARATION_FORCE
            p1.position -= push * 0.5
            p2.position += push * 0.5

        return ball_vel

print('✅ Anti-stuck system initialized')

### 🎯 改良版エキスパートエージェント（20秒対応）

In [None]:
class EnhancedExpertAgent(BaseAgent):
    """Rule-based expert for 20-second episodes with stamina and stuck handling.

    Agents with an even ``agent_id`` play as attackers, odd ones as defenders.
    Actions are 5-vectors; following the inline comments in ``select_action``,
    indices 0-1 are the movement direction, 2 the kick trigger, 3 the kick
    power and 4 the sprint flag. Every returned action is clipped to [-1, 1].
    """

    def __init__(self, agent_id: int, team: int, config: ExtendedSoccerConfig):
        """Create the expert.

        Args:
            agent_id: Index of the agent; its parity selects the role.
            team: 0 attacks the right goal / defends the left, any other value
                does the opposite.
            config: Configuration providing ``FIELD_SIZE`` as (width, height).
        """
        super().__init__(agent_id, 5)
        self.team = team
        self.config = config
        self.field_width, self.field_height = config.FIELD_SIZE
        self.stamina = 1.0  # Stamina system for longer games
        self.role = 'attacker' if agent_id % 2 == 0 else 'defender'
        self.last_ball_pos = None
        self.stuck_counter = 0

    def select_action(self, observation: np.ndarray, training: bool = True) -> np.ndarray:
        """Return the expert action for ``observation``.

        Args:
            observation: Observation vector; elements 0:2 are read as the
                agent's own normalized position and 4:6 as the ball's
                normalized position. No other element is used.
            training: Accepted for interface compatibility; not used.

        Returns:
            A length-5 action array with every component in [-1, 1].
        """
        action = np.zeros(5)

        # Parse observation (normalized coordinates)
        self_pos = observation[0:2]
        ball_pos = observation[4:6]

        # Denormalize positions to field units
        self_x, self_y = self_pos[0] * self.field_width, self_pos[1] * self.field_height
        ball_x, ball_y = ball_pos[0] * self.field_width, ball_pos[1] * self.field_height

        self_pos_denorm = np.array([self_x, self_y])
        ball_pos_denorm = np.array([ball_x, ball_y])

        # 🚨 Stuck detection: count consecutive calls in which the ball moved
        # less than 2 field units since the previous call
        if self.last_ball_pos is not None:
            ball_movement = np.linalg.norm(ball_pos_denorm - self.last_ball_pos)
            if ball_movement < 2.0:
                self.stuck_counter += 1
            else:
                self.stuck_counter = 0
        self.last_ball_pos = ball_pos_denorm.copy()

        dist_to_ball = np.linalg.norm(ball_pos_denorm - self_pos_denorm)

        # Escape strategy when stuck: move in a random direction and kick if close
        if self.stuck_counter > 5:
            angle = random.uniform(0, 2 * math.pi)
            action[0:2] = np.array([math.cos(angle), math.sin(angle)])
            if dist_to_ball < 50:
                action[2] = 1.0  # Strong kick
            action[0:2] += np.random.randn(2) * 0.2  # Add noise
            # A unit vector plus noise can leave [-1, 1]; clip like the normal path
            return np.clip(action, -1, 1)

        # Stamina management for 20-second games
        self.stamina -= 0.001  # Gradual stamina decrease
        self.stamina = max(0.3, self.stamina)  # Minimum stamina

        # Speed modifier based on stamina (0.65 at minimum stamina, 1.0 when fresh)
        speed_modifier = 0.5 + 0.5 * self.stamina

        if self.role == 'attacker':
            if self.team == 0:  # Blue team attacks right goal
                goal_x, goal_y = self.field_width - 30, self.field_height / 2
            else:  # Red team attacks left goal
                goal_x, goal_y = 30, self.field_height / 2

            goal_pos = np.array([goal_x, goal_y])

            if dist_to_ball < 50:
                # Has ball - move toward goal
                direction_to_goal = goal_pos - ball_pos_denorm
                direction_to_goal = direction_to_goal / (np.linalg.norm(direction_to_goal) + 1e-6)

                # Shooting range check
                dist_to_goal = np.linalg.norm(goal_pos - ball_pos_denorm)
                if dist_to_goal < 150:  # In shooting range
                    action[2] = 1.0  # Kick
                    action[3] = 0.8  # Kick power

                action[0:2] = direction_to_goal * speed_modifier
            else:
                # Move to ball
                direction_to_ball = ball_pos_denorm - self_pos_denorm
                if np.linalg.norm(direction_to_ball) > 0:
                    direction_to_ball = direction_to_ball / np.linalg.norm(direction_to_ball)
                action[0:2] = direction_to_ball * speed_modifier

                # Sprint if far from ball
                if dist_to_ball > 200 and self.stamina > 0.5:
                    action[4] = 1.0  # Sprint
                    self.stamina -= 0.01  # Extra stamina cost

        else:  # Defender
            if self.team == 0:  # Blue team defends left goal
                goal_x, goal_y = 30, self.field_height / 2
            else:  # Red team defends right goal
                goal_x, goal_y = self.field_width - 30, self.field_height / 2

            goal_pos = np.array([goal_x, goal_y])

            # Stand on the goal-ball line, 40% of the way from the goal to the ball
            ideal_pos = goal_pos + 0.4 * (ball_pos_denorm - goal_pos)

            direction = ideal_pos - self_pos_denorm
            if np.linalg.norm(direction) > 0:
                direction = direction / np.linalg.norm(direction)

            action[0:2] = direction * speed_modifier

            # Clear ball if close
            if dist_to_ball < 40:
                # Clear away from goal
                clear_direction = ball_pos_denorm - goal_pos
                if np.linalg.norm(clear_direction) > 0:
                    clear_direction = clear_direction / np.linalg.norm(clear_direction)
                action[0:2] = clear_direction
                action[2] = 1.0  # Kick
                action[3] = 1.0  # Full power clear

        # Add small random noise to prevent perfect symmetry
        action[0:2] += np.random.randn(2) * 0.05

        # Ensure action is within bounds
        action = np.clip(action, -1, 1)

        return action

    def learn(self, *args, **kwargs):
        """Expert agents don't learn; always returns an empty dict."""
        return {}

    def save(self, path: str):
        """No-op: the expert has no learned state to save."""
        pass

    def load(self, path: str):
        """No-op: the expert has no learned state to load."""
        pass

print('✅ Enhanced expert agents with stuck detection created')

### 🎮 20秒エピソードでの訓練

In [None]:
class LongEpisodeTrainer:
    """Run and summarise 20-second matches (no learning happens here)."""

    def __init__(self, config: "ExtendedSoccerConfig"):
        """Keep the config; ``config.MAX_STEPS`` bounds each episode."""
        self.config = config
        self.episode_stats = []

    @staticmethod
    def _agent_done(env, agent_name):
        """Return True if ``agent_name`` is flagged terminated or truncated."""
        return (env.terminations.get(agent_name, False)
                or env.truncations.get(agent_name, False))

    @classmethod
    def _episode_over(cls, env):
        """Return True when no live agent remains in ``env``."""
        return not env.agents or all(cls._agent_done(env, name) for name in env.agents)

    def run_episode(self, agents_dict, record_video=False, verbose=True):
        """Run a single episode of at most ``config.MAX_STEPS`` rounds.

        In every round each live agent observes and acts once. The loop stops
        early once the environment reports the episode as finished, so
        ``stats['steps']`` and the possession figures only cover actual play.

        Args:
            agents_dict: Mapping from agent name to an object with
                ``select_action(obs, training=False)``.
            record_video: When True, render in ``rgb_array`` mode and keep
                every second frame.
            verbose: When True, print a line for every goal.

        Returns:
            ``(stats, video_frames)``. ``stats`` holds ``scores``, ``rewards``,
            ``steps``, ``goals_timeline`` (times are ``step / 30`` seconds),
            ``possession_time``, ``shots`` and ``possession_pct``.
            ``video_frames`` is a list of frames, or None when not recording.
        """
        render_mode = "rgb_array" if record_video else None
        env = make_soccer_env(self.config, render_mode=render_mode, action_type="continuous")

        try:
            env.reset()

            stats = {
                'scores': [0, 0],
                'rewards': {agent: 0 for agent in env.agents},
                'steps': 0,
                'goals_timeline': [],  # When goals were scored
                'possession_time': {0: 0, 1: 0, -1: 0},  # Ball possession time
                'shots': {0: 0, 1: 0},  # Shot attempts
            }

            video_frames = [] if record_video else None

            for step in range(self.config.MAX_STEPS):
                # Record every 2nd frame to reduce size
                if record_video and step % 2 == 0:
                    frame = env.render()
                    if frame is not None:
                        video_frames.append(frame)

                # Let every live agent act once
                for agent_name in env.agents:
                    if self._agent_done(env, agent_name):
                        continue

                    obs = env.observe(agent_name)
                    action = agents_dict[agent_name].select_action(obs, training=False)

                    # Count a shot whenever action[2] exceeds 0.7
                    if action[2] > 0.7:
                        team = int(agent_name.split('_')[1]) // 2
                        stats['shots'][team] += 1

                    env.step(action)
                    stats['rewards'][agent_name] += env.rewards.get(agent_name, 0)

                    if self._agent_done(env, agent_name):
                        break

                # Track goals (time is converted to seconds at 30 steps/second)
                for team_idx in (0, 1):
                    if env.scores[team_idx] > stats['scores'][team_idx]:
                        stats['goals_timeline'].append({'team': team_idx, 'time': step / 30})
                        stats['scores'][team_idx] = env.scores[team_idx]
                        if verbose:
                            print(f"⚽ GOAL! Team {team_idx} scores at {step/30:.1f}s")

                # Track possession (simplified): -1 means nobody has the ball
                current_possession = getattr(env, 'ball_possession', -1)
                if current_possession != -1:
                    stats['possession_time'][current_possession // 2] += 1
                else:
                    stats['possession_time'][-1] += 1

                stats['steps'] += 1

                # Do not keep counting steps/possession after the match has ended
                if self._episode_over(env):
                    break
        finally:
            # Release the environment even if an agent or the env raised
            env.close()

        # Possession percentages are relative to all recorded steps,
        # including those where nobody held the ball
        total_possession = sum(stats['possession_time'].values())
        if total_possession > 0:
            stats['possession_pct'] = {
                0: stats['possession_time'][0] / total_possession * 100,
                1: stats['possession_time'][1] / total_possession * 100
            }
        else:
            stats['possession_pct'] = {0: 0, 1: 0}

        return stats, video_frames

    def run_match(self, agents_dict, num_episodes=5, record_first=True):
        """Run ``num_episodes`` episodes and print a summary after each one.

        Args:
            agents_dict: Mapping from agent name to agent, passed to
                ``run_episode``.
            num_episodes: Number of episodes to play.
            record_first: When True, record video for the first episode only.

        Returns:
            ``(all_stats, videos)``: the per-episode stats dicts and the list
            of recorded frame lists (empty when nothing was recorded).
        """
        print(f"🏆 Running {num_episodes} x 20-second matches")
        print("=" * 60)

        all_stats = []
        videos = []

        for episode in range(num_episodes):
            print(f"\n📅 Match {episode + 1}/{num_episodes}")
            record = record_first and episode == 0

            stats, frames = self.run_episode(agents_dict, record_video=record, verbose=True)
            all_stats.append(stats)

            if frames:
                videos.append(frames)

            # Built outside the f-string: reusing the same quote character
            # inside an f-string is a SyntaxError before Python 3.12
            goal_times = [f"{g['time']:.1f}s" for g in stats['goals_timeline']]

            print(f"\n📊 Match {episode + 1} Summary:")
            print(f"   Final Score: {stats['scores'][0]} - {stats['scores'][1]}")
            print(f"   Total Rewards: {sum(stats['rewards'].values()):.1f}")
            print(f"   Shots: Team 0 = {stats['shots'][0]}, Team 1 = {stats['shots'][1]}")
            print(f"   Possession: Team 0 = {stats['possession_pct'][0]:.1f}%, Team 1 = {stats['possession_pct'][1]:.1f}%")
            print(f"   Goals scored at: {goal_times}")

        return all_stats, videos

print("✅ Long episode trainer ready!")
print("   - 20-second episodes (600 steps)")
print("   - Detailed statistics tracking")
print("   - Goal timeline and possession stats")

### 🏆 20秒マッチの実行

In [None]:
# Build the four rule-based experts and play a short series of 20-second matches
print("🎯 Creating enhanced expert agents for 20-second matches...")

extended_config = ExtendedSoccerConfig()
enhanced_experts = {}

for i in range(4):
    agent_name = f"player_{i}"
    team = i // 2  # players 0-1 form team 0, players 2-3 form team 1
    enhanced_experts[agent_name] = EnhancedExpertAgent(i, team, extended_config)

print("✅ Enhanced expert agents created")
print("   Team 0: player_0 (attacker), player_1 (defender)")
print("   Team 1: player_2 (attacker), player_3 (defender)")

# Single place to change the series length; the summary below follows it
num_matches = 3

# Run matches
print("\n" + "=" * 60)
trainer = LongEpisodeTrainer(extended_config)
match_stats, match_videos = trainer.run_match(
    enhanced_experts,
    num_episodes=num_matches,
    record_first=True,  # Record first match
)

# Overall statistics, computed from the matches that were actually played
print("\n" + "=" * 60)
matches_played = len(match_stats)
print(f"🏅 Overall Statistics ({matches_played} matches):")
total_goals_0 = sum(s['scores'][0] for s in match_stats)
total_goals_1 = sum(s['scores'][1] for s in match_stats)
avg_rewards = np.mean([sum(s['rewards'].values()) for s in match_stats])
avg_shots_0 = np.mean([s['shots'][0] for s in match_stats])
avg_shots_1 = np.mean([s['shots'][1] for s in match_stats])

# Guard the division so an empty series reports 0 instead of raising
goals_per_match = (total_goals_0 + total_goals_1) / matches_played if matches_played else 0.0

print(f"   Total Goals: Team 0 = {total_goals_0}, Team 1 = {total_goals_1}")
print(f"   Average Rewards per Match: {avg_rewards:.1f}")
print(f"   Average Shots per Match: Team 0 = {avg_shots_0:.1f}, Team 1 = {avg_shots_1:.1f}")
print(f"   Goals per 20 seconds: {goals_per_match:.2f}")

### 📊 20秒マッチの可視化

In [None]:
# Visualize match statistics: goals, shots, possession and first-match goal timeline
if match_stats:
    fig, axes = plt.subplots(2, 2, figsize=(15, 10))

    # Goals over matches (grouped bars, one group per match)
    ax = axes[0, 0]
    matches = range(1, len(match_stats) + 1)
    team0_scores = [s['scores'][0] for s in match_stats]
    team1_scores = [s['scores'][1] for s in match_stats]

    width = 0.35
    x = np.arange(len(matches))
    ax.bar(x - width/2, team0_scores, width, label='Team 0 (Blue)', color='blue', alpha=0.7)
    ax.bar(x + width/2, team1_scores, width, label='Team 1 (Red)', color='red', alpha=0.7)
    ax.set_xlabel('Match Number')
    ax.set_ylabel('Goals Scored')
    ax.set_title('Goals per 20-second Match')
    ax.set_xticks(x)
    ax.set_xticklabels(matches)
    ax.legend()
    ax.grid(True, alpha=0.3, axis='y')

    # Shots comparison
    ax = axes[0, 1]
    team0_shots = [s['shots'][0] for s in match_stats]
    team1_shots = [s['shots'][1] for s in match_stats]

    ax.plot(matches, team0_shots, 'o-', label='Team 0', color='blue', linewidth=2, markersize=8)
    ax.plot(matches, team1_shots, 's-', label='Team 1', color='red', linewidth=2, markersize=8)
    ax.set_xlabel('Match Number')
    ax.set_ylabel('Shot Attempts')
    ax.set_title('Shots on Goal per Match')
    ax.legend()
    ax.grid(True, alpha=0.3)

    # Possession percentage (dashed line marks an even 50% split)
    ax = axes[1, 0]
    team0_possession = [s['possession_pct'][0] for s in match_stats]
    team1_possession = [s['possession_pct'][1] for s in match_stats]

    ax.bar(x - width/2, team0_possession, width, label='Team 0', color='blue', alpha=0.7)
    ax.bar(x + width/2, team1_possession, width, label='Team 1', color='red', alpha=0.7)
    ax.set_xlabel('Match Number')
    ax.set_ylabel('Ball Possession (%)')
    ax.set_title('Ball Possession Statistics')
    ax.set_xticks(x)
    ax.set_xticklabels(matches)
    ax.legend()
    ax.axhline(y=50, color='gray', linestyle='--', alpha=0.5)
    ax.grid(True, alpha=0.3, axis='y')

    # Goal timeline (for first match)
    ax = axes[1, 1]
    if match_stats[0]['goals_timeline']:
        goals = match_stats[0]['goals_timeline']
        times = [g['time'] for g in goals]
        teams = [g['team'] for g in goals]
        colors = ['blue' if t == 0 else 'red' for t in teams]

        ax.scatter(times, teams, c=colors, s=200, alpha=0.7)
        ax.set_xlabel('Time (seconds)')
        ax.set_ylabel('Team')
        ax.set_title('Goal Timeline (First Match)')
        ax.set_yticks([0, 1])
        ax.set_yticklabels(['Team 0', 'Team 1'])
        ax.set_xlim(0, 20)
        ax.grid(True, alpha=0.3)

        # Loop names deliberately avoid `time`: this runs at notebook top level,
        # where `time` would overwrite the imported `time` module
        for goal_time, goal_team in zip(times, teams):
            ax.annotate(f'{goal_time:.1f}s', (goal_time, goal_team),
                       xytext=(0, 10), textcoords='offset points',
                       ha='center', fontsize=9)
    else:
        ax.text(0.5, 0.5, 'No goals scored',
               ha='center', va='center', transform=ax.transAxes,
               fontsize=14, color='gray')
        ax.set_title('Goal Timeline (First Match)')

    plt.suptitle('20-Second Match Statistics', fontsize=16, fontweight='bold')
    plt.tight_layout()
    plt.show()

print("\n📈 Key Observations:")
print("  - 20秒の試合で戦略的な展開が可能")
print("  - 攻撃と防御の役割分担が明確")
print("  - ボール支配率と得点の相関")

## 🎓 Behavioral Cloning (BC) Learning from Expert
## エキスパートからの模倣学習

エキスパートエージェントのプレイデータを収集し、Behavioral Cloningで学習します。
20秒のエピソードを100回実行して学習データを収集します。

In [None]:
# 🤖 Behavioral Cloning Agent
class BCAgent(BaseAgent):
    """Agent that imitates expert demonstrations via Behavioral Cloning.

    A small MLP with a ``tanh`` output maps observations to actions in
    [-1, 1] and is fitted to (observation, expert action) pairs with an MSE
    loss.
    """

    def __init__(self, agent_id: int, action_dim: int, observation_dim: int = 28,
                 hidden_dim: int = 128, lr: float = 0.001):
        """Build the policy network, its Adam optimizer and an empty buffer.

        Args:
            agent_id: Identifier forwarded to ``BaseAgent``.
            action_dim: Size of the action vector.
            observation_dim: Size of the observation vector.
            hidden_dim: Width of the two hidden layers.
            lr: Adam learning rate.
        """
        super().__init__(agent_id, action_dim)
        self.observation_dim = observation_dim
        self.hidden_dim = hidden_dim
        self.lr = lr

        self.policy_net = nn.Sequential(
            nn.Linear(observation_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, action_dim),
            nn.Tanh()  # Actions are in [-1, 1]
        )

        self.optimizer = optim.Adam(self.policy_net.parameters(), lr=lr)
        self.loss_fn = nn.MSELoss()

        # Storage for demonstrations: list of (observation, action) pairs
        self.demonstrations = []

    def select_action(self, observation: np.ndarray, training: bool = True) -> np.ndarray:
        """Return the policy's action for ``observation``, clipped to [-1, 1].

        When ``training`` is True, Gaussian noise (std 0.1) is added before
        clipping.
        """
        with torch.no_grad():
            obs_tensor = torch.FloatTensor(observation).unsqueeze(0)
            action = self.policy_net(obs_tensor).squeeze(0).numpy()

        if training:
            action += np.random.randn(self.action_dim) * 0.1

        return np.clip(action, -1, 1)

    def add_demonstration(self, observation: np.ndarray, action: np.ndarray):
        """Append one (observation, action) pair to the demonstration buffer.

        Both arrays are copied so later in-place changes by the caller (for
        example an environment reusing its observation buffer) cannot alter
        stored samples.
        """
        self.demonstrations.append((np.array(observation), np.array(action)))

    def train_on_demonstrations(self, batch_size: int = 64, epochs: int = 10):
        """Fit the policy to the stored demonstrations.

        Each epoch reshuffles the data and trains on every full batch of
        ``batch_size`` samples; a trailing partial batch is skipped for that
        epoch.

        Args:
            batch_size: Samples per gradient step.
            epochs: Number of passes over the data.

        Returns:
            ``{}`` when fewer than ``batch_size`` demonstrations are stored,
            otherwise a dict with ``final_loss``, ``avg_loss`` and
            ``num_demonstrations``.
        """
        dataset_size = len(self.demonstrations)
        if dataset_size < batch_size:
            print(f"Not enough demonstrations: {len(self.demonstrations)} < {batch_size}")
            return {}

        # Stack once up front instead of rebuilding Python lists for every batch
        obs_all = np.asarray([obs for obs, _ in self.demonstrations], dtype=np.float32)
        act_all = np.asarray([act for _, act in self.demonstrations], dtype=np.float32)

        losses = []

        for epoch in range(epochs):
            epoch_loss = 0.0
            num_batches = 0

            order = np.random.permutation(dataset_size)

            # `+ 1` keeps the last full batch: without it a dataset of exactly
            # `batch_size` samples would produce no batches at all
            for start in range(0, dataset_size - batch_size + 1, batch_size):
                batch_idx = order[start:start + batch_size]

                obs_tensor = torch.from_numpy(obs_all[batch_idx])
                action_tensor = torch.from_numpy(act_all[batch_idx])

                # Forward pass
                predicted_actions = self.policy_net(obs_tensor)
                loss = self.loss_fn(predicted_actions, action_tensor)

                # Backward pass
                self.optimizer.zero_grad()
                loss.backward()
                self.optimizer.step()

                epoch_loss += loss.item()
                num_batches += 1

            avg_loss = epoch_loss / num_batches if num_batches > 0 else 0
            losses.append(avg_loss)

            if epoch % 2 == 0:
                print(f"  Epoch {epoch+1}/{epochs}, Loss: {avg_loss:.4f}")

        return {
            'final_loss': losses[-1] if losses else 0,
            'avg_loss': np.mean(losses) if losses else 0,
            'num_demonstrations': len(self.demonstrations)
        }

    def learn(self, *args, **kwargs):
        """Compatibility method: trains with the default batch size and epochs."""
        return self.train_on_demonstrations()

    def save(self, path: str):
        """Save network weights, optimizer state and demonstrations to ``path``."""
        torch.save({
            'policy_net': self.policy_net.state_dict(),
            'optimizer': self.optimizer.state_dict(),
            'demonstrations': self.demonstrations
        }, path)

    def load(self, path: str):
        """Restore the state written by ``save``.

        SECURITY: the checkpoint contains numpy demonstration arrays, so it
        cannot be read with ``weights_only=True`` (the default since
        torch 2.6). ``weights_only=False`` unpickles arbitrary objects — only
        load checkpoints you created yourself.
        """
        checkpoint = torch.load(path, weights_only=False)
        self.policy_net.load_state_dict(checkpoint['policy_net'])
        self.optimizer.load_state_dict(checkpoint['optimizer'])
        self.demonstrations = checkpoint['demonstrations']

print('✅ Behavioral Cloning Agent implemented')

In [None]:
# 📊 Collect Expert Demonstrations
def collect_expert_demonstrations(num_episodes: int = 100, episode_length: int = 600):
    """Collect demonstrations from expert agents for BC learning.

    Every agent slot gets its own ``BCAgent`` whose buffer is filled with the
    (observation, expert action) pairs produced for that slot.

    Args:
        num_episodes: Number of episodes to collect (default 100).
        episode_length: Maximum number of individual agent steps per episode.
            NOTE(review): the messages below call 600 steps "20 seconds";
            whether one agent step equals one simulation frame depends on the
            environment — confirm.

    Returns:
        Dict mapping agent name to a ``BCAgent`` holding that agent's
        demonstrations (untrained).
    """
    print(f"📊 Collecting expert demonstrations...")
    print(f"   Episodes: {num_episodes}")
    print(f"   Episode length: {episode_length} steps (20 seconds)\n")

    # Create environment with 20-second episodes
    env = SoccerEnvironment(extended_config, render_mode='rgb_array')

    def _team_of(name, index):
        """Return the team for an agent slot.

        Honours an explicit ``team_0`` / ``team_1`` marker in the name and
        otherwise uses the ``index // 2`` convention used for the ``player_{i}``
        agents elsewhere in this notebook. The previous substring-only test
        put every ``player_{i}`` agent on team 1.
        """
        if 'team_0' in name:
            return 0
        if 'team_1' in name:
            return 1
        return index // 2

    def _make_experts():
        """Create one fresh expert per agent slot."""
        return {
            name: EnhancedExpertAgent(i, _team_of(name, i), extended_config)
            for i, name in enumerate(env.possible_agents)
        }

    # BC agents act purely as per-slot demonstration buffers here
    bc_agents = {
        name: BCAgent(i, action_dim=5, observation_dim=28)
        for i, name in enumerate(env.possible_agents)
    }

    total_steps = 0
    try:
        # NOTE(review): `tqdm` must already be imported by an earlier cell
        for episode in tqdm(range(num_episodes), desc="Collecting demonstrations"):
            # Fresh experts per episode so stamina and stuck counters from the
            # previous episode do not leak into this one
            expert_agents = _make_experts()
            env.reset()

            episode_steps = 0
            while env.agents and episode_steps < episode_length:
                agent_id = env.agent_selection

                # Stop as soon as the selected agent is finished: no further
                # meaningful demonstrations can be produced for this episode
                if env.terminations.get(agent_id, False) or env.truncations.get(agent_id, False):
                    break

                # Read the observation through `observe`, as the match runner
                # does, instead of relying on reset()/step() return values
                obs = env.observe(agent_id)

                # Get expert action and store the demonstration
                expert_action = expert_agents[agent_id].select_action(obs, training=False)
                bc_agents[agent_id].add_demonstration(obs, expert_action)

                env.step(expert_action)

                episode_steps += 1
                total_steps += 1

            if (episode + 1) % 20 == 0:
                print(f"  Episode {episode + 1}/{num_episodes} completed")
    finally:
        # Release the environment even if collection fails midway
        env.close()

    print(f"\n✅ Data collection complete!")
    print(f"   Total steps: {total_steps:,}")
    print(f"   Demonstrations per agent:")
    for agent_name, bc_agent in bc_agents.items():
        print(f"     {agent_name}: {len(bc_agent.demonstrations):,}")

    return bc_agents

print('✅ Demonstration collection function ready')

### 🎯 Execute BC Learning (100 episodes × 20 seconds)

In [None]:
# 🎓 Execute Behavioral Cloning Learning: collect expert data, then fit each BC agent
print("="*60)
print("🎓 BEHAVIORAL CLONING LEARNING")
print("="*60)
print("\n📚 Phase 1: Collect Expert Demonstrations\n")

# Collection size; the closing messages below are derived from these values
num_demo_episodes = 100
demo_episode_steps = 600

bc_agents = collect_expert_demonstrations(
    num_episodes=num_demo_episodes,
    episode_length=demo_episode_steps,
)

print("\n" + "="*60)
print("📖 Phase 2: Train BC Agents\n")

# Train each BC agent on its own demonstrations
training_results = {}
for agent_name, bc_agent in bc_agents.items():
    print(f"\n🤖 Training {agent_name}...")
    results = bc_agent.train_on_demonstrations(batch_size=64, epochs=10)
    training_results[agent_name] = results
    # train_on_demonstrations returns {} when there is too little data;
    # indexing it would raise KeyError
    if results:
        print(f"  ✅ Training complete! Final loss: {results['final_loss']:.4f}")
    else:
        print("  ⚠️ Training skipped: not enough demonstrations")

print("\n" + "="*60)
print("📊 TRAINING SUMMARY")
print("="*60)
for agent_name, results in training_results.items():
    print(f"\n{agent_name}:")
    if not results:
        print("  Skipped (not enough demonstrations)")
        continue
    print(f"  Demonstrations: {results['num_demonstrations']:,}")
    print(f"  Final Loss: {results['final_loss']:.4f}")
    print(f"  Average Loss: {results['avg_loss']:.4f}")

print("\n✅ Behavioral Cloning learning complete!")
print(f"   All agents have learned from {num_demo_episodes} episodes of expert play.")
print(f"   Each episode was 20 seconds ({demo_episode_steps} steps).")

### 🏆 Evaluate BC Agents vs Experts

In [None]:
# 🏆 Evaluate learned BC agents
def evaluate_bc_agents(bc_agents: Dict, num_episodes: int = 5, max_steps: int = 600):
    """Play evaluation matches with the BC agents and report aggregate results.

    Every action is chosen by ``bc_agents[env.agent_selection]`` with
    ``training=False``. Goals and rewards are summed over all matches,
    printed as a summary and returned. The first and the last match are also
    written to ``bc_match_<n>.mp4`` through ``save_video``.

    Args:
        bc_agents: Mapping from agent id (as reported by
            ``env.agent_selection``) to an object exposing
            ``select_action(obs, training=False)``.
        num_episodes: Number of matches to play.
        max_steps: Upper bound on environment steps per match. The default of
            600 is what this notebook treats as a 20-second match.

    Returns:
        Tuple ``(total_goals, total_rewards)``: a two-element list with the
        goals summed per team, and a ``defaultdict`` with the rewards summed
        per agent.
    """
    # Steps between two captured frames; the notebook treats 30 steps as one
    # second, which is why the video below is written at 1 fps.
    frame_interval = 30

    print(f"\n🏆 Evaluating BC Agents ({num_episodes} matches)\n")

    env = SoccerEnvironment(extended_config, render_mode='rgb_array')

    total_goals = [0, 0]
    total_rewards = defaultdict(float)

    # try/finally guarantees the environment is closed even when a match
    # raises part-way through.
    try:
        for episode in range(num_episodes):
            print(f"Match {episode + 1}/{num_episodes}")
            observations, _ = env.reset()

            # Only the first and the last match are saved, so frames are
            # rendered for those matches only.
            record_video = episode in (0, num_episodes - 1)
            episode_rewards = defaultdict(float)
            frames = []

            step_count = 0
            while env.agents and step_count < max_steps:
                agent_id = env.agent_selection
                # reset()/step() may hand back either a per-agent dict or a
                # single observation; support both.
                obs = observations[agent_id] if isinstance(observations, dict) else observations

                # Get BC agent action
                action = bc_agents[agent_id].select_action(obs, training=False)

                # Step environment
                observations, rewards, terminations, truncations, infos = env.step(action)

                # Track rewards
                for agent, reward in rewards.items():
                    episode_rewards[agent] += reward

                # Capture frame periodically (every second)
                if record_video and step_count % frame_interval == 0:
                    frame = env.render()
                    # Skip missing frames so save_video never receives None.
                    if frame is not None:
                        frames.append(frame)

                step_count += 1

            # Get final score; fall back to 0-0 if the env exposes no score.
            final_score = getattr(env, 'score', [0, 0])
            total_goals[0] += final_score[0]
            total_goals[1] += final_score[1]

            print(f"  Score: Team 0: {final_score[0]} - Team 1: {final_score[1]}")

            # Accumulate rewards
            for agent, reward in episode_rewards.items():
                total_rewards[agent] += reward

            # Save video for first and last episode
            if record_video and frames:
                video_name = f'bc_match_{episode + 1}.mp4'
                save_video(frames, video_name, fps=1)  # 1 fps since we capture every second
                print(f"  Video saved: {video_name}")
    finally:
        env.close()

    # Guard the averages against ZeroDivisionError when no match was requested.
    divisor = max(num_episodes, 1)

    print("\n" + "="*60)
    print("📊 EVALUATION RESULTS")
    print("="*60)
    print("\nTotal Goals:")
    print(f"  Team 0: {total_goals[0]}")
    print(f"  Team 1: {total_goals[1]}")
    print("\nAverage Goals per Match:")
    print(f"  Team 0: {total_goals[0] / divisor:.2f}")
    print(f"  Team 1: {total_goals[1] / divisor:.2f}")
    print("\nAverage Rewards:")
    for agent in sorted(total_rewards):
        avg_reward = total_rewards[agent] / divisor
        print(f"  {agent}: {avg_reward:.2f}")

    return total_goals, total_rewards

# Run evaluation: play 5 matches with the BC agents created above and bind
# the returned per-team goal totals and per-agent reward totals to
# bc_goals / bc_rewards.
print("🎮 Starting BC agent evaluation...")
bc_goals, bc_rewards = evaluate_bc_agents(bc_agents, num_episodes=5)
print("\n✅ Evaluation complete! BC agents have been trained and tested.")

### 🎬 20秒マッチ動画

In [None]:
# Create and display 20-second match video
#
# `match_videos` is presumably filled by an earlier match cell (verify). If
# that cell has not been run the name does not exist at all, so it is looked
# up defensively: the "run the match first" hint below is shown instead of
# the cell failing with a NameError.
try:
    _recorded_matches = match_videos
except NameError:
    _recorded_matches = None

if _recorded_matches and _recorded_matches[0]:
    # Frames of the first recorded match.
    _match_frames = _recorded_matches[0]

    print("🎬 Creating 20-second match video...")
    print(f"   Frames: {len(_match_frames)}")
    print("   Duration: 20 seconds")
    print("   FPS: 15 (downsampled from 30)")

    video_path = '/tmp/soccer_20sec_match.mp4'
    create_video_from_frames(_match_frames, video_path, fps=15)  # 15 FPS for smaller file

    print("\n✅ Video created successfully!")
    print("\n📺 20-Second Soccer Match:")
    display(display_video(video_path))

    # Viewing hints (the user-facing text is intentionally kept in Japanese).
    print("\n🎯 視聴ポイント:")
    print("  - 20秒間の戦略的な攻防")
    print("  - アタッカーとディフェンダーの連携")
    print("  - スタミナによる後半の動きの変化")
    print("  - ゴールタイミングとチャンス創出")
else:
    print("No video available. Please run the match first.")