# 🏆 Advanced Chess RL Training

## Notebook untuk Training AI Catur yang Kuat

**Target:** Mengalahkan Stockfish pada level rendah-menengah

### ⚠️ Catatan Penting:
- Training membutuhkan **beberapa jam** GPU time
- Model akan di-checkpoint setiap beberapa iterasi
- Untuk hasil terbaik, jalankan training **berulang kali**

---

## 1️⃣ Setup Environment

In [1]:
# Check GPU
import torch

# Choose the compute device once; describe the GPU when CUDA is usable,
# otherwise warn that training will fall back to the (slow) CPU path.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

if device.type == 'cuda':
    gpu_name = torch.cuda.get_device_name(0)
    # Scaled by 1e9 so the figure printed below reads in GB.
    gpu_mem = torch.cuda.get_device_properties(0).total_memory / 1e9
    print(f"‚úÖ GPU: {gpu_name}")
    print(f"   Memory: {gpu_mem:.1f} GB")
else:
    print("‚ùå GPU tidak tersedia! Training akan sangat lambat.")

✅ GPU: Tesla T4
   Memory: 15.8 GB


In [2]:
# Install the Python dependencies through the notebook's `!` shell escape
# (`-q` keeps pip's output short).
# NOTE(review): the confirmation below is printed unconditionally; pip's exit
# status is not checked here.
!pip install -q python-chess gymnasium tqdm matplotlib stockfish
print("‚úÖ Dependencies installed!")

[2K     [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m6.1/6.1 MB[0m [31m78.1 MB/s[0m eta [36m0:00:00[0m:00:01[0m0:01[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for chess (setup.py) ... [?25l[?25hdone
✅ Dependencies installed!


In [3]:
# Install the Stockfish engine binary (used later for evaluation) through the
# notebook's `!` shell escape.
# NOTE(review): the confirmation below is printed unconditionally; apt-get's
# exit status is not checked here.
!apt-get install -qq stockfish
print("‚úÖ Stockfish installed!")

Selecting previously unselected package stockfish.
(Reading database ... 121689 files and directories currently installed.)
Preparing to unpack .../stockfish_14.1-1_amd64.deb ...
Unpacking stockfish (14.1-1) ...
Setting up stockfish (14.1-1) ...
Processing triggers for man-db (2.10.2-1) ...
✅ Stockfish installed!


## 2️⃣ Chess Environment (Enhanced)

In [4]:
import chess
import numpy as np
import gymnasium as gym
from gymnasium import spaces
import random

class ChessEnv(gym.Env):
    """Chess environment with optional material/check reward shaping.

    Observations are ``(18, 8, 8)`` float32 arrays: 12 piece planes, one
    side-to-move plane, four castling-right planes and one en-passant plane.
    Actions are integer indices into a fixed table of (from-square,
    displacement) moves built by :meth:`_init_move_encoding`.

    NOTE(review): terminal and material rewards are expressed from White's
    point of view regardless of which side made the move, while the check
    bonus is always positive. Callers that train a single policy for both
    colours should confirm this is the intended convention.
    """

    # Piece values used for material-based reward shaping.
    PIECE_VALUES = {
        chess.PAWN: 1,
        chess.KNIGHT: 3,
        chess.BISHOP: 3,
        chess.ROOK: 5,
        chess.QUEEN: 9,
        chess.KING: 0
    }

    def __init__(self, max_moves=200, reward_shaping=True):
        """Create the environment.

        Args:
            max_moves: Number of plies after which an episode is truncated.
            reward_shaping: When true, add small material/check rewards on
                non-terminal steps.
        """
        super().__init__()
        self.board = chess.Board()
        self.max_moves = max_moves
        self.move_count = 0
        self.reward_shaping = reward_shaping
        self.prev_material = 0

        # State: 18 channels x 8 x 8
        # 12 piece planes + turn + castling (4) + en passant
        self.observation_space = spaces.Box(
            low=0, high=1, shape=(18, 8, 8), dtype=np.float32
        )
        self.action_space = spaces.Discrete(4672)
        self._init_move_encoding()

    def _init_move_encoding(self):
        """Build the ``action <-> move`` lookup tables.

        Per square there are 64 slots (56 sliding displacements plus 8 knight
        jumps); off-board targets leave their slot unused. Squares on the
        seventh rank get 9 extra slots for White underpromotions. A queen
        promotion has no slot of its own: it shares the slot of the plain
        from/to move and is resolved when the mask is built and when the
        move is played.
        """
        self.action_to_move = {}
        self.move_to_action = {}

        directions = []
        for d in [(0,1), (0,-1), (1,0), (-1,0), (1,1), (1,-1), (-1,1), (-1,-1)]:
            for dist in range(1, 8):
                directions.append((d[0]*dist, d[1]*dist))
        for d in [(1,2), (2,1), (2,-1), (1,-2), (-1,-2), (-2,-1), (-2,1), (-1,2)]:
            directions.append(d)

        underpromotions = (chess.KNIGHT, chess.BISHOP, chess.ROOK)

        action = 0
        for sq in range(64):
            from_rank, from_file = sq // 8, sq % 8
            for dx, dy in directions:
                to_rank = from_rank + dy
                to_file = from_file + dx
                if 0 <= to_rank < 8 and 0 <= to_file < 8:
                    to_sq = to_rank * 8 + to_file
                    move = chess.Move(sq, to_sq)
                    self.action_to_move[action] = move
                    self.move_to_action[move.uci()] = action
                action += 1
            if from_rank == 6:
                # White underpromotions (seventh rank -> eighth rank).
                for dx in [-1, 0, 1]:
                    for promo in underpromotions:
                        to_file = from_file + dx
                        if 0 <= to_file < 8:
                            to_sq = 7 * 8 + to_file
                            move = chess.Move(sq, to_sq, promotion=promo)
                            self.action_to_move[action] = move
                            self.move_to_action[move.uci()] = action
                        action += 1

        # Black underpromotions (second rank -> first rank) were missing from
        # the table. They are appended after every pre-existing slot so that
        # the indices of all previously encoded moves stay the same.
        for from_file in range(8):
            sq = 1 * 8 + from_file
            for dx in [-1, 0, 1]:
                for promo in underpromotions:
                    to_file = from_file + dx
                    if 0 <= to_file < 8:
                        move = chess.Move(sq, to_file, promotion=promo)
                        self.action_to_move[action] = move
                        self.move_to_action[move.uci()] = action
                    action += 1

    def _get_material_score(self, color):
        """Return the summed ``PIECE_VALUES`` of all pieces of ``color``."""
        return sum(
            len(self.board.pieces(piece_type, color)) * value
            for piece_type, value in self.PIECE_VALUES.items()
        )

    def encode_state(self):
        """Encode the current board as an ``(18, 8, 8)`` float32 array."""
        state = np.zeros((18, 8, 8), dtype=np.float32)

        piece_to_channel = {
            (chess.PAWN, True): 0, (chess.KNIGHT, True): 1, (chess.BISHOP, True): 2,
            (chess.ROOK, True): 3, (chess.QUEEN, True): 4, (chess.KING, True): 5,
            (chess.PAWN, False): 6, (chess.KNIGHT, False): 7, (chess.BISHOP, False): 8,
            (chess.ROOK, False): 9, (chess.QUEEN, False): 10, (chess.KING, False): 11
        }

        for sq in chess.SQUARES:
            piece = self.board.piece_at(sq)
            if piece:
                rank, file = sq // 8, sq % 8
                ch = piece_to_channel[(piece.piece_type, piece.color)]
                state[ch, rank, file] = 1.0

        state[12, :, :] = 1.0 if self.board.turn else 0.0
        # Castling rights are written to the first row of their plane only.
        state[13, 0, :] = float(self.board.has_kingside_castling_rights(True))
        state[14, 0, :] = float(self.board.has_queenside_castling_rights(True))
        state[15, 0, :] = float(self.board.has_kingside_castling_rights(False))
        state[16, 0, :] = float(self.board.has_queenside_castling_rights(False))

        if self.board.ep_square is not None:
            ep_rank, ep_file = self.board.ep_square // 8, self.board.ep_square % 8
            state[17, ep_rank, ep_file] = 1.0

        return state

    def get_legal_action_mask(self):
        """Return a boolean vector marking the actions that are legal now.

        Queen promotions are looked up under the plain from/to key because
        that is the slot they share in the move table; without this they
        would never be marked legal.
        """
        mask = np.zeros(self.action_space.n, dtype=bool)
        for move in self.board.legal_moves:
            uci = move.uci()
            if move.promotion == chess.QUEEN:
                uci = uci[:4]
            index = self.move_to_action.get(uci)
            if index is not None:
                mask[index] = True
        return mask

    def _resolve_move(self, action):
        """Translate ``action`` into a legal move, or ``None`` if impossible.

        A table move that is not legal as-is is matched against the legal
        moves with the same from/to squares (the promotion variants),
        preferring the queen promotion explicitly.
        """
        move = self.action_to_move.get(action)
        if move is None:
            return None
        if move in self.board.legal_moves:
            return move
        candidates = [
            legal for legal in self.board.legal_moves
            if legal.from_square == move.from_square
            and legal.to_square == move.to_square
        ]
        if not candidates:
            return None
        for legal in candidates:
            if legal.promotion == chess.QUEEN:
                return legal
        return candidates[0]

    def reset(self, seed=None, options=None):
        """Start a new game and return ``(observation, info)``."""
        super().reset(seed=seed)
        self.board = chess.Board()
        self.move_count = 0
        white_material = self._get_material_score(True)
        black_material = self._get_material_score(False)
        self.prev_material = white_material - black_material
        return self.encode_state(), {}

    def step(self, action):
        """Play ``action`` and return the Gymnasium 5-tuple.

        Returns:
            ``(observation, reward, terminated, truncated, info)``. An action
            that cannot be mapped to a legal move ends the episode with
            reward ``-1.0`` and ``info == {'illegal': True}`` without
            changing the board.
        """
        move = self._resolve_move(action)
        if move is None:
            return self.encode_state(), -1.0, True, False, {'illegal': True}

        self.board.push(move)
        self.move_count += 1

        reward = 0.0
        terminated = False
        truncated = False

        if self.board.is_checkmate():
            # After the push, the side to move is the side that was mated.
            reward = 1.0 if not self.board.turn else -1.0
            terminated = True
        elif self.board.is_game_over():
            reward = 0.0
            terminated = True
        elif self.move_count >= self.max_moves:
            truncated = True
        elif self.reward_shaping:
            # Reward shaping based on the change in material balance.
            white_material = self._get_material_score(True)
            black_material = self._get_material_score(False)
            current_material = white_material - black_material
            material_diff = current_material - self.prev_material
            self.prev_material = current_material

            # Small bonus for capturing pieces
            if material_diff != 0:
                reward = material_diff * 0.01

            # Bonus for check
            if self.board.is_check():
                reward += 0.01

        return self.encode_state(), reward, terminated, truncated, {}

print("‚úÖ ChessEnv with reward shaping defined!")

✅ ChessEnv with reward shaping defined!


## 3️⃣ Neural Network (Larger)

In [5]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class SEBlock(nn.Module):
    """Squeeze-and-Excitation gate that rescales each channel of its input.

    The input is average-pooled to one value per channel, passed through a
    two-layer bottleneck (``channels -> channels // reduction -> channels``)
    and squashed with a sigmoid; the result multiplies the input channel-wise.
    """

    def __init__(self, channels, reduction=4):
        super().__init__()
        bottleneck = channels // reduction
        self.pool = nn.AdaptiveAvgPool2d(1)
        self.fc1 = nn.Linear(channels, bottleneck)
        self.fc2 = nn.Linear(bottleneck, channels)

    def forward(self, x):
        batch, chans, _, _ = x.size()
        squeezed = self.pool(x).view(batch, chans)
        hidden = F.relu(self.fc1(squeezed))
        gate = torch.sigmoid(self.fc2(hidden))
        return x * gate.view(batch, chans, 1, 1)

class ResidualBlock(nn.Module):
    """Two 3x3 conv + batch-norm layers with a skip connection.

    When ``use_se`` is true the branch output is gated by an ``SEBlock``
    before it is added back to the input; otherwise the gate is an identity.
    """

    def __init__(self, channels, use_se=True):
        super().__init__()
        self.conv1 = nn.Conv2d(channels, channels, 3, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(channels)
        self.conv2 = nn.Conv2d(channels, channels, 3, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(channels)
        if use_se:
            self.se = SEBlock(channels)
        else:
            self.se = nn.Identity()

    def forward(self, x):
        branch = self.conv1(x)
        branch = F.relu(self.bn1(branch))
        branch = self.conv2(branch)
        branch = self.se(self.bn2(branch))
        # Skip connection, then the final non-linearity.
        return F.relu(branch + x)

class ChessNetwork(nn.Module):
    """Policy/value network: conv stem, residual tower and two heads.

    ``forward`` returns per-action log-probabilities and a value squashed by
    ``tanh``. Every even-indexed residual block carries an SE gate.
    """

    def __init__(self, input_channels=18, num_filters=256, num_blocks=12, action_size=4672):
        super().__init__()

        self.action_size = action_size

        # Stem: lift the input planes to ``num_filters`` feature maps.
        stem_layers = [
            nn.Conv2d(input_channels, num_filters, 3, padding=1, bias=False),
            nn.BatchNorm2d(num_filters),
            nn.ReLU(),
        ]
        self.input_conv = nn.Sequential(*stem_layers)

        # Residual tower; SE attention on blocks 0, 2, 4, ...
        tower = []
        for idx in range(num_blocks):
            tower.append(ResidualBlock(num_filters, use_se=(idx % 2 == 0)))
        self.res_blocks = nn.ModuleList(tower)

        # Policy head: 1x1 conv to 80 planes, then a linear layer over the
        # flattened 8x8 board to one logit per action.
        policy_layers = [
            nn.Conv2d(num_filters, 80, 1, bias=False),
            nn.BatchNorm2d(80),
            nn.ReLU(),
            nn.Flatten(),
            nn.Linear(80 * 64, action_size),
        ]
        self.policy_head = nn.Sequential(*policy_layers)

        # Value head: 1x1 conv to 32 planes, a hidden layer, scalar + tanh.
        value_layers = [
            nn.Conv2d(num_filters, 32, 1, bias=False),
            nn.BatchNorm2d(32),
            nn.ReLU(),
            nn.Flatten(),
            nn.Linear(32 * 64, 256),
            nn.ReLU(),
            nn.Linear(256, 1),
            nn.Tanh(),
        ]
        self.value_head = nn.Sequential(*value_layers)

    def forward(self, x, legal_mask=None):
        """Return ``(log_probs, value)`` for a batch of encoded boards.

        When ``legal_mask`` is given, logits where the mask is false are set
        to ``-inf`` before the log-softmax.
        """
        features = self.input_conv(x)
        for block in self.res_blocks:
            features = block(features)

        logits = self.policy_head(features)
        if legal_mask is not None:
            logits = logits.masked_fill(~legal_mask, float('-inf'))

        log_probs = F.log_softmax(logits, dim=-1)
        return log_probs, self.value_head(features)

print("‚úÖ Enhanced ChessNetwork with SE blocks defined!")

✅ Enhanced ChessNetwork with SE blocks defined!


## 4️⃣ Create Components

In [6]:
# Hyperparameters for the advanced training run.
CONFIG = dict(
    # Network
    input_channels=18,
    num_filters=256,      # Larger: 256 (vs 128)
    num_blocks=12,        # Deeper: 12 (vs 6)

    # PPO
    gamma=0.99,
    gae_lambda=0.95,
    clip_range=0.2,
    entropy_coef=0.02,    # Higher, to encourage exploration
    value_coef=0.5,
    max_grad_norm=0.5,

    # Training
    learning_rate=1e-4,   # Smaller, for stability
    n_steps=512,          # Longer rollouts: 512 (vs 128)
    n_epochs=4,
    batch_size=128,       # Larger: 128 (vs 64)
    total_updates=5000,   # FAR more updates: 5000 (vs 100)

    # Checkpointing
    save_interval=500,    # Save every 500 updates
    eval_interval=250,    # Evaluate every 250 updates
)

# Echo the configuration, one indented "key: value" line per entry.
print("üìã Configuration:")
print("\n".join(f"   {name}: {setting}" for name, setting in CONFIG.items()))

📋 Configuration:
   input_channels: 18
   num_filters: 256
   num_blocks: 12
   gamma: 0.99
   gae_lambda: 0.95
   clip_range: 0.2
   entropy_coef: 0.02
   value_coef: 0.5
   max_grad_norm: 0.5
   learning_rate: 0.0001
   n_steps: 512
   n_epochs: 4
   batch_size: 128
   total_updates: 5000
   save_interval: 500
   eval_interval: 250


In [7]:
# Build the environment and the policy/value network on the chosen device.
env = ChessEnv(max_moves=200, reward_shaping=True)
print(f"‚úÖ Environment: obs={env.observation_space.shape}, actions={env.action_space.n}")

network = ChessNetwork(
    action_size=4672,
    num_blocks=CONFIG['num_blocks'],
    num_filters=CONFIG['num_filters'],
    input_channels=CONFIG['input_channels'],
)
network = network.to(device)

# Count every parameter element in the network.
num_params = sum(param.numel() for param in network.parameters())
print(f"‚úÖ Network: {num_params:,} parameters")

# AdamW: the weight decay acts as regularisation.
optimizer = torch.optim.AdamW(
    network.parameters(),
    weight_decay=1e-4,
    lr=CONFIG['learning_rate'],
)

# Cosine-anneal the learning rate down to eta_min over the whole run.
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
    optimizer,
    eta_min=1e-6,
    T_max=CONFIG['total_updates'],
)

print("‚úÖ Optimizer: AdamW with cosine annealing LR")

✅ Environment: obs=(18, 8, 8), actions=4672
✅ Network: 38,887,585 parameters
✅ Optimizer: AdamW with cosine annealing LR


## 5️⃣ PPO Training Loop

In [8]:
from torch.distributions import Categorical
from tqdm import tqdm
import matplotlib.pyplot as plt
import os
import time

# Per-update training metrics: one independent, initially empty list per series.
history = {
    metric: []
    for metric in (
        'policy_loss', 'value_loss', 'entropy',
        'rewards', 'game_lengths', 'lr',
    )
}

def select_action(state, legal_mask):
    """Sample one action from the masked policy for a single position.

    Args:
        state: Encoded board for one position (batched here with a leading
            dimension of 1 before it is fed to the network).
        legal_mask: Boolean vector over the action space; masked-out actions
            get probability zero.

    Returns:
        ``(action, log_prob, value)`` as plain Python numbers: the sampled
        action index, its log-probability under the policy, and the value
        estimate for ``state``.
    """
    # Run inference in eval mode, then put the network back into whatever
    # mode the caller had it in. Unconditionally switching to train mode on
    # exit would silently undo a caller's network.eval().
    was_training = network.training
    network.eval()
    try:
        with torch.no_grad():
            state_t = torch.FloatTensor(state).unsqueeze(0).to(device)
            mask_t = torch.BoolTensor(legal_mask).unsqueeze(0).to(device)
            log_probs, value = network(state_t, mask_t)
            probs = torch.exp(log_probs)
            dist = Categorical(probs)
            action = dist.sample()
    finally:
        network.train(was_training)

    action_index = action.item()
    return action_index, log_probs[0, action_index].item(), value.item()

def compute_gae(rewards, values, dones, last_value, gamma=None, gae_lambda=None):
    """Compute Generalized Advantage Estimation over one rollout.

    Args:
        rewards: Per-step rewards.
        values: Per-step value estimates, same length as ``rewards``.
        dones: Per-step episode-end flags; a true flag stops both the
            bootstrap and the advantage recursion at that step.
        last_value: Value estimate for the state following the final step.
        gamma: Discount factor; defaults to ``CONFIG['gamma']``.
        gae_lambda: GAE smoothing factor; defaults to
            ``CONFIG['gae_lambda']``.

    Returns:
        ``(advantages, returns)`` as float64 arrays, where
        ``returns = advantages + values``.
    """
    if gamma is None:
        gamma = CONFIG['gamma']
    if gae_lambda is None:
        gae_lambda = CONFIG['gae_lambda']

    # Force a float dtype: np.zeros_like on a list of integer rewards would
    # create an integer array and silently truncate every advantage.
    rewards = np.asarray(rewards, dtype=np.float64)
    values = np.asarray(values, dtype=np.float64)
    not_done = 1.0 - np.asarray(dones, dtype=np.float64)

    # Value of the state after each step; the final one is bootstrapped.
    next_values = np.append(values[1:], last_value)
    deltas = rewards + gamma * next_values * not_done - values

    advantages = np.zeros_like(rewards)
    last_gae = 0.0
    for t in reversed(range(len(rewards))):
        last_gae = deltas[t] + gamma * gae_lambda * not_done[t] * last_gae
        advantages[t] = last_gae

    returns = advantages + values
    return advantages, returns

def evaluate_vs_random(n_games=20):
    """Play ``n_games`` as White against a uniformly random Black player.

    Returns:
        ``(win_rate, draw_rate)`` as fractions of ``n_games``. Games whose
        result is neither a White win nor a draw count towards neither rate.
    """
    network.eval()
    tally = {'1-0': 0, '1/2-1/2': 0}

    for _ in range(n_games):
        state, _ = env.reset()
        finished = False

        while not finished:
            legal_mask = env.get_legal_action_mask()

            if not env.board.turn:
                # Black: pick uniformly among the legal action indices.
                legal_actions = np.where(legal_mask)[0]
                action = np.random.choice(legal_actions)
            else:
                # White: the network samples its move.
                action, _, _ = select_action(state, legal_mask)

            state, _, terminated, truncated, _ = env.step(action)
            finished = terminated or truncated

        outcome = env.board.result()
        if outcome in tally:
            tally[outcome] += 1

    network.train()
    win_rate = tally['1-0'] / n_games
    draw_rate = tally['1/2-1/2'] / n_games
    return win_rate, draw_rate

print("‚úÖ Training functions defined!")

✅ Training functions defined!


In [None]:
# ============================================================
# MAIN TRAINING LOOP
# ============================================================
# Each update: (1) collect n_steps transitions with the current policy,
# (2) compute GAE advantages and returns, (3) run n_epochs of clipped PPO
# minibatch updates, (4) log, evaluate and checkpoint on their intervals.

print("="*60)
print("üöÄ STARTING ADVANCED PPO TRAINING")
print("="*60)
print(f"   Total updates: {CONFIG['total_updates']}")
print(f"   Steps per update: {CONFIG['n_steps']}")
print(f"   Estimated time: {CONFIG['total_updates'] * 0.5 / 60:.1f} - {CONFIG['total_updates'] * 2 / 60:.1f} minutes")
print("="*60)

start_time = time.time()
best_win_rate = 0.0

for update in tqdm(range(CONFIG['total_updates']), desc="Training"):
    # Collect rollout
    states, actions, rewards, dones = [], [], [], []
    old_log_probs, values, masks = [], [], []

    state, _ = env.reset()
    episode_reward = 0
    episode_rewards = []
    episode_lengths = []
    episode_length = 0

    for step in range(CONFIG['n_steps']):
        legal_mask = env.get_legal_action_mask()
        action, log_prob, value = select_action(state, legal_mask)

        next_state, reward, terminated, truncated, _ = env.step(action)
        done = terminated or truncated

        states.append(state)
        actions.append(action)
        rewards.append(reward)
        dones.append(done)
        old_log_probs.append(log_prob)
        values.append(value)
        masks.append(legal_mask)

        episode_reward += reward
        episode_length += 1

        if done:
            episode_rewards.append(episode_reward)
            episode_lengths.append(episode_length)
            episode_reward = 0
            episode_length = 0
            state, _ = env.reset()
        else:
            state = next_state

    # Bootstrap value for the state the rollout stopped in. Evaluate it in
    # eval mode, exactly like select_action does for every stored value: in
    # train mode this single-sample forward pass would normalise with its own
    # batch statistics and also update the BatchNorm running averages.
    network.eval()
    with torch.no_grad():
        state_t = torch.FloatTensor(state).unsqueeze(0).to(device)
        _, last_value = network(state_t)
        last_value = last_value.item()
    network.train()

    # Compute GAE, then normalise advantages to zero mean / unit variance.
    advantages, returns = compute_gae(rewards, values, dones, last_value)
    advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-8)

    # Convert to tensors
    states_t = torch.FloatTensor(np.array(states)).to(device)
    actions_t = torch.LongTensor(actions).to(device)
    old_log_probs_t = torch.FloatTensor(old_log_probs).to(device)
    advantages_t = torch.FloatTensor(advantages).to(device)
    returns_t = torch.FloatTensor(returns).to(device)
    masks_t = torch.BoolTensor(np.array(masks)).to(device)

    # PPO update
    all_policy_loss, all_value_loss, all_entropy = [], [], []

    for epoch in range(CONFIG['n_epochs']):
        # Fresh shuffle every epoch; the last minibatch may be shorter.
        indices = np.random.permutation(len(states))
        for start in range(0, len(states), CONFIG['batch_size']):
            end = start + CONFIG['batch_size']
            batch_idx = indices[start:end]

            log_probs, values_pred = network(states_t[batch_idx], masks_t[batch_idx])
            values_pred = values_pred.squeeze(-1)

            action_log_probs = log_probs.gather(1, actions_t[batch_idx].unsqueeze(-1)).squeeze(-1)

            # Policy loss (clipped surrogate objective)
            ratio = torch.exp(action_log_probs - old_log_probs_t[batch_idx])
            surr1 = ratio * advantages_t[batch_idx]
            surr2 = torch.clamp(ratio, 1 - CONFIG['clip_range'], 1 + CONFIG['clip_range']) * advantages_t[batch_idx]
            policy_loss = -torch.min(surr1, surr2).mean()

            # Value loss
            value_loss = F.mse_loss(values_pred, returns_t[batch_idx])

            # Entropy; masked actions have log-prob -inf, which is replaced
            # by 0 so that 0 * -inf does not produce NaN.
            probs = torch.exp(log_probs)
            entropy = -(probs * log_probs.masked_fill(torch.isinf(log_probs), 0)).sum(-1).mean()

            # Total loss
            loss = policy_loss + CONFIG['value_coef'] * value_loss - CONFIG['entropy_coef'] * entropy

            optimizer.zero_grad()
            loss.backward()
            torch.nn.utils.clip_grad_norm_(network.parameters(), CONFIG['max_grad_norm'])
            optimizer.step()

            all_policy_loss.append(policy_loss.item())
            all_value_loss.append(value_loss.item())
            all_entropy.append(entropy.item())

    scheduler.step()

    # Record history
    history['policy_loss'].append(np.mean(all_policy_loss))
    history['value_loss'].append(np.mean(all_value_loss))
    history['entropy'].append(np.mean(all_entropy))
    history['rewards'].append(np.mean(episode_rewards) if episode_rewards else 0)
    history['game_lengths'].append(np.mean(episode_lengths) if episode_lengths else 0)
    history['lr'].append(scheduler.get_last_lr()[0])

    # Logging
    if (update + 1) % 100 == 0:
        elapsed = time.time() - start_time
        print(f"\nUpdate {update+1}/{CONFIG['total_updates']} | Time: {elapsed/60:.1f}min")
        print(f"  PolicyL: {history['policy_loss'][-1]:.4f} | ValueL: {history['value_loss'][-1]:.4f}")
        print(f"  Entropy: {history['entropy'][-1]:.4f} | LR: {history['lr'][-1]:.2e}")

    # Evaluation
    if (update + 1) % CONFIG['eval_interval'] == 0:
        win_rate, draw_rate = evaluate_vs_random(20)
        print(f"  üìä vs Random: Win={win_rate:.1%}, Draw={draw_rate:.1%}")

        # Keep only the strictly best model seen so far.
        if win_rate > best_win_rate:
            best_win_rate = win_rate
            torch.save({
                'update': update,
                'network_state_dict': network.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'best_win_rate': best_win_rate,
            }, '/content/chess_model_best.pt')
            print(f"  üíæ New best model saved! (win_rate={best_win_rate:.1%})")

    # Checkpointing (full training state, so a run can be resumed)
    if (update + 1) % CONFIG['save_interval'] == 0:
        torch.save({
            'update': update,
            'network_state_dict': network.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'scheduler_state_dict': scheduler.state_dict(),
            'history': history,
            'config': CONFIG,
        }, f'/content/chess_checkpoint_{update+1}.pt')
        print(f"  üíæ Checkpoint saved!")

total_time = time.time() - start_time
print("\n" + "="*60)
print(f"‚úÖ TRAINING COMPLETED!")
print(f"   Total time: {total_time/3600:.2f} hours")
print(f"   Best win rate vs random: {best_win_rate:.1%}")
print("="*60)

🚀 STARTING ADVANCED PPO TRAINING
   Total updates: 5000
   Steps per update: 512
   Estimated time: 41.7 - 166.7 minutes


Training:   2%|▏         | 85/5000 [06:43<6:28:56,  4.75s/it]

## 6️⃣ Training Visualization

In [None]:
# Create a 2x3 grid of axes; the code below draws one training metric per panel.
fig, axes = plt.subplots(2, 3, figsize=(15, 10))

# Moving-average smoothing used for the noisy training curves.
def smooth(data, window=50):
    """Return the ``window``-point moving average of ``data``.

    Series shorter than ``window`` are returned unchanged (the same object).
    Otherwise the result has ``len(data) - window + 1`` points.
    """
    if len(data) >= window:
        kernel = np.ones(window) / window
        return np.convolve(data, kernel, mode='valid')
    return data

def _plot_series(ax, series, title):
    """Draw one metric panel; every panel shares the 'Update' x-axis label."""
    ax.plot(series)
    ax.set_title(title)
    ax.set_xlabel('Update')


# Top row: the three loss/entropy curves, smoothed.
_plot_series(axes[0, 0], smooth(history['policy_loss']), 'Policy Loss')
_plot_series(axes[0, 1], smooth(history['value_loss']), 'Value Loss')
_plot_series(axes[0, 2], smooth(history['entropy']), 'Entropy')

# Bottom row: episode statistics (smoothed) and the raw learning-rate schedule.
_plot_series(axes[1, 0], smooth(history['rewards']), 'Mean Episode Reward')
_plot_series(axes[1, 1], smooth(history['game_lengths']), 'Mean Game Length')
_plot_series(axes[1, 2], history['lr'], 'Learning Rate')

# Save the figure to disk before showing it.
plt.tight_layout()
plt.savefig('/content/training_curves.png', dpi=150)
plt.show()

## 7️⃣ Evaluasi vs Stockfish

In [None]:
from stockfish import Stockfish

def _resolve_ai_move(board, candidate):
    """Turn the network's table move into a legal move for ``board``.

    A plain from/to move that is only legal as a promotion is played as a
    queen promotion. If the candidate cannot be matched at all, a random
    legal move is used instead. Returns ``None`` only when the position has
    no legal moves.
    """
    if candidate is not None:
        if candidate in board.legal_moves:
            return candidate
        if candidate.promotion is None:
            queen = chess.Move(candidate.from_square, candidate.to_square,
                               promotion=chess.QUEEN)
            if queen in board.legal_moves:
                return queen
    legal_moves = list(board.legal_moves)
    return random.choice(legal_moves) if legal_moves else None


def evaluate_vs_stockfish(model, stockfish_path='/usr/games/stockfish', 
                          skill_level=0, n_games=10, time_limit=0.1):
    """Play ``model`` against Stockfish and tally the results.

    The model alternates colours (White in even-numbered games) and plays
    the highest-probability legal move. Games are capped at 200 plies;
    any game that does not end in a decisive result is counted as a draw.

    Args:
        model: Policy/value network called as ``model(state, mask)``.
        stockfish_path: Path to the Stockfish executable.
        skill_level: 0-20 (0 = weakest, 20 = strongest).
        n_games: Number of games to play.
        time_limit: Stockfish thinking time per move, in seconds.

    Returns:
        ``{'wins': int, 'draws': int, 'losses': int}`` from the model's
        point of view, or ``None`` if Stockfish could not be started.
    """
    try:
        sf = Stockfish(path=stockfish_path)
        sf.set_skill_level(skill_level)
    except Exception as e:
        print(f"‚ùå Error loading Stockfish: {e}")
        return None

    results = {'wins': 0, 'draws': 0, 'losses': 0}

    # The shared env is borrowed for encoding/masking only. Remember its
    # board and the model's mode so both can be restored afterwards.
    saved_board = env.board
    was_training = model.training
    model.eval()

    try:
        for game_idx in range(n_games):
            board = chess.Board()
            sf.set_fen_position(board.fen())

            ai_is_white = (game_idx % 2 == 0)
            move_count = 0

            while not board.is_game_over() and move_count < 200:
                if board.turn == ai_is_white:
                    # AI's turn: encode this game's position via the env.
                    env.board = board.copy()
                    state = env.encode_state()
                    legal_mask = env.get_legal_action_mask()

                    with torch.no_grad():
                        state_t = torch.FloatTensor(state).unsqueeze(0).to(device)
                        mask_t = torch.BoolTensor(legal_mask).unsqueeze(0).to(device)
                        log_probs, _ = model(state_t, mask_t)
                        action = torch.argmax(log_probs, dim=-1).item()

                    move = _resolve_ai_move(board, env.action_to_move.get(action))
                    if move is not None:
                        board.push(move)
                else:
                    # Stockfish's turn
                    sf.set_fen_position(board.fen())
                    best_move = sf.get_best_move_time(int(time_limit * 1000))
                    if best_move:
                        board.push(chess.Move.from_uci(best_move))
                    else:
                        break

                move_count += 1

            result = board.result()
            if result == '1-0':
                if ai_is_white:
                    results['wins'] += 1
                else:
                    results['losses'] += 1
            elif result == '0-1':
                if ai_is_white:
                    results['losses'] += 1
                else:
                    results['wins'] += 1
            else:
                results['draws'] += 1

            print(f"  Game {game_idx+1}: {result} ({'AI White' if ai_is_white else 'AI Black'})")
    finally:
        env.board = saved_board
        model.train(was_training)

    return results

print("‚úÖ Stockfish evaluation function defined!")

In [None]:
# Load the best checkpoint (when one exists) and evaluate against Stockfish.
try:
    # map_location lets a checkpoint saved on GPU load on a CPU-only runtime.
    checkpoint = torch.load('/content/chess_model_best.pt', map_location=device)
    network.load_state_dict(checkpoint['network_state_dict'])
    print(f"‚úÖ Loaded best model (win_rate={checkpoint.get('best_win_rate', 'N/A')})")
except FileNotFoundError:
    print("‚ö†Ô∏è Using current model (no best checkpoint found)")
except Exception as exc:
    # Still best-effort, but report the real reason instead of claiming the
    # file is missing (e.g. a corrupt file or an architecture mismatch).
    print(f"‚ö†Ô∏è Using current model (best checkpoint could not be loaded: {exc!r})")

print("\n" + "="*60)
print("üìä EVALUATION VS STOCKFISH")
print("="*60)

for skill in [0, 1, 3, 5]:
    print(f"\nüéØ Stockfish Skill Level {skill}:")
    results = evaluate_vs_stockfish(network, skill_level=skill, n_games=10)
    if results:
        total = results['wins'] + results['draws'] + results['losses']
        print(f"   Results: W={results['wins']} D={results['draws']} L={results['losses']}")
        # Guard the division in case no game was recorded.
        if total:
            print(f"   Win Rate: {results['wins']/total:.1%}")

## 8️⃣ Demo: AI vs AI

In [None]:
import chess.svg
from IPython.display import display, HTML, clear_output
import time as time_module

def play_demo(max_moves=60, delay=0.5):
    """Animate a game in which the network plays both sides.

    Args:
        max_moves: Maximum number of plies to show.
        delay: Pause between plies, in seconds.

    The highest-probability legal move is played each ply. The demo stops
    early when the game is over or the environment ends the episode.
    """
    state, _ = env.reset()
    moves_played = []

    # Inference must run in eval mode: in train mode BatchNorm would use the
    # statistics of this single position and update its running averages.
    was_training = network.training
    network.eval()

    try:
        for i in range(max_moves):
            if env.board.is_game_over():
                break

            clear_output(wait=True)
            display(HTML(chess.svg.board(env.board, size=400)))
            print(f"Move {i+1}: {'White' if env.board.turn else 'Black'}")
            if moves_played:
                print(f"Last moves: {' '.join(moves_played[-6:])}")

            legal_mask = env.get_legal_action_mask()

            # Use network to select move
            with torch.no_grad():
                state_t = torch.FloatTensor(state).unsqueeze(0).to(device)
                mask_t = torch.BoolTensor(legal_mask).unsqueeze(0).to(device)
                log_probs, value = network(state_t, mask_t)
                action = torch.argmax(log_probs, dim=-1).item()
                print(f"Eval: {value.item():.3f}")

            plies_before = len(env.board.move_stack)
            state, _, terminated, truncated, _ = env.step(action)

            # Log the move that was actually played (the env may turn a plain
            # move into a promotion, or reject the action entirely).
            if len(env.board.move_stack) > plies_before:
                moves_played.append(env.board.peek().uci())

            if terminated or truncated:
                break
            time_module.sleep(delay)
    finally:
        network.train(was_training)

    clear_output(wait=True)
    display(HTML(chess.svg.board(env.board, size=400)))
    print(f"üèÜ Game Over! Result: {env.board.result()}")
    print(f"Total moves: {len(moves_played)}")
    print(f"Moves: {' '.join(moves_played)}")

play_demo()

## 9️⃣ Save Final Model

In [None]:
import os

# Save final model
torch.save({
    'network_state_dict': network.state_dict(),
    'config': CONFIG,
    'history': history,
    'total_updates': CONFIG['total_updates'],
}, '/content/chess_model_final.pt')

print("‚úÖ Final model saved to /content/chess_model_final.pt")

# Downloads only work inside Google Colab; elsewhere skip them instead of
# failing on the import after the model has already been saved.
try:
    from google.colab import files
except ImportError:
    files = None
    print("google.colab is not available; skipping downloads.")

if files is not None:
    # Download model
    files.download('/content/chess_model_final.pt')

    # Download the best model if one was saved. The explicit existence check
    # replaces a bare `except: pass`, which would also hide real errors.
    if os.path.exists('/content/chess_model_best.pt'):
        files.download('/content/chess_model_best.pt')
        print("‚úÖ Best model downloaded!")

    # Download training curves if the plot was produced.
    if os.path.exists('/content/training_curves.png'):
        files.download('/content/training_curves.png')

---

## 📝 Tips untuk Training Lebih Lanjut:

1. **Jalankan multiple sessions** - Resume dari checkpoint untuk training lebih lama
2. **Tingkatkan `total_updates`** ke 10000-50000 untuk hasil lebih baik
3. **Gunakan self-play** - Buat agent bermain melawan dirinya sendiri
4. **Fine-tune melawan Stockfish** - Setelah bagus vs random, train vs Stockfish level rendah

---