# Train Large Chess Evaluation Models

Train the resnet_large, transformer_large, cnn_deep, and mlp_large models on a Google Colab GPU.

**Setup:** Runtime → Change runtime type → T4 GPU (or A100 for faster training)

In [None]:
# Check GPU
# Ask nvidia-smi for the name and total memory of each visible GPU, as CSV.
# NOTE(review): presumably errors out when no GPU runtime is attached —
# confirm on a CPU-only runtime.
!nvidia-smi --query-gpu=name,memory.total --format=csv

In [None]:
# Upload your chess_quality.tsv file
import os

from google.colab import files

# The data-loading cell further down opens exactly this path.
EXPECTED_DATA_FILE = 'chess_quality.tsv'

uploaded = files.upload()
print(f"Uploaded: {list(uploaded.keys())}")

# Surface a missing or differently named upload now, instead of as a
# FileNotFoundError several cells later.
if not os.path.exists(EXPECTED_DATA_FILE):
    print(f"WARNING: '{EXPECTED_DATA_FILE}' not found in the working directory; "
          "rename the uploaded file or update the path in the data-loading cell.")

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import numpy as np

# Report the PyTorch build and whether a CUDA device is visible.
print(f"PyTorch: {torch.__version__}")
cuda_available = torch.cuda.is_available()
print(f"CUDA: {cuda_available}")
if cuda_available:
    # Index 0 is the first visible CUDA device.
    print(f"GPU: {torch.cuda.get_device_name(0)}")

In [None]:
# FEN to tensor conversion
# Plane index per piece letter: white pieces (upper case) occupy planes 0-5,
# black pieces (lower case) planes 6-11, both in P, N, B, R, Q, K order.
PIECE_MAP = {piece: plane for plane, piece in enumerate('PNBRQKpnbrqk')}


def fen_to_tensor(fen):
    """Encode the piece-placement field of a FEN string as one-hot planes.

    Only the first whitespace-separated field of ``fen`` is read; side to
    move, castling rights and the other FEN fields are ignored.

    Args:
        fen: FEN string whose first field lists the board rows separated
            by '/'.

    Returns:
        A float32 numpy array of shape (12, 8, 8) indexed as
        [piece plane, row, column]. Row 0 is the first row listed in the
        FEN string and column 0 is the first square of each row.
    """
    planes = np.zeros((12, 8, 8), dtype=np.float32)
    placement = fen.split()[0]
    for rank_idx, rank_str in enumerate(placement.split('/')):
        file_idx = 0
        for symbol in rank_str:
            if symbol.isdigit():
                # A digit encodes that many consecutive empty squares.
                file_idx += int(symbol)
                continue
            plane = PIECE_MAP.get(symbol)
            if plane is not None:
                planes[plane, rank_idx, file_idx] = 1.0
            # Unknown letters still consume one square.
            file_idx += 1
    return planes

def parse_score(score_str):
    """Convert a textual score to a float scaled by 100.

    Surrounding whitespace is stripped before conversion. The x100 factor
    presumably turns pawn units into centipawns (downstream metrics are
    printed as "cp") — confirm against the data source.

    Args:
        score_str: Score as text, e.g. ``"1.5"``.

    Returns:
        ``float(score_str) * 100``, or ``0.0`` when the value is not a
        string or cannot be parsed as a float.
    """
    try:
        return float(score_str.strip()) * 100
    except (AttributeError, TypeError, ValueError):
        # Only conversion failures fall back to 0.0; KeyboardInterrupt and
        # SystemExit must still propagate so a long load can be cancelled.
        return 0.0

In [None]:
# Dataset
class ChessDataset(Dataset):
    """In-memory dataset of (board tensor, normalized score) pairs from a TSV.

    Each line is stripped and split on tabs; the first field is treated as a
    FEN string and the second as a score. Lines with fewer than two fields
    are skipped.

    Args:
        tsv_path: Path of the tab-separated file to read.
        max_score: When positive, rows whose absolute parsed score exceeds
            this value are dropped. Zero or negative disables the filter.

    Attributes set by the constructor:
        positions: numpy array stacking the encoded boards.
        scores: float32 numpy array of parsed scores.
        score_scale: Divisor (500.0) used to normalize the scores.
        scores_normalized: ``scores / score_scale``.
    """

    def __init__(self, tsv_path, max_score=15000):
        print(f"Loading {tsv_path}...")
        with open(tsv_path) as handle:
            rows = handle.readlines()

        total_rows = len(rows)
        boards = []
        targets = []
        for row_no, row in enumerate(rows):
            if row_no % 100000 == 0:
                print(f"\rProcessing: {row_no:,}/{total_rows:,}", end="", flush=True)

            fields = row.strip().split('\t')
            if len(fields) < 2:
                continue

            value = parse_score(fields[1])
            if max_score > 0 and abs(value) > max_score:
                continue

            boards.append(fen_to_tensor(fields[0]))
            targets.append(value)

        print(f"\nLoaded {len(boards):,} positions")

        self.positions = np.array(boards)
        self.scores = np.array(targets, dtype=np.float32)
        self.score_scale = 500.0
        self.scores_normalized = self.scores / self.score_scale

    def __len__(self):
        """Return the number of stored positions."""
        return len(self.positions)

    def __getitem__(self, idx):
        """Return ``(board, target)`` tensors for the sample at ``idx``."""
        board = torch.from_numpy(self.positions[idx])
        target = torch.tensor(self.scores_normalized[idx], dtype=torch.float32)
        return board, target

In [None]:
# Model definitions

class ResidualBlock(nn.Module):
    """Two 3x3 conv + batch-norm layers wrapped by an identity skip connection.

    The channel count and spatial size are preserved (padding=1), so the
    input can be added back to the transformed features before the final
    ReLU.

    Args:
        channels: Number of input and output channels.
    """

    def __init__(self, channels):
        super().__init__()
        self.conv1 = nn.Conv2d(channels, channels, kernel_size=3, padding=1)
        self.bn1 = nn.BatchNorm2d(channels)
        self.conv2 = nn.Conv2d(channels, channels, kernel_size=3, padding=1)
        self.bn2 = nn.BatchNorm2d(channels)

    def forward(self, x):
        """Return ``relu(bn2(conv2(relu(bn1(conv1(x))))) + x)``."""
        out = self.conv1(x)
        out = self.bn1(out)
        out = torch.relu(out)
        out = self.conv2(out)
        out = self.bn2(out)
        # Skip connection: add the untouched input back in.
        out = out + x
        return torch.relu(out)

class ResNetLarge(nn.Module):
    """Residual tower (16 blocks, 256 channels) with a scalar output head.

    Roughly 23M parameters. Expects input of shape (batch, 12, 8, 8): the
    stem takes 12 channels and the first linear layer takes 256 * 8 * 8
    flattened features.
    """

    def __init__(self):
        super().__init__()
        self.conv_in = nn.Conv2d(12, 256, kernel_size=3, padding=1)
        self.bn_in = nn.BatchNorm2d(256)
        self.blocks = nn.Sequential(*(ResidualBlock(256) for _ in range(16)))
        self.fc1 = nn.Linear(256 * 8 * 8, 256)
        self.fc2 = nn.Linear(256, 1)

    def forward(self, x):
        """Return one score per sample, shape (batch,)."""
        stem = torch.relu(self.bn_in(self.conv_in(x)))
        features = self.blocks(stem)
        # Flatten (batch, 256, 8, 8) feature maps into one vector per sample.
        flat = features.view(features.size(0), -1)
        hidden = torch.relu(self.fc1(flat))
        return self.fc2(hidden).squeeze(-1)

class TransformerLarge(nn.Module):
    """Transformer encoder over the 64 board squares with a scalar head.

    Each square becomes one token whose 12 input features are that square's
    values across the 12 planes. Tokens are projected to ``d_model``, given a
    learned per-square embedding, encoded, mean-pooled and mapped to a
    single score. Roughly 25M parameters with the default arguments.

    Args:
        d_model: Token embedding width.
        nhead: Number of attention heads per encoder layer.
        num_layers: Number of stacked encoder layers.
    """

    def __init__(self, d_model=512, nhead=8, num_layers=8):
        super().__init__()
        self.d_model = d_model
        self.input_proj = nn.Linear(12, d_model)
        self.pos_embedding = nn.Parameter(torch.randn(64, d_model))
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=d_model,
            nhead=nhead,
            dim_feedforward=d_model * 4,
            batch_first=True,
        )
        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)
        self.fc = nn.Linear(d_model, 1)

    def forward(self, x):
        """Return one score per sample, shape (batch,)."""
        batch = x.size(0)
        # (batch, 12, 8, 8) -> (batch, 12, 64) -> (batch, 64, 12): one token per square.
        tokens = x.view(batch, 12, 64).transpose(1, 2)
        embedded = self.input_proj(tokens) + self.pos_embedding
        encoded = self.transformer(embedded)
        pooled = encoded.mean(dim=1)
        return self.fc(pooled).squeeze(-1)

class CNNDeep(nn.Module):
    """Deep CNN (5 layers) - 25M params"""
    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(12, 64, 3, padding=1)
        self.conv2 = nn.Conv2d(64, 128, 3, padding=1)
        self.conv3 = nn.Conv2d(128, 256, 3, padding=1)
        self.conv4 = nn.Conv2d(256, 512, 3, padding=1)
        self.conv5 = nn.Conv2d(512, 512, 3, padding=1)
        self.bn1 = nn.BatchNorm2d(64)
        self.bn2 = nn.BatchNorm2d(128)
        self.bn3 = nn.BatchNorm2d(256)
        self.bn4 = nn.BatchNorm2d(512)
        self.bn5 = nn.BatchNorm2d(512)
        self.fc1 = nn.Linear(512 * 8 * 8, 1024)
        self.fc2 = nn.Linear(1024, 1)
        self.dropout = nn.Dropout(0.3)
    
    def forward(self, x):
        x = torch.relu(self.bn1(self.conv1(x)))
        x = torch.relu(self.bn2(self.conv2(x)))
        x = torch.relu(self.bn3(self.conv3(x)))
        x = torch.relu(self.bn4(self.conv4(x)))
        x = torch.relu(self.bn5(self.conv5(x)))
        x = x.view(x.size(0), -1)
        x = self.dropout(torch.relu(self.fc1(x)))
        return self.fc2(x).squeeze(-1)

class MLPLarge(nn.Module):
    """Fully connected baseline (768 -> 1024 -> 512 -> 256 -> 1).

    About 1.4M parameters. The input is flattened first, so it must contain
    12 * 8 * 8 values per sample. Dropout (p=0.3) follows the first two
    hidden layers only.
    """

    def __init__(self):
        super().__init__()
        # (hidden width, whether dropout follows the activation)
        hidden_spec = ((1024, True), (512, True), (256, False))

        stack = [nn.Flatten()]
        in_features = 12 * 8 * 8
        for width, use_dropout in hidden_spec:
            stack.append(nn.Linear(in_features, width))
            stack.append(nn.ReLU())
            if use_dropout:
                stack.append(nn.Dropout(0.3))
            in_features = width
        stack.append(nn.Linear(in_features, 1))

        self.layers = nn.Sequential(*stack)

    def forward(self, x):
        """Return one score per sample, shape (batch,)."""
        scores = self.layers(x)
        return scores.squeeze(-1)

# Registry mapping a model name to the class that builds it; every class is
# instantiated with no arguments.
MODELS = dict(
    resnet_large=ResNetLarge,
    transformer_large=TransformerLarge,
    cnn_deep=CNNDeep,
    mlp_large=MLPLarge,
)

In [None]:
# Training function
def _mean_loss_and_mae(model, loader, criterion, device, score_scale):
    """Return per-sample mean (loss, MAE * score_scale) of ``model`` over ``loader``."""
    loss_sum = 0.0
    abs_err_sum = 0.0
    n_samples = 0
    with torch.no_grad():
        for batch_x, batch_y in loader:
            batch_x, batch_y = batch_x.to(device), batch_y.to(device)
            outputs = model(batch_x)
            n = batch_y.size(0)
            # criterion returns the batch mean; re-weight by batch size so a
            # short final batch does not count as much as a full one.
            loss_sum += criterion(outputs, batch_y).item() * n
            abs_err_sum += torch.abs(outputs - batch_y).sum().item()
            n_samples += n
    if n_samples == 0:
        raise ValueError("val_loader yielded no samples")
    return loss_sum / n_samples, abs_err_sum / n_samples * score_scale


def train_model(model, train_loader, val_loader, epochs, device, score_scale, model_name):
    """Train ``model`` with Adam + MSE and checkpoint the best validation epoch.

    The learning rate starts at 0.001 and is halved by ReduceLROnPlateau
    after 5 epochs without validation improvement. Reported losses and MAE
    are averaged per sample, not per batch.

    Args:
        model: Module already placed on ``device``; called as ``model(batch_x)``.
        train_loader: Sized iterable of ``(batch_x, batch_y)`` training batches.
        val_loader: Iterable of ``(batch_x, batch_y)`` validation batches.
        epochs: Number of passes over ``train_loader``.
        device: Device the batches are moved to.
        score_scale: Factor that converts normalized error back to the
            units printed as "cp".
        model_name: Used to build the checkpoint filename.

    Returns:
        ``(best_val_loss, best_model_path)``. The file at ``best_model_path``
        always exists on return: if no epoch produced an improving
        validation loss (``epochs == 0`` or a non-finite loss every epoch),
        the final weights are saved instead.

    Raises:
        ValueError: If either loader yields no samples during an epoch.
    """
    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=5, factor=0.5)

    best_val_loss = float('inf')
    best_model_path = f'best_{model_name}.pth'
    checkpoint_written = False

    for epoch in range(epochs):
        model.train()
        train_loss_sum = 0.0
        train_samples = 0

        for batch_idx, (batch_x, batch_y) in enumerate(train_loader):
            if batch_idx % 500 == 0:
                print(f"\rEpoch {epoch+1}/{epochs} | Batch {batch_idx}/{len(train_loader)}", end="", flush=True)

            batch_x, batch_y = batch_x.to(device), batch_y.to(device)
            optimizer.zero_grad()
            outputs = model(batch_x)
            loss = criterion(outputs, batch_y)
            loss.backward()
            optimizer.step()

            n = batch_y.size(0)
            train_loss_sum += loss.item() * n
            train_samples += n

        if train_samples == 0:
            raise ValueError("train_loader yielded no samples")
        train_loss = train_loss_sum / train_samples

        model.eval()
        val_loss, val_mae = _mean_loss_and_mae(model, val_loader, criterion, device, score_scale)
        scheduler.step(val_loss)

        if val_loss < best_val_loss:
            best_val_loss = val_loss
            torch.save(model.state_dict(), best_model_path)
            checkpoint_written = True

        print(f"\nEpoch {epoch+1}/{epochs} | Train Loss: {train_loss:.4f} | Val Loss: {val_loss:.4f} | Val MAE: {val_mae:.1f} cp")

    if not checkpoint_written:
        # Callers load best_model_path right after training; never hand back
        # a path that is missing or left over from an earlier run.
        torch.save(model.state_dict(), best_model_path)

    return best_val_loss, best_model_path

In [None]:
# Load data
dataset = ChessDataset('chess_quality.tsv')

# Split 80% / 10% / remainder; the test split absorbs any rounding leftover.
num_positions = len(dataset)
train_size = int(0.8 * num_positions)
val_size = int(0.1 * num_positions)
test_size = num_positions - train_size - val_size

# Fixed seed so the same rows land in the same split on every run.
split_generator = torch.Generator().manual_seed(42)
train_dataset, val_dataset, test_dataset = torch.utils.data.random_split(
    dataset,
    [train_size, val_size, test_size],
    generator=split_generator,
)

# Settings shared by all three loaders; only the training loader shuffles.
loader_kwargs = dict(batch_size=32, num_workers=2)
train_loader = DataLoader(train_dataset, shuffle=True, **loader_kwargs)
val_loader = DataLoader(val_dataset, **loader_kwargs)
test_loader = DataLoader(test_dataset, **loader_kwargs)

print(f"Train: {len(train_dataset):,}, Val: {len(val_dataset):,}, Test: {len(test_dataset):,}")

In [None]:
# Config
EPOCHS = 50
MODELS_TO_TRAIN = [
    'resnet_large',
    'transformer_large',
    'cnn_deep',
    'mlp_large',
]

# Use the GPU when one is visible, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f"Device: {device}")

In [None]:
# Train all large models
# For each configured model: build it, train it, reload the best checkpoint
# and measure the mean absolute error on the held-out test split.
results = {}

for model_name in MODELS_TO_TRAIN:
    print(f"\n{'='*60}")
    print(f"Training: {model_name}")
    print(f"{'='*60}")

    model = MODELS[model_name]().to(device)
    params = sum(p.numel() for p in model.parameters())
    print(f"Parameters: {params:,}")

    val_loss, model_path = train_model(
        model, train_loader, val_loader, EPOCHS, device,
        dataset.score_scale, model_name
    )

    # Evaluate the best checkpoint. map_location keeps the load working even
    # when the file was written on a different device than the current one.
    model.load_state_dict(torch.load(model_path, map_location=device))
    model.eval()

    # Accumulate absolute error per sample (not per batch) so a short final
    # batch does not carry the same weight as a full one.
    abs_err_sum = 0.0
    test_samples = 0
    with torch.no_grad():
        for batch_x, batch_y in test_loader:
            batch_x, batch_y = batch_x.to(device), batch_y.to(device)
            outputs = model(batch_x)
            abs_err_sum += torch.abs(outputs - batch_y).sum().item()
            test_samples += batch_y.size(0)
    # Scale the normalized error back to the units printed as "cp".
    test_mae = abs_err_sum / test_samples * dataset.score_scale

    results[model_name] = {'params': params, 'mae': test_mae}
    print(f"\n{model_name} Test MAE: {test_mae:.1f} cp")

In [None]:
# Results summary
# Print one table row per trained model, best (lowest) test MAE first.
rule = "=" * 60
print("\n" + rule)
print("RESULTS SUMMARY")
print(rule)
print(f"{'Model':<20} {'Params':>12} {'Test MAE':>10}")
print("-" * 60)
ranked_names = sorted(results, key=lambda model_key: results[model_key]['mae'])
for name in ranked_names:
    r = results[name]
    print(f"{name:<20} {r['params']:>12,} {r['mae']:>10.1f} cp")

In [None]:
# Download trained models
from google.colab import files

# One checkpoint per configured model, named as the training function saves it.
for model_name in MODELS_TO_TRAIN:
    checkpoint_file = f'best_{model_name}.pth'
    files.download(checkpoint_file)