# Train Chess Evaluation LLM

Fine-tune LLMs to predict chess position evaluations from FEN strings.

**Features:**
- Streams data from the Lichess evaluation database, keeping only evaluations at or above a configurable minimum depth (`MIN_DEPTH`, default 40)
- Uses Unsloth for 2x faster training
- Multiple model options (Llama, Mistral, Phi, Qwen)

**Runtime:** Select GPU: `Runtime → Change runtime type → T4 GPU` (or A100 for larger models)

In [None]:
# Check GPU: query the attached GPU's name and total memory, printed as CSV.
!nvidia-smi --query-gpu=name,memory.total --format=csv

## 1. Install Dependencies

In [None]:
%%capture
# %%capture (which must stay the first line of the cell) hides the installer output.
!pip install unsloth
# --no-deps installs these packages without resolving their own dependencies.
!pip install --no-deps trl peft accelerate bitsandbytes
# zstd is the decompressor the data-download cell pipes the Lichess .zst dump through.
!apt-get install -qq zstd

In [None]:
import torch

# Report the PyTorch build and, when a CUDA device is visible, its name and memory.
print(f"PyTorch: {torch.__version__}")
has_cuda = torch.cuda.is_available()
print(f"CUDA: {has_cuda}")
if has_cuda:
    gpu_props = torch.cuda.get_device_properties(0)
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"Memory: {gpu_props.total_memory / 1e9:.1f} GB")

## 2. Configuration

In [None]:
# =============================================================================
# CONFIGURATION
# =============================================================================
# Every constant below is read by a later cell; edit values here rather than
# in the cells that use them.

# Data settings
# NUM_POSITIONS is the number of examples collected before the train/val split.
NUM_POSITIONS = 50000      # Start small, increase to 500K-1M for better results
# Evaluations whose deepest search is below MIN_DEPTH are skipped while streaming.
MIN_DEPTH = 40             # Minimum Stockfish depth (40-60 recommended)

# Model selection (uncomment one)
# For T4 (16GB) - use smaller models:
MODEL_NAME = "unsloth/Phi-3.5-mini-instruct"  # 3.8B - fast, good quality
# MODEL_NAME = "unsloth/Llama-3.2-3B-Instruct"  # 3B - good balance
# MODEL_NAME = "unsloth/Qwen2.5-3B-Instruct"    # 3B - multilingual
# MODEL_NAME = "unsloth/mistral-7b-instruct-v0.3"  # 7B - needs more VRAM

# For A100 (40GB) - can use larger models:
# MODEL_NAME = "unsloth/Llama-3.1-8B-Instruct"  # 8B
# MODEL_NAME = "unsloth/Qwen2.5-7B-Instruct"    # 7B

# Training settings
# MAX_SEQ_LENGTH is passed both to model loading and to the trainer.
MAX_SEQ_LENGTH = 512
# BATCH_SIZE is per device; with GRADIENT_ACCUMULATION steps the optimizer
# updates once every BATCH_SIZE * GRADIENT_ACCUMULATION = 16 examples per device.
BATCH_SIZE = 4
GRADIENT_ACCUMULATION = 4
EPOCHS = 3
LEARNING_RATE = 2e-4

# LoRA settings
# Passed as r / lora_alpha when the LoRA adapters are attached to the model.
LORA_R = 16
LORA_ALPHA = 16

# Echo the key choices so they are visible in the notebook output.
print(f"Model: {MODEL_NAME}")
print(f"Positions: {NUM_POSITIONS:,}")
print(f"Min depth: {MIN_DEPTH}")

## 3. Download Training Data

In [None]:
import subprocess
import json
import time
import random

LICHESS_EVAL_URL = "https://database.lichess.org/lichess_db_eval.jsonl.zst"

def score_to_description(score_cp):
    """Convert a centipawn score into a short verbal assessment.

    The bands are symmetric around zero, so a score and its negation always
    map to the same band for the opposite colour:

    * ``|score| <= 100``        -> roughly equal
    * ``100 < |score| <= 300``  -> slight advantage
    * ``300 < |score| <= 900``  -> significant advantage
    * ``|score| > 900``         -> winning

    Args:
        score_cp: Evaluation in centipawns; positive values favour White,
            negative values favour Black.

    Returns:
        A sentence fragment (no trailing period) describing the position.
    """
    magnitude = abs(score_cp)
    if magnitude <= 100:
        return "The position is roughly equal"

    # Outside the equal band the sign alone decides which side is better.
    side = "White" if score_cp > 0 else "Black"
    if magnitude > 900:
        return f"{side} is winning"
    if magnitude > 300:
        return f"{side} has a significant advantage"
    return f"{side} has a slight advantage"

def stream_lichess_data(limit, min_depth):
    """Stream Lichess evaluations and format them as training examples.

    The compressed JSONL dump at ``LICHESS_EVAL_URL`` is piped through
    ``curl | zstd -d`` and read line by line, stopping as soon as ``limit``
    examples have been collected. For each record the evaluation with the
    greatest ``depth`` is used, and the first entry of its ``pvs`` list
    supplies the score. A record is skipped when it is malformed, has no
    ``fen``/``evals``/``pvs``, is shallower than ``min_depth``, reports a mate
    instead of a centipawn score, or has ``|cp| > 5000``.

    Args:
        limit: Maximum number of training examples to collect.
        min_depth: Minimum depth the selected evaluation must have.

    Returns:
        A shuffled list of ``{"text": ...}`` dicts, each holding one
        instruction/input/response prompt. The list may be shorter than
        ``limit`` (or empty) if the stream ends early; download errors are
        not reported because the pipeline's stderr is discarded.
    """
    data = []

    print(f"Streaming Lichess evaluations (depth >= {min_depth})...")
    print("This may take 10-30 minutes...\n")

    process = subprocess.Popen(
        f'curl -sL "{LICHESS_EVAL_URL}" | zstd -d',
        shell=True,
        stdout=subprocess.PIPE,
        stderr=subprocess.DEVNULL,
        text=True,
        bufsize=1
    )

    start_time = time.time()
    processed = 0
    last_print = 0

    # One of these phrasings is picked at random per example.
    instructions = [
        "Evaluate this chess position.",
        "What is the evaluation of this chess position?",
        "Analyze this position and give a score.",
        "Score this chess position in centipawns.",
    ]

    try:
        for line in process.stdout:
            if len(data) >= limit:
                break

            processed += 1

            if processed - last_print >= 10000:
                elapsed = time.time() - start_time
                # Guard against a zero interval on coarse clocks.
                rate = processed / elapsed if elapsed > 0 else 0.0
                print(f"\rProcessed: {processed:,} | Found: {len(data):,}/{limit:,} | {rate:.0f}/s", end="")
                last_print = processed

            try:
                entry = json.loads(line.strip())
                fen = entry.get("fen")
                evals = entry.get("evals", [])

                if not fen or not evals:
                    continue

                # Use the deepest available evaluation for this position.
                best_eval = max(evals, key=lambda e: e.get("depth", 0))
                depth = best_eval.get("depth", 0)

                if depth < min_depth:
                    continue

                pvs = best_eval.get("pvs", [])
                if not pvs:
                    continue

                pv = pvs[0]
                if "cp" in pv:
                    score_cp = pv["cp"]
                elif "mate" in pv:
                    continue  # Skip mates for cleaner training
                else:
                    continue

                if abs(score_cp) > 5000:
                    continue

                # Format for training
                instruction = random.choice(instructions)
                pawns = score_cp / 100
                description = score_to_description(score_cp)

                text = f"""### Instruction:
{instruction}

### Input:
{fen}

### Response:
Evaluation: {score_cp} centipawns ({pawns:+.2f} pawns)
{description}.
Depth: {depth}"""

                data.append({"text": text})

            except (ValueError, AttributeError, TypeError, KeyError, IndexError):
                # Malformed or unexpectedly shaped record: skip it. Catching
                # only data errors lets KeyboardInterrupt stop the cell.
                continue
    finally:
        # terminate() only signals the shell; closing our end of the pipe is
        # what makes curl/zstd exit (via SIGPIPE) instead of running on.
        process.terminate()
        process.stdout.close()
        try:
            process.wait(timeout=5)
        except subprocess.TimeoutExpired:
            process.kill()
            process.wait()

    elapsed = time.time() - start_time
    print(f"\n\nExtracted {len(data):,} positions in {elapsed:.1f}s")

    random.shuffle(data)
    return data

# Download data
train_data = stream_lichess_data(NUM_POSITIONS, MIN_DEPTH)
if not train_data:
    # The streaming function discards curl/zstd error output and skips every
    # record that fails its filters, so an empty list is the only visible
    # symptom of a failed download. Fail here with an actionable message
    # instead of an IndexError on the sample print below.
    raise RuntimeError(
        "No positions were extracted. Check network access to the Lichess "
        "database, that curl and zstd are installed, and that MIN_DEPTH is "
        "not set too high."
    )
print(f"\nSample:\n{train_data[0]['text'][:500]}")

## 4. Load Model with Unsloth

In [None]:
from unsloth import FastLanguageModel

# Load the base model and its tokenizer, quantized to 4 bits.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=MODEL_NAME,
    max_seq_length=MAX_SEQ_LENGTH,
    dtype=None,  # Auto-detect
    load_in_4bit=True,  # Use 4-bit quantization for memory efficiency
)

# Add LoRA adapters to the attention and MLP projection layers.
model = FastLanguageModel.get_peft_model(
    model,
    r=LORA_R,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
                    "gate_proj", "up_proj", "down_proj"],
    lora_alpha=LORA_ALPHA,
    lora_dropout=0,
    bias="none",
    use_gradient_checkpointing="unsloth",
    random_state=42,
)

print(f"Model loaded: {MODEL_NAME}")
# print_trainable_parameters() prints its own summary and returns None, so it
# is called directly; embedding it in an f-string printed
# "Trainable parameters: None".
model.print_trainable_parameters()

## 5. Prepare Dataset

In [None]:
from datasets import Dataset

# Terminate every training text with the tokenizer's end-of-sequence token so
# the model learns where a response stops; without it generation tends to run
# on until max_new_tokens. Falls back to no suffix if the tokenizer defines
# no EOS token.
eos_token = tokenizer.eos_token or ""

# Create HuggingFace dataset
dataset = Dataset.from_list(
    [{"text": example["text"] + eos_token} for example in train_data]
)

# Split into train/val (5% held out, fixed seed for a reproducible split)
split = dataset.train_test_split(test_size=0.05, seed=42)
train_dataset = split["train"]
val_dataset = split["test"]

print(f"Train: {len(train_dataset):,}")
print(f"Val: {len(val_dataset):,}")

## 6. Train

In [None]:
from trl import SFTTrainer
from transformers import TrainingArguments

# Use bf16 when the GPU supports it, otherwise fall back to fp16.
use_bf16 = torch.cuda.is_bf16_supported()

# Optimisation, logging, evaluation and checkpoint schedule.
training_args = TrainingArguments(
    output_dir="./chess_llm_output",
    per_device_train_batch_size=BATCH_SIZE,
    gradient_accumulation_steps=GRADIENT_ACCUMULATION,
    warmup_steps=50,
    num_train_epochs=EPOCHS,
    learning_rate=LEARNING_RATE,
    fp16=not use_bf16,
    bf16=use_bf16,
    logging_steps=25,
    eval_strategy="steps",
    eval_steps=100,
    save_strategy="steps",
    save_steps=200,
    optim="adamw_8bit",
    weight_decay=0.01,
    lr_scheduler_type="linear",
    seed=42,
)

# Supervised fine-tuning on the "text" column, one example per sequence.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    dataset_text_field="text",
    max_seq_length=MAX_SEQ_LENGTH,
    dataset_num_proc=2,
    packing=False,
    args=training_args,
)

print("Starting training...")
trainer_stats = trainer.train()
print("\nTraining complete!")
runtime_minutes = trainer_stats.metrics['train_runtime'] / 60
print(f"Total time: {runtime_minutes:.1f} minutes")

## 7. Test the Model

In [None]:
# Sample positions as (FEN, label) pairs
test_positions = [
    ("rnbqkbnr/pppppppp/8/8/8/8/PPPPPPPP/RNBQKBNR w KQkq - 0 1", "Starting position"),
    ("rnbqkbnr/pppppppp/8/8/4P3/8/PPPP1PPP/RNBQKBNR b KQkq e3 0 1", "After 1.e4"),
    ("r1bqkb1r/pppp1ppp/2n2n2/4p3/2B1P3/5N2/PPPP1PPP/RNBQK2R w KQkq - 4 4", "Italian Game"),
    ("8/8/8/8/8/5K2/4Q3/7k w - - 0 1", "K+Q vs K (winning)"),
    ("rnbqkb1r/pppp1ppp/5n2/4p2Q/2B1P3/8/PPPP1PPP/RNB1K1NR b KQkq - 3 3", "Scholar's Mate threat"),
]

# Switch the model into Unsloth's inference mode before generating.
FastLanguageModel.for_inference(model)

print("Testing model predictions:\n")
print("=" * 60)

for fen, description in test_positions:
    # Same layout as the training examples, with the response left open.
    prompt = f"""### Instruction:
Evaluate this chess position.

### Input:
{fen}

### Response:
"""

    encoded = tokenizer(prompt, return_tensors="pt").to("cuda")
    generated = model.generate(
        **encoded,
        max_new_tokens=100,
        temperature=0.1,
        do_sample=True,
    )

    # The decoded text echoes the prompt; keep only what follows the last
    # response marker.
    decoded = tokenizer.decode(generated[0], skip_special_tokens=True)
    response = decoded.rpartition("### Response:")[2].strip()

    print(f"Position: {description}")
    print(f"FEN: {fen[:40]}...")
    print(f"Prediction: {response[:150]}")
    print("-" * 60)

## 8. Save Model

In [None]:
# Write the LoRA adapter weights and the tokenizer files into the same local folder
for artifact in (model, tokenizer):
    artifact.save_pretrained("chess_llm_lora")
print("Saved LoRA adapters to: chess_llm_lora/")

In [None]:
# Copy the saved adapters to Google Drive
import os
import shutil

from google.colab import drive

drive.mount('/content/drive')

save_dir = "/content/drive/MyDrive/chess_llm_models"
os.makedirs(save_dir, exist_ok=True)

# dirs_exist_ok=True lets a re-run copy over an earlier export.
drive_copy = f"{save_dir}/chess_llm_lora"
shutil.copytree("chess_llm_lora", drive_copy, dirs_exist_ok=True)
print(f"Saved to Google Drive: {drive_copy}")

In [None]:
# Optional: export a standalone model with the adapters merged in
# (larger on disk, but loading it does not require the base model).
SAVE_MERGED = False  # Set to True if you want the full model

if SAVE_MERGED:
    merged_dir = "chess_llm_merged"
    model.save_pretrained_merged(merged_dir, tokenizer, save_method="merged_16bit")
    print("Saved merged model to: chess_llm_merged/")

## 9. Optional: Push to Hugging Face Hub

In [None]:
# Uncomment and run if you want to push to HF Hub
# from huggingface_hub import login
# login(token="YOUR_HF_TOKEN")

# model.push_to_hub("your-username/chess-eval-llm")
# tokenizer.push_to_hub("your-username/chess-eval-llm")

## 10. Compare Multiple Models

To train multiple models, restart the runtime, set `MODEL_NAME` in the Configuration cell to one of the following, and re-run the notebook:
- `unsloth/Phi-3.5-mini-instruct` (3.8B)
- `unsloth/Llama-3.2-3B-Instruct` (3B)
- `unsloth/Qwen2.5-3B-Instruct` (3B)
- `unsloth/mistral-7b-instruct-v0.3` (7B, needs more VRAM)

Then compare their predictions on the same test positions.