In [None]:
# Cell 1: Environment Setup & Drive Mount
# ==============================================================================
# üìä MATCH DATA CURATOR - High-Quality Training Positions
# ==============================================================================

import os
import time
import threading
from google.colab import drive

# Install the third-party packages that the imports just below rely on.
# NOTE(review): versions are not pinned, and `zstandard` is imported as `zstd`
# but not referenced anywhere else in the cells shown here - confirm it is needed.
print("‚öôÔ∏è Installing dependencies...")
!pip install python-chess zstandard -q

import chess.pgn
import zstandard as zstd
import io
import sqlite3
import json

# Mount Google Drive so the working directory below is reachable from this
# Colab runtime.
print("\nüîó Mounting Google Drive...")
drive.mount('/content/drive')

# Project structure
# DATA_FACTORY is the Drive folder the later cells use for the database copy,
# the checkpoint file and the raw ZIP backups; it is created on first run.
PROJECT_ROOT = '/content/drive/MyDrive/GambitFlow_Project/Synapse_Data'
DATA_FACTORY = os.path.join(PROJECT_ROOT, 'match_Data')
os.makedirs(DATA_FACTORY, exist_ok=True)

print(f"‚úÖ Workspace: {DATA_FACTORY}")

# Keep-alive
def keep_alive() -> None:
    """Loop forever, sleeping 60 seconds per iteration.

    NOTE(review): the body performs no I/O and touches no notebook/Colab API,
    it only sleeps. Whether an idle background thread actually prevents a
    runtime disconnect is not established by anything in this file - confirm.
    """
    while True:
        time.sleep(60)

# Daemon thread: it never blocks interpreter shutdown.
threading.Thread(target=keep_alive, daemon=True).start()
print("‚úÖ Keep-alive active")

‚öôÔ∏è Installing dependencies...

üîó Mounting Google Drive...
Mounted at /content/drive
‚úÖ Workspace: /content/drive/MyDrive/GambitFlow_Project/Synapse_Data/match_Data
‚úÖ Keep-alive active


In [None]:
# ==============================================================================
# üì• DOWNLOAD & PREPARE MATCH DATABASES (FIXED & MULTI-FILE)
# ==============================================================================

import requests
import shutil
import os
import zipfile

# --- 1. Configuration: Add 8 Elite Database Links Here ---
# Monthly archives to fetch, processed in the order written here.
# NOTE(review): the list has no 2025-05 entry and lists 2025-03 before 2025-04;
# confirm that both are intentional.
DOWNLOAD_URLS = [
    "https://database.nikonoel.fr/lichess_elite_2025-11.zip",
    "https://database.nikonoel.fr/lichess_elite_2025-10.zip",
    "https://database.nikonoel.fr/lichess_elite_2025-09.zip",
    "https://database.nikonoel.fr/lichess_elite_2025-08.zip",
    "https://database.nikonoel.fr/lichess_elite_2025-07.zip",
    "https://database.nikonoel.fr/lichess_elite_2025-06.zip",
    "https://database.nikonoel.fr/lichess_elite_2025-03.zip",
    "https://database.nikonoel.fr/lichess_elite_2025-04.zip",
]

# --- 2. Paths Setup ---
# LOCAL_DIR receives the downloaded ZIPs and the extracted .pgn files; the
# later cells also place the working SQLite database here.
LOCAL_DIR = "/content/match_data"
# Using the DATA_FACTORY variable defined in the previous cell
# (this cell fails with NameError if the setup cell has not been run).
DRIVE_BACKUP_DIR = os.path.join(DATA_FACTORY, "match_data_raw_zip")

os.makedirs(LOCAL_DIR, exist_ok=True)
os.makedirs(DRIVE_BACKUP_DIR, exist_ok=True)

print(f"üéØ Target: {len(DOWNLOAD_URLS)} high-quality game files")
print(f"üìÇ Local Processing Dir: {LOCAL_DIR}")
print(f"üíæ Drive Backup Dir: {DRIVE_BACKUP_DIR}\n")

# --- 3. Processing Pipeline ---
# For every archive: reuse an already extracted PGN, otherwise obtain the ZIP
# (local copy -> Drive backup -> download), extract its first .pgn member into
# LOCAL_DIR under the archive's own name, and finally delete the local ZIP.
#
# Files are always written under a temporary name and renamed once complete, so
# an interrupted download/backup/extraction can never be mistaken for a finished
# one on the next run.

REQUEST_TIMEOUT = (10, 120)  # (connect, read) seconds; avoids hanging forever on a stalled server
EXTRACT_STAGING_DIR = os.path.join(LOCAL_DIR, "_extracting")  # sub-directory, so it is invisible to os.listdir(LOCAL_DIR) *.pgn scans
failed_archives = []

for url in DOWNLOAD_URLS:
    zip_filename = url.split('/')[-1]
    pgn_filename = zip_filename.replace(".zip", ".pgn")

    local_zip_path = os.path.join(LOCAL_DIR, zip_filename)
    local_pgn_path = os.path.join(LOCAL_DIR, pgn_filename)
    drive_zip_path = os.path.join(DRIVE_BACKUP_DIR, zip_filename)

    print(f"--- Processing: {zip_filename} ---")

    # Step A: Check if the final PGN file already exists
    if os.path.exists(local_pgn_path):
        print(f"‚úÖ Already extracted. Skipping.")
        continue

    # Step B: Check for the ZIP file (local or Drive)
    if not os.path.exists(local_zip_path):
        if os.path.exists(drive_zip_path):
            print(f"üì¶ Found in Drive. Copying to local for faster processing...")
            shutil.copy(drive_zip_path, local_zip_path)
            print("   -> Copy complete.")
        else:
            print(f"‚¨áÔ∏è Downloading from source...")
            partial_zip_path = local_zip_path + ".part"
            try:
                with requests.get(url, stream=True, timeout=REQUEST_TIMEOUT) as r:
                    r.raise_for_status()
                    with open(partial_zip_path, 'wb') as f:
                        for chunk in r.iter_content(chunk_size=8192*4):
                            f.write(chunk)
                # Only a fully received file gets the real name.
                os.replace(partial_zip_path, local_zip_path)
                print("   -> Download complete.")
            except Exception as e:
                print(f"‚ùå Download failed for {zip_filename}: {e}")
                if os.path.exists(partial_zip_path):
                    os.remove(partial_zip_path)
                failed_archives.append(zip_filename)
                continue # Skip to next file

            # Backup the downloaded ZIP to Drive for future use.
            # A backup problem is reported but must not prevent extraction of
            # the archive that was just downloaded successfully.
            print(f"üì¶ Backing up to Drive...")
            partial_backup_path = drive_zip_path + ".part"
            try:
                shutil.copy(local_zip_path, partial_backup_path)
                os.replace(partial_backup_path, drive_zip_path)
            except OSError as e:
                print(f"   ‚ö†Ô∏è Drive backup failed for {zip_filename}: {e}")
                if os.path.exists(partial_backup_path):
                    os.remove(partial_backup_path)

    # Step C: Extract the PGN from the ZIP file
    print(f"üîß Extracting PGN from {zip_filename}...")
    try:
        with zipfile.ZipFile(local_zip_path, 'r') as zip_ref:
            # Find the PGN file inside the zip (extension match is case-insensitive)
            pgn_files_in_zip = [name for name in zip_ref.namelist() if name.lower().endswith('.pgn')]
            if pgn_files_in_zip:
                # Extract the first PGN file found into a staging directory,
                # then move it into place under the expected name.
                shutil.rmtree(EXTRACT_STAGING_DIR, ignore_errors=True)
                # extract() returns the path it actually created (member names
                # are normalised by zipfile), so use that instead of re-joining.
                extracted_file_path = zip_ref.extract(pgn_files_in_zip[0], path=EXTRACT_STAGING_DIR)
                os.replace(extracted_file_path, local_pgn_path)
                print(f"   -> Successfully extracted {pgn_filename}")
            else:
                print(f"   ‚ö†Ô∏è No PGN file found inside {zip_filename}")
                failed_archives.append(zip_filename)

    except Exception as e:
        print(f"‚ùå Extraction failed for {zip_filename}: {e}")
        failed_archives.append(zip_filename)
    finally:
        shutil.rmtree(EXTRACT_STAGING_DIR, ignore_errors=True)

    # Optional: Clean up the ZIP file to save space
    if os.path.exists(local_zip_path):
        os.remove(local_zip_path)
        print("   -> Cleaned up ZIP file.")

    print("-" * (len(zip_filename) + 20))


if failed_archives:
    print(f"\n‚ö†Ô∏è {len(failed_archives)} of {len(DOWNLOAD_URLS)} files could not be prepared: {', '.join(failed_archives)}")
else:
    print("\nüéâ All match data files are downloaded and ready for processing!")

üéØ Target: 8 high-quality game files
üìÇ Local Processing Dir: /content/match_data
üíæ Drive Backup Dir: /content/drive/MyDrive/GambitFlow_Project/Synapse_Data/match_Data/match_data_raw_zip

--- Processing: lichess_elite_2025-11.zip ---
üì¶ Found in Drive. Copying to local for faster processing...
   -> Copy complete.
üîß Extracting PGN from lichess_elite_2025-11.zip...
   -> Successfully extracted lichess_elite_2025-11.pgn
   -> Cleaned up ZIP file.
---------------------------------------------
--- Processing: lichess_elite_2025-10.zip ---
üì¶ Found in Drive. Copying to local for faster processing...
   -> Copy complete.
üîß Extracting PGN from lichess_elite_2025-10.zip...
   -> Successfully extracted lichess_elite_2025-10.pgn
   -> Cleaned up ZIP file.
---------------------------------------------
--- Processing: lichess_elite_2025-09.zip ---
üì¶ Found in Drive. Copying to local for faster processing...
   -> Copy complete.
üîß Extracting PGN from lichess_elite_2025-09.zip.

In [None]:
# ==============================================================================
# üîç INTELLIGENT POSITION SELECTION (FINAL, DEEPLY ANALYZED)
# ==============================================================================

import chess
import random

# --- Configuration Constants (for easy tweaking) ---
# All of these are read by is_interesting_position() below.
SKIP_OPENING_MOVES = 10       # Skip when move_number <= this (covered by Opening DB)
TABLEBASE_PIECE_COUNT = 6     # Skip when piece count <= this (covered by Endgame Tablebase)
ENDGAME_PIECE_THRESHOLD = 12  # Always keep when piece count <= this (treated as endgame)
MATERIAL_IMBALANCE_MIN = 2    # Always keep when the material difference is >= this
COMPLEX_MOVE_COUNT = 30       # Always keep when there are >= this many legal moves
QUIET_POSITION_SAMPLE_RATE = 0.20 # 20% chance to save a non-critical position

# --- Pre-calculated piece values for performance ---
# Kings are deliberately absent: material_balance() looks values up with a
# default of 0, so any piece type missing here contributes nothing.
PIECE_VALUES = {
    chess.PAWN: 1, chess.KNIGHT: 3, chess.BISHOP: 3,
    chess.ROOK: 5, chess.QUEEN: 9
}

def material_balance(board: chess.Board) -> int:
    """Return the absolute material difference between the two sides.

    Every piece on the board is valued through PIECE_VALUES (types missing
    from that table count as 0). White's pieces add to a running total and
    all other pieces subtract from it; the magnitude of the total is returned,
    so the result says how large the imbalance is but not who is ahead.
    """
    signed_total = sum(
        PIECE_VALUES.get(piece.piece_type, 0) * (1 if piece.color == chess.WHITE else -1)
        for piece in board.piece_map().values()
    )
    return abs(signed_total)

def is_interesting_position(board: chess.Board, move_number: int) -> bool:
    """Decide whether a position should be stored as a training sample.

    Rejections come first, then the "always keep" criteria ordered from the
    cheapest test to the most expensive one, and finally a random draw for
    everything that is left.

    Args:
        board: Position to judge.
        move_number: Counter supplied by the caller; values up to and
            including SKIP_OPENING_MOVES are rejected.

    Returns:
        False for opening / tablebase-sized positions, True for positions with
        a material imbalance, a check, few pieces or many legal moves, and
        otherwise True with probability QUIET_POSITION_SAMPLE_RATE.
    """
    # --- Rejections: territory of the opening DB and the tablebases ---------
    if move_number <= SKIP_OPENING_MOVES:
        return False

    piece_count = len(board.piece_map())
    if piece_count <= TABLEBASE_PIECE_COUNT:
        return False

    # --- Always keep --------------------------------------------------------
    # `or` short-circuits, so the legal-move generation (the costly part) only
    # runs when none of the cheaper criteria matched.
    always_keep = (
        material_balance(board) >= MATERIAL_IMBALANCE_MIN
        or board.is_check()
        or piece_count <= ENDGAME_PIECE_THRESHOLD
        or sum(1 for _ in board.legal_moves) >= COMPLEX_MOVE_COUNT
    )
    if always_keep:
        return True

    # --- Quiet positions: keep a random fraction ----------------------------
    return random.random() < QUIET_POSITION_SAMPLE_RATE

def get_game_phase(board: chess.Board) -> str:
    """Label a position 'opening', 'midgame' or 'endgame' by piece count.

    28 or more pieces -> 'opening', 14 to 27 -> 'midgame', fewer than 14 ->
    'endgame'.

    NOTE(review): the main processing cell defines another get_game_phase with
    different thresholds (no 'opening' label, endgame at <= 12 pieces); whichever
    cell ran last determines the function in effect.
    """
    pieces_on_board = len(board.piece_map())
    if pieces_on_board < 14:
        return 'endgame'
    return 'opening' if pieces_on_board >= 28 else 'midgame'

def calculate_position_score(board: chess.Board, result: str) -> float:
    """Translate a game result into a value for the side that is to move.

    Returns 1.0 when the player to move went on to win the game, -1.0 when
    that player lost, and 0.0 for every other result string (draws included).
    """
    if result not in ('1-0', '0-1'):
        return 0.0  # Draw (or anything that is not a decisive result)

    white_to_move = board.turn == chess.WHITE
    white_won = result == '1-0'
    # The mover scores +1 exactly when the mover's colour is the winner's colour.
    return 1.0 if white_to_move == white_won else -1.0

print("‚úÖ Helper functions for intelligent position selection loaded and optimized.")

# --- Sanity Check ---
# Run every helper once on a fixed FEN so an obviously broken definition shows
# up immediately. The move number passed to is_interesting_position (15) is
# chosen by hand; it is not derived from the FEN's own move counter.
test_board = chess.Board("rnbq1rk1/pp2ppbp/3p1np1/8/3NP3/2N1B3/PPPQ1PPP/R3KB1R w KQ - 4 8")
print("\nüß™ Sanity Check with a complex middlegame position:")
print(f"   - Phase: {get_game_phase(test_board)}")
print(f"   - Material Balance: {material_balance(test_board)}")
print(f"   - Is Check: {test_board.is_check()}")
print(f"   - Is Interesting: {is_interesting_position(test_board, 15)}")
print(f"   - Score if White wins: {calculate_position_score(test_board, '1-0')}")

‚úÖ Helper functions for intelligent position selection loaded and optimized.

üß™ Sanity Check with a complex middlegame position:
   - Phase: opening
   - Material Balance: 0
   - Is Check: False
   - Is Interesting: True
   - Score if White wins: 1.0


In [None]:
# ==============================================================================
# üóÑÔ∏è SQLITE DATABASE FOR TRAINING POSITIONS (FIXED CHECKPOINT)
# ==============================================================================

import sqlite3
import json
import os
import time
import shutil # Make sure shutil is imported

# Paths. LOCAL_DIR and DATA_FACTORY must already exist in the kernel:
#   LOCAL_DIR    is set in the download cell  ("/content/match_data")
#   DATA_FACTORY is set in the setup cell     (PROJECT_ROOT joined with 'match_Data')
# The database is worked on under LOCAL_DIR; DRIVE_DB is its copy on Drive, and
# the checkpoint JSON lives next to that copy.
DB_NAME = "match_positions_v2.db"
LOCAL_DB = os.path.join(LOCAL_DIR, DB_NAME)
DRIVE_DB = os.path.join(DATA_FACTORY, DB_NAME)
CHECKPOINT_FILE = os.path.join(DATA_FACTORY, "match_checkpoint.json")

def create_database(db_path):
    """Open (creating it if needed) the positions database and return the connection.

    Ensures the `positions` table and its `idx_phase` index exist, then turns
    off synchronous writes and keeps the rollback journal in memory. Those two
    pragmas favour insert speed over durability.

    Args:
        db_path: Location of the SQLite file.

    Returns:
        The open sqlite3 connection; closing it is the caller's job.
    """
    schema_sql = '''
        CREATE TABLE IF NOT EXISTS positions (
            id INTEGER PRIMARY KEY AUTOINCREMENT, fen TEXT NOT NULL, phase TEXT NOT NULL,
            value_target REAL NOT NULL, move_played TEXT, game_result TEXT, avg_elo INTEGER,
            material_balance INTEGER, piece_count INTEGER
        )
    '''
    setup_statements = (
        schema_sql,
        'CREATE INDEX IF NOT EXISTS idx_phase ON positions(phase)',
        'PRAGMA synchronous = OFF',
        'PRAGMA journal_mode = MEMORY',
    )

    connection = sqlite3.connect(db_path)
    db_cursor = connection.cursor()
    for statement in setup_statements:
        db_cursor.execute(statement)
    connection.commit()
    return connection

# --- FIXED CHECKPOINT FUNCTIONS ---
def load_checkpoint():
    """Load processing progress from CHECKPOINT_FILE.

    The returned dict always contains 'files_processed' (list) and
    'total_games_processed' (int). A checkpoint written in an older layout that
    lacks one of them gets the default for that key; every key that is present
    in the file - including ones this function does not know about - is kept.

    Returns:
        dict: the stored state merged over the defaults, or just the defaults
        when no checkpoint file exists.

    Raises:
        ValueError: if the file holds JSON that is not an object (also raised,
            as json.JSONDecodeError, when the file is not valid JSON). The
            error is not swallowed on purpose: silently starting from scratch
            would re-process files that are already in the database.
    """
    state = {'files_processed': [], 'total_games_processed': 0}
    if os.path.exists(CHECKPOINT_FILE):
        with open(CHECKPOINT_FILE, 'r') as f:
            data = json.load(f)
        if not isinstance(data, dict):
            raise ValueError(f"Checkpoint {CHECKPOINT_FILE} does not contain a JSON object")
        state.update(data)
    return state

# (save_checkpoint function is not used in this cell, so no need to change it)

# Initialize
print("üóÑÔ∏è Setting up database...")

# Seed the local working copy from Drive, but never overwrite a local file
# that already exists.
if os.path.exists(DRIVE_DB) and not os.path.exists(LOCAL_DB):
    print("üì• Loading existing database from Drive...")
    shutil.copy(DRIVE_DB, LOCAL_DB)

conn = create_database(LOCAL_DB)
cursor = conn.cursor()

cursor.execute('SELECT COUNT(*) FROM positions')
existing_positions = cursor.fetchone()[0]

state = load_checkpoint()
conn.close() # Close connection after getting the count

print(f"‚úÖ Database ready")
# Use .get() for safety: if key doesn't exist, it returns 0 instead of an error.
print(f"üìä Total games processed: {state.get('total_games_processed', 0):,}")
print(f"üìä Positions saved: {existing_positions:,}")
# NOTE(review): this figure is hard-coded text; the processing cell defines
# TARGET_POSITIONS = 10_000_000, so the two can disagree.
print(f"üìä Target: 5,000,000 positions")

üóÑÔ∏è Setting up database...
üì• Loading existing database from Drive...
‚úÖ Database ready
üìä Total games processed: 83,723
üìä Positions saved: 5,000,047
üìä Target: 5,000,000 positions


In [None]:
# ==============================================================================
# üöÄ MAIN PROCESSING LOOP (FINAL, DIRECT EXECUTION)
# ==============================================================================

import sqlite3
import chess.pgn
import os
import json
import time
import shutil
import random

# --- 1. Configuration & Paths ---
# These names must already exist in the kernel (earlier cells define them):
#   LOCAL_DIR    = "/content/match_data"
#   DATA_FACTORY = PROJECT_ROOT joined with 'match_Data' (see the setup cell)
#   DB_NAME      = "match_positions_v2.db"

LOCAL_DB = os.path.join(LOCAL_DIR, DB_NAME)
DRIVE_DB = os.path.join(DATA_FACTORY, DB_NAME)
CHECKPOINT_FILE = os.path.join(DATA_FACTORY, "match_checkpoint.json")

MIN_ELO_FILTER = 2600            # A game is used only if BOTH players are rated at least this
TARGET_POSITIONS = 10_000_000    # Processing stops once this many positions are stored
BATCH_SIZE = 10000  # Commit to DB every 10,000 positions

# --- 2. Load Helper Functions (Copied for self-containment) ---
# NOTE(review): the definitions in this section replace the same-named ones
# from the position-selection cell, and get_game_phase is not an exact copy.
PIECE_VALUES = {
    chess.PAWN: 1, chess.KNIGHT: 3, chess.BISHOP: 3,
    chess.ROOK: 5, chess.QUEEN: 9
}

def material_balance(board):
    """Return the absolute difference in PIECE_VALUES material between the sides.

    Piece types that are not listed in PIECE_VALUES are worth 0.
    """
    signed_total = 0
    for piece in board.piece_map().values():
        worth = PIECE_VALUES.get(piece.piece_type, 0)
        # White adds, everything else subtracts; only the magnitude is reported.
        signed_total += worth if piece.color == chess.WHITE else -worth
    return abs(signed_total)

def is_interesting_position(board, move_number):
    """Tell whether a position should be stored.

    Rejected: move_number <= 10, or 6 or fewer pieces on the board.
    Always kept: material imbalance >= 2, side to move in check, 12 or fewer
    pieces, or at least 30 legal moves.
    Everything else is kept with probability 0.20.
    """
    if move_number <= 10:
        return False

    pieces = len(board.piece_map())
    if pieces <= 6:
        return False

    # Evaluated left to right with short-circuiting, so the legal-move count is
    # only produced when the cheaper tests did not already decide.
    must_keep = (
        material_balance(board) >= 2
        or board.is_check()
        or pieces <= 12
        or sum(1 for _ in board.legal_moves) >= 30
    )
    return True if must_keep else random.random() < 0.20

def calculate_position_score(board, result):
    """Score the game result for the player to move: 1.0 win, -1.0 loss, else 0.0."""
    if result not in ('1-0', '0-1'):
        return 0.0

    mover_is_white = board.turn == chess.WHITE
    winner_is_white = result == '1-0'
    # +1 when the side to move is the side that won the game.
    return 1.0 if mover_is_white == winner_is_white else -1.0

def get_game_phase(board):
    """Return 'endgame' for positions with 12 or fewer pieces, otherwise 'midgame'."""
    return 'endgame' if len(board.piece_map()) <= 12 else 'midgame'

# --- 3. Robust Checkpoint System ---
def load_checkpoint():
    """Load processing progress from CHECKPOINT_FILE.

    The result always has 'files_processed' (list) and 'total_games_processed'
    (int): the processing loop indexes both directly, so a checkpoint that lacks
    one of them (e.g. written in an older layout) would otherwise fail with
    KeyError. All keys stored in the file are preserved.

    Returns:
        dict: stored state merged over the defaults, or the defaults alone when
        there is no checkpoint file.

    Raises:
        ValueError: if the file holds JSON that is not an object (also raised,
            as json.JSONDecodeError, for invalid JSON). Deliberately not
            swallowed - restarting from an empty state would insert the
            already-processed files a second time.
    """
    state = {'files_processed': [], 'total_games_processed': 0}
    if os.path.exists(CHECKPOINT_FILE):
        with open(CHECKPOINT_FILE, 'r') as f:
            data = json.load(f)
        if not isinstance(data, dict):
            raise ValueError(f"Checkpoint {CHECKPOINT_FILE} does not contain a JSON object")
        state.update(data)
    return state

def save_checkpoint(state):
    """Write `state` to CHECKPOINT_FILE as JSON without ever exposing a partial file.

    The JSON is first written to a sibling temporary file and then renamed over
    the real checkpoint. If serialisation fails or the run is interrupted while
    writing, the previous checkpoint stays intact (opening the real file with
    'w' would truncate it before the new content is complete).

    Args:
        state: JSON-serialisable progress dict.

    Raises:
        TypeError / OSError: propagated from json.dump or the file operations;
            the temporary file is removed before re-raising.
    """
    tmp_path = CHECKPOINT_FILE + ".tmp"
    try:
        with open(tmp_path, 'w') as f:
            json.dump(state, f)
        os.replace(tmp_path, CHECKPOINT_FILE)
    except BaseException:
        # Also covers KeyboardInterrupt: do not leave a stray temp file behind.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)
        raise

# --- 4. Main Processing Logic ---
# Reads every not-yet-finished PGN file in LOCAL_DIR game by game, stores the
# selected positions in the local SQLite database and mirrors database +
# checkpoint to Drive after each file and on exit.
#
# Resume model
#   * state['files_processed']  - files that were read to the very end.
#   * state['file_offsets']     - {file name: text offset after the last game
#                                 whose positions were handed to the database}.
#     A file that was left early (interrupt, error, target reached) is continued
#     from that offset on the next run instead of being re-read from the start,
#     which would insert its positions a second time. Checkpoints without this
#     key are still valid; such files simply start at offset 0.
#   * Positions are buffered per game and only added to the pending batch once
#     the whole game has been read, so the stored offset and the stored rows
#     always describe the same set of games.

INSERT_POSITION_SQL = (
    "INSERT INTO positions (fen, phase, value_target, move_played, game_result, "
    "avg_elo, material_balance, piece_count) VALUES (?,?,?,?,?,?,?,?)"
)
VALID_RESULTS = ('1-0', '0-1', '1/2-1/2')


def _extract_position_rows(game):
    """Return the database rows for one parsed game ([] if the game is filtered out).

    A game is dropped when an Elo header is not an integer, when either player
    is rated below MIN_ELO_FILTER, or when the result is not decisive/drawn.
    """
    try:
        w_elo = int(game.headers.get("WhiteElo", 0))
        b_elo = int(game.headers.get("BlackElo", 0))
    except (TypeError, ValueError):
        return []  # Malformed Elo header (e.g. "?")

    if w_elo < MIN_ELO_FILTER or b_elo < MIN_ELO_FILTER:
        return []

    result = game.headers.get("Result", "*")
    if result not in VALID_RESULTS:
        return []

    avg_elo = (w_elo + b_elo) // 2
    rows = []
    board = game.board()
    # NOTE(review): the counter handed to is_interesting_position is the
    # half-move (ply) index, so its "move_number <= 10" filter skips the first
    # 10 plies, not 10 full moves. Kept as is so that newly collected rows stay
    # comparable with the ones already stored - confirm the intent.
    for ply, move in enumerate(game.mainline_moves(), start=1):
        if is_interesting_position(board, ply):
            rows.append((
                board.fen(), get_game_phase(board), calculate_position_score(board, result),
                board.san(move), result, avg_elo,
                material_balance(board), len(board.piece_map())
            ))
        board.push(move)
    return rows


def _flush_batch(db_conn, db_cursor, rows):
    """Insert the buffered rows and commit them."""
    db_cursor.executemany(INSERT_POSITION_SQL, rows)
    db_conn.commit()


# Reset everything the `finally` block looks at. Without this, names left in
# the kernel by another cell or an earlier run (for instance the already closed
# `conn` of the database-setup cell) could be mistaken for this run's state.
conn = None
cursor = None
state = None
batch_data = []

try:
    state = load_checkpoint()
    state.setdefault('files_processed', [])
    state.setdefault('total_games_processed', 0)
    file_offsets = state.setdefault('file_offsets', {})

    # Copy DB from Drive if it exists and is not local yet
    if os.path.exists(DRIVE_DB) and not os.path.exists(LOCAL_DB):
        print("üì• Copying existing database from Drive for faster I/O...")
        shutil.copy(DRIVE_DB, LOCAL_DB)

    # Re-use create_database from the previous cell for schema and optimizations
    conn = create_database(LOCAL_DB)
    cursor = conn.cursor()

    # Get current number of positions saved
    cursor.execute("SELECT COUNT(id) FROM positions")
    positions_saved = cursor.fetchone()[0]

    # Get all PGN files and sort them to ensure consistent order
    pgn_files = sorted([f for f in os.listdir(LOCAL_DIR) if f.endswith(".pgn")])

    # Check if target is already met
    if positions_saved >= TARGET_POSITIONS:
        print(f"üéâ Target of {TARGET_POSITIONS:,} positions already reached. Nothing to do.")
    else:
        print(f"üöÄ Starting/Resuming processing...")
        print(f"üìä Current Positions: {positions_saved:,} / {TARGET_POSITIONS:,}")

        # Loop through all PGN files
        for pgn_file in pgn_files:
            if pgn_file in state['files_processed']:
                print(f"‚è≠Ô∏è Skipping already processed file: {pgn_file}")
                continue

            print(f"\n--- Processing file: {pgn_file} ---")

            file_game_count = 0          # games read from this file during this run
            reached_end_of_file = False
            resume_offset = file_offsets.get(pgn_file, 0)

            # Open the PGN file
            with open(os.path.join(LOCAL_DIR, pgn_file), 'r', encoding='utf-8') as pgn:
                if resume_offset:
                    pgn.seek(resume_offset)
                    print(f"  -> Resuming at saved offset {resume_offset:,}")

                while True:
                    game = chess.pgn.read_game(pgn)
                    if game is None:
                        reached_end_of_file = True
                        break

                    offset_after_game = pgn.tell()
                    game_rows = _extract_position_rows(game)

                    # Publish the finished game: rows, counters and offset are
                    # updated together so they stay in step.
                    batch_data.extend(game_rows)
                    positions_saved += len(game_rows)
                    state['total_games_processed'] += 1
                    file_offsets[pgn_file] = offset_after_game
                    file_game_count += 1

                    # Commit batch to DB when it's full
                    if len(batch_data) >= BATCH_SIZE:
                        _flush_batch(conn, cursor, batch_data)
                        batch_data = []
                        print(f"  -> Games: {file_game_count:,} | Total Positions: {positions_saved:,}")

                    # Check if target is reached
                    if positions_saved >= TARGET_POSITIONS:
                        print(f"üéâ Target of {TARGET_POSITIONS:,} positions reached!")
                        break

            # Final commit for the current file
            if batch_data:
                _flush_batch(conn, cursor, batch_data)
                batch_data = []

            # Only a file that was read to its end counts as processed; a file
            # left early keeps its offset so a later run (e.g. with a higher
            # TARGET_POSITIONS) continues with its remaining games.
            if reached_end_of_file:
                state['files_processed'].append(pgn_file)
                file_offsets.pop(pgn_file, None)
                print(f"  -> Finished {pgn_file}. Backing up DB and saving progress...")
            else:
                print(f"  -> Stopped inside {pgn_file}. Backing up DB and saving progress...")

            save_checkpoint(state)
            shutil.copy(LOCAL_DB, DRIVE_DB)

            # Exit outer loop if target is reached
            if positions_saved >= TARGET_POSITIONS:
                break

except KeyboardInterrupt:
    print("\n\nüõë User interrupted processing. Saving progress...")

except Exception as e:
    print(f"\n\n‚ùå An unexpected error occurred: {e}")
    import traceback
    traceback.print_exc()

finally:
    # This block will run even if there's an error or interruption
    if conn is not None:
        print("\nüíæ Finalizing... Committing any remaining data and closing database.")
        try:
            # batch_data only ever holds complete games (see above), so
            # committing it keeps the database consistent with file_offsets.
            if batch_data:
                _flush_batch(conn, cursor, batch_data)
                batch_data = []

            # Get final accurate count
            cursor.execute("SELECT COUNT(id) FROM positions")
            final_count = cursor.fetchone()[0]
        finally:
            conn.close()

        # Save final state and backup
        if state is not None:
            save_checkpoint(state)

        if os.path.exists(LOCAL_DB):
            shutil.copy(LOCAL_DB, DRIVE_DB)

        print("\n--- ‚úÖ Processing Finished or Paused ---")
        print(f"üìä Final positions in DB: {final_count:,}")
        print("üíæ Progress saved. You can safely resume later.")

üöÄ Starting/Resuming processing...
üìä Current Positions: 7,294,055 / 10,000,000
‚è≠Ô∏è Skipping already processed file: lichess_elite_2025-03.pgn
‚è≠Ô∏è Skipping already processed file: lichess_elite_2025-04.pgn

--- Processing file: lichess_elite_2025-06.pgn ---
  -> Games: 491 | Total Positions: 7,304,055
  -> Games: 946 | Total Positions: 7,314,055
  -> Games: 1,367 | Total Positions: 7,324,055
  -> Games: 2,035 | Total Positions: 7,334,055
  -> Games: 2,742 | Total Positions: 7,344,055
  -> Games: 3,440 | Total Positions: 7,354,055
  -> Games: 4,062 | Total Positions: 7,364,055
  -> Games: 4,727 | Total Positions: 7,374,055
  -> Games: 5,503 | Total Positions: 7,384,055
  -> Games: 6,390 | Total Positions: 7,394,055
  -> Games: 7,182 | Total Positions: 7,404,055
  -> Games: 8,123 | Total Positions: 7,414,055
  -> Games: 8,667 | Total Positions: 7,424,055
  -> Games: 9,223 | Total Positions: 7,434,055
  -> Games: 9,607 | Total Positions: 7,444,055
  -> Games: 10,046 | Total Posi

In [None]:
# Cell 6: Validate Data Quality & Generate Statistics
# ==============================================================================
# üìä ANALYSIS & QUALITY CHECKS
# ==============================================================================

import sqlite3

# Reconnect to database
# Read-only report over the Drive copy of the database: size and rating
# overview, phase / outcome / material distributions, five random samples and
# two quality checks. The connection is closed even if a query fails.
conn = sqlite3.connect(DRIVE_DB)
try:
    cursor = conn.cursor()

    print("üîç Analyzing collected data...\n")
    print("=" * 60)

    # Basic statistics
    cursor.execute('SELECT COUNT(*) FROM positions')
    total = cursor.fetchone()[0]

    if total == 0:
        # Every percentage/average below would divide by zero or format None.
        print("   ‚ö†Ô∏è The positions table is empty - nothing to analyze yet.")
    else:
        cursor.execute('SELECT AVG(avg_elo) FROM positions')
        avg_elo = cursor.fetchone()[0]

        cursor.execute('SELECT MIN(avg_elo), MAX(avg_elo) FROM positions')
        min_elo, max_elo = cursor.fetchone()

        print(f"üìä DATASET OVERVIEW")
        print(f"   Total positions: {total:,}")
        print(f"   Average ELO: {avg_elo:.0f}")
        print(f"   ELO range: {min_elo} - {max_elo}")

        # Phase distribution
        print(f"\nüìä PHASE DISTRIBUTION")
        cursor.execute('SELECT phase, COUNT(*) FROM positions GROUP BY phase')
        for phase, count in cursor.fetchall():
            percentage = (count / total) * 100
            print(f"   {phase.capitalize():8}: {count:>8,} ({percentage:5.1f}%)")

        # Value distribution
        # Grouped by the stored game result. value_target cannot be used for
        # this: it is written from the point of view of the side to move, so
        # its sign says "the mover won/lost", not "White/Black won".
        print(f"\nüìä VALUE DISTRIBUTION (Game Outcomes)")
        cursor.execute('''
            SELECT
                CASE game_result
                    WHEN '1-0' THEN 'White Win'
                    WHEN '0-1' THEN 'Black Win'
                    ELSE 'Draw'
                END as outcome,
                COUNT(*) as count
            FROM positions
            GROUP BY outcome
        ''')
        for outcome, count in cursor.fetchall():
            percentage = (count / total) * 100
            print(f"   {outcome:10}: {count:>8,} ({percentage:5.1f}%)")

        # Material balance distribution
        print(f"\nüìä MATERIAL BALANCE")
        cursor.execute('SELECT AVG(material_balance) FROM positions')
        avg_mat = cursor.fetchone()[0]
        cursor.execute('SELECT material_balance, COUNT(*) FROM positions WHERE material_balance >= 3 GROUP BY material_balance')
        tactical = cursor.fetchall()
        print(f"   Average imbalance: {avg_mat:.2f} pawns")
        print(f"   Tactical positions (‚â•3 imbalance): {sum(c for _, c in tactical):,}")

        # Sample positions
        print(f"\nüìä SAMPLE POSITIONS (Random 5)")
        print("-" * 60)
        cursor.execute('SELECT fen, phase, value_target, avg_elo FROM positions ORDER BY RANDOM() LIMIT 5')
        for idx, (fen, phase, value, elo) in enumerate(cursor.fetchall(), 1):
            print(f"\n{idx}. {phase.upper()} position (ELO {elo})")
            print(f"   FEN: {fen[:50]}...")
            print(f"   Eval: {value:+.2f}")

        # Quality checks
        print(f"\nüìä QUALITY CHECKS")

        # Check for duplicates
        # Count every FEN that occurs more than once (a LIMIT on the grouped
        # query would cap the reported number).
        cursor.execute('SELECT COUNT(*) FROM (SELECT fen FROM positions GROUP BY fen HAVING COUNT(*) > 1)')
        duplicate_fens = cursor.fetchone()[0]
        if duplicate_fens:
            print(f"   ‚ö†Ô∏è Found {duplicate_fens:,} positions that occur more than once (normal, keep for frequency)")
        else:
            print(f"   ‚úÖ No duplicate positions")

        # Check FEN validity (first 100 rows only)
        cursor.execute('SELECT fen FROM positions LIMIT 100')
        invalid_count = 0
        for (fen,) in cursor.fetchall():
            try:
                chess.Board(fen)
            except ValueError:
                # Only a rejected FEN counts as invalid; anything else (such as
                # `chess` not being imported in this kernel) must surface.
                invalid_count += 1

        if invalid_count == 0:
            print(f"   ‚úÖ All sampled FENs are valid")
        else:
            print(f"   ‚ö†Ô∏è Found {invalid_count} invalid FENs in sample")
finally:
    conn.close()

# Database size
db_size_mb = os.path.getsize(DRIVE_DB) / (1024**2)
print(f"\nüíæ DATABASE SIZE: {db_size_mb:.2f} MB")

print("\n" + "=" * 60)
print("‚úÖ Data validation complete!")
print(f"üìÇ Database location: {DRIVE_DB}")
if total:
    print("\nüéâ Ready for model training!")

üîç Analyzing collected data...

üìä DATASET OVERVIEW
   Total positions: 10,000,097
   Average ELO: 2720
   ELO range: 2450 - 3147

üìä PHASE DISTRIBUTION
   Endgame : 2,136,637 ( 21.4%)
   Midgame : 7,863,460 ( 78.6%)

üìä VALUE DISTRIBUTION (Game Outcomes)
   Black Win : 3,490,665 ( 34.9%)
   Draw      : 2,926,376 ( 29.3%)
   White Win : 3,583,056 ( 35.8%)

üìä MATERIAL BALANCE
   Average imbalance: 1.25 pawns
   Tactical positions (‚â•3 imbalance): 1,612,047

üìä SAMPLE POSITIONS (Random 5)
------------------------------------------------------------

1. MIDGAME position (ELO 3046)
   FEN: r3kbnr/pp3ppp/2n1p3/2ppPb2/3P4/2N1BN2/PqP1BPPP/R2Q...
   Eval: +0.00

2. MIDGAME position (ELO 2712)
   FEN: 2k1r3/1pp2p2/p4Pp1/2p1Pb1p/5B1K/2P5/6PP/4R3 w - - ...
   Eval: -1.00

3. MIDGAME position (ELO 2546)
   FEN: 7r/2p5/3pBk2/P1pP4/2q5/3r1p1P/5P2/1Q4RK b - - 0 53...
   Eval: +1.00

4. MIDGAME position (ELO 3063)
   FEN: r1bqk2r/p4ppp/2pb4/n3p3/8/2NP1N2/PP1P1PPP/R1BQ1RK1...
   Eval: +0.

In [None]:
# Cell 7: Upload Database to Hugging Face
# ==============================================================================
# üöÄ CLOUD BACKUP & SHARING
# ==============================================================================

from huggingface_hub import HfApi

# Configuration
# The access token is taken from the HF_TOKEN environment variable rather than
# being typed into the notebook, where it would be saved and shared with the
# file. When the variable is unset, None is passed and huggingface_hub falls
# back to its own locally stored credentials.
HF_TOKEN = os.environ.get("HF_TOKEN") or None
HF_USERNAME = "GambitFlow"
REPO_ID = f"{HF_USERNAME}/Elite-Data"

api = HfApi(token=HF_TOKEN)

print(f"üöÄ Uploading to Hugging Face...")
print(f"üì¶ Repository: {REPO_ID}")
print(f"üìÇ File: match_positions_v2.db")
print(f"‚öñÔ∏è Size: {os.path.getsize(DRIVE_DB) / (1024**2):.2f} MB")
print("\n‚è≥ This may take 5-15 minutes depending on size...")
print("-" * 60)

try:
    # Upload file
    # NOTE(review): the commit message mentions "2500+ ELO, 2024 data", while
    # this notebook downloads 2025 archives and filters at MIN_ELO_FILTER = 2600
    # - confirm which description is right before publishing.
    api.upload_file(
        path_or_fileobj=DRIVE_DB,
        path_in_repo="match_positions_v2.db",
        repo_id=REPO_ID,
        repo_type="dataset",
        commit_message="Add match training positions (2500+ ELO, 2024 data)"
    )

    print("\n‚úÖ Upload successful!")
    print(f"üîó URL: https://huggingface.co/datasets/{REPO_ID}")
    print(f"üì• Direct download: https://huggingface.co/datasets/{REPO_ID}/resolve/main/match_positions_v2.db")

    print("\n" + "=" * 60)
    print("üéâ Match Data Collection Complete!")
    print("\nüìã Next Steps:")
    print("   1. ‚úÖ Opening Database (DONE)")
    print("   2. ‚úÖ Match Positions (DONE)")
    print("   3. ‚è≠Ô∏è Tactical Puzzles (Next)")
    print("   4. ‚è≠Ô∏è Endgame Positions")
    print("   5. ‚è≠Ô∏è Model Training")

except Exception as e:
    # Best effort: the database is still on Drive, so report and carry on.
    print(f"\n‚ùå Upload failed: {e}")
    print("\nTroubleshooting:")
    print("   1. Check HF_TOKEN is valid and has WRITE permission")
    print("   2. Ensure repository exists (create manually if needed)")
    print("   3. Check internet connection")
    print(f"   4. File is backed up locally: {DRIVE_DB}")

üöÄ Uploading to Hugging Face...
üì¶ Repository: GambitFlow/Elite-Data
üìÇ File: match_positions_v2.db
‚öñÔ∏è Size: 1177.25 MB

‚è≥ This may take 5-15 minutes depending on size...
------------------------------------------------------------


Processing Files (0 / 0)      : |          |  0.00B /  0.00B            

New Data Upload               : |          |  0.00B /  0.00B            

  ...ata/match_positions_v2.db:   1%|          | 8.30MB / 1.23GB            

No files have been modified since last commit. Skipping to prevent empty commit.



‚úÖ Upload successful!
üîó URL: https://huggingface.co/datasets/GambitFlow/Elite-Data
üì• Direct download: https://huggingface.co/datasets/GambitFlow/Elite-Data/resolve/main/match_positions_v2.db

üéâ Match Data Collection Complete!

üìã Next Steps:
   1. ‚úÖ Opening Database (DONE)
   2. ‚úÖ Match Positions (DONE)
   3. ‚è≠Ô∏è Tactical Puzzles (Next)
   4. ‚è≠Ô∏è Endgame Positions
   5. ‚è≠Ô∏è Model Training
