In [1]:
import tensorflow as tf
import chess.pgn
import pandas as pd
import numpy as np
import os
import time
import multiprocessing as mp
import random
import io
import re
import dask.dataframe as dd
import glob
from pathlib import Path
    
from tqdm.notebook import tqdm

2025-10-09 19:36:45.136373: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1760060205.225296     744 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1760060205.251909     744 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-10-09 19:36:45.463231: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Restrict TensorFlow to the first physical GPU when at least one is present.
gpus = tf.config.list_physical_devices('GPU')

try:
    if gpus:
        # A single call is enough: `tf.config.experimental.set_visible_devices`
        # is an alias of this function, so calling both was redundant.
        tf.config.set_visible_devices(gpus[0], 'GPU')
        print('Now using gpu')
    else:
        print('Unable to use the gpu')
except RuntimeError as e:
    # NOTE(review): per the TensorFlow docs this is raised when the runtime has
    # already been initialized; device visibility must be set before that.
    print(f'Something went wrong: {e}')


Now using gpu


In [3]:
# Point value assigned to each piece type when computing the material balance
# in `material_score`. The king has no entry, so it never contributes.
piece_values = {
    chess.PAWN: 1,
    chess.KNIGHT: 3,
    chess.BISHOP: 3,
    chess.ROOK: 5,
    chess.QUEEN: 9,
}


def material_score(board):
    """Return the material balance of ``board``.

    Each piece type listed in ``piece_values`` contributes its value times the
    difference between the number of white and black pieces of that type.

    Returns:
        An int: positive when White has more material, negative when Black does.
    """
    return sum(
        weight * (
            len(board.pieces(kind, chess.WHITE))
            - len(board.pieces(kind, chess.BLACK))
        )
        for kind, weight in piece_values.items()
    )


def game_pipeline_from_text(game_tuple):
    """Expand one PGN game into a list of per-ply feature rows.

    Args:
        game_tuple: Sequence whose first item is the game id and whose second
            item is the PGN text of a single game.

    Returns:
        A list with one dict per mainline move (position before the move, the
        move itself, flags describing it, the material balance after it, and
        game-level fields repeated on every row), or ``None`` when no game can
        be read from the text or the game has no mainline moves.
    """
    game_id = game_tuple[0]
    game = chess.pgn.read_game(io.StringIO(game_tuple[1]))
    if game is None:
        return None

    # --- Game-level fields taken from the PGN headers ---
    termination = game.headers.get('Termination', '').lower()
    result = game.headers.get("Result", "*")
    outcome = 1 if result == "1-0" else -1 if result == "0-1" else 0

    # 'time' already matches 'time forfeit', so one substring test suffices.
    is_loss_by_time = 1 if 'time' in termination else 0
    is_loss_by_resign = 1 if "resign" in termination or "abandon" in termination else 0

    # --- General ---
    moves = list(game.mainline_moves())
    total_moves = len(moves)

    # Material balance just before the final move; only computed for games
    # flagged as lost on time, otherwise left at 0.
    pre_loss_score = 0
    if is_loss_by_time and total_moves > 0:
        temp_board = game.board()
        for move in moves[:-1]:
            temp_board.push(move)
        pre_loss_score = material_score(temp_board)

    board = game.board()
    game_list = []

    for ply, move in enumerate(moves, start=1):  # ply = half-move counter
        # Everything below up to `board.push` describes the position BEFORE
        # the move is played.
        fen = board.fen()
        move_uci = move.uci()
        move_san = board.san(move)
        player = "white" if board.turn == chess.WHITE else "black"

        # --- Enrichment ---
        piece = board.piece_at(move.from_square)
        piece_moved = piece.piece_type if piece else None
        is_castling = board.is_castling(move)
        is_en_passant = board.is_en_passant(move)

        is_capture = 0
        captured_piece = 0
        if board.is_capture(move):
            is_capture = 1
            if is_en_passant:
                # The captured pawn is not on the destination square, so the
                # lookup below would find nothing for en passant captures.
                captured_piece = chess.PAWN
            else:
                captured = board.piece_at(move.to_square)
                if captured:
                    captured_piece = captured.piece_type

        # Apply the move to inspect the resulting position.
        board.push(move)
        is_check = 1 if board.is_check() else 0
        is_checkmate = 1 if board.is_checkmate() else 0
        mat_score = material_score(board)

        game_list.append({
            "game_id": game_id,
            "ply": ply,
            "fen": fen,
            "move_uci": move_uci,
            "move_san": move_san,
            "player": player,
            "piece_moved": piece_moved,
            "is_capture": is_capture,
            "captured_piece": captured_piece,
            "is_check": is_check,
            "is_checkmate": is_checkmate,
            "is_castling": int(is_castling),
            "is_en_passant": int(is_en_passant),
            "is_promotion": 1 if move.promotion else 0,
            "promotion_piece": move.promotion if move.promotion else 0,
            "material_score": mat_score,
            "result": outcome,
            "game_moves": total_moves,
            "is_mate_win": 1 if "#" in move_san else 0,
            'termination': termination,
            'is_loss_by_time': is_loss_by_time,
            # Previously this column was filled with `is_loss_by_time`.
            'is_resignation': is_loss_by_resign,
            'pre_loss_score': pre_loss_score,
        })

    return game_list if game_list else None


In [4]:

def get_completed_chunk_ids(path=None):
    """Return the chunk ids found in the names of the entries of ``path``.

    The id of an entry is the first run of digits in its name. Entries whose
    name contains no digits (for example hidden files or checkpoint folders)
    are ignored instead of raising ``TypeError``.

    Args:
        path: Directory to scan; ``None`` is passed through to ``os.listdir``,
            which then lists the current working directory.

    Returns:
        A list of ints, in the order ``os.listdir`` yields the entries.
    """
    completed_chunks_ids = []
    for entry in os.listdir(path):
        match = re.search(r'\d+', entry)
        if match is not None:
            completed_chunks_ids.append(int(match[0]))
    return completed_chunks_ids

def process_and_save_chunk(games_chunk, chunk_idx, path=None):
    """Run the per-game pipeline over a chunk and write all rows to one parquet file.

    Args:
        games_chunk: Iterable of items accepted by ``game_pipeline_from_text``.
        chunk_idx: Number embedded in the output file name.
        path: Directory object supporting the ``/`` operator; the file is
            written to ``path / 'parquet_<chunk_idx>.parquet'``.
    """
    # Keep only games for which the pipeline returned a non-empty result.
    per_game_rows = [
        rows for rows in map(game_pipeline_from_text, games_chunk) if rows
    ]

    # Flatten the per-game lists into a single list of row dicts.
    all_rows = []
    for rows in per_game_rows:
        all_rows.extend(rows)

    frame = pd.DataFrame(all_rows)
    frame.to_parquet(path / f'parquet_{chunk_idx}.parquet', index=False)
    print(f"Saved chunk {chunk_idx} with {len(per_game_rows)} valid games")

In [5]:
# PGN file handed to `run_pipeline` as `source_file`; the relative path is
# resolved against the current working directory.
file = '../data/raw/lichess_db_standard_rated_2016-03.pgn'

In [6]:
# Output directory for the per-chunk parquet files (passed to `run_pipeline`
# as `save_dir`), located under the parent of the current working directory.
parquet_dir = Path.cwd().parent / 'data' / 'enriched_parquets' 

In [7]:
def run_pipeline(source_file = None,save_dir=None):
    """Stream a PGN file, enrich its games in fixed-size chunks and save each chunk.

    Games are numbered in file order; chunk ``k`` holds games
    ``k * chunk_size`` to ``(k + 1) * chunk_size - 1``. Chunks whose id is
    already present in ``save_dir`` are skipped, so an interrupted run can be
    resumed by calling the function again.

    Args:
        source_file: Path of the PGN file. When ``None`` the function returns
            right after creating ``save_dir``.
        save_dir: Directory (``str`` or ``Path``) receiving the parquet files;
            created if missing.
    """
    os.makedirs(save_dir,exist_ok=True)
    if source_file is None:
        return

    # `process_and_save_chunk` joins the file name with `/`, so a plain string
    # directory has to be converted first.
    save_dir = Path(save_dir)

    chunk_size = 10_000
    game_idx = 0
    games_chunk = []
    completed_chunks_ids = set(get_completed_chunk_ids(path=save_dir))

    # First pass: count the games so the progress bar has a real total.
    # `skip_game` advances past one game without building it and returns False
    # at end of file.
    total_games = 0
    with open(source_file) as f:
        while chess.pgn.skip_game(f):
            total_games += 1

    # Second pass: parse and enrich. The context manager closes the progress
    # bar even when the run is interrupted.
    with open(source_file) as f, tqdm(total=total_games, desc="Processing games") as pbar:
        while True:
            current_chunk_idx = game_idx // chunk_size
            if current_chunk_idx in completed_chunks_ids:
                print(f'Skipping chunk {current_chunk_idx}')
                skipped = 0
                while skipped < chunk_size and chess.pgn.skip_game(f):
                    skipped += 1
                game_idx += skipped
                pbar.update(skipped)
                if skipped < chunk_size:
                    # End of file reached inside an already-completed chunk;
                    # looping again would re-enter this branch forever.
                    break
                continue

            game = chess.pgn.read_game(f)
            if game is None:
                break
            games_chunk.append((game_idx,str(game)))
            game_idx += 1
            pbar.update(1)

            if len(games_chunk) >= chunk_size:
                print(f'Processing chunk {current_chunk_idx} ')
                process_and_save_chunk(games_chunk, current_chunk_idx,save_dir)
                games_chunk = []

        # Process the last, partially filled chunk.
        if games_chunk:
            current_chunk_idx = game_idx // chunk_size
            print(f"Processing final chunk {current_chunk_idx}...")
            process_and_save_chunk(games_chunk, current_chunk_idx,save_dir)


In [8]:
# Run the enrichment over the PGN file; chunks already present in
# `parquet_dir` are skipped, so this cell can be re-run after an interruption.
run_pipeline(file,parquet_dir)

Processing games: 0it [00:00, ?it/s]

Processing chunk 0 
Saved chunk 0 with 9976 valid games



KeyboardInterrupt

