In [1]:
import pandas as pd
import numpy as np
import os
import re
import glob
import tensorflow as tf
import dask.dataframe as dd
import torch
import chess

2025-06-13 14:20:09.697519: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1749846009.714801     574 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1749846009.720088     574 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-06-13 14:20:09.741479: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# glob.glob returns matches in arbitrary, filesystem-dependent order; sorting them
# keeps parquets[0] pointing at the same file on every run.
parquets = sorted(glob.glob('../data/processed/*.parquet'))

In [3]:
# Load only the first parquet file of the list into a pandas DataFrame.
df = pd.read_parquet(parquets[0])

In [4]:
# one_million_games = pd.concat([pd.read_parquet(parquet) for parquet in parquets])

In [5]:
# Scratch check: Move.uci() returns a plain str (see the output below).
type(chess.Move(32,42).uci())

str

In [6]:
def generate_full_uci_move_vocabulary() -> tuple[dict,dict]:
    """
    Build the global move vocabulary for a 64-square chess board.

    Every ordered pair of distinct squares contributes its plain UCI string
    (e.g. 'e2e4'). In addition, every pair that goes from rank index 1 to rank
    index 0, or from rank index 6 to rank index 7, contributes the four promotion
    variants (queen, rook, bishop, knight). No geometric filtering is applied, so
    the vocabulary is a superset of the moves that can occur in a real game.

    Returns
    -------
        uci_to_idx : dict
                     UCI move strings as keys, their index in the sorted vocabulary as values
        idx_to_uci : dict
                     The inverse mapping: indices as keys, UCI move strings as values

    """
    promotion_pieces = (chess.QUEEN, chess.ROOK, chess.BISHOP, chess.KNIGHT)
    # (origin rank, destination rank) pairs that trigger promotion entries (0-based ranks).
    promotion_rank_steps = {(1, 0), (6, 7)}

    all_uci = set()
    for origin in chess.SQUARES:
        origin_rank = chess.square_rank(origin)  # row the piece starts from
        for target in chess.SQUARES:
            if origin != target:
                all_uci.add(chess.Move(origin, target).uci())

                # possible promotions
                if (origin_rank, chess.square_rank(target)) in promotion_rank_steps:
                    all_uci.update(
                        chess.Move(origin, target, promotion=piece).uci()
                        for piece in promotion_pieces
                    )

    # Sorting makes the index assignment deterministic across runs.
    ordered = sorted(all_uci)
    uci_to_idx = dict(zip(ordered, range(len(ordered))))
    idx_to_uci = dict(enumerate(ordered))
    return uci_to_idx, idx_to_uci

def fen_to_tensor(fen:str) -> torch.Tensor:
    """
    Encode the piece placement of a FEN position as a float32 tensor of shape (12, 8, 8).

    There is one 8x8 plane per piece symbol, in the order 'PNBRQKpnbrqk'
    (uppercase symbols first, then lowercase). A cell holds 1.0 where that piece
    stands and 0.0 elsewhere. The row is 7 - (square // 8) and the column is
    square % 8, so the highest-numbered squares land in row 0.

    Parameters
    ----------
    fen : str
          The FEN string to convert

    Returns
    -------
    board_tensor : torch.Tensor
                   One-hot piece planes for the position

    """
    # Plane index of every piece symbol.
    plane_of = {symbol: plane for plane, symbol in enumerate('PNBRQKpnbrqk')}

    # TODO: add extra channels for castling availability (4 channels), en passant square, halfmove clock

    board_tensor = torch.zeros((12, 8, 8), dtype=torch.float32)
    # piece_map() only yields occupied squares, so no emptiness check is needed.
    for square, piece in chess.Board(fen).piece_map().items():
        rank_index, file_index = divmod(square, 8)
        board_tensor[plane_of[piece.symbol()], 7 - rank_index, file_index] = 1.0
    return board_tensor


def get_legal_moves_vocab(fen:str) -> tuple[dict[str,int],dict[int,str]]:
    """
    Build a position-specific vocabulary of the legal moves available in `fen`.

    IMPORTANT: the indices are LOCAL to this position and generally do not match
    the global vocabulary produced by generate_full_uci_move_vocabulary().

    Parameters
    ----------
        fen: FEN notation of the current position
    Returns
    -------
        uci_to_idx: Dict {uci_move : idx}
        idx_to_uci: Dict {idx : uci_move}
    """
    board = chess.Board(fen)
    # Sorting the UCI strings gives a deterministic index assignment.
    legal_ucis = sorted(move.uci() for move in board.legal_moves)

    uci_to_idx = {uci: idx for idx, uci in enumerate(legal_ucis)}
    idx_to_uci = dict(enumerate(legal_ucis))
    return uci_to_idx, idx_to_uci

# POSSIBLY DISCARDED: the alternative is the "sparse" approach of get_legal_moves_vocab.
# def get_legal_mask(board: chess.Board, uci_to_index: dict) -> torch.Tensor:
#     mask = torch.zeros(len(uci_to_index), dtype=torch.float32)
#     for move in board.legal_moves:
#         uci = move.uci()
#         if uci in uci_to_index:
#             mask[uci_to_index[uci]] = 1.0
#     return mask  # Shape: (n_moves,)
# Global vocabulary helpers. Both lookups read the module-level dictionaries
# `uci_to_index` / `index_to_uci`; when no cell has defined them yet they are
# built once, on first use, from generate_full_uci_move_vocabulary().

def _global_move_vocabulary() -> tuple[dict, dict]:
    """Return the global (uci_to_index, index_to_uci) pair, creating it on first use."""
    namespace = globals()
    if "uci_to_index" not in namespace or "index_to_uci" not in namespace:
        forward, backward = generate_full_uci_move_vocabulary()
        # setdefault keeps any dictionary that was already defined by the user.
        namespace.setdefault("uci_to_index", forward)
        namespace.setdefault("index_to_uci", backward)
    return namespace["uci_to_index"], namespace["index_to_uci"]

def move_to_index(uci_move: str) -> int:
    """Return the global vocabulary index of `uci_move`, or -1 if it is not in the vocabulary."""
    return _global_move_vocabulary()[0].get(uci_move, -1)  # -1 when missing

def index_to_move(idx: int) -> str:
    """Return the UCI string stored at `idx`, or the dummy "0000" if the index is unknown."""
    return _global_move_vocabulary()[1].get(idx, "0000")  # dummy fallback, just in case

        

In [7]:
class ChessSequenceDataset(torch.utils.data.Dataset):
    """
    Dataset of chess games; each item is one game stored as a list of
    (board tensor, move index) pairs in playing order.

    Parameters
    ----------
    df : pandas.DataFrame
         One row per move, with the columns 'game_id', 'fen', 'uci' and `sort_col`
    uci_to_idx : dict
         Vocabulary mapping UCI move strings to integer indices
    sort_col : str, default 'pyl'
         Column used to order the moves inside each game.
         NOTE(review): 'pyl' looks like a typo for 'ply' (a later cell creates a
         'ply' column) - confirm the real column name before changing the default.
    """

    def __init__(self, df, uci_to_idx, sort_col='pyl'):
        self.games = []
        self.uci_to_idx = uci_to_idx

        for _, game_rows in df.groupby('game_id'):
            ordered = game_rows.sort_values(by=sort_col, ascending=True)
            sequence = []

            # Iterating the two columns directly avoids the per-row Series that iterrows() builds.
            for fen, uci in zip(ordered['fen'], ordered['uci']):
                # Look the move up in the vocabulary passed in, not in a notebook global.
                move_idx = self.uci_to_idx.get(uci, -1)
                if move_idx == -1:
                    continue  # move outside the vocabulary: skip this position
                sequence.append((fen_to_tensor(fen), move_idx))

            # Games left without any known move are not stored.
            if sequence:
                self.games.append(sequence)

    def __len__(self):
        # Number of stored games (the DataFrame itself is not kept on the instance).
        return len(self.games)

    def __getitem__(self, idx):
        return self.games[idx]
        
    

In [21]:
# Exclude game 150015 with a boolean mask: dropping by index label would also
# remove unrelated rows whenever the index contains duplicate labels.
# NOTE(review): the reason for excluding this game is not recorded - document it.
df = df[df['game_id'] != 150015]
unique_ids = df['game_id'].unique()

# A seeded generator makes the sample reproducible; capping the size prevents
# choice(replace=False) from raising when fewer than 10_000 games are available.
rng = np.random.default_rng(42)
sample_ids = rng.choice(unique_ids, size=min(10_000, len(unique_ids)), replace=False)
df_subset = df[df['game_id'].isin(sample_ids)].reset_index(drop=True)


In [83]:
# Regex capturing the run of digits at the very end of a string; for a FEN this
# is the last field (the fullmove number).
pattern = r'(\d+$)'

In [102]:
# Independent copy of the first 3000 rows, so adding columns to it does not modify df.
test= df.head(3000).copy()

In [103]:
# Peek at the value in the first row and first column (a FEN string, per the output below).
test.iloc[0,0]

'rnbqkbnr/pppppppp/8/8/8/8/PPPPPPPP/RNBQKBNR w KQkq - 0 1'

In [112]:
# Preview the trailing digits extracted from the first FEN strings.
test['fen'].str.extract(r'(\d+$)').head()

Unnamed: 0,0
0,1
1,1
2,2
3,2
4,3


In [114]:
# The trailing digits of a FEN string are its fullmove number. expand=False makes
# str.extract return a Series (not a one-column DataFrame), which is the natural
# value to assign to a single column.
test['full_move'] = test['fen'].str.extract(r'(\d+$)', expand=False).astype(int)

In [115]:
# 1 where the 'player' column equals 'white', 0 otherwise.
is_white = (test['player'] == 'white').astype(int)
# Zero-based ply: two plies per completed full move, plus an offset for the side.
ply = (test['full_move'] - 1) * 2 + (1 - is_white)  # offset: white = 0, black = 1


In [116]:
# Store the computed ply values as a new column of the working copy.
test['ply'] = ply