In [1]:
import os
import numpy as np 
import time
import torch
import torch.nn as nn 
import torch.optim as optim 
from torch.utils.data import DataLoader 
from chess import pgn 
from tqdm import tqdm 

from Entrada import create_input_for_nn, encode_moves
from Modelo import ChessModel
from Dataset import ChessDataset

In [2]:
def load_pgn(file_path):
    games = []
    with open(file_path, 'r') as pgn_file:
        while True:
            game = pgn.read_game(pgn_file)
            if game is None:
                break
            games.append(game)
    return games

files = [file for file in os.listdir("../data/pgn/") if file.endswith(".pgn")]
LIMIT_OF_FILES = min(len(files), 28)
games = []
i = 1

for file in tqdm(files):
    games.extend(load_pgn(f"../data/pgn/{file}"))
    if i >= LIMIT_OF_FILES:
        break
    i += 1

 34%|███▍      | 27/79 [02:14<04:18,  4.97s/it]


In [3]:
print(f"GAMES PARSED: {len(games)}")

GAMES PARSED: 41570


In [4]:
# Funciones auxiliares
def process_batch(games_batch):
    X_batch, y_batch = create_input_for_nn(games_batch)
    y_batch, move_to_int = encode_moves(y_batch)
    return torch.tensor(X_batch, dtype=torch.float32), torch.tensor(y_batch, dtype=torch.long), move_to_int

def load_games_in_batches(batch_size, total_games):
    for i in range(0, total_games, batch_size):
        yield games[i:i + batch_size]

In [5]:
def save_checkpoint(model, optimizer, epoch, batch, loss, filename):
    torch.save({
        'epoch': epoch,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'loss': loss,
        'batch': batch
    }, filename)

def load_checkpoint(model, optimizer, filename):
    checkpoint = torch.load(filename)
    model.load_state_dict(checkpoint['model_state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    return checkpoint['epoch'], checkpoint['batch'], checkpoint['loss']



In [6]:
# Configuración
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f'Using device: {device}')

Using device: cuda


##CONVIRTIENDO LOS DATOS EN TENSORES 

In [7]:
batch_size = 1000  # Reducido para menor uso de memoria
total_games = len(games)
num_epochs = 3
checkpoint_interval = 5000  # Guardar cada 5000 batches
checkpoint_dir = 'checkpoints'
os.makedirs(checkpoint_dir, exist_ok=True)

In [8]:
sample_games = games[:batch_size]
_, y_sample, move_to_int = process_batch(sample_games)
num_classes = len(move_to_int)

In [9]:
model = ChessModel(num_classes=num_classes).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.0001)

In [10]:
# Cargar el último checkpoint si existe
last_checkpoint = max([f for f in os.listdir(checkpoint_dir) if f.startswith('checkpoint_')], default=None)
start_epoch = 0
start_batch = 0
if last_checkpoint:
    start_epoch, start_batch, _ = load_checkpoint(model, optimizer, os.path.join(checkpoint_dir, last_checkpoint))
    print(f"Resuming from checkpoint: {last_checkpoint}")

In [11]:
# Entrenamiento
for epoch in range(start_epoch, num_epochs):
    start_time = time.time()
    model.train()
    running_loss = 0.0
    batch_count = 0
    
    for games_batch in load_games_in_batches(batch_size, total_games):
        if epoch == start_epoch and batch_count < start_batch:
            batch_count += 1
            continue
        
        X_batch, y_batch, _ = process_batch(games_batch)
        
        dataset = ChessDataset(X_batch, y_batch)
        dataloader = DataLoader(dataset, batch_size=64, shuffle=True)

        for inputs, labels in dataloader:
            inputs, labels = inputs.to(device), labels.to(device)
            optimizer.zero_grad()

            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()

            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

            optimizer.step()
            running_loss += loss.item()

        batch_count += 1
        
        if batch_count % checkpoint_interval == 0:
            checkpoint_filename = os.path.join(checkpoint_dir, f'checkpoint_epoch{epoch}_batch{batch_count}.pth')
            save_checkpoint(model, optimizer, epoch, batch_count, running_loss / batch_count, checkpoint_filename)
            print(f"Checkpoint saved: {checkpoint_filename}")
        
        # Liberar memoria
        del X_batch, y_batch, dataset, dataloader
        torch.cuda.empty_cache()

    end_time = time.time()
    epoch_time = end_time - start_time
    minutes, seconds = divmod(epoch_time, 60)
    print(f'Epoch {epoch + 1}/{num_epochs}, Loss: {running_loss / batch_count:.4f}, Time: {int(minutes)}m{int(seconds)}s')

RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
