In [1]:
import numpy as np
import pandas as pd
import chess.pgn

In [2]:
def clean_header_value(value):
    """Normalise a PGN header value, mapping "unknown" markers to None.

    Args:
        value: The raw header value, or None when the header is absent.

    Returns:
        None if ``value`` is None or the literal string "?"; otherwise
        ``value`` unchanged.
    """
    return None if value is None or value == "?" else value

def _parse_elo(raw):
    """Convert a raw Elo header string to int, or None if it is not a plain number."""
    # isdecimal() (unlike isdigit()) only accepts characters that int() can
    # parse, so odd input such as "²" yields None instead of a ValueError.
    if raw and raw.isdecimal():
        return int(raw)
    return None


def extract_game_data(game):
    """Flatten one parsed PGN game into a dict of summary fields.

    The mainline is replayed on a fresh board to collect the SAN of every ply
    and to record whether each side castled; the result is combined with the
    cleaned header values.

    Args:
        game: A parsed game exposing ``board()``, ``mainline_moves()`` and
            ``headers.get(name)`` (as produced by ``chess.pgn.read_game``).

    Returns:
        dict: Header fields (with "?" or missing values as None), ``WhiteElo``
        / ``BlackElo`` as int or None, ``NumMoves`` (number of plies),
        ``NumTurns`` (number of full moves, counting an unanswered final White
        move), ``WhiteCastled`` / ``BlackCastled`` flags, ``LastMove`` (SAN of
        the final ply or None) and ``MovesSAN`` (space-joined SAN).
    """
    board = game.board()
    moves_san = []
    white_castled = False
    black_castled = False

    for move in game.mainline_moves():
        # SAN depends on the position *before* the move, so compute it first.
        moves_san.append(board.san(move))
        # Ask the board instead of matching SAN text: a castle that gives
        # check or mate is written "O-O+" / "O-O-O#" and would be missed by
        # an exact comparison against "O-O" / "O-O-O".
        if board.is_castling(move):
            # board.turn is still the side making this move (not yet pushed).
            if board.turn == chess.WHITE:
                white_castled = True
            else:
                black_castled = True
        board.push(move)

    headers = game.headers

    def header(name):
        # Missing headers and the "?" placeholder both become None.
        return clean_header_value(headers.get(name))

    num_plies = len(moves_san)

    return {
        "GameId": header("GameId"),
        "Site": header("Site"),
        "Date": header("Date"),
        "UTCDate": header("UTCDate"),
        "UTCTime": header("UTCTime"),
        "White": header("White"),
        "Black": header("Black"),
        "WhiteElo": _parse_elo(headers.get("WhiteElo", None)),
        "BlackElo": _parse_elo(headers.get("BlackElo", None)),
        "Result": header("Result"),
        "Variant": header("Variant"),
        "TimeControl": header("TimeControl"),
        "ECO": header("ECO"),
        "Opening": header("Opening"),
        "Termination": header("Termination"),
        "NumMoves": num_plies,
        # Round up so a game that ends on White's move still counts that turn
        # (e.g. 59 plies is a 30-move game, not 29).
        "NumTurns": (num_plies + 1) // 2,
        "WhiteCastled": white_castled,
        "BlackCastled": black_castled,
        "LastMove": moves_san[-1] if moves_san else None,
        "MovesSAN": " ".join(moves_san)
    }


In [3]:
def process_pgn_file(file_path, output_excel="lichess_games_summary.xlsx"):
    """Read every game in a PGN file and write a per-game summary to Excel.

    Args:
        file_path: Path of the PGN file, opened as UTF-8 text.
        output_excel: Destination path for the spreadsheet.

    Games for which ``extract_game_data`` raises are reported on stdout and
    left out of the spreadsheet. Nothing is returned; a completion message
    with the number of saved games is printed.
    """
    records = []
    with open(file_path, encoding="utf-8") as handle:
        # iter(callable, sentinel) keeps calling read_game until it returns
        # None, which is how the original loop detected the end of the file.
        for parsed_game in iter(lambda: chess.pgn.read_game(handle), None):
            try:
                records.append(extract_game_data(parsed_game))
            except Exception as exc:
                print(f"Error processing game: {exc}")

    summary = pd.DataFrame(records)
    summary.to_excel(output_excel, index=False)
    print(f"Done! {len(summary)} games saved to '{output_excel}'")

In [4]:
# Summarise the exported PGN. No output path is passed, so the spreadsheet is
# written to the function's default, "lichess_games_summary.xlsx".
process_pgn_file("data/lichess_nishiv_chess.pgn")

Done! 10999 games saved to 'lichess_games_summary.xlsx'
