In [8]:
import chess.pgn
import os
import pandas as pd

def preprocess_lichess_pgn(pgn_path, output_csv="lichess_processed.csv", max_games=100000):
    """Parse a Lichess PGN file into a flat table and save it as CSV.

    Games are read one after another from ``pgn_path`` until ``max_games``
    games have been collected or ``chess.pgn.read_game`` returns ``None``.
    For every game a fixed set of PGN header tags plus the main line in SAN
    notation is recorded.

    Parameters
    ----------
    pgn_path : str or path-like
        PGN file to read; opened as UTF-8 text.
    output_csv : str, default "lichess_processed.csv"
        Path of the CSV file to write (the DataFrame index is not written).
    max_games : int or None, default 100000
        Maximum number of games to read. ``None`` removes the limit and
        reads until the parser reports no further game.

    Returns
    -------
    pandas.DataFrame
        One row per game with the columns ``event``, ``site``, ``white``,
        ``white_elo``, ``black``, ``black_elo``, ``result``, ``termination``,
        ``time_control``, ``opening`` and ``moves``. Header values are kept
        as returned by ``game.headers.get`` (``""`` when a tag is missing).
    """
    # Output column name -> PGN header tag it is taken from.
    header_columns = {
        "event": "Event",
        "site": "Site",
        "white": "White",
        "white_elo": "WhiteElo",
        "black": "Black",
        "black_elo": "BlackElo",
        "result": "Result",
        "termination": "Termination",
        "time_control": "TimeControl",
        "opening": "Opening",
    }
    columns = [*header_columns, "moves"]

    games_data = []

    with open(pgn_path, encoding="utf-8") as pgn_file:
        # ``max_games=None`` means "no cap"; otherwise stop once the cap is hit.
        while max_games is None or len(games_data) < max_games:
            game = chess.pgn.read_game(pgn_file)
            if game is None:
                # The parser has no further game to return.
                break

            game_data = {
                column: game.headers.get(tag, "")
                for column, tag in header_columns.items()
            }
            game_data["moves"] = game.board().variation_san(game.mainline_moves())
            games_data.append(game_data)

    # Passing the columns explicitly keeps the schema stable even when no
    # game was read, so downstream column lookups do not fail.
    df = pd.DataFrame(games_data, columns=columns)
    df.to_csv(output_csv, index=False)
    print(f"{len(df)} games saved to {output_csv}")
    return df

# Parse the June 2013 Lichess rated-games dump (default cap and output file).
df_clean = preprocess_lichess_pgn(
    pgn_path="./data/lichess_db_standard_rated_2013-06.pgn",
)


100000 games saved to lichess_processed.csv


In [14]:
# Count games per White player and keep the first five entries of the
# frequency-ordered result.
top_5_white = df_clean.loc[:, "white"].value_counts().iloc[:5]
print(top_5_white)

def filter_top_players(df, players):
    """Return the games in which any of ``players`` took part, as White or Black.

    Parameters
    ----------
    df : pandas.DataFrame
        Game table with ``"white"`` and ``"black"`` columns.
    players : str or iterable of str
        Player name(s) to keep. A single string is treated as one name.
        Any other iterable is copied into a list first, so one-shot
        iterables (generators, iterators) are checked against both colours.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` whose ``white`` or ``black`` value is in
        ``players``; the original index labels are kept.
    """
    if isinstance(players, str):
        # A bare string is one player name, not a sequence of characters.
        players = [players]
    else:
        # ``players`` is used for two membership checks; materialise it once
        # so an iterator is not exhausted by the first check.
        players = list(players)

    played_white = df["white"].isin(players)
    played_black = df["black"].isin(players)
    return df[played_white | played_black]

# Restrict the table to games involving the two hand-picked players and
# persist that subset to its own CSV file.
filtered_df = filter_top_players(
    df=df_clean,
    players=["fil77", "pink_overdoze"],
)
filtered_df.to_csv(path_or_buf="lichess_preprocessed.csv", index=False)
print("Data saved to lichess_preprocessed.csv")


white
fil77            644
pink_overdoze    438
amir51           436
Paulus           407
tomi36           399
Name: count, dtype: int64
Data saved to lichess_preprocessed.csv


In [15]:
# Preview the first five rows of the filtered games.
filtered_df.iloc[:5]

Unnamed: 0,event,site,white,white_elo,black,black_elo,result,termination,time_control,opening,moves
1873,Rated Bullet game,https://lichess.org/rzhm0tmn,fil77,1401,PierreDaChello,1515,0-1,Normal,60+0,Horwitz Defense,1. d4 e6 2. f3 d6 3. h4 b6 4. g4 Bb7 5. e4 Nd7...
1890,Rated Bullet game,https://lichess.org/mj65f4y8,fil77,1393,ivanbruno,1195,1-0,Time forfeit,60+0,Queen's Pawn Game,1. d4 d5 2. f3 e6 3. h4 a6 4. e4 h6 5. e5 Nc6 ...
1906,Rated Bullet game,https://lichess.org/shvts5u8,fil77,1399,ivanbruno,1188,0-1,Normal,60+0,Queen's Pawn,1. d4 d6 2. f3 e5 3. e4 exd4 4. Qxd4 Nf6 5. c3...
1921,Rated Bullet game,https://lichess.org/km2aynp4,fil77,1382,ivanbruno,1209,1-0,Normal,120+0,Queen's Pawn,1. d4 d6 2. f3 e5 3. e4 exd4 4. Qxd4 Nc6 5. Qd...
2247,Rated Bullet game,https://lichess.org/kc8qnfhx,fil77,1388,ivanbruno,1202,1-0,Time forfeit,120+0,Queen's Pawn,1. d4 d6 2. f3 h6 3. h4 Nf6 4. g4 d5 5. g5 hxg...


In [None]:
# Turn every filtered game into a one-paragraph English summary built from
# its event, players, opening, ratings, result and termination fields.
descriptions = [
    (
        f"In a {row['event']} on Lichess, {row['white']} played against {row['black']} "
        f"with the {row['opening']} opening. Ratings were {row['white_elo']} (White) vs {row['black_elo']} (Black). "
        f"The result was {row['result']} by {row['termination']}."
    )
    for _, row in filtered_df.iterrows()
]

# Show the first five summaries as a sanity check.
for summary in descriptions[:5]:
    print(summary)

In a Rated Bullet game on Lichess, fil77 played against PierreDaChello with the Horwitz Defense opening. Ratings were 1401 (White) vs 1515 (Black). The result was 0-1 by Normal.
In a Rated Bullet game on Lichess, fil77 played against ivanbruno with the Queen's Pawn Game opening. Ratings were 1393 (White) vs 1195 (Black). The result was 1-0 by Time forfeit.
In a Rated Bullet game on Lichess, fil77 played against ivanbruno with the Queen's Pawn opening. Ratings were 1399 (White) vs 1188 (Black). The result was 0-1 by Normal.
In a Rated Bullet game on Lichess, fil77 played against ivanbruno with the Queen's Pawn opening. Ratings were 1382 (White) vs 1209 (Black). The result was 1-0 by Normal.
In a Rated Bullet game on Lichess, fil77 played against ivanbruno with the Queen's Pawn opening. Ratings were 1388 (White) vs 1202 (Black). The result was 1-0 by Time forfeit.


In [4]:
# Preview the first five rows of the full parsed table.
df_clean.iloc[:5]

Unnamed: 0,event,site,white,white_elo,black,black_elo,result,termination,time_control,opening,moves
0,Rated Bullet game,https://lichess.org/in28emmw,Kazuma,1756,kikeillana,1684,1-0,Normal,60+0,King's Indian Attack: Keres Variation #2,1. Nf3 d5 2. g3 Bg4 3. Bg2 Bxf3 4. Bxf3 e6 5. ...
1,Rated Bullet game,https://lichess.org/e174t8h7,Aceves,1487,calculus,1568,0-1,Time forfeit,60+1,Queen's Pawn Game #3,1. d4 d5 2. e3 Nf6 3. c3 Bg4 4. Qc2 e6 5. Bd3 ...
2,Rated Blitz game,https://lichess.org/d4ui60z6,melro,1144,patrimpas,1912,0-1,Normal,240+0,Sicilian Defense: Staunton-Cochrane Variation,1. e4 c5 2. c4 Nc6 3. d3 g6 4. Bd2 Bg7 5. Bc3 ...
3,Rated Blitz game,https://lichess.org/gx3qb4ur,Igortroufignofski,1155,Ndrina,1096,0-1,Normal,300+3,King's Indian Attack,1. Nf3 d5 2. g3 Nc6 3. Bg2 Bg4 4. O-O Bxf3 5. ...
4,Rated Blitz tournament https://lichess.org/tou...,https://lichess.org/kxhslrt4,panickat,1258,prob4,1459,0-1,Normal,300+2,Sicilian Defense: Bowdler Attack,1. e4 c5 2. Bc4 e6 3. h4 Nc6 4. Nf3 h6 5. g4 d...
