In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from fenpreprocessing import puzzle_cleaning, make_converted_file
from pathlib import Path

# Split Data and advance by one ply

Each puzzle is presented to the solver after the first move in its move list has been played, so we add two columns: the FEN one ply after the starting position, and the target move to be found from that position.

In [2]:
# Column names for the raw Lichess puzzle dump. Supplying `names=` without a
# `header=` argument makes pandas read the first line of the file as data.
# NOTE(review): this assumes a header-less dump with exactly these nine
# columns -- confirm against the version of the file being loaded.
puzzle_columns = [
    'PuzzleId',
    'FEN',
    'Moves',
    'Rating',
    'RatingDeviation',
    'Popularity',
    'NbPlays',
    'Themes',
    'GameUrl',
]
data = pd.read_csv('lichess_db_puzzle.csv', names=puzzle_columns)

In [3]:
# One seed for both splits so the partitions are reproducible.
split_seed = 89252

# The set is large, so keep 90% for training and hold out the remaining 10%.
train, almost_test = train_test_split(
    data,
    test_size=0.1,
    random_state=split_seed,
)

# Divide the held-out 10% evenly into validation and test sets.
val, test = train_test_split(
    almost_test,
    test_size=0.5,
    random_state=split_seed,
)

In [4]:
# Directory that receives one cleaned CSV per split (created if missing).
outdir = Path('fens')
outdir.mkdir(parents=True, exist_ok=True)

# Clean each split with `puzzle_cleaning` and write it without the index column.
splits = (('train', train), ('test', test), ('val', val))
for name, section in splits:
    outfile = f'{name}.csv'
    cleaned = puzzle_cleaning(section)
    cleaned.to_csv(outdir / outfile, index=False)

## Convert to board representation arrays for all legal moves in puzzles

Given the size required, we are not going to convert the full sets for now; we work on a slice of the training file instead.

In [18]:
# Load a 12,000-row slice of the cleaned training file instead of the full set.
# `skiprows=36000` drops the first 36,000 lines of the file (the header line
# written by `to_csv` among them), so column names are supplied via `names=`.
temp_df = pd.read_csv('fens/train.csv', skiprows=36000, nrows=12000, names=['FEN', 'target_move'])

# Fixed seed so the partial splits are identical after a kernel restart,
# consistent with the seeded split of the full data set earlier in the notebook.
partial_split_seed = 89252

# Keep 90% of the slice for training and hold out the remaining 10%.
train_partial, almost_test_partial = train_test_split(
    temp_df, test_size=.1, random_state=partial_split_seed
)
# Divide the held-out 10% evenly into validation and test subsets.
val_partial, test_partial = train_test_split(
    almost_test_partial, test_size=.5, random_state=partial_split_seed
)

In [14]:
# Preview the loaded slice; for a frame this long pandas elides the middle rows.
temp_df

Unnamed: 0,FEN,target_move
0,5rk1/1p2pp1p/3p2P1/2n5/3B4/1B4Q1/1R3KPP/q7 b -...,c5e4
1,5r1k/4N1np/p1q5/1p1p4/3P1Q2/P1P2P1P/1r4P1/R5K1...,f4f8
2,8/4r1k1/4Pqp1/7p/4Rp2/5B1P/6PK/6B1 w - - 6 47,g1d4
3,3rr1k1/1p3pp1/p3p1q1/3pPP1p/bP1P2B1/6Q1/P5PP/R...,g6g4
4,5b1r/1p1kpp1p/p5p1/3P1P2/3NP3/4K2P/PP4P1/2R5 b...,f8h6
...,...,...
11995,2kr1b1r/pp1Bp1pp/4q3/3Q1b2/5B2/8/PPP2PPP/R4RK1...,d8d7
11996,4r1rk/4bQ2/p2p1n1B/2nPp3/2q4P/2N2N2/P1B2PP1/2K...,f3g5
11997,3rk2r/p4ppp/1pqB4/2p1P3/Q2n4/7P/5RP1/2R3K1 w k...,a4d4
11998,6k1/2r3pp/3q4/1pRpN1P1/p2PbPQ1/7P/PP6/6K1 b - ...,c7c5


In [19]:
# Directory that receives the partial splits and their converted versions.
outdir = Path('fens')
outdir.mkdir(parents=True, exist_ok=True)

partial_splits = {
    'train_partial4': train_partial,
    'test_partial4': test_partial,
    'val_partial4': val_partial,
}

# For each partial split: write it to CSV, then hand the written file to
# `make_converted_file`, which writes the converted counterpart.
# NOTE(review): unlike the full-split export, this `to_csv` call keeps the
# index column -- confirm `make_converted_file` expects that layout.
for name, section in partial_splits.items():
    cleaned_path = outdir / f'cleaned_{name}.csv'
    converted_path = outdir / f'converted_{name}.csv'
    section.to_csv(cleaned_path)
    make_converted_file(cleaned_path, converted_path)
