In [1]:
import chess
import chess.pgn
import os
import concurrent.futures
from concurrent.futures import ThreadPoolExecutor
import time
import sys


In [3]:
board = chess.Board()
print(board)

r n b q k b n r
p p p p p p p p
. . . . . . . .
. . . . . . . .
. . . . . . . .
. . . . . . . .
P P P P P P P P
R N B Q K B N R


In [4]:
pgn = open("./data/KingBase2018-E60-E99.pgn")

In [5]:
#counting number of games in one dataset:
#Simple slow code:

# num_games = 0
# while True:
#     game = chess.pgn.read_game(pgn)
#     if game == None:
#         break
#     else:
#         num_games += 1
#         print(num_games)
# print(num_games)

#fast code:
num_games = 0


with open("./data/KingBase2018-E60-E99.pgn", 'r') as f2:
    for line in f2:
        #print(line)
        if line.startswith("[Event"):
            num_games += 1

with open("./data/KingBase2018-E60-E99.pgn", "r", errors="ignore") as f:
    data = f.read()
    print(f' type of png data: {type(data)}') 
    print(f'size of png data: {sys.getsizeof(data)}')
    

print(f"number of games: {num_games}")

 type of png data: <class 'str'>
size of png data: 87095114
number of games: 237619


In [6]:
# Create a ThreadPoolExecutor without specifying max_workers
with concurrent.futures.ThreadPoolExecutor() as executor:
    print(f"Default max_workers: {executor._max_workers}")

def task(n):
    print(f'Task {n} starting')
    time.sleep(2)
    print(f'Task {n} done')
    return n

with concurrent.futures.ThreadPoolExecutor() as executor:
    futures = [executor.submit(task, i) for i in range(8)]
    for future in concurrent.futures.as_completed(futures):
        print(f"Result : {future.result}")

Default max_workers: 8
Task 0 starting
Task 1 starting
Task 2 starting
Task 3 starting
Task 4 startingTask 5 starting

Task 6 startingTask 7 starting

Task 0 done
Result : <bound method Future.result of <Future at 0x24359efe650 state=finished returned int>>
Task 1 doneTask 2 done
Result : <bound method Future.result of <Future at 0x24359c84f10 state=finished returned int>>

Result : <bound method Future.result of <Future at 0x24359efea10 state=finished returned int>>
Task 3 done
Result : <bound method Future.result of <Future at 0x24359efd7b0 state=finished returned int>>
Task 4 doneTask 5 done
Result : <bound method Future.result of <Future at 0x24359efeb60 state=finished returned int>>

Result : <bound method Future.result of <Future at 0x24359ae3190 state=finished returned int>>
Task 6 doneTask 7 done
Result : <bound method Future.result of <Future at 0x24359efee30 state=finished returned int>>

Result : <bound method Future.result of <Future at 0x24359efeec0 state=finished returned

In [2]:
import chess
import chess.pgn
import os
import concurrent.futures
from concurrent.futures import ThreadPoolExecutor
import time
import sys
import numpy as np



### Serialization explanation:

The chess board has **64 squares**, indexed by `python-chess` as:

```
[a1, b1, c1, d1, e1, f1, g1, h1,
 a2, ..., h2,
 ...
 a8, ..., h8]
```

Each square is converted into a **number**:

* Empty → `0`
* White king → `6`
* White rook → `4`
* White rook with castling rights → `7`
* Black king → `14`

---

### Initial position

```
8  . . . . k . . .
7  . . . . . . . .
6  . . . . . . . .
5  . . . . . . . .
4  . . . . . . . .
3  . . . . . . . .
2  . . . . . . . .
1  . . . . K . . R
   a b c d e f g h
```

* White king at `e1` → index `4` → value `6`
* White rook at `h1` → index `7` → value `7` (castling allowed)

---

### After white castles kingside (O-O)

```
8  . . . . k . . .
7  . . . . . . . .
6  . . . . . . . .
5  . . . . . . . .
4  . . . . . . . .
3  . . . . . . . .
2  . . . . . . . .
1  . . . . . R K .
   a b c d e f g h
```

* King moves to `g1` → index `6` → value `6`
* Rook moves to `f1` → index `5` → value `4`
* Castling rights are gone

---

### Final step

Each square’s value is split into **4 binary planes**, plus:

* **1 plane** for whose turn it is

Total:

```
8 × 8 × 5 = 320 values
```



In [3]:
class State():
    def __init__(self, board=None):
        if board is None:
            self.board = chess.Board()
        else:
            self.board = board
 
    def edges(self):
        return list(self.board.legal_moves)

    def value(self):
        return 1
    
    def serialize(self):
        assert self.board.is_valid()
        
        bstate = np.zeros(64, np.uint8)
        for i in range(64):
            pp = self.board.piece_at(i)
            if pp is not None:
                bstate[i] = {"P":1, "N":2, "B":3, "R":4, "Q":5, "K":6, \
                             "p":9, "n":10, "b":11, "r":12, "q":13, "k":14 }[pp.symbol()]
        
        if self.board.has_queenside_castling_rights(chess.WHITE):
            assert bstate[0] == 4
            bstate[0] = 7
        if self.board.has_kingside_castling_rights(chess.WHITE):
            assert bstate[7] == 4
            bstate[7] = 7
        if self.board.has_queenside_castling_rights(chess.BLACK):
            assert bstate[56] == 8+4
            bstate[56] = 8+7
        if self.board.has_kingside_castling_rights(chess.BLACK):
            assert bstate[63] == 8+4
            bstate[63] = 7


        if self.board.ep_square is not None:
            assert bstate[self.board.ep_square] == 0
            bstate[self.board.ep_square] = 8
        bstate = bstate.reshape(8,8)
                
    
        #binary state:
        state = np.zeros((8,8,5), np.uint8)
        #state[self.board.ep_square, :, :, 3] = 1  #self.board.ep_square: The potential en
                                                #passant square on the third or sixth rank or None.


        # 0-3 columns to binary
        state[:,:,0] = (bstate>>3)&1
        state[:,:,1] = (bstate>>2)&1
        state[:,:,2] = (bstate>>1)&1
        state[:,:,3] = (bstate>>0)&1

        #4th column is who's turn it is:
        state[:, :, 4] = (self.board.turn * 1.0)  # white turn := True

        # pp = self.board.shredder_fen()
        # print(pp)
        return state

    

    

#0-1 black wins -> -1 
#1-0 white wins -> +1 
#1/2 1/2 draw   ->  0

In [None]:
def get_dataset(num_samples=None):
    X,Y = [], []
    game_num = 0
    for fn in os.listdir("data"):
        pgn = open(os.path.join("data", fn))
        while 1:
            try: 
                game = chess.pgn.read_game(pgn)
            except Exception:
                break        
            
            print("parsing game number %d got %d examples" %(game_num, len(X)) )
            

            value = {"1-0": 1, "0-1": -1, "1/2-1/2": 0}[game.headers['Result']]
            board  = game.board()

            for i, move in enumerate(game.mainline_moves()):
                board.push(move)
                ser =  State(board).serialize()[:, :, 0]
                X.append(ser)
                Y.append(value)
            if num_samples is not None and len(X) > num_samples:
                return X,Y
            game_num += 1

    return X,Y

X,Y = get_dataset(1000)


parsing game number 0 got 0 examples
parsing game number 1 got 85 examples
parsing game number 2 got 163 examples
parsing game number 3 got 213 examples
parsing game number 4 got 351 examples
parsing game number 5 got 415 examples
parsing game number 6 got 544 examples
parsing game number 7 got 653 examples
parsing game number 8 got 701 examples
parsing game number 9 got 760 examples
parsing game number 10 got 821 examples
parsing game number 11 got 925 examples


## run def get_dataset() using multiprocessor:


In [52]:
def get_dataset_multi_process(path, num_samples=None):
    X, Y = [], []
    game_num = 0
    with open(path) as pgn:
        while True:
            try:
                game = chess.pgn.read_game(pgn)
            except Exception:
                break

            if game is None:
                break

            print("parsing game number %d got %d examples" % (game_num, len(X)))

            value = {"1-0": 1, "0-1": -1, "1/2-1/2": 0}[game.headers['Result']]
            board = game.board()

            for i, move in enumerate(game.mainline_moves()):
                board.push(move)
                ser = State(board).serialize()[:, :, 0]
                X.append(ser)
                Y.append(value)

            if num_samples is not None and len(X) > num_samples:
                return X, Y

            game_num += 1

    return X, Y


In [None]:
from multiprocessing import Pool
import time

files = [os.path.join("data", f) for f in os.listdir("data")]
files = files[:2]   # only first 2 files


# Normal single process run
start = time.time()
X, Y = [], []
for f in files:
    x, y = get_dataset_multi_process(f)
    X.extend(x)
    Y.extend(y)

end = time.time()
print("Single-process time:", end - start, "seconds")

# Multiprocessing timing
start = time.time()
processes = min(len(files), os.cpu_count())
with Pool(processes=processes) as pool:
    results = pool.map(get_dataset_multi_process(10000), files)

X = np.concatenate([r[0] for r in results])
Y = np.concatenate([r[1] for r in results])
end = time.time()
print("Multiprocessing time:", end - start, "seconds")



parsing game number 0 got 0 examples
parsing game number 1 got 85 examples
parsing game number 2 got 163 examples
parsing game number 3 got 213 examples
parsing game number 4 got 351 examples
parsing game number 5 got 415 examples
parsing game number 6 got 544 examples
parsing game number 7 got 653 examples
parsing game number 8 got 701 examples
parsing game number 9 got 760 examples
parsing game number 10 got 821 examples
parsing game number 11 got 925 examples
parsing game number 12 got 1021 examples
parsing game number 13 got 1153 examples
parsing game number 14 got 1271 examples
parsing game number 15 got 1320 examples
parsing game number 16 got 1453 examples
parsing game number 17 got 1556 examples
parsing game number 18 got 1598 examples
parsing game number 19 got 1657 examples
parsing game number 20 got 1685 examples
parsing game number 21 got 1741 examples
parsing game number 22 got 1798 examples
parsing game number 23 got 1849 examples
parsing game number 24 got 1972 examples


## Write on hdf5 file

In [None]:
X, Y = [], []
import h5py
import numpy as np
if os.path.exists("mytestfile.hdf5"):
    os.remove("mytestfile.hdf5")
with h5py.File("mytestfile.hdf5", "w") as f:
    X,Y = get_dataset(1000)
    f.create_dataset("X", data=X)
    f.create_dataset("Y", data=Y)

parsing game number 0 got 0 examples
parsing game number 1 got 85 examples
parsing game number 2 got 163 examples
parsing game number 3 got 213 examples
parsing game number 4 got 351 examples
parsing game number 5 got 415 examples
parsing game number 6 got 544 examples
parsing game number 7 got 653 examples
parsing game number 8 got 701 examples
parsing game number 9 got 760 examples
parsing game number 10 got 821 examples
parsing game number 11 got 925 examples


In [None]:
#open dataset as test:
with h5py.File("mytestfile.hdf5", "r") as f:
    dset = f['X']
    print(f.name)
    print(list(f.keys()))
    print(dset.shape)

/
['X', 'Y']
(1021, 8, 8)


In [None]:
state = State()
print(state.edges())

[Move.from_uci('g1h3'), Move.from_uci('g1f3'), Move.from_uci('b1c3'), Move.from_uci('b1a3'), Move.from_uci('h2h3'), Move.from_uci('g2g3'), Move.from_uci('f2f3'), Move.from_uci('e2e3'), Move.from_uci('d2d3'), Move.from_uci('c2c3'), Move.from_uci('b2b3'), Move.from_uci('a2a3'), Move.from_uci('h2h4'), Move.from_uci('g2g4'), Move.from_uci('f2f4'), Move.from_uci('e2e4'), Move.from_uci('d2d4'), Move.from_uci('c2c4'), Move.from_uci('b2b4'), Move.from_uci('a2a4')]
