# Chess Position Scanner 

In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import sklearn
import matplotlib.pyplot as plt
import matplotlib.image as mpimg

from sklearn.cluster import KMeans
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer, make_column_selector
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, MaxAbsScaler, OneHotEncoder, LabelEncoder, OrdinalEncoder, QuantileTransformer, PowerTransformer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split 

import PIL
import scipy.ndimage as ndimage
import glob
import os

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 200)
from IPython.display import display

## Download the dataset and upack

In [None]:
!kaggle datasets download koryakinp/chess-positions

In [8]:
import zipfile

# Not unpacking via kaggle cli since the dataset is duplicated
archive = zipfile.ZipFile('chess-positions.zip')
for file in archive.namelist():
    if file.startswith('dataset/'):
        archive.extract(file, './')

## Loading

In [2]:
def path_to_fen(path):
    filename = os.path.basename(path).split('.')[0]
    fen = filename.replace('-', '/')
    return fen

def get_dataset(prefix):
    files = glob.glob(prefix)
    fen = [path_to_fen(path) for path in files]
    return pd.DataFrame({'fen': fen, 'path': files})

train = get_dataset('./dataset/train/*')
train

Unnamed: 0,fen,path
0,1b1B1b2/2pK2q1/4p1rB/7k/8/8/3B4/3rb3,./dataset/train/1b1B1b2-2pK2q1-4p1rB-7k-8-8-3B...
1,1b1b1b2/3r4/1rK4b/R7/R2R1k2/2Bp4/2P5/2r5,./dataset/train/1b1b1b2-3r4-1rK4b-R7-R2R1k2-2B...
2,1B1B1K2/3p1N2/6k1/R7/5P2/4q3/7R/1B6,./dataset/train/1B1B1K2-3p1N2-6k1-R7-5P2-4q3-7...
3,1b1B1K2/R2B4/7P/3b4/3R2B1/8/3R4/4Qk2,./dataset/train/1b1B1K2-R2B4-7P-3b4-3R2B1-8-3R...
4,1b1b1n2/1K1RN1b1/3pbN2/8/4q1k1/4P3/8/2n3N1,./dataset/train/1b1b1n2-1K1RN1b1-3pbN2-8-4q1k1...
...,...,...
79995,rr4N1/5n2/8/pq3Bk1/1N6/8/2KP4/8,./dataset/train/rr4N1-5n2-8-pq3Bk1-1N6-8-2KP4-...
79996,Rr4Q1/2b5/R1K5/7B/6n1/6q1/R3p1N1/5Rk1,./dataset/train/Rr4Q1-2b5-R1K5-7B-6n1-6q1-R3p1...
79997,rr4rk/1K2N3/8/q7/1n2N3/8/N1Q5/8,./dataset/train/rr4rk-1K2N3-8-q7-1n2N3-8-N1Q5-...
79998,rR4RN/p7/3BR3/8/K7/1B1pB3/1r2k3/B3r3,./dataset/train/rR4RN-p7-3BR3-8-K7-1B1pB3-1r2k...


## FEN and Image processing

In [5]:
digits = {'0', '1', '2', '3', '4', '5', '6', '7', '8'}
def decompress_fen(fen):
    return ''.join([
        ' ' * int(ch) if ch in digits else ch
        for ch in fen
    ]).split('/')

def load_image(filename):
    img = PIL.Image.open(filename)
    bw = img.convert('L')
    img_array = np.asarray(bw)
    return img_array

def segment_image(img, n=8):
    return np.array(np.split(np.array(np.split(img, n, 1)), n, 1))

## Tile dataset creation

In [5]:
os.makedirs('./dataset_pieces', exist_ok=True)
classes = [
    'p', 'b', 'n', 'r', 'k', 'q',
    'P', 'B', 'N', 'R', 'K', 'Q',
    'Empty',
]
for class_name in classes:
    os.makedirs('./dataset_pieces/class_'+class_name, exist_ok=True)

class_counters = {
    class_name: 0
    for class_name in classes
}

def process_board(path, verbose=False):
    if verbose:
        print('Processing ' + path)
    
    img = load_image(path)
    tiles = segment_image(img)
    
    fen = path_to_fen(path)
    board = decompress_fen(fen)

    n = 8
    empty_space_counter = 0
    empty_space_max = 10
    for y in range(n):
        for x in range(n):
            figure = board[y][x]
            
            if figure == ' ':
                figure = 'Empty'

                if (y + x) % 2 != empty_space_counter % 2:
                    continue
                
                empty_space_counter += 1

                if empty_space_counter >= empty_space_max:
                    continue

            class_counters[figure] += 1
            id = class_counters[figure]
            figure_img = PIL.Image.fromarray(tiles[y][x])
            figure_img.save(f'./dataset_pieces/class_{figure}/{figure}_image_{id}.jpeg')
            
train.path.apply(process_board)

0        None
1        None
2        None
3        None
4        None
         ... 
79995    None
79996    None
79997    None
79998    None
79999    None
Name: path, Length: 80000, dtype: object

## Balanced, sampled tile dataset creation

In [23]:
# Seeded sample 
rng = random.Random(42)

classes = [
    'p', 'b', 'n', 'r', 'k', 'q',
    'P', 'B', 'N', 'R', 'K', 'Q',
    'Empty',
]
for class_name in classes:
    class_folder = './dataset_pieces_sampled/class_'+class_name
    os.makedirs(class_folder, exist_ok=True)
    sample = rng.sample(glob.glob(f'dataset_pieces/class_{class_name}/*'), 10000)
    for id, file in enumerate(sample):
        shutil.copyfile(file, f'{class_folder}/{class_name}_image_{id}.jpeg')

## Color and Figure type dataset creation

In [None]:
def path_to_label(path):
    filename = os.path.basename(path)
    return filename.split('_')[0]

def make_piece_dataset(in_path, out_path):
    classes = ['p', 'b', 'n', 'r', 'q', 'k', 'empty']
    class_counters = {
        classname: 0
        for classname in classes
    }
    
    for classname in classes:
        os.makedirs(out_path + f'/class_{classname}/', exist_ok=True)
        
    for file in glob.glob(in_path + '/**/*.jpeg'):
        new_label = path_to_label(file).lower()
        class_counters[new_label] += 1
        id = class_counters[new_label]
        shutil.copy(file, out_path+f'/class_{new_label}/{new_label}_image_{id}.jpeg')

    # Add 10k more empty tiles to rebalance
    rng = random.Random(123)
    sample = rng.sample(glob.glob(f'./dataset_pieces/class_Empty/*'), 10000)
    for id, file in enumerate(sample):
        shutil.copyfile(file, f'{out_path}/class_empty/empty_image_{id+10001}.jpeg')

def make_color_dataset(in_path, out_path):
    classes = ['black', 'white']
    class_counters = {
        classname: 0
        for classname in classes
    }
    
    for classname in classes:
        os.makedirs(out_path + f'/class_{classname}/', exist_ok=True)
        
    for file in glob.glob(in_path + '/**/*.jpeg'):
        old_label = path_to_label(file)
        if old_label == 'Empty':
            continue
        new_label = 'white' if old_label.isupper() else 'black' 
        class_counters[new_label] += 1
        id = class_counters[new_label]
        shutil.copy(file, out_path+f'/class_{new_label}/{new_label}_image_{id}.jpeg')

# TODO: Use globals
make_piece_dataset('./dataset_pieces_sampled', './dataset_figures_only_sampled')
make_color_dataset('./dataset_pieces_sampled', './dataset_color_only_sampled')