# Data Exploration
In this notebook I will take a look at the different data sets that are available.
I will test how well they can be transformed into a format that is usable for this project and lastly store the transformed data.

I use the excellent python-chess library to process data in the pgn format.

In [None]:
!pip install python-chess

The first dataset I want to look at is the FICS Games Database: https://www.ficsgames.org/download.html

This database contains millions of games, the first ones being from 1999.
One subcategory consists of games from players with a rating above 2000.
I will only consider these games.

All games from one year can be downloaded in bulk as one pgn-file.
Below, I will write methods to convert the pgn format into the board state or into a string of words.

In [None]:
import chess
import chess.pgn 
from tqdm import tqdm_notebook as tqdm
import re

def dataExtractor(path):
    """
    Extracts the moves and fens of each game as strings.

    path - the path to the pgn file in which the games are stored
    returns a dict with two lists that hold one entry per kept game:
        'moves' - the list of move tokens of the game's mainline
        'fens'  - the result of board.board_fen() after every mainline move

    Games whose move text still contains a "{" after cleaning are skipped.
    """

    # Applied in this order, these patterns strip everything but the moves.
    patterns = (
        re.compile(r"{.*?}", re.MULTILINE),   # comments in curly braces
        re.compile(r"{.*}", re.MULTILINE),    # whatever brace pair is left on a line
        re.compile(r"\d+\..", re.MULTILINE),  # move number, dot and the next character
        re.compile(r"\.", re.MULTILINE),      # leftover dots (e.g. from "1...")
        re.compile(r"\$\d+", re.MULTILINE),   # "$<number>" annotation tokens
    )

    data = {
        'moves': [],
        'fens': []
    }

    # The context manager closes the file even if parsing raises.
    with open(path) as pgn:
        # Seek to the end to learn the file size for the progress bar
        # instead of reading the whole file into memory first.
        pgn.seek(0, 2)
        total = pgn.tell() - 1
        pgn.seek(0)

        with tqdm(total=total) as pbar:
            while True:
                # Progress is measured by the read position inside the file.
                pbar.n = pgn.tell()
                pbar.refresh()
                game = chess.pgn.read_game(pgn)

                if game is None:  # no further game in the file
                    break

                string = str(game.mainline_moves())
                for pattern in patterns:
                    string = pattern.sub("", string)

                # An unbalanced or multi-line comment survived the cleaning,
                # so the tokens would not be pure moves: drop the game.
                if "{" in string:
                    continue

                fens = []
                board = game.board()

                # Replay the game and record the position after every move.
                for move in game.mainline_moves():
                    board.push(move)
                    fens.append(str(board.board_fen()))

                data['moves'].append(list(filter(None, string.split(" "))))
                data['fens'].append(fens)

    return data
    
# Location of the pgn file that is parsed below.
path = 'data/sample.pgn'

# Dict with the 'moves' and 'fens' lists returned by dataExtractor.
games = dataExtractor(path)

HBox(children=(IntProgress(value=0, max=68121757), HTML(value='')))

In [None]:
# Show the move tokens of the first extracted game.
games['moves'][0]

I also wrote a method to convert the fen notation to the matrix representation.
However, I will not store data in this format to save space.

In [None]:
import numpy as np

def indexToArray(i, len_ = 12):
    '''
    Builds a one-hot-encoded vector.
    i - the position that is set to 1. If it lies outside of the vector
        (negative or >= len_), the vector stays all zeros.
    len_ - (optional) the length of the vector. Default is 12.
    returns a list of length len_
    '''

    one_hot = [0] * len_

    # Out-of-range indices deliberately produce the all-zero vector.
    if not (0 <= i < len(one_hot)):
        return one_hot

    one_hot[i] = 1
    return one_hot

def fenToMatrix(fen):
    '''
    Converts a fen string to a matrix of one-hot encoded squares.
    fen - a string in the fen notation, e.g.
          'rnbqkbnr/pppppppp/8/8/8/8/PPPPPPPP/RNBQKBNR'. Only the piece
          placement (the text before the first space) is read, so a full
          fen with additional fields is accepted as well.
    returns a list of rows, one per '/'-separated part of the placement.
            Every square is a vector of length 12: one-hot for a piece,
            all zeros for an empty square. A regular chess position
            therefore yields an 8x8x12 matrix.
    raises KeyError if the placement contains an unknown character
    '''

    # The position of a symbol in this string is the index of its 1:
    # lowercase symbols r, n, b, q, k, p first, then uppercase P, R, N, B, Q, K.
    pieces = {
        symbol: indexToArray(index)
        for index, symbol in enumerate('rnbqkpPRNBQK')
    }

    placement = fen.split(' ', 1)[0]

    matrix = []
    row = []

    for c in placement:
        if c == '/':
            # End of the current row.
            matrix.append(row)
            row = []
        elif c.isdecimal():
            # A digit stands for that many consecutive empty squares.
            row.extend(indexToArray(-1) for _ in range(int(c)))
        else:
            # Copy the vector so that squares never share one list object.
            row.append(list(pieces[c]))
    matrix.append(row)

    return matrix

Lastly, I save the data as a JSON file.

In [None]:
import json

# Write the extracted moves and fens to disk as one JSON document.
with open('data/data.json', 'w') as outfile:  
    json.dump(games, outfile)

## Datasets with annotation
Next, I will look at the ~4000 games with annotations sourced from http://www.angelfire.com/games3/smartbridge/

The below command will throw errors but I handle these (they are still displayed even though they are irrelevant).

In [None]:
from os import listdir

# All games from data/annotated that were parsed without errors.
annotated_games = []

for file in tqdm(listdir('data/annotated')):
    # Close every pgn file once it has been read completely; otherwise
    # one file handle per file stays open.
    with open('data/annotated/' + file) as pgn:
        while True:
            game = chess.pgn.read_game(pgn)

            if game is None:  # no further game in this file
                break

            if game.errors:  # skip games that were parsed with errors
                continue

            annotated_games.append(game)

In [None]:
# Number of annotated games that were kept.
len(annotated_games)

In [None]:
# Print every mainline move of the first annotated game next to its comment
# (an empty comment leaves only the move and a trailing space).
for moves in annotated_games[0].mainline():
    print(str(moves.move) + " " + moves.comment)

In [None]:
def dataExtractor2(games):
    """
    Collects every commented mainline move of the given games.
    games - an iterable of game objects that provide mainline_moves(),
            mainline() and board() (e.g. games parsed with python-chess)
    returns a dict with two lists that hold one entry per kept game:
        'moves' - a list of [moves played so far, comment] pairs
        'fens'  - a list of [board fen after the move, comment] pairs
    Games whose move text still contains a "{" after cleaning are skipped.
    The total number of commented moves is printed at the end.
    """

    # Applied in this order, these patterns strip everything but the moves.
    cleaners = (
        re.compile(r"{.*?}", re.MULTILINE),   # comments in curly braces
        re.compile(r"{.*}", re.MULTILINE),
        re.compile(r"\d+\..", re.MULTILINE),  # move number, dot and the next character
        re.compile(r"\.", re.MULTILINE),      # leftover dots
        re.compile(r"\$\d+", re.MULTILINE),   # "$<number>" annotation tokens
    )

    extracted = {'moves': [], 'fens': []}
    n_commented = 0

    for game in tqdm(games):
        text = str(game.mainline_moves())
        for cleaner in cleaners:
            text = re.sub(cleaner, "", text)

        if "{" in text:
            continue

        tokens = [token for token in text.split(" ") if token]
        nodes = list(game.mainline())
        board = game.board()

        played = []           # all move tokens up to the current move
        commented_moves = []  # [moves so far, comment] pairs of this game
        commented_fens = []   # [fen, comment] pairs of this game

        for idx, token in enumerate(tokens):
            node = nodes[idx]
            played.append(token)
            board.push(node.move)

            if not node.comment:
                continue

            n_commented += 1
            # Store a snapshot, because `played` keeps growing.
            commented_moves.append([list(played), node.comment])
            commented_fens.append([str(board.board_fen()), node.comment])

        extracted['moves'].append(commented_moves)
        extracted['fens'].append(commented_fens)

    print(n_commented, 'commented moves found')
    return extracted
                               
# Extract the commented moves and fens from all annotated games.
data = dataExtractor2(annotated_games)

In [None]:
# Show the [fen, comment] pairs of the first annotated game.
data['fens'][0]

In [None]:
# Write the commented moves and fens to disk as one JSON document.
with open('data/data2.json', 'w') as outfile:  
    json.dump(data, outfile)