In [1]:
%%capture capt
!pip install matplotlib
!pip install pandas
!pip install seaborn
!pip install scikit-learn
!pip install chess
!pip install tensorflow
!pip install tensorflow_addons

In [2]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from tqdm import tqdm

import io
import pickle
import os
import glob

import chess.pgn
import chess.polyglot

In [3]:
%run S3.ipynb

In [4]:
#df = pd.read_csv("../Data/clean_df.csv")
df = open_csv("clean_df.csv")

In [5]:
df

Unnamed: 0,result,white_elo,black_elo,# moves,moves
0,1-0,2851,-1,67,W1.d4 B1.d5 W2.c4 B2.e6 W3.Nc3 B3.Nf6 W4.cxd5...
1,1-0,2851,-1,53,W1.e4 B1.d5 W2.exd5 B2.Qxd5 W3.Nc3 B3.Qa5 W4....
2,1-0,2851,-1,57,W1.e4 B1.e5 W2.Nf3 B2.Nc6 W3.Bc4 B3.Bc5 W4.c3...
3,1-0,2851,-1,49,W1.e4 B1.d5 W2.exd5 B2.Qxd5 W3.Nc3 B3.Qa5 W4....
4,1/2-1/2,2851,2633,97,W1.e4 B1.e5 W2.Nf3 B2.Nc6 W3.Bb5 B3.a6 W4.Ba4...
...,...,...,...,...,...
3509228,1-0,-1,-1,57,W1.e4 B1.e6 W2.d4 B2.d5 W3.e5 B3.c5 W4.dxc5 B...
3509229,1/2-1/2,-1,-1,80,W1.d4 B1.Nf6 W2.Nc3 B2.g6 W3.Bg5 B3.Bg7 W4.Nf...
3509230,1-0,-1,-1,57,W1.c4 B1.Nf6 W2.Nc3 B2.g6 W3.e4 B3.d6 W4.d4 B...
3509231,1-0,-1,-1,80,W1.g3 B1.d5 W2.Bg2 B2.c5 W3.d3 B3.Nf6 W4.Nf3 ...


In [6]:
#number of moves in total throughout the dataset
nb_total_moves = df['# moves'].sum()
print(f"There are, in total, {nb_total_moves} moves in the database")

There are, in total, 268533039 moves in the database


## Create a df where each line is a new move

In [7]:
# function to split a sequence of moves to a list of moves
def getMovesPGN(game_index):
    moves = df.iloc[game_index]['moves']
    if moves[0] == '':
        moves = moves[1:]
    pgn = io.StringIO(moves)
    return pgn

def getMovesSequence(game_index):
    pgn = getMovesPGN(game_index)
    game = chess.pgn.read_game(pgn)
    moves = []
    for move in game.mainline_moves():
        moves.append(move.uci())
    return moves    

In [8]:
#we create the game_index for the new database
#each game will be split in the amount of moves it has

try:
    moves_df = open_csv("full_moves_df")
except:
    game_index = []
    moves = []
    for i in tqdm(range(len(df))):
        nb_moves_game = df.iloc[i]['# moves']
        game_index += [i]*nb_moves_game
        moves_sequence = getMovesSequence(i)
        moves += moves_sequence
    
        moves_df = pd.DataFrame()
        moves_df['game_index'] = game_index
        moves_df['moves'] = moves
        moves_df['evaluation'] = [0]*len(game_index)
        to_csv(moves_df, "full_moves_df")

In [9]:
moves_df

Unnamed: 0.1,Unnamed: 0,game_index,moves,evaluation
0,0,0,d2d4,0
1,1,0,d7d5,0
2,2,0,c2c4,0
3,3,0,e7e6,0
4,4,0,b1c3,0
...,...,...,...,...
268533034,268533034,3509232,f8f3,0
268533035,268533035,3509232,h5f3,0
268533036,268533036,3509232,e4f3,0
268533037,268533037,3509232,g2e2,0


## Functions to create the metric

#### Write functions to evaluate a chess position

In [10]:
# functions to evaluate a given chess position, using a chess engine named stockfish

TIME_LIMIT = 0.05

def stockfish_evaluation(board, time_limit = TIME_LIMIT):
    #returns the score of the position, from the perspective of the white player
    #we will never change perspective throughout our work 
    engine = chess.engine.SimpleEngine.popen_uci("../../Stockfish/linux/stockfish/stockfish-ubuntu-x86-64-avx2")
    result = engine.analyse(board, chess.engine.Limit(time=time_limit))
    engine.close()
    return result['score'].white()

def position_eval(board, time_limit = TIME_LIMIT):
    # we need to consider scores where the engine has found a way to deliver mate differently
    # because is those cases, the score returned is a string, not an integer
    score = stockfish_evaluation(board, time_limit)
    if not score.is_mate():
        return score.score()
    #the engine has found a way to mate in a certain number of moves
    return mateScore(score)

def findNumberOfMovesBeforeMate(score):
    str_nb_moves_before_mate = ''
    s = str(score)
    i = len(s)-1
    while i >= 0 and s[i].isnumeric():
        str_nb_moves_before_mate = s[i] + str_nb_moves_before_mate
        i -= 1
    return int(str_nb_moves_before_mate)

def mateScore(score):
    nb_moves_before_mate = findNumberOfMovesBeforeMate(score)
    white_is_winning = (str(score.wdl())[9] == '1')
    # a.wdl() gives the probability of winning for white
    # if the 9th character is equal to 1, then white will deliver mate shortly
    # otherwise, white will lose soon and black is winning
    score_for_mate = 10000 - nb_moves_before_mate*100
    if not white_is_winning:
        score_for_mate = (-1)*score_for_mate
    return score_for_mate

In [11]:
# functions to play moves on a chess board

def getSquareNumber(square):
    #each square on the chess board is represented by an integer in the module chess that we use
    #this functions takes the name of a square as a parameter, for instance 'e2' and returns the integer associated
    col = square[0].lower()
    row = int(square[1])
    row_number = ord(col) - 97
    square_number = 8*(row-1) + row_number
    return square_number

def getMoveToPlay(startSquare, endSquare, promotion_piece=None):
    startSquare_num = getSquareNumber(startSquare)
    endStart_num = getSquareNumber(endSquare)
    move = chess.Move(startSquare_num, endStart_num, promotion = promotion_piece)
    return move

def getPieceToPromoteTo(move):
    #pieces : Pawn=1, Knight=2, Bishop=3, Rook=4, Queen=5, King=6
    if len(move) == 4:
        #a move is represented in our database with the square from which the piece starts 
        #and the square to which the piece goes, for instance 'e2e4'
        #if the move has a length of 4, there is no promotion happening 
        return None
    promotion_piece = move[-1]
    if promotion_piece == 'q':
        return 5
    if promotion_piece == 'r':
        return 4
    if promotion_piece == 'b':
        return 3
    if promotion_piece == 'n':
        return 2
    
def getMove(move):
    startSquare = move[:2]
    endSquare = move[2:4]
    if len(move) == 4:
        return getMoveToPlay(startSquare, endSquare)
    promotion_piece = getPieceToPromoteTo(move[-1])
    return getMoveToPlay(startSquare, endSquare, promotion_piece)

In [12]:
#function using the previous functions
#for a given game, the functions plays and evaluates each move of the game, 
#while also return a list of all the fen representations of each board during the game
#and the zobrist hashing of these boards
# a fen representation is a string that fully represents a chess position, with all the pieces, where they are
# and which player in currently playing. 
# a zobrist key is a way to represent a chess position with an 64-bits integer (theorized by the mathematician Zobrist)

def getGameEvalAndFen(game_index, dict_corres, nb_total_moves):
    board = chess.Board()
    index = dict_corres[game_index]
    
    evaluations = []
    while index < nb_total_moves and moves_df.iloc[index]['game_index'] == game_index:
        move = moves_df.iloc[index]['moves']
        move_to_play_on_board = getMove(move)
        board.push(move_to_play_on_board)
        eval_ = position_eval(board)
        evaluations.append(eval_)
        
        index += 1
        
    return evaluations

#### Write functions to enable the use of multithreading

In [13]:
#to analyse the game which has the index game_index, we need to know what is the 
#index of the first move of this game in moves_df
#instead of searching everytime, we do it once and store the findings in a dictionary

def dictCorrespondanceIndex_gameIndex(moves_df):
    dict_corres = {}
    index = 0
    for index in tqdm(range(len(moves_df))):
        game_index = moves_df.iloc[index]['game_index']
        if dict_corres.get(game_index) == None:
            dict_corres[game_index] = index
    return dict_corres

In [14]:
# we first check whether the dictionary has already been calculated

try:
    with open('../../Data/Metric_creation/dict_correspondance.pkl', 'rb') as f:
        dict_corres = pickle.load(f)
except:
    dict_corres = dictCorrespondanceIndex_gameIndex(moves_df)
    if not os.path.isdir('../../Data/Metric_creation'):
        if not os.path.isdir('../../Data'):
            os.mkdir('../../Data')
        os.mkdir('../../Data/Metric_creation')
    with open('../../Data/Metric_creation/dict_correspondance.pkl', 'wb') as f:
        pickle.dump(dict_corres, f)

In [16]:
nb_total_games = len(df)

In [17]:
# we will store the metric calculated in dictionaries
# we check whether the dictionaries already have been created or not

try:
    with open('../../Data/Metric_creation/evaluations_dict.pkl', 'rb') as f:
        evals_dict = pickle.load(f)
except:
    evals_dict = {}

In [18]:
#as we use multithreading to speed the process, we want to avoid having different threads open the
#same dictionaries and writing into it, as it is not thread safe. 
#Thus, we create a new file for each game analysed, that we will pick up later on and add them 
#to the dictionaries. Don't worry, we will only create them 50 by 50 and erase them once read

def addToDict(game_index):
    if evals_dict.get(game_index) is not None:
        return evals_dict[game_index]
        
    evaluations = getGameEvalAndFen(game_index, dict_corres, nb_total_moves)
    with open('../../Data/Metric_creation/evaluations_dict/' + str(game_index)+ '.pkl', 'wb') as f:
        pickle.dump(evaluations, f)
    return None

In [19]:
#we simply make sure that the paths we will use do exist

if not os.path.isdir('../../Data/Metric_creation'):
    os.mkdir('../../Data/Metric_creation')

if not os.path.isdir('../../Data/Metric_creation/evaluations_dict'):
    os.mkdir('../../Data/Metric_creation/evaluations_dict')

In [20]:
#this very lengthy function gets the files that have been created, containing the metric calculated
#and adds them to their respective dictionaries, then deletes them

def addFromFileToDict():
    eval_paths = glob.glob('../../Data/Metric_creation/evaluations_dict/*.pkl')
    for eval_path in eval_paths:
        with open(eval_path, 'rb') as f:
            eval_ = pickle.load(f)
        index = int(eval_path.split("/")[-1].split(".pkl")[0])
        evals_dict[index] = eval_
        os.remove(eval_path)
        
    with open('../../Data/Metric_creation/evaluations_dict.pkl', "wb") as fp:   
        pickle.dump(evals_dict, fp)

## Analyse the games and create the metric

In [21]:
import multiprocessing
from multiprocessing import Pool

processes_in_parallel = multiprocessing.cpu_count() 
print(f"Processes in parallel -> {processes_in_parallel}")

Processes in parallel -> 72


In [22]:
game_index = 0
print(f"Start at index: {game_index}")

Start at index: 114911


In [23]:
# we parallelize the executions of the metric creation per game
# makes it way faster to analyse each game

if __name__ == "__main__":
    with tqdm(total=nb_total_games) as pbar:
        while game_index < nb_total_games:
            with Pool() as p:
                async_results = [p.apply_async(addToDict, args=(game_index + i, )) for i in range(processes_in_parallel)]
                
                for i in range(processes_in_parallel):
                    res = async_results[i].get()
                    pbar.update(1)

            p.close()
            p.join()
            game_index += processes_in_parallel
            addFromFileToDict()

  1%|          | 34994/3394322 [24:33:03<3270:15:33,  3.50s/it]Process ForkPoolWorker-34994:
Process ForkPoolWorker-35051:
Process ForkPoolWorker-35042:
Process ForkPoolWorker-35014:
Process ForkPoolWorker-35017:
Process ForkPoolWorker-35031:
Process ForkPoolWorker-35018:
Process ForkPoolWorker-35046:
Process ForkPoolWorker-35030:
Process ForkPoolWorker-35052:
Process ForkPoolWorker-35013:
Process ForkPoolWorker-35003:
Process ForkPoolWorker-35010:
Process ForkPoolWorker-35027:
Process ForkPoolWorker-35060:
Process ForkPoolWorker-35048:
Process ForkPoolWorker-35037:
Process ForkPoolWorker-35011:
Process ForkPoolWorker-35008:
Process ForkPoolWorker-35023:
Process ForkPoolWorker-35043:
Process ForkPoolWorker-35006:
Process ForkPoolWorker-35053:
Process ForkPoolWorker-35032:
Traceback (most recent call last):
Process ForkPoolWorker-35026:
Process ForkPoolWorker-35058:
Process ForkPoolWorker-35020:
Process ForkPoolWorker-35009:
Process ForkPoolWorker-35001:
Process ForkPoolWorker-35005:
Pr

## Populate the database with our findings

In [22]:
## check that every game has been analyzed

for game_index in tqdm(range(nb_total_games)):
    if evals_dict.get(game_index) == None:
        addToDict(game_index)
addFromFileToDict()

100%|█████████████████████████████████████████████████████████████████████| 118319/118319 [00:00<00:00, 1098300.22it/s]


In [74]:
# we populate our dataset moves_df with the dictionaries that we have calculated previously

def addToDataFrame(evals_dict, fens_dict, zob_dict):
    evals_ = []
    fens = []
    zobrists = []
    for i in range(len(evals_dict)):
        evals_i = evals_dict.get(i)
        fens_i = fens_dict.get(i)
        zobrists_i = zob_dict.get(i)
        
        evals_ += evals_i
        fens += fens_i
        zobrists += zobrists_i
    if len(evals_) < len(moves_df):
        evals_ += [0]*(len(moves_df)-len(evals_))
        fens += ['']*(len(moves_df)-len(fens))
        zobrists += ['']*(len(moves_df)-len(zobrists))
        
    moves_df['evaluation'] = evals_
    moves_df['fen'] = fens
    moves_df['zobrist_key'] = zobrists

In [75]:
addToDataFrame(evals_dict, fens_dict, zob_dict)

In [None]:
moves_df.to_csv("../Data/moves_df.csv", index=False)

In [23]:
moves_df

Unnamed: 0,game_index,moves,evaluation,fen,zobrist_key
0,0,e2e4,35,rnbqkbnr/pppppppp/8/8/4P3/8/PPPP1PPP/RNBQKBNR ...,9384546495678726550
1,0,e7e5,48,rnbqkbnr/pppp1ppp/8/4p3/4P3/8/PPPP1PPP/RNBQKBN...,595762792459712928
2,0,g1f3,111,rnbqkbnr/pppp1ppp/8/4p3/4P3/5N2/PPPP1PPP/RNBQK...,15213300192948443293
3,0,b8c6,47,r1bqkbnr/pppp1ppp/2n5/4p3/4P3/5N2/PPPP1PPP/RNB...,8704797333742910878
4,0,f1b5,52,r1bqkbnr/pppp1ppp/2n5/1B2p3/4P3/5N2/PPPP1PPP/R...,5409798013178080797
...,...,...,...,...,...
9023518,118318,a8c8,-6,2rq1rk1/pp2bppp/2n1pn2/3p4/8/P1NP1BP1/1P1BPP1P...,13935396515866781493
9023519,118318,f3g2,-2,2rq1rk1/pp2bppp/2n1pn2/3p4/8/P1NP2P1/1P1BPPBP/...,18028698229637126573
9023520,118318,a7a6,12,2rq1rk1/1p2bppp/p1n1pn2/3p4/8/P1NP2P1/1P1BPPBP...,2937820813377462641
9023521,118318,a1c1,25,2rq1rk1/1p2bppp/p1n1pn2/3p4/8/P1NP2P1/1P1BPPBP...,11978245410268853311


### Keep track of losses and wins

In [1]:
## We haven't kept track of who was the winner of each game, or if it is was a draw
#creating a new column in moves_df would not be efficient because for each move, we would have to keep track of the result 
#making it very redundant and this would make the dataset heavier for no reason
#hence, we create a dictionary that records the results, with the same idea as dict_corres

In [28]:
df

Unnamed: 0,result,white_elo,black_elo,# moves,moves
4,1/2-1/2,2851,2633,97,W1.e4 B1.e5 W2.Nf3 B2.Nc6 W3.Bb5 B3.a6 W4.Ba4...
5,1/2-1/2,2851,2748,52,W1.d4 B1.e6 W2.Nf3 B2.Nf6 W3.c4 B3.d5 W4.Nc3 ...
9,1-0,2851,2646,49,W1.e4 B1.c5 W2.Nf3 B2.d6 W3.d4 B3.cxd4 W4.Nxd...
10,1/2-1/2,2851,2725,68,W1.e4 B1.c5 W2.Nf3 B2.Nc6 W3.d4 B3.cxd4 W4.Nx...
11,1-0,2851,2555,147,W1.d4 B1.d5 W2.c4 B2.e6 W3.Nc3 B3.Nf6 W4.cxd5...
...,...,...,...,...,...
267891,1/2-1/2,2501,2540,31,W1.e4 B1.c5 W2.Nf3 B2.d6 W3.Bb5+ B3.Nc6 W4.Bx...
267892,0-1,2501,2588,142,W1.c4 B1.e5 W2.Nc3 B2.Nf6 W3.g3 B3.Bb4 W4.Bg2...
267897,1/2-1/2,2501,2506,33,W1.Nf3 B1.Nf6 W2.c4 B2.e6 W3.b3 B3.b6 W4.Bb2 ...
267899,0-1,2501,2537,39,W1.f4 B1.c5 W2.g3 B2.d5 W3.Bg2 B3.Nf6 W4.Nf3 ...


In [27]:
pd.unique(df.result)

array(['1/2-1/2', '1-0', '0-1'], dtype=object)

In [31]:
def dict_game_results(df):
    df_wins = df.reset_index()
    dict_wins = {}
    for i in tqdm(range(len(df_wins))):
        result_i = df.iloc[i].result
        if result_i == '1-0':
            dict_wins[i] = 1
        elif result_i == '0-1':
            dict_wins[i] = 0
        else:
            dict_wins[i] = 0.5
            
    return dict_wins

In [32]:
# we first check whether the dictionary has already been calculated
try:
    with open('../Data/Metric_creation/dict_wins.pkl', 'rb') as f:
        dict_corres = pickle.load(f)
except:
    dict_wins = dict_game_results(df)
    if not os.path.isdir('../Data/Metric_creation'):
        if not os.path.isdir('../Data'):
            os.mkdir('../Data')
        os.mkdir('../Data/Metric_creation')
    with open('../Data/Metric_creation/dict_wins.pkl', 'wb') as f:
        pickle.dump(dict_wins, f)

100%|████████████████████████████████████████████████████████████████████████| 118319/118319 [00:27<00:00, 4289.85it/s]
