# Capstone Project - Guess the Game!  
By Grandadam Patrik

In [2]:
from datetime import datetime
from datetime import date
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import time
import pandas as pd
import numpy as np
import berserk
import chess
import chess.variant
import chess.svg
import sys
import os
from itertools import chain
import random
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import matplotlib.patheffects as path_effects
import warnings

In [3]:
# NOTE(review): an API token used to be hardcoded on this line; credentials must never be committed.
# If authenticated access is needed, read the token from the environment instead, e.g.:
# session = berserk.TokenSession(os.environ['LICHESS_TOKEN'])
# client = berserk.Client(session=session)
# Client created without any session object.
client = berserk.Client()

In [4]:
# Iterating the top-10 response yields the category names displayed below.
variants = list(client.users.get_all_top_10())
variants

['bullet',
 'blitz',
 'rapid',
 'classical',
 'ultraBullet',
 'crazyhouse',
 'chess960',
 'kingOfTheHill',
 'threeCheck',
 'antichess',
 'atomic',
 'horde',
 'racingKings']

In [5]:
# removing variants not of interest
# (filtering instead of list.remove keeps the cell idempotent: re-running it
# no longer raises ValueError because the names are already gone)
excluded_variants = ('rapid', 'classical', 'bullet', 'ultraBullet')
variants = [name for name in variants if name not in excluded_variants]

In [6]:
# display the variants that remain after the removal above
variants

['blitz',
 'crazyhouse',
 'chess960',
 'kingOfTheHill',
 'threeCheck',
 'antichess',
 'atomic',
 'horde',
 'racingKings']

### Creating the base of games to be analyzed

In [7]:
# Creating a dictionary of tournaments from which we are going to extract the games
# key = variant
# value = id of the tournament (that will be used to extract the games)
# The insertion order of this dictionary is relied upon by the cells below,
# which pair its keys positionally with the loaded games.
tournaments = {
    'blitz': 'ifQ9TCih', 
    'crazyhouse': 'KRJmUfvH', # NOTE(review): the saved output below reports code 'OwBEn5yb' for crazyhouse - confirm which tournament the cached file comes from
    'chess960': 'WKJDu74q',
    'kingOfTheHill': 'nUayLau9',
    'threeCheck': 'RKVWFUcH',
    'antichess': 'B78NA5ij', 
    'atomic': 'hPWUVAAC', 
    'horde': 'cTfMulCY',
    'racingKings': 'gDpG4byN'
}

In [8]:
# Load the cached games of every variant from '<variant>.npz' (working directory).
# saved_games[i] holds the games of the i-th key of `tournaments`.
saved_games = []
for key in tournaments:
    # NOTE(security): allow_pickle=True can execute arbitrary code while loading;
    # only open archives that were produced by a trusted source.
    with np.load(key + '.npz', allow_pickle=True) as npz_file:
        # read only the 'games' array instead of materialising every array of the archive
        saved_games.append(npz_file['games'].tolist())

In [9]:
# one summary line per variant: number of loaded games and source tournament
for (variant_name, tournament_code), variant_games in zip(tournaments.items(), saved_games):
    print(len(variant_games), "games in the variant", variant_name, "(code of tournament:", tournament_code, ")")

42490 games in the variant blitz (code of tournament: ifQ9TCih )
7164 games in the variant crazyhouse (code of tournament: OwBEn5yb )
17366 games in the variant chess960 (code of tournament: WKJDu74q )
6621 games in the variant kingOfTheHill (code of tournament: nUayLau9 )
14355 games in the variant threeCheck (code of tournament: RKVWFUcH )
14500 games in the variant antichess (code of tournament: B78NA5ij )
16675 games in the variant atomic (code of tournament: hPWUVAAC )
7690 games in the variant horde (code of tournament: cTfMulCY )
7539 games in the variant racingKings (code of tournament: gDpG4byN )


In [10]:
# Selecting 5000 random games from each variant
# Challenge: there are some games that are "aborted" (one of both players did not make any move)
# We want to exclude these games and replace them only by games that have indeed been played.

N_RESERVED_GAMES = 1000      # first games of each variant, never sampled: used as replacements
N_GAMES_PER_VARIANT = 5000   # size of the random sample drawn for each variant
N_REPLACEMENT_PASSES = 10    # passes over the sample, so that aborted replacements get replaced too

random_games = []
random.seed(0)

# looping over each variant and picking games
for variant_name, variant_games in zip(tournaments, saved_games):
    # sampling without replacement, outside of the reserved replacement pool
    random_extract = random.sample(variant_games[N_RESERVED_GAMES:], N_GAMES_PER_VARIANT)

    new_games = []
    removed_index = []
    k = 0  # index of the next unused game in the replacement pool

    # checking that the selected games have at least 1 move by each player and replacing them if not the case
    for loop in range(N_REPLACEMENT_PASSES):
        for j in range(len(random_extract)):
            if len(random_extract[j]['moves'].split(' ')) < 2:
                if k >= N_RESERVED_GAMES:
                    # going further would pick games that may already be in the sample (duplicates)
                    raise RuntimeError('replacement pool exhausted for variant ' + variant_name)
                random_extract[j] = variant_games[k] # replacing the game that has been aborted by a new game
                k += 1
                new_games.append(k)
                removed_index.append(j)
    random_games.append(random_extract) # appending the random games to our output
    # default=0: a variant without any aborted game must not crash the cell
    print('variant:', variant_name, '; replaced games:', removed_index, '; new games:', max(new_games, default=0))

variant: blitz ; replaced games: [0, 128, 137, 144, 157, 244, 278, 310, 418, 434, 584, 645, 702, 706, 803, 885, 886, 926, 1089, 1106, 1143, 1238, 1248, 1292, 1351, 1358, 1427, 1456, 1495, 1569, 1575, 1578, 1618, 1622, 1624, 1668, 1723, 1816, 1899, 1918, 1977, 1986, 2038, 2064, 2085, 2122, 2201, 2249, 2282, 2295, 2366, 2391, 2459, 2493, 2609, 2668, 2687, 2740, 2814, 2933, 2981, 3003, 3026, 3045, 3119, 3139, 3314, 3329, 3337, 3381, 3387, 3411, 3467, 3473, 3551, 3576, 3582, 3662, 3699, 3701, 3770, 3805, 3854, 4021, 4028, 4101, 4109, 4113, 4120, 4126, 4240, 4515, 4523, 4605, 4614, 4617, 4633, 4787, 4911, 244, 278, 706, 3337, 4605] ; new games: 104
variant: crazyhouse ; replaced games: [8, 12, 14, 39, 62, 86, 93, 114, 166, 220, 248, 290, 373, 460, 491, 520, 530, 532, 549, 555, 626, 655, 712, 717, 727, 728, 767, 770, 784, 792, 924, 933, 1023, 1024, 1106, 1135, 1140, 1143, 1221, 1262, 1287, 1292, 1310, 1355, 1375, 1554, 1635, 1712, 1774, 1782, 1800, 1805, 1954, 2032, 2053, 2096, 2108, 2128, 2

In [11]:
# flatten the per-variant lists into one single list of games
all_games = list(chain.from_iterable(random_games))
print(len(all_games)) # 5000 games * 9 variants = 45000 games

45000


In [12]:
# definition of a function to convert the list of dictionaries into a single list with:
# inputs: the keys representing the "path" in the dictionaries. Ex: players --> white --> user --> id
# output: the list of the values found at the end of that path

def convert_to_list(keys, data=None):
    """Extract one (possibly nested) value from every record of `data`.

    Parameters
    ----------
    keys : sequence
        Path of keys to follow inside each record, outermost first
        (e.g. ['players', 'white', 'user', 'id']). Any depth is supported;
        an empty path returns the records themselves.
    data : sequence of dict, optional
        Records to read. When omitted, the notebook-level `all_games` is used
        (looked up at call time, so a rebuilt `all_games` is picked up).

    Returns
    -------
    list
        One entry per record, in order. The entry is None when the path
        cannot be followed (missing key, or an intermediate value that is
        None or not subscriptable).
    """
    if data is None:
        data = all_games

    def extract_info(level, records):
        output = []
        for record in records:
            try:
                output.append(record[level])
            except (KeyError, IndexError, TypeError):
                # KeyError/IndexError: key absent; TypeError: parent is None or a scalar
                output.append(None) # show "None" if the key is not included in the dictionary
        return output

    list_output = list(data)
    for level in keys:  # walk down the path one level at a time
        list_output = extract_info(level, list_output)
    return list_output

In [13]:
# top-level attributes of every game (one list per attribute, aligned with all_games)
game_id, variant, speed, status, initial_fen, winner, moves = [
    convert_to_list([field], all_games)
    for field in ('id', 'perf', 'speed', 'status', 'initialFen', 'winner', 'moves')
]

# nested player attributes, extracted for White and then for Black
white_id, white_title, white_rating, black_id, black_title, black_rating = [
    convert_to_list(['players', colour] + path, all_games)
    for colour in ('white', 'black')
    for path in (['user', 'id'], ['user', 'title'], ['rating'])
]

In [14]:
# extracting position after N moves
N = 10
# convert each move string to a list of moves to be readable by the chess package;
# entries that are already lists are kept as they are, so re-running this cell is harmless
moves = [game_moves.split(' ') if isinstance(game_moves, str) else game_moves for game_moves in moves]
# N "moves" = N movements for white and N for black = N*2 movements
first_moves = [game_moves[:(N*2)] for game_moves in moves]

In [15]:
# number of full moves kept for each game (integer division of the number of half-moves by 2)
# NOTE(review): the division rounds down, so a last move played only by White is NOT counted here,
# although the original comment stated that it should be - confirm the intent.
nb_moves = [len(game_moves) // 2 for game_moves in first_moves]

# True when the game reached move N (N*2-1 half-moves are enough: White's N-th move counts)
pos_N = [len(game_moves) >= N*2-1 for game_moves in first_moves]

In [16]:
# check if the game has indeed been played: True when at least two half-moves
# (one per player) are present in the extracted moves
game_played = [len(game_moves) >= 2 for game_moves in first_moves]

In [17]:
game_played.count(False) # Should be 0: every game has at least one move by each player

0

In [18]:
position = [] # creating empty list to store the position after N moves
failed_games = [] # indices of the games in which at least one move could not be replayed

# board class used for each variant starting from its standard initial position
# (chess960 is handled separately because it starts from the FEN of the game)
board_classes = {
    'blitz': chess.Board,
    'crazyhouse': chess.variant.CrazyhouseBoard,
    'kingOfTheHill': chess.variant.KingOfTheHillBoard,
    'threeCheck': chess.variant.ThreeCheckBoard,
    'antichess': chess.variant.AntichessBoard,
    'atomic': chess.variant.AtomicBoard,
    'horde': chess.variant.HordeBoard,
    'racingKings': chess.variant.RacingKingsBoard,
}

# iterating over the games (represented as sequences of moves)
for game in range(len(first_moves)):

    # initialization of the chess board for each variant
    if variant[game] == 'chess960':
        # chess960=True is required for castling to be handled from a shuffled start position
        board = chess.Board(initial_fen[game], chess960=True)
    elif variant[game] in board_classes:
        board = board_classes[variant[game]]()
    else:
        # previously the board of the preceding game was silently reused in this case
        raise ValueError(f"unsupported variant {variant[game]!r} (game index {game})")

    # playing the moves on the board
    for move in first_moves[game]:     # iterating over each move of each game
        try:
            board.push_san(move)    # moving the pieces of the board
        except ValueError:
            # best effort as before (the move is skipped), but the game is now recorded
            if not failed_games or failed_games[-1] != game:
                failed_games.append(game)
    position.append(board.board_fen())  # getting the FEN of the positions

if failed_games:
    warnings.warn(f"{len(failed_games)} games contain at least one move that could not be replayed (see failed_games)")

In [19]:
# Assemble the per-game lists built above into one dataframe (one row per game).
# The commented-out entries are attributes extracted earlier that are
# currently left out of the dataframe.
df = pd.DataFrame({
    'game_id': game_id,
    'variant': variant,
   # 'speed': speed,
   # 'white_id': white_id,
   # 'white_rating': white_rating,
   # 'black_id': black_id,
   # 'black_rating': black_rating,
   # 'winner': winner,
    'first_moves': first_moves,   # list of at most N*2 half-moves
    'nb_moves': nb_moves, 
    'moves_played_is_N': pos_N,   # True when at least N*2-1 half-moves were played
    'game_played': game_played,
    'position': position          # piece placement (board FEN) after the first moves
})

In [20]:
# a function to check the piece standing on each square of the board
# input = chess board ; output = df of shape (1,64)
def board_to_df(board):
    """Return a one-row dataframe with one column per square of `board`.

    Each column is named after its square ('a1' ... 'h8') and holds the
    symbol of the piece standing on it, or the string "Empty" when
    `board.piece_at` returns None for that square.
    """
    squares_dict = {}
    for square, square_nb in zip(chess.SQUARE_NAMES, chess.SQUARES): # loop over all squares
        piece = board.piece_at(square_nb)
        # explicit test instead of a bare except, so that real errors are not hidden
        squares_dict[square] = piece.symbol() if piece is not None else "Empty"
    return pd.DataFrame(squares_dict, index=[0])

In [21]:
# creating a DF containing, for each game, all squares of the board
# One single-row frame is built per position and everything is concatenated once:
# DataFrame.append inside a loop copies the whole frame at every step (quadratic)
# and no longer exists in pandas >= 2.0.
square_rows = [board_to_df(chess.Board(board_fen)) for board_fen in df['position']]
squares = pd.concat(square_rows) if square_rows else pd.DataFrame()

In [22]:
# appending the squares to the original DF
# both frames get a fresh 0..n-1 index so that the join pairs them row by row
squares = squares.reset_index(drop=True)
df = df.reset_index(drop=True).join(squares)

We start by creating a function for the encoding:  

In [23]:
def encode_square(square, df=None):
    """One-hot encode the content of one board square.

    Parameters
    ----------
    square : str
        Name of the column of `df` holding the piece symbols of the square.
    df : DataFrame, optional
        Frame containing the `square` column. When omitted, the notebook-level
        `df` is looked up at call time (a default bound at definition time
        would keep pointing to an outdated frame once `df` is reassigned).

    Returns
    -------
    DataFrame
        Seven 0/1 integer columns sharing the index of `df`:
        '<square>_white' (1 if a White piece stands on the square) followed by
        '<square>_q', '_k', '_r', '_b', '_n', '_p' (1 if that piece type,
        of either colour, stands on the square).
    """
    if df is None:
        df = globals()['df']
    pieces = df[square]

    # 1 if a White piece (uppercase symbol) stands on the square, 0 otherwise
    encoded_square = {square + "_white": pieces.isin(['Q', 'K', 'R', 'N', 'B', 'P']).astype(int)}

    suffix = ["_q", "_k", "_r", "_b", "_n", "_p"] # suffixes of the encoding, one per piece type

    # 1 if the corresponding piece (White or Black) is standing on the square, 0 otherwise
    for piece_suffix in suffix:
        letter = piece_suffix[-1]
        encoded_square[square + piece_suffix] = pieces.isin([letter.upper(), letter]).astype(int)

    return pd.DataFrame(encoded_square)

In [24]:
# raw content of square e5 next to its encoding, for the first 10 games
df[['e5']].join(encode_square('e5')).head(10)

Unnamed: 0,e5,e5_white,e5_q,e5_k,e5_r,e5_b,e5_n,e5_p
0,N,1,0,0,0,0,1,0
1,Empty,0,0,0,0,0,0,0
2,p,0,0,0,0,0,0,1
3,Empty,0,0,0,0,0,0,0
4,p,0,0,0,0,0,0,1
5,p,0,0,0,0,0,0,1
6,Empty,0,0,0,0,0,0,0
7,Empty,0,0,0,0,0,0,0
8,Empty,0,0,0,0,0,0,0
9,p,0,0,0,0,0,0,1


In [25]:
def encode_board(df):
    """Encode every square of the board with `encode_square`.

    `df` must contain one column per name in `chess.SQUARE_NAMES`. The result
    places the encoded columns of all squares side by side, in the order of
    `chess.SQUARE_NAMES`.
    """
    # `df` is now forwarded explicitly: it used to be ignored, so the encoding
    # was always computed on encode_square's default frame.
    encoded_squares = [encode_square(sq, df) for sq in chess.SQUARE_NAMES]
    # single concatenation instead of growing the frame square after square
    return pd.concat(encoded_squares, axis=1)

In [26]:
# preview of the encoded board for the first games
encode_board(df).head()

Unnamed: 0,a1_white,a1_q,a1_k,a1_r,a1_b,a1_n,a1_p,b1_white,b1_q,b1_k,...,g8_b,g8_n,g8_p,h8_white,h8_q,h8_k,h8_r,h8_b,h8_n,h8_p
0,1,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,1,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [27]:
def encode_df(df):
    """Return `df` followed by the encoded board columns from `encode_board(df)`."""
    board_features = encode_board(df)
    # the raw square columns are kept (dropping them is left disabled):
    # df_encoded = df_encoded.drop(chess.SQUARE_NAMES, axis = 1)
    return pd.concat([df, board_features], axis=1)

In [28]:
# NOTE: this cell is not idempotent - running it twice concatenates the encoded columns a second time
df = encode_df(df)
df.shape

(45000, 519)

In [29]:
df.isna().sum().sum() # total number of missing values over the whole frame (expected: 0)

0

In [30]:
df['game_id'].duplicated().sum() # number of repeated game ids (expected: 0)

0

In [31]:
df['variant'].value_counts() # check the number of games per variant (the same count is expected for each)

racingKings      5000
kingOfTheHill    5000
blitz            5000
threeCheck       5000
crazyhouse       5000
chess960         5000
atomic           5000
antichess        5000
horde            5000
Name: variant, dtype: int64

# Feature engineering & Exploratory Data Analysis (EDA) <a class="anchor" id="EDA"></a>

In [32]:
# squares of interest that will be needed in the analysis
white_first_row = ['a1', 'b1', 'c1', 'd1', 'e1', 'f1', 'g1', 'h1']
black_first_row = ['a8', 'b8', 'c8', 'd8', 'e8', 'f8', 'g8', 'h8']
center = ['d4', 'd5', 'e4', 'e5']
ext_center = ['c4', 'c5', 'f4', 'f5']
third_row_c_f = ['c3', 'c6', 'd3', 'd6', 'e3', 'e6', 'f3', 'f6']
b_g_col = ['b3','b4', 'b5', 'b6', 'g3','g4', 'g5', 'g6']
knight_squares = ['c3', 'd2', 'e2', 'f3', 'c6', 'd7', 'e7', 'f6']
castle_squares = ['g1', 'c1', 'g8', 'c8']

We then create the features of interest that will be analyzed. These features are the outcome of an iterative process; unless specified otherwise later, they are the features that will be included in the model.  
Features that were tested but did not show promising results are therefore not presented here.  

In [33]:
# counting the material on the board: one 'nb_<symbol>' column per piece symbol
# (uppercase symbols are White pieces, lowercase symbols are Black pieces)
for symbol in ['R', 'r', 'N', 'n', 'B', 'b', 'Q', 'q', 'K', 'k', 'P', 'p']:
    df['nb_' + symbol] = df[chess.SQUARE_NAMES].eq(symbol).sum(axis=1)

# counting the number of captures that have occurred in the games
def count_captures(list_moves):
    """Return the number of 'x' characters found in the moves of one game."""
    return sum(move.count('x') for move in list_moves)
df['nb_captures'] = df['first_moves'].apply(count_captures)

# counting the total "weight" of the pieces of each player
# using the "usual" weights in chess: queen = 9, rook = 5, bishop/knight = 3, pawn = 1
piece_values = {'Q': 9, 'R': 5, 'N': 3, 'B': 3, 'P': 1}
df['white_weight'] = sum(df['nb_' + piece] * value for piece, value in piece_values.items())
df['black_weight'] = sum(df['nb_' + piece.lower()] * value for piece, value in piece_values.items())
df['diff_weight'] = df['white_weight'] - df['black_weight']

# counting the number of Kings on their first row
df['w_king_1st_row'] = df[white_first_row].eq('K').sum(axis=1)
df['b_king_1st_row'] = df[black_first_row].eq('k').sum(axis=1)

# counting the pieces (pawns included) of each player on their own first row
df['w_nb_pieces_1st_row'] = df[white_first_row].isin(['K', 'Q', 'R', 'B', 'N', 'P']).sum(axis=1)
df['b_nb_pieces_1st_row'] = df[black_first_row].isin(['k', 'q', 'r', 'b', 'n', 'p']).sum(axis=1)

# creating new features to be inspected: total number of pieces standing on the given squares
sq = [center, ext_center, third_row_c_f, b_g_col]
sq_names =  ["center", "ext_center", "third_row_c_f", "b_g_col"]

for square, square_names in zip(sq, sq_names):
    # pieces of both colours, pawns excluded
    df['nb_pieces_'+square_names] = df[square].isin(['K', 'Q', 'R', 'B', 'N', 'k', 'q', 'r', 'b', 'n']).sum(axis=1)
    # pawns of both colours
    df['nb_pawns_'+square_names] = df[square].isin(['P', 'p']).sum(axis=1)

# counting the knights on typical "knight squares"
df['knight_squares'] = df[knight_squares].isin(['N', 'n']).sum(axis=1)

# counting the kings standing on the squares listed in castle_squares
df['king_castle_squares'] = df[castle_squares].isin(['K', 'k']).sum(axis=1)

# counting the number of checks that have been given in the games
def count_checks(list_moves):
    """Return the number of '+' characters found in the moves of one game."""
    return sum(move.count('+') for move in list_moves)
df['nb_checks'] = df['first_moves'].apply(count_checks)

We gather these features in a new dataframe that contains all engineered features.

In [34]:
# columns of the "engineered" dataframe: target, game length, material counts,
# then the positional features
eng_columns = (
    ['variant', 'nb_moves']
    + ['nb_' + symbol for symbol in 'KQRBNP']
    + ['nb_' + symbol for symbol in 'kqrbnp']
    + ['nb_captures', 'w_king_1st_row', 'b_king_1st_row', 'w_nb_pieces_1st_row', 'b_nb_pieces_1st_row']
    + ['nb_pawns_' + area for area in ('center', 'ext_center', 'third_row_c_f', 'b_g_col')]
    + ['nb_pieces_' + area for area in ('center', 'ext_center', 'third_row_c_f', 'b_g_col')]
    + ['knight_squares', 'king_castle_squares', 'nb_checks']
)
df_feat_eng = df[eng_columns]

In [35]:
# the material weights are not kept in df
df = df.drop(columns=['white_weight', 'black_weight', 'diff_weight'])

## "Engineered" dataframe <a class="anchor" id="engdf"></a>

In [36]:
# the output folder is created when missing, otherwise the write fails on a fresh checkout
os.makedirs('datasets', exist_ok=True)
df_feat_eng.to_pickle('datasets/df_feat_eng_5000')

In [37]:
# target = variant ; features = all the other engineered columns
y_eng = df_feat_eng['variant']
X_eng = df_feat_eng.drop(columns='variant')

## "Augmented" dataframe <a class="anchor" id="augdf"></a>

In [38]:
# target column followed by the encoded board (both share the index of df)
df_feat_aug = df[['variant']].join(encode_board(df))

In [39]:
# preview: the variant followed by the encoded squares
df_feat_aug.head()

Unnamed: 0,variant,a1_white,a1_q,a1_k,a1_r,a1_b,a1_n,a1_p,b1_white,b1_q,...,g8_b,g8_n,g8_p,h8_white,h8_q,h8_k,h8_r,h8_b,h8_n,h8_p
0,blitz,1,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,blitz,1,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,blitz,1,0,0,1,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
3,blitz,1,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,blitz,1,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [40]:
# the output folder is created when missing, otherwise the write fails on a fresh checkout
os.makedirs('datasets', exist_ok=True)
df_feat_aug.to_pickle('datasets/df_feat_aug_5000')

In [41]:
# target = variant ; features = all the encoded squares
y_aug = df_feat_aug['variant']
X_aug = df_feat_aug.drop(columns='variant')

## "Full" dataframe <a class="anchor" id="fulldf"></a>

In [42]:
# engineered features (with the target) side by side with the encoded squares
encoded_without_target = df_feat_aug.drop(columns='variant')
full_df = pd.concat([df_feat_eng, encoded_without_target], axis=1)

In [43]:
# the output folder is created when missing, otherwise the write fails on a fresh checkout
os.makedirs('datasets', exist_ok=True)
full_df.to_pickle('datasets/full_df_5000')

In [44]:
# preview: engineered features followed by the encoded squares
full_df.head()

Unnamed: 0,variant,nb_moves,nb_K,nb_Q,nb_R,nb_B,nb_N,nb_P,nb_k,nb_q,...,g8_b,g8_n,g8_p,h8_white,h8_q,h8_k,h8_r,h8_b,h8_n,h8_p
0,blitz,10,1,1,2,2,2,8,1,1,...,0,0,0,0,0,0,0,0,0,0
1,blitz,10,1,1,2,2,1,8,1,1,...,0,0,0,0,0,0,0,0,0,0
2,blitz,10,1,1,2,2,2,7,1,1,...,0,0,0,0,0,0,0,0,0,0
3,blitz,10,1,1,2,2,2,7,1,1,...,0,0,0,0,0,0,0,0,0,0
4,blitz,10,1,1,2,2,2,7,1,1,...,0,0,0,0,0,0,0,0,0,0
