In [37]:
import pandas as pd

# Read the initially cleaned rated-games CSV into memory, then print the
# frame's schema summary (column dtypes, non-null counts, memory usage).
csv_path = 'data/rated_2014-01_initial_cleaning.csv'
df = pd.read_csv(csv_path)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 697470 entries, 0 to 697469
Data columns (total 7 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   event         697470 non-null  object 
 1   white_elo     697470 non-null  float64
 2   black_elo     697470 non-null  float64
 3   time_control  697470 non-null  object 
 4   result        697470 non-null  object 
 5   termination   697470 non-null  object 
 6   moves         697470 non-null  object 
dtypes: float64(2), object(5)
memory usage: 37.2+ MB


### Choosing only classical games and deleting `event` column

In [38]:
# Keep only the rows whose `event` is 'classical', then drop `event` itself:
# it is constant after the filter. The column is dropped by label rather than
# by slicing off the first column by position, so this cell stays correct even
# if the CSV's column order changes.
is_classical = df['event'] == 'classical'
df_classical = df.loc[is_classical].drop(columns=['event'])
df_classical.info()

<class 'pandas.core.frame.DataFrame'>
Index: 246495 entries, 2 to 697466
Data columns (total 6 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   white_elo     246495 non-null  float64
 1   black_elo     246495 non-null  float64
 2   time_control  246495 non-null  object 
 3   result        246495 non-null  object 
 4   termination   246495 non-null  object 
 5   moves         246495 non-null  object 
dtypes: float64(2), object(4)
memory usage: 13.2+ MB


### Splitting `time_control` into `time` and `increment` columns

In [39]:
# Peek at the raw `time_control` strings before parsing them.
df_classical.loc[:, 'time_control'].head()

2     900+30
7      600+0
8      600+0
12    1800+8
13    1200+5
Name: time_control, dtype: object

In [40]:
# Split every `time_control` string on '+' into two text columns and attach
# them to the frame as `time` and `increment`.
clock_parts = df_classical['time_control'].str.split('+', expand=True)
df_classical[['time', 'increment']] = clock_parts

# Both halves are still strings at this point; convert them to numbers.
for clock_col in ('time', 'increment'):
    df_classical[clock_col] = pd.to_numeric(df_classical[clock_col])

# The combined column is redundant once it has been parsed.
df_classical = df_classical.drop(columns=['time_control'])
df_classical.info()

<class 'pandas.core.frame.DataFrame'>
Index: 246495 entries, 2 to 697466
Data columns (total 7 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   white_elo    246495 non-null  float64
 1   black_elo    246495 non-null  float64
 2   result       246495 non-null  object 
 3   termination  246495 non-null  object 
 4   moves        246495 non-null  object 
 5   time         246495 non-null  int64  
 6   increment    246495 non-null  int64  
dtypes: float64(2), int64(2), object(3)
memory usage: 15.0+ MB


### Standardizing time and Elo values (zero mean, unit variance).

In [41]:
from sklearn.preprocessing import StandardScaler
import joblib

time_cols = ['time', 'increment']
elo_cols = ['white_elo', 'black_elo']

# Use one StandardScaler per feature group instead of refitting a single
# instance. Each saved file then corresponds to an object that is never
# re-fitted afterwards, so the result no longer depends on the dump happening
# before the next fit.
# NOTE(review): both scalers are fitted on every row of df_classical; if a
# train/test split happens downstream, consider fitting on the training rows
# only — confirm against the training notebook.
time_scaler = StandardScaler()
df_classical[time_cols] = time_scaler.fit_transform(df_classical[time_cols])
joblib.dump(time_scaler, 'models/time_control_scaler_std.save')

elo_scaler = StandardScaler()
df_classical[elo_cols] = elo_scaler.fit_transform(df_classical[elo_cols])

# Backward-compatible alias: before this change the name `scaler` ended the
# cell bound to the Elo-fitted scaler.
scaler = elo_scaler

# Left as the final expression so the cell still displays the written path.
joblib.dump(elo_scaler, 'models/elo_scaler_std.save')

['models/elo_scaler_std.save']

In [42]:
# Preview the first rows after scaling the Elo and time columns.
df_classical.head(n=5)

Unnamed: 0,white_elo,black_elo,result,termination,moves,time,increment
2,-0.15503,-1.685366,1-0,normal,e4 d5 e5 Nc6 d4 g6 Bg5 Bg7 Nd2 f6 exf6 Nxf6 Nb...,0.388908,2.281395
7,-0.805046,0.504307,1-0,normal,e4 e6 Nc3 c6 f4 d5 f5 exf5 exd5 cxd5 d4 Qe7+ B...,-0.052241,-0.605796
8,-0.329661,0.598894,1-0,normal,e4 g6 Bc4 Bg7 c3 Nf6 e5 Ng4 d4 d5 Bb3 Nc6 h3 N...,-0.052241,-0.605796
12,-1.27073,0.286759,0-1,normal,d4 Nf6 c4 a6 Nc3 b6 e4 Bb7 e5 Ng8 Nh3 h6 Be2 e...,1.712354,0.164122
13,-0.344214,-2.527184,1/2-1/2,normal,d4 d5 f4 Nc6 Nc3 Nxd4 Nxd5 Qxd5,0.830056,-0.124597


### Creating one-hot encoding for game results and termination.

In [43]:
# One-hot encode the two categorical columns. Requesting float32 from
# get_dummies directly removes the follow-up astype over a hand-typed list of
# column names, which would raise KeyError if any category were absent from
# the data.
df_classical = pd.get_dummies(
    df_classical, columns=['result', 'termination'], dtype='float32'
)

# Names of the generated indicator columns, in the order they appear in the
# frame. Derived from the frame rather than hard-coded so the list always
# matches the categories actually present.
one_hot_cols = [
    col for col in df_classical.columns
    if col.startswith(('result_', 'termination_'))
]
df_classical.head()

Unnamed: 0,white_elo,black_elo,moves,time,increment,result_0-1,result_1-0,result_1/2-1/2,termination_normal,termination_rules infraction,termination_time forfeit
2,-0.15503,-1.685366,e4 d5 e5 Nc6 d4 g6 Bg5 Bg7 Nd2 f6 exf6 Nxf6 Nb...,0.388908,2.281395,0.0,1.0,0.0,1.0,0.0,0.0
7,-0.805046,0.504307,e4 e6 Nc3 c6 f4 d5 f5 exf5 exd5 cxd5 d4 Qe7+ B...,-0.052241,-0.605796,0.0,1.0,0.0,1.0,0.0,0.0
8,-0.329661,0.598894,e4 g6 Bc4 Bg7 c3 Nf6 e5 Ng4 d4 d5 Bb3 Nc6 h3 N...,-0.052241,-0.605796,0.0,1.0,0.0,1.0,0.0,0.0
12,-1.27073,0.286759,d4 Nf6 c4 a6 Nc3 b6 e4 Bb7 e5 Ng8 Nh3 h6 Be2 e...,1.712354,0.164122,1.0,0.0,0.0,1.0,0.0,0.0
13,-0.344214,-2.527184,d4 d5 f4 Nc6 Nc3 Nxd4 Nxd5 Qxd5,0.830056,-0.124597,0.0,0.0,1.0,1.0,0.0,0.0


### Creating move embeddings with a fastText model.

In [44]:
import fasttext as ft

# Location of the fastText binary model used to embed individual moves.
EMBEDDINGS_MODEL = 'models/chess2vec-3ws-16dim.bin'

# Load the model from disk; `model` is used below to look up move vectors.
model = ft.load_model(EMBEDDINGS_MODEL)



In [45]:
from pandas import DataFrame
import numpy as np
def process_game_moves(vecs_dictionary: dict, moves_str: str) -> list:
    """Translate one game's move string into a list of move vectors.

    Args:
        vecs_dictionary: Mapping from a move token to its vector.
        moves_str: Whitespace-separated move tokens for a single game.

    Returns:
        One entry from ``vecs_dictionary`` per token, in game order. An empty
        or all-whitespace string yields an empty list.

    Raises:
        KeyError: If a token is missing from ``vecs_dictionary``.
    """
    # split() with no argument (not split(' ')) so that doubled, leading or
    # trailing whitespace never produces an empty-string token. This is the
    # same tokenisation moves2vec uses when it builds the lookup table, so
    # every token produced here is guaranteed to be a key of that table.
    return [vecs_dictionary[move] for move in moves_str.split()]


def moves2vec(model, df_moves: 'pd.Series') -> 'pd.Series':
    """Replace every game's move string with a list of per-move vectors.

    Each distinct move token is looked up in the model once and cached, and
    the cached vectors are reused for every game.

    Args:
        model: Object exposing ``get_word_vector(token)``, such as a loaded
            fastText model.
        df_moves: Series with one whitespace-separated move string per game.

    Returns:
        Series with the same index as ``df_moves`` whose values are lists of
        vectors, one vector per move.
    """
    # Collect the vocabulary first so the model is queried once per distinct
    # move rather than once per occurrence.
    unique_moves = set()
    for moves in df_moves.values:
        unique_moves.update(moves.split())

    moves_vecs = {move: model.get_word_vector(move) for move in unique_moves}

    return df_moves.apply(lambda moves_str: process_game_moves(moves_vecs, moves_str))

In [46]:
# Swap each game's move string for its list of per-move embedding vectors,
# then preview the result.
move_vectors = moves2vec(model, df_classical['moves'])
df_classical['moves'] = move_vectors
df_classical.head()

Unnamed: 0,white_elo,black_elo,moves,time,increment,result_0-1,result_1-0,result_1/2-1/2,termination_normal,termination_rules infraction,termination_time forfeit
2,-0.15503,-1.685366,"[[0.45821777, 0.44841745, 0.76662135, 0.748688...",0.388908,2.281395,0.0,1.0,0.0,1.0,0.0,0.0
7,-0.805046,0.504307,"[[0.45821777, 0.44841745, 0.76662135, 0.748688...",-0.052241,-0.605796,0.0,1.0,0.0,1.0,0.0,0.0
8,-0.329661,0.598894,"[[0.45821777, 0.44841745, 0.76662135, 0.748688...",-0.052241,-0.605796,0.0,1.0,0.0,1.0,0.0,0.0
12,-1.27073,0.286759,"[[0.43196738, 0.40913582, 0.66153723, 0.606756...",1.712354,0.164122,1.0,0.0,0.0,1.0,0.0,0.0
13,-0.344214,-2.527184,"[[0.43196738, 0.40913582, 0.66153723, 0.606756...",0.830056,-0.124597,0.0,0.0,1.0,1.0,0.0,0.0


In [47]:
# Sanity check: length of the vector for the first move of the first game.
len(df_classical['moves'].iloc[0][0])

16

In [48]:
# Persist the processed frame to HDF5 under key 'df'; mode='w' overwrites any
# existing file at that path.
# NOTE(review): `index=False` may not drop the row index here the way it does
# for to_csv — confirm against the to_hdf documentation.
df_classical.to_hdf(
    path_or_buf='data/classical_16dim_s.h5',
    key='df',
    mode='w',
    index=False,
)