In [1]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder,  StandardScaler
from sklearn.base import BaseEstimator, TransformerMixin
import pandas as pd
import numpy as np
from scipy import sparse

We did not want to store redundant columns, so our dataset does not contain the per-move average values (these can be calculated without any problems).\
The `transform` method of our `CombinedAttributesAdder` class calculates them.

In [2]:
class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    """Transformer that appends per-move averages to a numeric feature matrix.

    For each side (black, white) the centipawn loss, blunder, mistake and
    inaccuracy columns are divided by that side's move-count column, and the
    eight resulting columns are appended to the right of the input.

    Parameters
    ----------
    indices : mapping
        Maps column names (``"white_cpl"``, ``"black_moves"``, ...) to the
        positional index of that column in the array given to ``transform``.
    """

    # (numerator, denominator) column names, listed in the order in which the
    # derived columns are appended to the output.
    _RATIO_COLUMNS = (
        ("black_cpl", "black_moves"),
        ("black_blunders", "black_moves"),
        ("black_mistakes", "black_moves"),
        ("black_inaccuracies", "black_moves"),
        ("white_cpl", "white_moves"),
        ("white_blunders", "white_moves"),
        ("white_mistakes", "white_moves"),
        ("white_inaccuracies", "white_moves"),
    )

    def __init__(self, indices):
        self.indices = indices

    def fit(self, X, y=None):
        """Stateless transformer: there is nothing to learn, so return ``self``."""
        return self

    def transform(self, X, y=None):
        """Return ``X`` with the eight per-move average columns appended.

        ``X`` is indexed as ``X[:, column]``, so it must support 2-D
        positional indexing (e.g. a NumPy array).
        """
        column_of = self.indices
        derived = [
            X[:, column_of[numerator]] / X[:, column_of[denominator]]
            for numerator, denominator in self._RATIO_COLUMNS
        ]
        # np.c_ turns each 1-D ratio into a column and stacks it after X.
        return np.c_[(X, *derived)]

Now we create a pipeline, which can then be used to transform new examples according to the same schema.

In [3]:
# Numeric input columns. Their order matters: it is the column order of the
# array that reaches CombinedAttributesAdder inside the numeric pipeline.
data_num = ['white_rating_diff', 'black_rating_diff', 'white_cpl', 'black_cpl', 'white_moves', 'black_moves', 'white_inaccuracies', 'white_mistakes', 'white_blunders', 'black_inaccuracies', 'black_mistakes', 'black_blunders']
# Column name -> positional index within `data_num`, used by the attribute adder.
data_num_dict = {col: idx for idx, col in enumerate(data_num)}
# Categorical input columns, one-hot encoded.
data_cat = ['result', 'time_control', 'termination']

# Numeric branch: fill gaps with the median, append the per-move averages,
# then standardise every column (original and derived).
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy="median")),
    ('attribs_adder', CombinedAttributesAdder(data_num_dict)),
    ('std_scaler', StandardScaler())
])

full_pipeline = ColumnTransformer(
    [
        ('num', num_pipeline, data_num),
        # The fitted pipeline is later applied to separate per-player files.
        # With the default handle_unknown="error" a single category value that
        # was not present during fit aborts transform(); "ignore" encodes such
        # a value as an all-zero row for that feature instead.
        ('cat', OneHotEncoder(handle_unknown="ignore"), data_cat),
    ],
    # Always return a dense ndarray. The result is measured with len() and
    # written with np.save, neither of which handles a SciPy sparse matrix.
    sparse_threshold=0,
)

In [4]:
# One CSV dump per time control.
time_control_suffixes = ["180_0", "180_2", "300_0", "300_3"]
all_files = [f"data/dataBig_{suffix}.csv" for suffix in time_control_suffixes]

raw_frames = [pd.read_csv(path, index_col=None, header=0) for path in all_files]

# Stack all dumps, then apply the row filters one after another:
# game length, rating change of both players, and Elo of both players.
data = (
    pd.concat(raw_frames, axis=0, ignore_index=True)
    .loc[lambda games: games["white_moves"] > 10]
    .loc[lambda games: games["white_moves"] < 70]
    .loc[lambda games: games["white_rating_diff"].abs() < 12]
    .loc[lambda games: games["black_rating_diff"].abs() < 12]
    .loc[lambda games: (games["white_elo"] > 1000) & (games["white_elo"] < 2300)]
    .loc[lambda games: (games["black_elo"] > 1000) & (games["black_elo"] < 2300)]
)

# Fit the preprocessing pipeline on the filtered games and transform them.
data_prepared = full_pipeline.fit_transform(data)


In [5]:
# Each label must correspond to exactly one feature row. `shape[0]` is used
# for the feature matrix because `len()` raises TypeError on a SciPy sparse
# matrix, which a ColumnTransformer may return.
assert len(data['white_elo']) == data_prepared.shape[0], (
    f"label/feature row mismatch: {len(data['white_elo'])} labels "
    f"vs {data_prepared.shape[0]} feature rows"
)

In [6]:
# Persist the transformed feature matrix ...
np.save(file="./data/data_prepared.npy", arr=data_prepared)
# ... and the matching labels (white player's Elo), one per row, with a header line.
data.loc[:, "white_elo"].to_csv(
    "./data/data_prepared_labels.csv",
    index=False,
    header=True,
)

We decided to keep a small additional portion of data, containing only players for whom we have some games, to test our results on more realistic data than a single game.

In [7]:
# Transform each player's games with the already-fitted pipeline and write the
# features and labels to disk. Players left with no games after filtering are
# skipped, and output files are numbered consecutively without gaps.
allowed_time_controls = ["180+0", "180+2", "300+0", "300+3"]

skipped = 0
for player_id in range(20):
    df = (
        pd.read_csv(f"players/player_{player_id}.csv", index_col=None, header=0)
        .loc[lambda games: games["time_control"].isin(allowed_time_controls)]
        .loc[lambda games: games["white_moves"] > 10]
        .loc[lambda games: games["white_moves"] < 70]
        .loc[lambda games: games["white_rating_diff"].abs() < 12]
        .loc[lambda games: games["black_rating_diff"].abs() < 12]
        .loc[lambda games: (games["white_elo"] > 1000) & (games["white_elo"] < 2300)]
        .loc[lambda games: (games["black_elo"] > 1000) & (games["black_elo"] < 2300)]
    )

    if df.shape[0] == 0:
        skipped += 1
        continue

    # Input index minus the players skipped so far gives the consecutive output index.
    output_idx = player_id - skipped
    players_prepared = full_pipeline.transform(df)
    np.save(f"./players/player_prepared_{output_idx}.npy", players_prepared)
    df["white_elo"].to_csv(
        f"./players/player_prepared_labels_{output_idx}.csv",
        index=False,
        header=True,
    )