In [None]:
%load_ext autoreload
%autoreload 2

In [253]:
import os
from joblib import parallel_backend

import pandas as pd
import numpy as np

from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.model_selection import RandomizedSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import balanced_accuracy_score
from sklearn.utils import resample

from chesswinnerprediction.baseline.utils import (
    estimate_baseline_model,
    get_x_and_y,
    transform_and_scale_df,
    print_report
)
from chesswinnerprediction.baseline.constants import BASELINE_RANDOM_STATE
from chesswinnerprediction.constants import DRAW_STR, BLACK_WIN_STR, WHITE_WIN_STR, PROCESSED_FOLDER_PATH

In [208]:
data_dir = "lichess_db_standard_rated_2017-05"
data_path = os.path.join(PROCESSED_FOLDER_PATH, data_dir)

In [209]:
train_df = pd.read_csv(os.path.join(data_path, "train.csv"))
valid_df = pd.read_csv(os.path.join(data_path, "valid.csv"))

In [210]:
std_scaler = StandardScaler()
train_data = transform_and_scale_df(train_df, std_scaler)
valid_data = transform_and_scale_df(valid_df, std_scaler, fit_scaler=False)

In [249]:
valid_data = valid_data[:valid_data.shape[0] // 10]

In [250]:
X_train, y_train = get_x_and_y(train_data, predict_draws=True)
X_valid, y_valid = get_x_and_y(valid_data, predict_draws=True)

In [254]:
class CustomKNNClassifier(BaseEstimator, ClassifierMixin):
    classes_ = [DRAW_STR, BLACK_WIN_STR, WHITE_WIN_STR]
    
    def __init__(self, wins_constant=0.03, draws_constant=0.05, n_neighbors=100, weights="distance", score_size=0.1, random_state=None):
        self.wins_constant = wins_constant
        self.draws_constant = draws_constant
        self.n_neighbors = n_neighbors
        self.weights = weights
        self.score_size = score_size
        self.random_state = random_state
        self.knn = KNeighborsClassifier(n_neighbors=n_neighbors, weights=weights, n_jobs=-1)

    def balance_dataset(self, train_data_df):
        draws = train_data_df[train_data_df["Result"] == DRAW_STR]
        black_win = train_data_df[train_data_df["Result"] == BLACK_WIN_STR]
        white_win = train_data_df[train_data_df["Result"] == WHITE_WIN_STR]

        n_draws = int(draws.shape[0] * (1 + self.draws_constant))
        n_black_wins = n_white_wins = int(n_draws * (1 + self.wins_constant))
        
        # todo draws_oversample: replace=True ??
        draws_oversample = resample(draws, replace=True, n_samples=n_draws, random_state=self.random_state)
        black_wins_undersample = resample(black_win, replace=False, n_samples=n_black_wins, random_state=self.random_state)
        white_wins_undersample = resample(white_win, replace=False, n_samples=n_white_wins, random_state=self.random_state)

        balanced_df = pd.concat([draws_oversample, black_wins_undersample, white_wins_undersample])
        return balanced_df

    def fit(self, x, y):
        train_data_df = pd.concat([x, y], axis=1)
        train_data_df = self.balance_dataset(train_data_df)
        X_balanced, y_balanced = train_data_df.drop("Result", axis=1), train_data_df["Result"]
        self.knn.fit(X_balanced, y_balanced)
        return self

    def predict(self, x):
        return self.knn.predict(x)

    def score(self, x, y, **kwargs):
        subset_size = int(len(x) * self.score_size)
        indices = np.random.choice(len(x), size=subset_size, replace=False)
        X_subset = x.iloc[indices]
        y_subset = y.iloc[indices]
        
        y_pred = self.predict(X_subset)
        return balanced_accuracy_score(y_subset, y_pred)


In [257]:
param_distributions = {
    "wins_constant": [0.01, 0.02, 0.03, 0.04, 0.05],
    "draws_constant": [0.01, 0.02, 0.03, 0.04, 0.05],
    "n_neighbors": [50, 100, 150, 200, 250],
    "weights": ["uniform", "distance"],
    "random_state": [BASELINE_RANDOM_STATE] 
}

random_search = RandomizedSearchCV(
    estimator=CustomKNNClassifier(),
    param_distributions=param_distributions,
    n_iter=1,
    scoring=None,
    cv=5,
    random_state=BASELINE_RANDOM_STATE,
    verbose=3,
    n_jobs=-1,
    error_score="raise",
    return_train_score=True,
)

In [258]:
with parallel_backend("multiprocessing"):
    random_search.fit(X_train, y_train)

Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV 1/5] END draws_constant=0.03, n_neighbors=50, random_state=42, weights=uniform, wins_constant=0.03;, score=(train=0.576, test=0.445) total time=   2.8s
[CV 4/5] END draws_constant=0.03, n_neighbors=50, random_state=42, weights=uniform, wins_constant=0.03;, score=(train=0.579, test=0.474) total time=   3.1s
[CV 2/5] END draws_constant=0.03, n_neighbors=50, random_state=42, weights=uniform, wins_constant=0.03;, score=(train=0.577, test=0.463) total time=   3.2s
[CV 3/5] END draws_constant=0.03, n_neighbors=50, random_state=42, weights=uniform, wins_constant=0.03;, score=(train=0.575, test=0.458) total time=   3.7s
[CV 5/5] END draws_constant=0.03, n_neighbors=50, random_state=42, weights=uniform, wins_constant=0.03;, score=(train=0.580, test=0.440) total time=   4.1s


In [259]:
best_knn = random_search.best_estimator_

print_report(best_knn, X_train, y_train, X_valid, y_valid)


                                                Classification Report
                         Train Report                                      Validation Report
              precision    recall  f1-score   support         precision    recall  f1-score   support
         0-1       0.62      0.46      0.53    378005              0.58      0.43      0.49      4522
         1-0       0.64      0.44      0.52    392248              0.62      0.43      0.50      4764
     1/2-1/2       0.07      0.83      0.13     20431              0.04      0.55      0.08       240
    accuracy                           0.46    790684                                  0.43      9526
   macro avg       0.44      0.58      0.39    790684              0.41      0.47      0.36      9526
weighted avg       0.61      0.46      0.51    790684              0.59      0.43      0.49      9526
       
