In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
from joblib import parallel_backend

import mlflow.sklearn
import pandas as pd
import numpy as np

from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.model_selection import RandomizedSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import balanced_accuracy_score
from sklearn.utils import resample

from config import MLRUNS_FOLDER_PATH, BASELINE_EXPERIMENT
from chesswinnerprediction.baseline.constants import BASELINE_RANDOM_STATE
from chesswinnerprediction.constants import (
    DRAW_STR,
    BLACK_WIN_STR,
    WHITE_WIN_STR,
    PROCESSED_FOLDER_PATH,
)
from chesswinnerprediction.baseline.utils import (
    estimate_baseline_model,
    get_x_and_y,
    transform_and_scale_df,
)

## Set the MLflow tracking URI and experiment

In [3]:
# Point MLflow at the tracking location configured in `config.MLRUNS_FOLDER_PATH`.
mlflow.set_tracking_uri(MLRUNS_FOLDER_PATH)

In [4]:
# Look the experiment up once and reuse the result: reuse its id when it
# already exists, otherwise create it and keep the id MLflow returns.
experiment = mlflow.get_experiment_by_name(BASELINE_EXPERIMENT)
if experiment is not None:
    experiment_id = experiment.experiment_id
else:
    experiment_id = mlflow.create_experiment(BASELINE_EXPERIMENT)

In [5]:
# Make the resolved experiment the active one for the runs started below.
mlflow.set_experiment(experiment_id=experiment_id)
# Enable scikit-learn autologging. `max_tuning_runs` caps the number of child
# runs MLflow creates for hyperparameter-search candidates; 0 requests none.
mlflow.sklearn.autolog(disable=False, max_tuning_runs=0)

## Load Data

In [6]:
# Folder name of the processed dataset to use
# (presumably the Lichess rated-games dump for May 2017 — confirm).
data_dir = "lichess_db_standard_rated_2017-05"
# Location of that folder under the project's processed-data root.
data_path = os.path.join(PROCESSED_FOLDER_PATH, data_dir)

In [7]:
# Read the training and validation CSVs (in that order) from `data_path`.
train_df, valid_df = (
    pd.read_csv(os.path.join(data_path, csv_name))
    for csv_name in ("train.csv", "valid.csv")
)


KeyboardInterrupt



In [None]:
# One scaler instance is shared by both calls, so the order matters: the
# training frame is processed first, and the validation call passes
# fit_scaler=False — presumably so it reuses the statistics learned from the
# training data instead of refitting (confirm in `transform_and_scale_df`).
std_scaler = StandardScaler()
train_data = transform_and_scale_df(train_df, std_scaler)
valid_data = transform_and_scale_df(valid_df, std_scaler, fit_scaler=False)

In [None]:
# Split each prepared frame into its feature matrix and target (train first,
# then valid), calling the helper with predict_draws=True for both.
(X_train, y_train), (X_valid, y_valid) = (
    get_x_and_y(prepared_df, predict_draws=True)
    for prepared_df in (train_data, valid_data)
)

## KNN Implementation

In [None]:
class CustomKNNClassifier(BaseEstimator, ClassifierMixin):
    """K-nearest-neighbours classifier that rebalances its training data.

    Before fitting, the training set is rebalanced on the ``"Result"`` column:
    draws are oversampled with replacement and both win classes are
    undersampled without replacement (see :meth:`balance_dataset`). The
    rebalanced data is then used to fit a ``KNeighborsClassifier``.

    Parameters
    ----------
    wins_constant : float
        Each win class is sampled down to ``int(n_draws * (1 + wins_constant))``
        rows, where ``n_draws`` is the oversampled draw count.
    draws_constant : float
        Draws are oversampled to ``int(len(draws) * (1 + draws_constant))`` rows.
    n_neighbors, weights, leaf_size
        Forwarded to the inner ``KNeighborsClassifier``.
    score_size : float
        Fraction of the rows passed to :meth:`score` that is actually scored
        when ``use_subset`` is true.
    random_state : int, numpy.random.RandomState or None
        Seed used for resampling and for choosing the scoring subset.

    Notes
    -----
    The inner classifier is rebuilt at the start of every :meth:`fit` call.
    ``set_params`` only assigns attributes on this wrapper, so a classifier
    created once in ``__init__`` would silently keep the constructor values of
    ``n_neighbors`` / ``weights`` / ``leaf_size`` during a hyperparameter
    search, which clones the estimator and then calls ``set_params``.
    """

    # Label list available before fitting. ``fit`` shadows it on the instance
    # with the label order of the fitted inner classifier, which is the column
    # order of ``predict_proba``.
    classes_ = [DRAW_STR, BLACK_WIN_STR, WHITE_WIN_STR]

    def __init__(
        self,
        wins_constant=0.03,
        draws_constant=0.05,
        n_neighbors=100,
        weights="distance",
        score_size=0.1,
        random_state=None,
        leaf_size=30,
    ):
        self.wins_constant = wins_constant
        self.leaf_size = leaf_size
        self.draws_constant = draws_constant
        self.n_neighbors = n_neighbors
        self.weights = weights
        self.score_size = score_size
        self.random_state = random_state
        self.predict_draws = True
        # Kept so the attribute exists before fitting; ``fit`` replaces it with
        # a classifier built from the then-current hyperparameters.
        self.knn = self._build_knn()

    def __repr__(self, **kwargs):
        # Fixed name regardless of the hyperparameter values.
        return "CustomKNNClassifier"

    def __str__(self, **kwargs):
        # Fixed name regardless of the hyperparameter values.
        return "CustomKNNClassifier"

    def _build_knn(self):
        """Create the inner KNN from the current hyperparameter attributes."""
        return KNeighborsClassifier(
            n_neighbors=self.n_neighbors,
            weights=self.weights,
            n_jobs=-1,
            leaf_size=self.leaf_size,
        )

    def _subset_rng(self):
        """Return the random source used to pick the scoring subset.

        With ``random_state=None`` the global NumPy generator is used, so the
        subset differs between calls. Otherwise the configured seed (or
        ``RandomState`` instance) is used, which makes scores reproducible.
        """
        if self.random_state is None:
            return np.random
        if isinstance(self.random_state, np.random.RandomState):
            return self.random_state
        return np.random.RandomState(self.random_state)

    def balance_dataset(self, train_data_df):
        """Return a class-rebalanced copy of ``train_data_df``.

        Parameters
        ----------
        train_data_df : pandas.DataFrame
            Training rows including a ``"Result"`` column holding the labels.

        Returns
        -------
        pandas.DataFrame
            Oversampled draws followed by undersampled black wins and
            undersampled white wins.
        """
        draws = train_data_df[train_data_df["Result"] == DRAW_STR]
        black_win = train_data_df[train_data_df["Result"] == BLACK_WIN_STR]
        white_win = train_data_df[train_data_df["Result"] == WHITE_WIN_STR]

        n_draws = int(draws.shape[0] * (1 + self.draws_constant))
        n_wins_target = int(n_draws * (1 + self.wins_constant))
        # Sampling without replacement cannot return more rows than exist, so
        # cap each target at the class size instead of letting resample raise.
        n_black_wins = min(n_wins_target, black_win.shape[0])
        n_white_wins = min(n_wins_target, white_win.shape[0])

        draws_oversample = resample(
            draws, replace=True, n_samples=n_draws, random_state=self.random_state
        )
        black_wins_undersample = resample(
            black_win,
            replace=False,
            n_samples=n_black_wins,
            random_state=self.random_state,
        )
        white_wins_undersample = resample(
            white_win,
            replace=False,
            n_samples=n_white_wins,
            random_state=self.random_state,
        )

        balanced_df = pd.concat(
            [draws_oversample, black_wins_undersample, white_wins_undersample]
        )
        return balanced_df

    def fit(self, x, y):
        """Rebalance ``(x, y)`` and fit a fresh inner KNN on the result.

        ``x`` and ``y`` are concatenated column-wise, so ``y`` must end up as
        the ``"Result"`` column of the combined frame. Returns ``self``.
        """
        # Build the classifier here so values applied through ``set_params``
        # (e.g. by RandomizedSearchCV) are the ones actually used.
        self.knn = self._build_knn()

        train_data_df = pd.concat([x, y], axis=1)
        balanced_df = self.balance_dataset(train_data_df)
        X_balanced, y_balanced = (
            balanced_df.drop("Result", axis=1),
            balanced_df["Result"],
        )
        self.knn.fit(X_balanced, y_balanced)
        # Expose the labels in the order of the ``predict_proba`` columns.
        self.classes_ = self.knn.classes_.tolist()
        return self

    def predict(self, x):
        """Predict a label for every row of ``x`` with the fitted inner KNN."""
        return self.knn.predict(x)

    def predict_proba(self, x):
        """Return class probabilities, one column per entry of ``classes_``."""
        return self.knn.predict_proba(x)

    def score(self, x, y, use_subset=True, **kwargs):
        """Return the balanced accuracy of the predictions for ``x``.

        Parameters
        ----------
        x : pandas.DataFrame
            Feature rows.
        y : pandas.Series
            True labels aligned with ``x`` by position.
        use_subset : bool
            When true (default), only a random ``score_size`` fraction of the
            rows is scored; when false, every row is scored.
        **kwargs
            Accepted and ignored.
        """
        if not use_subset:
            return balanced_accuracy_score(y, self.predict(x))

        n_rows = len(x)
        # Keep the sample size within [1, n_rows]: a fraction that rounds down
        # to zero would leave nothing to score, and more than n_rows cannot be
        # drawn without replacement.
        subset_size = min(n_rows, max(1, int(n_rows * self.score_size)))
        indices = self._subset_rng().choice(n_rows, size=subset_size, replace=False)
        X_subset = x.iloc[indices]
        y_subset = y.iloc[indices]

        y_pred = self.predict(X_subset)
        return balanced_accuracy_score(y_subset, y_pred)

In [None]:
# Candidate values for each hyperparameter; the search samples combinations
# from these lists.
param_distributions = dict(
    wins_constant=[0.01, 0.1, 0.2],
    draws_constant=[0.25, 0.5, 1],
    n_neighbors=[65, 75, 85],
    weights=["uniform"],
    random_state=[BASELINE_RANDOM_STATE],
)

# Randomized search: 5 sampled settings, each evaluated with 5-fold CV.
# scoring=None means the estimator's own `score` method rates each candidate;
# refit=True refits the best setting once the search finishes.
random_search = RandomizedSearchCV(
    CustomKNNClassifier(),
    param_distributions,
    n_iter=5,
    cv=5,
    scoring=None,
    refit=True,
    return_train_score=False,
    error_score=np.nan,
    random_state=BASELINE_RANDOM_STATE,
    n_jobs=-1,
    verbose=3,
)

In [None]:
# Run the hyperparameter search inside a dedicated MLflow run.
with mlflow.start_run(run_name="random_search") as run:
    # Remember the run id so a later cell can reopen this same run.
    random_search_run_id = run.info.run_id
    mlflow.log_param("predict_draws", True)
    
    # Execute the search's parallel jobs on joblib's "multiprocessing" backend.
    with parallel_backend("multiprocessing"):
        random_search.fit(X_train, y_train)

In [None]:
# Mean cross-validated score of the best sampled parameter setting.
random_search.best_score_

In [None]:
# Parameter setting that achieved the best mean cross-validated score.
random_search.best_params_

## Best KNN model

In [None]:
# args = {
#     "wins_constant": 0.1,
#     "weights": "uniform",
#     "random_state": 42,
#     "n_neighbors": 75,
#     "draws_constant": 0.5,
#     "leaf_size": 30,
# }



In [None]:
# Reopen the run that was created for the random search.
with mlflow.start_run(run_id=random_search_run_id) as run:
    # Take the search's best estimator and fit it on the training split.
    best_knn = random_search.best_estimator_.fit(X_train, y_train)

    # No feature-importance values are supplied for this model.
    feature_importance = None
    score = estimate_baseline_model(
        best_knn,
        feature_importance,
        X_train,
        y_train,
        X_valid,
        y_valid,
    )

In [None]:
# best_knn = random_search.best_estimator_
# best_knn = best_knn.fit(X_train, y_train)
# 
# feature_importance = None
# score = estimate_baseline_model(best_knn, feature_importance, X_train, y_train, X_valid, y_valid)