In [None]:
from dotenv import load_dotenv
import itertools
import os

from explainerdashboard import ClassifierExplainer, ExplainerDashboard
import pandas as pd
from sklearn.base import clone
from sklearn.model_selection import train_test_split
from sklearn.dummy import DummyClassifier
from sklearn.inspection import PartialDependenceDisplay
import sklearn.metrics
from sqlalchemy import create_engine

import modelbuilderpaardensprong
from tweevoortwaalf.paardensprong import Paardensprong

In [None]:
# Configuration: everything below is read from the environment (.env file).
load_dotenv()

# Player whose games are used to train the model.
PLAYERNAME = os.getenv("playername")


def _psycopg_url(env_var):
    """Read a database URL from the environment and select the psycopg driver.

    Only a plain ``postgresql://`` scheme is rewritten to ``postgresql+psycopg://``.
    The rest of the URL (credentials, host, database name) is never touched, and
    a URL that already names a driver is returned unchanged.

    Raises:
        RuntimeError: if the environment variable is missing or empty.
    """
    url = os.getenv(env_var)
    if not url:
        raise RuntimeError(f"Environment variable {env_var!r} is not set")
    scheme, separator, remainder = url.partition("://")
    if scheme == "postgresql":
        scheme = "postgresql+psycopg"
    return f"{scheme}{separator}{remainder}"


database_url_prod = _psycopg_url("PROD_DATABASE_URL")
engine_prod = create_engine(database_url_prod)

database_url_dev = _psycopg_url("DATABASE_URL")
engine_dev = create_engine(database_url_dev)

In [None]:
# Load every game and every guess from the production database.
with engine_prod.connect() as conn:
    games = pd.read_sql_query(
        "SELECT * FROM paardensprong.games", con=conn, index_col="game_id"
    )
    guesses = pd.read_sql_query(
        "SELECT * FROM paardensprong.guesses", con=conn, index_col="guess_id"
    )

# Re-key the guesses by game so they can be joined onto the games.
guesses_relevant = guesses.set_index("game_id").rename(
    columns={"correct": "GuessCorrect"}
)[["guess_time", "GuessCorrect"]]

df = (
    games
    # Drop games which have no guess - probably timed out because of long loading times
    .join(guesses_relevant, how="inner")
    .query("playername == @PLAYERNAME | game_id == 46")
    .assign(
        # total_seconds() is the full duration. `.dt.seconds` is only the seconds
        # *component* of the timedelta, so a guess that arrived a day (or more)
        # late would wrap around and look like a fast answer.
        PuzzleTimeSec=lambda df: (
            df["guess_time"] - df["start_time"]
        ).dt.total_seconds(),
        # The on-time limit is a bit strict, since you need a few seconds of typing time.
        # That is on purpose: it makes sense to train to have a bit of spare time,
        # and it helps the model since there are a few more unsuccessful games to train on.
        OnTime=lambda df: df["PuzzleTimeSec"].lt(30),
        Success=lambda df: df["GuessCorrect"] & df["OnTime"],
    )
    # A few answers were given extremely late; probably when reconnecting.
    # Negative durations are dropped as well (the old `.dt.seconds` value for a
    # negative timedelta was close to a full day, so those rows never passed
    # the upper limit either).
    .query("PuzzleTimeSec >= 0 and PuzzleTimeSec < 120")
    # cumcount() numbers rows in their current order, and the SELECT above has no
    # ORDER BY, so sort chronologically to make "seen before" mean what it says.
    .sort_values("start_time", kind="stable")
    .assign(NTimesWordSeenBefore=lambda df: df.groupby("answer").cumcount())
)

In [None]:
# Model inputs. "Success" is the target, so it is selected separately instead of
# being popped out of the feature frame afterwards.
feature_columns = [
    "start_time",
    "answer",
    "startpoint",
    "direction",
    "NTimesWordSeenBefore",
]
X = df[feature_columns]
y = df["Success"].astype(int)

# Stratified hold-out split with a fixed seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42)

In [None]:
# Hyper-parameter search. `grid` is defined in the modelbuilderpaardensprong module
# (not visible here; presumably a scikit-learn search object, given the
# cv_results_ / best_params_ / best_estimator_ attributes this notebook reads).
# NOTE: fit() is called on the module-level object itself, not on a copy.
grid = modelbuilderpaardensprong.grid
grid.fit(X_train, y_train)
# The winning estimator; every later cell refers to it as `pipe`.
pipe = grid.best_estimator_

In [None]:
# Show the hyper-parameter combination selected by the search.
grid.best_params_

In [None]:
def n_columns(columns):
    """Count the columns a ``columnselection__columns`` setting refers to.

    The special value ``"all"`` stands for every column of ``X_train``; any
    other value is treated as a collection of column names and its length is
    returned.
    """
    selected = X_train.columns if columns == "all" else columns
    return len(selected)


# One row per hyper-parameter combination tried by the search, with its
# cross-validated scores next to the parameters.
cv_results = grid.cv_results_
results = (
    pd.DataFrame(cv_results["params"])
    .assign(
        mean_test_score=cv_results["mean_test_score"],
        mean_train_score=cv_results["mean_train_score"],
        std_test_score=cv_results["std_test_score"],
    )
    .assign(
        # Gap between train and validation score.
        Overfit=lambda scores: scores["mean_train_score"] - scores["mean_test_score"],
        # Number of input columns the combination selects.
        columns=lambda scores: scores["columnselection__columns"].apply(n_columns),
    )
)

results.sort_values("mean_test_score", ascending=False)

### Inspect model

In [None]:
def _logloss_and_auc(y_true, proba):
    """Return ``(log loss, ROC AUC)`` for an array of predicted class probabilities."""
    logloss = sklearn.metrics.log_loss(y_true, proba)
    auc = sklearn.metrics.roc_auc_score(y_true, proba[:, 1])
    return logloss, auc


# Tuned pipeline, scored on its own training data and on the held-out split.
y_pred_train = pipe.predict_proba(X_train)
train_logloss, train_auc = _logloss_and_auc(y_train, y_pred_train)

y_pred_proba = pipe.predict_proba(X_test)
test_logloss, test_auc = _logloss_and_auc(y_test, y_pred_proba)

# Baseline for comparison: a DummyClassifier with default settings.
dummy = DummyClassifier().fit(X_train, y_train)
y_pred_dummy = dummy.predict_proba(X_test)
dummy_logloss, dummy_auc = _logloss_and_auc(y_test, y_pred_dummy)

# Print both metrics in the same three-column layout.
header = " Train - Test  -  Dummy"
scoreboard = {
    "Log loss:": (train_logloss, test_logloss, dummy_logloss),
    "AUC:": (train_auc, test_auc, dummy_auc),
}
for title, (train_score, test_score, dummy_score) in scoreboard.items():
    print(title)
    print(header)
    print(f"{train_score: .3f} - {test_score:.3f} - {dummy_score: .3f}")

In [None]:
# Interactive model inspection on the held-out split, served on port 8051.
# NOTE(review): run() presumably keeps this cell busy while the dashboard is
# being served - confirm before relying on "Run All" for this notebook.
explainer = ClassifierExplainer(pipe, X_test, y_test)
dashboard = ExplainerDashboard(explainer)
dashboard.run(port=8051)

### Fit final model
Fit on the full data set so that all available data is used, then do a quick fatal-flaw inspection of the predicted probabilities and of the logical relations between the variables.

In [None]:
# Refit an unfitted copy of the tuned pipeline on every available row.
total_estimator = clone(pipe)
total_estimator.fit(X, y)

# Push X through every pipeline step except the last one, so the final step can
# be inspected on exactly the input it receives.
X_transformed = X.copy()
for _step_name, step in total_estimator.steps[:-1]:
    X_transformed = step.transform(X_transformed)

In [None]:
# Histogram of predict_proba column 0 over all played games. With y being the
# 0/1-encoded Success, column 0 is presumably the probability of *no* success -
# verify against total_estimator.classes_.
first_class_probability = pd.Series(total_estimator.predict_proba(X)[:, 0])
first_class_probability.plot.hist()

In [None]:
# Partial dependence (average curve plus individual lines, kind="both") of the
# final "clf" step on the first three transformed features.
# The result is bound to `pdp_display` instead of `display`, so that IPython's
# built-in display() helper is not shadowed for the rest of the notebook.
pdp_display = PartialDependenceDisplay.from_estimator(
    total_estimator.named_steps["clf"], X_transformed, features=[0, 1, 2], kind="both"
)

# Draw again with a fixed y-range of 0.8-1 for the one-way curves (key 1 of
# pdp_lim), so the three panels share one scale.
pdp_display.plot(pdp_lim={1: (0.8, 1)})

# Using the model

In [None]:
def create_puzzle_options(
    n_per_answer=4,
    words_path="../tweevoortwaalf/Data/suitable_8_letter_words.txt",
    played_answers=None,
    random_state=None,
):
    """Build a table of candidate puzzles: a few random variants for every word.

    Every word is combined with all 8 start points and both directions; from
    those 16 variants ``n_per_answer`` are drawn at random per word.

    Args:
        n_per_answer: Number of (startpoint, direction) variants kept per word.
        words_path: Text file with one word per line and no header.
        played_answers: Series with the answer of every game played so far. It
            is used to fill ``NTimesWordSeenBefore``. Defaults to the
            notebook-level ``df["answer"]``.
        random_state: Optional seed (or random generator) passed to the
            sampling step, to make the selection reproducible.

    Returns:
        DataFrame with the columns ``answer``, ``startpoint``, ``direction`` and
        ``NTimesWordSeenBefore`` and a fresh 0..n-1 index.
    """
    if played_answers is None:
        # Fall back to the games loaded earlier in the notebook.
        played_answers = df["answer"]

    # iloc keeps this a Series even when the file holds a single word, where
    # squeeze() would collapse it to a scalar.
    words = pd.read_csv(words_path, header=None).iloc[:, 0]

    options = pd.DataFrame(
        list(itertools.product(words, range(8), (-1, 1))),
        columns=["answer", "startpoint", "direction"],
    )

    # Name the key column explicitly: the column names produced by
    # value_counts().reset_index() differ between pandas versions.
    seen_counts = (
        played_answers.value_counts()
        .rename_axis("answer")
        .rename("NTimesWordSeenBefore")
        .reset_index()
    )
    options = options.merge(seen_counts, on="answer", how="left")
    # Words that were never played have no count after the left merge.
    options["NTimesWordSeenBefore"] = options["NTimesWordSeenBefore"].fillna(0)

    # groupby.sample keeps every column, including the grouping key, and avoids
    # the deprecated groupby.apply on the grouping column.
    sampled = options.groupby("answer").sample(
        n=n_per_answer, random_state=random_state
    )
    return sampled.reset_index(drop=True)


# Candidate puzzles to score: 4 randomly chosen variants per word.
X_new = create_puzzle_options(4)

In [None]:
# Score every candidate puzzle with the final model. Column 0 of predict_proba is
# stored as "probability"; with y being the 0/1-encoded Success this is presumably
# the probability of *no* success - verify against total_estimator.classes_.
y_pred = total_estimator.predict_proba(X_new)
X_new = X_new.assign(probability=y_pred[:, 0])
X_new["probability"].plot.kde()

In [None]:
def probability_option(p, n):
    """Return the sampling weight ``(p - p**2) ** n`` for probability ``p``.

    ``p - p**2`` is largest at ``p = 0.5`` and zero at 0 and 1; raising it to
    the power ``n`` sharpens that peak. Works on scalars and element-wise on
    pandas objects.
    """
    spread = p - p**2
    return spread**n


def iterative_sampling(X_new, sample_size=250, n_to_sample=100, random_state=None):
    """Simulate serving puzzles while the pool of scored options grows.

    Before every draw, up to ``sample_size`` not-yet-pooled rows of ``X_new``
    are added to the pool. One row is then drawn from the whole pool with weight
    ``probability_option(probability, n)``. The exponent ``n`` is
    ``5 * len(X_new) / len(pool)``, capped at 100, so it shrinks as the pool
    grows. The same row can be served more than once.

    Args:
        X_new: Candidate puzzles; must have a ``probability`` column.
        sample_size: Maximum number of rows added to the pool per draw.
        n_to_sample: Number of puzzles to serve.
        random_state: Optional seed or numpy random generator for reproducible
            draws. ``None`` keeps pandas' default (global) random state.

    Returns:
        DataFrame with one row per served puzzle, in serving order. It has the
        columns of ``X_new`` plus ``weight`` (the weight at the moment of
        serving) and keeps the original index labels and column dtypes.

    Raises:
        ValueError: if ``X_new`` is empty or ``sample_size`` is smaller than 1.
    """
    import numpy as np  # local import: numpy is not imported at the top of the notebook

    if sample_size < 1:
        raise ValueError("sample_size must be at least 1")
    if X_new.empty:
        raise ValueError("X_new must contain at least one puzzle option")

    # One stateful generator for all draws; re-using a bare integer seed would
    # repeat the same draw in every iteration.
    if random_state is None or isinstance(
        random_state, (np.random.RandomState, np.random.Generator)
    ):
        rng = random_state
    else:
        rng = np.random.RandomState(random_state)

    pool = None  # options "predicted" so far
    served = []
    for _ in range(n_to_sample):
        if pool is None:
            unpredicted = X_new
        else:
            unpredicted = X_new.loc[~X_new.index.isin(pool.index)]
        if not unpredicted.empty:
            if len(unpredicted) <= sample_size:
                batch = unpredicted
            else:
                batch = unpredicted.sample(sample_size, random_state=rng)
            # Copy on the first batch so the caller's frame is never modified.
            pool = batch.copy() if pool is None else pd.concat([pool, batch])
        exponent = min(100, 5 * len(X_new) / len(pool))
        pool["weight"] = probability_option(pool["probability"], exponent)
        # Keep the draw as a one-row frame: squeezing it to a Series would turn
        # every column into object dtype in the result.
        served.append(pool.sample(n=1, weights=pool["weight"], random_state=rng))

    if not served:
        return X_new.iloc[:0].assign(weight=pd.Series(dtype="float64"))
    return pd.concat(served)

In [None]:
# Number of puzzles each strategy gets to serve.
n_to_play = 100


def _weighted_sample(power):
    """Build a sampler that draws ``n_to_play`` rows weighted by ``probability_option``."""

    def sampler(options):
        weights = probability_option(options["probability"], power)
        return options.sample(n_to_play, weights=weights)

    return sampler


# Strategies for choosing which puzzles to serve; each maps the scored options
# to the rows it would serve.
sample_methods = {
    "random": lambda options: options.sample(n_to_play),
    "largest": lambda options: options.nlargest(n_to_play, "probability"),
    "power=1": _weighted_sample(1),
    "power=5": _weighted_sample(5),
    "power=10": _weighted_sample(10),
    "iterative100": lambda options: iterative_sampling(options, 100, n_to_play),
    "iterative250": lambda options: iterative_sampling(options, 250, n_to_play),
}

# One column per strategy holding the probabilities of the puzzles it served.
probs_played = pd.DataFrame(
    {
        method: sampler(X_new)["probability"].reset_index(drop=True)
        for method, sampler in sample_methods.items()
    }
)

# Mean served probability per strategy, labelled as a percentage.
ax = probs_played.mean().sort_values().plot(kind="barh")
ax.bar_label(ax.containers[0], fmt="{:.1%}")

In [None]:
# Density of the served probabilities, one curve per sampling strategy.
probs_played.plot.kde()

In [None]:
# Publish the scored puzzle options to the development database.
# if_exists="replace" drops the existing paardensprong.puzzleoptions table and
# recreates it from X_new; method="multi" sends multiple rows per INSERT statement.
with engine_dev.connect() as conn:
    X_new.to_sql(
        "puzzleoptions",
        con=conn,
        schema="paardensprong",
        if_exists="replace",
        index=False,
        method="multi",
    )
    # Make the write permanent before the connection is closed.
    conn.commit()

In [None]:
# Opens a new production connection and immediately rolls it back.
# NOTE(review): presumably a leftover recovery step after a failed write; on a
# freshly opened connection this looks like a no-op - confirm whether this cell
# is still needed.
with engine_prod.connect() as conn:
    conn.rollback()

In [None]:
# Publish the scored puzzle options to the PRODUCTION database.
# WARNING: if_exists="replace" drops the existing paardensprong.puzzleoptions
# table and recreates it from X_new, so run this cell deliberately.
with engine_prod.connect() as conn:
    X_new.to_sql(
        "puzzleoptions",
        con=conn,
        schema="paardensprong",
        if_exists="replace",
        index=False,
        method="multi",
    )
    # Make the write permanent before the connection is closed.
    conn.commit()