In [None]:
from dotenv import load_dotenv
import os

from explainerdashboard import ClassifierExplainer, ExplainerDashboard
import pandas as pd
from sklearn.model_selection import train_test_split
import sklearn.metrics
from sqlalchemy import create_engine

import modelbuilderpaardensprong

In [None]:
# Read settings from the local .env file into the process environment.
load_dotenv()

# Player whose games are used for training. os.getenv returns None when the
# variable is missing, in which case the player filter further down matches nothing.
PLAYERNAME = os.getenv('playername')


def _psycopg_url(env_var):
    """Return the database URL stored in ``env_var`` with the psycopg driver selected.

    Only the scheme in front of ``://`` is rewritten, so a host, user or
    database name that happens to contain the text 'postgresql' is left alone.
    Any PostgreSQL scheme ('postgresql', the 'postgres' alias, or one that
    already names a driver) becomes 'postgresql+psycopg'; other URLs are
    returned unchanged.

    Raises:
        RuntimeError: if the environment variable is missing or empty.
    """
    url = os.getenv(env_var)
    if not url:
        raise RuntimeError(f'Environment variable {env_var} is not set; check the .env file')
    scheme, separator, rest = url.partition('://')
    dialect = scheme.partition('+')[0]
    if separator and dialect in ('postgres', 'postgresql'):
        return f'postgresql+psycopg://{rest}'
    return url


database_url_prod = _psycopg_url('PROD_DATABASE_URL')
engine_prod = create_engine(database_url_prod)

database_url_dev = _psycopg_url('DATABASE_URL')
engine_dev = create_engine(database_url_dev)

In [None]:
# Time limits in seconds used to label and filter the games.
# The on-time limit is deliberately a bit strict, since you need a few seconds of typing time:
# it makes sense to train for some spare time, and it gives the model a few
# more unsuccessful examples to learn from.
ON_TIME_LIMIT_SEC = 30
# A few answers were given extremely late, probably when reconnecting; those are dropped.
MAX_PUZZLE_TIME_SEC = 120

# Load the raw games and guesses from the production database.
with engine_prod.connect() as conn:
    games = pd.read_sql_query('SELECT * FROM paardensprong.games', con=conn, index_col='game_id')
    guesses = pd.read_sql_query('SELECT * FROM paardensprong.guesses', con=conn, index_col='guess_id')

# Re-key the guesses on game_id so they can be joined onto the games index.
guesses_relevant = (guesses.set_index('game_id')
                    .rename(columns={'correct': 'GuessCorrect'})
                    [['guess_time', 'GuessCorrect']]
                    )

df = (games
      # Drop games which have no guess - probably timed out because of long loading times
      .join(guesses_relevant, how='inner')
      .query('playername == @PLAYERNAME')
      .assign(
          # total_seconds() gives the full duration. The .dt.seconds accessor only
          # returns the seconds component (0-86399), so a guess made more than a
          # day late would wrap around and look like a fast answer.
          PuzzleTimeSec = lambda d: (d['guess_time'] - d['start_time']).dt.total_seconds(),
          OnTime = lambda d: d['PuzzleTimeSec'].lt(ON_TIME_LIMIT_SEC),
          Success = lambda d: d['GuessCorrect'] & d['OnTime'],
      )
      # Keep plausible durations only: not negative and not extremely late.
      .query('0 <= PuzzleTimeSec < @MAX_PUZZLE_TIME_SEC')
      )

In [None]:
# Puzzle features used as model input, and the binary label to predict.
feature_columns = ['answer', 'startpoint', 'direction']
X = df[feature_columns]
y = df['Success'].astype(int)

# Stratify on the label so both splits keep the same success rate;
# the fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=42,
)

# Only the feature frames get a fresh 0..n-1 index; the label series keep
# the index they inherited from df.
X_train, X_test = (part.reset_index(drop=True) for part in (X_train, X_test))

In [None]:
# Use the pipeline object exposed by the local modelbuilderpaardensprong module.
# `pipe` is a reference to that module attribute, not a copy.
# NOTE(review): presumably fit() updates the object in place, so the module-level
# pipeline ends up fitted as well - confirm if the module is reused elsewhere.
pipe = modelbuilderpaardensprong.pipe
# Fit on the training split only; the test split is kept for the evaluation below.
pipe.fit(X_train, y_train)

In [None]:
# Predicted class probabilities for the held-out puzzles.
y_pred_proba = pipe.predict_proba(X_test)

# Log loss on the test split, scored with the last probability column.
test_log_loss = sklearn.metrics.log_loss(y_test, y_pred_proba[:, -1])
print(test_log_loss)

# Build the explainer on the test split and serve the dashboard on port 8051.
explainer = ClassifierExplainer(pipe, X_test, y_test)
dashboard = ExplainerDashboard(explainer)
dashboard.run(port=8051)

# Using the model

In [None]:
from tweevoortwaalf.paardensprong import Paardensprong

In [None]:
# Generate 1000 candidate puzzles and keep the ones for which
# unique_solution() is truthy; the result can hold fewer than 1000 rows.
puzzleoptions = []
for _ in range(1000):
    candidate = Paardensprong()
    if not candidate.unique_solution():
        # Rejected candidates are skipped; their answer is printed for inspection.
        print(candidate.answer)
        continue
    puzzleoptions.append({
        'answer': candidate.answer,
        'startpoint': candidate.startpoint,
        'direction': candidate.direction,
    })

# One row per accepted puzzle, with columns answer / startpoint / direction.
X_new = pd.DataFrame(puzzleoptions)

In [None]:
# Score only the model's input columns. This cell adds a 'probability' column
# to X_new in place, so passing the whole frame would feed that extra column
# to the pipeline whenever the cell is re-run.
scoring_columns = ['answer', 'startpoint', 'direction']
y_pred = pipe.predict_proba(X_new[scoring_columns])

# NOTE(review): this stores the FIRST probability column, whereas the test-set
# log loss above is computed from the LAST column. If the classes are ordered
# [0, 1], 'probability' is the chance of NOT succeeding - confirm this is intended.
X_new['probability'] = y_pred[:, 0]

In [None]:
# Publish the scored puzzle options to the development database.
with engine_dev.connect() as dev_conn:
    X_new.to_sql(
        name='puzzleoptions',
        schema='paardensprong',
        con=dev_conn,
        if_exists='replace',  # any existing table is dropped and rewritten
        index=False,          # the DataFrame index is not stored as a column
        method='multi',       # several rows per INSERT statement
    )
    dev_conn.commit()

In [None]:
# Publish the scored puzzle options to the PRODUCTION database.
# Careful: this replaces the existing production table on every run.
with engine_prod.connect() as prod_conn:
    X_new.to_sql(
        name='puzzleoptions',
        schema='paardensprong',
        con=prod_conn,
        if_exists='replace',  # any existing table is dropped and rewritten
        index=False,          # the DataFrame index is not stored as a column
        method='multi',       # several rows per INSERT statement
    )
    prod_conn.commit()

# Check outcomes

In [None]:
from matplotlib import pyplot as plt
import numpy as np

In [None]:
def probability_option(p):
    """Return the sampling weight ``p - p**2`` for probability ``p``.

    Works element-wise on anything that supports arithmetic (a scalar, a
    NumPy array or a pandas Series). The weight is 0 at p = 0 and p = 1 and
    largest (0.25) at p = 0.5.
    """
    return p - p**2


# Read back the puzzle options stored in the development database.
with engine_dev.connect() as conn:
    puzzleoptions = pd.read_sql_query('SELECT * FROM paardensprong.puzzleoptions', con=conn)

# Draw one puzzle at random, weighted by p - p**2 of its stored probability.
sampling_weights = probability_option(puzzleoptions['probability'])
puzzleoptions.sample(weights=sampling_weights).squeeze()

In [None]:
# Plot the sampling weight p - p**2 over the full probability range.
# probability_option is the function defined in the previous cell; it is not
# redefined here so that there is a single definition in the notebook.
# Distinct names are used for the grid and the curve so this cell does not
# overwrite the training target `y` or the sampling weights `p`.
p_grid = np.linspace(0, 1, 100)
weight_curve = probability_option(p_grid)

fig, ax = plt.subplots()
ax.plot(p_grid, weight_curve)
ax.set(
    xlabel='predicted probability p',
    ylabel='sampling weight p - p**2',
    title='Sampling weight per predicted probability',
);

In [None]:
# Pick one puzzle at random from the 10 rows with the highest 'probability'.
one_row = X_new.nlargest(10, 'probability').sample(1).squeeze()

# Rebuild exactly that puzzle. The startpoint must come from the row's own
# 'startpoint' column (it was previously filled from 'direction' by mistake).
ps_now = Paardensprong(
    answer=one_row['answer'],
    direction=one_row['direction'],
    startpoint=one_row['startpoint'],
)
puzzle = ps_now.create_puzzle()
ps_now.show_puzzle(puzzle)