# CrazyAra

## Data Analysis of the Training Data

* file: analyze_train_data.ipynb
* brief: Filters out the games used from the lichess crazyhouse dataset and performs some analysis on them.

* author: QueensGambit
* contact: johannes.czech@stud.tu-darmstadt.de
* version: 2018-11-28 initial version


In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
%reload_ext autoreload

In [None]:
import sys, os
sys.path.insert(0,'../../../')
import os
import sys
from DeepCrazyhouse.src.preprocessing.PGN2PlanesConverter import PGN2PlanesConverter
from DeepCrazyhouse.src.runtime.ColorLogger import enable_color_logging
from DeepCrazyhouse.src.preprocessing.dataset_loader import load_pgn_dataset
import logging
from io import StringIO
import matplotlib.pyplot as plt
import seaborn as sns
import chess.pgn
import pandas as pd
import numpy as np

In [None]:
%matplotlib inline
# The bundled seaborn styles were renamed to 'seaborn-v0_8-*' in matplotlib 3.6
# and the old names were dropped afterwards; use whichever name this
# installation provides so the notebook runs on old and new releases.
plt.style.use('seaborn-whitegrid' if 'seaborn-whitegrid' in plt.style.available
              else 'seaborn-v0_8-whitegrid')
enable_color_logging()

## Settings
_same as_ `convert_pgn_to_planes.ipynb`

In [None]:
# Both values are handed to PGN2PlanesConverter further down.
# number of games written per file by the converter
nb_games_per_file = 1000
# rating threshold applied to both players of a game
min_elo_both = 2000

In [None]:
# Load the dataset through the project helper. Of the five returned values only
# `pgn_dataset` is referenced again in the cells visible in this notebook.
s_idcs, x, yv, yp, pgn_dataset = load_pgn_dataset()

In [None]:
# Show the value returned by pgn_dataset.tree() as the cell output
# (presumably an overview of the stored arrays -- TODO confirm).
pgn_dataset.tree()

In [None]:
# Converter for the training split; the Elo threshold and the games-per-file
# value are the ones defined in the settings cell.
converter = PGN2PlanesConverter(
    limit_nb_games_to_analyze=0,
    nb_games_per_file=nb_games_per_file,
    max_nb_files=0,
    min_elo_both=min_elo_both,
    termination_conditions=["Normal"],
    log_lvl=logging.DEBUG,
    compression='lz4',
    clevel=5,
    dataset_type='train',
)

In [None]:
# Run the filtering step of the converter and unpack its five return values.
(lst_all_pgn_sel,
 lst_nb_games_sel,
 lst_batch_white_won,
 lst_batch_black_won,
 lst_batch_draw) = converter.filter_all_pgns()

In [None]:
# Sum of the per-entry counts returned by filter_all_pgns()
# (presumably the total number of selected games -- confirm).
sum(lst_nb_games_sel)

In [None]:
# Concatenate every selected PGN into one export file.
# Opening, writing and closing happen in a single cell under a context manager,
# so the handle is flushed and closed even if a write fails part-way and it
# cannot be left open by running the cells out of order.
with open('crazyara_lichess_dataset.pgn', mode='w') as file:
    for pgn_sel in lst_all_pgn_sel:
        for pgn in pgn_sel:
            file.writelines(pgn.readlines())

In [None]:
# Open the exported PGN file for reading (default text mode); the handle is
# closed explicitly by the later `pgn.close()` cell.
pgn = open('crazyara_lichess_dataset.pgn')


In [None]:
# PGN header tags that become the columns of the statistics table,
# in the order in which they appear in each row.
columns = [
    'Event', 'Site', 'Date', 'Round',
    'White', 'Black', 'Result',
    'WhiteElo', 'BlackElo',
    'WhiteRatingDiff', 'BlackRatingDiff',
    'Termination', 'TimeControl',
    'UTCDate', 'UTCTime',
    'Variant',
]

In [None]:
# NOTE(review): `nb_games` is not assigned in any cell visible in this notebook;
# on a fresh kernel this cell would raise NameError -- confirm whether it is a leftover.
nb_games

In [None]:
# Number of entries in the first element of the filtered PGN collection.
len(lst_all_pgn_sel[0])

In [None]:
# NOTE(review): `df` is only assigned in later cells (DataFrame construction / CSV load),
# so this cell depends on out-of-order execution -- confirm it is still needed.
len(df)

## Fill the pandas dataframe

In [None]:
# Rows of the statistics table: one list per game, holding the header values
# in the order given by `columns`.
data = []

# Start scanning at the beginning of the file. No game may be read from the
# handle before the scan: a preceding chess.pgn.read_game(pgn) call consumes the
# first game, whose headers would then be missing from the table.
pgn.seek(0)

# NOTE(review): scan_headers appears to have been removed from newer
# python-chess releases in favour of chess.pgn.read_headers -- confirm the
# library version this notebook is pinned to.
for _, headers in chess.pgn.scan_headers(pgn):
    row = []
    # iterate over all columns and fill one row of data
    for colname in columns:
        try:
            row.append(headers[colname])
        except KeyError:
            # keep the column alignment with an empty placeholder and print
            # the incomplete header set for inspection
            row.append([])
            print(headers)
    # add the row to the full table content
    data.append(row)
                

In [None]:
# Release the PGN file handle that was opened for the header scan.
pgn.close()

In [None]:
# Build the table: one row per scanned game, one column per header tag in `columns`.
df = pd.DataFrame(data, columns=columns)

## Export the dataframe

In [None]:
# Write the table to a CSV file in the current working directory.
# NOTE(review): the load cell reads 'data/crazyara_lichess_dataset_stats.csv' --
# confirm that the exported file is moved into 'data/' manually.
df.to_csv('crazyara_lichess_dataset_stats.csv')

### Load the dataframe

In [None]:
# `DataFrame.from_csv` was deprecated in pandas 0.21 and removed in pandas 1.0.
# `read_csv` with these arguments reproduces its defaults: the first column
# becomes the index and dates in the index are parsed.
df = pd.read_csv('data/crazyara_lichess_dataset_stats.csv', index_col=0, parse_dates=True)

In [None]:
# Stack the 'White' and 'Black' columns into a single Series so that
# value_counts() counts each name regardless of the colour played.
df_full = pd.concat([df['White'], df['Black']])

In [None]:
# Counts of the ten most frequent names, expressed as a percentage of the
# number of rows in `df` and rounded to two decimals.
((df_full.value_counts()[:10] / len(df)) * 100).round(2)

In [None]:
# Join the 'WhiteElo' and 'BlackElo' columns into one flat array.
elo = np.concatenate((df['WhiteElo'].values, df['BlackElo'].values))

In [None]:
# Show the ratings converted to floats. `np.float` was merely an alias of the
# builtin `float` and has been removed from NumPy (1.24), so the builtin is
# used directly. The result is only displayed; `elo` itself is not modified.
elo.astype(float)

In [None]:
# Length of the slice holding at most the last 5000 entries of `elo`.
len(elo[-5000:])

In [None]:
def example_plot(ax, fontsize=12):
    """Fill *ax* with placeholder content.

    Plots the points ``[1, 2]``, calls ``locator_params(nbins=3)`` and sets a
    dummy x-label, y-label and title, each with the given *fontsize*.
    """
    ax.plot([1, 2])
    ax.locator_params(nbins=3)
    # the three text elements all use the same font size
    for apply_text, text in ((ax.set_xlabel, 'x-label'),
                             (ax.set_ylabel, 'y-label'),
                             (ax.set_title, 'Title')):
        apply_text(text, fontsize=fontsize)
        
# Overview figure of the training data: four panels, exported as PNG and PDF.
plt.close('all')
fig = plt.figure(figsize=(10*1.5,8*1.5))

# layout: one wide panel on top, two stacked panels at the bottom left and
# one taller panel at the bottom right
ax1 = plt.subplot(211)
ax2 = plt.subplot(425)
ax3 = plt.subplot(224)
ax4 = plt.subplot(427)

top_x = 20
cum_perc = df_full.value_counts()[:top_x].sum() / len(df) * 100

# The game total is taken from the table instead of being hard-coded, so the
# headline stays correct when the dataset changes.
plt.suptitle("CrazyAra's Training Data\n%s Games total (%.2f%% by %d players)"
             % ("{:,}".format(len(df)), cum_perc, top_x), y=1.05, size=20)

# `kind` is passed by keyword: pandas >= 1.0 raises a TypeError when
# Series.plot() receives positional arguments.
ax = (df_full.value_counts()[:top_x][::-1]).plot(kind='barh', title="\nTop %d Active Crazyhouse-Players with Matches >= 2,000 elo for both Players\nfrom January 2016 to June 2018 (database.lichess.org/)" % top_x, ax=ax1)
ax.set_xlabel("Number of Games")

elo_mean = elo.mean()
ax2.hist(elo[-5000000:])
ax2.axvline(x=elo_mean, linewidth=2, color='lightblue')
# put the label slightly to the right of the mean line
ax2.text(elo_mean + elo_mean*.02,5000000 / 20, "mean=%.2f" % elo_mean, fontsize=12)
ax2.set_title("Elo Rating")
ax2.set_xlabel("Rating")

df['TimeControl'].value_counts()[:15][::-1].plot(kind='barh', title='Time Control', ax=ax3)
ax3.set_xlabel("Number of Games")

df['Result'].value_counts()[::-1].plot(kind='barh', ax=ax4)
ax4.set_title('Game Results')
ax4.set_xlabel("Number of Games")

plt.tight_layout()

# savefig does not create missing directories
os.makedirs("plots", exist_ok=True)
plt.savefig("plots/crazyara_training_data.png", bbox_inches='tight')
plt.savefig("plots/crazyara_training_data.pdf", bbox_inches='tight')

In [None]:
# Standalone horizontal bar chart of the 20 most frequent player names.
df_full = pd.concat([df['White'], df['Black']])
# `kind` is passed by keyword: pandas >= 1.0 raises a TypeError when
# Series.plot() receives positional arguments. The title typo is fixed as well.
ax = (df_full.value_counts()[:20][::-1]).plot(kind='barh', title="CrazyAra's Training Data")
ax.set_xlabel("Number of Games")
ax.set_ylabel("Crazyhouse Players on lichess.org")

In [None]:
# Mean of the 'WhiteElo' column after conversion to integers. `np.int` was an
# alias of the builtin `int` and has been removed from NumPy (1.24).
np.array(df['WhiteElo'].values, int).mean()

In [None]:
# Standard deviation of the 'WhiteElo' column after conversion to integers.
# `np.int` was an alias of the builtin `int` and has been removed from NumPy (1.24).
np.array(df['WhiteElo'].values, int).std()

In [None]:
# Display the table as the cell output.
df