Importing relevant modules

In [None]:
import os
import random
import numpy as np
import pandas as pd
import seaborn as sns
import plotly.express as px
import tensorflow as tf
import math
from scipy import special #comb, factorial
from keras import backend as K
from scipy.stats import uniform
from matplotlib import pyplot as plt
from sklearn import tree
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import GridSearchCV,RandomizedSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest,chi2
from sklearn.preprocessing import MinMaxScaler, StandardScaler,LabelEncoder, OneHotEncoder
from sklearn.metrics import classification_report, roc_auc_score, recall_score, make_scorer, plot_confusion_matrix, confusion_matrix, accuracy_score,f1_score

# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Use seaborn's dark grid style for all figures, then load the games table.
sns.set_style(style='darkgrid')
games_csv = '/kaggle/input/chess/games.csv'
df = pd.read_csv(games_csv)
# Preview the first five rows (last expression, so the notebook renders it).
df.head(5)

A couple of observations can be made:

1. Columns `id`, `white_id`, `black_id` are not important.

2. Either the column `created_at` or the column `last_move_at` must contain some errors. Per the [description](https://www.kaggle.com/datasnaek/chess) provided by the uploader, `created_at` stands for the time when the game began, and `last_move_at` stands for the time when the game ended. Hence $($ `last_move_at` $-$ `created_at` $)$ must represent the duration of the game. But according to the dataset, for some games the duration is $0$ (which, given the circumstances, is not possible). For example, consider the game with `id` TZJHLljE (first row in the dataset). The value in the column `created_at` is 1.504210e+12, and the value in `last_move_at` is also 1.504210e+12. It follows that (1.504210e+12)-(1.504210e+12)=0, implying that the duration of the game is zero. But also note that, according to the dataset, the game ended because one of the two players ran out of time (the value in the column `victory_status` equals "outoftime"), which clearly contradicts the claim that the duration of the game was 0 seconds (or whatever unit was used to measure time). Hence it is likely that one of the two columns contains incorrect values. In fact, we can see that there are numerous games that purportedly had zero duration:

In [None]:
# Count the games whose recorded start and end timestamps are identical.
zero_duration = (df['last_move_at'] - df['created_at']) == 0
dur = int(zero_duration.sum())
# Format the share with the '%' presentation type: round(x, 2) * 100 can print
# floating-point noise such as 7.000000000000001.
print(f'Number of games that had zero duration: {dur}, which makes '
      f'up {dur / df.shape[0]:.0%} of all games')

The upshot is: We should remove the columns `last_move_at` and `created_at`.

There is one more thing: the column `moves` should be removed. If a particular game ended in checkmate, then by using the data from the column `moves` we can say with 100% certainty who won (essentially, with the data in `moves` we can reconstruct the whole game). Our main goal is to see whether we can accurately predict the winner based on LIMITED information.

In [None]:
# Identifier, timestamp and move-list columns that are not used as features.
cols_to_drop = ['id','white_id','black_id','last_move_at','created_at','moves']
# Rebind instead of dropping in place, and ignore columns that are already
# gone, so that re-running this cell does not raise a KeyError.
df = df.drop(columns=cols_to_drop, errors='ignore')

Let's look at the label distribution (i.e., `winner`)

In [None]:
# Summary statistics of the target column.
winner_col = df['winner']
winner_col.describe()

In [None]:
# Number of games per outcome.
winner_col = df['winner']
winner_col.value_counts()

In this analysis, we will restrict our attention to those games where there is a winner (i.e., we will not consider draws).

In [None]:
# Keep only the games that were not drawn.
not_draw = df['winner'].ne('draw')
df = df.loc[not_draw]

Let's have a look at categorical features

In [None]:
# Count the distinct values of every categorical feature and plot the counts,
# ordered from the highest to the lowest cardinality.
cat_features = np.array(['rated','victory_status','increment_code','opening_eco','opening_name'])
count = np.array([df[feature].unique().size for feature in cat_features])

# argsort is ascending; reverse it to get a descending order.
to_sort = np.argsort(count)[::-1]
cat_features = cat_features[to_sort]
count = count[to_sort]

plt.figure(figsize=(11,6))
# Pass x/y by keyword: positional data arguments were deprecated in
# seaborn 0.11 and are rejected by seaborn 0.12+.
sns.barplot(x=cat_features, y=count)
plt.title("Number of unique values per each feature")
plt.ylabel('Count')
plt.xlabel('Feature')
plt.show()

We see that most of the features have very high cardinality.

For each feature, we will calculate how many values occur only once.

In [None]:
# For every categorical feature, count how many of its values occur exactly once.
cat_features = np.array(['rated','victory_status','increment_code','opening_eco','opening_name'])

# value_counts() tallies all values of a column in a single pass, which avoids
# filtering the whole frame once per unique value.
count = [int((df[feature].value_counts() == 1).sum()) for feature in cat_features]

pd.DataFrame({'Feature': cat_features, 'Count of rare values': count})

As expected, features with high cardinality have a lot of extremely rare values (i.e., values that occur only once).

Now we will have a look at continuous features

In [None]:
# Summary statistics of the numerical features, one feature per row.
cont_features = ['white_rating','black_rating','opening_ply']
summary = df[cont_features].describe()
summary.round(2).transpose()

In [None]:
# Histogram of every numerical feature, three panels per row.
cont_features = ['white_rating','black_rating','opening_ply']
WIDTH = 16
LENGTH = 7

rows = math.ceil(len(cont_features)/3)
fig, ax = plt.subplots(nrows=rows, ncols=3, figsize=(WIDTH, LENGTH))
ax = ax.flatten()
# Pair each feature with its own panel.
for panel, feature in zip(ax, cont_features):
    panel.hist(df[feature], alpha=0.6)
    panel.set_title(f'Distribution of a feature `{feature}`')

In [None]:
# Box plot of every numerical feature, split by the winner of the game.
cont_features = ['white_rating','black_rating','opening_ply']
cat_variable = 'winner'
WIDTH = 16
LENGTH = 7

rows = math.ceil(len(cont_features)/3)
fig, ax = plt.subplots(nrows=rows, ncols=3, figsize=(WIDTH, LENGTH))
ax = ax.flatten()
# Pair each feature with its own panel.
for panel, feature in zip(ax, cont_features):
    sns.boxplot(data=df, x=cat_variable, y=feature, ax=panel)
    panel.set_title(f'Cond. dist. of feature `{feature}`')

The features alone don't seem to do a good job of separating the winners. Let's use a one-way ANOVA test to check whether each feature is independent of the winner.

In [None]:
from scipy.stats import f_oneway

cont_features = ['white_rating','black_rating','opening_ply']
label = 'winner'

# One-way ANOVA per numerical feature: the samples are the feature values
# grouped by each distinct value of the label column.
records = []
for feature in cont_features:
    samples = [df.loc[df[label] == cls, feature].values
               for cls in df[label].unique()]
    statistic, pval = f_oneway(*samples)
    records.append({'Categorical': label,
                    'Numerical': feature,
                    'p-value': pval,
                    'p < 0.05': pval<0.05,
                    'statistic': statistic})

# Fix the column order explicitly so the table layout does not depend on the records.
pd.DataFrame(records,
             columns=['Categorical', 'Numerical', 'p-value', 'p < 0.05', 'statistic'])

Rather unexpectedly, the ANOVA test suggests that each feature and target variable are in fact dependent.

Let's try to define new feature: difference between white_rating and black_rating, i.e., `white_rating`-`black_rating`.

In [None]:
# Rating advantage of white over black (negative when black is rated higher).
df['rating_diff'] = df['white_rating'].sub(df['black_rating'])

One can see that, the larger the value in `rating_diff`, the more advantage (rating-wise) white side has over black side. Let's see the summary of our new variable and the distribution.

In [None]:
# Summary statistics of the new feature.
rating_diff_col = df['rating_diff']
rating_diff_col.describe()

In [None]:
# Box plot of the rating difference over all games (no grouping variable).
# The unused leftover `feature_1 = 'blueWins'` was removed: it is not a column
# of this dataset and was never read.
dataframe = df
feature_2 = 'rating_diff'
plt.figure(figsize=(7,7))
sns.boxplot(y=feature_2, data=dataframe)
plt.title("Difference between white rating and black rating")
plt.show()

In [None]:
# Histogram of the rating difference, drawn through the explicit Axes interface.
fig, ax = plt.subplots(figsize=(10,7))
ax.hist(df['rating_diff'], alpha=0.6)
ax.set_title("Difference between white rating and black rating")
ax.set_xlabel('difference')
ax.set_ylabel("count")
plt.show()

As we see, in most cases the games are relatively fair (i.e., both players have similar ratings; more concretely, $|\text{white\_rating} - \text{black\_rating}| \le 500$). But there is a decent number of games where the discrepancy is relatively large.

Now, let's have a look at how our new feature discerns the winner of the game.

In [None]:
# Box plot of the rating difference, one box per winner.
dataframe = df
feature_1, feature_2 = 'winner', 'rating_diff'
plt.figure(figsize=(7,7))
sns.boxplot(data=dataframe, x=feature_1, y=feature_2)
plt.show()

As expected, the smaller the value (i.e., the black side has the higher rating), the less likely it is that the white side will win (and vice versa). Let's use ANOVA to check whether the difference between the conditional distributions is statistically significant.

In [None]:
from scipy.stats import f_oneway

cont_features = ['rating_diff']
label = 'winner'

# Run a one-way ANOVA for each feature, using the feature values of every
# label class as one sample; keep (feature, statistic, p-value) triples.
outcomes = []
for feature in cont_features:
    groups = [df[df[label] == value][feature].values
              for value in df[label].unique()]
    statistic, pval = f_oneway(*groups)
    outcomes.append((feature, statistic, pval))

# Assemble the result table column by column.
dic = {'Categorical': [label for _feature, _stat, _p in outcomes],
       'Numerical': [name for name, _stat, _p in outcomes],
       'p-value': [p for _feature, _stat, p in outcomes],
       'p < 0.05': [p<0.05 for _feature, _stat, p in outcomes],
       'statistic': [stat for _feature, stat, _p in outcomes]}

pd.DataFrame(dic)

It seems that the difference between ratings is indeed a good predictor of a winner.

Now we will try to classify. The features we will be using are the following:

'increment_code', 'opening_eco', 'opening_name', 'rated', 'victory_status',
                    'turns', 'white_rating', 
                   'black_rating', 'opening_ply', 
                   'rating_diff'

One hot encoding features with high cardinality

In [None]:
# One-hot encode the three high-cardinality categorical columns with
# OneHotEncoder's default settings.
high_card_cols = ['increment_code','opening_eco','opening_name']
highcardf = df[high_card_cols].copy()
encoder = OneHotEncoder()
sparse_high = encoder.fit_transform(highcardf)


Preprocessing continuous/non-high cardinality categorical features

In [None]:
from scipy import sparse


# Low-cardinality categorical columns plus the numerical columns.
nohighcar_df = df[['rated','victory_status',
                   'turns','white_rating',
                   'black_rating', 'opening_ply',
                   'rating_diff']].copy()

# Process categorical features: encode the boolean flag as 0/1 and let
# get_dummies expand the remaining non-numeric column(s).
nohighcar_df['rated'] = nohighcar_df['rated'].map({False: 0, True:1})
nohighcar_df = pd.get_dummies(nohighcar_df)


# Move all numerical features to the right of the dataframe, so that later
# cells can address them as the trailing columns of the matrix.
num_cols = ['turns', 'white_rating', 'black_rating', 'opening_ply',
            'rating_diff']
num_feat = df[num_cols]
nohighcar_df = pd.concat([nohighcar_df.drop(columns=num_cols), num_feat], axis=1)

# Cast to float before building the sparse matrix: with pandas >= 2.0 the
# dummy columns are bool, and a frame mixing bool and integer columns yields
# an object array that csr_matrix cannot convert.
nohigh_sparse = sparse.csr_matrix(nohighcar_df.astype(float).values)

In [None]:
# Feature matrix: one-hot block on the left, remaining features on the right.
X = sparse.hstack((sparse_high, nohigh_sparse))
# Target: the winner of each game.
y = df['winner']

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=11)


sc = StandardScaler()


def _scale_trailing_columns(matrix, transform, n_numeric=5):
    """Standardize the last `n_numeric` columns of a sparse matrix.

    Parameters
    ----------
    matrix : sparse matrix that supports column slicing.
    transform : callable applied to the dense trailing columns
        (here `sc.fit_transform` for the training set and `sc.transform`
        for the test set).
    n_numeric : number of trailing columns to scale; the preprocessing cell
        places the five numerical features last.

    Returns
    -------
    CSR matrix with the leading columns unchanged and the trailing columns
    replaced by their transformed values.
    """
    left = matrix[:, :-n_numeric]
    # toarray() returns a plain ndarray; todense() returns np.matrix, which
    # scikit-learn deprecated in 1.0 and rejects from 1.2.
    right = sparse.csr_matrix(transform(matrix[:, -n_numeric:].toarray()))
    return sparse.hstack((left, right)).tocsr()


# Fit the scaler on the training split only, then reuse it for the test split.
X_train = _scale_trailing_columns(X_train, sc.fit_transform)
X_test = _scale_trailing_columns(X_test, sc.transform)


Having prepared our data, let's try to classify. We will only use two models here: Logistic regression and decision trees.

# Logistic regression

In [None]:
# Fit a logistic regression and report its performance on the test split.
log_random_state = None
log_clf = LogisticRegression(random_state=log_random_state, max_iter=500)
log_clf.fit(X_train, y_train)
log_pred = log_clf.predict(X_test)
print(classification_report(y_true=y_test, y_pred=log_pred))
plot_confusion_matrix(log_clf, X_test, y_test)

# Decision trees

In [None]:
# Fit a decision tree and report its performance on the test split.
# Seed the tree so that the reported metrics are reproducible between runs;
# the value matches the seed used for the train/test split.
tree_random_state = 11
tree_clf = tree.DecisionTreeClassifier(random_state=tree_random_state).fit(X_train, y_train)
print(classification_report(y_true=y_test, y_pred=tree_clf.predict(X_test)))
plot_confusion_matrix(tree_clf, X_test, y_test)

We see that even relatively simple models (with no hyperparameter tuning) give us reasonable results.