In [1]:
import re
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.dummy import DummyClassifier

import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras import layers, models

2025-06-11 13:28:54.873436: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE3 SSE4.1 SSE4.2 AVX AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [78]:
LIMIT = 100000

In [79]:
with open("lichess_db_standard_rated_2014-07.pgn") as f:
    i = 0
    data = ""
    for l in f:
        i += 1
        data += l
        if i > LIMIT:
            break

moves = re.findall("\[WhiteElo \"(\d+)\"\]\n\[BlackElo \"(\d+)\"\]\n\[WhiteRatingDiff \"[-+]?\d+\"\]\n\[BlackRatingDiff \"[-+]?\d+\"\]\n\[ECO \"[A-Z0-9]+\"\]\n\[Opening \".*\"\]\n\[TimeControl \".*\"\]\n\[Termination \".*\"]\n\n(1\..*)\n", data, re.MULTILINE)


with open("games.csv", "w") as f:
    print("elo,moves", file=f)
    for m in moves:
        mo = re.sub(r'\{[^{}]*\}', '', m[2]) #clean
        mo = re.sub(r'\d+\.+ ', '', mo) #clean
        print(f"{(int(m[0])+int(m[1]))//2},{mo}", file=f)
        

In [80]:
len(moves)

5320

In [2]:
df = pd.read_csv('games.csv')  

In [15]:
df['elo_slice'] = (df['elo'] // 200) * 200

# Filter to reasonable range (e.g. 800–2800)
df = df[(df['elo_slice'] >= 800) & (df['elo_slice'] <= 2800)]

df

Unnamed: 0,elo,moves,elo_slice
0,1491,d3 d5 g3 e6 Bg2 Nf6 Nf3 Be7 O-O O-O Re1 a6 e4 ...,1400
1,1390,e4 e6 Nf3 c5 g3 a6 Bg2 Nc6 O-O d6 h3 Be7 Nc3 Q...,1200
2,1471,e4 g6 Nf3 Bg7 d4 e6 Nc3 Ne7 Be3 O-O Be2 d6 O-O...,1400
3,1725,e4 g6 d4 Bg7 e5 e6 f4 Ne7 Nf3 d5 Nc3 O-O Be3 N...,1600
4,1433,e4 c5 Nf3 Nc6 Bc4 Nf6 Nc3 e5 O-O d6 a3 Nd4 Nxd...,1400
...,...,...,...
5315,1434,e4 e6 Nf3 d6 e5 dxe5 Nxe5 Bd6 Nxf7 Kxf7 Qh5+ g...,1400
5316,1739,d4 d5 c4 c6 cxd5 Qxd5 Nc3 Qd8 Bf4 Nf6 Nf3 Nd5 ...,1600
5317,1840,e4 d5 e5 Bf5 d4 e6 c3 c5 Be3 Nc6 dxc5 Nxe5 f4 ...,1800
5318,1600,d4 e6 Nc3 b6 Nf3 Bb7 e3 f5 Bd3 Bb4 Bd2 Nf6 a3 ...,1600


In [16]:
slice_counts = df['elo_slice'].value_counts().sort_index()
min_samples = 2  # or bump to 5 if you like
good_slices = slice_counts[slice_counts >= min_samples].index
df_filtered = df[df['elo_slice'].isin(good_slices)].copy()
print(f"Kept {len(good_slices)} slices; {len(df_filtered)} games remain.")

Kept 6 slices; 5319 games remain.


In [17]:
# Cell Z: Re-split with only adequate classes
X = df_filtered['moves']
y = df_filtered['elo_slice']

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

print("Train slice counts:\n", y_train.value_counts())
print("Test  slice counts:\n", y_test.value_counts())


Train slice counts:
 elo_slice
1400    1610
1600    1357
1200     671
1800     458
1000      80
2000      79
Name: count, dtype: int64
Test  slice counts:
 elo_slice
1400    402
1600    339
1200    168
1800    115
1000     20
2000     20
Name: count, dtype: int64


## Approach 1: Dumb Baseline (Most Frequent Class)

A DummyClassifier that always predicts the most frequent slice.

In [18]:
dummy = DummyClassifier(strategy='most_frequent')
dummy.fit(X_train.values.reshape(-1,1), y_train)
y_pred = dummy.predict(X_test.values.reshape(-1,1))
print("Baseline accuracy:", accuracy_score(y_test, y_pred))

Baseline accuracy: 0.37781954887218044


## Approach 2: Simple Feature – Number of Moves

Use just the count of moves as a single feature.

In [19]:
def count_moves(san_str):
    return len(san_str.split())

X_train_len = X_train.apply(count_moves).to_frame('num_moves')
X_test_len  = X_test.apply(count_moves).to_frame('num_moves')

lr_len = LogisticRegression(max_iter=1000)
lr_len.fit(X_train_len, y_train)
y_pred = lr_len.predict(X_test_len)
print("Num‐moves accuracy:", accuracy_score(y_test, y_pred))


Num‐moves accuracy: 0.37781954887218044


## Approach 3: Bag-of-Moves (CountVectorizer) + Logistic Regression

Treat the SAN move list as text and vectorize.

In [20]:
vect = CountVectorizer(token_pattern=r"[^ ]+")
Xtr_counts = vect.fit_transform(X_train)
Xte_counts = vect.transform(X_test)

lr_bow = LogisticRegression(max_iter=1000)
lr_bow.fit(Xtr_counts, y_train)
y_pred = lr_bow.predict(Xte_counts)
print("BoW + LR accuracy:", accuracy_score(y_test, y_pred))


BoW + LR accuracy: 0.35902255639097747


## Approach 4: TF-IDF + Random Forest

TF-IDF weighing and a tree-based model.

In [21]:
tfidf = TfidfVectorizer(token_pattern=r"[^ ]+")
Xtr_tfidf = tfidf.fit_transform(X_train)
Xte_tfidf = tfidf.transform(X_test)

rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(Xtr_tfidf, y_train)
y_pred = rf.predict(Xte_tfidf)
print("TF-IDF + RF accuracy:", accuracy_score(y_test, y_pred))


TF-IDF + RF accuracy: 0.3881578947368421


## Approach 5: Simple RNN (LSTM) on Move Sequences

Tokenize moves, pad sequences, and train an LSTM classifier in Keras.


In [22]:
# Cell 7: LSTM Sequence Model
# 1) Tokenize
tokenizer = Tokenizer(lower=False, filters='')
tokenizer.fit_on_texts(X_train)
Xtr_seq = tokenizer.texts_to_sequences(X_train)
Xte_seq = tokenizer.texts_to_sequences(X_test)

# 2) Pad
maxlen = 200  # adjust based on distribution
Xtr_pad = pad_sequences(Xtr_seq, maxlen=maxlen, padding='post')
Xte_pad = pad_sequences(Xte_seq, maxlen=maxlen, padding='post')

# 3) Encode labels
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
ytr_enc = le.fit_transform(y_train)
yte_enc = le.transform(y_test)
num_classes = len(le.classes_)

# 4) Build model
model = models.Sequential([
    layers.Embedding(input_dim=len(tokenizer.word_index)+1, output_dim=64, input_length=maxlen),
    layers.Bidirectional(layers.LSTM(64)),
    layers.Dense(64, activation='relu'),
    layers.Dense(num_classes, activation='softmax')
])
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# 5) Train & evaluate
model.fit(Xtr_pad, ytr_enc, epochs=30, batch_size=64, validation_split=0.1)
loss, acc = model.evaluate(Xte_pad, yte_enc)
print("LSTM test accuracy:", acc)


Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
LSTM test accuracy: 0.32048872113227844


## Approach 6: Transformer Encoder

A simple Transformer encoder (via Keras `TextVectorization` + `TransformerBlock`).

In [25]:
# Cell 8: Transformer-Based Model (sketch; may require more tuning)
from tensorflow.keras.layers import TextVectorization
from tensorflow.keras import Input

# Vectorize text
max_tokens = 20000
vectorizer = TextVectorization(max_tokens=max_tokens, output_sequence_length=maxlen, split='whitespace')
vectorizer.adapt(X_train)

# Define a Transformer block
class TransformerBlock(layers.Layer):
    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1):
        super().__init__()
        self.att = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.ffn = models.Sequential([
            layers.Dense(ff_dim, activation='relu'),
            layers.Dense(embed_dim),
        ])
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = layers.Dropout(rate)
        self.dropout2 = layers.Dropout(rate)
    def call(self, inputs, training):
        attn_output = self.att(inputs, inputs)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)

# Build model
inputs = Input(shape=(1,), dtype=tf.string)
x = vectorizer(inputs)
x = layers.Embedding(max_tokens, 64)(x)
x = TransformerBlock(embed_dim=64, num_heads=4, ff_dim=128)(x)
x = layers.GlobalAveragePooling1D()(x)
x = layers.Dense(64, activation='relu')(x)
outputs = layers.Dense(num_classes, activation='softmax')(x)
tfm_model = models.Model(inputs, outputs)
tfm_model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train & eval
tfm_model.fit(X_train.values, ytr_enc, epochs=50, batch_size=128, validation_split=0.1)
loss, acc = tfm_model.evaluate(X_test.values, yte_enc)
print("Transformer test accuracy:", acc)


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Transformer test accuracy: 0.3054511249065399
