In this notebook we train a transformer to predict human chess moves.

In [9]:
# Colab only: select transformer.py and chess_transformer_utils.py from the
# repo in the file dialog; they end up in the current working directory.
from google.colab import files

uploaded = files.upload()

# Report every uploaded file together with its size in bytes.
for fn, payload in uploaded.items():
    message = 'User uploaded file "{name}" with length {length} bytes'.format(
        name=fn, length=len(payload))
    print(message)

Saving transformer.py to transformer.py
User uploaded file "transformer.py" with length 9702 bytes


In [2]:
# The games are stored as PGN files on Google Drive (the data generators
# further down read them from '/content/drive/My Drive/...'), so the drive has
# to be mounted before any data can be loaded.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [8]:
# Check that the upload worked by listing the working directory.
# To replace a module that has changed, uncomment the matching `rm` line,
# run this cell, and then upload the new version again.
#!rm chess_transformer_utils*
#!rm transformer*
!ls -al

total 44
drwxr-xr-x 1 root root  4096 Apr 26 09:24 .
drwxr-xr-x 1 root root  4096 Apr 26 09:16 ..
-rw-r--r-- 1 root root 20304 Apr 26 09:18 chess_transformer_utils.py
drwxr-xr-x 4 root root  4096 Apr 21 13:38 .config
drwx------ 5 root root  4096 Apr 26 09:19 drive
drwxr-xr-x 2 root root  4096 Apr 26 09:19 __pycache__
drwxr-xr-x 1 root root  4096 Apr 21 13:39 sample_data


In [10]:
%load_ext autoreload
%autoreload 2

from chess_transformer_utils import *
from transformer import *
import time
import matplotlib.pyplot as plt
%matplotlib inline

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [11]:
# Number of tokens (quite high because moves are encoded as tokens as well).
# NOTE(review): the value used to carry a trailing "#27", which looks like an
# earlier, much smaller vocabulary size - confirm before relying on it.
vocab_size = 2020
maxlen = 82  # length of the input, i.e. number of tokens per sample

output_dim = 1968  # number of possible moves (starting square / target square combinations)

In [12]:
# Hyperparameters handed to TokenAndPositionEmbedding and Multi_Block when the
# model is built in the next cell.
embed_dim = 64  # Embedding size for each token
num_heads = 8  # Number of attention heads
ff_dim = 256  # Hidden layer size in the feed-forward network inside the transformer

In [13]:
# Model definition: token + position embedding -> Multi_Block (the transformer
# component) -> flatten -> one softmax Dense layer over all possible moves.
# `layers`, `keras`, TokenAndPositionEmbedding and Multi_Block are not imported
# here; presumably they arrive through the star imports above - confirm.
# The `tensorflow` module name imported here is also used by the gradient
# accumulation helper further down.
import tensorflow
inputs = layers.Input(shape=(maxlen,))
embedding_layer = TokenAndPositionEmbedding(maxlen, vocab_size, embed_dim)

x0 = embedding_layer(inputs)

# NOTE(review): the positional 4 is presumably the number of stacked blocks and
# `rate` the dropout rate (0.0 = off) - confirm against transformer.py.
multi_block = Multi_Block(maxlen, embed_dim, num_heads, ff_dim, 4, rate = 0.0)
# The block receives the embedding for both of its inputs and returns two
# tensors. According to the summary output the first one has shape
# (None, 5, 82, 64); it is not used in this notebook. Only the second one is
# flattened and fed to the output layer.
c,l = multi_block(x0,x0)

f = layers.Flatten()(l)

# One probability per possible move.
outputs = layers.Dense(output_dim, activation="softmax")(f)

model = keras.Model(inputs=inputs, outputs=outputs)


model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            [(None, 82)]         0                                            
__________________________________________________________________________________________________
token_and_position_embedding_1  (None, 82, 64)       134528      input_2[0][0]                    
__________________________________________________________________________________________________
multi__block_1 (Multi_Block)    ((None, 5, 82, 64),  199950      token_and_position_embedding_1[0]
                                                                 token_and_position_embedding_1[0]
__________________________________________________________________________________________________
flatten (Flatten)               (None, 5248)         0           multi__block_1[0][1]         

In [14]:
# This code implements gradient accumulation; its source is a GitHub comment here:
# https://github.com/keras-team/keras/issues/3556

import sys
from tensorflow.keras import backend as K


def convert_to_accumulate_gradient_optimizer(orig_optimizer, update_params_frequency, accumulate_sum_or_mean=True):
    """Patch an optimizer in place so that it accumulates gradients.

    The optimizer's ``get_gradients`` and ``get_updates`` methods are replaced
    on the instance. Every execution of the patched update adds the current
    gradients to per-parameter accumulators and increments a counter. When the
    counter is a multiple of ``update_params_frequency`` the original update
    rule is run and the accumulators are reset to zero; otherwise only the
    counter is advanced.

    NOTE(review): the patch can only take effect if the training loop drives
    the optimizer through ``get_updates``. Whether ``Model.fit`` of the
    installed TensorFlow version does so has to be confirmed.

    The function relies on the module name ``tensorflow`` and on the Keras
    backend ``K`` being available in the notebook namespace.

    Args:
        orig_optimizer: optimizer instance exposing ``get_gradients(loss, params)``
            and ``get_updates(loss, params)``. It is modified in place and also
            receives an ``accumulated_iterations`` attribute (int64 counter).
        update_params_frequency: how many executions are accumulated before the
            original update is applied; must be >= 1.
        accumulate_sum_or_mean: if true (the default) the gradients are summed
            as they are; if false each gradient is divided by
            ``update_params_frequency`` before it is added, so the accumulated
            value is the mean.

    Returns:
        None. The optimizer passed in is modified in place.

    Raises:
        ValueError: if ``update_params_frequency`` is smaller than 1.
    """
    if update_params_frequency < 1:
        raise ValueError('update_params_frequency must be >= 1')
    print('update_params_frequency: %s' % update_params_frequency)
    print('accumulate_sum_or_mean: %s' % accumulate_sum_or_mean)
    # Keep the original bound methods; the replacements below delegate to them.
    orig_get_gradients = orig_optimizer.get_gradients
    orig_get_updates = orig_optimizer.get_updates
    # Counter of executed accumulation steps, also exposed on the optimizer.
    accumulated_iterations = K.variable(0, dtype='int64', name='accumulated_iterations')
    orig_optimizer.accumulated_iterations = accumulated_iterations

    def updated_get_gradients(self, loss, params):
        """Return the accumulators instead of freshly computed gradients.

        ``loss`` and ``params`` are ignored. The accumulators only exist after
        ``updated_get_updates`` has been called on the same optimizer.
        """
        return self.accumulate_gradient_accumulators

    def updated_get_updates(self, loss, params):
        """Build the update op that accumulates and conditionally applies gradients."""
        # One zero-initialised accumulator per parameter, same shape and dtype.
        self.accumulate_gradient_accumulators = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
        updates_accumulated_iterations = K.update_add(accumulated_iterations, 1)
        # The real gradients come from the original (unpatched) method.
        new_grads = orig_get_gradients(loss, params)
        if not accumulate_sum_or_mean:
            # Mean mode: pre-scale so the accumulated sum equals the average.
            new_grads = [g / K.cast(update_params_frequency, K.dtype(g)) for g in new_grads]
        self.updated_grads = [K.update_add(p, g) for p, g in zip(self.accumulate_gradient_accumulators, new_grads)]
        def update_function():
            # Run the original update first, then zero the accumulators.
            # NOTE(review): presumably the original get_updates obtains its
            # gradients via self.get_gradients, which now returns the
            # accumulators - confirm against the optimizer implementation.
            with tensorflow.control_dependencies(orig_get_updates(loss, params)):
                reset_grads = [K.update(p, K.zeros(K.int_shape(p), dtype=K.dtype(p))) for p in self.accumulate_gradient_accumulators]
            return tensorflow.group(*(reset_grads + [updates_accumulated_iterations]))
        def just_store_function():
            # No parameter update on this step; only the counter op is grouped.
            return tensorflow.group(*[updates_accumulated_iterations])
        
        # True on every update_params_frequency-th increment of the counter.
        update_switch = K.equal((updates_accumulated_iterations) % update_params_frequency, 0)
        
        # The accumulation ops must have run before either branch executes.
        with tensorflow.control_dependencies(self.updated_grads):
            self.updates = [K.switch(update_switch, update_function, just_store_function)]
            return self.updates

    # Bind the replacements to this optimizer instance only; the class itself
    # and other optimizer instances stay untouched.
    orig_optimizer.get_gradients = updated_get_gradients.__get__(orig_optimizer, type(orig_optimizer))
    orig_optimizer.get_updates = updated_get_updates.__get__(orig_optimizer, type(orig_optimizer))

In [15]:
# Training and validation batches are read from PGN files on the mounted
# drive. Both paths have to be adapted to your own Drive layout.
batch_size = 512

train_generator = data_generator(
    '/content/drive/My Drive/selectedOTB.pgn',
    1.0,
    batch_size,
    pool_size=10,
    elo=True,
    move_tokens=10,
)
validation_generator = data_generator(
    '/content/drive/My Drive/sel_val_OTB.pgn',
    1.0,
    600,  # same positional slot as batch_size above, but a fixed value
    elo=True,
    move_tokens=10,
    validation=True,
)

In [16]:
# Optimizer: Adam with its default settings.
# Optional gradient accumulation: uncomment the two lines at the bottom to
# patch `opt` with convert_to_accumulate_gradient_optimizer, so that the
# original update is applied only every STEPS batches.

from tensorflow.keras.optimizers import Adam
opt = Adam()

#STEPS       = 4  # how many batches you go before updating
#convert_to_accumulate_gradient_optimizer(opt, STEPS)

In [19]:
# The sparse variant of categorical cross-entropy takes integer class ids as
# targets instead of one-hot vectors; accuracy is tracked as an extra metric.
model.compile(
    optimizer=opt,
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"],
)

In [20]:
# Save checkpoints - especially important on Colab. The path has to be adapted.

checkpoint_path = "/content/drive/My Drive/chesstransformer_checkpoints/cp_best_val_loss.ckpt"

# Callback that is evaluated at the end of every epoch and writes the complete
# model (save_weights_only=False, so not just the weights) whenever val_loss
# reaches a new minimum. Earlier checkpoints at the same path are overwritten.
cp_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_path,
    verbose=1,
    monitor='val_loss',
    mode='min',
    save_weights_only=False,
    save_best_only=True,
    # `period` is deprecated; save_freq='epoch' is the equivalent of period=1.
    save_freq='epoch')



In [21]:
# Train on 1000 generator batches per epoch and validate on 100 batches after
# each epoch. Checkpointing is currently switched off; pass
# `callbacks=[cp_callback]` to model.fit to enable it.
history = model.fit(
    train_generator,
    validation_data=validation_generator,
    steps_per_epoch=1000,
    validation_steps=100,
    epochs=300,
)

Epoch 1/300
 112/1000 [==>...........................] - ETA: 1:07:22 - loss: 6.5174 - accuracy: 0.0487

KeyboardInterrupt: ignored