In [None]:
import tensorflow as tf
from tensorflow.python.framework.convert_to_constants import convert_variables_to_constants_v2
import numpy as np
import random
from tqdm import tqdm
import matplotlib.pyplot as plt
from Polyomino import Polyomino
from MCTS import search

# --- Training schedule ------------------------------------------------------
# How many times the loop alternates between self-play and a network update.
ITERATIONS = 100
# Self-play games run in each iteration before the network is trained on a minibatch.
SELFPLAY_GAMES = 40
# Whole-game simulations performed per search tree.
SEARCH_ITERATIONS = 50
# States drawn for each (single-epoch) network update.
BATCH_SIZE = 256

# --- Board geometry ---------------------------------------------------------
height = 8
width = 10

# One game object per self-play slot; the slot number is handed to the game as its idx.
games = [Polyomino(height, width, slot) for slot in range(SELFPLAY_GAMES)]
# The action space has one entry per piece defined by the game.
action_size = len(games[0].pieces)

# Two-input, two-output network.
#   input1: the board, shaped (height, width, 1), processed by a convolutional stack.
#   input2: the available-pieces vector (length 5), concatenated in only after the
#           convolutional features have been flattened and compressed.
#   pi:     softmax distribution over the `action_size` actions.
#   v:      sigmoid-squashed scalar value estimate.
input1 = tf.keras.layers.Input(shape=(height,width,1))
# `shape` has to be a tuple: `(5)` is just the integer 5, which stricter Keras
# releases refuse to interpret as a shape. `(5,)` is accepted everywhere.
input2 = tf.keras.layers.Input(shape=(5,))
conv1 = tf.keras.layers.Conv2D(64,(3,3), activation = 'relu', padding = 'same')(input1)
conv1 = tf.keras.layers.Conv2D(128,(3,3), activation = 'relu', padding = 'same')(conv1)
conv1 = tf.keras.layers.Conv2D(256,(3,3), activation = 'relu', padding = 'same')(conv1)
flat = tf.keras.layers.Flatten()(conv1)
flat2 = tf.keras.layers.Dense(128, activation="relu")(flat)
merged = tf.keras.layers.Concatenate()([flat2, input2])
# Shared trunk: the policy head reads it directly, the value head adds one more layer.
dense = tf.keras.layers.Dense(128, activation="relu")(merged)
output1 = tf.keras.layers.Dense(action_size, activation="softmax", name='pi')(dense)
dense = tf.keras.layers.Dense(128, activation="relu")(dense)
output2= tf.keras.layers.Dense(1, activation="sigmoid", name='v',
                               kernel_initializer=tf.keras.initializers.RandomNormal(mean=0.1))(dense)
model = tf.keras.models.Model(inputs=[input1,input2], outputs=[output1, output2])
# Losses are listed in output order (pi, v); the value loss is weighted 100x
# relative to the policy cross-entropy.
model.compile(loss=['categorical_crossentropy', "mean_squared_error"], loss_weights=[1., 100.],
              optimizer = tf.keras.optimizers.Adam(learning_rate=0.0001))

# Only this many of the most recent states are eligible for training, and
# training does not start until at least this many have been collected.
REPLAY_WINDOW = 3000


def _freeze_model(keras_model):
    """Return `keras_model` as a frozen concrete function.

    The model's variables are converted to constants, so the returned function
    reflects the weights at the time of the call and must be rebuilt after
    every weight update.
    """
    wrapped = tf.function(lambda x: keras_model(x))
    concrete = wrapped.get_concrete_function(
        [tf.TensorSpec(model_input.shape, model_input.dtype) for model_input in keras_model.inputs])
    return convert_variables_to_constants_v2(concrete)


def _expand_policy(visits, child_actions, num_actions):
    """Scatter per-child visit counts into a vector with one slot per action.

    The search tree only has children for the actions it kept, so every other
    action receives 0. `action[0]` is used as the slot index.
    Assumes each child has a distinct `action[0]` -- TODO confirm against MCTS.
    """
    visits = np.asarray(visits)
    policy = np.zeros(num_actions, dtype=visits.dtype)
    policy[np.array([action[0] for action in child_actions], dtype=int)] = visits
    return policy


examples = []
avg_values = []
history = []
waiting_for_result = []
for _ in tqdm(range(ITERATIONS)):
    # Conversion to a frozen function increases prediction efficiency
    frozen_func = _freeze_model(model)

    # Advance every unfinished game by one move per pass until all games are over.
    while (unfinished_games:=[game for game in games if game.get_legal_actions() != []]) != []:
        roots = search(np.array(unfinished_games), frozen_func, SEARCH_ITERATIONS)
        for game, root in zip(unfinished_games, roots):
            visits = np.asarray(root.children_visits)
            child_actions = [child.action for child in root.children]

            # In order to train the model, we have to put back the invalid actions we took out
            # during tree search and give them a value of 0
            pi_example = _expand_policy(visits, child_actions, action_size)

            waiting_for_result.append((game.get_board(), pi_example/np.sum(pi_example),
                                       game.get_available_pieces(), game.idx))
            # Stochastically determine the next piece to place
            choice = random.choices(population=range(len(child_actions)),
                                    weights=visits/np.sum(visits))[0]
            game.step(root.children[choice].action)

    # Resulting game outcome of each state is not known until the games are finished
    examples += [(board, pi, available, games[game_idx].value())
                 for board, pi, available, game_idx in waiting_for_result]
    waiting_for_result = []
    for game in games:
        game.reset_game()

    # Track the mean outcome over the replay window (slicing already yields a copy).
    recent_examples = examples[-REPLAY_WINDOW:]
    avg_values.append(np.mean([example[3] for example in recent_examples]))

    # Model is trained on a random minibatch of the most recent states. The batch is
    # only assembled when training actually happens, and the reshape infers the batch
    # dimension so a short batch cannot raise.
    if len(examples) >= REPLAY_WINDOW:
        random.shuffle(recent_examples)
        batch = recent_examples[:BATCH_SIZE]
        states = np.reshape(np.array([example[0] for example in batch]), (-1,height,width,1))
        actions = np.array([example[1] for example in batch])
        pieces = np.array([example[2] for example in batch])
        values = np.array([example[3] for example in batch])
        history.append(model.fit([states, pieces], [actions, values], epochs=1))

In [None]:
# Play games manually after model is trained
MANUAL_SEARCH_ITERATIONS = 500  # Simulations for the single inspection search below

games = [Polyomino(height, width, idx) for idx in range(1)]
game = games[0]

# The training loop freezes the network at the start of each iteration, i.e. before
# that iteration's weight update, so the `frozen_func` it leaves behind lags `model`
# by one update. Re-freeze here so the search and the direct `model(...)` call
# printed below use the same weights.
full_model = tf.function(lambda x: model(x))
full_model = full_model.get_concrete_function([tf.TensorSpec(model.inputs[0].shape, model.inputs[0].dtype),
                                               tf.TensorSpec(model.inputs[1].shape, model.inputs[1].dtype)])
frozen_func = convert_variables_to_constants_v2(full_model)

# Moves replayed (in order, via `game.step`) before the search is run.
# Uncomment or edit entries to set up a position; empty means the initial board.
manual_moves = [
    # (12, 0),
    # (0, 1),
    # (2, 4),
    # (8, 7),
    # (15, 21),
    # (11, 22),
    # (17, 19),
    # (12, 24),
    # (17, 30),
    # (18, 34),
    # (8, 35),
    # (8, 47),
    # (10, 51),
    # (14, 57),
]
for move in manual_moves:
    game.step(move)

root = search(np.array(games), frozen_func, MANUAL_SEARCH_ITERATIONS)[0]
print(root.children_values)

# Expand the per-child visit counts to one slot per action (0 for actions the
# search has no child for), using action[0] as the slot index.
visits = np.asarray(root.children_visits)
child_actions = [child.action for child in root.children]
pi_example = np.zeros(action_size, dtype=visits.dtype)
pi_example[np.array([action[0] for action in child_actions], dtype=int)] = visits
print(pi_example)
print(root.get_normalized_priors())
# Shapes follow the configured board size rather than a hard-coded 8x10; the
# piece vector is reshaped to a single row of whatever length the game reports.
print(model([np.reshape(game.get_board(), (1,height,width,1)),
             np.reshape(game.get_available_pieces(), (1,-1))]))

print(np.reshape(game.get_board(), (height,width)))
print(game.get_legal_actions())

In [None]:
# Section to visualize results post-training

plt.rcParams["figure.figsize"] = [6.4, 4.8]
plt.rcParams["figure.autolayout"] = True

# avg_values has one entry per iteration. history only has an entry for the
# iterations in which the network was trained; once training starts it happens
# every iteration, so those entries line up with the *last* len(data2) iterations.
data1 = [1 - value for value in avg_values]
data2 = [fit_result.history["loss"][0] for fit_result in history]
t = range(len(data1))
t2 = range(len(data1)-len(data2), len(data1))
fig, ax1 = plt.subplots()

# Left axis: 1 - mean game result over the replay window.
ax1.set_xlabel('Iterations')
ax1.set_ylabel('1 - Game Result', color='red')
ax1.plot(t, data1, color='red')
ax1.tick_params(axis='y', labelcolor='red')

# Right axis (shared x): total network loss of each training step.
ax2 = ax1.twinx()
ax2.set_ylabel('Network Loss', color="blue")
ax2.plot(t2, data2, color="blue")
ax2.tick_params(axis='y', labelcolor="blue")

# Title reflects the configured board size instead of a hard-coded "8x10".
plt.title(f"Training results over {height}x{width} board")
plt.show()