In [1]:
import json
import os
import sys
from pathlib import Path

import numpy as np
import torch
from torch.utils.data import DataLoader, TensorDataset

In [2]:
path = str(Path().resolve() / 'agents/Group27/mcts')
if (path not in sys.path):
    sys.path.append(path)

from PolicyModel import PolicyModel
from HeuristicModel import HeuristicModel

In [3]:
modelName = input()
if (modelName == ''):
    modelName = 'test'

# Data

In [4]:
def loadData(dataFile):
    path = str(Path().resolve())
    with open(os.path.join(path, '..', 'data', f'{dataFile}.json'), 'rb') as f:
        rawData = json.load(f)
    print(len(rawData))
    return rawData

rawExpertData = loadData('expert-v-expert')
rawSelfPlayData = loadData('chump-v-chump')

5998
105501


In [5]:
for item in rawSelfPlayData.items():
    print(item[0])
    tempMoves = item[1]['moves']
    print(tempMoves)
    break

0 0 0 0 0 0 0 0 0 0 0 
 0 0 0 0 0 0 0 0 0 0 0 
  0 0 0 0 0 0 0 0 0 0 0 
   0 0 0 0 0 0 0 0 0 0 0 
    0 0 0 0 0 0 0 0 0 0 0 
     0 0 0 0 0 0 0 0 0 0 0 
      0 0 0 0 0 0 0 0 0 0 0 
       0 0 0 0 0 0 0 0 0 0 0 
        0 0 0 0 0 0 0 0 0 0 0 
         0 0 0 0 0 0 0 0 0 0 0 
          0 0 0 0 0 0 0 0 0 0 0 

[[6, 8, 9, 8, 7, 8, 8, 5, 6, 7, 12], [6, 5, 7, 9, 8, 3, 13, 10, 10, 8, 10], [9, 8, 11, 5, 11, 12, 10, 8, 11, 8, 10], [5, 9, 6, 7, 7, 13, 4, 7, 8, 10, 9], [8, 6, 7, 6, 10, 6, 6, 13, 12, 8, 6], [4, 10, 10, 6, 12, 6, 11, 7, 4, 8, 9], [5, 8, 7, 5, 5, 14, 9, 13, 9, 19, 6], [15, 6, 7, 9, 10, 12, 11, 7, 6, 7, 8], [10, 7, 6, 9, 2, 12, 4, 12, 10, 6, 5], [7, 5, 10, 10, 9, 8, 9, 6, 7, 9, 12], [12, 11, 11, 7, 2, 6, 8, 10, 7, 7, 10]]


In [6]:
tempBoard = "R 0 0 0 R B 0 0 B R 0 \n R 0 B 0 0 B R B 0 R B \n  B 0 R 0 R R B 0 B R 0 \n   0 0 0 0 0 B 0 R 0 B 0 \n    B R 0 0 B R B R R R R \n     R B B B 0 0 R B 0 0 R \n      B R 0 R 0 R R B R B 0 \n       B R R B R B B B 0 R R \n        B R 0 0 B 0 0 0 R B 0 \n         0 B R B B B R B B 0 R \n          B 0 0 0 0 B R B R R R \n"

## State Tensors

In [7]:
def tensorfyBoard(boardString):
    boardString = boardString.replace('R', '1').replace('B', '2')
    boardRows = boardString.strip().split('\n')
    board = [list(map(int, row.strip().split())) for row in boardRows]

    board = torch.tensor(board, dtype=torch.int)

    rStones = (board == 1).int()
    bStones = (board == 2).int()
    nStones = (board == 0).int()

    return torch.stack([rStones, bStones, nStones])

tensorfyBoard(tempBoard)

tensor([[[1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0],
         [1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0],
         [0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0],
         [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0],
         [0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1],
         [1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1],
         [0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0],
         [0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1],
         [0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0],
         [0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1],
         [0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1]],

        [[0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0],
         [0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1],
         [1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0],
         [0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0],
         [1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0],
         [0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0],
         [1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0],
         [1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0],
         [1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0],
         [0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0],
         [1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0]],

        [[0, 1, 1, 1, 0, 0, 

## Move Frequencies

In [8]:
def encodeMoves(moves2D, boardTensor):
    moves = torch.tensor(np.array(moves2D).flatten())

    # smoothing
    moves += boardTensor[2].flatten()

    # normalise
    moves = moves / moves.sum()
    return moves

encodeMoves(tempMoves, tensorfyBoard(tempBoard))

tensor([0.0057, 0.0086, 0.0096, 0.0086, 0.0067, 0.0077, 0.0086, 0.0057, 0.0057,
        0.0067, 0.0125, 0.0057, 0.0057, 0.0067, 0.0096, 0.0086, 0.0029, 0.0125,
        0.0096, 0.0105, 0.0077, 0.0096, 0.0086, 0.0086, 0.0105, 0.0057, 0.0105,
        0.0115, 0.0096, 0.0086, 0.0105, 0.0077, 0.0105, 0.0057, 0.0096, 0.0067,
        0.0077, 0.0077, 0.0125, 0.0048, 0.0067, 0.0086, 0.0096, 0.0096, 0.0077,
        0.0057, 0.0077, 0.0067, 0.0096, 0.0057, 0.0057, 0.0125, 0.0115, 0.0077,
        0.0057, 0.0038, 0.0096, 0.0096, 0.0057, 0.0125, 0.0067, 0.0105, 0.0067,
        0.0048, 0.0086, 0.0086, 0.0048, 0.0077, 0.0077, 0.0048, 0.0057, 0.0134,
        0.0086, 0.0125, 0.0086, 0.0182, 0.0067, 0.0144, 0.0057, 0.0067, 0.0086,
        0.0096, 0.0115, 0.0105, 0.0067, 0.0067, 0.0067, 0.0077, 0.0096, 0.0067,
        0.0067, 0.0096, 0.0019, 0.0125, 0.0048, 0.0125, 0.0096, 0.0057, 0.0057,
        0.0077, 0.0048, 0.0096, 0.0096, 0.0086, 0.0077, 0.0086, 0.0057, 0.0067,
        0.0096, 0.0115, 0.0115, 0.0115, 

## Generation

In [9]:
def processDataSet(dataSet):
    boardTensors = []
    moveTensors = []
    payoffTensors = []

    for (boardString, data) in dataSet.items():
        boardTensor = tensorfyBoard(boardString)
        moveTensor = encodeMoves(data['moves'], boardTensor)

        boardTensors.append(boardTensor)
        moveTensors.append(moveTensor)
        payoffTensors.append(torch.tensor(data['payoff']))

    boards = torch.stack(boardTensors)
    moves = torch.stack(moveTensors)
    payoffs = torch.stack(payoffTensors)

    return boards, moves, payoffs

expertBoards, expertMoves, expertPayoffs = processDataSet(rawExpertData)
print(expertBoards.shape, expertMoves.shape, expertPayoffs.shape)
selfPlayBoards, selfPlayMoves, selfPlayPayoffs = processDataSet(rawSelfPlayData)
print(selfPlayBoards.shape, selfPlayMoves.shape, selfPlayPayoffs.shape)

torch.Size([5998, 3, 11, 11]) torch.Size([5998, 121]) torch.Size([5998])
torch.Size([105501, 3, 11, 11]) torch.Size([105501, 121]) torch.Size([105501])


## Data Augmentation

The Hex board is symmetric. We can use this to augment our data. For each board state, we can generate 6 more board states by rotating the board by 60 degrees each time. This will give us 7 times more data to train on.

In [10]:
# TODO

# Policy Model
## Training

In [11]:
policyModel = PolicyModel(boardSize=11)

In [12]:
def trainModel(model, criterion, boards, expected, softmax, batchSize=256, learningRate=1e-3, epochs=10):

    optimiser = torch.optim.Adam(model.parameters(), lr=learningRate)

    dataset = TensorDataset(boards, expected)
    dataLoader = DataLoader(dataset, batch_size=batchSize, shuffle=True)

    model.train()

    for epoch in range(epochs):
        epochLoss = 0.0
        for batchBoards, batchExpected in dataLoader:
            optimiser.zero_grad()

            # forward
            logits = model(batchBoards)
            if softmax:
                predictions = torch.nn.functional.log_softmax(logits, dim=1)
            else:
                predictions = logits.squeeze(1)

            # loss
            loss = criterion(predictions, batchExpected)
            epochLoss += loss.item()

            # backpropagation
            loss.backward()
            optimiser.step()

        print(f'Epoch {epoch+1}: {epochLoss/len(dataLoader):.4f}')

    return model

criterion = torch.nn.KLDivLoss(reduction='batchmean')

# train on expert moves
print('EXPERT')
trainModel(policyModel, criterion, expertBoards.float(), expertMoves, softmax=True, learningRate=1e-2)
# train on self-play moves
print('SELF-PLAY')
trainModel(policyModel, criterion, selfPlayBoards.float(), selfPlayMoves, softmax=True, learningRate=1e-3)

EXPERT
Epoch 1: 0.1506
Epoch 2: 0.1469
Epoch 3: 0.1472
Epoch 4: 0.1465
Epoch 5: 0.1469
Epoch 6: 0.1465
Epoch 7: 0.1468
Epoch 8: 0.1469
Epoch 9: 0.1470
Epoch 10: 0.1468
SELF-PLAY
Epoch 1: 0.7738
Epoch 2: 0.7706
Epoch 3: 0.7708
Epoch 4: 0.7704
Epoch 5: 0.7705
Epoch 6: 0.7709
Epoch 7: 0.7707
Epoch 8: 0.7709
Epoch 9: 0.7709
Epoch 10: 0.7706


PolicyModel(
  (conv1): Conv2d(3, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (conv2): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (conv3): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (convp): Conv2d(64, 2, kernel_size=(1, 1), stride=(1, 1))
  (fcp): Linear(in_features=242, out_features=121, bias=True)
)

## Results

In [13]:
def infer(model, boardString):
    model.eval()
    with torch.no_grad():
        boardTensor = tensorfyBoard(boardString)
        logits = model(boardTensor.float().unsqueeze(0))
        probs = torch.nn.functional.softmax(logits, dim=1)

    return probs

infer(policyModel, tempBoard)

tensor([[0.0080, 0.0081, 0.0083, 0.0084, 0.0081, 0.0084, 0.0081, 0.0085, 0.0077,
         0.0082, 0.0081, 0.0083, 0.0084, 0.0083, 0.0083, 0.0083, 0.0080, 0.0083,
         0.0086, 0.0084, 0.0082, 0.0080, 0.0078, 0.0082, 0.0081, 0.0086, 0.0082,
         0.0086, 0.0081, 0.0080, 0.0085, 0.0082, 0.0084, 0.0082, 0.0084, 0.0082,
         0.0085, 0.0080, 0.0081, 0.0086, 0.0082, 0.0081, 0.0082, 0.0085, 0.0082,
         0.0083, 0.0083, 0.0083, 0.0087, 0.0079, 0.0080, 0.0084, 0.0082, 0.0081,
         0.0083, 0.0084, 0.0079, 0.0080, 0.0085, 0.0079, 0.0083, 0.0079, 0.0083,
         0.0085, 0.0077, 0.0080, 0.0082, 0.0086, 0.0080, 0.0082, 0.0081, 0.0083,
         0.0084, 0.0083, 0.0080, 0.0081, 0.0085, 0.0083, 0.0084, 0.0085, 0.0084,
         0.0083, 0.0086, 0.0085, 0.0084, 0.0085, 0.0082, 0.0078, 0.0087, 0.0079,
         0.0084, 0.0082, 0.0085, 0.0082, 0.0085, 0.0083, 0.0083, 0.0086, 0.0082,
         0.0083, 0.0083, 0.0081, 0.0084, 0.0080, 0.0089, 0.0080, 0.0083, 0.0082,
         0.0085, 0.0085, 0.0

In [14]:
torch.save(policyModel.state_dict(), f'./models/{modelName}_policy.pth')

# Heuristic Model

## Training

In [15]:
heuristicModel = HeuristicModel(boardSize=11)

In [16]:
criterion = torch.nn.MSELoss()

# train on expert moves
print('EXPERT')
trainModel(heuristicModel, criterion, expertBoards.float(), expertPayoffs.float(), softmax=False, batchSize=256, learningRate=1e-2)
# train on self-play moves
print('SELF-PLAY')
trainModel(heuristicModel, criterion, selfPlayBoards.float(), selfPlayPayoffs.float(), softmax=False, batchSize=256, learningRate=1e-3)

EXPERT
Epoch 1: 1.1217
Epoch 2: 1.1213
Epoch 3: 1.1240
Epoch 4: 1.1201
Epoch 5: 1.1229
Epoch 6: 1.1215
Epoch 7: 1.1253
Epoch 8: 1.1229
Epoch 9: 1.1235
Epoch 10: 1.1206
SELF-PLAY
Epoch 1: 1.0634
Epoch 2: 1.0619
Epoch 3: 1.0618
Epoch 4: 1.0619
Epoch 5: 1.0624
Epoch 6: 1.0629
Epoch 7: 1.0618
Epoch 8: 1.0618
Epoch 9: 1.0619
Epoch 10: 1.0618


HeuristicModel(
  (conv1): Conv2d(3, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (conv2): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (conv3): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (convh): Conv2d(64, 2, kernel_size=(1, 1), stride=(1, 1))
  (fch): Linear(in_features=242, out_features=1, bias=True)
)

## Results

In [17]:
infer(heuristicModel, tempBoard)

tensor([[1.]])

In [18]:
torch.save(heuristicModel.state_dict(), f'./models/{modelName}_heuristic.pth')