In [1]:
import json
import os
import sys
from pathlib import Path

import numpy as np
import torch
from torch.utils.data import DataLoader, TensorDataset

In [2]:
# Make the agent's MCTS package importable from the notebook by adding
# <cwd>/agents/Group27/mcts to sys.path (only once, so re-running is harmless).
path = str(Path().resolve().joinpath('agents', 'Group27', 'mcts'))
if path not in sys.path:
    sys.path.append(path)

from PolicyModel import PolicyModel
from HeuristicModel import HeuristicModel

In [3]:
# Prompt for the name used as the prefix of the saved model files;
# an empty answer falls back to 'test'.
modelName = input() or 'test'

# Data

In [None]:
def loadData(dataFile):
    """Load ``../data/<dataFile>.json`` (relative to the working directory).

    Prints the number of top-level entries as a quick size check and returns
    the parsed JSON object.
    """
    jsonPath = Path().resolve() / '..' / 'data' / f'{dataFile}.json'
    with jsonPath.open('rb') as handle:
        parsed = json.load(handle)
    print(len(parsed))
    return parsed

# Load the two raw datasets; each call prints the number of entries it read.
rawExpertData = loadData('expert-v-expert')
rawSelfPlayData = loadData('monkey-v-monkey')

In [None]:
# Peek at the first self-play record: print its key and its 'moves' entry.
# tempMoves is kept as a sample input for the encodeMoves demo further down.
for sampleBoard, sampleRecord in rawSelfPlayData.items():
    print(sampleBoard)
    tempMoves = sampleRecord['moves']
    print(tempMoves)
    break

In [6]:
# Sample board string used by the demo cells: 11 newline-terminated rows of
# 11 space-separated tokens, each token being 'R', 'B' or '0'.
tempBoard = "R 0 0 0 R B 0 0 B R 0 \n R 0 B 0 0 B R B 0 R B \n  B 0 R 0 R R B 0 B R 0 \n   0 0 0 0 0 B 0 R 0 B 0 \n    B R 0 0 B R B R R R R \n     R B B B 0 0 R B 0 0 R \n      B R 0 R 0 R R B R B 0 \n       B R R B R B B B 0 R R \n        B R 0 0 B 0 0 0 R B 0 \n         0 B R B B B R B B 0 R \n          B 0 0 0 0 B R B R R R \n"

## State Tensors

In [None]:
def tensorfyBoard(boardString):
    """Convert a textual board into a stack of three 0/1 integer planes.

    'R' tokens are mapped to 1 and 'B' tokens to 2 before parsing; every
    whitespace-separated token on each line is then read as an integer.

    Returns an int tensor of shape (3, rows, cols) whose planes mark, in
    order, the cells equal to 1 ('R'), equal to 2 ('B') and equal to 0.
    """
    numericText = boardString.translate(str.maketrans('RB', '12'))
    grid = torch.tensor(
        [[int(token) for token in line.split()] for line in numericText.strip().split('\n')],
        dtype=torch.int,
    )

    # plane order: R stones, B stones, empty cells
    return torch.stack([(grid == cellValue).int() for cellValue in (1, 2, 0)])

# Sanity check: display the three stacked planes for the sample board.
tensorfyBoard(tempBoard)

## Move Frequencies

In [None]:
def encodeMoves(moves2D, boardTensor):
    """Turn a 2D grid of move counts into a flat, normalised target distribution.

    Args:
        moves2D: nested sequence (or array) of per-cell move counts.
        boardTensor: output of ``tensorfyBoard``; plane 2 (the empty-cell
            mask) is used for smoothing.

    Returns:
        A 1D float32 tensor with one entry per cell. Every empty cell gets +1
        added to its count (add-one smoothing) and the result is divided by
        its sum. If the smoothed counts sum to zero (no recorded moves and no
        empty cells) an all-zero tensor is returned instead of NaNs.
    """
    # Work in float32 so the target dtype matches the float32 model outputs
    # regardless of whether the stored counts are ints or floats.
    counts = torch.tensor(np.array(moves2D).flatten()).float()

    # smoothing: one pseudo-count for every empty cell
    smoothed = counts + boardTensor[2].flatten().float()

    # normalise, guarding against 0/0 which would otherwise produce NaN targets
    total = smoothed.sum()
    if total == 0:
        return torch.zeros_like(smoothed)
    return smoothed / total

# Demo only: tempMoves comes from the first self-play record, which is not
# necessarily the position stored in tempBoard, so this just exercises the
# smoothing/normalisation code path.
encodeMoves(tempMoves, tensorfyBoard(tempBoard))

## Generation

In [None]:
def processDataSet(dataSet):
    """Convert a raw ``{boardString: record}`` mapping into stacked tensors.

    Each record must provide 'moves' and 'payoff'. Returns a tuple
    ``(boards, moves, payoffs)`` where every element stacks the per-entry
    tensors along a new leading dimension, in the mapping's iteration order.
    """
    samples = []
    for boardString, record in dataSet.items():
        state = tensorfyBoard(boardString)
        target = encodeMoves(record['moves'], state)
        samples.append((state, target, torch.tensor(record['payoff'])))

    # column-wise stack: index 0 = boards, 1 = move targets, 2 = payoffs
    return tuple(torch.stack([sample[column] for sample in samples]) for column in range(3))

# Build the training tensors for both datasets and print their shapes
# (boards, moves, payoffs) as a quick consistency check.
expertBoards, expertMoves, expertPayoffs = expertTensors = processDataSet(rawExpertData)
print(*(tensor.shape for tensor in expertTensors))
selfPlayBoards, selfPlayMoves, selfPlayPayoffs = selfPlayTensors = processDataSet(rawSelfPlayData)
print(*(tensor.shape for tensor in selfPlayTensors))

## Data Augmentation

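The Hex board is symmetric, and we can use this to augment our data. The board is a rhombus, so it maps onto itself under a 180-degree rotation but not under 60-degree rotations (those are symmetries of a single hexagonal cell, not of the whole board). For each board state we can therefore generate one more equivalent state by rotating the board, together with its move frequencies, by 180 degrees, which gives us 2 times more data to train on. Reflecting the board across a diagonal is also a symmetry, provided the two colours are swapped at the same time.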

In [10]:
# TODO

# Policy Model
## Training

In [11]:
# Fresh policy model for an 11x11 board (matches the 11-row sample board).
policyModel = PolicyModel(boardSize=11)

In [None]:
def trainModel(model, criterion, boards, expected, softmax, batchSize=256, learningRate=1e-3, epochs=10):
    """Train ``model`` in place with Adam and return it.

    Args:
        model: module called as ``model(batchBoards)``.
        criterion: loss called as ``criterion(predictions, batchExpected)``.
        boards: input tensor, batched along dim 0.
        expected: target tensor aligned with ``boards`` along dim 0.
        softmax: if true, ``log_softmax`` over dim 1 is applied to the model
            output before the loss; otherwise dim 1 of the output is squeezed.
        batchSize: mini-batch size for the shuffled DataLoader.
        learningRate: Adam learning rate.
        epochs: number of passes over the data.

    The mean per-batch loss is printed after every epoch.
    """
    loader = DataLoader(TensorDataset(boards, expected), batch_size=batchSize, shuffle=True)
    optimiser = torch.optim.Adam(model.parameters(), lr=learningRate)

    def predict(inputs):
        # forward pass, post-processed according to the kind of head being trained
        rawOutput = model(inputs)
        if softmax:
            return torch.nn.functional.log_softmax(rawOutput, dim=1)
        return rawOutput.squeeze(1)

    model.train()

    for epochNumber in range(1, epochs + 1):
        runningLoss = 0.0
        for inputs, targets in loader:
            optimiser.zero_grad()

            batchLoss = criterion(predict(inputs), targets)
            runningLoss += batchLoss.item()

            # backpropagation and parameter update
            batchLoss.backward()
            optimiser.step()

        print(f'Epoch {epochNumber}: {runningLoss/len(loader):.4f}')

    return model

criterion = torch.nn.KLDivLoss(reduction='batchmean')
# arguments shared by both policy training stages
policyTrainArgs = dict(model=policyModel, criterion=criterion, softmax=True)

# stage 1: expert moves, higher learning rate
print('EXPERT')
trainModel(boards=expertBoards.float(), expected=expertMoves, learningRate=1e-2, **policyTrainArgs)
# stage 2: continue training the same model on self-play moves, lower learning rate
print('SELF-PLAY')
trainModel(boards=selfPlayBoards.float(), expected=selfPlayMoves, learningRate=1e-3, **policyTrainArgs)

## Results

In [None]:
def infer(model, boardString, softmax=None):
    """Run ``model`` on a single board string without tracking gradients.

    The board is converted with ``tensorfyBoard``, cast to float and given a
    leading batch dimension of 1 before being passed to the model. The model
    is switched to eval mode as a side effect.

    Args:
        model: module called as ``model(batch)``.
        boardString: textual board accepted by ``tensorfyBoard``.
        softmax: ``True`` to return a softmax over dim 1 of the output,
            ``False`` to return the raw output. ``None`` (default) applies the
            softmax only when the model emits more than one value per board:
            a softmax over a single value is always 1, which would hide the
            prediction of a scalar-output model.

    Returns:
        The (optionally softmaxed) model output for the one-board batch.
    """
    model.eval()
    with torch.no_grad():
        batch = tensorfyBoard(boardString).float().unsqueeze(0)
        output = model(batch)

        if softmax is None:
            # auto-detect: only multi-output heads are treated as logits
            softmax = output.dim() > 1 and output.shape[1] > 1
        if softmax:
            output = torch.nn.functional.softmax(output, dim=1)

    return output

# Show the trained policy model's output for the sample board.
infer(policyModel, tempBoard)

In [14]:
# torch.save does not create missing parent directories, so make sure the
# output folder exists before writing the policy weights.
os.makedirs('./models', exist_ok=True)
torch.save(policyModel.state_dict(), f'./models/{modelName}_policy.pth')

# Heuristic Model

## Training

In [15]:
# Fresh heuristic model for an 11x11 board; it is trained against the payoffs below.
heuristicModel = HeuristicModel(boardSize=11)

In [None]:
criterion = torch.nn.MSELoss()
# arguments shared by both heuristic training stages
heuristicTrainArgs = dict(model=heuristicModel, criterion=criterion, softmax=False, batchSize=256)

# stage 1: expert payoffs, higher learning rate
print('EXPERT')
trainModel(boards=expertBoards.float(), expected=expertPayoffs.float(), learningRate=1e-2, **heuristicTrainArgs)
# stage 2: continue training the same model on self-play payoffs, lower learning rate
print('SELF-PLAY')
trainModel(boards=selfPlayBoards.float(), expected=selfPlayPayoffs.float(), learningRate=1e-3, **heuristicTrainArgs)

## Results

In [None]:
# Query the heuristic model directly and show its raw output for the sample
# board. It is trained above as a regressor (MSE on the squeezed output), so
# its output is a value estimate rather than logits: a softmax over dim 1
# would turn a single value per board into a constant 1.
heuristicModel.eval()
with torch.no_grad():
    heuristicValue = heuristicModel(tensorfyBoard(tempBoard).float().unsqueeze(0))
heuristicValue

In [18]:
# torch.save does not create missing parent directories, so make sure the
# output folder exists before writing the heuristic weights.
os.makedirs('./models', exist_ok=True)
torch.save(heuristicModel.state_dict(), f'./models/{modelName}_heuristic.pth')