In [6]:
import random
import torch
import torch.nn as nn
import torch.nn.functional as F
import importlib
import tictactoe
importlib.reload(tictactoe);
from tictactoe import TicTacToe


In [7]:
#
# Hyper parameters
# 
epsilon = 0.2  # exploration rate: a move is picked at random when the uniform draw falls below this
gamma = 0.9    # discount applied to the value bootstrapped from the following move
alpha = 0.2    # step size, passed to the SGD optimizer as its learning rate

In [28]:
#
# Create the model and optimizer
# 
# Counter of training games played with this model; the training cell increments it.
games = 0

# Q-network: maps the environment's state tensor to 9 values, one per action index 0..8.
# NOTE(review): the 27 inputs presumably encode 3 features for each of the 9 cells --
# confirm against TicTacToe.stateTensor.
model = nn.Sequential(
    nn.Linear(in_features=27, out_features=81),
    nn.ReLU(),
    nn.Linear(in_features=81, out_features=9),
)

# Plain SGD over all network parameters, with alpha as the learning rate.
optimizer = torch.optim.SGD(params=model.parameters(), lr=alpha)


In [29]:
#
# Full validation
#
def validategame(env, results):
    """Exhaustively score the model's greedy play from the current position.

    Every legal move for the side to move is tried in turn. If that move ends
    the game it is tallied as a loss (win for the mover) or a draw (board full).
    Otherwise the model answers with its highest-valued legal move; if that
    answer ends the game it is tallied as a win or a draw, and if not the
    search recurses from the resulting position.

    Args:
        env: game environment; must expose ``state``, ``player``, ``opponent``,
            ``board``, ``stateTensor``, ``is_valid``, ``move``, ``is_won`` and
            ``is_full``. It is mutated during the search and put back to the
            position it had on entry after each explored branch.
        results: dict with integer counters under 'wins', 'losses' and 'draws';
            updated in place.
    """
    # Snapshot of the position on entry, used to undo each explored branch.
    saved_state = env.state
    saved_player = env.player
    saved_opponent = env.opponent

    opponent_moves = [cell for cell in range(9) if env.is_valid(cell)]
    for opponent_move in opponent_moves:
        outcome = None  # counter to bump once the game is decided; None means keep searching

        env.move(opponent_move)
        if env.is_won():
            outcome = 'losses'
        elif env.is_full():
            outcome = 'draws'
        else:
            # The model replies greedily, restricted to the cells that are still legal.
            q_values = model(env.stateTensor)
            legal = [cell for cell in range(9) if env.is_valid(cell)]
            env.move(max(legal, key=lambda cell: q_values[cell]))
            if env.is_won():
                outcome = 'wins'
            elif env.is_full():
                outcome = 'draws'

        if outcome is None:
            validategame(env, results)
        else:
            results[outcome] += 1

        # Undo this branch before trying the next candidate move.
        env.board = list(saved_state)
        env.player = saved_player
        env.opponent = saved_opponent

def _print_validation_summary(label, results):
    """Print the non-loss percentage and the wins/draws/losses tally for one side."""
    nonloss = results['wins'] + results['draws']
    total = results['losses'] + nonloss
    print(f"{label}: {100*nonloss/total:.2f}% of {total} ({results['wins']}/{results['draws']}/{results['losses']})")


def validate():
    """Exhaustively evaluate the model's greedy policy and print one summary line per side.

    Two passes are run, each on a fresh ``TicTacToe`` game:

    * "Cross":  the model makes the opening move, then ``validategame`` explores
      every line of play against it.
    * "Circle": ``validategame`` starts from the empty board, so every possible
      opening move is played against the model.

    The model is switched to eval mode for the duration and returned to train
    mode afterwards if it was training on entry, even when validation raises.
    Gradient tracking is disabled because only forward passes are needed here.
    """
    was_training = model.training
    model.eval()
    try:
        # Inference only: skip building autograd graphs for the many forward passes.
        with torch.no_grad():
            # Pass 1: the model opens with its highest-valued legal move.
            env = TicTacToe()
            q = model(env.stateTensor)
            opening = max([a for a in range(9) if env.is_valid(a)], key=lambda x: q[x])
            env.move(opening)
            results = {'wins': 0, 'losses': 0, 'draws': 0}
            validategame(env, results)
            _print_validation_summary("Cross", results)

            # Pass 2: the other side opens; all of its openings are enumerated.
            env = TicTacToe()
            results = {'wins': 0, 'losses': 0, 'draws': 0}
            validategame(env, results)
            _print_validation_summary("Circle", results)
    finally:
        if was_training:
            model.train()

# Score the current model: one exhaustive pass per side, printing the two summary lines.
validate()

Cross: 55.67% of 282 (103/54/125)
Circle: 33.49% of 851 (178/107/566)


In [None]:
#
# TRAINING
#
log_interval = 10000    # games between progress reports
num_games = 1000000     # games played by one run of this cell
move_penalty = 0.1      # constant cost subtracted from the target of every non-final move
losses = []             # one loss per game since this cell started; the logged average covers all of them
model.train()
for _ in range(num_games):
    env = TicTacToe()
    games += 1
    moves = 0

    # Play one full game with the same network choosing every move (epsilon-greedy),
    # recording the Q-values, the chosen action and the legal moves at each step.
    qstack = []

    while not (env.is_won() or env.is_full()):
        q = model(env.stateTensor)
        e = random.uniform(0, 1)

        validmoves = [a for a in range(9) if env.is_valid(a)]
        if moves == 0:
            # Halving the draw doubles the chance of a random opening move.
            e /= 2
        if e < epsilon:
            action = random.choice(validmoves)
        else:
            action = max(validmoves, key=lambda x: q[x])

        qstack.append((q, action, validmoves))
        env.move(action)
        moves += 1

    qlist = []
    targetlist = []

    # Final move of the game: its target is the terminal reward
    # (1 if that move won, 0 if the board filled up without a winner).
    # Only the taken action's entry differs from the prediction, so the
    # other outputs contribute nothing to the loss.
    (q, action, validmoves) = qstack.pop()
    targetq = q.clone().detach()
    targetq[action] = 1 if env.is_won() else 0
    qlist.append(q)
    targetlist.append(targetq)

    # Walk the remaining moves from last to first, bootstrapping each target
    # from the already-computed target vector of the move that followed it.
    while qstack:
        next_q = targetq
        nextvalidmoves = validmoves
        (q, action, validmoves) = qstack.pop()
        targetq = q.clone().detach()
        # Negated best value of the following position.
        # NOTE(review): presumably because the following move belongs to the
        # other player (negamax-style) -- confirm against TicTacToe.move.
        negated_next_best = -max([next_q[a] for a in nextvalidmoves]).item()
        targetq[action] = -move_penalty + gamma * negated_next_best
        qlist.append(q)
        targetlist.append(targetq)

    # One gradient step per game over all of its moves at once.
    qtensor = torch.stack(qlist)
    targettensor = torch.stack(targetlist)
    loss = F.mse_loss(qtensor, targettensor)

    losses.append(loss.item())

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    if games % log_interval == 0:
        print(f'{games}: average loss: {sum(losses)/len(losses)}')
        validate()
