In [1]:
import sys
sys.path.append('../')
import datetime

import torch as T

from board import Connect4Board
from board2dqn import createStateTensor
from agent import Connect4Agent, calculateReward
from validation import validate
from dqn import exportOnnx

def log(message):
    """Print ``message`` to stdout, prefixed with the current local time.

    The prefix has the form ``[HH:MM:SS]`` and is followed by a single space.
    """
    stamp = datetime.datetime.now().strftime('%H:%M:%S')
    print(f"[{stamp}] {message}")

In [2]:
#
# Hyper parameters
# 
# Optimisation settings (forwarded to Connect4Agent as `lr` and `gamma`).
lr = 0.1
gamma = 0.9

# Epsilon schedule: start value, lower bound and per-step decrement
# (forwarded as `epsilon`, `epsilon_end` and `epsilon_decay`).
epsilon = 0.3
eps_min = 0.05
eps_dec = 1e-7

# Replay settings (forwarded as `batch_count`, `batch_size`, `memory_size`).
batch_count = 2
batch_size = 256
memory_size = 128_000

# Forwarded as `targetUpdateInterval`; what 0 means is decided by the agent
# implementation and is not visible in this notebook -- TODO confirm.
target_update_interval = 0

In [3]:
# Build the agent from the hyperparameters defined in the cell above.
# Every value is passed by keyword, so the mapping between the notebook
# names and the constructor's parameter names is explicit.
agent = Connect4Agent(
    lr=lr,
    epsilon=epsilon,
    epsilon_end=eps_min,
    epsilon_decay=eps_dec,
    batch_size=batch_size,
    batch_count=batch_count,
    memory_size=memory_size,
    gamma=gamma,
    targetUpdateInterval=target_update_interval,
)

In [4]:
# Load the agent from the 'connect4-30000' checkpoint.
# Skip this cell to train a freshly constructed agent instead.
agent.loadCheckpoint('connect4-30000')

Loaded checkpoint connect4-30000.


In [6]:
#
# TRAINING
#
gamesToGo = 100000

log_interval = 5000

validation_interval = 10000
validation_gamesPerPlayer = 1000
validation_procsPerPlayer = 8
validation_strength = 50

# Added to the game counter in checkpoint names only. With 0 the checkpoints
# are named after the games played in this run. When resuming from a loaded
# checkpoint (e.g. 'connect4-30000'), set it to that checkpoint's game count
# so the new checkpoints do not reuse the names of the earlier series.
checkpoint_gameOffset = 0

# Number of action indices probed with env.is_valid and the length of the
# valid-move mask stored with every transition.
columnCount = 7

lastLoggedGame = 0
games = set()     # distinct env.gameKey values seen since the last log line
allGames = set()  # distinct env.gameKey values seen during the whole run

log(f"Starting training for {gamesToGo} games.")

for game in range(1, gamesToGo+1):
    env = Connect4Board()

    next_state = createStateTensor(env)

    # Play one full game, storing a transition after every move.
    while not env.Finished:
        state = next_state
        validMoves = [a for a in range(columnCount) if env.is_valid(a)]
        action = agent.getTrainingAction(state, validMoves)
        env.move(action)
        next_state = createStateTensor(env)
        # NOTE(review): validMoves was collected before env.move(action), so
        # this mask describes `state`, not `next_state`. If store_transition
        # expects the legal moves of next_state, the mask has to be rebuilt
        # after the move -- confirm against agent.store_transition.
        validMovesMask = T.zeros(columnCount, dtype=bool)
        validMovesMask[validMoves] = True
        reward = calculateReward(env)
        agent.store_transition(state, action, next_state, validMovesMask, env.Finished, reward)

    games.add(env.gameKey)
    allGames.add(env.gameKey)
    agent.learn()

    if game % log_interval == 0:
        # `game` counts from 1, so exactly `game` games have been played in
        # total and `game - lastLoggedGame` since the previous log line.
        recentDiversity = 100 * len(games) / (game - lastLoggedGame)
        totalDiversity = 100 * len(allGames) / game
        log(f'{game} games, div: {recentDiversity:.2f} / {totalDiversity:.2f}')
        games.clear()
        lastLoggedGame = game
        agent.printStats()
    if game % validation_interval == 0:
        agent.saveCheckpoint(f'connect4-{checkpoint_gameOffset + game}')
        log('Validation:')
        validate(agent.evaluationModel, validation_gamesPerPlayer, validation_procsPerPlayer, validation_strength)

[16:55:36] Starting training for 100000 games.
[17:07:51] 5000 games, div: 99.28 / 99.28
Average loss (last 9974): 0.009782006812703215, last: 0.006781626492738724, epsilon: 0.29950129999998565
[17:20:31] 10000 games, div: 99.10 / 99.11
Average loss (last 10000): 0.011139358226303011, last: 0.007518141996115446, epsilon: 0.29900129999997127
Checkpoint 'connect4-10000' saved.
[17:20:31] Validation:
Validation with 500 games per player on 4 processes each, MCTS with 50 games.
Player 1: 417 won, 80 lost, 3 draws -> 83.40%, div: 88.20%
Player 2: 366 won, 133 lost, 1 draws -> 73.20%, div: 99.20%
[17:34:04] 15000 games, div: 99.34 / 99.01
Average loss (last 10000): 0.011432169418456033, last: 0.009589195251464844, epsilon: 0.2985012999999569
[17:45:36] 20000 games, div: 99.36 / 98.90
Average loss (last 10000): 0.010960771030560135, last: 0.01399368979036808, epsilon: 0.2980012999999425
Checkpoint 'connect4-20000' saved.
[17:45:36] Validation:
Validation with 500 games per player on 4 process

KeyboardInterrupt: 

In [5]:
# Stand-alone validation of the current evaluation model, using more games
# per player than the periodic validation inside the training loop.
validate(
    agent.evaluationModel,
    2000,  # games per player
    8,     # processes per player
    50,    # validation strength (reported as "MCTS with 50 games")
)

Validation with 2000 games per player on 8 processes each, MCTS with 50 games.
Player 1: 1572 won, 424 lost, 4 draws -> 78.60%, div: 95.10%
Player 2: 1535 won, 459 lost, 6 draws -> 76.75%, div: 97.05%


In [None]:
# Export the agent's evaluation model via dqn.exportOnnx.
# NOTE(review): 'connect4' is presumably the output name/stem of the ONNX
# file -- confirm against exportOnnx.
exportOnnx(
    agent.evaluationModel,
    'connect4',
)