In [1]:
import sys
sys.path.append('../')
import datetime

import torch as T

from board import Connect4Board
from board2dqn import createStateTensor
from agent import Connect4Agent, calculateReward
from validation import validate
from dqn import exportOnnx

def log(message):
    """Print `message` to stdout, prefixed with the current local time as [HH:MM:SS]."""
    timestamp = datetime.datetime.now().strftime('%H:%M:%S')
    print(f"[{timestamp}] {message}")

In [2]:
#
# Hyper parameters
#
# Every value below is forwarded to the Connect4Agent constructor in a later
# cell; the keyword it is passed as is noted next to each one.

# Optimisation
lr = 0.05                   # -> lr
gamma = 0.9                 # -> gamma

# Exploration
# NOTE(review): the starting epsilon (0.01) is below eps_min (0.1) and the
# decay is 0 -- confirm how the agent treats this combination.
epsilon = 0.01              # -> epsilon
eps_min = 0.1               # -> epsilon_end
eps_dec = 0                 # -> epsilon_decay

# Replay / batching
batch_count = 4             # -> batch_count
batch_size = 512            # -> batch_size
memory_size = 64_000        # -> memory_size

# Target network
target_update_interval = 0  # -> targetUpdateInterval

In [12]:
# Build the agent from the hyper parameters defined in the configuration cell.
agent = Connect4Agent(
    lr=lr,
    epsilon=epsilon,
    epsilon_end=eps_min,
    epsilon_decay=eps_dec,
    batch_size=batch_size,
    batch_count=batch_count,
    memory_size=memory_size,
    gamma=gamma,
    targetUpdateInterval=target_update_interval,
)

In [13]:
# Restore the agent's state from the checkpoint named "connect4-d".
agent.loadCheckpoint('connect4-d')

Loaded checkpoint connect4-d.


In [15]:
#
# TRAINING
#
# Plays `gamesToGo` games. Every move is stored as a transition in the agent,
# and agent.learn() is called once after each finished game. Progress is
# logged every `log_interval` games; every `validation_interval` games a
# checkpoint is saved and validate() is run.
gamesToGo = 20000

log_interval = 5000

validation_interval = 10000
validation_games = 1000
omega = 0  # third argument to validate(); presumably the random-move percentage -- TODO confirm

lastLoggedGame = 0
games = set()     # distinct game keys seen since the last log line
allGames = set()  # distinct game keys seen during the whole run

log(f"Starting training for {gamesToGo} games.")

for game in range(1, gamesToGo+1):
    env = Connect4Board()

    next_state = createStateTensor(env)

    while not env.Finished:
        state = next_state
        # Moves that are legal *before* the agent acts.
        validMoves = [a for a in range(7) if env.is_valid(a)]
        action = agent.getTrainingAction(state, validMoves)
        env.move(action)
        next_state = createStateTensor(env)
        # Legality mask for the position *after* the move, stored with the transition.
        validMovesMask = T.tensor([env.is_valid(a) for a in range(7)], dtype=T.bool)
        reward = calculateReward(env)
        agent.store_transition(state, action, next_state, validMovesMask, env.Finished, reward)

    games.add(env.gameKey)
    allGames.add(env.gameKey)
    agent.learn()

    if game % log_interval == 0:
        # `game` is 1-based, so it already equals the number of games played;
        # the denominators must not add another 1.
        recentDiversity = 100 * len(games) / (game - lastLoggedGame)
        totalDiversity = 100 * len(allGames) / game
        log(f'{game} games, div: {recentDiversity:.2f} / {totalDiversity:.2f}')
        games.clear()
        lastLoggedGame = game
        agent.printStats()
    if game % validation_interval == 0:
        agent.saveCheckpoint(f'connect4-{game}')
        log('Validation:')
        # NOTE(review): the standalone validation cell passes agent.evaluationModel
        # rather than the agent itself -- confirm validate() accepts both.
        validate(agent, validation_games, omega)

[15:42:43] Starting training for 20000 games.


  validMovesMask = T.tensor([env.is_valid(a) for a in range(7)], dtype=bool)


KeyboardInterrupt: 

a: 0.01042946996674873 100.00 99.80  
b: 0.0.0030970556917702197 99.7 100  


In [5]:
# Run validation on the agent's evaluation model with the literal arguments 1000 and 0
# (per the recorded output: 1000 games per player, 0% random moves).
validate(agent.evaluationModel, 1000, 0)

Validation with 0% random moves for 1000 games per player.
Player 1: 995 won, 4 lost, 1 draws -> 99.50%, div: 82.80%
Player 2: 999 won, 1 lost, 0 draws -> 99.90%, div: 90.70%
