In [1]:
import sys
sys.path.append('../')
import datetime
import numpy as np
from board import Connect4Board
from agent import Connect4Agent, createStateTensor, calculateReward
from validation import validate

def log(message):
    """Print ``message`` to stdout prefixed with the current local time as ``[HH:MM:SS]``."""
    timestamp = datetime.datetime.now().strftime('%H:%M:%S')
    print("[{}] {}".format(timestamp, message))

In [2]:
#
# Hyperparameters
#
# Every value in this cell is forwarded unchanged to the Connect4Agent constructor.

# Learning settings (passed as `lr` and `gamma`).
lr = 0.1
gamma = 0.9

# Exploration settings (passed as `epsilon`, `epsilon_end`, `epsilon_decay`).
# NOTE(review): the training output saved with this notebook reports
# "epsilon: 0.01", not 0.005 - either this cell was edited after that run or
# the loaded checkpoint overrides epsilon; confirm which.
epsilon = 0.005
eps_min = 0
eps_dec = 0

# Replay settings (passed as `batch_size` and `memory_size`).
batch_size = 512
memory_size = 256_000

# Passed as `targetUpdateInterval`; presumably the number of learn steps
# between target-network syncs - TODO confirm against agent.py.
target_update_interval = 64

In [3]:
# Build the agent from the hyperparameters defined in the configuration cell.
agent = Connect4Agent(
    lr=lr,
    gamma=gamma,
    epsilon=epsilon,
    epsilon_end=eps_min,
    epsilon_decay=eps_dec,
    batch_size=batch_size,
    memory_size=memory_size,
    targetUpdateInterval=target_update_interval,
)

In [None]:
# Load the agent from a previously saved checkpoint (optional warm start).
# NOTE(review): this cell has no execution count in the saved notebook, so it
# is unclear whether the recorded training run started from this checkpoint.
agent.loadCheckpoint('connect4-10000-9070')

In [5]:
#
# TRAINING
#
# Training loop: every iteration plays one complete game on a fresh board,
# stores one transition per move, and then calls agent.learn() once.
gamesToGo = 500000

# Print progress and agent statistics every `log_interval` completed games.
log_interval = 5000

# Save a checkpoint and run validation every `validation_interval` completed games.
validation_interval = 5000
validation_games = 1000  # forwarded to validate(); the saved output reports "1000 games per player"
omega = 0                # forwarded to validate(); presumably the percentage of random moves - TODO confirm

log(f"Starting training for {gamesToGo} games.")

# `game` is the number of games completed so far (1-based). Counting from 1
# keeps the logged count and the checkpoint name equal to the number of games
# actually played, avoids a checkpoint/validation run after the very first
# game, and makes the final game (game == gamesToGo) eligible for a
# checkpoint, so the end state of training is not lost.
for game in range(1, gamesToGo + 1):
    env = Connect4Board()
    next_state = createStateTensor(env)

    while not env.Finished:
        state = next_state
        action = agent.getTrainingAction(state, env.ValidMovesMask)
        env.move(action)
        next_state = createStateTensor(env)
        reward = calculateReward(env)
        # ValidMovesMask and Finished are read after the move, so they
        # describe `next_state`, not `state`.
        agent.store_transition(state, action, next_state, env.ValidMovesMask, env.Finished, reward)

    agent.learn()

    if game % log_interval == 0:
        log(f'{game} games')
        agent.printStats()
    if game % validation_interval == 0:
        agent.saveCheckpoint(f'connect4-{game}')
        log('Validation:')
        validate(agent, validation_games, omega)

# If the run length is not a multiple of the validation interval, the loop
# above never saved the final state - persist it here.
if gamesToGo % validation_interval != 0:
    agent.saveCheckpoint(f'connect4-{gamesToGo}')

[14:58:21] 270000 games
Average loss (last 5000): 0.009917308277450501, last: 0.00943911075592041, epsilon: 0.01
Checkpoint 'connect4-270000' saved.
[14:58:21] Validation:
Validation with 0% random moves for 1000 games per player.
Player 1: 997 won, 3 lost, 0 draws -> 99.70%, div: 84.70%
Player 2: 993 won, 7 lost, 0 draws -> 99.30%, div: 99.20%
[15:06:18] 275000 games
Average loss (last 5000): 0.010166908929031342, last: 0.008420916274189949, epsilon: 0.01
Checkpoint 'connect4-275000' saved.
[15:06:18] Validation:
Validation with 0% random moves for 1000 games per player.
Player 1: 1000 won, 0 lost, 0 draws -> 100.00%, div: 83.40%
Player 2: 995 won, 4 lost, 1 draws -> 99.50%, div: 98.70%
[15:13:25] 280000 games
Average loss (last 5000): 0.009880946130212396, last: 0.013797072693705559, epsilon: 0.01
Checkpoint 'connect4-280000' saved.
[15:13:25] Validation:
Validation with 0% random moves for 1000 games per player.
Player 1: 998 won, 2 lost, 0 draws -> 99.80%, div: 77.20%
Player 2: 996

: 

265000: 100.00 99.70