In [None]:
import sys
sys.path.append('../')
import datetime
import numpy as np
from board import Connect4Board
from agent import Connect4Agent, createStateTensor, calculateReward
from validation import validate

def log(message):
    """Print ``message`` prefixed with the current local time as ``[HH:MM:SS]``."""
    timestamp = datetime.datetime.now().strftime('%H:%M:%S')
    print(f"[{timestamp}] {message}")

In [None]:
#
# Hyper parameters
#
# Every value below is handed to the Connect4Agent constructor by keyword.

# Passed as `lr`, `gamma`, `batch_size` and `memory_size`.
lr = 5e-2
gamma = 0.9
batch_size = 1024
memory_size = 256_000

# Passed as `epsilon`, `epsilon_end` and `epsilon_decay` respectively.
epsilon = 0.5
eps_min = 1e-3
eps_dec = 1e-5

In [None]:
# Running count of games played; a freshly built agent starts at zero.
games = 0

agent = Connect4Agent(
    # learning settings
    lr=lr,
    gamma=gamma,
    batch_size=batch_size,
    memory_size=memory_size,
    # epsilon settings
    epsilon=epsilon,
    epsilon_end=eps_min,
    epsilon_decay=eps_dec,
)

# Bare last expression so the notebook displays the value as the cell output.
agent.numberOfParameters

In [None]:
# Optionally resume the agent from a saved checkpoint.
#
# `resume_from_games` is the game count embedded in the checkpoint name
# ('connect4-<games>', the same pattern the training cell uses when saving).
# Set it to None to keep the freshly constructed agent and the current `games`
# counter, i.e. to train from scratch without having to skip this cell.
resume_from_games = 45000

if resume_from_games is not None:
    # Load first and only then move the counter: if loading fails
    # (presumably by raising, e.g. on a missing checkpoint), `games` must not
    # claim progress that the agent in memory does not have.
    agent.loadCheckpoint(f'connect4-{resume_from_games}')
    games = resume_from_games

In [None]:
#
# TRAINING
#
# Plays `gamesToGo` additional games with the current agent, storing every
# move as a transition and calling `agent.learn()` once per finished game.
# `games` keeps counting from its current value, so re-running this cell (or
# running it after loading a checkpoint) continues the numbering.
#
gamesToGo = 100000

# Print agent statistics every `log_interval` games.
log_interval = 5000

# Save a checkpoint and run validation every `validation_interval` games.
validation_interval = 5000
validation_games = 1000
omega = 1  # forwarded unchanged to validate(); its meaning is defined there

log(f"Starting training at {games} games for {gamesToGo} games.")

# Iterate the running game counter directly instead of using `_` as the loop
# variable: assigning to `_` at notebook top level overrides IPython's
# "last output" shortcut for the rest of the session.
for games in range(games + 1, games + gamesToGo + 1):
    env = Connect4Board()

    next_state = createStateTensor(env)

    while not env.Finished:
        state = next_state
        action = agent.getTrainingAction(state, env.ValidMovesMask)
        env.move(action)
        # Everything read from `env` from here on is taken after the move.
        next_state = createStateTensor(env)
        reward = calculateReward(env)
        agent.store_transition(state, action, next_state, env.ValidMovesMask, env.Finished, reward)

    agent.learn()

    if games % log_interval == 0:
        log(f'{games} games')
        agent.printStats()
    if games % validation_interval == 0:
        # Checkpoint before validating so the weights are already on disk if
        # validation is interrupted.
        agent.saveCheckpoint(f'connect4-{games}')
        log('Validation:')
        validate(agent, validation_games, omega)

# If the run did not end exactly on a validation boundary, the games played
# since the last periodic checkpoint would otherwise be lost.
if gamesToGo > 0 and games % validation_interval != 0:
    agent.saveCheckpoint(f'connect4-{games}')
    log(f'Saved final checkpoint at {games} games')