In [1]:
from self_play import SelfPlayManager
from games import TicTacToe
from net import TicTacToeNet
from trainer import NeuralNetworkTrainer

# Tunable run settings, named so they are easy to locate and adjust.
DEVICE = 'cpu'
NUM_SELF_PLAY_GAMES = 100
NUM_SELF_PLAY_WORKERS = 4

# One network instance is handed to both the trainer and the self-play manager.
net = TicTacToeNet()
trainer = NeuralNetworkTrainer(net, device=DEVICE)

print("----- Self-play generation started -----")
self_play_manager = SelfPlayManager(net, TicTacToe)
# `data` is later passed to `trainer.train(...)`.
data = self_play_manager.generate_self_play(
    num_games=NUM_SELF_PLAY_GAMES,
    num_workers=NUM_SELF_PLAY_WORKERS,
)


----- Self-play generation started -----
odict_keys(['conv.0.weight', 'conv.0.bias', 'conv.2.weight', 'conv.2.bias', 'fully_connected_policy.weight', 'fully_connected_policy.bias', 'fully_connected_value.weight', 'fully_connected_value.bias'])
[SelfPlayManager] Collecting 100 games with 4 workers...


[SelfPlayManager] Self-play: 100%|████████████| 100/100 [00:02<00:00, 44.89it/s]

[SelfPlayManager] Collected 100 games.





In [2]:

# Number of passes the trainer makes over the self-play data.
NUM_EPOCHS = 10

print()
print("----- Training started -----")
trainer.train(data, epochs=NUM_EPOCHS)

# Identity check: is `trainer.net` the very same object as `net`?
# `is` states this directly; `==` only gives the same answer when the network
# class does not override `__eq__` (assumed here -- TODO confirm).
print("Are the models the same? ", net is trainer.net)
print()



----- Training started -----
[Trainer] Training started...


[Trainer] Epochs:  10%|██▌                       | 1/10 [00:00<00:01,  6.13it/s]

[Trainer] Epoch 1/10 - Loss: 36.2499, Policy Loss: 26.1861, Value Loss: 10.0638


[Trainer] Epochs:  30%|███████▊                  | 3/10 [00:00<00:00,  8.97it/s]

[Trainer] Epoch 2/10 - Loss: 35.7273, Policy Loss: 25.7536, Value Loss: 9.9737
[Trainer] Epoch 3/10 - Loss: 34.9408, Policy Loss: 24.9952, Value Loss: 9.9456
[Trainer] Epoch 4/10 - Loss: 33.7958, Policy Loss: 23.9427, Value Loss: 9.8531


[Trainer] Epochs:  50%|█████████████             | 5/10 [00:00<00:00,  9.75it/s]

[Trainer] Epoch 5/10 - Loss: 32.2187, Policy Loss: 22.5739, Value Loss: 9.6448
[Trainer] Epoch 6/10 - Loss: 31.1560, Policy Loss: 21.5180, Value Loss: 9.6380


[Trainer] Epochs:  70%|██████████████████▏       | 7/10 [00:00<00:00,  9.99it/s]

[Trainer] Epoch 7/10 - Loss: 30.5684, Policy Loss: 20.9819, Value Loss: 9.5865


[Trainer] Epochs:  90%|███████████████████████▍  | 9/10 [00:00<00:00, 10.21it/s]

[Trainer] Epoch 8/10 - Loss: 30.1418, Policy Loss: 20.4138, Value Loss: 9.7280
[Trainer] Epoch 9/10 - Loss: 29.7196, Policy Loss: 20.2368, Value Loss: 9.4828


[Trainer] Epochs: 100%|█████████████████████████| 10/10 [00:01<00:00,  9.83it/s]

[Trainer] Epoch 10/10 - Loss: 29.3545, Policy Loss: 20.0140, Value Loss: 9.3405
[Trainer] Training finished. Loss: 29.3545, Policy Loss: 20.0140, Value Loss: 9.3405
Are the models the same?  True






In [3]:
from promoter import ModelPromoter
from evaluator import ModelEvaluator

# Assigned here but not read anywhere else in this cell.
NUM_SELF_PLAY_GAMES = 100

print("----- Model evaluation and promotion started -----")
evaluation_mcts_params = {"num_simulations": 50}
evaluator = ModelEvaluator(
    game_class=TicTacToe,
    mcts_params=evaluation_mcts_params,
)
promoter = ModelPromoter(
    model_dir="models",
    evaluator=evaluator,
    net_class=TicTacToeNet,
)

# The returned model is not used further in this cell; the call is kept as-is.
best_net = promoter.get_best_model()
candidate_metadata = {"episode": 0}
win_rate, metrics = promoter.evaluate_and_maybe_promote(
    net,
    metadata=candidate_metadata,
)

print()
print("----- Evaluation complete -----")
# Report the win rate and the metrics object returned by the promoter.
print(f"[Summary] Win rate: {win_rate:.2%} | Metrics: {metrics}")

----- Model evaluation and promotion started -----


[Evaluator] Evaluating: 100%|███████████████████| 20/20 [00:00<00:00, 28.54it/s]

[Evaluator]: Candidate Win Rate: 25.00% (W:5 L:14 D:1)
[Promoter]: ❌ Candidate rejected (win rate: 25.00%)

----- Evaluation complete -----
[Summary] Win rate: 25.00% | Metrics: {'wins': 5, 'losses': 14, 'draws': 1, 'total': 20, 'win_rate': 0.25}



