In [1]:
import import_ipynb

import random
import matplotlib
import matplotlib.pyplot as plt
from collections import namedtuple, deque
from itertools import count
from tqdm import tqdm

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

import numpy as np

import pygame
from environment import TicTacToe3D
from qAgent import QAgent
from dumbAgent import DumbAgent

# Matplotlib setup: detect whether an inline backend is active; the IPython
# display helpers are only imported in that case.
backend_name = matplotlib.get_backend()
is_ipython = 'inline' in backend_name
if is_ipython:
    from IPython import display

# Turn on matplotlib's interactive mode.
plt.ion()

# Run tensors on CUDA when it is available, otherwise on the CPU.
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)

pygame 2.5.2 (SDL 2.28.3, Python 3.8.19)
Hello from the pygame community. https://www.pygame.org/contribute.html
importing Jupyter notebook from environment.ipynb
importing Jupyter notebook from qAgent.ipynb
importing Jupyter notebook from dumbAgent.ipynb


In [2]:
# Per-episode histories. The training loop appends one entry to each list at
# the end of every episode.
episode_scores1 = list()  # finalScore(...) computed for player 1; read by plot_scores
episode_scores2 = list()  # finalScore(...) computed for player -1
moves = list()            # number of moves played in the episode; read by plot_moves

def plot_moves(show_result=False):
    """Draw the per-episode move counts held in ``moves`` on figure 1.

    Args:
        show_result: When False (default) the figure is cleared before drawing
            and titled 'Training...'. When True the existing figure contents
            are kept and the title is 'Result'.
    """
    plt.figure(1)
    if not show_result:
        plt.clf()
    plt.title('Result' if show_result else 'Training...')
    plt.xlabel('Episode')
    plt.ylabel('Moves')

    plt.plot(moves, color='red', alpha=0.2)

    # Short pause so that the plot gets refreshed.
    plt.pause(0.001)
    if not is_ipython:
        return

    display.display(plt.gcf())
    if not show_result:
        # While training, let the next output replace this one.
        display.clear_output(wait=True)

def plot_scores(show_result=False):
    """Plot the per-episode final scores in ``episode_scores1`` on figure 1.

    This function only draws; it does not pause or display the figure, so the
    caller is expected to show it (e.g. with ``plt.show()``).

    Note: ``plot_moves`` also draws on figure number 1. When ``show_result`` is
    False each function clears the figure first; when it is True the new line
    is added on top of whatever figure 1 already contains.

    Args:
        show_result: When False (default) the figure is cleared before drawing
            and titled 'Training...'. When True the existing figure contents
            are kept and the title is 'Result'.
    """
    plt.figure(1)
    if show_result:
        plt.title('Result')
    else:
        plt.clf()
        plt.title('Training...')
    plt.xlabel('Episode')
    plt.ylabel('Final Score')

    plt.plot(episode_scores1, label='Player 1')
    # The label passed to plot() is only rendered once a legend is requested.
    plt.legend()

In [3]:
# Game environment used by the training loop.
# headless=False presumably opens the pygame window -- TODO confirm in environment.ipynb.
env = TicTacToe3D(headless=False)
# Learning agent, constructed with training=True; the loop uses it for player 1.
qAgent = QAgent(training=True)
# Opponent that moves for player -1 in the training loop.
# NOTE(review): the variable is spelled `dumpAgent` (not `dumbAgent`); the
# training loop refers to it by this exact name, so rename both together.
# depth=1 is presumably a search depth -- confirm in dumbAgent.ipynb.
dumpAgent = DumbAgent(depth=1)

In [4]:
def finalScore(t, result, player):
    """Turn an episode outcome into a scalar score for ``player``.

    Args:
        t: Move count of the episode (the training loop passes ``t + 1``).
        result: Outcome value; the training loop passes ``env.check()``.
            A value of 0 scores 0, a value equal to ``player`` is scored with
            the first formula below, anything else with the second.
        player: The player the score is computed for (1 or -1 in the loop).

    Returns:
        0 when ``result == 0``;
        ``(71.5 - 0.5 * player - t) / 64`` when ``result == player`` (smaller
        ``t`` gives a larger score);
        ``((t - 7.5 - 0.5 * player) / 56) - 1`` otherwise (larger ``t`` gives a
        larger, i.e. less negative, score).
    """
    if result == 0:
        return 0
    if result != player:
        # Outcome belongs to the other side.
        return ((t - 7.5 - 0.5 * player) / 56) - 1
    # Outcome matches this player.
    return (71.5 - 0.5 * player - t) / 64

In [5]:
# Training loop: the Q-agent plays as player 1 against `dumpAgent` (player -1).
# Every move of either player is pushed into the Q-agent's replay memory, and
# one optimisation step plus a soft target-network update is run per move.
num_episodes = 1000

for i_episode in range(num_episodes):
    # Initialize the environment and get its state
    state, possible_move = env.reset()
    env.draw_lines()
    state = torch.tensor(state, dtype=torch.float32, device=device).unsqueeze(0).view(-1, 64)

    player = 1

    # The board tensor is always multiplied by the player about to move.
    state = state * player

    for t in count():
        if player == 1:
            action = qAgent.findBestMove(board=state, possible_move=possible_move, player=player)
        else:
            # .cpu() is required: Tensor.numpy() raises for tensors that live
            # on a CUDA device, and `device` may be "cuda".
            # NOTE(review): the board handed over here is already multiplied
            # by `player` while player=-1 is passed as well -- confirm this is
            # what DumbAgent expects.
            board = state.detach().cpu().numpy().reshape(4, 4, 4)
            action = dumpAgent.findBestMove(board=board, possible_move=possible_move, player=player)
            # Identity check: `!= None` on an array-like value may be
            # evaluated elementwise.
            if action is not None:
                # Flatten (row, col) into a single index, shaped (1, 1).
                action = torch.tensor([4 * action[0] + action[1]], device=device).unsqueeze(0)

        if action is None:
            # No move was returned: end the episode with zero reward.
            reward = 0
            terminated = True
        else:
            row, col = divmod(action.item(), 4)
            # The first element returned by env.move() is not used here.
            _, observation, possible_move, reward, terminated = env.move(row, col, player)

            env.draw_figures()

        reward = torch.tensor([reward], device=device)
        done = terminated or (action is None)

        # Hand the turn to the other player.
        player = -player

        if terminated:
            next_state = None
        else:
            next_state = torch.tensor(observation * player, dtype=torch.float32, device=device).unsqueeze(0).view(-1, 64)

        # Store the transition in memory
        # NOTE(review): `action` is None on the no-move branch above; verify
        # that the replay memory / optimize_model() can handle such an entry.
        qAgent.memory.push(state, action, next_state, reward)

        # Move to the next state
        state = next_state

        # Perform one step of the optimization (on the policy network)
        qAgent.optimize_model()

        # Soft update of the target network's weights
        # θ′ ← τ θ + (1 −τ )θ′
        target_net_state_dict = qAgent.target_net.state_dict()
        policy_net_state_dict = qAgent.policy_net.state_dict()
        for key in policy_net_state_dict:
            target_net_state_dict[key] = policy_net_state_dict[key]*qAgent.TAU + target_net_state_dict[key]*(1-qAgent.TAU)
        qAgent.target_net.load_state_dict(target_net_state_dict)

        if done:
            # t is zero-based, so t + 1 is the number of moves played.
            episode_scores1.append(finalScore(t+1, env.check(), 1))
            episode_scores2.append(finalScore(t+1, env.check(), -1))
            moves.append(t+1)
            plot_scores()
            plt.show()
            plot_moves()
            break

# After training: ask the agent to save its weights, then draw the final
# score and move figures one after the other.
weights_path = "./weights/qlearn.pth"
qAgent.save_weights(weights_path)

print('Complete')
for final_plot in (plot_scores, plot_moves):
    final_plot(show_result=True)
    # Leave interactive mode before showing the finished figure.
    plt.ioff()
    plt.show()

KeyboardInterrupt: 

<Figure size 640x480 with 0 Axes>

In [None]:
# Shut pygame down once the notebook is done with the environment.
pygame.quit()

: 