In [None]:
import creversi
import random

class QLearning:
    """Tabular store of Q-values keyed by ``(state, action)`` pairs."""

    def __init__(self):
        # Maps (state, action) -> value; pairs never written are simply absent.
        self.q_table = dict()

    def get_q_value(self, state, action):
        """Return the value stored for ``(state, action)``, or 0.0 if none was stored."""
        key = (state, action)
        if key in self.q_table:
            return self.q_table[key]
        return 0.0

    def update_q_value(self, state, action, value):
        """Store ``value`` for ``(state, action)``, replacing any previous entry."""
        key = (state, action)
        self.q_table[key] = value

class QAgent:
    """Epsilon-greedy agent that keeps its value estimates in a ``QLearning`` table.

    Args:
        epsilon: Probability that ``choose_action`` picks a uniformly random move.
        alpha: Learning rate; weight given to the new target in ``train``.
        gamma: Discount applied to the best next-state value in ``train``.
    """

    def __init__(self, epsilon=0.1, alpha=0.1, gamma=0.9):
        self.epsilon = epsilon
        self.alpha = alpha
        self.gamma = gamma
        self.q_learning = QLearning()

    def choose_action(self, state, legal_moves):
        """Pick a move from ``legal_moves`` for ``state``.

        With probability ``epsilon`` a random legal move is returned; otherwise
        one of the moves with the highest stored Q-value is returned, with ties
        broken at random.

        ``legal_moves`` must be a non-empty sequence; an empty one raises
        (``IndexError`` from ``random.choice`` or ``ValueError`` from ``max``).
        """
        if random.uniform(0, 1) < self.epsilon:
            return random.choice(legal_moves)
        else:
            q_values = [self.q_learning.get_q_value(state, action) for action in legal_moves]
            max_q_value = max(q_values)
            best_actions = [action for action, value in zip(legal_moves, q_values) if value == max_q_value]
            return random.choice(best_actions)

    def train(self, state, action, reward, next_state, legal_moves):
        """Apply one Q-learning update to the entry for ``(state, action)``.

        Args:
            state: State in which ``action`` was taken.
            action: Action that was taken.
            reward: Reward observed for the transition.
            next_state: State reached after the action.
            legal_moves: Actions available in ``next_state``. May be empty
                (e.g. when ``next_state`` ends the game), in which case the
                best next-state value is taken to be 0.0.
        """
        current_q_value = self.q_learning.get_q_value(state, action)
        # default=0.0 covers a next state with no actions, where a plain
        # max() over an empty sequence would raise ValueError.
        max_next_q_value = max(
            (self.q_learning.get_q_value(next_state, next_action) for next_action in legal_moves),
            default=0.0,
        )
        new_q_value = (1 - self.alpha) * current_q_value + self.alpha * (reward + self.gamma * max_next_q_value)
        self.q_learning.update_q_value(state, action, new_q_value)

    def reset_q_table(self):
        """Discard everything learned so far by installing a fresh, empty table."""
        self.q_learning = QLearning()

class RandomAgent:
    """Baseline agent that ignores the position and plays a random legal move."""

    def choose_action(self, state, legal_moves):
        """Return one element of ``legal_moves`` chosen at random; ``state`` is unused."""
        picked = random.choice(legal_moves)
        return picked

In [None]:
def print_board(board):
    """Print the string form of ``board`` to standard output."""
    rendered = str(board)
    print(rendered)

def play_game(q_agent, opponent_agent, board, first = True, verbose = True):
    """Play one game on ``board`` between ``q_agent`` and ``opponent_agent``.

    The two agents alternate moves until ``board.is_game_over()`` is true.
    A ``QAgent`` (or any agent without a ``go`` method) is asked for a move via
    ``choose_action(state, legal_moves)`` using string moves; any other agent
    is asked via ``go(board)`` and its return value is passed to ``board.move``.

    This function only plays the game: it never calls ``QAgent.train``, so the
    Q-table is not updated here.

    Args:
        q_agent: The Q-learning agent.
        opponent_agent: The agent playing against ``q_agent``.
        board: Board to play on; it is mutated in place. Expected to be a
            fresh ``creversi.Board`` so that the first mover plays black.
        first: If true, ``q_agent`` moves first (black); otherwise
            ``opponent_agent`` does.
        verbose: If true (default), print a one-line result for the game.

    Returns:
        1 if ``q_agent`` won, 2 if ``opponent_agent`` won, 0 for a draw.
    """
    current_agent = q_agent if first else opponent_agent

    while not board.is_game_over():
        # Keep the original dispatch for QAgent and for agents that provide
        # go(); fall back to choose_action() for agents such as RandomAgent.
        if isinstance(current_agent, QAgent) or not hasattr(current_agent, "go"):
            state = str(board)
            legal_moves = [creversi.move_to_str(move) for move in board.legal_moves]
            action = current_agent.choose_action(state, legal_moves)
            board.move_from_str(action)
        else:
            # Ask the agent whose turn it is, not unconditionally the opponent.
            action = current_agent.go(board)
            board.move(action)

        current_agent = opponent_agent if current_agent == q_agent else q_agent

    if first:
        firstname = "q_agent"
        secondname = "opponent_agent"
    else:
        firstname = "opponent_agent"
        secondname = "q_agent"

    # Per the creversi API, piece_num() counts the discs of the side to move
    # and opponent_piece_num() those of the other side. Map them to colours
    # via board.turn instead of via which agent is due to move (that mapping
    # is only right when q_agent is black), and count the other side directly
    # rather than as 64 - piece_num(), which miscounts when squares are empty.
    if board.turn == creversi.BLACK_TURN:
        n_black = board.piece_num()
        n_white = board.opponent_piece_num()
    else:
        n_white = board.piece_num()
        n_black = board.opponent_piece_num()

    if n_white > n_black:
        # White is whoever moved second.
        winner = 2 if first else 1
        if verbose:
            print(secondname + " white win", n_white)
    elif n_black > n_white:
        # Black is whoever moved first.
        winner = 1 if first else 2
        if verbose:
            print(firstname + " black win", n_black)
    else:
        winner = 0
        if verbose:
            print("draw")

    return winner

In [None]:
# The two contestants: an untrained QAgent and a GreedyPlayer loaded from a
# saved model file.
# NOTE(review): GreedyPlayer and device are not defined in the cells shown
# here; they must already exist in the kernel for this cell to run.
q_agent = QAgent()
opponent_agent = GreedyPlayer("/content/epsilon_greedy_model.pt", device)

# Starting position (the evaluation loop creates its own board per game).
board = creversi.Board()

# Win tallies, incremented by the evaluation loop.
opponent_agentcount, q_agentcount = 0, 0

In [None]:
# Evaluation: every round plays two games on fresh boards, one with q_agent
# moving first and one with opponent_agent moving first.
# NOTE: `round` shadows the built-in round(); the name is kept because the
# reporting cells below read it.
round = 500

# Zero the tallies here so that re-running this cell starts a new count
# instead of adding to the previous run (the reports divide by round * 2).
q_agentcount = 0
opponent_agentcount = 0

for i in range(round):
    for q_moves_first in (True, False):
        board = creversi.Board()
        result = play_game(q_agent, opponent_agent, board, q_moves_first)
        # play_game returns 1 for a q_agent win, 2 for an opponent win, 0 for a draw.
        if result == 1:
            q_agentcount += 1
        elif result == 2:
            opponent_agentcount += 1

opponent_agent white win 49
q_agent white win 47
opponent_agent white win 40
opponent_agent black win 38
q_agent black win 55
q_agent white win 37
opponent_agent white win 39
q_agent white win 46
q_agent black win 35
q_agent white win 40
opponent_agent white win 49
q_agent white win 41
opponent_agent white win 39
draw
opponent_agent white win 51
q_agent white win 51
opponent_agent white win 43
opponent_agent black win 37
opponent_agent white win 35
opponent_agent black win 36
q_agent black win 51
opponent_agent black win 36
opponent_agent white win 47
q_agent white win 54
q_agent black win 33
q_agent white win 39
opponent_agent white win 47
opponent_agent black win 40
opponent_agent white win 46
q_agent white win 36
draw
q_agent white win 46
opponent_agent white win 39
opponent_agent black win 38
opponent_agent white win 34
q_agent white win 35
opponent_agent white win 33
opponent_agent black win 33
opponent_agent white win 39
opponent_agent black win 55
q_agent black win 36
q_agent wh

In [None]:
# Report the tallies. Two games are played per round, so every game that
# neither agent won is counted as a draw.
print("q_agent win", q_agentcount)
print("opponent_agent win", opponent_agentcount)
print("draw", 2 * round - (q_agentcount + opponent_agentcount))

q_agent win 451
opponent_agent win 511
draw 38


In [None]:
# Win rates over all games played (two per round); draws count for neither side.
# `round` here is the round-count variable set in the evaluation cell.
print("q_agent winrate", q_agentcount / (2 * round))
print("opponent_agent winrate", opponent_agentcount / (2 * round))

q_agent winrate 0.451
opponent_agent winrate 0.511
