In [19]:
from pprint import pprint
import numpy as np
import numpy.ma as ma
import pandas as pd
from pettingzoo.classic import tictactoe_v3
import gym
from collections import deque
from keras.models import Sequential
from keras.layers import Dense
from tensorflow.keras.optimizers import Adam
import random

In [6]:
class DQNMarlAgent:
    """Deep Q-learning agent controlling a single named agent of a multi-agent env.

    The agent keeps an epsilon-greedy policy, a bounded replay memory and a
    small fully connected Keras network mapping a flattened observation to one
    Q-value per action.
    """

    def __init__(self, env, agent_name):
        """Read the action/observation sizes for `agent_name` from `env` and build the network.

        Args:
            env: environment exposing `action_spaces` and `observation_spaces`
                dictionaries keyed by agent name.
            agent_name: key of the agent this instance controls.
        """
        self.agent_name = agent_name
        self.action_size = env.action_spaces[agent_name].n
        self.state_size = self._calculate_state_size(env)
        self.gamma = 0.95            # discount factor for future rewards
        self.learning_rate = 0.001
        self.epsilon = 1             # start fully exploratory
        self.epsilon_decay = 0.995
        self.epsilon_min = 0.01
        self.memory = deque(maxlen=1000)  # oldest transitions are dropped first
        self.model = self.build_model()

    def _calculate_state_size(self, env):
        """Return the number of scalars in this agent's 'observation' space (product of its shape)."""
        space = env.observation_spaces[self.agent_name]['observation']
        return int(np.prod(space.shape))

    def build_model(self):
        """Build and compile the Q-network: state -> 48 tanh units -> one Q-value per action."""
        model = Sequential()
        model.add(Dense(48, input_dim=self.state_size, activation='tanh'))
        # Q-values are regression targets (reward + discounted future value) and
        # may be negative or exceed 1. A softmax head would squash them into a
        # probability distribution, so the output layer must be linear.
        model.add(Dense(self.action_size, activation='linear'))
        model.compile(
            loss='mse',
            optimizer=Adam(learning_rate=self.learning_rate)
        )

        return model

    def remember(self, state, action, reward, next_state, done):
        """Append one (state, action, reward, next_state, done) transition to replay memory."""
        self.memory.append((state, action, reward, next_state, done))

    def act(self, state, mask):
        """Choose a legal action with an epsilon-greedy policy.

        Args:
            state: network input of shape (1, state_size).
            mask: array-like of 0/1 flags; entries equal to 1 mark legal actions.

        Returns:
            Index of the chosen action. With probability `epsilon` it is drawn
            uniformly from the legal actions, otherwise it is the legal action
            with the highest predicted Q-value.
        """
        mask = np.asarray(mask)
        if random.uniform(0, 1) <= self.epsilon:
            return np.random.choice(np.where(mask == 1)[0])

        act_values = self.model.predict(state, verbose=0)[0]
        # Hide illegal actions so argmax can only land on a legal one.
        legal_values = ma.masked_array(act_values, np.logical_not(mask).astype(int))
        return legal_values.argmax()

    def replay(self, batch_size):
        """Train the network on `batch_size` transitions sampled from memory.

        Does nothing (returns None) while memory holds fewer than `batch_size`
        transitions. Q-targets for the whole minibatch are computed with two
        batched predictions instead of two predictions per sample; the network
        is then fitted with one gradient update per sampled transition.
        """
        if len(self.memory) < batch_size:
            return None

        minibatch = random.sample(self.memory, batch_size)
        states = np.vstack([transition[0] for transition in minibatch])
        actions = np.array([transition[1] for transition in minibatch], dtype=int)
        rewards = np.array([transition[2] for transition in minibatch], dtype=float)
        next_states = np.vstack([transition[3] for transition in minibatch])
        dones = np.array([transition[4] for transition in minibatch], dtype=bool)

        best_next = np.amax(self.model.predict(next_states, verbose=0), axis=1)
        # Terminal transitions have no future value: their target is the bare reward.
        action_targets = np.where(dones, rewards, rewards + self.gamma * best_next)

        # Start from the current predictions so only the taken action's
        # Q-value contributes to the loss.
        train_targets = np.array(self.model.predict(states, verbose=0), dtype=float)
        train_targets[np.arange(batch_size), actions] = action_targets
        self.model.fit(states, train_targets, batch_size=1, verbose=0)

    def adapt_epsilon(self):
        """Decay the exploration rate by `epsilon_decay`, never dropping below `epsilon_min`."""
        if self.epsilon > self.epsilon_min:
            self.epsilon = max(self.epsilon_min, self.epsilon * self.epsilon_decay)

In [15]:
# Create the tic-tac-toe environment and reset it so `env.agents` is populated.
env = tictactoe_v3.env()
env.reset()

# One DQN agent per player, keyed by the environment's agent name.
agents = {agent_name: DQNMarlAgent(env, agent_name) for agent_name in env.agents}

# Per-agent bookkeeping: -1 marks "no action taken yet"; the previous state
# starts out as an all-zero row of the agent's flattened state size.
prev_action = dict.fromkeys(agents, -1)
prev_state = {
    agent_name: np.zeros((1, agent.state_size), dtype=int)
    for agent_name, agent in agents.items()
}

In [16]:
# Sanity check: every agent should start with the -1 "no action yet" placeholder.
pprint(prev_action)

{'player_1': -1, 'player_2': -1}


In [17]:
# Sanity check: every agent should start with an all-zero (1, state_size) previous state.
pprint(prev_state)

{'player_1': array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]),
 'player_2': array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])}


In [21]:
# Self-play training: both agents take turns on the same board for 32 episodes.
batch_size = 16
game_finish_counter = 0  # counts agent terminations (one per agent per finished game)

env.reset()
for i in range(32):
    for agent in env.agent_iter():
        observation, reward, done, info = env.last()
        state = observation['observation'].reshape(1, -1)

        # `reward` and `done` describe the outcome of this agent's PREVIOUS
        # action, so the transition is (previous state, previous action) ->
        # current state. It is stored for terminal steps too; otherwise the
        # final reward would never reach the replay memory. -1 means the agent
        # has not acted yet in this episode, so there is nothing to store.
        if prev_action[agent] != -1:
            agents[agent].remember(
                prev_state[agent], prev_action[agent], reward, state, done)

        if not done:
            action = agents[agent].act(state, observation['action_mask'])
            prev_action[agent] = action
            prev_state[agent] = state

            env.step(action)
            agents[agent].replay(batch_size)
            agents[agent].adapt_epsilon()
        else:
            game_finish_counter += 1
            print(f"Game finished at episode: {i}, time: {game_finish_counter}")

            if i % 4 == 0 and i != 0:
                env.render()
            # Finished agents are stepped with None.
            env.step(None)

    env.reset()

    # Clear the per-agent bookkeeping so the next episode starts fresh.
    for agent_name in env.agents:
        prev_action[agent_name] = -1
        prev_state[agent_name] = np.zeros(
            (1, agents[agent_name].state_size), dtype=int)

Game finished at episode: 0, time: 1
Game finished at episode: 0, time: 2
Game finished at episode: 1, time: 3
Game finished at episode: 1, time: 4
Game finished at episode: 2, time: 5
Game finished at episode: 2, time: 6
Game finished at episode: 3, time: 7
Game finished at episode: 3, time: 8
Game finished at episode: 4, time: 9
     |     |     
  O  |  X  |  -  
_____|_____|_____
     |     |     
  X  |  X  |  O  
_____|_____|_____
     |     |     
  -  |  X  |  O  
     |     |     
Game finished at episode: 4, time: 10
     |     |     
  O  |  X  |  -  
_____|_____|_____
     |     |     
  X  |  X  |  O  
_____|_____|_____
     |     |     
  -  |  X  |  O  
     |     |     
Game finished at episode: 5, time: 11
Game finished at episode: 5, time: 12
Game finished at episode: 6, time: 13
Game finished at episode: 6, time: 14
Game finished at episode: 7, time: 15
Game finished at episode: 7, time: 16
Game finished at episode: 8, time: 17
     |     |     
  -  |  -  |  X  
___