In [9]:
from pprint import pprint
import numpy as np
import numpy.ma as ma
import pandas as pd
import gym
import random
from collections import deque
###########################################
import warnings
warnings.filterwarnings("ignore")
###########################################
from pettingzoo.classic import tictactoe_v3
from keras.models import Sequential
from keras.layers import Dense
from tensorflow.keras.optimizers import Adam

In [52]:
class DQNMarlAgent:
    """Deep Q-learning agent controlling one player of a multi-agent environment.

    The agent keeps a bounded replay memory of transitions, an epsilon-greedy
    exploration schedule, and a small fully connected network that maps a
    state row vector to one Q-value per action.
    """

    def __init__(self, env, agent_name):
        """Set hyper-parameters and build the Q-network.

        Args:
            env: environment exposing ``action_spaces[agent_name].n``.
            agent_name: key of the player this agent controls.
        """
        self.agent_name = agent_name
        self.action_size = env.action_spaces[agent_name].n
        # Length of the flattened state vector fed to the network; the rest of
        # the notebook builds states of shape (1, state_size).
        self.state_size = 9
        self.gamma = 0.95            # discount factor for future rewards
        self.learning_rate = 0.01
        self.epsilon = 1             # start fully exploratory
        self.epsilon_decay = 0.995
        self.epsilon_min = 0.01
        self.memory = deque(maxlen=1000)
        self.model = self.build_model()

    def build_model(self):
        """Build and compile the Q-network.

        Returns:
            A compiled Keras ``Sequential`` model with ``state_size`` inputs
            and ``action_size`` linear outputs.
        """
        model = Sequential()
        # relu is the most common activation function right now
        model.add(Dense(32, input_dim=self.state_size, activation='relu'))
        # Q-values are unbounded regression targets (reward + gamma * max Q),
        # so the output layer must be linear. A softmax would force outputs
        # into (0, 1) summing to 1, which an MSE fit on such targets cannot
        # satisfy.
        model.add(Dense(self.action_size, activation='linear'))
        model.compile(
            loss='mse',
            optimizer=Adam(learning_rate=self.learning_rate)
        )

        return model

    def remember(self, state, action, reward, next_state, done):
        """Append one transition to the replay memory (oldest entries drop off)."""
        self.memory.append((state, action, reward, next_state, done))

    def act(self, state, mask):
        """Pick an action epsilon-greedily among the legal ones.

        Args:
            state: network input of shape (1, state_size).
            mask: array-like of 0/1 flags, 1 marking a legal action.

        Returns:
            int: index of the chosen legal action.

        Raises:
            ValueError: if ``mask`` marks no action as legal.
        """
        # Coerce first: a plain list compared with ``== 1`` yields a single
        # bool instead of an element-wise mask.
        legal = np.asarray(mask).astype(bool)
        if not legal.any():
            raise ValueError('action mask contains no legal action')

        if random.uniform(0, 1) <= self.epsilon:
            # Explore: uniform choice over the legal action indices.
            return int(np.random.choice(np.flatnonzero(legal)))

        # Exploit: best predicted Q-value, illegal actions pushed to -inf.
        q_values = self.model.predict(state, verbose=0)[0]
        return int(np.argmax(np.where(legal, q_values, -np.inf)))

    def replay(self, batch_size):
        """Fit the network on a random minibatch drawn from the replay memory.

        Does nothing (returns ``None``) while fewer than ``batch_size``
        transitions are stored.

        Args:
            batch_size: number of transitions to sample and train on.
        """
        if len(self.memory) < batch_size:
            return None
        minibatch = random.sample(self.memory, batch_size)
        for state, action, reward, next_state, done in minibatch:
            # Terminal transitions have no future value to bootstrap from.
            target = reward
            if not done:
                next_q = self.model.predict(next_state, verbose=0)[0]
                target = reward + self.gamma * np.amax(next_q)

            # Only the taken action's Q-value is moved towards the target;
            # the other outputs keep their current predictions.
            train_target = self.model.predict(state, verbose=0)
            train_target[0][action] = target
            self.model.fit(state, train_target, verbose=0)

    def adapt_epsilon(self):
        """Decay the exploration rate by one step, never going below ``epsilon_min``."""
        if self.epsilon > self.epsilon_min:
            self.epsilon = max(
                self.epsilon_min, self.epsilon * self.epsilon_decay
            )

In [21]:
def init_env():
    """Create the tic-tac-toe environment and per-agent training bookkeeping.

    Returns:
        tuple: ``(env, agents, prev_action, prev_state)`` where the last three
        are dicts keyed by agent name holding, respectively, a fresh
        ``DQNMarlAgent``, the placeholder action ``-1``, and an all-zero
        integer state of shape ``(1, state_size)``.
    """
    env = tictactoe_v3.env()
    # The environment is reset before env.agents is read
    # (presumably the agent list is only populated by reset -- TODO confirm).
    env.reset()

    # One learner per player, then the "nothing happened yet" placeholders.
    agents = {name: DQNMarlAgent(env, name) for name in env.agents}
    prev_action = {name: -1 for name in agents}
    prev_state = {
        name: np.zeros((1, learner.state_size), dtype=int)
        for name, learner in agents.items()
    }

    return env, agents, prev_action, prev_state

In [32]:
def encode_state(state):
    """Collapse a two-channel board observation into a single row vector.

    Channel 1 is subtracted from channel 0 element-wise and the result is
    flattened to shape ``(1, rows * cols)``.
    (Presumably channel 0 holds the current player's marks and channel 1 the
    opponent's -- TODO confirm against the environment docs.)
    """
    first_plane = state[:, :, 0]
    second_plane = state[:, :, 1]
    signed_board = first_plane - second_plane
    return np.reshape(signed_board, (1, -1))

In [54]:
# Self-play training: both agents learn from their own transitions.
batch_size = 4
game_finish_counter = 0

env, agents, prev_action, prev_state = init_env()

for i in range(1500):
    print(f'game{i}')
    for agent in env.agent_iter():
        # NOTE(review): the 4-tuple return matches the PettingZoo version this
        # notebook was written against -- confirm before upgrading.
        observation, reward, done, info = env.last()

        # Only show the board for the last few games.
        if i > 1490:
            env.render()

        state = encode_state(observation['observation'])

        # `reward` and `state` are the outcome of this agent's PREVIOUS move,
        # so the transition is (prev_state, prev_action) -> (reward, state).
        # It is stored on terminal turns too: game-ending observations carry
        # the final reward, and without them the memory would contain no
        # terminal transitions at all.
        # prev_action == -1 marks "no move made yet this game".
        if prev_action[agent] != -1:
            agents[agent].remember(
                prev_state[agent], prev_action[agent], reward, state, done)
            agents[agent].replay(batch_size)

        if done:
            # A finished agent must step with None.
            env.step(None)
            continue

        # choose action among the legal moves and play it
        action = agents[agent].act(state, observation['action_mask'])
        env.step(action)

        prev_state[agent] = state
        prev_action[agent] = action
        agents[agent].adapt_epsilon()

    game_finish_counter += 1
    env.reset()
    print('game_finished')
    # Clear the per-game bookkeeping so the next game starts without a
    # pending transition.
    for agent_name in env.agents:
        prev_action[agent_name] = -1
        prev_state[agent_name] = np.zeros(
            (1, agents[agent_name].state_size), dtype=int)

game0
game_finished
game1
game_finished
game2
game_finished
game3
game_finished
game4
game_finished
game5
game_finished
game6
game_finished
game7
game_finished
game8
game_finished
game9
game_finished
game10
game_finished
game11
game_finished
game12
game_finished
game13
game_finished
game14
game_finished
game15
game_finished
game16
game_finished
game17
game_finished
game18
game_finished
game19
game_finished
game20
game_finished
game21
game_finished
game22
game_finished
game23
game_finished
game24
game_finished
game25
game_finished
game26
game_finished
game27
game_finished
game28
game_finished
game29
game_finished
game30
game_finished
game31
game_finished
game32
game_finished
game33
game_finished
game34
game_finished
game35
game_finished
game36
game_finished
game37
game_finished
game38
game_finished
game39
game_finished
game40
game_finished
game41
game_finished
game42
game_finished
game43
game_finished
game44
game_finished
game45
game_finished
game46
game_finished
game47
game_finished
ga

game_finished
game379
game_finished
game380
game_finished
game381
game_finished
game382
game_finished
game383
game_finished
game384
game_finished
game385
game_finished
game386
game_finished
game387
game_finished
game388
game_finished
game389
game_finished
game390
game_finished
game391
game_finished
game392
game_finished
game393
game_finished
game394
game_finished
game395
game_finished
game396
game_finished
game397
game_finished
game398
game_finished
game399
game_finished
game400
game_finished
game401
game_finished
game402
game_finished
game403
game_finished
game404
game_finished
game405
game_finished
game406
game_finished
game407
game_finished
game408
game_finished
game409
game_finished
game410
game_finished
game411
game_finished
game412
game_finished
game413
game_finished
game414
game_finished
game415
game_finished
game416
game_finished
game417
game_finished
game418
game_finished
game419
game_finished
game420
game_finished
game421
game_finished
game422
game_finished
game423
game_finis

game_finished
game752
game_finished
game753
game_finished
game754
game_finished
game755
game_finished
game756
game_finished
game757
game_finished
game758
game_finished
game759
game_finished
game760
game_finished
game761
game_finished
game762
game_finished
game763
game_finished
game764
game_finished
game765
game_finished
game766
game_finished
game767
game_finished
game768
game_finished
game769
game_finished
game770
game_finished
game771
game_finished
game772
game_finished
game773
game_finished
game774
game_finished
game775
game_finished
game776
game_finished
game777
game_finished
game778
game_finished
game779
game_finished
game780
game_finished
game781
game_finished
game782
game_finished
game783
game_finished
game784
game_finished
game785
game_finished
game786
game_finished
game787
game_finished
game788
game_finished
game789
game_finished
game790
game_finished
game791
game_finished
game792
game_finished
game793
game_finished
game794
game_finished
game795
game_finished
game796
game_finis

game_finished
game1119
game_finished
game1120
game_finished
game1121
game_finished
game1122
game_finished
game1123
game_finished
game1124
game_finished
game1125
game_finished
game1126
game_finished
game1127
game_finished
game1128
game_finished
game1129
game_finished
game1130
game_finished
game1131
game_finished
game1132
game_finished
game1133
game_finished
game1134
game_finished
game1135
game_finished
game1136
game_finished
game1137
game_finished
game1138
game_finished
game1139
game_finished
game1140
game_finished
game1141
game_finished
game1142
game_finished
game1143
game_finished
game1144
game_finished
game1145
game_finished
game1146
game_finished
game1147
game_finished
game1148
game_finished
game1149
game_finished
game1150
game_finished
game1151
game_finished
game1152
game_finished
game1153
game_finished
game1154
game_finished
game1155
game_finished
game1156
game_finished
game1157
game_finished
game1158
game_finished
game1159
game_finished
game1160
game_finished
game1161
game_finish

game_finished
game1476
game_finished
game1477
game_finished
game1478
game_finished
game1479
game_finished
game1480
game_finished
game1481
game_finished
game1482
game_finished
game1483
game_finished
game1484
game_finished
game1485
game_finished
game1486
game_finished
game1487
game_finished
game1488
game_finished
game1489
game_finished
game1490
game_finished
game1491
     |     |     
  -  |  -  |  -  
_____|_____|_____
     |     |     
  -  |  -  |  -  
_____|_____|_____
     |     |     
  -  |  -  |  -  
     |     |     
     |     |     
  -  |  -  |  -  
_____|_____|_____
     |     |     
  -  |  -  |  -  
_____|_____|_____
     |     |     
  X  |  -  |  -  
     |     |     
     |     |     
  -  |  O  |  -  
_____|_____|_____
     |     |     
  -  |  -  |  -  
_____|_____|_____
     |     |     
  X  |  -  |  -  
     |     |     
     |     |     
  -  |  O  |  -  
_____|_____|_____
     |     |     
  -  |  X  |  -  
_____|_____|_____
     |     |     
  X  |  -  |  -  
  

     |     |     
  -  |  -  |  -  
_____|_____|_____
     |     |     
  -  |  X  |  -  
_____|_____|_____
     |     |     
  -  |  -  |  -  
     |     |     
     |     |     
  -  |  O  |  -  
_____|_____|_____
     |     |     
  -  |  X  |  -  
_____|_____|_____
     |     |     
  -  |  -  |  -  
     |     |     
     |     |     
  -  |  O  |  -  
_____|_____|_____
     |     |     
  -  |  X  |  -  
_____|_____|_____
     |     |     
  X  |  -  |  -  
     |     |     
     |     |     
  -  |  O  |  -  
_____|_____|_____
     |     |     
  -  |  X  |  -  
_____|_____|_____
     |     |     
  X  |  -  |  O  
     |     |     
     |     |     
  -  |  O  |  X  
_____|_____|_____
     |     |     
  -  |  X  |  -  
_____|_____|_____
     |     |     
  X  |  -  |  O  
     |     |     
     |     |     
  -  |  O  |  X  
_____|_____|_____
     |     |     
  -  |  X  |  -  
_____|_____|_____
     |     |     
  X  |  -  |  O  
     |     |     
game_finished
game1499
     

In [55]:
# Write each trained Q-network to its own HDF5 file.
for player_key, model_path in (('player_1', 'agent1.h5'),
                               ('player_2', 'agent2.h5')):
    agents[player_key].model.save(model_path)