In [1]:
import tensorflow as tf
import numpy as np
import random
from collections import deque
#import sys
#import psutil

class Game:
    """Turn-based "ladder" game in which players climb capacity-limited stages.

    Every player starts on stage 0. On each move a player either stays or
    climbs exactly one stage. After all players have moved, each player earns
    ``stage_rewards[stage]`` for the stage it stands on. Stage ``k`` holds at
    most ``stage_capacity[k]`` players: a player climbing onto a full stage
    pushes one randomly chosen occupant back down to stage 0.

    Attributes:
        n_players: number of players in the game.
        players_position: list with the current stage index of every player.
        players_score: integer NumPy array with the accumulated score of
            every player.
        stage_rewards: per-move reward of each stage.
        stage_capacity: maximum number of players each stage can hold.
        max_moves: number of moves left before the game ends.
        number_of_pushes: how many times a player has been pushed down since
            the last reset.
    """

    def __init__(self, n_players=5):
        """Create a game for ``n_players`` players in its initial state."""
        self.n_players = n_players
        self.reset()

    def reset(self):
        """Put all players on stage 0 with zero score and restore the move budget."""
        self.players_position = [0] * self.n_players
        # Stored as an integer array from the start so the score keeps a single
        # type for the whole game (step() accumulates it with NumPy addition).
        self.players_score = np.zeros(self.n_players, dtype=int)
        self.stage_rewards = [1, 2, 13]
        self.stage_capacity = [self.n_players, 3, 1]
        self.max_moves = 10
        self.number_of_pushes = 0

    def get_state(self):
        """Return a snapshot of the game.

        Returns:
            dict with keys ``'players_position'`` (list of stage indices),
            ``'players_score'`` (integer array of scores) and ``'time_left'``
            (moves remaining). The containers are copies, so a snapshot kept
            by the caller is not altered by later calls to ``step``.
        """
        return {
            'players_position': list(self.players_position),
            'players_score': np.array(self.players_score),
            'time_left': self.max_moves,
        }

    def player_step(self, action=0, player_index=0):
        """Apply one player's action.

        Args:
            action: 0 to stay; any other value to climb one stage.
            player_index: index of the acting player.

        A player already on the top stage stays there. If the target stage is
        full, one of its occupants (chosen uniformly at random) is sent back to
        stage 0 and ``number_of_pushes`` is incremented.
        """
        if action == 0:
            return

        current_stage = self.players_position[player_index]
        if current_stage == len(self.stage_rewards) - 1:
            # Already on the top stage: nowhere to climb.
            return

        target_stage = current_stage + 1
        occupants = np.where(np.array(self.players_position) == target_stage)[0]
        if len(occupants) >= self.stage_capacity[target_stage]:
            self.number_of_pushes += 1
            whom_to_push = int(np.random.choice(occupants))
            self.players_position[whom_to_push] = 0
        self.players_position[player_index] = target_stage

    def step(self, actions_list):
        """Advance the game by one move.

        Players act sequentially in index order, so a later player can push an
        earlier one that climbed during the same move.

        Args:
            actions_list: one action per player (see ``player_step``).

        Returns:
            (state, reward, is_game_end): the snapshot from ``get_state``, the
            list of rewards earned by each player on this move, and whether
            the move budget is exhausted.
        """
        for player_index, action in enumerate(actions_list):
            self.player_step(action, player_index=player_index)

        reward = [self.stage_rewards[stage] for stage in self.players_position]
        self.players_score = np.array(self.players_score) + np.array(reward)

        self.max_moves -= 1
        # `<=` rather than `==` so the game keeps reporting "ended" if it is
        # stepped again after the budget has run out.
        is_game_end = self.max_moves <= 0

        return self.get_state(), reward, is_game_end

class DQNAgent:
    """Epsilon-greedy deep Q-learning agent with an experience-replay buffer.

    Attributes:
        state_size: length of the state vector fed to the network.
        action_size: number of discrete actions (network outputs).
        memory: bounded replay buffer of
            ``(state, action, reward, next_state, done)`` transitions.
        gamma: discount factor applied to the estimated future value.
        epsilon: current probability of choosing a random action.
        epsilon_min: once ``epsilon`` is no longer above this value it stops
            being decayed.
        epsilon_decay: factor ``epsilon`` is multiplied by after each replay.
        model: Keras network mapping a state to one Q-value per action.
    """

    def __init__(self, state_size, action_size):
        """Create an agent with an empty replay buffer and a fresh network."""
        self.state_size = state_size
        self.action_size = action_size
        self.memory = deque(maxlen=50_000)
        self.gamma = 0.95
        self.epsilon = 1.0
        self.epsilon_min = 0.001
        self.epsilon_decay = 0.99
        self.model = self._build_model()

    def _build_model(self):
        """Build and compile the Q-network.

        Four hidden ReLU layers of 32 units followed by a linear output layer
        with one unit per action, trained with MSE loss and the Adam optimizer.
        """
        model = tf.keras.models.Sequential()
        model.add(tf.keras.layers.Dense(32, input_dim=self.state_size, activation='relu'))
        model.add(tf.keras.layers.Dense(32, activation='relu'))
        model.add(tf.keras.layers.Dense(32, activation='relu'))
        model.add(tf.keras.layers.Dense(32, activation='relu'))
        model.add(tf.keras.layers.Dense(self.action_size, activation='linear'))
        model.compile(loss='mse', optimizer=tf.keras.optimizers.Adam())
        return model

    def remember(self, state, action, reward, next_state, done):
        """Append one transition to the replay buffer."""
        self.memory.append((state, action, reward, next_state, done))

    def act(self, state):
        """Choose an action for ``state`` with an epsilon-greedy policy.

        Args:
            state: array of shape ``(1, state_size)``.

        Returns:
            A random action index with probability ``epsilon``, otherwise the
            index of the largest predicted Q-value.
        """
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size)
        act_values = self.model.predict(state, verbose=0)
        return np.argmax(act_values[0])

    def replay(self, batch_size):
        """Train the network on a random minibatch from the replay buffer.

        For every sampled transition the target of the taken action is
        ``reward`` when the transition is terminal and
        ``reward + gamma * max_a Q(next_state, a)`` otherwise; targets of the
        other actions are the network's current predictions. ``epsilon`` is
        decayed once per successful call.

        Args:
            batch_size: number of transitions to sample. If the buffer holds
                fewer transitions than this, the call does nothing.
        """
        if len(self.memory) < batch_size:
            return

        minibatch = random.sample(self.memory, batch_size)

        states = np.zeros((batch_size, self.state_size))
        next_states = np.zeros((batch_size, self.state_size))
        actions = np.zeros(batch_size, dtype=int)
        rewards = np.zeros(batch_size)
        dones = np.zeros(batch_size, dtype=bool)
        for i, (state, action, reward, next_state, done) in enumerate(minibatch):
            states[i] = np.reshape(state, self.state_size)
            next_states[i] = np.reshape(next_state, self.state_size)
            actions[i] = action
            rewards[i] = reward
            dones[i] = done

        # Two batched forward passes instead of two predict() calls per sample.
        targets = np.array(self.model.predict(states, verbose=0), dtype=float)
        q_next = np.asarray(self.model.predict(next_states, verbose=0), dtype=float)

        # Terminal transitions get the bare reward; the others are bootstrapped.
        updated = np.where(dones, rewards, rewards + self.gamma * q_next.max(axis=1))
        targets[np.arange(batch_size), actions] = updated

        self.model.fit(states, targets, epochs=1, verbose=0)
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay
            
# Seed every random number generator used below (Python, NumPy, TensorFlow)
# so that a Restart & Run All reproduces the same training run.
SEED = 0
random.seed(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)


def encode_state(state, state_size):
    """Flatten a game state dict into a ``(1, state_size)`` network input.

    The vector is the concatenation of the players' positions, the players'
    scores and the number of moves left, in that order.
    """
    values = np.concatenate([np.array(state['players_position']),
                             np.array(state['players_score']),
                             np.array([state['time_left']])])
    return np.reshape(values, [1, state_size])


# Initialize game and agents
n_players = 6
game = Game(n_players)
initial_state = game.get_state()
state_size = (len(initial_state['players_position'])
              + len(initial_state['players_score'])
              + 1)  # +1 for 'time_left'
action_size = 2
agents = [DQNAgent(state_size, action_size) for _ in range(n_players)]
batch_size = 64
epochs = 3500

# The log file starts with the game configuration; game states are appended
# to it during training.
dumb_filename = f'n_players-{n_players}epochs-{epochs}ledderlen-{len(game.stage_rewards)}movespergame-{game.max_moves}.txt'
with open(dumb_filename, "w") as f:
    f.write(f'stage_rewards = {game.stage_rewards}\n')
    f.write(f'stage_capacity = {game.stage_capacity}\n')

# Train the agents
for epoch in range(epochs):
    game.reset()
    state = encode_state(game.get_state(), state_size)

    done = False
    while not done:
        # Every agent observes the same global state and picks its own action.
        actions = [agent.act(state) for agent in agents]
        next_state, reward, done = game.step(actions)
        next_state = encode_state(next_state, state_size)

        # Each agent stores the shared transition with its own action and reward.
        for i, agent in enumerate(agents):
            agent.remember(state, actions[i], reward[i], next_state, done)
        state = next_state

        # Log every move of every 25th game.
        if epoch % 25 == 0:
            with open(dumb_filename, "a") as f:
                f.write(f'{game.get_state()}, number_of_pushes = {game.number_of_pushes}\n')

    # One replay (training) pass per agent after each finished game, once the
    # agent has collected more than a batch of transitions.
    for agent in agents:
        if len(agent.memory) > batch_size:
            agent.replay(batch_size)

    print(f'Epoch {epoch+1} is done!')

print("Finished Successfully!")

2023-06-25 22:54:24.208125: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0
2023-06-25 22:54:24.677137: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcuda.so.1
2023-06-25 22:54:24.684097: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-06-25 22:54:24.684162: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1733] Found device 0 with properties: 
pciBusID: 0000:01:00.0 name: NVIDIA GeForce RTX 3090 computeCapability: 8.6
coreClock: 1.725GHz coreCount: 82 deviceMemorySize: 23.69GiB deviceMemoryBandwidth: 871.81GiB/s
2023-06-25 22:54:24.684171: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0
2023-06-25 22:54:24.686094: I tensorflow/stream_executor/

Epoch 1 is done!
Epoch 2 is done!
Epoch 3 is done!
Epoch 4 is done!
Epoch 5 is done!
Epoch 6 is done!


2023-06-25 22:54:25.250428: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:176] None of the MLIR Optimization Passes are enabled (registered 2)
2023-06-25 22:54:25.250629: I tensorflow/core/platform/profile_utils/cpu_utils.cc:114] CPU Frequency: 3609600000 Hz
2023-06-25 22:54:25.284327: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcublas.so.11
2023-06-25 22:54:25.770785: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcublasLt.so.11
2023-06-25 22:54:25.770822: I tensorflow/stream_executor/cuda/cuda_blas.cc:1838] TensorFloat-32 will be used for the matrix multiplication. This will only be logged once.


In [None]:
# Save every agent's network weights to its own file. The file-name prefix is
# taken from `epochs` (instead of a hard-coded 3500) so it always matches the
# number of training epochs that was actually configured.
for i, agent in enumerate(agents):
    agent.model.save_weights(f"{epochs}_model_w.a{i}")

In [None]:
# Save the network architecture as JSON. All agents are built by the same
# DQNAgent._build_model with the same sizes, so the first agent's architecture
# is written once. The file-name prefix follows `epochs` rather than a
# hard-coded 3500.
model_json = agents[0].model.to_json()
with open(f"{epochs}_model.json", "w") as json_file:
    json_file.write(model_json)

In [None]:
# Install TensorFlow pinned to version 2.5.0 into the kernel's environment.
# NOTE(review): this cell comes after the cell that imports tensorflow; on a
# fresh environment it presumably has to be run first - confirm and consider
# moving it to the top of the notebook.
%pip install tensorflow==2.5.0

In [None]:
# Install NumPy pinned to version 1.19.5 into the kernel's environment.
# NOTE(review): presumably pinned for compatibility with the TensorFlow
# version installed by this notebook - confirm. Like the other install cells,
# it sits after the cell that imports numpy.
%pip install numpy==1.19.5

In [None]:
# Upgrade pip itself in the kernel's environment.
# NOTE(review): presumably meant to run before the other %pip install cells,
# although it is placed after them - confirm the intended order.
%pip install --upgrade pip