In [1]:
import gymnasium as gym
import tensorflow as tf
import numpy as np
# Print NumPy floats with three digits after the decimal point.
np.set_printoptions(precision=3)

# Seed the global NumPy and TensorFlow random generators with a fixed value.
np.random.seed(42)
tf.random.set_seed(42)

import mahjong_env

In [32]:
# Instantiate the environment registered under the id "Mahjong-v0".
# NOTE(review): the id is presumably registered as a side effect of
# `import mahjong_env` -- confirm against that package.
env = gym.make("Mahjong-v0")

In [53]:
# Size of the flattened observation vector fed to the network.
input_shape = [108]
# One output unit per action index; the policy takes the argmax over these.
n_outputs = 27

# Q-network: two 64-unit ELU hidden layers followed by a linear output layer
# that emits one Q-value per action.
model = tf.keras.Sequential(
    layers=[
        tf.keras.Input(shape=input_shape),
        tf.keras.layers.Dense(units=64, activation="elu"),
        tf.keras.layers.Dense(units=64, activation="elu"),
        tf.keras.layers.Dense(units=n_outputs),
    ]
)

In [85]:
def epsilon_greedy_policy(observation, epsilon):
    """Choose an action index with an epsilon-greedy rule restricted to legal actions.

    Args:
        observation: Mapping whose values are 1-D arrays. The values are
            concatenated in sorted-key order; the first 27 entries of the
            resulting vector act as the legal-action mask (non-zero means
            the action may be chosen).
        epsilon: Probability of picking a uniformly random legal action
            instead of the action with the highest predicted Q-value.

    Returns:
        The selected action index (a NumPy integer).
    """
    flat_observation = np.concatenate([observation[key] for key in sorted(observation)])
    action_slots = flat_observation[:27]

    # Exploration: uniform draw among the indices whose mask entry is non-zero.
    if np.random.rand() < epsilon:
        return np.random.choice(np.flatnonzero(action_slots))

    # Exploitation: score every action, then rule out the illegal ones so
    # argmax can only land on a selectable action.
    q_values = model.predict(flat_observation[np.newaxis], verbose=0)[0]
    q_values[action_slots == 0] = -np.inf  # mask invalid actions
    return np.argmax(q_values)

In [86]:
class CircularReplayBuffer:
    """Fixed-capacity store of transitions that overwrites the oldest entry when full.

    Attributes:
        buffer: Object array of length ``max_size`` holding one transition
            tuple per slot.
        max_size: Capacity of the buffer.
        index: Slot the next ``append`` will write to.
        size: Number of slots filled so far (never exceeds ``max_size``).
    """

    def __init__(self, max_size):
        self.max_size = max_size
        self.buffer = np.empty(max_size, dtype=object)
        self.size = 0
        self.index = 0

    @staticmethod
    def _flatten(observation):
        # Join the mapping's arrays into one vector, ordered by sorted key.
        return np.concatenate([observation[key] for key in sorted(observation)])

    def append(self, obj):
        """Store one transition, flattening both observations first.

        Args:
            obj: 6-tuple ``(observation, action, reward, next_observation,
                done, truncated)`` where both observations are mappings of
                arrays.
        """
        observation, action, reward, next_observation, done, truncated = obj
        transition = (
            self._flatten(observation),
            action,
            reward,
            self._flatten(next_observation),
            done,
            truncated,
        )

        slot = self.index
        self.buffer[slot] = transition
        self.size = min(self.size + 1, self.max_size)
        # Wrap around so the next write replaces the oldest entry.
        self.index = (slot + 1) % self.max_size

    def sample(self, batch_size):
        """Return ``batch_size`` stored transitions drawn uniformly with replacement."""
        chosen = np.random.randint(0, self.size, size=batch_size)
        return self.buffer[chosen]


# Shared buffer used by the rest of the notebook; holds up to 5,000 transitions.
replay_buffer = CircularReplayBuffer(5_000)

In [87]:
def sample_experiences(batch_size):
    """Draw a minibatch from ``replay_buffer`` and regroup it field by field.

    Args:
        batch_size: Number of transitions to draw.

    Returns:
        A list of six arrays, in the order observations, actions, rewards,
        next observations, dones, truncateds; each has one entry per
        sampled transition.
    """
    transitions = replay_buffer.sample(batch_size)
    columns = []
    # Each stored transition is a 6-tuple; collect position k of every
    # transition into its own array.
    for position in range(6):
        column = np.array([transition[position] for transition in transitions])
        columns.append(column)
    return columns

In [88]:
def play_one_step(env, observation, epsilon):
    """Act once in ``env`` and record the resulting transition.

    Args:
        env: Environment whose ``step(action)`` returns a 5-tuple
            ``(next_observation, reward, done, truncated, info)``.
        observation: Current observation, passed to the policy and stored
            in the replay buffer.
        epsilon: Exploration rate forwarded to ``epsilon_greedy_policy``.

    Returns:
        The tuple ``(next_observation, reward, done, truncated, info)``
        produced by the environment step.
    """
    chosen_action = epsilon_greedy_policy(observation, epsilon)
    step_result = env.step(chosen_action)
    next_observation, reward, done, truncated, info = step_result

    # The buffer flattens both observations itself, so pass them through raw.
    transition = (observation, chosen_action, reward, next_observation, done, truncated)
    replay_buffer.append(transition)

    return next_observation, reward, done, truncated, info

In [89]:
# Reset the environment with a fixed seed, then re-seed the global NumPy and
# TensorFlow generators before training starts.
env.reset(seed=42)
np.random.seed(42)
tf.random.set_seed(42)
# Per-episode total rewards, appended to by the training loop.
rewards = []
# Best episode reward seen so far; starts far below any value the loop
# compares against so the first episode always becomes the best.
best_score = -10_000

# Hyperparameters read by training_step.
batch_size = 32
discount_factor = 0.95
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-3)
loss_fn = tf.keras.losses.mse

def training_step(batch_size):
    """Run one DQN gradient update on a minibatch drawn from the replay buffer.

    The regression target for each sampled transition is
    ``reward + discount_factor * max_a Q(next_observation, a)``, with the
    bootstrap term dropped when the transition ended the episode (``done``
    or ``truncated``).

    The max is taken over legal next actions only. The policy never plays an
    action whose entry in the first ``n_outputs`` observation features is
    zero, so the Q-values of those actions are never trained; letting them
    into the max would bias the target with arbitrary network outputs.

    Args:
        batch_size: Number of transitions to sample for this update.
    """
    observations, actions, step_rewards, next_observations, dones, truncateds = sample_experiences(batch_size)

    next_q_values = model.predict(next_observations, verbose=0)

    # Same legality rule as epsilon_greedy_policy: the leading n_outputs
    # features of a flattened observation are the legal-action mask.
    legal_next_actions = next_observations[:, :n_outputs] != 0
    masked_next_q_values = np.where(legal_next_actions, next_q_values, -np.inf)
    # A row with no legal action would otherwise yield -inf (and NaN once
    # multiplied by a zero "exists" flag), so bootstrap from 0 there.
    max_next_q_values = np.where(
        legal_next_actions.any(axis=1),
        masked_next_q_values.max(axis=1),
        0.0,
    )

    # 1.0 where the episode continued after this step, 0.0 where it ended.
    next_observation_exists = 1.0 - (dones | truncateds)
    target_Q_values = step_rewards + next_observation_exists * discount_factor * max_next_q_values
    target_Q_values = target_Q_values.reshape(-1, 1)

    # One-hot mask picks out the Q-value of the action that was actually taken.
    mask = tf.one_hot(actions, n_outputs)
    with tf.GradientTape() as tape:
        all_Q_values = model(observations)
        Q_values = tf.reduce_sum(all_Q_values * mask, axis=1, keepdims=True)
        loss = tf.reduce_mean(loss_fn(target_Q_values, Q_values))
    grads = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(grads, model.trainable_variables))

In [99]:
NUMBER_OF_EPISODES = 20
for episode in range(NUMBER_OF_EPISODES):
    observation, _ = env.reset()
    episode_reward = 0
    # Exploration rate decays linearly with the episode number (floor 0.01);
    # it is constant within an episode, so compute it once here.
    epsilon = max(1 - episode / NUMBER_OF_EPISODES, 0.01)
    for step in range(1_000):
        observation, reward, done, truncated, info = play_one_step(env, observation, epsilon)
        episode_reward += reward
        if done or truncated:
            break

    # "\r" rewinds to the start of the line without clearing it, so pad the
    # status to a fixed width; otherwise a shorter line leaves the tail of
    # the previous, longer one on screen (e.g. "eps: 0.05000").
    status = f"Episode: {episode + 1}, Steps: {step + 1}, Reward: {episode_reward:.2f}, eps: {epsilon:.3f}"
    print(f"\r{status:<80}", end="")

    rewards.append(episode_reward)
    # Keep a snapshot of the weights from the best-scoring episode so far.
    if episode_reward >= best_score:
        best_weights = model.get_weights()
        best_score = episode_reward

    # Let the replay buffer fill for the first few episodes before training.
    if episode > 5:
        training_step(batch_size)

Episode: 20, Steps: 16, Reward: 1.82, eps: 0.05000