# In [None]:
import numpy as np
import gym
from tqdm import tqdm
import matplotlib.pyplot as plt

# Initialize the 2048 environment
# NOTE(review): Game2048Env is not imported in the visible cells — presumably
# it is provided by a 2048 gym package loaded elsewhere; confirm.
env = Game2048Env()
# Presumably the reward the environment returns for an illegal move. The
# training loop overwrites the environment reward with calculate_reward().
env.set_illegal_move_reward(-1)

# SARSA Agent Parameters
alpha = 0.15  # Learning rate: step size of each Q-value update
gamma = 0.99  # Discount factor applied to the next state-action value
epsilon = 0.9  # Initial exploration rate: probability of a random action (lower = less exploring)
epsilon_decay = 0.999  # Multiplicative decay applied to epsilon after every episode
min_epsilon = 0.1  # Floor below which epsilon is never decayed
num_episodes = 35000  # Number of training episodes

# Define hash function
def hash_state(state):
    """Map an observation array to a Q-table row index in ``[0, 2**16)``.

    The array's raw bytes are hashed with the built-in ``hash`` and reduced
    modulo ``2**16``, so different observations can collide on the same row.

    ``tobytes()`` is used instead of ``tostring()``: the latter was only a
    deprecated alias and no longer exists in NumPy 2.0.

    Note: ``hash`` of a bytes object is salted per interpreter process unless
    ``PYTHONHASHSEED`` is fixed, so indices are only stable within one run.

    Parameters
    ----------
    state : numpy.ndarray
        Observation to hash; dtype and shape are part of the byte pattern.

    Returns
    -------
    int
        Bucket index between 0 and 65535 inclusive.
    """
    return hash(state.tobytes()) % (2**16)

# Initialize the Q-table
# Observation shape; not referenced again in the visible code.
state_space_size = (env.size, env.size, env.squares)  # Assuming binary-layered observation
action_space_size = env.action_space.n
# One row per hash bucket: 2**16 matches the modulus used by hash_state(),
# so observations whose hashes collide share a row.
Q = np.zeros((2**16, action_space_size))
def is_highest_in_bottom(board_2d):
    """Tell whether the board's largest value appears in its last row.

    Parameters
    ----------
    board_2d : numpy.ndarray
        Two-dimensional board of tile values.

    Returns
    -------
    bool
        True when the maximum of the last row equals the overall maximum.
    """
    overall_max = np.max(board_2d)
    last_row_max = np.max(board_2d[-1, :])
    return bool(last_row_max == overall_max)

# Function to choose the next action
def choose_action(state_index):
    """Select an action for ``state_index`` with an epsilon-greedy policy.

    Reads the globals ``Q``, ``env`` and ``epsilon``. With probability
    ``epsilon`` a random action is sampled from ``env.action_space``;
    otherwise the action with the highest (adjusted) Q-value is returned.

    While the board's largest tile sits in the bottom row, the value of the
    'up' action is penalised by 30% of its magnitude so the greedy choice is
    less likely to move that tile away. The stored Q-table is not modified.

    Parameters
    ----------
    state_index : int
        Row of ``Q`` for the current state (see ``hash_state``).

    Returns
    -------
    int
        Action index: 0 => 'up', 1 => 'right', 2 => 'down', 3 => 'left'.
    """
    # Work on a copy so the penalty never leaks into the learned values.
    q_values = Q[state_index, :].copy()

    if is_highest_in_bottom(env.get_board()):
        # A plain `*= 0.7` only lowers non-negative values; for a negative
        # Q-value it moves the entry towards zero and makes 'up' MORE
        # attractive. Pick the factor by sign so 'up' is always discouraged.
        up_factor = 0.7 if q_values[0] >= 0 else 1.3
        q_values[0] *= up_factor

    # Epsilon-greedy policy with modified Q-values
    if np.random.uniform(0, 1) < epsilon:
        return env.action_space.sample()  # Explore
    return np.argmax(q_values)            # Exploit


# Function to learn the Q-value
def update(state_index, next_state_index, reward, action, next_action):
    """Apply one SARSA update to the global Q-table in place.

    The entry ``Q[state_index, action]`` is moved by ``alpha`` times the
    temporal-difference error, where the target is the reward plus the
    ``gamma``-discounted value of the next state-action pair actually chosen.

    Parameters
    ----------
    state_index, next_state_index : int
        Q-table rows of the current and the following state.
    reward : float
        Reward observed for the transition.
    action, next_action : int
        Action taken in the current state and action chosen for the next one.
    """
    td_target = reward + gamma * Q[next_state_index, next_action]
    td_error = td_target - Q[state_index, action]
    Q[state_index, action] += alpha * td_error

def highest_tile_in_bottom_right_corner(board_2d):
    """Return 1 if the bottom-right cell holds the board's maximum, else 0.

    The corner is addressed as ``[-1, -1]`` rather than ``[3, 3]`` so the
    check works for any board size, not only 4x4.

    Parameters
    ----------
    board_2d : numpy.ndarray
        Two-dimensional board of tile values.

    Returns
    -------
    int
        1 when the last cell of the last row equals the overall maximum
        (ties with other cells still count), 0 otherwise.
    """
    return 1 if board_2d[-1, -1] == np.max(board_2d) else 0


def monotonicity_score(board_2d):
    """Count adjacent cell pairs that are in non-increasing order.

    Every cell is compared with its right-hand neighbour (rows) and with the
    neighbour below it (columns); each pair where the first value is greater
    than or equal to the second adds one point.

    Parameters
    ----------
    board_2d : numpy.ndarray
        Two-dimensional board of tile values.

    Returns
    -------
    int
        Number of ordered pairs over all rows and columns.
    """
    # Rows: left cell >= right cell.
    row_pairs = np.count_nonzero(board_2d[:, :-1] >= board_2d[:, 1:])
    # Columns: upper cell >= lower cell.
    col_pairs = np.count_nonzero(board_2d[:-1, :] >= board_2d[1:, :])
    return int(row_pairs + col_pairs)


def calculate_reward(state, next_state, done, info):
    """Compute the shaped reward for the transition ``state -> next_state``.

    The reward is assembled from these terms, in order:

    * while the episode continues: the change in the observation sum;
      on the terminal step: a penalty of 20 instead;
    * +50 when ``info['highest']`` is at least 2048;
    * +2.0 when the bottom-right cell holds the derived board's maximum;
    * +0.2 for every ordered neighbour pair counted by
      ``monotonicity_score``.

    Reads the global ``env`` for the board template and ``env.squares``.

    Parameters
    ----------
    state, next_state : numpy.ndarray
        Flattened observations before and after the step.
    done : bool
        Whether the step ended the episode.
    info : dict
        Step info; must contain the key ``'highest'``.

    Returns
    -------
    float
        The shaped reward.
    """
    # Borrow shape and dtype from the environment's board, then overwrite the
    # contents with layer 0 of the reshaped observation.
    # NOTE(review): if the observation is binary-layered, layer 0 is a 0/1
    # mask rather than tile values — confirm this is the intended board for
    # the corner and monotonicity bonuses below.
    board = env.get_board().copy()
    layers = next_state.reshape(4, 4, env.squares)
    board[:] = layers[:, :, 0]

    total = 0
    if done:
        total -= 20
    else:
        total += np.sum(next_state) - np.sum(state)

    if info['highest'] >= 2048:
        total += 50

    total += 2.0 * highest_tile_in_bottom_right_corner(board)
    total += 0.2 * monotonicity_score(board)

    return total


rewards = []  # Shaped reward accumulated over each episode, in training order


# SARSA training loop: one pass of the outer loop is one full game.
for episode in tqdm(range(num_episodes)):
    state = env.reset().flatten()
    state_index = hash_state(state)
    action = choose_action(state_index)
    episode_reward = 0

    while True:
        # Get the next state; the environment's own reward is replaced below
        next_state, reward, done, info = env.step(action)
        next_state = next_state.flatten()
        next_state_index = hash_state(next_state)

        # Use the shaped reward instead of the environment reward
        reward = calculate_reward(state, next_state, done, info)
        episode_reward += reward

        # Choose the next action (on-policy: this is the action taken next)
        next_action = choose_action(next_state_index)

        # Update Q-value
        update(state_index, next_state_index, reward, action, next_action)

        # Transition to the next state. `state` has to advance together with
        # its index; otherwise calculate_reward() keeps comparing every new
        # observation against the observation from env.reset().
        state = next_state
        state_index = next_state_index
        action = next_action

        # Check if episode is done
        if done:
            break

    # Decay epsilon, never going below min_epsilon
    epsilon = max(min_epsilon, epsilon * epsilon_decay)

    # Track rewards
    rewards.append(episode_reward)




# In [None]:
def _show_curve(series, y_label, heading):
    """Plot ``series`` against the episode index and display the figure."""
    plt.plot(series)
    plt.xlabel("Episode")
    plt.ylabel(y_label)
    plt.title(heading)
    plt.show()


# Plot rewards
_show_curve(rewards, "Total Reward", "SARSA Agent Training Rewards")

# Plot moving average of rewards
window = 100
moving_avg = np.convolve(rewards, np.ones(window) / window, mode='valid')
_show_curve(moving_avg, "Average Reward", "SARSA Agent Training Rewards (Smoothed)")
