In [None]:
import numpy as np
import gym
from tqdm import tqdm
import matplotlib.pyplot as plt

# Initialize the 2048 environment.
# NOTE(review): Game2048Env is not imported in this cell — presumably it is
# defined or imported by an earlier cell; confirm before running standalone.
env = Game2048Env()

# SARSA agent hyperparameters (read as module-level globals by the functions
# and the training loop below).
alpha = 0.15  # Learning rate: step size applied to each TD error in update()
gamma = 0.99  # Discount factor applied to the next state-action value
epsilon = 0.9  # Initial exploration rate: probability of taking a random action (lower = less exploring)
epsilon_decay = 0.999  # Multiplicative decay applied to epsilon after every episode
min_epsilon = 0.1  # Floor that epsilon is clamped to while decaying
num_episodes = 15000  # Number of training episodes

# Define hash function
def hash_state(state):
    """Map a board array to a Q-table row index in ``[0, 2**16)``.

    The raw bytes of ``state`` are hashed with Python's built-in ``hash`` and
    folded into 2**16 buckets, so distinct boards can share an index.

    Note: ``hash`` of a ``bytes`` object is salted per interpreter process
    (unless ``PYTHONHASHSEED`` is fixed), so indices are consistent within one
    run but not across runs.

    Args:
        state: NumPy array holding the board (any shape).

    Returns:
        int: bucket index in ``range(2**16)``.
    """
    # ndarray.tostring() was deprecated in NumPy 1.19 and removed in 2.0;
    # tobytes() is the drop-in replacement and returns the same bytes.
    return hash(state.tobytes()) % (2**16)

# Initialize the Q-table
# state_space_size only records a description of the board taken from the
# environment's attributes; it is not used to size Q in this cell.
state_space_size = (env.size, env.size, env.squares)  
action_space_size = env.action_space.n
# One row per hash bucket, one column per action. The 2**16 row count must
# stay equal to the modulus used in hash_state().
Q = np.zeros((2**16, action_space_size))

# Epsilon-greedy action selection
def choose_action(state_index):
    """Pick an action for the given Q-table row using epsilon-greedy.

    With probability ``epsilon`` (module-level global) a random action is
    sampled from the environment's action space; otherwise the action with
    the largest Q-value in row ``state_index`` is returned.
    """
    should_explore = np.random.uniform(0, 1) < epsilon
    if should_explore:
        return env.action_space.sample()
    # Greedy choice; argmax returns the first index on ties.
    return np.argmax(Q[state_index, :])

# SARSA update of a single Q-table entry
def update(state_index, next_state_index, reward, action, next_action):
    """Apply one SARSA step to ``Q[state_index, action]`` in place.

    The entry is moved by ``alpha`` times the temporal-difference error,
    where the target is ``reward + gamma * Q[next_state_index, next_action]``.
    ``Q``, ``alpha`` and ``gamma`` are module-level globals.
    """
    td_target = reward + gamma * Q[next_state_index, next_action]
    td_error = td_target - Q[state_index, action]
    Q[state_index, action] += alpha * td_error

def calculate_reward(state, next_state, done, info):
    """Compute the shaped reward for one transition.

    Components:
      * ``-20`` when the episode has ended (``done`` is true); otherwise the
        increase in the sum of all board entries from ``state`` to
        ``next_state``.
      * ``+50`` whenever ``info['highest']`` is at least 2048 (applied in
        both the terminal and non-terminal case).

    NOTE(review): in standard 2048 a merge leaves the board sum unchanged, so
    the sum-difference term presumably reflects newly spawned tiles rather
    than merges — confirm this is the intended shaping.
    """
    components = []
    if done:
        components.append(-20)
    else:
        components.append(np.sum(next_state) - np.sum(state))
    if info['highest'] >= 2048:
        components.append(50)
    # sum() starts from 0, matching an accumulator initialised to 0.
    return sum(components)

# Starting the SARSA learning
rewards = []  # Shaped return of every episode, in training order
# NOTE(review): these counters (and `t` below) are never updated or read in
# this cell; they are kept in case later cells reference them.
episodes_explored = 0
episodes_exploited = 0

for episode in tqdm(range(num_episodes)):
    t = 0
    # Boards are flattened to 1-D before hashing and reward computation.
    state = env.reset().flatten()
    state_index = hash_state(state)
    action = choose_action(state_index)
    episode_reward = 0

    while True:
        # Take the action and observe the next board
        next_state, reward, done, info = env.step(action)
        next_state = next_state.flatten()
        next_state_index = hash_state(next_state)

        # The environment's own reward is replaced by the shaped reward
        reward = calculate_reward(state, next_state, done, info)
        episode_reward += reward

        # Choose the next action (on-policy: it is also the one executed next)
        next_action = choose_action(next_state_index)

        # Update Q-value
        # NOTE(review): on the terminal step this still bootstraps from
        # Q[next_state_index, next_action]; hash collisions can make that
        # entry non-zero. Consider a reward-only target when `done`.
        update(state_index, next_state_index, reward, action, next_action)

        # Transition to the next state. `state` must advance together with
        # `state_index`; otherwise calculate_reward() keeps comparing every
        # board against the episode's initial board.
        state = next_state
        state_index = next_state_index
        action = next_action

        # Check if episode is done
        if done:
            break

    # Decay epsilon once per episode, never below min_epsilon
    epsilon = max(min_epsilon, epsilon * epsilon_decay)

    # Track rewards
    rewards.append(episode_reward)



In [None]:
# Plot the training curves
def _show_reward_curve(values, y_label, heading):
    """Plot one reward series against its index and display the figure."""
    plt.plot(values)
    plt.xlabel("Episode")
    plt.ylabel(y_label)
    plt.title(heading)
    plt.show()


# Raw per-episode rewards
_show_reward_curve(rewards, "Total Reward", "SARSA Agent Training Rewards")

# Moving average of rewards over a fixed-size window
window = 100
moving_avg = np.convolve(rewards, np.ones(window) / window, mode='valid')
_show_reward_curve(
    moving_avg,
    "Average Reward",
    "SARSA Agent Training Rewards (Smoothed)",
)
