# In [None]:
import zlib
from collections import deque

import gym
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense
from tqdm import tqdm

# Create the 2048 game environment the agent interacts with.
# NOTE(review): Game2048Env is not imported in the visible cells — confirm
# where it is defined.
env = Game2048Env()

# Network and optimizer settings
learning_rate = 0.001
input_size = 1 << 16  # number of hash buckets == length of the one-hot input
output_size = env.action_space.n  # one network output per available action

# Build the Neural Network model
def build_model():
    """Create and compile the Q-value network.

    The network is a stack of fully connected layers: two ReLU hidden layers
    of 128 units each, followed by a linear layer with ``output_size`` units.
    It takes vectors of length ``input_size`` and is compiled with the Adam
    optimizer (step size ``learning_rate``) and mean-squared-error loss.

    Returns:
        The compiled ``Sequential`` model.
    """
    network = Sequential()
    network.add(Dense(128, activation='relu', input_shape=(input_size,)))
    network.add(Dense(128, activation='relu'))
    network.add(Dense(output_size, activation='linear'))

    optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
    network.compile(optimizer=optimizer, loss='mse')
    return network

# Build the single module-level network; choose_action() reads it to pick
# actions and train_model() fits it on replayed experiences.
model = build_model()

# Experience replay buffer
class ReplayBuffer:
    """Fixed-capacity FIFO store of experiences for experience replay.

    Once ``capacity`` items are held, adding another one evicts the oldest.

    Attributes:
        capacity: Maximum number of experiences kept.
        buffer: The stored experiences, oldest first.
    """

    def __init__(self, capacity):
        """Create an empty buffer.

        Args:
            capacity: Maximum number of experiences to keep; must be positive.

        Raises:
            ValueError: If ``capacity`` is not positive.
        """
        if capacity <= 0:
            raise ValueError(f"capacity must be positive, got {capacity}")
        self.capacity = capacity
        # A bounded deque drops its oldest entry in O(1) when full, whereas
        # list.pop(0) has to shift every remaining element.
        self.buffer = deque(maxlen=capacity)

    def __len__(self):
        """Return the number of experiences currently stored."""
        return len(self.buffer)

    def add(self, experience):
        """Append ``experience``, evicting the oldest entry if at capacity."""
        self.buffer.append(experience)

    def sample(self, batch_size):
        """Return ``batch_size`` distinct experiences chosen uniformly at random.

        Raises:
            ValueError: If ``batch_size`` exceeds the number of stored
                experiences (raised by ``np.random.choice`` because sampling
                is done without replacement).
        """
        indices = np.random.choice(len(self.buffer), batch_size, replace=False)
        return [self.buffer[i] for i in indices]

# Shared replay memory; holds at most 10,000 experiences
buffer = ReplayBuffer(capacity=10_000)

# Learning hyperparameters
gamma = 0.98        # discount applied to the bootstrapped next-state value
batch_size = 64     # experiences drawn per training step
num_episodes = 100  # number of episodes played during training

# Epsilon-greedy exploration schedule
# NOTE(review): epsilon is decayed once per episode, so after 100 episodes it
# is still about 0.95 (0.9995 ** 100) — confirm this schedule is intended.
epsilon = 1.0
epsilon_decay = 0.9995
min_epsilon = 0.1

# Define hash function
def hash_state(state, num_buckets=2**16):
    """Map a board state to a bucket index in ``[0, num_buckets)``.

    The state's raw bytes are hashed with CRC-32. Unlike the built-in
    ``hash()`` on bytes, CRC-32 is not randomized per interpreter process, so
    the same board maps to the same bucket on every run. Distinct boards can
    still collide in the same bucket.

    Args:
        state: Array-like board; converted with ``np.asarray``.
        num_buckets: Size of the hashed state space.

    Returns:
        An int bucket index.
    """
    # tobytes() replaces ndarray.tostring(), which has been deprecated since
    # NumPy 1.19 and is no longer available in newer releases.
    raw = np.asarray(state).tobytes()
    return zlib.crc32(raw) % num_buckets

# Function to encode hashed state into one-hot vector
def one_hot_encode(hash_value, size):
    """Build a one-hot vector for a hashed state.

    Args:
        hash_value: Index of the entry to set.
        size: Length of the returned vector.

    Returns:
        A float64 NumPy array of length ``size`` that is zero everywhere
        except for a 1 at position ``hash_value``.
    """
    vector = np.zeros(size, dtype=np.float64)
    vector[hash_value] = 1.0
    return vector

# Function to choose the next action
def choose_action(state):
    """Pick an action for ``state`` with an epsilon-greedy policy.

    With probability ``epsilon`` (module-level) a random action is sampled
    from the environment's action space; otherwise the action with the
    highest predicted Q-value is returned.

    Args:
        state: One-hot encoded state vector (1-D array).

    Returns:
        The chosen action index.
    """
    if np.random.uniform(0, 1) < epsilon:
        return env.action_space.sample()  # Explore

    # Call the model directly rather than model.predict(): predict() is meant
    # for large batched inputs and, in recent Keras releases, prints a
    # progress bar on every call, which is slow and noisy when scoring a
    # single state per environment step.
    single_batch = state[np.newaxis].astype(np.float32)
    q_values = np.asarray(model(single_batch, training=False))[0]
    return int(np.argmax(q_values))  # Exploit

# Function to train the model with experience replay
def train_model():
    """Run one Q-learning update on a random minibatch from the replay buffer.

    Does nothing until the buffer holds at least ``batch_size`` experiences.
    For each sampled transition ``(state, action, reward, next_state, done)``
    the regression target for the taken action is ``reward`` when the episode
    ended, and ``reward + gamma * max_a Q(next_state, a)`` otherwise. Targets
    for the other actions are left at the model's current predictions, so
    they contribute no error.
    """
    if len(buffer.buffer) < batch_size:
        return

    batch = buffer.sample(batch_size)
    # ``batch_rewards`` avoids shadowing the module-level ``rewards`` list.
    states, actions, batch_rewards, next_states, dones = zip(*batch)

    states = np.array(states)
    next_states = np.array(next_states)
    # verbose=0 keeps predict() from printing a progress bar on every call
    # (the default behaviour in recent Keras releases).
    q_values = model.predict(states, verbose=0)
    next_q_values = model.predict(next_states, verbose=0)

    actions = np.asarray(actions, dtype=np.intp)
    batch_rewards = np.asarray(batch_rewards, dtype=np.float64)
    not_done = ~np.asarray(dones, dtype=bool)

    # Bootstrapped targets for the whole batch at once; terminal transitions
    # get no next-state term because ``not_done`` is False (0) there.
    best_next = next_q_values.max(axis=1).astype(np.float64)
    targets = batch_rewards + gamma * not_done * best_next
    q_values[np.arange(len(batch)), actions] = targets

    model.fit(states, q_values, epochs=1, verbose=0)

# Training loop: play num_episodes episodes, storing every transition in the
# replay buffer and running one training step after each environment step.
# ``rewards`` collects the total reward of each episode.
rewards = []

for episode in tqdm(range(num_episodes)):
    # Start a new game and encode its initial board as a one-hot vector
    raw_state = env.reset()
    state_hash = hash_state(raw_state)
    state = one_hot_encode(state_hash, input_size)
    episode_reward = 0
    done = False

    while not done:
        # Choose action (epsilon-greedy, see choose_action)
        action = choose_action(state)

        # Perform action; env.step is unpacked as a 4-tuple
        # NOTE(review): presumably the classic gym step API — confirm
        # against the Game2048Env implementation.
        next_raw_state, reward, done, info = env.step(action)
        next_state_hash = hash_state(next_raw_state)
        next_state = one_hot_encode(next_state_hash, input_size)

        episode_reward += reward

        # Store experience in replay buffer (encoded states, not raw boards)
        buffer.add((state, action, reward, next_state, done))

        # Train the model (no-op until the buffer holds batch_size entries)
        train_model()

        # Update state
        state = next_state

    # Decay epsilon once per episode, never going below min_epsilon
    epsilon = max(min_epsilon, epsilon * epsilon_decay)

    rewards.append(episode_reward)




# In [None]:
# Plot the total reward collected in each training episode
plt.plot(rewards)
plt.xlabel("Episode")
plt.ylabel("Total Reward")
# The agent bootstraps from the max next-state Q-value (Q-learning), not from
# the next action actually taken (SARSA), so the title says Q-learning.
plt.title("Neural Network Q-Learning Agent Training Rewards")
plt.show()
