# Q-Learning for Frozen-Lake

In [2]:
from __future__ import division

#Python version : 3.11.9
import gym            #version : 0.26.2
import numpy as np    #version : 2.0.2
import random
import tensorflow as tf     #version : 2.18.0
import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
# Build the FrozenLake-v1 environment; is_slippery=False requests the deterministic (non-slippery) variant.
env = gym.make("FrozenLake-v1", is_slippery=False)

### Defining the Network

In [None]:
# Feed-forward network used to choose actions: one linear layer, no bias, no activation
class QNetwork(tf.keras.Model):
    """Single-layer linear Q-function approximator.

    The model owns one trainable weight matrix ``W`` of shape
    ``(num_states, num_actions)``. Its output for a batch of inputs is
    ``inputs @ W``, so a one-hot input row selects the corresponding row of
    ``W`` as that state's Q-values.

    Args:
        num_states: Number of rows of ``W`` (width of the expected input).
            Defaults to 16.
        num_actions: Number of columns of ``W`` (width of the output).
            Defaults to 4.
        **kwargs: Forwarded unchanged to ``tf.keras.Model.__init__``.
    """

    def __init__(self, num_states=16, num_actions=4, **kwargs):
        super().__init__(**kwargs)
        # Register W as a trainable variable; it starts with small positive
        # values drawn uniformly from [0, 0.01).
        self.W = self.add_weight(
            name="W",
            shape=(num_states, num_actions),
            initializer=tf.random_uniform_initializer(minval=0, maxval=0.01),
            trainable=True,
        )

    def call(self, inputs):
        """Return ``inputs @ W``.

        Args:
            inputs: Tensor whose last dimension matches the first dimension
                of ``W`` (required by ``tf.matmul``).

        Returns:
            Tensor with one column per column of ``W``.
        """
        return tf.matmul(inputs, self.W)

# Instantiate the Q-network and print its trainable variables as a sanity check.
q_network = QNetwork()
print(q_network.trainable_variables)

# Loss / optimizer objects.
# NOTE(review): `nextQ` and `loss_fn` are not referenced by any other code visible in this
# notebook (the training loop builds its own sum-of-squares loss) — confirm before relying
# on them or removing them.
nextQ = tf.constant([[0.0] * 4], dtype=tf.float32)  # 1x4 float32 tensor of zeros
loss_fn = tf.keras.losses.MeanSquaredError()  # Keras mean-squared-error loss object
optimizer = tf.keras.optimizers.SGD(learning_rate=0.1)  # SGD with learning rate 0.1; applied in the training loop


### Training the network

In [None]:
# Set learning parameters
gamma = 0.99        # Discount factor applied to the bootstrapped next-state value
epsilon = 0.1       # Exploration rate of the epsilon-greedy policy (decayed per finished episode)
num_episodes = 100  # Number of training episodes
max_steps = 99      # Upper bound on environment steps within one episode

# Row s of this matrix is the one-hot encoding of state s. It is built once
# (instead of twice per step) and sized from the network's weight matrix so
# the encoding always matches what the network expects.
state_encoder = np.identity(int(q_network.W.shape[0]), dtype=np.float32)

# Per-episode statistics: number of steps taken and total reward collected
jList = []
rList = []

# Training loop
for episode in range(num_episodes):
    state, _ = env.reset()  # gym >= 0.26 returns (observation, info)
    rAll = 0                # Total reward for the episode
    step = 0                # Keeps `step` defined even if max_steps is 0

    for step in range(1, max_steps + 1):
        # Choose an action with an epsilon-greedy policy over the current Q-values
        state_tensor = tf.constant(state_encoder[state:state + 1])
        Q_values = q_network(state_tensor).numpy()
        if np.random.rand() < epsilon:
            action = env.action_space.sample()
        else:
            action = int(np.argmax(Q_values))

        # Take the action, get the reward and the next state
        next_state, reward, term, truncated, _ = env.step(action)

        # Value of the best action in the next state. A terminated episode has
        # no future return, so nothing is bootstrapped from a terminal state.
        # (A merely truncated episode still bootstraps.)
        if term:
            max_Q_next = 0.0
        else:
            next_state_tensor = tf.constant(state_encoder[next_state:next_state + 1])
            max_Q_next = np.max(q_network(next_state_tensor).numpy())

        # Bellman target: only the entry of the action actually taken changes,
        # so the loss gradient is zero for every other action.
        target_Q = Q_values.copy()
        target_Q[0, action] = reward + gamma * max_Q_next

        # Train the network on the squared error between prediction and target
        with tf.GradientTape() as tape:
            Q_out = q_network(state_tensor)
            loss = tf.reduce_sum(tf.square(Q_out - tf.constant(target_Q, dtype=tf.float32)))

        gradients = tape.gradient(loss, q_network.trainable_variables)
        optimizer.apply_gradients(zip(gradients, q_network.trainable_variables))

        rAll += reward
        state = next_state

        # The episode is over when the environment reports termination (goal
        # or hole) OR truncation (time limit). Checking `truncated` alone never
        # fires here, because the step cap above is below the env's time limit.
        if term or truncated:
            # Reduce epsilon (exploration rate) over time
            epsilon = 1.0 / ((episode / 50) + 10)
            break

    jList.append(step)
    rList.append(rAll)

print(sum(rList))
print(f"Percent of successful episodes: {sum(rList) / num_episodes * 100:.2f}%")


In [None]:
# Total reward collected in each training episode (one entry of rList per episode)
fig, ax = plt.subplots()
ax.plot(rList)
ax.set(title="Total reward per episode", xlabel="Episode", ylabel="Total reward")
plt.show()

In [None]:
# Number of environment steps taken in each training episode (one entry of jList per episode)
fig, ax = plt.subplots()
ax.plot(jList)
ax.set(title="Steps per episode", xlabel="Episode", ylabel="Steps")
plt.show()