In [50]:
import gymnasium as gym
from matplotlib import pyplot as plt
import numpy as np
import pickle

In [51]:
# When True the environment is created with render_mode='human' so every
# step is drawn; when False no rendering is requested.
render: bool = False

# True: start from an all-zero Q-table, learn with epsilon-greedy actions and
# save the table at the end.
# False: load the previously saved Q-table and only take greedy actions.
isTraining: bool = True

In [52]:
# Number of episodes to run. One episode is a single play-through of the
# environment, from reset until it reports terminated or truncated.
episodes = 15_000

# Learning rate (alpha in the Q-learning update): how strongly a new estimate
# overrides the value already stored in the Q-table. The training loop
# lowers this value once exploration has ended.
learning_rate_a = 0.9

# Discount factor (gamma in the Q-learning update): weight given to the best
# value of the next state relative to the immediate reward.
discount_factor_g = 0.9

# Exploration probability (epsilon): chance of picking a random action
# instead of the greedy one. 1 means every action is random at the start.
epsilon = 1

# Amount subtracted from epsilon after each episode (linear decay, clamped
# at 0). With these values epsilon reaches 0 after roughly 10,000 episodes.
epsilon_decay_rate = 1e-4

In [53]:
# Train (or evaluate) a tabular Q-learning agent on the slippery 8x8
# FrozenLake. Rendering is requested only when `render` is True.
env = gym.make('FrozenLake-v1', map_name="8x8", is_slippery=True, render_mode='human' if render else None)

# Everything that uses the environment runs inside try/finally so that the
# environment is closed even if the run fails or is interrupted
# (e.g. KeyboardInterrupt during a long training run).
try:
    if isTraining:
        # Fresh Q-table: one row per state, one column per action, all zeros
        q = np.zeros((env.observation_space.n, env.action_space.n))
    else:
        # Reuse the Q-table written by an earlier training run.
        # NOTE: pickle can execute arbitrary code while loading; only load a
        # file that this script produced itself.
        with open('frozen_lake8x8.pk1', 'rb') as f:
            q = pickle.load(f)

    # Random number generator for the epsilon-greedy decision
    rng = np.random.default_rng()

    # Entry i becomes 1 when episode i ends with a reward of 1, else stays 0
    rewards_per_episode = np.zeros(episodes)

    for i in range(episodes):
        # reset() returns (observation, info); only the observation is used
        state = env.reset()[0]
        terminated = False
        truncated = False

        # Step until env.step reports the episode as terminated or truncated
        while not terminated and not truncated:
            # Exploration vs. exploitation: random action with probability
            # epsilon while training, otherwise the best known action
            if isTraining and rng.random() < epsilon:
                action = env.action_space.sample()
            else:
                action = np.argmax(q[state, :])

            new_state, reward, terminated, truncated, _ = env.step(action)

            if isTraining:
                # Q-learning update:
                # Q(s,a) += alpha * (r + gamma * max_a' Q(s',a') - Q(s,a))
                q[state, action] = q[state, action] + learning_rate_a * (
                    reward + discount_factor_g * np.max(q[new_state, :]) - q[state, action]
                )

            state = new_state

        # Linear epsilon decay, clamped so it never goes below 0
        epsilon = max(epsilon - epsilon_decay_rate, 0)

        # Once exploration has stopped, shrink the learning rate so further
        # updates only make small adjustments to the Q-table
        if epsilon == 0:
            learning_rate_a = 0.0001

        # `reward` is the reward of the episode's last step; a value of 1 is
        # recorded as a success (presumably reaching the goal -- see the
        # Gymnasium FrozenLake reward description)
        if reward == 1:
            rewards_per_episode[i] = 1
finally:
    env.close()

# Rolling success count: entry t is the number of successful episodes among
# episodes t-100 .. t inclusive (a window of up to 101 episodes, shorter at
# the start of the run)
sum_rewards = np.zeros(episodes)
for t in range(episodes):
    sum_rewards[t] = np.sum(rewards_per_episode[max(0, t-100):(t+1)])

# Plot the rolling success count and save it to a file
plt.plot(sum_rewards)
plt.savefig('frozen_lake8x8.png')

if isTraining:
    # Persist the learned Q-table so a later run with isTraining = False
    # can load it
    with open("frozen_lake8x8.pk1", "wb") as f:
        pickle.dump(q, f)


KeyboardInterrupt: 