In [None]:
import gym
import numpy as np
import time, pickle, os

In [None]:
# Build the FrozenLake environment, requesting the "human" render mode up front
env = gym.make(
    "FrozenLake-v1",
    render_mode="human",
)

In [None]:
# Hyperparameters for the training run

# Episode budget
totalEpisodes = 10_000  # number of training episodes
maxSteps = 100          # upper bound on steps taken inside one episode

# Update-rule coefficients
gamma = 0.96  # discount factor applied to the next state-action value
lr = 0.81     # learning rate

# Epsilon-greedy exploration schedule
maxEpsilon = 1           # ceiling of the decay schedule
minEpsilon = 0.1         # floor of the decay schedule
epsilonDecayRate = 0.05  # exponent coefficient of the per-episode decay
epsilon = 0.9            # exploration probability used until the first decay step

In [None]:
# Allocate the Q-table Q(s, a): one row per observation, one column per action,
# with every entry starting at zero
qTable = np.zeros(
    shape=(env.observation_space.n, env.action_space.n)
)

In [None]:
# Epsilon-greedy action selection over the current Q-table
def chooseAction(state):
    """Pick an action for `state` using the epsilon-greedy rule.

    A uniform draw from [0, 1) is compared against the module-level `epsilon`:
    below it, a random action is sampled from `env.action_space`; otherwise the
    index of the largest entry in `qTable[state, :]` is returned.

    Args:
        state: Row index into `qTable` identifying the current state.

    Returns:
        The chosen action.
    """
    shouldExplore = np.random.uniform(0, 1) < epsilon
    if shouldExplore:
        return env.action_space.sample()
    return np.argmax(qTable[state, :])

In [None]:
# Defining learning step based on SARSA
def learn(state, action, reward, state2, action2):
    """Apply one SARSA update to the module-level `qTable` in place.

    Implements Q(s, a) <- Q(s, a) + lr * (target - Q(s, a)) where
    target = reward + gamma * Q(s', a').

    Args:
        state: Index of the state the action was taken from.
        action: Index of the action that was taken.
        reward: Reward observed for that transition.
        state2: Index of the resulting state.
        action2: Index of the action selected for `state2`.

    Returns:
        None. `qTable[state, action]` is overwritten with the new estimate.
    """
    predicted = qTable[state, action]
    target = reward + gamma * qTable[state2, action2]
    # The learning rate must scale the whole TD error (target - predicted).
    # Without the parentheses the expression reduces to `lr * target`.
    qTable[state, action] = predicted + lr * (target - predicted)

In [None]:
# Start: run SARSA training for `totalEpisodes` episodes
rewards = 0  # running sum of the rewards returned by env.step across all episodes

for episode in range(totalEpisodes):

    # Progress report for the episode about to start
    print("Episode: ", str(episode))
    print("Epsilon: " + str(epsilon))
    print("Rewards: " + str(rewards))
    print(qTable)

    t = 0
    state = env.reset()[0]  # reset() returns a sequence; its first item is the start state
    action = chooseAction(state)

    while t < maxSteps:
        env.render()

        state2, reward, terminated, truncated, _ = env.step(action)
        # SARSA is on-policy: the next action is chosen before the update so the
        # same action is both used in the target and executed on the next step.
        action2 = chooseAction(state2)
        learn(state, action, reward, state2, action2)
        state = state2
        action = action2

        t += 1
        # Accumulate the reward actually received, not the number of steps taken,
        # so the reported score reflects the agent's return.
        rewards += reward

        if terminated or truncated:
            break

        time.sleep(0.1)  # slow the loop down between steps

    # Exponentially decay exploration from maxEpsilon toward minEpsilon
    epsilon = minEpsilon + (maxEpsilon - minEpsilon) * np.exp(-epsilonDecayRate * episode)
    os.system('clear')

# Average reward collected per episode over the whole run
print("Score over time: ", rewards/totalEpisodes)
print(qTable)

# Serialize the Q-table to a pickle file in the current working directory
with open("frozenLake_qTable_sarsa.pk1", mode='wb') as qTableFile:
    pickle.dump(qTable, qTableFile)