In [206]:
import gym
import numpy as np
import matplotlib.pyplot as plt
import pickle

In [207]:
def chooseAction(state, qTable, isTraining, env, epsilon):
    """Pick an action for ``state`` with an epsilon-greedy rule.

    Args:
        state: Row index into ``qTable`` for the current state.
        qTable: 2-D array indexed as ``qTable[state, action]``.
        isTraining: When False the random draw is skipped entirely and the
            highest-valued action is always returned.
        env: Environment whose ``action_space.sample()`` supplies the
            exploratory action.
        epsilon: Exploration threshold compared against a uniform draw
            from [0, 1).

    Returns:
        The sampled action when exploring, otherwise the index of the
        largest entry in ``qTable[state]`` (first index on ties).
    """
    # Short-circuit keeps the RNG untouched when not training.
    explore = isTraining and np.random.uniform(0,1) < epsilon
    if explore:
        return env.action_space.sample()
    return np.argmax(qTable[state])

In [208]:
def updateQTable(state, action, reward, newState, nextAction, qTable, lr, gamma):
    """Apply one SARSA-style temporal-difference update in place.

    The entry ``qTable[state, action]`` is moved towards
    ``reward + gamma * qTable[newState, nextAction]`` by a fraction ``lr``.

    Args:
        state, action: Indices of the entry being updated.
        reward: Reward observed for taking ``action`` in ``state``.
        newState, nextAction: Indices of the entry used as bootstrap value.
        qTable: 2-D array indexed as ``qTable[state, action]``; modified
            in place.
        lr: Learning rate (step size of the update).
        gamma: Discount factor applied to the bootstrap value.

    Returns:
        The same ``qTable`` object that was passed in.
    """
    oldValue = qTable[state, action]
    tdTarget = reward + gamma*qTable[newState, nextAction]
    qTable[state, action] = oldValue + lr*(tdTarget - oldValue)
    return qTable

In [209]:
def runQLearning(totalEpisodes, isTraining = True, render = False):
    """Train, or replay with a saved table, a tabular agent on FrozenLake-v1.

    Note on naming: although the function is called ``runQLearning``, the
    update it performs is on-policy (SARSA-style): the next action is picked
    first, used as the bootstrap entry in ``updateQTable`` and then actually
    executed on the following step.

    Args:
        totalEpisodes: Number of episodes to run.
        isTraining: When True, start from an all-zero Q-table, explore with a
            linearly decaying epsilon, update the table on every step, then
            save a plot and pickle the table. When False, load the pickled
            table and pick the highest-valued action without updating it.
        render: When True the environment is created with
            ``render_mode='human'``; otherwise with ``render_mode=None``.

    Returns:
        None. Results are reported through prints and, in training mode,
        through the files "frozenLake_SARSA_v2_graphics.png" and
        "frozenLake_qTable_SARSA_v2" written to the working directory.
    """
    qTablePath = 'frozenLake_qTable_SARSA_v2'
    rewardWindow = 100      # number of episodes summed per point of the reward curve
    stepsSampleEvery = 500  # episode length is recorded once every this many episodes

    # Creating the environment
    env = gym.make("FrozenLake-v1", is_slippery = False, render_mode = 'human' if render else None)

    # Defined up front so the final report also works when totalEpisodes is 0.
    steps = 0

    # Everything that touches the environment runs under try/finally so the
    # environment (and its render window) is released even if loading the
    # table or stepping the environment raises.
    try:
        # Initialize Q-Table
        if isTraining:
            # Q(s,a) initialized with zeros. Rows of states that end an episode
            # are never written by the loop below, so they stay at zero and
            # contribute nothing when used as bootstrap value.
            qTable = np.zeros((env.observation_space.n, env.action_space.n))
        else:
            # Load trained Q-Table.
            # NOTE(review): pickle.load can execute arbitrary code; only load
            # a file produced by a training run of this notebook.
            with open(qTablePath, 'rb') as f:
                qTable = pickle.load(f)

        # Defining rates & factors
        epsilon = 1 # Epsilon greedy factor
        epsilonDecayRate = .0001
        lr = .9 # Learning rate
        gamma = .9 # Discount factor

        rewards = np.zeros(totalEpisodes)
        stepsPerEpisode = np.zeros(totalEpisodes)

        # Start
        for episode in range(totalEpisodes):

            steps = 0

            if isTraining:
                print("Episode: " + str(episode))
            else:
                print("Starting attempt...")

            terminated = False
            truncated = False

            state = env.reset()[0]
            action = chooseAction(state, qTable, isTraining, env, epsilon)

            while(not terminated and not truncated):

                newState, reward, terminated, truncated, _ = env.step(action)

                # The action chosen here is both the bootstrap action of the
                # update and the action executed on the next iteration.
                nextAction = chooseAction(newState, qTable, isTraining, env, epsilon)

                if isTraining:
                    updateQTable(state, action, reward, newState, nextAction, qTable, lr, gamma)

                state = newState
                action = nextAction

                steps += 1

            # Linear decay, clamped so epsilon lands exactly on 0.
            epsilon = max(epsilon - epsilonDecayRate, 0)

            # Stabilizing Q-Table when exploration has finished
            if(epsilon == 0):
                lr = .0001

            # An episode counts as a success when its last step returned reward 1.
            if reward == 1:
                rewards[episode] = 1

            if episode % stepsSampleEvery == 0:
                stepsPerEpisode[episode] = steps
    finally:
        env.close()

    if isTraining:
        print("Training completed. Environment closed")
    else:
        print("Attempt finished, total steps: " + str(steps))

    if isTraining:
        # Graphics
        # Successes over the trailing `rewardWindow` episodes (current one included).
        sumRewards = np.zeros(totalEpisodes)
        for i in range(totalEpisodes):
            sumRewards[i] = np.sum(rewards[max(0, i - rewardWindow + 1):(i + 1)])

        # Only the sampled episodes hold a recorded length; plotting the whole
        # array would draw every unsampled episode as 0 steps.
        sampledEpisodes = np.arange(0, totalEpisodes, stepsSampleEvery)

        figure, (ax1, ax2) = plt.subplots(1,2)
        ax1.plot(sumRewards)
        ax1.set_title("Sum of rewards")
        ax1.set_xlabel("Episodes")
        ax1.grid(True)
        ax2.plot(sampledEpisodes, stepsPerEpisode[sampledEpisodes])
        ax2.set_title("Steps per episode")
        ax2.set_xlabel("Episodes")
        ax2.grid(True)
        figure.savefig("frozenLake_SARSA_v2_graphics.png")

        # Saving Q-Table after the training
        with open(qTablePath, 'wb') as f:
            pickle.dump(qTable, f)

In [210]:
if __name__ == '__main__':
    # Training run; it writes the pickled Q-table that the replay below loads.
    # Uncomment to (re)train before replaying:
    # runQLearning(15000)

    # Replay one rendered episode using the previously saved Q-table.
    runQLearning(totalEpisodes=1, isTraining=False, render=True)

Starting attempt...
Attempt finished, total steps: 6
