In [1]:
import gym
import gym_pdw

import numpy as np
from matplotlib import pyplot as plt

In [2]:
# Create the environment registered under the id 'gym_pdw:pdw-v0'.
# SARSA_lambda (defined in a later cell) reads this module-level `env`
# directly rather than taking it as an argument, so this cell must run first.
env = gym.make('gym_pdw:pdw-v0')

In [3]:
def SARSA_lambda(gamma, alpha, epsilon, episodes, lambda_val, verbose=False):
    """Run tabular SARSA(lambda) with accumulating eligibility traces.

    The function operates on the module-level ``env`` and always selects goal
    variant ``'A'`` via ``env.set_goal('A')``. An episode ends when the state
    returned by ``env.step`` compares equal to that goal position.

    Parameters
    ----------
    gamma : float
        Discount factor used in the TD target and in the trace decay.
    alpha : float
        Step size applied to the trace-weighted TD error.
    epsilon : float
        Probability of calling ``env.random_action()`` instead of taking the
        greedy (arg-max) action.
    episodes : int
        Number of episodes to run.
    lambda_val : float
        Trace-decay parameter; all traces are scaled by ``gamma * lambda_val``
        on every step.
    verbose : bool, optional
        When True, print the per-episode and per-step debug trace. Defaults to
        False so that long runs do not flood the notebook output.

    Returns
    -------
    avg_reward : numpy.ndarray, shape (episodes,)
        Mean reward per step within each episode.
    steps : numpy.ndarray, shape (episodes,)
        Number of steps taken in each episode.
    Q : numpy.ndarray
        Learned action values indexed as ``Q[row, col, action]``.
    """
    # Set the goal variant; reaching this position terminates an episode.
    goal_pos = env.set_goal('A')

    # Action values start at zero. The goal state's entries are never given a
    # non-zero trace (no action is taken from it), so they stay at zero.
    Q = np.zeros([env.observation_space.shape[0],
                  env.observation_space.shape[1],
                  env.action_space.n])

    def select_action(epsilon, state):
        """Epsilon-greedy choice over the current Q values for ``state``."""
        if np.random.uniform(0, 1) < epsilon:
            return env.random_action()
        return np.argmax(Q[state[0], state[1]])

    steps = np.zeros([episodes])
    avg_reward = np.zeros([episodes])

    for episode in range(episodes):
        env.reset()

        curr_state = env.get_state()
        # The first action of every episode is drawn with env.random_action().
        action = env.random_action()

        if verbose:
            print(curr_state, "init====================")

        # Traces are reset per episode and sized from Q, so the function works
        # for any grid size / action count the environment reports.
        E = np.zeros_like(Q)

        while True:
            next_state, reward = env.step(action)
            next_action = select_action(epsilon, next_state)

            # Decay all traces, then bump the pair that was just visited.
            E *= gamma * lambda_val
            E[curr_state[0], curr_state[1], action] += 1

            # One-step SARSA TD error, spread over all pairs by their trace.
            td_error = (reward
                        + gamma * Q[next_state[0], next_state[1], next_action]
                        - Q[curr_state[0], curr_state[1], action])
            Q += alpha * td_error * E  # in place: no per-step reallocation

            steps[episode] += 1
            # Incremental mean of the rewards seen so far in this episode.
            avg_reward[episode] += (reward - avg_reward[episode]) / steps[episode]

            curr_state = next_state
            # On-policy: the action used in the TD target is the action that
            # is actually executed on the next step.
            action = next_action

            if verbose:
                print(curr_state, "curr_state", env.start_positions)

            if curr_state == goal_pos:
                if verbose:
                    print("yes", steps[episode])
                break

    return avg_reward, steps, Q

In [4]:
def plot_sarsa_lambda(avg_reward, steps, episodes):
    '''
    Plot one run's per-episode curves on two separate 10x6 figures.

    The first figure shows ``avg_reward`` and the second shows ``steps``,
    both against ``range(episodes)``. Nothing is returned, and ``plt.show()``
    is not called here, so displaying the figures is left to the caller.
    '''
    episode_axis = range(episodes)
    legend_style = dict(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)

    # Average reward per episode, drawn in green.
    reward_figure = plt.figure(figsize=(10,6))
    reward_axes = reward_figure.add_subplot(111)
    reward_axes.plot(episode_axis, avg_reward, 'g', label = " Average reward " )
    reward_axes.title.set_text('SARSA avg reward')
    reward_axes.set_ylabel('Average Reward')
    reward_axes.set_xlabel('episodes')
    reward_axes.legend(**legend_style)

    # Steps per episode, drawn in red.
    steps_figure = plt.figure(figsize=(10,6))
    steps_axes = steps_figure.add_subplot(111)
    steps_axes.plot(episode_axis, steps, 'r', label = " Steps")
    steps_axes.title.set_text('SARSA num steps')
    steps_axes.set_ylabel('Steps')
    steps_axes.set_xlabel('episodes')
    steps_axes.legend(**legend_style)




In [5]:
def plot_compare(avg_reward_all, steps_all, lambda_vals):
    """Overlay the learning curves of several lambda values on two figures.

    The first figure shows average reward per episode and the second shows
    steps per episode, with one curve per entry of ``lambda_vals``. The
    x-axis of each curve is derived from the length of that curve, so no
    global ``episodes`` variable is needed.

    Parameters
    ----------
    avg_reward_all : sequence of sequences
        ``avg_reward_all[i]`` is the per-episode average reward for
        ``lambda_vals[i]``.
    steps_all : sequence of sequences
        ``steps_all[i]`` is the per-episode step count for ``lambda_vals[i]``.
    lambda_vals : sequence
        Lambda values used to label the curves; must be at least as long as
        ``avg_reward_all`` and ``steps_all``.
    """
    reward_axes = plt.figure(figsize=(10,6)).add_subplot(111)
    steps_axes = plt.figure(figsize=(10,6)).add_subplot(111)

    # One distinct colour per lambda; cycled so more than seven curves do not
    # raise IndexError.
    colors = ['k', 'r', 'g', 'm', 'y', 'b', 'c']

    # For each lambda, plot the average reward vs episodes
    for i, rewards in enumerate(avg_reward_all):
        reward_axes.plot(range(len(rewards)), rewards, colors[i % len(colors)],
                         label = "lambda = " + str(lambda_vals[i]))

    # For each lambda, plot the number of steps vs episodes
    for i, step_counts in enumerate(steps_all):
        steps_axes.plot(range(len(step_counts)), step_counts, colors[i % len(colors)],
                        label = "lambda = " + str(lambda_vals[i]))

    # Labelling the average-reward plot
    reward_axes.title.set_text('For all lambdas Average reward vs episodes')
    reward_axes.set_ylabel('Average Reward')
    reward_axes.set_xlabel('episodes')
    reward_axes.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)

    # Labelling the steps plot
    steps_axes.title.set_text('For all lambdas steps vs episodes')
    steps_axes.set_ylabel('Steps')
    steps_axes.set_xlabel('episodes')
    steps_axes.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)

    # Display the plot
    plt.show()



In [8]:
# Scratch check of the eligibility-trace arithmetic used in SARSA_lambda:
# scale every entry in place, then add 1 to a single (row, col, action) entry.
E = np.ones((12, 12, 4))
np.multiply(E, 0.7, out=E)
E[2, 3, 3] += 1
E

array([[[0.7, 0.7, 0.7, 0.7],
        [0.7, 0.7, 0.7, 0.7],
        [0.7, 0.7, 0.7, 0.7],
        [0.7, 0.7, 0.7, 0.7],
        [0.7, 0.7, 0.7, 0.7],
        [0.7, 0.7, 0.7, 0.7],
        [0.7, 0.7, 0.7, 0.7],
        [0.7, 0.7, 0.7, 0.7],
        [0.7, 0.7, 0.7, 0.7],
        [0.7, 0.7, 0.7, 0.7],
        [0.7, 0.7, 0.7, 0.7],
        [0.7, 0.7, 0.7, 0.7]],

       [[0.7, 0.7, 0.7, 0.7],
        [0.7, 0.7, 0.7, 0.7],
        [0.7, 0.7, 0.7, 0.7],
        [0.7, 0.7, 0.7, 0.7],
        [0.7, 0.7, 0.7, 0.7],
        [0.7, 0.7, 0.7, 0.7],
        [0.7, 0.7, 0.7, 0.7],
        [0.7, 0.7, 0.7, 0.7],
        [0.7, 0.7, 0.7, 0.7],
        [0.7, 0.7, 0.7, 0.7],
        [0.7, 0.7, 0.7, 0.7],
        [0.7, 0.7, 0.7, 0.7]],

       [[0.7, 0.7, 0.7, 0.7],
        [0.7, 0.7, 0.7, 0.7],
        [0.7, 0.7, 0.7, 0.7],
        [0.7, 0.7, 0.7, 1.7],
        [0.7, 0.7, 0.7, 0.7],
        [0.7, 0.7, 0.7, 0.7],
        [0.7, 0.7, 0.7, 0.7],
        [0.7, 0.7, 0.7, 0.7],
        [0.7, 0.7, 0.7, 0.7],
      