Consider the following parameterization of a policy: There is a \preference" for each
action, for each column and for each row. Thus the set of preferences can be denoted by
θx(N; 0); θy(N; 0); θx(S; 0); θy(S; 0); θx(E; 0); θy(E; 0); :::; θx(W; 9); θy(W; 9), for a total of
80 preference values. The total preference for an action a in a state (i, j) is given by
θx(a; i) + θy(a; j). The action probabilities are generated by a soft-max function using
these preferences.
4. Implement a MC policy gradient algorithm. Choose appropriate learning rates, and turn
in two curves for each variant as indicated in the first part as well as the optimal policies
learnt.

In [1]:
import gym
import gym_pdw

import numpy as np
from matplotlib import pyplot as plt

In [2]:
def softmax_action(state, parameters):
    '''
    Returns action and the probability of that action
    '''
    x, y = state
    north = parameters[x,0,0] + parameters[y,1,0]
    east = parameters[x,0,1] + parameters[y,1,1]
    west = parameters[x,0,2] + parameters[y,1,2]
    south = parameters[x,0,3] + parameters[y,1,3]
    
    # The preferences of each action
    preferences = [north, east, west, south]    
    prefs = np.exp(preferences)
    # The probability of selecting each action
    softmax = (prefs)/np.sum(prefs)
    
    action = np.random.choice([0,1,2,3], 1, p=softmax)
#     print(action)
    action = np.squeeze(action)
    
    return action, softmax[action]

In [3]:
def selected_param_pos(state, action):
    '''
    Returns the row, column, the direction assigned to the parameter
    '''
    direction = action
    row,col = state
    return row, col, direction

In [20]:
def Update(parameters, states, actions, rewards, params_pos, probs, gamma, alpha):
    '''
    Updates the the parameters
    '''
    
    # Initialize return
    G = 0
    
    i=0
    for prob, pos, reward in zip(probs[::-1], params_pos[::-1], rewards[::-1]):
        
        # The positions of parameters that was used to select the action in ith time step
        row, col, direction = pos
        
        # The parameters that were used in action selection
        theta1 = parameters[row, 0, direction]
        theta2 = parameters[col, 1, direction]
                
        # The return 
        G = reward + pow(gamma,i)*G
        
        # The update equations
        theta1 = theta1 + alpha*pow(gamma,i)*(1 - prob)
        theta2 = theta2 + alpha*pow(gamma,i)*(1 - prob)
        
        parameters[row, 0, direction] = theta1
        parameters[col, 1, direction] = theta2
        
        i+=1
        
        return parameters


In [21]:
def plot_MCPG(avg_reward, steps, episodes):
    '''
    Gets the data for all curves and plots them in one graph
    '''

    
    # Figure instances will be returned.
    fig1=plt.figure(figsize=(10,6)).add_subplot(111)
    fig2=plt.figure(figsize=(10,6)).add_subplot(111)

    # colors for different values of epsilon
    colors = ['g', 'r', 'k', 'b', 'y','m', 'c']

    fig1.plot(range(episodes), avg_reward, colors[0], label = " Average reward " )
    fig2.plot(range(episodes), steps, colors[1], label = " Steps")

    # Labelling the plot
    fig1.title.set_text('Monte carlo policy gradient avg reward')
    fig1.set_ylabel('Average Reward')
    fig1.set_xlabel('episodes')
    fig1.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)

    # Labelling the plot
    fig2.title.set_text('Monte carlo policy gradient num steps')
    fig2.set_ylabel('Steps')
    fig2.set_xlabel('episodes')
    fig2.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)

    # Display the plot
    plt.show()

In [22]:

def MC_PG(alpha, gamma, episodes):
    '''
    Performs monte carlo policy gradient
    '''
    
    # For each row,direction and For each column,direction one parameter
    parameters = np.ones([12,2,4])
    
    # Set goal postion
    goal_pos = env.set_goal('A')

    # The num steps and avg_reward that we get from each episode is stored
    steps = np.zeros([episodes])
    avg_reward = np.zeros([episodes])
    
    for episode in range(episodes):
        # An episode
        
        # Store all the states, actions, rewards till the end of episode 
        states = []
        actions = []
        rewards = []
        
        # The parameters that are at each time step
        params_pos = []
        # The probability of picking those parameters
        probs = []
        
        # Reset environment
        env.reset()
        
        # Initialize state
        curr_state = env.get_state()
        
        
        while True:
            # For each step of an episode
            
            # select an action according to softmax probabilities
            action, prob = softmax_action(curr_state, parameters)
            print(action)
            
            # Get reward and next state
            next_state, reward = env.step(action)
            
            probs.append(prob)
            params_pos.append(selected_param_pos(curr_state, action))
            
            states.append(curr_state)
            actions.append(action)
            rewards.append(reward)

            # Increase the number of steps
            steps[episode] +=1
            
            # Average reward till now
            avg_reward[episode] = avg_reward[episode] + (reward - avg_reward[episode])/steps[episode]

            curr_state = next_state
            print(curr_state, "curr_state", env.start_positions)

            if curr_state == goal_pos:
                print("Goal reached!!", steps[episode])
                break
        
        # Update the parameters used in action selection in the episode
        parameters = Update(parameters, states, actions, rewards, params_pos, probs, gamma, alpha)
            
    
    return avg_reward, steps
    

In [23]:
if __name__=='__main__':

    env = gym.make('gym_pdw:pdw-v0')
    
    # parameters
    gamma = 0.9
    alpha = 0.01
    epsilon = 0.1
    episodes = 50
    
    avg_reward, steps, Q = MC_PG(gamma,alpha, episodes)
    plot_MCPG(avg_reward, steps, episodes)
    

[[6, 0], [7, 0], [10, 0], [11, 0]] 1
[7, 0]
0
[8, 0] curr_state [[6, 0], [8, 0], [10, 0], [11, 0]]
2
[7, 1] curr_state [[6, 0], [7, 1], [10, 0], [11, 0]]
2
[7, 2] curr_state [[6, 0], [7, 2], [10, 0], [11, 0]]
3
[7, 4] curr_state [[6, 0], [7, 4], [10, 0], [11, 0]]
2
[7, 5] curr_state [[6, 0], [7, 5], [10, 0], [11, 0]]
1
[6, 5] curr_state [[6, 0], [6, 5], [10, 0], [11, 0]]
1
[7, 6] curr_state [[6, 0], [7, 6], [10, 0], [11, 0]]
0
[7, 8] curr_state [[6, 0], [7, 8], [10, 0], [11, 0]]
1
[7, 10] curr_state [[6, 0], [7, 10], [10, 0], [11, 0]]
3
[8, 10] curr_state [[6, 0], [8, 10], [10, 0], [11, 0]]
2
[7, 11] curr_state [[6, 0], [7, 11], [10, 0], [11, 0]]
0
[7, 11] curr_state [[6, 0], [7, 11], [10, 0], [11, 0]]
2
[7, 11] curr_state [[6, 0], [7, 11], [10, 0], [11, 0]]
0
[7, 11] curr_state [[6, 0], [7, 11], [10, 0], [11, 0]]
0
[7, 11] curr_state [[6, 0], [7, 11], [10, 0], [11, 0]]
1
[7, 10] curr_state [[6, 0], [7, 10], [10, 0], [11, 0]]
3
[7, 11] curr_state [[6, 0], [7, 11], [10, 0], [11, 0]]
2
[

1
[9, 9] curr_state [[6, 0], [0, 11], [10, 0], [9, 9]]
0
[9, 9] curr_state [[6, 0], [0, 11], [10, 0], [9, 9]]
2
[10, 9] curr_state [[6, 0], [0, 11], [10, 0], [10, 9]]
2
[9, 9] curr_state [[6, 0], [0, 11], [10, 0], [9, 9]]
1
[9, 10] curr_state [[6, 0], [0, 11], [10, 0], [9, 10]]
0
[9, 11] curr_state [[6, 0], [0, 11], [10, 0], [9, 11]]
1
[8, 11] curr_state [[6, 0], [0, 11], [10, 0], [8, 11]]
2
[8, 11] curr_state [[6, 0], [0, 11], [10, 0], [8, 11]]
3
[8, 11] curr_state [[6, 0], [0, 11], [10, 0], [8, 11]]
2
[8, 11] curr_state [[6, 0], [0, 11], [10, 0], [8, 11]]
2
[8, 11] curr_state [[6, 0], [0, 11], [10, 0], [8, 11]]
3
[8, 11] curr_state [[6, 0], [0, 11], [10, 0], [8, 11]]
2
[7, 11] curr_state [[6, 0], [0, 11], [10, 0], [7, 11]]
1
[7, 10] curr_state [[6, 0], [0, 11], [10, 0], [7, 10]]
1
[6, 11] curr_state [[6, 0], [0, 11], [10, 0], [6, 11]]
1
[6, 11] curr_state [[6, 0], [0, 11], [10, 0], [6, 11]]
2
[6, 11] curr_state [[6, 0], [0, 11], [10, 0], [6, 11]]
3
[6, 11] curr_state [[6, 0], [0, 11]

3
[10, 11] curr_state [[10, 11], [0, 11], [10, 0], [0, 11]]
2
[10, 11] curr_state [[10, 11], [0, 11], [10, 0], [0, 11]]
2
[10, 11] curr_state [[10, 11], [0, 11], [10, 0], [0, 11]]
2
[11, 11] curr_state [[11, 11], [0, 11], [10, 0], [0, 11]]
1
[11, 10] curr_state [[11, 10], [0, 11], [10, 0], [0, 11]]
2
[11, 10] curr_state [[11, 10], [0, 11], [10, 0], [0, 11]]
0
[10, 10] curr_state [[10, 10], [0, 11], [10, 0], [0, 11]]
1
[10, 10] curr_state [[10, 10], [0, 11], [10, 0], [0, 11]]
1
[10, 11] curr_state [[10, 11], [0, 11], [10, 0], [0, 11]]
3
[10, 11] curr_state [[10, 11], [0, 11], [10, 0], [0, 11]]
2
[10, 11] curr_state [[10, 11], [0, 11], [10, 0], [0, 11]]
3
[9, 11] curr_state [[9, 11], [0, 11], [10, 0], [0, 11]]
2
[9, 11] curr_state [[9, 11], [0, 11], [10, 0], [0, 11]]
1
[9, 11] curr_state [[9, 11], [0, 11], [10, 0], [0, 11]]
2
[8, 11] curr_state [[8, 11], [0, 11], [10, 0], [0, 11]]
3
[7, 11] curr_state [[7, 11], [0, 11], [10, 0], [0, 11]]
1
[7, 11] curr_state [[7, 11], [0, 11], [10, 0], [

3
[6, 11] curr_state [[6, 11], [0, 11], [10, 0], [0, 11]]
2
[6, 11] curr_state [[6, 11], [0, 11], [10, 0], [0, 11]]
2
[6, 11] curr_state [[6, 11], [0, 11], [10, 0], [0, 11]]
0
[6, 11] curr_state [[6, 11], [0, 11], [10, 0], [0, 11]]
2
[6, 11] curr_state [[6, 11], [0, 11], [10, 0], [0, 11]]
1
[5, 11] curr_state [[5, 11], [0, 11], [10, 0], [0, 11]]
3
[5, 11] curr_state [[5, 11], [0, 11], [10, 0], [0, 11]]
0
[5, 11] curr_state [[5, 11], [0, 11], [10, 0], [0, 11]]
2
[5, 11] curr_state [[5, 11], [0, 11], [10, 0], [0, 11]]
2
[5, 11] curr_state [[5, 11], [0, 11], [10, 0], [0, 11]]
1
[6, 11] curr_state [[6, 11], [0, 11], [10, 0], [0, 11]]
2
[7, 11] curr_state [[7, 11], [0, 11], [10, 0], [0, 11]]
2
[7, 10] curr_state [[7, 10], [0, 11], [10, 0], [0, 11]]
2
[7, 11] curr_state [[7, 11], [0, 11], [10, 0], [0, 11]]
2
[7, 11] curr_state [[7, 11], [0, 11], [10, 0], [0, 11]]
3
[7, 10] curr_state [[7, 10], [0, 11], [10, 0], [0, 11]]
1
[7, 10] curr_state [[7, 10], [0, 11], [10, 0], [0, 11]]
3
[7, 10] curr

[8, 11] curr_state [[8, 11], [0, 11], [10, 0], [0, 11]]
0
[9, 11] curr_state [[9, 11], [0, 11], [10, 0], [0, 11]]
2
[9, 11] curr_state [[9, 11], [0, 11], [10, 0], [0, 11]]
2
[9, 10] curr_state [[9, 10], [0, 11], [10, 0], [0, 11]]
0
[9, 10] curr_state [[9, 10], [0, 11], [10, 0], [0, 11]]
2
[8, 10] curr_state [[8, 10], [0, 11], [10, 0], [0, 11]]
2
[7, 11] curr_state [[7, 11], [0, 11], [10, 0], [0, 11]]
0
[6, 11] curr_state [[6, 11], [0, 11], [10, 0], [0, 11]]
1
[6, 11] curr_state [[6, 11], [0, 11], [10, 0], [0, 11]]
1
[6, 11] curr_state [[6, 11], [0, 11], [10, 0], [0, 11]]
0
[7, 11] curr_state [[7, 11], [0, 11], [10, 0], [0, 11]]
3
[7, 10] curr_state [[7, 10], [0, 11], [10, 0], [0, 11]]
2
[8, 11] curr_state [[8, 11], [0, 11], [10, 0], [0, 11]]
3
[7, 11] curr_state [[7, 11], [0, 11], [10, 0], [0, 11]]
2
[6, 11] curr_state [[6, 11], [0, 11], [10, 0], [0, 11]]
3
[6, 11] curr_state [[6, 11], [0, 11], [10, 0], [0, 11]]
3
[6, 11] curr_state [[6, 11], [0, 11], [10, 0], [0, 11]]
3
[6, 11] curr_s

3
[1, 11] curr_state [[0, 11], [0, 11], [0, 11], [1, 11]]
2
[1, 11] curr_state [[0, 11], [0, 11], [0, 11], [1, 11]]
3
[1, 11] curr_state [[0, 11], [0, 11], [0, 11], [1, 11]]
3
[1, 10] curr_state [[0, 11], [0, 11], [0, 11], [1, 10]]
2
[1, 10] curr_state [[0, 11], [0, 11], [0, 11], [1, 10]]
2
[2, 10] curr_state [[0, 11], [0, 11], [0, 11], [2, 10]]
0
[3, 10] curr_state [[0, 11], [0, 11], [0, 11], [3, 10]]
2
[4, 10] curr_state [[0, 11], [0, 11], [0, 11], [4, 10]]
3
[4, 9] curr_state [[0, 11], [0, 11], [0, 11], [4, 9]]
2
[4, 11] curr_state [[0, 11], [0, 11], [0, 11], [4, 11]]
3
[4, 11] curr_state [[0, 11], [0, 11], [0, 11], [4, 11]]
3
[4, 10] curr_state [[0, 11], [0, 11], [0, 11], [4, 10]]
3
[4, 11] curr_state [[0, 11], [0, 11], [0, 11], [4, 11]]
3
[4, 11] curr_state [[0, 11], [0, 11], [0, 11], [4, 11]]
2
[4, 11] curr_state [[0, 11], [0, 11], [0, 11], [4, 11]]
3
[4, 10] curr_state [[0, 11], [0, 11], [0, 11], [4, 10]]
1
[4, 9] curr_state [[0, 11], [0, 11], [0, 11], [4, 9]]
0
[4, 11] curr_sta

ValueError: not enough values to unpack (expected 3, got 2)