IMPORTING NECESSARY LIBRARIES

In [64]:
import gridworld
import numpy as np
import random

EXPERIMENTATION WITH GIVEN PARAMETERS

In [65]:
world = gridworld.GridWorld() #creating the object

In [66]:
world.WORLD_WIDTH, world.WORLD_HEIGHT

(15, 10)

In [67]:
world.obstacles

[[2, 3], [3, 3], [7, 3], [8, 3], [9, 3], [7, 8], [8, 8], [9, 8]]

In [68]:
world.START,world.GOAL

([6, 1], [8, 11])

In [69]:
possible_actions = [world.ACTION_UP , world.ACTION_RIGHT , world.ACTION_DOWN, world.ACTION_LEFT]
possible_actions

[0, 3, 1, 2]

In [70]:
present_state = world.START
next_state, reward = world.step(present_state, world.ACTION_UP)
next_state,reward

([5, 1], -1)

THE INITIALIZATIONS: 

In [57]:
states=[]
learning_rate=0.001
e_value=0.2
discount_factor = 1
state_values = [['X' if [j,i] in world.obstacles else -1 for i in range(world.WORLD_WIDTH)] for j in range(world.WORLD_HEIGHT)]
state_values

[[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1],
 [-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1],
 [-1, -1, -1, 'X', -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1],
 [-1, -1, -1, 'X', -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1],
 [-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1],
 [-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1],
 [-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1],
 [-1, -1, -1, 'X', -1, -1, -1, -1, 'X', -1, -1, -1, -1, -1, -1],
 [-1, -1, -1, 'X', -1, -1, -1, -1, 'X', -1, -1, -1, -1, -1, -1],
 [-1, -1, -1, 'X', -1, -1, -1, -1, 'X', -1, -1, -1, -1, -1, -1]]

FUNCTION WHICH DETERMINES WHICH ACTION TO CHOOSE

In [58]:
def whichAction(current_state,e_value):
    action = -1 # a random value
    max_reward = -1 # the general case for the reward
    num = random.uniform(0,1) #selects a random floating point number in between 0 to 1
    final_state=current_state 
    if num > e_value : 
        #performing a greedy action
        i=0
        for new_action in possible_actions:
            i+=1
            new_state,new_reward = world.step(current_state,new_action)
            if new_reward> max_reward:
                max_reward,action = new_reward,new_action
            elif (i==4)and (max_reward==-1): #if the greedy action makes no progress, we select a random value
                action = random.choice(possible_actions)
        final_state = world.step(current_state,action)[0]
        while(final_state == current_state):
            # this while loop ensures that the steps where the agent stays in the same cell is skipped
            # this would improve space complexity
            action = random.choice(possible_actions)
            final_state = world.step(current_state,action)[0]
    else:
        action = random.choice(possible_actions)
        final_state = world.step(current_state,action)[0]
        while(final_state == current_state):
            # this while loop ensures that the steps where the agent stays in the same cell is skipped
            # this would improve space complexity
            action = random.choice(possible_actions)
            final_state = world.step(current_state,action)[0]
    return action

A FUNCTION TO PERFORM THE ACTION

In [59]:
def performAction(state,action):
    return world.step(state,action)

THE MODEL TRAINING FUNCTION

In [60]:
def training(current_state,training_iter):
    print("Starting state is ",current_state)
    for i in range(training_iter):
        if current_state != world.GOAL : #unless goal reached
            #accessing the global values from the initializations cell
            global states
            global learning_rate
            global e_value
            global state_values
            new_action = whichAction(current_state,e_value) #selects new action
            new_state = performAction(current_state,new_action)[0] #performs the new action
            states.append(new_state) 
            #printing the performed action
            print("In iteration number ",i+1,": ")
            print("Now the agent has reached ",new_state," by taking the action ",new_action)
            
            current_state = new_state
        else: # if goal reached
            print("GAME OVER!")
            #backtracing the values
            reverse_states = list(reversed(states))
            reward = 0
            state_values[world.GOAL[0]][world.GOAL[1]] =reward
            for state in reverse_states:
                if state==world.GOAL:
                    continue
                print("updating ",[state[0],state[1]],end=" ")
                prev_state = reverse_states[reverse_states.index(state)-1]
                # applying the temporal difference formula TD(0)
                # v(s)⇐v(s)+α(r+γv(s')-v(s)) 
                reward = state_values[state[0]][state[1]] 
                + learning_rate*(reward + discount_factor*state_values[prev_state[0]][prev_state[1]] - state_values[state[0]][state[1]])
                state_values[state[0]][state[1]] = reward
                print("to ",reward)
            states = [] # RESET
            break

THIS IS DONE FOR SIMPLICITY TO CHECK WHAT IF THE AGENT STARTS FROM SOMEWHERE NEAR THE GOAL:

In [61]:
training([9,9],10)

Starting state is  [9, 9]
In iteration number  1 : 
Now the agent has reached  [8, 9]  by taking the action  0
In iteration number  2 : 
Now the agent has reached  [8, 10]  by taking the action  3
In iteration number  3 : 
Now the agent has reached  [8, 11]  by taking the action  3
GAME OVER!
updating  [8, 10] to  -0.999
updating  [8, 9] to  -1.000998


A POLICY IS CREATED

In [63]:
for i in range(world.WORLD_HEIGHT):
    for j in range(world.WORLD_WIDTH):
        print(state_values[i][j],end="      ")
    print()

-1      -1      -1      -1      -1      -1      -1      -1      -1      -1      -1      -1      -1      -1      -1      
-1      -1      -1      -1      -1      -1      -1      -1      -1      -1      -1      -1      -1      -1      -1      
-1      -1      -1      X      -1      -1      -1      -1      -1      -1      -1      -1      -1      -1      -1      
-1      -1      -1      X      -1      -1      -1      -1      -1      -1      -1      -1      -1      -1      -1      
-1      -1      -1      -1      -1      -1      -1      -1      -1      -1      -1      -1      -1      -1      -1      
-1      -1      -1      -1      -1      -1      -1      -1      -1      -1      -1      -1      -1      -1      -1      
-1      -1      -1      -1      -1      -1      -1      -1      -1      -1      -1      -1      -1      -1      -1      
-1      -1      -1      X      -1      -1      -1      -1      X      -1      -1      -1      -1      -1      -1      
-1      -1      -1      X      -1   

In [45]:
states=[]
learning_rate=0.001
e_value=0.2
discount_factor = 1
state_values = [['X' if [j,i] in world.obstacles else -1 for i in range(world.WORLD_WIDTH)] for j in range(world.WORLD_HEIGHT)]
state_values

[[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1],
 [-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1],
 [-1, -1, -1, 'X', -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1],
 [-1, -1, -1, 'X', -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1],
 [-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1],
 [-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1],
 [-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1],
 [-1, -1, -1, 'X', -1, -1, -1, -1, 'X', -1, -1, -1, -1, -1, -1],
 [-1, -1, -1, 'X', -1, -1, -1, -1, 'X', -1, -1, -1, -1, -1, -1],
 [-1, -1, -1, 'X', -1, -1, -1, -1, 'X', -1, -1, -1, -1, -1, -1]]

FOR OUR ACTUAL GRID 1 MILLION ITERATIONS REQUIRED ! (SOMEWHAT LESS THAN THAT)

In [47]:
training(world.START,1000000)

Starting state is  [6, 1]
In iteration number  1 : 
Now the agent has reached  [5, 1]  by taking the action  0
In iteration number  2 : 
Now the agent has reached  [5, 0]  by taking the action  2
In iteration number  3 : 
Now the agent has reached  [4, 0]  by taking the action  0
In iteration number  4 : 
Now the agent has reached  [3, 0]  by taking the action  0
In iteration number  5 : 
Now the agent has reached  [2, 0]  by taking the action  0
In iteration number  6 : 
Now the agent has reached  [1, 0]  by taking the action  0
In iteration number  7 : 
Now the agent has reached  [0, 0]  by taking the action  0
In iteration number  8 : 
Now the agent has reached  [1, 0]  by taking the action  1
In iteration number  9 : 
Now the agent has reached  [0, 0]  by taking the action  0
In iteration number  10 : 
Now the agent has reached  [1, 0]  by taking the action  1
In iteration number  11 : 
Now the agent has reached  [0, 0]  by taking the action  0
In iteration number  12 : 
Now the ag

In [48]:
for i in range(world.WORLD_HEIGHT):
    for j in range(world.WORLD_WIDTH):
        print(state_values[i][j],end=" ")
    print()

-8.51850411751016 -8.53125052449471 -6.507007822223544 -3.9470255306788875 -2.661278644955883 -2.0818749917310453 -1.778173257177081 -1.6105010687667756 -1.404385740851885 -1.2955669252907587 -1.1985023765228586 -1.1356917782213811 -1.100383139690948 -1.0710765229225974 -1.0444891600362909 
-13.168785251870801 -13.493809980431342 -12.995428710879768 -4.634491668334458 -2.8796137061425093 -2.195967529934447 -1.8911164579429969 -1.7033081324092842 -1.529995227794249 -1.3362961779011349 -1.206737942468598 -1.1334967835392968 -1.1147348671436157 -1.0932622267209138 -1.0577299283866326 
-23.503490750823627 -24.73599077446877 -17.466265049429225 X -1.7914460578434739 -1.72714227341347 -1.6542108450470847 -1.5366616596537424 -1.4015000406994822 -1.2457775094594377 -1.1399394496149997 -1.0795163298835821 -1.0686384319535218 -1.0659488093746816 -1.0408072019593917 
-50.561438560429636 -33.604542489821974 -17.926116052903925 X -1.500222606960636 -1.42419291680767 -1.4030685114856307 -1.281279987