# Windy City TD 
SARSA on-policy control

This notebook works the the problem of on-policy TD control for estimating Q ≈ q<sub>*</sub> 
    
We apply this towards the windy city grid world problem. The objective is to get from the start square to reach the terminal state. The grid world will have columns that contain a 'wind' effect where transitions into that state will push them 'up' or 'north' a specified amount of units affecting the traversal of the 'map'.
    
Objects to create:
- environment to simulate the 'windy city'
- entity to simulate the traversal thru the paths
    
![image.png](attachment:image.png)

In [1]:
import matplotlib
import matplotlib.pyplot as plt
from matplotlib.table import Table
import numpy as np

In [2]:
#define environment object
class environment():
    def __init__(self, dims = [7,10], start = [3,0] , terminal = [3,7], reward = -1, windy_cols = [0,0,0,1,1,1,2,2,1,0]):
        """
        this class initializes the environment
        
        inputs
        dims - a scalar to define a dxd grid environment
        terminals - a list of lists of ordered pairs representing the location in the grid of the terminal states
        reward -  the reward value for moving one space (unless in a terminal state)
        
        self defined variables
        actions - determines actions that can be taken in the environment where 0 = left, 1 = up, 2 = right, 3 = down
        
        returns
        None
        
        cheating here as i did not do checks on things for integrity...
        """
        self.dims = dims
        self.start = start
        self.terminal = terminal
        self.reward = reward
        self.windy_cols = windy_cols
        self.actions = [np.array([0, -1]),
                       np.array([-1, 0]),
                       np.array([0, 1]),
                       np.array([1, 0])]
            
    def is_terminal(self,i,j):
        """
        This function checks if coordinates are for a terminal state
        
        inputs
        i = y coord (row)
        j = x coord (col)
        
        returns
        boolean on terminal status
        """
        #print("episode end")
        return [i,j] == self.terminal
        
        
    def next_state(self,i,j,a):
        """
        this function determines the next state and the reward received
        This function also checks if current state is terminal, or next state is terminal
        This function will also bounce back to the original coordinates if it is an edge/border square in the grid
        
        inputs
        i = y coord (row)
        j = x coord (col)
        a = action taken where 0 = left, 1 = up, 2 = right, 3 = down
        
        returns
        i_next, j_next representing next state coordinates
        reward received for moving
        """
        # check if current state is terminal, if so return current state and reward = 0
        #i = row, j = col
        if self.is_terminal(i,j):
            return i, j, 0
        
        # get next state
        next = [i,j] + self.actions[a]     
        
             
        #apply wind
        windy = 0
        if self.windy_cols[j] != 0:
            windy = 1
            next[0] = next[0] - self.windy_cols[next[1]]         
        
        #check if next state is outside boundries of the grid
        if windy == 0:
            if next[0] < 0 or next[1] <0:
                return i, j, self.reward
            elif next[0] > (self.dims[0]-1) or next[1] > (self.dims[1]-1):
                return i, j, self.reward
            else:
                return next[0], next[1], self.reward  
        else:
            if next[0] < 0:
                return i, next[1], self.reward
            else:
                return next[0], next[1], self.reward 


In [3]:
class entity():
    def __init__(self, state_action_values, environment, algo_param_alpha = 0.25, gamma = 0.5, epsilon = 0.05, start = [3,0]):
        """
        This class initializes the entity which will 
        walk thru the grid world and collect data
        this follows a puerly greedy policy atm
        
        inputs
        rand_policy = policy ot follow if not initialized
        state_action_values - calculated state-action values for all states except the terminal state
        algo_param_alpha - step size to be used
        
        """
        
        self.state_action_values = state_action_values
        self.algo_param_alpha = algo_param_alpha
        self.gamma = gamma
        self.environment = environment
        self.start_location = start
        self.current_location = start
        self.epsilon = epsilon

                
    def pick_action(self, state_action_values):
        #need to fix this asap
        
        #simple pick action based in random policy to go any direction with equal probability
        greedy_action = np.argmax(state_action_values)
        #print(greedy_action)
        
        action = greedy_action
        
        if self.epsilon == 0:
            return action
        
        if np.random.binomial(1, self.epsilon, 1):
            temp_list = [0,1,2,3]
            #print(temp_list)
            temp_list.pop(greedy_action)
            #print(temp_list)
            action = np.random.choice(temp_list)
            #print(action)
        
        return action
    
    def take_action(self, action):
        next_i, next_j, reward = self.environment.next_state(self.current_location[0], self.current_location[1], action)
        return next_i, next_j, reward
    
    def restart_walk(self):
        self.current_location = self.start_location
    
    def take_a_walk(self):
        
        self.restart_walk()
        
        terminal = False
        
        #choose first action according to e-greedy
        i, j = self.current_location[0], self.current_location[1]
        state_action_values = self.state_action_values[(i,j)]
        action = self.pick_action(state_action_values)
        next_i, next_j, reward = self.take_action(action)
        
        while not terminal:

            state_action_values = self.state_action_values[(next_i, next_j)]
            next_action = self.pick_action(state_action_values)
            next_action_value = state_action_values[next_action]
            
            #update last state value
            self.state_action_values[(i,j)][action] = \
            (self.state_action_values[(i,j)][action] + \
            (self.algo_param_alpha * \
            (reward + \
            (self.gamma * next_action_value) - \
            self.state_action_values[(i,j)][action])))
             
            #update location
            self.current_location = [next_i, next_j] 
            i, j = next_i, next_j
            
            #update to iterate 
            action = next_action
            next_i, next_j, reward = self.take_action(action)
            
            
            if reward == 0:
                terminal = True
                
    def policy_walk(self):
        self.restart_walk()
        
        terminal = False
        
        action_dict = {0 : "left", 1 : "up", 2 : "right", 3 : "down"}
        
        number_of_steps = 0
        
        orig_epsilon = self.epsilon
        
        self.epsilon = 0
        
        while not terminal:
            print("current state is:", self.current_location)
            print("current step is:", number_of_steps)
            state_action_values = self.state_action_values[(self.current_location[0],self.current_location[1])]
            
            action = self.pick_action(state_action_values)
            print("next action is:", action_dict[action])
            
            next_i, next_j, reward = self.take_action(action)
            
            #update location
            self.current_location = [next_i, next_j] 
            
            number_of_steps = number_of_steps + 1
            
            if number_of_steps > 100:
                print("walk failure")
                reward = 0
            
            if reward == 0:
                print("you have reached the terminal state in ", number_of_steps, " steps!")
                self.epsilon = orig_epsilon                      
                terminal = True          
                
    def give_state_action_values(self):
        return self.state_action_values
    

In [4]:
#step size
algo_param_alpha = 0.25

In [5]:
#dimensions
dims = [7,10]

In [6]:
#arbitrary state_action_values
#using a dict because look up is much easier then
state_action_values = {(None,None,None):0}

for i in range(dims[0]):
    for j in range(dims[1]):
        state_action_values[(i,j)] = [0,0,0,0]
        
state_action_values.pop((None,None,None))
            
#state_action_values

0

In [7]:
environment1 = environment()
entity1 = entity(state_action_values, environment1)

In [8]:
number_of_episodes = 1000

for episode in range(number_of_episodes):
    #if episode%100 == 0:
        #print("episode #:", episode)
    entity1.take_a_walk()
    
state_action_values = entity1.give_state_action_values()

policy = np.zeros((dims[0],dims[1],4))

for key in state_action_values:
    i =  key[0]
    j = key[1]
    policy[i,j] = state_action_values[key]
    


In [9]:
state_action_values

{(0, 0): [-1.9999485922787879,
  -1.9999503625615827,
  -1.9999485432221986,
  -1.9999539389169119],
 (0, 1): [-1.999939061322225,
  -1.9999365209044877,
  -1.9999280402288542,
  -1.9999404983826925],
 (0, 2): [-1.999901811271719,
  -1.9999095158247866,
  -1.9998957293026394,
  -1.9998960820416738],
 (0, 3): [-1.9998586881188531,
  -1.9998105433595605,
  -1.999806749280292,
  -1.9998538766445098],
 (0, 4): [-1.9997177519038638,
  -1.9997326066145835,
  -1.9996683399703445,
  -1.9997408787429278],
 (0, 5): [-1.999671629647038,
  -1.9996046055019885,
  -1.9993326689246176,
  -1.9995375223091636],
 (0, 6): [-1.9994273785356669,
  -1.998964723466449,
  -1.9986139923560367,
  -1.9991598512377335],
 (0, 7): [-1.9986185648998485,
  -1.998095703548619,
  -1.9970272169012597,
  -1.9983198850011563],
 (0, 8): [-1.9985056343002987,
  -1.996764803071235,
  -1.99378694711906,
  -1.9968443443749107],
 (0, 9): [-1.9978201088669096,
  -1.9940017774464187,
  -1.9931969687313846,
  -1.9870812219972174],

In [10]:
#optimal steps for this problem is 15, book notes 17 is the average on their solution
#this method sets epsilon to 0 and uses the q values to do a pure exploitation.
print("doing policy walk")
entity1.policy_walk()

doing policy walk
current state is: [3, 0]
current step is: 0
next action is: right
current state is: [3, 1]
current step is: 1
next action is: right
current state is: [3, 2]
current step is: 2
next action is: right
current state is: [3, 3]
current step is: 3
next action is: right
current state is: [2, 4]
current step is: 4
next action is: right
current state is: [1, 5]
current step is: 5
next action is: right
current state is: [1, 6]
current step is: 6
next action is: right
current state is: [1, 7]
current step is: 7
next action is: right
current state is: [0, 8]
current step is: 8
next action is: right
current state is: [0, 9]
current step is: 9
next action is: down
current state is: [1, 9]
current step is: 10
next action is: down
current state is: [2, 9]
current step is: 11
next action is: down
current state is: [3, 9]
current step is: 12
next action is: down
current state is: [4, 9]
current step is: 13
next action is: down
current state is: [5, 9]
current step is: 14
next action is

In [11]:
#let's see if we can improve this with another 10k steps?
number_of_episodes = 10000

for episode in range(number_of_episodes):
    #if episode%1000 == 0:
        #print("episode #:", episode)
    entity1.take_a_walk()
    
state_action_values = entity1.give_state_action_values()

policy = np.zeros((dims[0],dims[1],4))

for key in state_action_values:
    i =  key[0]
    j = key[1]
    policy[i,j] = state_action_values[key]

In [12]:
state_action_values

{(0, 0): [-1.9999689311005138,
  -1.9999614769866956,
  -1.9999623971984168,
  -1.9999674686755562],
 (0, 1): [-1.9999485383562827,
  -1.999950436531828,
  -1.9999490289478135,
  -1.9999489798352177],
 (0, 2): [-1.999918920751626,
  -1.9999367558090952,
  -1.999909730109978,
  -1.9999213901492008],
 (0, 3): [-1.9998586881188531,
  -1.9998847364606358,
  -1.999849478630013,
  -1.9998862614591024],
 (0, 4): [-1.999901946958802,
  -1.9998210644137036,
  -1.9996525417637219,
  -1.9998101268684458],
 (0, 5): [-1.9998254060240872,
  -1.999624454804096,
  -1.9993390246013347,
  -1.9996616873383533],
 (0, 6): [-1.9996738233526687,
  -1.9992843790776194,
  -1.9984056608686225,
  -1.9992344130598327],
 (0, 7): [-1.9994074655056067,
  -1.998428210486932,
  -1.9968038955108156,
  -1.9985522614950468],
 (0, 8): [-1.9987398833935175,
  -1.9969234585730309,
  -1.9948321133429217,
  -1.9968748496994095],
 (0, 9): [-1.9974285677037098,
  -1.9936325389518634,
  -1.9937881568246036,
  -1.9904812650233938

In [13]:
#optimal steps for this problem is 15, book notes 17 is the average on their solution
#this method sets epsilon to 0 and uses the q values to do a pure exploitation.
print("doing policy walk")
entity1.policy_walk()

doing policy walk
current state is: [3, 0]
current step is: 0
next action is: right
current state is: [3, 1]
current step is: 1
next action is: right
current state is: [3, 2]
current step is: 2
next action is: right
current state is: [3, 3]
current step is: 3
next action is: right
current state is: [2, 4]
current step is: 4
next action is: right
current state is: [1, 5]
current step is: 5
next action is: right
current state is: [1, 6]
current step is: 6
next action is: right
current state is: [1, 7]
current step is: 7
next action is: right
current state is: [0, 8]
current step is: 8
next action is: right
current state is: [0, 9]
current step is: 9
next action is: down
current state is: [1, 9]
current step is: 10
next action is: down
current state is: [2, 9]
current step is: 11
next action is: down
current state is: [3, 9]
current step is: 12
next action is: down
current state is: [4, 9]
current step is: 13
next action is: down
current state is: [5, 9]
current step is: 14
next action is