In [1]:
import numpy as np
import matplotlib.pyplot as plt

In [2]:
class WindyGridWorld:
    
    def __init__(self, n, terminal_state, windy_blocks, on_policy = 'on-policy', verbose=False):
        
        self.grid_size = n
        self.terminal_state = terminal_state
        self.windy_blocks = windy_blocks
        self.on_policy = on_policy
        self.verbose = verbose
        self.gamma = 0.1
        
        self.Q = np.zeros((self.grid_size, self.grid_size, 4)) + 0.25
        
        self.actions = [(0,-1), (0, 1), (-1,0), (1, 0)]# Left, Right, Up, Down

        #Equi-probable action selection at each state for all actions
        self.policy = np.random.randint(1,4,size = (self.grid_size, self.grid_size))
        
        print("Initial State-Action/Q Function", *self.Q, sep='\n',end='\n\n')
        print("Initial Policy", *self.policy, sep='\n',end='\n\n')
        
        if self.verbose:
            print("Shape of Q", self.Q.shape)
            print("Shape of Policy", self.policy.shape)
        
    def _get_reward(self, x, y):
        if x==self.terminal_state[0] and y==self.terminal_state[1]:
            return 3
        return -1
    
    def _get_next_state(self, x, y, a):
        
        if x+a[0]!=-1 and x+a[0]!=self.grid_size:
            x = x + a[0]
        if y+a[1]!=-1 and y+a[1]!=self.grid_size:
            y = y + a[1]
        if x in self.windy_blocks[0] and y in self.windy_blocks[1]:
            if x-1!=-1:
                x-=1
        return x,y
        
    def play(self, epochs = 10, threshold = 0.1):
        
        if self.on_policy == 'on-policy':
            res = self._play_on_policy(epochs, threshold)
            print("Final Policy:")
            print(*self.policy, sep = '\n', end = '\n\n')
            print("Final State-Action/Q Function:")
            print(*self.Q, sep = '\n', end = '\n\n')
            return res
        
        elif self.on_policy == 'off-policy':
            res = self._play_off_policy(epochs, threshold)
            print("Final Policy:")
            print(*self.policy, sep = '\n', end = '\n\n')
            print("Final State-Action/Q Function:")
            print(*self.Q, sep = '\n', end = '\n\n')
            return res
        
        elif self.on_policy == 'expected':
            res = self._play_expected_sarsa(epochs, threshold)
            print("Final Policy:")
            print(*self.policy, sep = '\n', end = '\n\n')
            print("Final State-Action/Q Function:")
            print(*self.Q, sep = '\n', end = '\n\n')
            return res
        
        elif self.on_policy == 'double':
            res = self._play_double_policy(epochs, threshold)
            print("Final Policy:")
            print(*self.policy, sep = '\n', end = '\n\n')
            print("Final State-Action/Q Function:")
            print(*self.Q, sep = '\n', end = '\n\n')
            return res
        
    def _get_action(self, x,y):
        
        m = np.round(np.min(self.Q[x][y])-1, 3)
        probs = (np.round(self.Q[x][y], 5)-m) / np.sum(np.round(self.Q[x][y], 5) - m)
        try:
            
            return np.random.choice([0,1,2,3],p=probs)
        except:
            print(m,probs)
            
    def _get_action_double(self, x,y):
        
        m = np.round(np.min(self.Q[x][y] + self.Q1[x][y])-1, 3)
        probs = (np.round(self.Q[x][y] + self.Q1[x][y], 5)-m) / np.sum(np.round(self.Q[x][y] + self.Q1[x][y], 5) - m)
        try:
            
            return np.random.choice([0,1,2,3],p=probs)
        except:
            print(m,probs)
    
    
    def _play_on_policy(self, epochs, threshold):
        
        iters = 0
        while True:
            
            iters+=1
            x=0
            y=0
            diff = 0
            
            while True:
                
                    action = self._get_action(x,y)
                    new_x, new_y = self._get_next_state(x,y,self.actions[action])
                    reward = self._get_reward(new_x, new_y)

                    if self.verbose:
                        print([x,y,action,reward])
                    
                    self.Q[x,y,action] = self.Q[x,y,action] + 0.1*(reward + self.gamma*self.Q[new_x, new_y, action] - self.Q[x,y,action])
                    
                    self.policy[x,y] = np.argmax(self.Q[x][y])
                    
                    x, y = new_x, new_y
                    
                    if reward>0:
                        break

            if self.verbose:
                print("Q function at iter:", iters)
                print(*self.Q, sep = '\n', end = '\n\n')
            
            if self.verbose:
                print("Policy at iter:", iters)
                print(*self.policy, sep = '\n', end = '\n\n')
            
            if epochs==iters:
                return diff, iters

    def _play_off_policy(self, epochs, threshold):
        
        iters = 0
        while True:
            
            iters+=1
            x=0
            y=0
            diff = 0
            
            while True:
                
                    action = self._get_action(x,y)
                    new_x, new_y = self._get_next_state(x,y,self.actions[action])
                    reward = self._get_reward(new_x, new_y)

                    if self.verbose:
                        print([x,y,action,reward])
                    
                    self.Q[x,y,action] = self.Q[x,y,action] + 0.1*(reward + self.gamma * np.max(self.Q[new_x, new_y]) - self.Q[x,y,action])
                    
                    self.policy[x,y] = np.argmax(self.Q[x][y])
                    
                    x, y = new_x, new_y
                    
                    if reward>0:
                        break

            if self.verbose:
                print("Q function at iter:", iters)
                print(*self.Q, sep = '\n', end = '\n\n')
            
            if self.verbose:
                print("Policy at iter:", iters)
                print(*self.policy, sep = '\n', end = '\n\n')
            
            if epochs==iters:
                return diff, iters
            
    def _play_expected_sarsa(self, epochs, threshold):
        
        iters = 0
        while True:
            
            iters+=1
            x=0
            y=0
            diff = 0
            
            while True:
                
                    action = self._get_action(x,y)
                    new_x, new_y = self._get_next_state(x,y,self.actions[action])
                    reward = self._get_reward(new_x, new_y)

                    if self.verbose:
                        print([x,y,action,reward])
                    
                    m = np.round(np.min(self.Q[x][y])-1, 3)
                    probs = (np.round(self.Q[x][y], 5)-m) / np.sum(np.round(self.Q[x][y], 5) - m)
                    self.Q[x,y,action] = self.Q[x,y,action] + 0.1*(reward + self.gamma * np.sum(self.Q[new_x, new_y] * probs) - self.Q[x,y,action])
                    
                    self.policy[x,y] = np.argmax(self.Q[x][y])
                    
                    x, y = new_x, new_y
                    
                    if reward>0:
                        break

            if self.verbose:
                print("Q function at iter:", iters)
                print(*self.Q, sep = '\n', end = '\n\n')
            
            if self.verbose:
                print("Policy at iter:", iters)
                print(*self.policy, sep = '\n', end = '\n\n')
            
            if epochs==iters:
                return diff, iters
            
    def _play_double_policy(self, epochs, threshold):
        
        iters = 0
        self.Q1 = np.zeros((self.grid_size, self.grid_size, 4)) + 0.25
        while True:
            
            iters+=1
            x=0
            y=0
            diff = 0
            
            while True:
                
                    action = self._get_action_double(x,y)
                    new_x, new_y = self._get_next_state(x,y,self.actions[action])
                    reward = self._get_reward(new_x, new_y)

                    if self.verbose:
                        print([x,y,action,reward])
                    
                    if np.random.binomial(1,0.5,1):
                        self.Q[x,y,action] = self.Q[x,y,action] + 0.1*(reward + self.gamma * self.Q1[new_x, new_y, np.argmax(self.Q[new_x, new_y])] - self.Q[x,y,action])
                    else:
                        self.Q1[x,y,action] = self.Q1[x,y,action] + 0.1*(reward + self.gamma * self.Q[new_x, new_y, np.argmax(self.Q1[new_x, new_y])] - self.Q1[x,y,action])
                        
                    self.policy[x,y] = np.argmax(self.Q[x][y] + self.Q1[x][y])
                    
                    x, y = new_x, new_y
                    
                    if reward>0:
                        break

            if self.verbose:
                print("Q function at iter:", iters)
                print(*self.Q, sep = '\n', end = '\n\n')
            
            if self.verbose:
                print("Policy at iter:", iters)
                print(*self.policy, sep = '\n', end = '\n\n')
            
            if epochs==iters:
                return diff, iters

# On Policy

In [3]:
blocks = [[1,2,3],[2]]
game = WindyGridWorld(4, (1,3), windy_blocks = blocks, on_policy = 'on-policy', verbose=False)
print(game.play(epochs=10000, threshold=0.00000000000001))

Initial State-Action/Q Function
[[0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]]
[[0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]]
[[0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]]
[[0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]]

Initial Policy
[2 2 3 3]
[3 1 1 3]
[3 1 3 3]
[2 2 2 1]

Final Policy:
[0 0 0 3]
[0 0 1 3]
[1 1 1 2]
[1 1 2 2]

Final State-Action/Q Function:
[[-1.11111111 -1.11111111 -1.11111111 -1.11111111]
 [-1.11111111 -1.11111111 -1.11111111 -1.11111111]
 [-1.11111111 -1.11111111 -1.11111111 -1.11111111]
 [-1.11111111 -1.11111111 -1.11111111  3.025     ]]
[[-1.11111111 -1.11111111 -1.11111111 -1.11111111]
 [-1.11111111 -1.11111111 -1.11111111 -1.11111111]
 [-1.11111111  3.025      -1.11111111 -1.11111111]
 [ 0.25        0.25        0.25        0.25      ]]
[[-1.11111111 -1.06975    -1.11111111 -1.11

# Off-Policy

In [4]:
blocks = [[1,2,3],[2]]
game = WindyGridWorld(4, (1,3), windy_blocks = blocks, on_policy = 'off-policy', verbose=False)
print(game.play(epochs=10000, threshold=0.00000000000001))

Initial State-Action/Q Function
[[0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]]
[[0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]]
[[0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]]
[[0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]]

Initial Policy
[2 3 1 2]
[3 3 3 1]
[3 3 1 1]
[2 1 3 2]

Final Policy:
[1 1 1 3]
[1 1 1 1]
[1 1 1 2]
[1 2 3 2]

Final State-Action/Q Function:
[[-1.1106975 -1.106975  -1.1106975 -1.1106975]
 [-1.1106975 -1.06975   -1.106975  -1.106975 ]
 [-1.106975  -0.6975    -1.06975   -1.06975  ]
 [-1.06975   -0.6975    -0.6975     3.025    ]]
[[-1.1106975 -1.106975  -1.1106975 -1.106975 ]
 [-1.1106975 -1.06975   -1.106975  -1.06975  ]
 [-1.106975   3.025     -1.06975   -0.6975   ]
 [ 0.25       0.25       0.25       0.25     ]]
[[-1.106975   -1.06975    -1.1106975  -1.1106975 ]
 [-1.106975   -0.6975   

### Expected SARSA

In [5]:
blocks = [[1,2,3],[2]]
game = WindyGridWorld(4, (1,3), windy_blocks = blocks, on_policy = 'expected', verbose=False)
print(game.play(epochs=10000, threshold=0.00000000000001))

Initial State-Action/Q Function
[[0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]]
[[0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]]
[[0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]]
[[0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]]

Initial Policy
[3 2 3 2]
[3 1 1 2]
[3 2 1 2]
[1 2 2 3]

Final Policy:
[1 1 1 3]
[1 1 1 2]
[1 1 1 2]
[1 2 2 2]

Final State-Action/Q Function:
[[-1.11110882 -1.11102985 -1.11110882 -1.11110518]
 [-1.11110881 -1.10809693 -1.1110297  -1.11095893]
 [-1.11102389 -0.99734621 -1.10787627 -1.10787627]
 [-1.10922278 -0.85786578 -0.85786578  3.025     ]]
[[-1.11110518 -1.11095904 -1.11110882 -1.11103425]
 [-1.11110517 -1.10809883 -1.11102974 -1.10812804]
 [-1.11088125  3.025      -1.1040668  -0.85428082]
 [ 0.25        0.25        0.25        0.25      ]]
[[-1.11103409 -1.1081259  -1.11110518 -1.11

### Double Q-Learning

In [6]:
blocks = [[1,2,3],[2]]
game = WindyGridWorld(4, (1,3), windy_blocks = blocks, on_policy = 'double', verbose=False)
print(game.play(epochs=10000, threshold=0.00000000000001))

Initial State-Action/Q Function
[[0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]]
[[0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]]
[[0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]]
[[0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]]

Initial Policy
[3 2 3 2]
[1 1 3 2]
[2 2 3 1]
[3 3 1 2]

Final Policy:
[1 1 1 3]
[1 1 1 2]
[1 1 1 2]
[1 2 1 2]

Final State-Action/Q Function:
[[-1.1106975 -1.106975  -1.1106975 -1.1106975]
 [-1.1106975 -1.06975   -1.106975  -1.106975 ]
 [-1.106975  -0.6975    -1.06975   -1.06975  ]
 [-1.06975   -0.6975    -0.6975     3.025    ]]
[[-1.1106975  -1.106975   -1.1106975  -1.106975  ]
 [-1.1106975  -1.06975    -1.106975   -1.06975   ]
 [-1.10697487  3.025      -1.06974995 -0.6975    ]
 [ 0.25        0.25        0.25        0.25      ]]
[[-1.106975   -1.06975    -1.1106975  -1.1106975 ]
 [-1.106