https://github.com/MohammadAsadolahi/Reinforcement-Learning-solving-a-simple-4by4-Gridworld-using-policy-iteration-in-python/blob/main/README.md

In [4]:
import numpy as np

class GridWorld:
    """A 4x4 grid world with a fixed start cell and one terminal cell.

    Layout (row, column), with (0, 0) in the top-left corner:

        S O O O
        O O O *
        O * O O
        O * O T

    ``S`` is the start cell returned by :meth:`reset`, ``T`` is the terminal
    cell (3, 3) and the cells marked ``*`` carry a negative reward in
    ``self.rewards``. Every cell not listed in ``self.rewards`` yields 0.

    Attributes:
        actionSpace: All action labels: 'U', 'D', 'L', 'R'.
        actions: Maps each non-terminal state to the actions that keep the
            agent inside the grid. A state absent from this mapping is
            treated as terminal.
        rewards: Maps a state to the reward received on entering it.
        explored: Number of exploratory (random) choices made so far.
        exploited: Number of policy-following choices made so far.
    """

    # Row/column offset applied by each action label.
    _MOVES = {'U': (-1, 0), 'D': (1, 0), 'L': (0, -1), 'R': (0, 1)}

    def __init__(self):
        self.actionSpace = ('U', 'D', 'L', 'R')
        self.actions = {
            (0, 0): ('D', 'R'),
            (0, 1): ('L', 'D', 'R'),
            (0, 2): ('L', 'D', 'R'),
            (0, 3): ('L', 'D'),
            (1, 0): ('U', 'D', 'R'),
            (1, 1): ('U', 'L', 'D', 'R'),
            (1, 2): ('U', 'L', 'D', 'R'),
            (1, 3): ('U', 'L', 'D'),
            (2, 0): ('U', 'D', 'R'),
            (2, 1): ('U', 'L', 'D', 'R'),
            (2, 2): ('U', 'L', 'D', 'R'),
            (2, 3): ('U', 'L', 'D'),
            (3, 0): ('U', 'R'),
            (3, 1): ('U', 'L', 'R'),
            (3, 2): ('U', 'L', 'R'),
        }
        self.rewards = {
            (3, 3): 0.03,
            (1, 3): -0.01,
            (2, 1): -0.011,
            (3, 1): -0.01,
        }
        self.explored = 0
        self.exploited = 0

    def getRandomPolicy(self):
        """Return a policy that maps every non-terminal state to a random legal action.

        Actions are stored as plain ``str`` (not ``numpy.str_``) so that random
        policies and greedy policies hold values of the same type.
        """
        return {
            state: str(np.random.choice(legal))
            for state, legal in self.actions.items()
        }

    def reset(self):
        """Return the start state, (0, 0)."""
        return (0, 0)

    def is_terminal(self, s):
        """Return True when ``s`` has no entry in ``self.actions``."""
        return s not in self.actions

    def getNewState(self, state, action):
        """Return the (row, column) reached by applying ``action`` to ``state``.

        No bounds checking is done here; legality is encoded in
        ``self.actions``. An unrecognised action leaves the position unchanged.
        """
        row, column = state
        rowStep, columnStep = self._MOVES.get(action, (0, 0))
        # int() keeps the result a tuple of built-in ints even if the caller
        # passed numpy integers.
        return int(row) + rowStep, int(column) + columnStep

    def chooseAction(self, state, policy, exploreRate):
        """Pick an action epsilon-greedily and update the usage counters.

        With probability ``exploreRate`` a random legal action for ``state``
        is returned and ``self.explored`` is incremented; otherwise
        ``policy[state]`` is returned and ``self.exploited`` is incremented.
        """
        if np.random.rand() < exploreRate:
            self.explored += 1
            return str(np.random.choice(self.actions[state]))
        self.exploited += 1
        return policy[state]

    def greedyChoose(self, state, values):
        """Return the legal action whose successor state has the highest value.

        Successors missing from ``values`` count as 0. Ties go to the action
        listed first in ``self.actions[state]``.
        """
        possibleActions = self.actions[state]
        successorValues = [
            values.get(self.getNewState(state, action), 0)
            for action in possibleActions
        ]
        return possibleActions[int(np.argmax(successorValues))]

    def move(self, state, policy, exploreRate):
        """Take one epsilon-greedy step from ``state``.

        Returns:
            A ``(newState, reward)`` pair, where ``reward`` is looked up for
            the state being entered and defaults to 0.
        """
        action = self.chooseAction(state, policy, exploreRate)
        newState = self.getNewState(state, action)
        return newState, self.rewards.get(newState, 0)

    def _printGrid(self, mapping, separator):
        """Print the values of ``mapping`` four per row, each row followed by ``separator``.

        Rows follow the mapping's insertion order. Nothing is printed for an
        empty trailing row, so a mapping whose size is a multiple of four does
        not end with a blank line and a stray separator.
        """
        cells = [f" | {value} | " for value in mapping.values()]
        for start in range(0, len(cells), 4):
            print("".join(cells[start:start + 4]))
            print(separator)

    def printValues(self, values):
        """Print a state-value table as rows of four cells."""
        self._printGrid(values, "--------------------------------")

    def printPolicy(self, policy):
        """Print a policy as rows of four action labels."""
        self._printGrid(policy, "----------------------------")


In [5]:
# Training driver: alternate between estimating state values by sampling
# episodes under the current policy and making the policy greedy with respect
# to those estimates.
enviroment = GridWorld()
policy = enviroment.getRandomPolicy()
# enviroment.printPolicy(policy)

# example optimal policy = {(0, 0): 'R', (0, 1): 'R', (0, 2): 'D', (0, 3): 'D', (1, 0): 'R', (1, 1): 'D', (1, 2): 'D', (1, 3): 'D',
#           (2, 0): 'R', (2, 1): 'D', (2, 2): 'R', (2, 3): 'D', (3, 0): 'R', (3, 1): 'R', (3, 2): 'R'}

for i in range(1001):
    # Value estimates are rebuilt from scratch on every outer iteration:
    # 0 for each state in the policy, 5 for the terminal cell (3, 3).
    values = dict.fromkeys(policy, 0)
    values[(3, 3)] = 5

    # Evaluation: 1000 episodes from the start state, each capped at 50 steps.
    for j in range(1000):
        state = enviroment.reset()
        stepCounts = 0
        while stepCounts < 50 and not enviroment.is_terminal(state):
            nextState, reward = enviroment.move(state, policy, exploreRate=0.05)
            # One-step backup from the sampled successor, discount factor 0.1.
            values[state] = reward + 0.1 * values[nextState]
            state = nextState
            stepCounts += 1

    # Improvement: point every state at its highest-valued neighbour.
    for item in policy:
        policy[item] = enviroment.greedyChoose(item, values)

    # Report progress every 100 iterations.
    if not i % 100:
        print(f"\n\n\n step:{i}")
        # enviroment.printValues(values)
        enviroment.printPolicy(policy)

print(f"exploited:{enviroment.exploited}  explored:{enviroment.explored}")




 step:0
 | R |  | D |  | L |  | L | 
----------------------------
 | R |  | U |  | U |  | U | 
----------------------------
 | R |  | U |  | D |  | D | 
----------------------------
 | U |  | R |  | R | 
----------------------------



 step:100
 | R |  | D |  | L |  | L | 
----------------------------
 | R |  | U |  | D |  | D | 
----------------------------
 | U |  | R |  | D |  | D | 
----------------------------
 | U |  | R |  | R | 
----------------------------



 step:200
 | R |  | D |  | D |  | D | 
----------------------------
 | R |  | R |  | D |  | D | 
----------------------------
 | U |  | R |  | D |  | D | 
----------------------------
 | U |  | R |  | R | 
----------------------------



 step:300
 | R |  | R |  | D |  | D | 
----------------------------
 | U |  | R |  | D |  | D | 
----------------------------
 | R |  | D |  | D |  | D | 
----------------------------
 | R |  | R |  | R | 
----------------------------



 step:400
 | D |  | D |  | D |  | D | 
--------