In [1]:
import numpy as np

class Env:
    def __init__(self):
        self.height = 5
        self.width = 5
        self.posX = 0
        self.posY = 0
        self.endX = self.width-1
        self.endY = self.height-1
        self.actions = [0, 1, 2, 3]
        self.stateCount = self.height*self.width
        self.actionCount = len(self.actions)

    def reset(self):
        self.posX = 0
        self.posY = 0
        self.done = False
        return 0, 0, False

    # take action
    def step(self, action):
        if action == 0: # left
            self.posX = self.posX-1 if self.posX > 0 else self.posX
        if action == 1: # right
            self.posX = self.posX+1 if self.posX < self.width - 1 else self.posX
        if action == 2: # up
            self.posY = self.posY-1 if self.posY > 0 else self.posY
        if action == 3: # down
            self.posY = self.posY+1 if self.posY < self.height - 1 else self.posY

        done = self.posX == self.endX and self.posY == self.endY
        # mapping (x,y) position to number between 0 and 5x5-1=24
        nextState = self.width * self.posY + self.posX
        reward = 1 if done else 0
        return nextState, reward, done

    # return a random action
    def randomAction(self):
        return np.random.choice(self.actions)

    # display environment
    def render(self):
        for i in range(self.height):
            for j in range(self.width):
                if self.posY == i and self.posX == j:
                    print("O", end='')
                elif self.endY == i and self.endX == j:
                    print("T", end='')
                else:
                    print(".", end='')
            print("")

In [3]:
import numpy as np
import time
import os

# create environment
env = Env()

# QTable : contains the Q-Values for every (state,action) pair
qtable = np.random.rand(env.stateCount, env.actionCount).tolist()
print(qtable)
row=len(qtable)
column=len(qtable[0])
print(f'Rows:{row}, Column:{column}')
# hyperparameters
epochs = 50
gamma = 0.1
epsilon = 0.08
decay = 0.1

# training loop


[[0.702956537853538, 0.6269657485699104, 0.7003873944526388, 0.9390872316928403], [0.58924745998225, 0.6892971330791077, 0.4430719899944646, 0.9212898577870015], [0.44073047613601346, 0.3413163403717627, 0.7960874281106192, 0.6168808572084995], [0.17261493041459242, 0.6395005361674297, 0.21952290281549092, 0.8457346902339788], [0.2638098178393087, 0.12949322341242708, 0.7927899964675091, 0.5905579713625053], [0.18346834782509802, 0.15128356043236557, 0.6419112519303377, 0.1278975079639485], [0.9655908029616044, 0.2226101684013504, 0.09936812823016228, 0.6227816335020957], [0.49005218129816275, 0.8222750155127554, 0.10180431912711374, 0.6865582981360606], [0.3913426233817826, 0.7332256602748387, 0.19582002238722473, 0.9732145118333289], [0.23101525040261295, 0.1729005080138396, 0.1526889360593613, 0.1252957829933502], [0.4398754164419537, 0.7535119397732939, 0.3950102596290255, 0.8483973564821479], [0.6910538352624203, 0.6033398195930644, 0.5444629005200261, 0.9510082131767615], [0.5570

In [4]:
for i in range(epochs):
    state, reward, done = env.reset()
    steps = 0

    while not done:
        os.system('clear')
        print("epoch #", i+1, "/", epochs)
        env.render()
        time.sleep(0.05)

        # count steps to finish game
        steps += 1

        # act randomly sometimes to allow exploration
        if np.random.uniform() < epsilon:
            action = env.randomAction()
        # if not select max action in Qtable (act greedy)
        else:
            action = qtable[state].index(max(qtable[state]))

        # take action
        next_state, reward, done = env.step(action)

        # update qtable value with Bellman equation
        qtable[state][action] = reward + gamma * max(qtable[next_state])

        # update state
        state = next_state
    # The more we learn, the less we take random actions
    epsilon -= decay*epsilon

    print("\nDone in", steps, "steps".format(steps))
    time.sleep(0.8)
    if epochs == 50:
      print(qtable)

epoch # 1 / 50
O....
.....
.....
.....
....T
epoch # 1 / 50
.....
O....
.....
.....
....T
epoch # 1 / 50
O....
.....
.....
.....
....T
epoch # 1 / 50
O....
.....
.....
.....
....T
epoch # 1 / 50
O....
.....
.....
.....
....T
epoch # 1 / 50
.O...
.....
.....
.....
....T
epoch # 1 / 50
.....
.O...
.....
.....
....T
epoch # 1 / 50
.....
O....
.....
.....
....T
epoch # 1 / 50
.....
O....
.....
.....
....T
epoch # 1 / 50
.....
.O...
.....
.....
....T
epoch # 1 / 50
.....
.....
.O...
.....
....T
epoch # 1 / 50
.....
.....
O....
.....
....T
epoch # 1 / 50
.....
.....
.....
O....
....T
epoch # 1 / 50
.....
.....
.....
.....
O...T
epoch # 1 / 50
.....
.....
.....
.....
O...T
epoch # 1 / 50
.....
.....
.....
O....
....T
epoch # 1 / 50
.....
.....
.....
O....
....T
epoch # 1 / 50
.....
.....
O....
.....
....T
epoch # 1 / 50
.....
.....
.O...
.....
....T
epoch # 1 / 50
.....
.....
.....
.O...
....T
epoch # 1 / 50
.....
.....
.....
..O..
....T
epoch # 1 / 50
.....
.....
.....
.O...
....T
epoch # 1 