In [1]:
import numpy as np

In [2]:
np.set_printoptions(precision=3)

In [3]:
class Environment():
    """Grid world plus the tabular Q-values of an agent learning to cross it.

    The world is a ROWS x COLS grid whose cells are numbered row-major
    (``state = row * COLS + col``).  The agent starts at ``START``, the
    episode is considered solved at ``GOAL``, and ``TRAP`` is a penalised
    cell.  Actions are 0=Up, 1=Right, 2=Down, 3=Left.

    Rewards handed out by ``next_state``:
      * ``WALL_REWARD`` when the move leaves the agent where it was,
      * ``GOAL_REWARD`` on entering ``GOAL``,
      * ``TRAP_REWARD`` on entering ``TRAP``,
      * 0 otherwise.
    """

    # Grid geometry and special cells, as (row, col).
    ROWS = 4
    COLS = 5
    START = (3, 4)
    GOAL = (0, 0)
    TRAP = (1, 3)

    # Reward scheme.
    WALL_REWARD = -1
    GOAL_REWARD = 10
    TRAP_REWARD = -5

    # action id -> (row delta, column delta)
    MOVES = {0: (-1, 0), 1: (0, 1), 2: (1, 0), 3: (0, -1)}
    # action id -> label used when printing a rollout
    ACTION_NAMES = {0: "Up", 1: "Right", 2: "Down", 3: "Left"}

    def __init__(self, epsilon=0.9, episodes=100, max_steps=100, alpha=0.085, gamma=0.9):
        """Store the hyper-parameters and allocate a zeroed Q-table.

        Args:
            epsilon: probability of picking a random action while exploring.
            episodes: number of training episodes (exposed as ``num_episodes``).
            max_steps: step cap per episode.
            alpha: learning rate used by ``updateQ``.
            gamma: discount factor used by ``updateQ``.
        """
        self.epsilon = epsilon
        self.num_episodes = episodes
        self.max_steps = max_steps
        self.alpha = alpha
        self.gamma = gamma
        self.reward = 0
        self.state_space = self.ROWS * self.COLS
        self.action_space = len(self.MOVES)
        self.Q = np.zeros((self.state_space, self.action_space), np.float32)
        # Put the agent on the start cell and build the display grid right
        # away, so next_state() is usable without an explicit reset() first.
        self.reset()

    def next_action(self, state, exploitation=False):
        """Pick an action for ``state``.

        With ``exploitation`` false the choice is epsilon-greedy: a uniformly
        random action with probability ``epsilon``, otherwise the action with
        the highest Q-value.  With ``exploitation`` true it is always the
        highest-valued action.
        """
        # The uniform draw happens on every call, including pure exploitation.
        explore = np.random.uniform(0, 1) < self.epsilon and not exploitation
        if explore:
            return np.random.randint(self.action_space)
        return np.argmax(self.Q[state, :])

    def next_state(self, state, action):
        """Apply ``action`` in ``state`` and return ``(new_state, reward)``.

        Moves that would leave the grid keep the agent in place.  An action id
        that is not in ``MOVES`` is treated as "stay put" and therefore earns
        ``WALL_REWARD`` as well.  Besides returning the result, this updates
        ``self.state``, ``self.reward`` and the ``state_visualization`` grid.
        """
        row, col = divmod(state, self.COLS)
        d_row, d_col = self.MOVES.get(action, (0, 0))

        # Clamp to the grid: stepping into a wall leaves the position unchanged.
        new_row = min(max(row + d_row, 0), self.ROWS - 1)
        new_col = min(max(col + d_col, 0), self.COLS - 1)

        self.state = new_row * self.COLS + new_col

        origin = (row, col)
        landing = (new_row, new_col)
        # The "did not move" check comes first, so it wins over the cell rewards.
        if landing == origin:
            reward = self.WALL_REWARD
        elif landing == self.GOAL:
            reward = self.GOAL_REWARD
        elif landing == self.TRAP:
            reward = self.TRAP_REWARD
        else:
            reward = 0
        self.reward = reward

        # Redraw: restore the cell being left (the trap keeps its marker),
        # then stamp the agent on the cell it now occupies.
        self.state_visualization[row][col] = "F" if origin == self.TRAP else "-"
        self.state_visualization[new_row][new_col] = "S"

        return self.state, reward

    def updateQ(self, current_state, next_state, reward, current_action, next_action):
        """Move Q[current_state][current_action] towards the one-step target.

        The target is ``reward + gamma * Q[next_state][next_action]`` and the
        stored value becomes ``(1 - alpha) * old + alpha * target``.
        """
        old_value = self.Q[current_state][current_action]
        target = (reward + self.gamma * self.Q[next_state][next_action])
        self.Q[current_state][current_action] = (1-self.alpha)*old_value + self.alpha*target

    def reset(self):
        """Redraw the grid, place the agent on ``START`` and return that state."""
        grid = np.full((self.ROWS, self.COLS), '-')
        grid[self.GOAL] = 'G'
        grid[self.TRAP] = 'F'
        grid[self.START] = 'S'
        self.state_visualization = grid

        start_row, start_col = self.START
        self.state = start_row * self.COLS + start_col
        return self.state

    def is_goal_state(self):
        """Return whether the agent currently stands on ``GOAL``."""
        goal_row, goal_col = self.GOAL
        return self.state == goal_row * self.COLS + goal_col

    def actions_map(self, i):
        """Return the label of action ``i``, or None for an unknown id."""
        return self.ACTION_NAMES.get(i)

In [4]:
# Agent/environment used by every cell below: 300 training episodes,
# each capped at 100 steps; the remaining hyper-parameters keep their defaults.
environment = Environment(
    max_steps=100,
    episodes=300,
)


In [5]:
# Training: every episode restarts from the start cell and runs until the
# goal is reached or the per-episode step cap is exhausted.
for episode in range(environment.num_episodes):
    current_state = environment.reset()
    t = 0

    while not environment.is_goal_state() and t < environment.max_steps:
        # Act epsilon-greedily in the environment ...
        current_action = environment.next_action(current_state)
        next_state, reward = environment.next_state(current_state, current_action)

        # ... but bootstrap the update from the greedy action in the new state.
        next_action = environment.next_action(next_state, exploitation=True)
        environment.updateQ(
            current_state, next_state, reward, current_action, next_action
        )

        current_state, t = next_state, t + 1

In [6]:
# Prepare the greedy rollout: back to the start cell, exploration switched
# off, and the first action taken straight from the learned Q-table.
current_state = environment.reset()
exploitation = True
current_action = environment.next_action(current_state, exploitation=exploitation)

In [7]:
# Greedy rollout: follow the learned Q-table from the start cell and print
# the grid, the action taken and the reward after every step.

# Restart the step counter. Without this, `t` still holds the step count of
# the last training episode, which shortens the rollout budget and skips the
# loop entirely if that episode ran into max_steps.
t = 0

while t < environment.max_steps and not environment.is_goal_state():

    next_state, reward = environment.next_state(current_state, current_action)

    print(environment.state_visualization, ",   ", 
          environment.actions_map(current_action), 
          ", reward:", reward, "\n\n")

    # Choose the follow-up action before advancing, so the next iteration
    # (and its printout) uses the action picked for the new state.
    next_action = environment.next_action(next_state, exploitation)

    current_state = next_state
    current_action = next_action

    t += 1

[['G' '-' '-' '-' '-']
 ['-' '-' '-' 'F' '-']
 ['-' '-' '-' '-' 'S']
 ['-' '-' '-' '-' '-']] ,    Up , reward: 0 


[['G' '-' '-' '-' '-']
 ['-' '-' '-' 'F' 'S']
 ['-' '-' '-' '-' '-']
 ['-' '-' '-' '-' '-']] ,    Up , reward: 0 


[['G' '-' '-' '-' 'S']
 ['-' '-' '-' 'F' '-']
 ['-' '-' '-' '-' '-']
 ['-' '-' '-' '-' '-']] ,    Up , reward: 0 


[['G' '-' '-' 'S' '-']
 ['-' '-' '-' 'F' '-']
 ['-' '-' '-' '-' '-']
 ['-' '-' '-' '-' '-']] ,    Left , reward: 0 


[['G' '-' 'S' '-' '-']
 ['-' '-' '-' 'F' '-']
 ['-' '-' '-' '-' '-']
 ['-' '-' '-' '-' '-']] ,    Left , reward: 0 


[['G' 'S' '-' '-' '-']
 ['-' '-' '-' 'F' '-']
 ['-' '-' '-' '-' '-']
 ['-' '-' '-' '-' '-']] ,    Left , reward: 0 


[['S' '-' '-' '-' '-']
 ['-' '-' '-' 'F' '-']
 ['-' '-' '-' '-' '-']
 ['-' '-' '-' '-' '-']] ,    Left , reward: 10 


