In [181]:
import numpy as np
import sys
from time import time

# Board

The Board class will have the following values.
1. Number of rows
2. Number of columns
3. Win State
4. Lose State
5. Start State
6. Obstacle

It will have the following methods:
1. is_win_state : to verify whether the winning state is reached
2. is_lose_state : to verify whether the loosing state is reached
3. is_not_obstacle : to verify whther the obstacle is reached


In [182]:
class Board:
    def __init__(self,board_row,board_column,win_state,lose_state,start_state,obstacle):
        self.board_row = board_row
        self.board_column = board_column
        self.win_state = win_state
        self.lose_state = lose_state
        self.start_state = start_state
        self.obstacle = obstacle
    
    def is_win_state (self,state):
        return (self.win_state == state)
    def is_lose_state (self,state):
        return (self.lose_state == state)
    def is_not_obstacle (self,state):
        return (self.obstacle != state)
    

# Environment
The environment will have the following values:
1. Action : the action picked by the agent
2. reward : the reward for the action picked
3. state : the current state of the agent
4. board : the board configuration
5. gamma : the discount factor

It will have the following methods 
1. get_curr_reward: returns the reward of the current state
2. get_next_state: returns the next state corresponding to the action taken by the agent in the current state.
3. reset: to reset the environment.

In [198]:
class Envirnonment:
    def __init__(self,action,board,gamma,reward = 0):
        self.action = action
        self.reward = 0
        self.board = board
        self.state = board.start_state
        self.gamma = gamma
        
    def reset(self):
        self.state = board.start_state
        self.reward = 0
        
    def get_curr_reward(self,state):
        if self.board.is_win_state(state) :
            return 1
        if self.board.is_lose_state(state) :
            return -1
        else:
            return self.gamma
    
    def get_next_state(self,action):
        val = np.random.rand()
        if action == "up":
            
            if val<=0.8:
                next_state = (self.state[0]+1,self.state[1])
            elif val<=0.9:
                next_state = (self.state[0],self.state[1]-1)
            elif val<=1:
                next_state = (self.state[0],self.state[1]+1)
        elif action == "down":
            if val<=0.8:
                next_state = (self.state[0]-1,self.state[1])
            elif val<=0.9:
                next_state = (self.state[0],self.state[1]-1)
            elif val<=1:
                next_state = (self.state[0],self.state[1]+1)
        elif action == "left":
            if val<=0.8:
                next_state = (self.state[0],self.state[1]-1)
            elif val<=0.9:
                next_state = (self.state[0]-1,self.state[1])
            elif val<=1:
                next_state = (self.state[0]+1,self.state[1])
        else:
            if val<=0.8:
                next_state = (self.state[0],self.state[1]+1)
            elif val<=0.9:
                next_state = (self.state[0]-1,self.state[1])
            elif val<=1:
                next_state = (self.state[0]+1,self.state[1])
        
        if (next_state[0] >= 0) and (next_state[0] <= 2):
                if (next_state[1] >= 0) and (next_state[1] <= 3):
                    if board.is_not_obstacle(next_state):
                        return next_state
        return self.state

# Agent
The agent class will have the following entities:

1. state: The start state of the agent
2. env: The environment in which the agent is
3. state_value: dictionary of value functions for a state
4. learning_rate: the value of learning rate
5. exploration_rate: the value of exploration for using explore exploit.
6. action: the current action of the agent 
7. board: the board for playing the game.
8. reward: value of reward for the agent.
9. state_store: the array of states visited by the agent.

The Agent class will have the following methods:
1. play: The method to play the game and learn.
2. select_action: The method to pick an action.


In [212]:
class Agent:
    def __init__(self,learning_rate,exploration_rate,board):
        self.learning_rate = learning_rate
        self.exploration_rate = exploration_rate
        self.action = self.select_action()
        self.board = board
        self.state = board.start_state
        self.env = Envirnonment(self.action,self.board,-0.1,0)
        self.state_value = {}
        self.reward = -1000
        self.state_store = []
        
    
    def reset(self):
        self.state = board.start_state
        self.reward = -1000
        self.env.reset()
    
    def play(self, rounds = 10):
        i = 0
        epoch = 10
        while i<epoch:
            
            if self.board.is_win_state(self.state):
#                 print("hello 1")
                reward = self.env.get_curr_reward(self.state)
                
                self.state_value[self.state] = reward
                print("Game Ends , Final Reward", reward)
                print("Epoch number ",i)
                
                for s in reversed(self.state_store):
                    reward = self.state_value[s] + self.learning_rate * (reward - self.state_value[s])
                    self.state_value[s] = round(reward,3)
                self.reset()
                i+=1
            else:
#                 print("hello 2")
                action = self.select_action()
                self.state_store.append(self.env.get_next_state(action))
                print("Current position {}  action {}".format(self.env.state,action))
                self.env.state = self.state_store[-1]
                self.state = self.state_store[-1]
                self.state_value[self.state] = self.env.get_curr_reward(self.state)
    def select_action(self):
        my_next_reward = -100
        action = ""
        
        if np.random.uniform(0,1) <= self.exploration_rate:
            action = np.random.choice(["up","down","left","right"])
        else:
            
            for a in ["up","down","left","right"]:
                temp_state = self.env.get_next_state(a)
                if temp_state in self.state_value.keys() :
                    next_reward = self.state_value[temp_state]
                else :
                    self.state_value[temp_state] = 0
                    next_reward = self.state_value[temp_state]
                if next_reward >= my_next_reward:
                    action = a
                    my_next_reward = next_reward
        return action

In [213]:
board = Board(3,4,(2,3),(1,3),(0,0),(1,1))
agent = Agent(learning_rate = 0.1,exploration_rate = 0.5,board = board)

In [214]:
agent.play(rounds = 10)

Current position (0, 0)  action up
Current position (1, 0)  action down
Current position (0, 0)  action up
Current position (1, 0)  action left
Current position (2, 0)  action left
Current position (2, 0)  action right
Current position (2, 1)  action down
Current position (2, 1)  action right
Current position (2, 1)  action right
Current position (2, 1)  action right
Current position (2, 2)  action right
Game Ends , Final Reward 1
Epoch number  0
Current position (0, 0)  action down
Current position (0, 0)  action right
Current position (0, 0)  action up
Current position (1, 0)  action up
Current position (2, 0)  action right
Current position (2, 1)  action down
Current position (2, 1)  action up
Current position (2, 1)  action left
Current position (2, 0)  action up
Current position (2, 0)  action right
Current position (1, 0)  action right
Current position (1, 0)  action down
Current position (0, 0)  action right
Current position (0, 1)  action down
Current position (0, 1)  action le

In [215]:
# print(agent.select_action())

In [216]:
for i in range(3):
    for j in range(4):
        if i==1 and j==1:
            print(0.00,end = " ")
        else:
            print(agent.state_value[(i,j)],end = " ")
    print(" ")

-0.106 -0.127 -0.168 -0.22  
-0.084 0.0 -0.115 -0.293  
-0.042 0.068 0.248 0.325  
