## Import Neccessary libraries 

In [1]:
import numpy as np
from math import sqrt
import random

### Create State Table
State table checks for valid moves. A state refers to agent current position in the environment.
Since the environment table is 5 x 5, there are 25 states.

In [91]:
def getStateTable(states):
    stateTable = np.zeros((states,5))
    n = sqrt(len(stateTable))
    for i in range(len(stateTable)):
        for j in range(5):     
            if i < n:
                stateTable[i][0] = -1
            else:
                stateTable[i][0] = i - n
            if i >= n * (n - 1):
                stateTable[i][1] = -1
            else:
                stateTable[i][1] = i + n
            if i % n == 0:
                stateTable[i][2] = -1
            else:
                stateTable[i][2] = i - 1
            if i % n == n - 1:
                stateTable[i][3] = -1
            else:
                stateTable[i][3] = i + 1
            stateTable[i][4] = i
    return stateTable.astype(int)

### Create Rewards Table
The table represents the reward value (or point) when the agent is in a certain state. 2, 3, 4 represent snake, quicksand, and pits respectively. 5 represents the Treasure. The agent gets -5 points if he is in an obstacle state, and gets 10 points if he is in Treasure state.  

In [71]:
def getRewardTable(board, stateTable):
    rewardTable = np.zeros((len(stateTable), len(stateTable[0])))
    for i in range(len(stateTable)):
        for j in range(len(stateTable[0])):
            if stateTable[i][j] == i:
                rewardTable[i][j] = -5
            r = int(stateTable[i][j] / int(sqrt(len(stateTable))))
            c = int(stateTable[i][j] % int(sqrt(len(stateTable))))
            if board[r][c] == '2' or board[r][c] == '3' or board[r][c] == '4':
                rewardTable[i][j] = -5
            elif board[r][c] == '5':
                rewardTable[i][j] = 10
    return rewardTable

### Create Environment For the Agent
An environment includes the board, state table, and reward table

In [72]:
def getEnv(board, states):
    env = {}
    env['board'] = board
    env['stateTable'] = getStateTable(states)
    env['rewardTable'] = getRewardTable(board, getStateTable(states))
    return env

### Helper function that visualize the board
We will print the board for every step, so we can keep track of agent's action

In [73]:
def renderBoard(board):
    for i in range(len(board)):
        for j in range(len(board)):
            print(board[i][j], end='\t')
        print('\n')

### Reinforcement Q-learning Implementation
Q-learning is one of the most popular Reinforcement Learning algorithm, which helps agent to learn the value of being in a given state, and calculate the a specific action from there. 1 Represents the action that the agent takes after each step.
The alpha (learning rate) represents how much of the past memory of agent is overridden by the new information obtained by that agent. Discount Factor (gamma) determines the importance of the upcoming rewards. The equation of Q-learning is given by:
* Q_new[state(t), action(t)] =  Q_old[state(t), action(t)] * (1 - alpha) +  (reward + gamma * max(Q[state(t + 1), action])) * alpha

In [86]:
def QLearning(agent, env, sessions):
    alpha = agent['alpha']
    gamma = agent['gamma']
    qTable = agent['qTable'].copy()
    b = env['board']
    rewards = env['rewardTable']
    stateTable = env['stateTable'].tolist()
    actions = []
    start = 0
    goal = 0
    length = len(b)
    for r in b:
        for pos in r:
            if pos == '1':
                start = b.index(r) + len(b) * r.index(pos)
            elif pos == '5':
                goal = b.index(r) + len(b) * r.index(pos)   
    
    for state in stateTable:
        action = []
        for s in state:
            if s != -1:
                action.append(state.index(s))
        actions.append(action)
    print("Display Board State:")
    print("\n")
    #Change to numpy array     
    actions = np.array(actions)
    for i in range(sessions):
        stepNum = 0
        currState = start
        #Print board to console        
        renderBoard(b)
        while currState != goal:
            stepNum += 1
            #Choose possible action from the current State             
            action = random.choice(actions[currState])
            nextState = stateTable[currState][action]
            nextRewards = []
            for action in actions[nextState]:
                nextRewards.append(qTable[nextState, action])
            currQ = qTable[currState][action]
            currRewards = rewards[currState][action]
            #Q-learning formula implementation             
            nextQ = (1 - alpha) *  currQ + alpha * (currRewards + gamma * max(nextRewards))
            qTable[currState][action] = nextQ
            oR = int(currState / length)
            oC = int(currState % length)
            nR = int(nextState / length)
            nC = int(nextState % length)
            if b[oR][oC] == '1':
                b[oR][oC] = '0'
            else:
                b[oR][oC] = b[oR][oC].replace('1', '')
            if b[nR][nC] == '0':
                 b[nR][nC] = '1'
            else:
                b[nR][nC] += '1'
            print("Step number: %d" % stepNum)
            renderBoard(b)           
            currState = nextState
            print('Current state: %d' % currState)
            if currState == goal:
                b[int(start / length)][int(start % length)] = '1'
                b[int(goal / length)][int(goal % length)] = '5'
                print("Goal reached after %d steps" % stepNum)       
    return agent['qTable']

### Create Agent to test Q-learning Implementation

In [87]:
def getAgent(env, alpha, gamma, sessions):
    agent = dict()
    agent['alpha'] = alpha
    agent['gamma'] = gamma
    agent['qTable'] = np.zeros((len(env['stateTable']), len(env['stateTable'][0])))
    agent = QLearning(agent, env, sessions)
    return agent

### Run Q-learning Implementation

In [90]:
# Initialize Environment
board = [['1','2','3','0','2'],
         ['0','0','4','2','0'],
         ['0','2','3','0','5'],
         ['0','4','0','0','3'],
         ['0','2','5','4','2']]
env = getEnv(board, 25)
agent = getAgent(env = env, gamma = 0.8, alpha = 0.5, sessions = 5)

Display Board State:


1	2	3	0	2	

0	0	4	2	0	

0	2	3	0	5	

0	4	0	0	3	

0	2	5	4	2	

Step number: 1
0	21	3	0	2	

0	0	4	2	0	

0	2	3	0	5	

0	4	0	0	3	

0	2	5	4	2	

Current state: 1
Step number: 2
0	2	31	0	2	

0	0	4	2	0	

0	2	3	0	5	

0	4	0	0	3	

0	2	5	4	2	

Current state: 2
Step number: 3
0	2	3	1	2	

0	0	4	2	0	

0	2	3	0	5	

0	4	0	0	3	

0	2	5	4	2	

Current state: 3
Step number: 4
0	2	3	0	2	

0	0	4	21	0	

0	2	3	0	5	

0	4	0	0	3	

0	2	5	4	2	

Current state: 8
Step number: 5
0	2	3	0	2	

0	0	4	2	1	

0	2	3	0	5	

0	4	0	0	3	

0	2	5	4	2	

Current state: 9
Step number: 6
0	2	3	0	21	

0	0	4	2	0	

0	2	3	0	5	

0	4	0	0	3	

0	2	5	4	2	

Current state: 4
Step number: 7
0	2	3	1	2	

0	0	4	2	0	

0	2	3	0	5	

0	4	0	0	3	

0	2	5	4	2	

Current state: 3
Step number: 8
0	2	3	1	2	

0	0	4	2	0	

0	2	3	0	5	

0	4	0	0	3	

0	2	5	4	2	

Current state: 3
Step number: 9
0	2	3	0	21	

0	0	4	2	0	

0	2	3	0	5	

0	4	0	0	3	

0	2	5	4	2	

Current state: 4
Step number: 10
0	2	3	0	2	

0	0	4	2	1	

0	2	3	0	5	

0	4	0	0	3	

0	2	5	4	2	

Current 

  actions = np.array(actions)


0	4	1	0	3	

0	2	5	4	2	

Current state: 17
Step number: 74
0	2	3	0	2	

0	0	4	2	0	

0	2	31	0	5	

0	4	0	0	3	

0	2	5	4	2	

Current state: 12
Step number: 75
0	2	3	0	2	

0	0	4	2	0	

0	21	3	0	5	

0	4	0	0	3	

0	2	5	4	2	

Current state: 11
Step number: 76
0	2	3	0	2	

0	0	4	2	0	

0	2	31	0	5	

0	4	0	0	3	

0	2	5	4	2	

Current state: 12
Step number: 77
0	2	3	0	2	

0	0	4	2	0	

0	2	3	1	5	

0	4	0	0	3	

0	2	5	4	2	

Current state: 13
Step number: 78
0	2	3	0	2	

0	0	4	2	0	

0	2	3	0	51	

0	4	0	0	3	

0	2	5	4	2	

Current state: 14
Goal reached after 78 steps
1	2	3	0	2	

0	0	4	2	0	

0	2	3	0	5	

0	4	0	0	3	

0	2	5	4	2	

Step number: 1
1	2	3	0	2	

0	0	4	2	0	

0	2	3	0	5	

0	4	0	0	3	

0	2	5	4	2	

Current state: 0
Step number: 2
0	21	3	0	2	

0	0	4	2	0	

0	2	3	0	5	

0	4	0	0	3	

0	2	5	4	2	

Current state: 1
Step number: 3
0	21	3	0	2	

0	0	4	2	0	

0	2	3	0	5	

0	4	0	0	3	

0	2	5	4	2	

Current state: 1
Step number: 4
0	2	31	0	2	

0	0	4	2	0	

0	2	3	0	5	

0	4	0	0	3	

0	2	5	4	2	

Current state: 2
Step number: 5
0	2	3	1	2	

0	4	0	0	3	

0	2	5	4	21	

Current state: 24
Step number: 61
0	2	3	0	2	

0	0	4	2	0	

0	2	3	0	5	

0	4	0	0	31	

0	2	5	4	2	

Current state: 19
Step number: 62
0	2	3	0	2	

0	0	4	2	0	

0	2	3	0	5	

0	4	0	1	3	

0	2	5	4	2	

Current state: 18
Step number: 63
0	2	3	0	2	

0	0	4	2	0	

0	2	3	1	5	

0	4	0	0	3	

0	2	5	4	2	

Current state: 13
Step number: 64
0	2	3	0	2	

0	0	4	2	0	

0	2	31	0	5	

0	4	0	0	3	

0	2	5	4	2	

Current state: 12
Step number: 65
0	2	3	0	2	

0	0	4	2	0	

0	2	31	0	5	

0	4	0	0	3	

0	2	5	4	2	

Current state: 12
Step number: 66
0	2	3	0	2	

0	0	4	2	0	

0	2	31	0	5	

0	4	0	0	3	

0	2	5	4	2	

Current state: 12
Step number: 67
0	2	3	0	2	

0	0	4	2	0	

0	21	3	0	5	

0	4	0	0	3	

0	2	5	4	2	

Current state: 11
Step number: 68
0	2	3	0	2	

0	0	4	2	0	

0	21	3	0	5	

0	4	0	0	3	

0	2	5	4	2	

Current state: 11
Step number: 69
0	2	3	0	2	

0	0	4	2	0	

0	21	3	0	5	

0	4	0	0	3	

0	2	5	4	2	

Current state: 11
Step number: 70
0	2	3	0	2	

0	0	4	2	0	

0	2	31	0	5	

0	4	0	0	3	

0	2	5	4	2	

Current state: 12
Step numb

0	1	4	2	0	

0	2	3	0	5	

0	4	0	0	3	

0	2	5	4	2	

Current state: 6
Step number: 85
0	2	3	0	2	

0	0	41	2	0	

0	2	3	0	5	

0	4	0	0	3	

0	2	5	4	2	

Current state: 7
Step number: 86
0	2	3	0	2	

0	0	4	21	0	

0	2	3	0	5	

0	4	0	0	3	

0	2	5	4	2	

Current state: 8
Step number: 87
0	2	3	0	2	

0	0	4	2	0	

0	2	3	1	5	

0	4	0	0	3	

0	2	5	4	2	

Current state: 13
Step number: 88
0	2	3	0	2	

0	0	4	2	0	

0	2	3	0	51	

0	4	0	0	3	

0	2	5	4	2	

Current state: 14
Goal reached after 88 steps
