In [1]:
import numpy as np
import random
import pygame

# Grid dimensions: number of rows (y) and columns (x).
env_y = 5
env_x = 5

# Reward grid: every cell starts at -1; terminal cells are overwritten below.
env_state_space = np.full((env_y, env_x), -1.0)

# Largest valid row / column index.
y_bound, x_bound = env_y - 1, env_x - 1

# Terminal states are the four corners of the grid, stored as (y, x).
# (A single-goal variant would be: env_term_state = [(y_bound, x_bound)])
env_term_state = [
    (0, 0),
    (y_bound, 0),
    (0, x_bound),
    (y_bound, x_bound),
]

print(env_term_state)

# Entering a terminal cell yields a reward of 50.
for corner in env_term_state:
    env_state_space[corner] = 50

print("State Space: ", env_state_space.shape)
print(env_state_space)
print("Terminal State Co-ordinates", env_term_state)

pygame 1.9.6
Hello from the pygame community. https://www.pygame.org/contribute.html
[(0, 0), (4, 0), (0, 4), (4, 4)]
State Space:  (5, 5)
[[50. -1. -1. -1. 50.]
 [-1. -1. -1. -1. -1.]
 [-1. -1. -1. -1. -1.]
 [-1. -1. -1. -1. -1.]
 [50. -1. -1. -1. 50.]]
Terminal State Co-ordinates [(0, 0), (4, 0), (0, 4), (4, 4)]


# Movement Code of the Environment

In [2]:
def move(state, action):
    """Dispatch ``action`` to its handler and return the resulting position.

    Parameters
    ----------
    state : tuple
        Current grid position as ``(y, x)``.
    action : int
        Action index: 0 = up, 1 = right, 2 = down, 3 = left, 4 = pickup.

    Returns
    -------
    tuple
        ``(new_state, penalty)`` as produced by the selected handler.

    Raises
    ------
    ValueError
        If ``action`` is not a known action index. (The previous fallback was
        a zero-argument lambda, which failed with an unrelated ``TypeError``
        as soon as it was called with ``state``.)
    """
    handlers = {
        0: up,
        1: right,
        2: down,
        3: left,
        4: pickup,
    }

    handler = handlers.get(action)
    if handler is None:
        raise ValueError("Invalid action: {!r}".format(action))

    new_state, penalty = handler(state)
    return new_state, penalty

#make sure moves are within a bound
def valid_move(state):
    """Pull an out-of-bounds position back onto the grid by one cell.

    Each coordinate is checked against ``[0, y_bound]`` / ``[0, x_bound]``.
    A coordinate beyond its bound is shifted one step back towards the grid.

    Returns ``(state, penalty)``, where ``penalty`` is True when at least one
    coordinate had to be corrected. An in-bounds ``state`` is returned as-is.
    """
    # Correction per axis: -1 when past the upper bound, +1 when below zero.
    if state[0] > y_bound:
        row_fix = -1
    elif state[0] < 0:
        row_fix = 1
    else:
        row_fix = 0

    if state[1] > x_bound:
        col_fix = -1
    elif state[1] < 0:
        col_fix = 1
    else:
        col_fix = 0

    penalty = row_fix != 0 or col_fix != 0
    if penalty:
        state = (state[0] + row_fix, state[1] + col_fix)

    return state, penalty
    
def up(state):
    """Step one row up (y - 1), then bounds-check via ``valid_move``.

    Returns the ``(new_state, penalty)`` pair from ``valid_move``.
    """
    row, col = state[0], state[1]
    return valid_move((row - 1, col))

def right(state):
    """Step one column right (x + 1), then bounds-check via ``valid_move``.

    Returns the ``(new_state, penalty)`` pair from ``valid_move``.
    """
    row, col = state[0], state[1]
    return valid_move((row, col + 1))
    
def down(state):
    """Step one row down (y + 1), then bounds-check via ``valid_move``.

    Returns the ``(new_state, penalty)`` pair from ``valid_move``.
    """
    row, col = state[0], state[1]
    return valid_move((row + 1, col))
    
def left(state):
    """Step one column left (x - 1), then bounds-check via ``valid_move``.

    Returns the ``(new_state, penalty)`` pair from ``valid_move``.
    """
    row, col = state[0], state[1]
    return valid_move((row, col - 1))

#unused for now
def pickup(state):
    """Placeholder pickup action: the agent stays where it is.

    Returns ``(state, False)`` so the result has the same
    ``(new_state, penalty)`` shape as the movement handlers. ``move`` unpacks
    two values from every handler; returning the bare ``state`` made a
    ``(y, x)`` position unpack as ``new_state = y, penalty = x``.
    """
    return state, False

# Actual transition of the environment based on action

In [3]:
def step(state, action):
    """Advance the environment by one action.

    Parameters
    ----------
    state : tuple
        Current position ``(y, x)``.
    action : int
        Action index understood by ``move``.

    Returns
    -------
    tuple
        ``(state_prime, reward, done)``.

        * Called on a terminal state: the state is returned unchanged with
          reward 0 and ``done = True``.
        * Otherwise: ``state_prime`` comes from ``move``, the reward is the
          grid value at ``state_prime`` plus -50 when ``move`` reported a
          boundary penalty, and ``done = False``. ``done`` stays False even
          when ``state_prime`` is itself terminal.
    """
    # Terminal states are absorbing: nothing moves and nothing is earned.
    if state in env_term_state:
        return state, 0, True

    state_prime, penalty = move(state, action)
    # Explicit comparison with True is kept on purpose (matches original semantics).
    add_reward = -50 if penalty == True else 0
    reward = env_state_space[(state_prime[0], state_prime[1])] + add_reward
    return state_prime, reward, False

In [4]:
# Movement actions available to the agent (0-3); pickup (index 4) is left out.
ACTION_SPACE = list(range(4))

print("Action Space:", ACTION_SPACE, ["Up", "Right", "Down", "Left"], sep="\n")

# Alias of the reward grid (same array object, not a copy).
Reward = env_state_space

# Discount factor; the Q-learning cell assigns the same value again.
gamma = 1
# Threshold constant; not referenced elsewhere in the visible notebook.
significant_improvement = 0.01

Action Space:
[0, 1, 2, 3]
['Up', 'Right', 'Down', 'Left']


# Q Learning
## $Q^{new}(s_{t}, a_{t}) \leftarrow (1 - \alpha) \cdot Q(s_{t}, a_{t}) + \alpha \cdot \left(r_{t} + \gamma \cdot \max_{a} Q(s_{t+1}, a)\right)$

In [5]:
# --- Q-learning hyperparameters -------------------------------------------
alpha = 0.01      # learning rate
gamma = 1         # discount factor
epsilon = 0.1     # probability of taking a random (exploratory) action
N_SWEEPS = 100    # number of passes over every possible start state
RANDOM_SEED = 0   # fixed seed so Restart & Run All reproduces the same Q

# Exploration uses the stdlib `random` module; seed it for reproducibility.
random.seed(RANDOM_SEED)

# Q[action, y, x]: estimated return of taking `action` in cell (y, x).
Q = np.zeros((len(ACTION_SPACE), env_y, env_x))

for iteration in range(N_SWEEPS):
    for y in range(env_y):
        for x in range(env_x):
            # One episode per start cell, run until step() reports done.
            s = (y, x)
            done = False

            while not done:
                # Epsilon-greedy action selection.
                if random.uniform(0, 1) < epsilon:
                    action = random.choice(ACTION_SPACE)
                else:
                    action = np.argmax(Q[:, s[0], s[1]])

                s_new, rew, done = step(s, action)

                # Q(s,a) <- (1 - alpha) * Q(s,a) + alpha * (r + gamma * max_a' Q(s',a'))
                old_Q = Q[action, s[0], s[1]]
                next_max = np.max(Q[:, s_new[0], s_new[1]])
                Q[action, s[0], s[1]] = (1 - alpha) * old_Q + alpha * (rew + gamma * next_max)

                s = s_new

print(Q)

[[[ 0.00000000e+00 -1.85901204e+00 -1.90463973e+00 -1.74702991e+00
    0.00000000e+00]
  [ 4.98587711e+01  1.48032832e+00  3.02894445e-01  3.87379592e+01
    4.33680061e+01]
  [ 3.04065573e+01  3.03466662e+01  4.31418444e-02  1.60880395e+01
    2.09196067e+01]
  [ 4.45676145e-01  3.34582825e-01  5.36981671e-02  2.65534688e-01
    6.25010835e-01]
  [ 0.00000000e+00  1.58013165e-01  1.09710871e-01  3.10648736e-01
    0.00000000e+00]]

 [[ 0.00000000e+00  2.10749006e-01  2.75609686e+01  4.91718111e+01
    0.00000000e+00]
  [ 3.41725116e+00  1.11336129e+00  1.83081493e-01  1.47320271e+00
   -2.37563502e+00]
  [ 2.52160986e-02  6.65151380e-02  7.86755823e-02  1.00777507e-01
   -2.73072950e+00]
  [ 8.25159045e-01  6.34676103e-02  1.92720877e-01  2.00732663e+01
   -1.09231727e+00]
  [ 0.00000000e+00  1.69125455e-02  1.97143487e+01  4.34999828e+01
    0.00000000e+00]]

 [[ 0.00000000e+00  7.43871534e-01  3.64218842e-01  4.13272875e+00
    0.00000000e+00]
  [ 1.82267676e+00  7.47337726e-01 -1.4

# Trajectory Printing

In [6]:
# Glyph shown for each greedy action index.
switch_arrow = {
    0: u"\u2191",  # up
    1: u"\u2192",  # right
    2: u"\u2193",  # down
    3: u"\u2190",  # left
    4: "x"
}

# Build the policy picture row by row: terminal cells are drawn as "x",
# every other cell shows the arrow of its highest-valued action.
grid_rows = []
for y in range(env_y):
    cells = [
        "x " if (y, x) in env_term_state
        else switch_arrow.get(np.argmax(Q[:, y, x])) + " "
        for x in range(env_x)
    ]
    grid_rows.append("".join(cells) + "\n")

trajectory_grid = "".join(grid_rows)

print(trajectory_grid, "\n")
trajectory_grid = ""

x ← → → x 
↑ ← ← ↑ ↑ 
↑ ↑ ← ↑ ↑ 
↓ ← ← → ↓ 
x ← → → x 
 



In [7]:
# Spot checks: action values for cell (1, 4) and the greedy action for cell (3, 3).
print(Q[:, 1, 4], np.argmax(Q[:, 3, 3]), sep="\n")

[43.36800609 -2.37563502  0.4498724   1.98103979]
1


In [8]:
def render():
    """Draw the grid onto the global ``screen`` surface.

    Reads module-level globals: ``screen``, ``height``, ``width``,
    ``CONSTANT_SIZE``, ``current_state``, ``env_state_space`` and the colour
    constants. The agent's cell is green, every cell whose reward is not -1
    is red, and all remaining cells are white. The caller is responsible for
    ``pygame.display.flip()``.
    """
    screen.fill(BLACK)

    # Distance between the left/top edges of neighbouring cells (1px gap).
    pitch = CONSTANT_SIZE + 1

    for row in range(height):
        for col in range(width):
            if row == current_state[0] and col == current_state[1]:
                colour = GREEN
            else:
                colour = WHITE if env_state_space[row, col] == -1 else RED

            cell = pygame.Rect(col * pitch + 1, row * pitch + 1, CONSTANT_SIZE, CONSTANT_SIZE)
            pygame.draw.rect(screen, colour, cell)


# --- Display constants (module level because render() reads them as globals) ---
BLACK = (0, 0, 0)
WHITE = (255, 255, 255)
GREEN = (0, 255, 0)
RED = (255, 0, 0)

CONSTANT_SIZE = 50  # side length of one grid cell in pixels
height = env_state_space.shape[0]
width = env_state_space.shape[1]

# Window size: all cells plus a 1px gap between cells and around the border.
size_y = (CONSTANT_SIZE * height) + height + 1
size_x = (CONSTANT_SIZE * width) + width + 1
screen_size = (size_x, size_y)

wait_milli_sec = 350  # pause between rendered frames

# Initialise pygame and open the window once, not once per episode.
pygame.init()
screen = pygame.display.set_mode(screen_size)
pygame.display.set_caption("Badworld")
clock = pygame.time.Clock()

# Set when the user closes the window. All drawing stops immediately;
# previously pygame.quit() was called inside the loop, yet the loop carried
# on rendering to a display that no longer existed.
window_closed = False

for episode in range(5):

    done = False
    reward_total = 0

    # Random start cell (may itself be a terminal state).
    start_y = np.random.randint(low=0, high=env_y)
    start_x = np.random.randint(low=0, high=env_x)
    current_state = (start_y, start_x)

    print("Episode ", episode + 1)
    print("Starting State: ", current_state)

    render()
    pygame.display.flip()
    pygame.time.wait(wait_milli_sec)

    while not window_closed:

        for event in pygame.event.get():
            if event.type == pygame.QUIT:
                window_closed = True
        if window_closed:
            break

        # Follow the greedy policy derived from the learned Q table.
        action = np.argmax(Q[:, current_state[0], current_state[1]])
        obs, rew, done = step(current_state, action)

        current_state = obs
        reward_total += rew

        # step() reports done only when called on a terminal state.
        if done == True:
            break

        render()

        print("Cumulative Reward: " ,reward_total)

        pygame.display.flip()
        pygame.time.wait(wait_milli_sec)
        clock.tick(60)

    print("Ending state: ", current_state)

    if window_closed:
        break

pygame.quit()

Episode  1
Starting State:  (3, 0)
Cumulative Reward:  50.0
Ending state:  (4, 0)
Episode  2
Starting State:  (3, 0)
Cumulative Reward:  50.0
Ending state:  (4, 0)
Episode  3
Starting State:  (3, 1)
Cumulative Reward:  -1.0
Cumulative Reward:  49.0
Ending state:  (4, 0)
Episode  4
Starting State:  (1, 2)
Cumulative Reward:  -1.0
Cumulative Reward:  -2.0
Cumulative Reward:  48.0
Ending state:  (0, 0)
Episode  5
Starting State:  (2, 2)
Cumulative Reward:  -1.0
Cumulative Reward:  -2.0
Cumulative Reward:  -3.0
Cumulative Reward:  47.0
Ending state:  (0, 0)


In [9]:
# Extra shutdown call: the previous cell already ends with pygame.quit(), so
# this is presumably a manual fallback for runs that were interrupted early.
pygame.quit()