In [1]:
import numpy as np
import random
import pygame

# Grid dimensions. There is one reward layer per possible goal cell, so the
# number of layers equals the number of cells.
env_y = 10
env_x = 10
env_z = env_y * env_x

# Largest valid row / column index on the grid.
y_bound = env_y - 1
x_bound = env_x - 1

# Goal coordinates in row-major order: layer index -> (y, x).
env_term_state = [(row, col) for row in range(env_y) for col in range(env_x)]

# Every cell is worth -1, except that layer k holds 50 on its own goal cell.
env_state_space = np.full((env_z, env_y, env_x), -1.0)
for layer, (goal_y, goal_x) in enumerate(env_term_state):
    env_state_space[layer, goal_y, goal_x] = 50

print("State Space: ", env_state_space.shape)
print("Terminal State Co-ordinates", env_term_state)

pygame 1.9.6
Hello from the pygame community. https://www.pygame.org/contribute.html
State Space:  (100, 10, 10)
Terminal State Co-ordinates [(0, 0), (0, 1), (0, 2), (0, 3), (0, 4), (0, 5), (0, 6), (0, 7), (0, 8), (0, 9), (1, 0), (1, 1), (1, 2), (1, 3), (1, 4), (1, 5), (1, 6), (1, 7), (1, 8), (1, 9), (2, 0), (2, 1), (2, 2), (2, 3), (2, 4), (2, 5), (2, 6), (2, 7), (2, 8), (2, 9), (3, 0), (3, 1), (3, 2), (3, 3), (3, 4), (3, 5), (3, 6), (3, 7), (3, 8), (3, 9), (4, 0), (4, 1), (4, 2), (4, 3), (4, 4), (4, 5), (4, 6), (4, 7), (4, 8), (4, 9), (5, 0), (5, 1), (5, 2), (5, 3), (5, 4), (5, 5), (5, 6), (5, 7), (5, 8), (5, 9), (6, 0), (6, 1), (6, 2), (6, 3), (6, 4), (6, 5), (6, 6), (6, 7), (6, 8), (6, 9), (7, 0), (7, 1), (7, 2), (7, 3), (7, 4), (7, 5), (7, 6), (7, 7), (7, 8), (7, 9), (8, 0), (8, 1), (8, 2), (8, 3), (8, 4), (8, 5), (8, 6), (8, 7), (8, 8), (8, 9), (9, 0), (9, 1), (9, 2), (9, 3), (9, 4), (9, 5), (9, 6), (9, 7), (9, 8), (9, 9)]


# Movement Code of the Environment

In [2]:
def move(state,action):
    """Apply ``action`` to ``state`` and return the resulting grid position.

    Parameters
    ----------
    state : tuple
        Current position as ``(y, x)``.
    action : int
        Action code: 0 = up, 1 = right, 2 = down, 3 = left, 4 = pickup.
        Float codes such as ``1.0`` also match, because they compare and hash
        equal to the integer keys of the dispatch table.

    Returns
    -------
    tuple
        Whatever the matching movement function returns for ``state``.

    Raises
    ------
    ValueError
        If ``action`` is not one of the known action codes.
    """
    # Dispatch table: action code -> movement function.
    switch = {
        0: up,
        1: right,
        2: down,
        3: left,
        4: pickup
    }

    func = switch.get(action)
    if func is None:
        # The previous fallback was a zero-argument lambda, so an unknown
        # action failed with an unrelated TypeError when called with `state`.
        raise ValueError("Invalid action: {!r}".format(action))
    return func(state)

def valid_move(state, y_limit=None, x_limit=None):
    """Clamp a ``(y, x)`` position so that it lies inside the grid.

    Parameters
    ----------
    state : tuple
        Candidate position as ``(y, x)``; it may lie outside the grid.
    y_limit, x_limit : int, optional
        Largest valid row / column index. When omitted, the module-level
        ``y_bound`` / ``x_bound`` are used.

    Returns
    -------
    tuple
        ``(y, x)`` with each coordinate limited to ``0 .. limit``.

    Notes
    -----
    The earlier version moved an out-of-range coordinate back by exactly one
    cell, which is only correct for positions one step outside the grid.
    Clamping gives the same result in that case and stays correct for any
    overshoot.
    """
    # Resolve the defaults at call time so changes to the globals are honoured.
    max_y = y_bound if y_limit is None else y_limit
    max_x = x_bound if x_limit is None else x_limit

    clamped_y = min(max(state[0], 0), max_y)
    clamped_x = min(max(state[1], 0), max_x)
    return (clamped_y, clamped_x)
    
def up(state):
    """Return the position one row above ``state`` (y - 1), kept on the grid by ``valid_move``."""
    return valid_move((state[0] - 1, state[1]))

def right(state):
    """Return the position one column to the right of ``state`` (x + 1), kept on the grid by ``valid_move``."""
    return valid_move((state[0], state[1] + 1))
    
def down(state):
    """Return the position one row below ``state`` (y + 1), kept on the grid by ``valid_move``."""
    return valid_move((state[0] + 1, state[1]))
    
def left(state):
    """Return the position one column to the left of ``state`` (x - 1), kept on the grid by ``valid_move``."""
    return valid_move((state[0], state[1] - 1))

def pickup(state):
    """Placeholder action (not used yet): hands ``state`` back unchanged."""
    return state

# Actual transition of the environment based on action

In [3]:
def step(state, action, terminal_state):
    """Advance the environment by one action.

    Parameters
    ----------
    state : tuple
        Current position as ``(y, x)``.
    action : int
        Action code passed on to ``move``.
    terminal_state : int
        Index of the goal: selects the entry of ``env_term_state`` and the
        matching layer of ``env_state_space``.

    Returns
    -------
    tuple
        ``(state_prime, reward, done)``. If ``state`` already is the goal,
        nothing moves: the same state comes back with reward 0 and
        ``done=True``. Otherwise the action is applied and the reward is read
        from the goal's layer at the new position, with ``done=False``.
    """
    # Already on the goal: absorb with zero reward.
    if state == env_term_state[terminal_state]:
        return state, 0, True

    landing = move(state, action)
    payoff = env_state_space[terminal_state, landing[0], landing[1]]
    return landing, payoff, False

# Best action selection function

In [4]:
def best_action_value(state,terminal_state):
    """Return the greedy action for ``state`` under the current value table.

    Each action in ``ACTION_SPACE`` is simulated with ``step`` and scored as
    ``reward + gamma * V[terminal_state, y', x']``, where ``(y', x')`` is the
    position that action leads to. The first action with the strictly highest
    score wins. ``None`` is returned when no action scores above negative
    infinity.
    """
    top_action, top_score = None, float('-inf')

    for candidate in ACTION_SPACE:
        landing, payoff, _ = step(state, candidate, terminal_state)
        score = payoff + gamma * V[terminal_state, landing[0], landing[1]]

        # Strict comparison keeps the earliest action on ties.
        if score > top_score:
            top_action, top_score = candidate, score

    return top_action

In [5]:
# Planning only considers the four movement actions (codes 0-3).
ACTION_SPACE = [0, 1, 2, 3]

print("Action Space:", ACTION_SPACE, ["Up", "Right", "Down", "Left"], sep="\n")

# Multiplier applied to the next state's value when scoring an action.
gamma = 1
# Threshold for a value change that still counts as an improvement.
significant_improvement = 0.01

# Alias for the reward tensor (same array object, not a copy).
Reward = env_state_space
# Value table and policy table, one entry per (goal layer, y, x).
V = np.zeros(env_state_space.shape)
Pi = np.zeros_like(V)

print("V - ", V.shape)
print("Pi - ", Pi.shape)

Action Space:
[0, 1, 2, 3]
['Up', 'Right', 'Down', 'Left']
V -  (100, 10, 10)
Pi -  (100, 10, 10)


In [6]:
# Value iteration over every (goal layer, y, x) entry, updating V in place.
# At most MAX_SWEEPS passes are made (20, as before), but the loop now stops
# as soon as a full pass changes no value by `significant_improvement` or
# more: at that point further passes would only repeat the same result.
MAX_SWEEPS = 20

for iteration in range(MAX_SWEEPS):
    largest_change = 0.0

    for z in range(env_z):
        for y in range(env_y):
            for x in range(env_x):
                s = (y, x)
                action = best_action_value(s, z)
                s_new, rew, done = step(s, action, z)

                updated_value = rew + gamma * V[z, s_new[0], s_new[1]]
                largest_change = max(largest_change, abs(updated_value - V[z, y, x]))
                V[z, y, x] = updated_value

                # 4 is the terminal state action (do nothing)
                Pi[z, y, x] = 4 if done else action

    if largest_change < significant_improvement:
        break

In [7]:
def render():
    """Draw the current grid onto ``screen``.

    Reads the module-level ``screen``, ``height``, ``width``, ``current_state``,
    ``z``, ``env_state_space``, ``CONSTANT_SIZE`` and the colour constants.
    The agent's cell is green, any cell whose reward in layer ``z`` is not -1
    is red, and all other cells are white. The display is not flipped here;
    the caller does that.
    """
    screen.fill(BLACK)

    # Each cell is CONSTANT_SIZE pixels wide plus a 1-pixel gap, which lets
    # the black background show through as grid lines.
    pitch = CONSTANT_SIZE + 1

    for row in range(height):
        for col in range(width):
            if (row, col) == (current_state[0], current_state[1]):
                colour = GREEN
            elif env_state_space[z, row, col] != -1:
                colour = RED
            else:
                colour = WHITE

            cell = pygame.Rect(col * pitch + 1, row * pitch + 1, CONSTANT_SIZE, CONSTANT_SIZE)
            pygame.draw.rect(screen, colour, cell)



# --- Roll out the learned policy for a few random start / goal pairs ---

# Colours used by render().
BLACK = (0, 0, 0)
WHITE = (255, 255, 255)
GREEN = (0, 255, 0)
RED = (255, 0, 0)

# Window geometry: square cells with a 1-pixel gap between them and around
# the outer edge.
CONSTANT_SIZE = 50
height = env_state_space.shape[1]
width = env_state_space.shape[2]

size_y = (CONSTANT_SIZE * height) + height + 1
size_x = (CONSTANT_SIZE * width) + width + 1
screen_size = (size_x, size_y)

# Pause between rendered frames, in milliseconds.
wait_milli_sec = 350

# Initialise pygame and open the window once, not once per episode.
pygame.init()
screen = pygame.display.set_mode(screen_size)
pygame.display.set_caption("Badworld")
clock = pygame.time.Clock()

# Set to False when the user closes the window. The old code called
# pygame.quit() from inside the loop and then kept drawing on the closed
# display.
session = True

for episode in range(5):

    done = False
    reward_total = 0

    start_y = np.random.randint(low=0, high=env_y)
    start_x = np.random.randint(low=0, high=env_x)
    current_state = (start_y, start_x)

    term_state_y = np.random.randint(low=0, high=env_y)
    term_state_x = np.random.randint(low=0, high=env_x)
    terminal_state_grid = (term_state_y, term_state_x)

    print("Episode ", episode + 1)
    print("Starting State: ", current_state)

    # Index of the goal cell; it selects the reward / value / policy layer.
    z = env_term_state.index(terminal_state_grid)

    render()

    pygame.display.flip()
    pygame.time.wait(wait_milli_sec)

    while session:

        for event in pygame.event.get():
            if event.type == pygame.QUIT:
                session = False

        if not session:
            # Window closed: stop before touching the display again.
            break

        # Pi is a float array; convert the stored code back to an int.
        action = int(Pi[z, current_state[0], current_state[1]])
        obs, rew, done = step(current_state, action, z)

        current_state = obs
        reward_total += rew

        if done:
            break

        render()

        print("Cumulative Reward: " ,reward_total)

        pygame.display.flip()
        pygame.time.wait(wait_milli_sec)
        clock.tick(60)

    print("Ending state: ", current_state)

    if not session:
        # The user asked to quit: skip the remaining episodes.
        break

pygame.quit()

Episode  1
Starting State:  (5, 3)
Cumulative Reward:  -1.0
Cumulative Reward:  -2.0
Cumulative Reward:  -3.0
Cumulative Reward:  47.0
Ending state:  (4, 0)
Episode  2
Starting State:  (6, 6)
Cumulative Reward:  -1.0
Cumulative Reward:  -2.0
Cumulative Reward:  -3.0
Cumulative Reward:  -4.0
Cumulative Reward:  -5.0
Cumulative Reward:  -6.0
Cumulative Reward:  44.0
Ending state:  (8, 1)
Episode  3
Starting State:  (5, 6)
Cumulative Reward:  -1.0
Cumulative Reward:  -2.0
Cumulative Reward:  -3.0
Cumulative Reward:  47.0
Ending state:  (5, 2)
Episode  4
Starting State:  (2, 4)
Cumulative Reward:  -1.0
Cumulative Reward:  -2.0
Cumulative Reward:  -3.0
Cumulative Reward:  -4.0
Cumulative Reward:  -5.0
Cumulative Reward:  45.0
Ending state:  (4, 0)
Episode  5
Starting State:  (6, 4)
Cumulative Reward:  -1.0
Cumulative Reward:  -2.0
Cumulative Reward:  48.0
Ending state:  (7, 2)


In [8]:
# Repeats the pygame shutdown that already ends the previous cell; presumably
# a manual safety net for closing the window after an interrupted run.
pygame.quit()

In [9]:
# Text-drawing experiment (coordinate label on the agent's cell) - works, but the kernel dies after each rendering

# font = pygame.font.Font('freesansbold.ttf', 10) 
# draw_text = True

# if draw_text == True:
#     text_pos_x = current_state[1]*(CONSTANT_SIZE+1)+1
#     text_pos_y = current_state[0]*(CONSTANT_SIZE+1)+1
#     rect = pygame.Rect(text_pos_x, text_pos_y , CONSTANT_SIZE, CONSTANT_SIZE)
#     text_word = "(" +  str(current_state[0]) + "; " + str(current_state[1]) + ")"
#     text_pos_x = current_state[1]*(CONSTANT_SIZE+1)+1
#     text_pos_y = current_state[0]*(CONSTANT_SIZE+1)+1
#     text = font.render(text_word, True, BLACK, WHITE) 
#     textRect = text.get_rect()  
#     textRect.center = (text_pos_x+(CONSTANT_SIZE/2), text_pos_y+(CONSTANT_SIZE/2)) 
#     screen.blit(text, textRect)

# Trajectory Printing

In [10]:
# Symbol shown for each policy code: 0-3 are arrows for the four moves and
# 4 marks the goal cell.
switch_arrow = {0: "\u2191", 1: "\u2192", 2: "\u2193", 3: "\u2190", 4: "x"}

# One arrow grid per goal layer. Every symbol is followed by a space and
# every row by a newline.
for z in range(env_z):
    grid_rows = []
    for y in range(env_y):
        symbols = "".join(switch_arrow.get(Pi[z, y, x]) + " " for x in range(env_x))
        grid_rows.append(symbols + "\n")
    trajectory_grid = "".join(grid_rows)
    print(trajectory_grid, "\n")

# Leave the text buffer empty once all grids are printed.
trajectory_grid = ""

x ← ← ← ← ← ← ← ← ← 
↑ ↑ ↑ ↑ ↑ ↑ ↑ ↑ ↑ ↑ 
↑ ↑ ↑ ↑ ↑ ↑ ↑ ↑ ↑ ↑ 
↑ ↑ ↑ ↑ ↑ ↑ ↑ ↑ ↑ ↑ 
↑ ↑ ↑ ↑ ↑ ↑ ↑ ↑ ↑ ↑ 
↑ ↑ ↑ ↑ ↑ ↑ ↑ ↑ ↑ ↑ 
↑ ↑ ↑ ↑ ↑ ↑ ↑ ↑ ↑ ↑ 
↑ ↑ ↑ ↑ ↑ ↑ ↑ ↑ ↑ ↑ 
↑ ↑ ↑ ↑ ↑ ↑ ↑ ↑ ↑ ↑ 
↑ ↑ ↑ ↑ ↑ ↑ ↑ ↑ ↑ ↑ 
 

→ x ← ← ← ← ← ← ← ← 
↑ ↑ ↑ ↑ ↑ ↑ ↑ ↑ ↑ ↑ 
↑ ↑ ↑ ↑ ↑ ↑ ↑ ↑ ↑ ↑ 
↑ ↑ ↑ ↑ ↑ ↑ ↑ ↑ ↑ ↑ 
↑ ↑ ↑ ↑ ↑ ↑ ↑ ↑ ↑ ↑ 
↑ ↑ ↑ ↑ ↑ ↑ ↑ ↑ ↑ ↑ 
↑ ↑ ↑ ↑ ↑ ↑ ↑ ↑ ↑ ↑ 
↑ ↑ ↑ ↑ ↑ ↑ ↑ ↑ ↑ ↑ 
↑ ↑ ↑ ↑ ↑ ↑ ↑ ↑ ↑ ↑ 
↑ ↑ ↑ ↑ ↑ ↑ ↑ ↑ ↑ ↑ 
 

→ → x ← ← ← ← ← ← ← 
↑ ↑ ↑ ↑ ↑ ↑ ↑ ↑ ↑ ↑ 
↑ ↑ ↑ ↑ ↑ ↑ ↑ ↑ ↑ ↑ 
↑ ↑ ↑ ↑ ↑ ↑ ↑ ↑ ↑ ↑ 
↑ ↑ ↑ ↑ ↑ ↑ ↑ ↑ ↑ ↑ 
↑ ↑ ↑ ↑ ↑ ↑ ↑ ↑ ↑ ↑ 
↑ ↑ ↑ ↑ ↑ ↑ ↑ ↑ ↑ ↑ 
↑ ↑ ↑ ↑ ↑ ↑ ↑ ↑ ↑ ↑ 
↑ ↑ ↑ ↑ ↑ ↑ ↑ ↑ ↑ ↑ 
↑ ↑ ↑ ↑ ↑ ↑ ↑ ↑ ↑ ↑ 
 

→ → → x ← ← ← ← ← ← 
↑ ↑ ↑ ↑ ↑ ↑ ↑ ↑ ↑ ↑ 
↑ ↑ ↑ ↑ ↑ ↑ ↑ ↑ ↑ ↑ 
↑ ↑ ↑ ↑ ↑ ↑ ↑ ↑ ↑ ↑ 
↑ ↑ ↑ ↑ ↑ ↑ ↑ ↑ ↑ ↑ 
↑ ↑ ↑ ↑ ↑ ↑ ↑ ↑ ↑ ↑ 
↑ ↑ ↑ ↑ ↑ ↑ ↑ ↑ ↑ ↑ 
↑ ↑ ↑ ↑ ↑ ↑ ↑ ↑ ↑ ↑ 
↑ ↑ ↑ ↑ ↑ ↑ ↑ ↑ ↑ ↑ 
↑ ↑ ↑ ↑ ↑ ↑ ↑ ↑ ↑ ↑ 
 

→ → → → x ← ← ← ← ← 
↑ ↑ ↑ ↑ ↑ ↑ ↑ ↑ ↑ ↑ 
↑ ↑ ↑ ↑ ↑ ↑ ↑ ↑ ↑ ↑ 
↑ ↑ ↑ ↑ ↑ ↑ ↑ ↑ ↑ ↑ 
↑ ↑ ↑ ↑ ↑ ↑ ↑ ↑ ↑ ↑ 
↑ ↑ ↑ ↑ ↑ ↑ ↑ ↑ ↑ ↑ 
↑ ↑ ↑ ↑ ↑ ↑ ↑ ↑ ↑ ↑ 
↑