In [23]:
import numpy as np
import random
import pygame

env_y = 5
env_x = 5
env_state_space = np.ones((env_y, env_x))
env_state_space = np.negative(env_state_space)

y_bound = env_y - 1
x_bound = env_x - 1

#env_term_state = [(0,0), (y_bound,0), (0,x_bound), (y_bound,x_bound)]

env_term_state = [(y_bound,x_bound)]

print(env_term_state)

for item in env_term_state:
    env_state_space[item] = 50

print("State Space: ", env_state_space.shape)
print(env_state_space)
print("Terminal State Co-ordinates", env_term_state)

[(4, 4)]
State Space:  (5, 5)
[[-1. -1. -1. -1. -1.]
 [-1. -1. -1. -1. -1.]
 [-1. -1. -1. -1. -1.]
 [-1. -1. -1. -1. -1.]
 [-1. -1. -1. -1. 50.]]
Terminal State Co-ordinates [(4, 4)]


# Movement Code of the Environment

In [2]:
def move(state,action):
    # (y,x)
    
    switch = {
        0: up,
        1: right,
        2: down,
        3: left,
        4: pickup
    }

    func = switch.get(action, lambda: "Invalid action")
    new_state = func(state)
    return new_state

#make sure moves are within a bound
def valid_move(state):
    if state[0] > y_bound:
        state = (state[0]-1, state[1])
    elif state[0] < 0:
        state = (state[0]+1, state[1])
        
    if state[1] > x_bound:
        state = (state[0], state[1] - 1)
    elif state[1] < 0:
        state = (state[0], state[1] + 1)
    
    return state
    
def up(state):
    new_state = (state[0] - 1, state[1])
    new_state = valid_move(new_state)
    return new_state

def right(state):
    new_state = (state[0], state[1] + 1)
    new_state = valid_move(new_state)
    return new_state
    
def down(state):
    new_state = (state[0] + 1, state[1])
    new_state = valid_move(new_state)
    return new_state
    
def left(state):
    new_state = (state[0], state[1] - 1)
    new_state = valid_move(new_state)
    return new_state

#unused for now
def pickup(state):
    return state

# Actual transition of the environment based on action

In [3]:
def step(state, action):        
    
    if state in env_term_state:
        done = True
        state_prime = state
        reward = 0
    else:
        done = False
        state_prime = move(state,action)
        reward = env_state_space[(state_prime[0], state_prime[1])]
        
    return state_prime, reward, done

In [4]:
ACTION_SPACE = list(range(0,4))

print("Action Space:")
print(ACTION_SPACE)
print(["Up", "Right", "Down", "Left"])

Reward = env_state_space

gamma = 1 
significant_improvement = 0.01 

Action Space:
[0, 1, 2, 3]
['Up', 'Right', 'Down', 'Left']


# Q Learning
## _$Q^{new}$($s_{t}$, $a_{t}$)  &larr; (1 - $\alpha$) $\cdot$ Q($s_{t}$, $a_{t}$) + $\alpha$ $\cdot$ ($r_{t}$ + $\gamma$ $\cdot$ $max_{a}$Q($s_{t+1}$, $a$))_

In [None]:
Q = np.zeros((len(ACTION_SPACE), env_y, env_x)) 
alpha = 0.000000000000001
gamma = 0.6
epsilon = 0.15

for iteration in range(10000):
    for y in range(env_y):
        for x in range(env_x):
            s = (y, x)      
                    
            if random.uniform(0, 1) < epsilon:
                action = random.choice(ACTION_SPACE)
            else:
                action = np.argmax(Q[:,y,x])

            s_new, rew, done = step(s,action) 
            old_Q = Q[action][y][x]
            
            next_max = np.max(Q[:, y, x])
            
            #next_max = np.argwhere(Q[:,y,x] == np.max(Q[:, y, x]))
            #next_max = random.choice(next_max)[0]

            next_max = np.max(Q[:, y, x])
            Q[action][y][x] = (1 - alpha) * old_Q + alpha * (rew + gamma * next_max)

print(Q)

In [44]:
print(Q[:,0,2])

[-0.12192875 -0.12192854 -0.12192829 -0.12192787]


In [31]:
def render():
    
    screen.fill(BLACK)
    
    for y in range(height):
        for x in range(width):                
            if y == current_state[0] and x == current_state[1]:
                colour = GREEN
            elif env_state_space[y,x] != -1:
                colour = RED
            else:
                colour = WHITE

            rect_pos_x = x*(CONSTANT_SIZE+1)+1
            rect_pos_y = y*(CONSTANT_SIZE+1)+1
            rect = pygame.Rect(rect_pos_x, rect_pos_y , CONSTANT_SIZE, CONSTANT_SIZE)
            pygame.draw.rect(screen, colour, rect)


for episode in range(5):
    
    done = False
    reward_total = 0

    start_y = np.random.randint(low=0, high=env_y)
    start_x = np.random.randint(low=0, high=env_x)
    current_state = (start_y, start_x)

    BLACK = (0, 0, 0)
    WHITE = (255, 255, 255)
    GREEN = (0, 255, 0)
    RED = (255, 0, 0)

    pygame.init()
    
    CONSTANT_SIZE = 50
    height = env_state_space.shape[0]
    width = env_state_space.shape[1]
    
    size_y = (CONSTANT_SIZE * height) + height + 1
    size_x = (CONSTANT_SIZE * width) + width + 1
    screen_size = (size_x, size_y)

    screen = pygame.display.set_mode(screen_size)

    pygame.display.set_caption("Badworld")

    session = True

    clock = pygame.time.Clock()
    wait_milli_sec = 350
    
    print("Episode ", episode + 1)
    print("Starting State: ", current_state)

    z = 0
    
    render()

    pygame.display.flip()
    pygame.time.wait(wait_milli_sec)    

    while session:

        for event in pygame.event.get():
            if event.type == pygame.QUIT:
                pygame.quit()
        
        action = np.argmax(Q[:,current_state[0],current_state[1]])
        
        #action = Pi[current_state[0],current_state[1]]
        obs, rew, done = step(current_state, action) 
        
        current_state = obs
        reward_total += rew
        
        if done == True:
            break

        render()
        
        print("Cumulative Reward: " ,reward_total)

        pygame.display.flip()
        pygame.time.wait(wait_milli_sec)
        clock.tick(60)
        
    print("Ending state: ", current_state)
    
pygame.quit()

Episode  1
Starting State:  (0, 4)
Cumulative Reward:  -1.0
Cumulative Reward:  -2.0
Cumulative Reward:  -3.0
Cumulative Reward:  -4.0
Cumulative Reward:  -5.0
Cumulative Reward:  -6.0
Cumulative Reward:  -7.0
Cumulative Reward:  -8.0
Cumulative Reward:  -9.0
Cumulative Reward:  -10.0


error: display Surface quit

In [8]:
pygame.quit()

# Trajectory Printing

In [47]:
switch_arrow = {
    0: u"\u2191",
    1: u"\u2192",
    2: u"\u2193",
    3: u"\u2190",
    4: "x"
}

trajectory_grid = ""

for y in range(env_y):
    for x in range(env_x):
        if((y,x) in env_term_state):
            trajectory_grid += "x "
        else:
            trajectory_grid += switch_arrow.get(np.argmax(Q[:,y,x])) + " "
    trajectory_grid += "\n"
    
        
print(trajectory_grid, "\n")
trajectory_grid = ""

↑ ↑ → ↓ ← 
← ↑ → ↓ → 
→ ↓ → ← ↓ 
↓ ↓ → ↓ ↓ 
↑ → ← → x 
 



# Q Learning
## _$Q^{new}$($s_{t}$, $a_{t}$)  &larr; (1 - $\alpha$) $\cdot$ Q($s_{t}$, $a_{t}$) + $\alpha$ $\cdot$ ($r_{t}$ + $\gamma$ $\cdot$ $max_{a}$Q($s_{t+1}$, $a$))_

In [10]:
# Q = np.zeros((len(ACTION_SPACE), env_y, env_x)) 
# alpha = 0.1
# gamma = 0.6

# for iteration in range(20):
#     for y in range(env_y):
#         for x in range(env_x):
#             s = (y, x)
#             for action in ACTION_SPACE:
#                 s_new, rew, done = step(s,action) 
#                 old_Q = Q[action][y][x]
#                 next_max = np.max(Q[:, y, x])
#                 Q[action][y][x] = (1 - alpha) * old_Q + alpha * (rew + gamma * next_max)

# print(Q)

In [11]:
temp = np.zeros((1,3))
temp[0][0] = 1
temp[0][1] = 1 

print(np.argwhere(temp == 0)[0][1])

2
