In [92]:
# Import libraries

import os, sys, warnings
import numpy as np
import matplotlib.pyplot as plt

#sys.path.append('../Environment/')
#from Environment.environment import *
%run ../Environment/environment.ipynb

HBox(children=(VBox(children=(Button(description='Pos+', style=ButtonStyle()), Button(description='Pos-', styl…

In [104]:
# Initialize the discretized state space and the Q table

# Restart the environment
env = EnvironmentState()

NUM_ACTIONS = 5
ACTIONS = np.arange(NUM_ACTIONS)

# Discretization of the state space: the agent's y position moves in steps of
# POS_STEP across the region height, and its aiming angle moves in steps of
# ANGLE_STEP between ANGLE_MIN and ANGLE_MAX (inclusive).
POS_STEP = 0.05
ANGLE_MIN, ANGLE_MAX, ANGLE_STEP = 102, 258, 3

# Counts are integers. round() guards against float division landing just
# below a whole number (which a bare int() would truncate).
NUM_POSITIONS = int(round(env.REGION_HEIGHT / POS_STEP)) + 1
NUM_ANGLES    = (ANGLE_MAX - ANGLE_MIN) // ANGLE_STEP + 1
NUM_STATES    = NUM_POSITIONS * NUM_ANGLES

# Grid values along each axis. linspace takes the number of points explicitly,
# so the length cannot drift by one the way arange with a float step can.
# Positions are rounded to 2 decimals so they compare equal to the rounded
# agent position used when looking up a state.
_y_grid = np.round(
    np.linspace(-env.REGION_HEIGHT / 2, env.REGION_HEIGHT / 2, NUM_POSITIONS), 2
)
_angle_grid = np.linspace(ANGLE_MIN, ANGLE_MAX, NUM_ANGLES)

# Flat state tables: row k describes one (position, angle) pair. Positions
# cycle fastest and angles change every NUM_POSITIONS rows, i.e. the same
# layout as a column-major flattening of a (position x angle) grid.
POS_STATES = np.tile(_y_grid, NUM_ANGLES)
ANG_STATES = np.repeat(_angle_grid, NUM_POSITIONS)

# Initialize the Q values: one row per state, one column per action
Q = np.zeros((NUM_STATES, NUM_ACTIONS))

assert POS_STATES.shape == ANG_STATES.shape == (NUM_STATES,)

# Now Q is a state x action matrix
# Column 1 is wait action, 2 is pos up, 3 is pos down, 4 is aim up, 5 is aim down
# Each row corresponds to a (pos & angle) state

In [105]:
# Q-learning training run

# Episode settings
NUM_OBS = 0 # Number of obstacles
NUM_STEPS = 20 # Number of steps in an episode
NUM_EPISODES = 100

# Learning parameters
alpha = 0.1   # step size of the Q update
eps   = 0.1   # exploration probability of the epsilon-greedy policy
gamma = 0.9   # discount applied to the next state's best Q value
hits = np.zeros((NUM_EPISODES))


def _state_index(env):
    """Return the row of Q matching the environment's current (position, angle).

    The position is rounded to 2 decimals before comparison, matching how
    POS_STATES was built.

    Raises:
        LookupError: if the current state does not match exactly one row.
            Without this check an off-grid state would yield an empty index
            and the Q update would silently do nothing.
    """
    pos = np.round(env._agent_position_y, 2)
    ang = env._agent_aiming_angle
    matches = np.where((POS_STATES == pos) & (ANG_STATES == ang))[0]
    if matches.size != 1:
        raise LookupError(
            "state (position=%r, angle=%r) matched %d rows of the state table"
            % (pos, ang, matches.size)
        )
    return int(matches[0])


def _epsilon_greedy(q_row, eps):
    """Return epsilon-greedy action probabilities for one row of Q.

    Every action gets eps / n; the remaining (1 - eps) is shared equally
    among all actions tied for the maximum, so the result always sums to 1.
    When all values are equal this reduces to the uniform policy.
    """
    pi = np.full(q_row.shape, eps / q_row.size)
    best = q_row == q_row.max()
    pi[best] += (1 - eps) / best.sum()
    return pi


# Use a Q-Learning algorithm based on the algorithm on page 131 of Sutton and Barto.
for episode in range(NUM_EPISODES):
    env.randomize(NUM_OBS)
    R = 0  # keeps the hit check below defined even if NUM_STEPS is 0

    for _ in range(NUM_STEPS):
        # Find the current state
        s = _state_index(env)

        # Choose and take an action based on the epsilon-greedy policy
        pi = _epsilon_greedy(Q[s], eps)
        action = np.random.choice(ACTIONS, p = pi)
        env.take_action(action)

        # With the action taken, the agent is now in the future state s'. Find that state
        s_prime = _state_index(env)

        # Collect the reward
        R = env.compute_reward()

        # Update the Q value towards the reward plus the discounted best value of s'
        Q[s, action] += alpha*(R + gamma*np.max(Q[s_prime]) - Q[s, action])

    # At the end of the episode, record whether the final step's reward was 1
    hits[episode] = 1 if R == 1 else 0

print(Q)
print(hits)
print(np.sum(hits)/len(hits))

[[0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 ...
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]]
[0. 1. 0. 1. 1. 0. 0. 0. 0. 0. 1. 0. 1. 0. 0. 1. 0. 0. 0. 0. 1. 1. 0. 0.
 0. 1. 0. 0. 1. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 1. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 1. 0. 0.
 0. 0. 0. 0.]
0.16


In [106]:
# Sanity check: show the agent's current (rounded position, angle) state,
# the total number of states, and the environment's grid of y positions.
_current_state = (round(env._agent_position_y, 2), env._agent_aiming_angle)
print(*_current_state)

for _value in (NUM_POSITIONS * NUM_ANGLES, env.Y_POSITIONS):
    print(_value)

-0.05 177
2173.0
[-1.   -0.95 -0.9  -0.85 -0.8  -0.75 -0.7  -0.65 -0.6  -0.55 -0.5  -0.45
 -0.4  -0.35 -0.3  -0.25 -0.2  -0.15 -0.1  -0.05  0.    0.05  0.1   0.15
  0.2   0.25  0.3   0.35  0.4   0.45  0.5   0.55  0.6   0.65  0.7   0.75
  0.8   0.85  0.9   0.95  1.  ]
