In [21]:
# Import libraries

import os, sys, warnings
import numpy as np
import matplotlib.pyplot as plt
from IPython import display
from IPython.display import Markdown as md

#sys.path.append('../Environment/')
#from Environment.environment import *
%run ../Environment/environment.ipynb

In [22]:
def print_matrix(name, matrix):
    """Render a numeric array as a LaTeX bmatrix wrapped in a Markdown object.

    Parameters
    ----------
    name : str
        Label written to the left of the equals sign.
    matrix : array_like
        Scalar, vector or matrix of numbers. The input is promoted with
        ``np.atleast_2d``, so a 1-D vector is rendered as a single row.

    Returns
    -------
    The result of calling ``md`` on the text ``name = $...$``, where the
    math part is a bmatrix environment with every entry formatted to three
    decimal places, columns separated by ``&`` and rows by a LaTeX line
    break.
    """
    matrix = np.atleast_2d(matrix)

    # flatten() walks the array row by row, so a row ends every
    # ``shape[-1]`` entries (the column count). Using ``shape[0]`` here
    # misplaces the row breaks for every non-square matrix.
    # max(..., 1) keeps range() valid when the array has zero columns.
    n_cols = max(matrix.shape[-1], 1)

    cells = [' {:0.3f} '.format(v) for v in matrix.flatten()]
    rows = ['&'.join(cells[start:start + n_cols])
            for start in range(0, len(cells), n_cols)]

    return md(name + ' = $' + '\\begin{bmatrix}' + '\\\\'.join(rows) + '\\end{bmatrix}$')

# Initialize Environment / Model Variables

In [23]:
# Start from a fresh environment instance
env = EnvironmentState()

# Action space: NUM_ACTIONS discrete actions labelled 0 .. NUM_ACTIONS-1
NUM_ACTIONS = 5
ACTIONS = np.arange(NUM_ACTIONS)

# State-space discretisation: the possible y positions of the agent
# (grid step 0.05 across the region height) and a grid of angles (step 3).
# Note that the two NUM_*_STATES values are floats because of the division.
NUM_POS_STATES = env.REGION_HEIGHT / 0.05 + 1
NUM_ANG_STATES = 160 / 3 + 1
POS_STATES = np.arange(-1 * env.REGION_HEIGHT / 2, env.REGION_HEIGHT / 2 + 0.05, 0.05).round(2)
ANG_STATES = np.arange(100, 260, 3).round(2) - 1

# Value table, zero-initialised, with shape (2, NUM_ACTIONS, NUM_ACTIONS).
# (An earlier layout sized it by position x angle states instead:
#  np.zeros((int(NUM_POS_STATES), int(NUM_ANG_STATES))).)
V = np.zeros((2, NUM_ACTIONS, NUM_ACTIONS))

# Policy: uniform probability over all actions
pi = np.full(NUM_ACTIONS, 1 / NUM_ACTIONS)

# Run TD Learning Algorithm

In [24]:
# Experiment configuration
NUM_OBS = 0         # number of obstacles
NUM_STEPS = 50      # steps in one episode
NUM_EPISODES = 50

# Learning hyper-parameters
alpha = 0.5         # step size of the value update
eps   = 0.01        # NOTE(review): not referenced anywhere in this cell
gamma = 0.50        # discount applied to the bootstrap term

# Per-episode statistics, filled in at the end of every episode
hits  = np.zeros(NUM_EPISODES)
R_SUM = np.zeros(NUM_EPISODES)
MSE   = np.zeros(NUM_EPISODES)

# Current and previous action; both index the value table V.
# NOTE(review): action_l is not reset between episodes, so the first update of
# an episode is indexed by the last action of the previous one.
action_c = 0
action_l = 0

for episode in range(NUM_EPISODES):
    # Try different levels of complexity for agent and target position
    env.initialize()
    env.randomize_obstruction(NUM_OBS)

    R_vector = np.zeros(NUM_STEPS)
    n = 0
    while n < NUM_STEPS:
        # Sample an action from the policy and apply it.
        # NOTE(review): pi is never modified in this cell, so sampling stays
        # uniform for the whole run.
        action_c = np.random.choice(ACTIONS, replace=True, p=pi)
        env.take_action(action_c)

        # Reward observed after the action
        R = env.compute_reward()
        R_vector[n] = R

        # Value update for the (previous action, current action) entry.
        # V[1] is the copy of V[0] taken at the end of the previous step;
        # the bootstrap term reads the same (action_l, action_c) entry that
        # is being updated.
        V[0, action_l, action_c] += alpha * (
            R + gamma * V[1, action_l, action_c] - V[0, action_l, action_c]
        )
        V[1, :, :] = V[0, :, :]

        action_l = action_c
        n += 1

    # Episode finished: record whether the final step hit the target
    # (a final reward of exactly 1 counts as a hit), the total reward and the
    # variance of the rewards collected during the episode.
    if R == 1:
        hits[episode] = 1
    else:
        hits[episode] = 0
    R_SUM[episode] = R_vector.sum()
    MSE[episode] = R_vector.var()

# print(hits)

In [25]:
# Plot, against the episode number, the running average of:
#   - the sum of rewards per episode
#   - the hit rate (accuracy)
#   - the per-episode reward variance (stored in MSE)
episodes = np.arange(1, NUM_EPISODES+1, 1)

# Running mean over the first k episodes = (sum of the first k values) / k.
# cumsum gives the numerator for every k at once and `episodes` is exactly
# 1..NUM_EPISODES, so numerator and denominator always cover the same number
# of episodes.
R_Sum_Ave = np.cumsum(R_SUM) / episodes
accuracy = np.cumsum(hits) / episodes
MSE_Ave = np.cumsum(MSE) / episodes

# One shared-x panel per statistic. plt.subplots creates the figure itself,
# so no separate plt.figure() call is needed (it would only open an extra,
# empty figure).
fig, axes = plt.subplots(figsize = [12,18], nrows = 3, ncols = 1, sharex = True)
ax1 = axes[0]; ax2 = axes[1]; ax3 = axes[2]

ax1.plot(episodes, R_Sum_Ave, 'r-')
ax1.set_ylabel('Average Sum of Rewards', fontsize = 14)

ax2.plot(episodes, accuracy, 'r-')
ax2.set_ylabel('Average Accuracy', fontsize = 14)

ax3.plot(episodes, MSE_Ave, 'r-')
ax3.set_ylabel('Mean Squared Error of Reward', fontsize = 14)
ax3.set_xlabel('Episodes', fontsize = 14)

plt.show(block = False)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [26]:
# Write the data to a comma separated text file for comparison with another algorithm.
# Naming format:
# Q-Learning_mo_nr.txt
# mo refers to the number of obstacles used when the model learned
#   (e.g., 0o is 0 obstacles, 1o is 1 obstacle, etc.)
# nr will be filled with:
#   nr: No Randomized (neither target nor agent were randomized per episode)
#   tr: Target Randomized (only the target was randomized per episode)
#   ar: Agent Randomized (only the agent was randomized per episode)
#   br: Both Randomized (both the target and agent were randomized per episode)

# Variables saved are (one row per episode):
#   Episodes
#   Hit/Miss for that episode
#   Sum of Rewards for that episode (SR)
#   Mean Square Error (Variance) for that episode (MSE)

# The with-block closes the file even if a write fails part-way through.
with open('Q-Learning_0o_tr.txt', 'w') as f:
    # Write the headers
    f.write('Episode,Hit,SR,MSE' + '\n')

    # Write the data: index every array with n so each row holds the four
    # scalar values of that episode rather than the whole R_SUM / MSE arrays.
    for n in range(NUM_EPISODES):
        f.write(','.join(str(value) for value in (episodes[n], hits[n], R_SUM[n], MSE[n])) + '\n')