# Action Grammars: A Grammar-Induction Based Method for Learning Temporally-Extended Actions
## Authors: Robert Lange and Aldo Faisal | January 2019

In [None]:
!pip install -r requirements.txt --quiet

In [1]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [2]:
import os
import time
import numpy as np

import gym
import gym_hanoi

In [3]:
from agents import Agent_Q, SMDP_Agent_Q, Macro, SMDPQTable
from utils.q_learning import  q_learning
from utils.learning_params import *
from utils.general import *
from utils.plotting import *

In [4]:
def _ensure_dir(path, label):
    """Create the directory `path` if it is missing and report the creation.

    Args:
        path: Directory path to create (intermediate directories included).
        label: Short name inserted into the "Created New ... Directory" message.

    Returns:
        The unchanged `path`, so the call can be used directly in an assignment.
    """
    if not os.path.exists(path):
        # exist_ok tolerates the directory appearing between the check above
        # and the creation call (e.g. a second run started in the same folder).
        os.makedirs(path, exist_ok=True)
        print("Created New {} Directory".format(label))
    return path


# Output locations under the current working directory. The trailing slash is
# kept so the values stay identical to the plain string concatenation.
results_dir = _ensure_dir(os.getcwd() + "/results/", "Results")  # learning performance
log_dir = _ensure_dir(os.getcwd() + "/logs/", "Log")             # logs
fig_dir = _ensure_dir(os.getcwd() + "/figures/", "Fig")          # figures

# Towers of Hanoi - Setup

In [15]:
# --- Training budget: number of episodes and step cap per episode ---
num_episodes, max_steps = 5, 500

# --- Evaluation schedule ---
# log_freq: a greedy policy evaluation is run every `log_freq` episodes.
# log_episodes: passed on to the evaluation/learning routines
#               (presumably the number of evaluation episodes - TODO confirm).
# learning_times: not used by the cells visible in this notebook section.
log_freq, log_episodes, learning_times = 10, 5, 5

# --- Console output switch ---
verbose = True

# --- Number of disks handed to the Hanoi environment ---
N = 4

# --- Learning hyperparameters for the "Q-Learning" setting ---
alpha, gamma, lambd, epsilon = learning_parameters(l_type="Q-Learning")
# Override the returned lambda with 0 for the runs below.
lambd = 0

### Simple TD($\lambda$) Learning

In [16]:
# Instantiate the Hanoi environment and configure it with N disks and
# env_noise=0 (echoed by the environment as "Transition Failure Probability: 0").
env = gym.make("Hanoi-v0")
env.set_env_parameters(num_disks=N, env_noise=0)

Hanoi Environment Parameters have been set to:
	 Number of Disks: 4
	 Transition Failure Probability: 0


In [22]:
# Train a fresh tabular agent with the shared experiment settings. The call
# returns the logged evaluation history and the experience-replay buffer.
# NOTE(review): the recorded output of this cell ends in an IndexError, so
# `hist`/`er_buffer` may not be assigned on a fresh run - investigate q_learning.
# NOTE(review): the last positional argument is a literal True; presumably the
# verbosity flag (a `verbose` setting exists in the config cell) - confirm.
agent = Agent_Q(env)
hist, er_buffer = q_learning(env, agent, num_episodes, max_steps,
                             gamma, alpha, lambd, epsilon,
                             log_freq, log_episodes, True)

[[0 0 0 0 1]]
0.0 (0, 0, 0, 0) (2, 0, 0, 0) [  0.   0. -inf -inf -inf -inf] [  0.   0. -inf -inf -inf -inf]
[]
(0, 0, 0, 0) 1
[[2 0 0 0 4]]
0.0 (2, 0, 0, 0) (0, 0, 0, 0) [  0. -inf -inf -inf   0.   0.] [  0. -inf -inf -inf   0.   0.]
[]
(2, 0, 0, 0) 4
[[0 0 0 0 1]]
0.0 (0, 0, 0, 0) (2, 0, 0, 0) [  0.   0. -inf -inf -inf -inf] [  0.   0. -inf -inf -inf -inf]
[]
(0, 0, 0, 0) 1
[[2 0 0 0 4]]
0.0 (2, 0, 0, 0) (0, 0, 0, 0) [  0. -inf -inf -inf   0.   0.] [  0. -inf -inf -inf   0.   0.]
[]
(2, 0, 0, 0) 4
[[0 0 0 0 1]]
0.0 (0, 0, 0, 0) (2, 0, 0, 0) [  0.   0. -inf -inf -inf -inf] [  0.   0. -inf -inf -inf -inf]
[]
(0, 0, 0, 0) 1
[[2 0 0 0 4]]
0.0 (2, 0, 0, 0) (0, 0, 0, 0) [  0. -inf -inf -inf   0.   0.] [  0. -inf -inf -inf   0.   0.]
[]
(2, 0, 0, 0) 4
[[0 0 0 0 1]]
0.0 (0, 0, 0, 0) (2, 0, 0, 0) [  0.   0. -inf -inf -inf -inf] [  0.   0. -inf -inf -inf -inf]
[]
(0, 0, 0, 0) 1
[[2 0 0 0 5]]
0.0 (2, 0, 0, 0) (1, 0, 0, 0) [  0. -inf -inf -inf   0.   0.] [  0. -inf -inf -inf   0.   0.]
[]
(2, 0, 

IndexError: index 0 is out of bounds for axis 0 with size 0

In [None]:
# Plot columns 0, 1 and 2 of the evaluation history under a "Steps" title.
# Presumably these are episode id, mean steps and std of steps - TODO confirm
# against the column layout produced by q_learning.
plot_learning(hist[:, 0], hist[:, 1], hist[:,2], title=r"Steps - Base TD($\lambda$) Learner")

In [None]:
# Plot columns 0, 3 and 4 of the evaluation history under a "Reward" title.
# Presumably these are episode id, mean return and std of return - TODO confirm
# against the column layout produced by q_learning.
plot_learning(hist[:, 0], hist[:, 3], hist[:, 4], title=r"Reward - Base TD($\lambda$) Learner")

In [None]:
# Roll out the trained agent in the environment, capped at max_steps.
# NOTE(review): presumably follows the greedy policy and returns/prints the
# visited action sequence - verify against utils.
get_rollout_policy(env, agent, max_steps)

### SMDP-Q-Learning

In [None]:
# Macro-action sequences keyed by number of disks; the list for the chosen
# disk count is turned into Macro objects further below.
# Each string is a sequence of primitive moves, presumably one letter (a-f) per
# primitive action of the Hanoi environment - TODO confirm the encoding.
# NOTE(review): "abdaef" appears twice in the 6-disk list - confirm whether the
# duplicate is intended.
seq_macros = {4: ["abd"],
              5: ["bafbcd", "baf", "ec", "bc"],
              6: ["abdaef", "abdced", "abdaef", "aedce",
                  "abdce", "abd", "ae", "ce"], 
              7: ["bafbcdbafecfbafbcdbcfecd", "bafbcdbafecf",
                  "bafecdbcfecbafbcdbcfec", "bafbcdbafec",
                  "bcfecbafbcec"]}

In [None]:
# Switch to the 5-disk puzzle for the SMDP experiment. This rebinds the global
# N and env that earlier cells set up for the 4-disk run.
N = 5
env = gym.make("Hanoi-v0")
env.set_env_parameters(num_disks=N, env_noise=0)

# One Macro per action sequence listed for the chosen disk count.
macros = [Macro(env, sequence) for sequence in seq_macros[N]]

In [None]:
# Build the SMDP Q-table from the environment's movability map and the macros.
Q = SMDPQTable(env.get_movability_map(), macros)

In [None]:
# SMDP agent operating on the 5-disk environment with the Q-table and macros
# created above. Note: this rebinds `agent` from the base Q-learning run.
agent = SMDP_Agent_Q(env, Q, macros)

In [None]:
log_template = "Ep: {:>2} | Avg/Std Steps: {:.2f}/{:.2f} | Avg/Std Ret: {:.2f}/{:.2f} | Success R: {:.2f}"
log_counter = 0

# A greedy evaluation is logged whenever ep_id % log_freq == 0, i.e. for
# episodes 0, log_freq, 2*log_freq, ... That is ceil(num_episodes / log_freq)
# rows. Truncating the quotient instead leaves too few rows (none at all when
# num_episodes < log_freq) and the write into hist below raises IndexError.
num_logs = (num_episodes + log_freq - 1) // log_freq
# Columns: ep_id, avg_steps, sd_steps, avg_ret, sd_ret, success_rate
hist = np.zeros((num_logs, 6))

# Init Replay Buffer
er_buffer = ReplayBuffer(num_episodes*max_steps)

for ep_id in range(num_episodes):

    cur_state = env.reset()

    # Per-episode bookkeeping
    stp = 0
    tot_td = 0
    rewards = []

    # Eligibility array is reset at the start of every episode
    eligibility = np.zeros(agent.q_func.table.shape)

    # Values from the previous step, handed to the update rule
    old_greedy_choice = None
    old_action = None
    old_state = None

    for i in range(max_steps):
        action = agent.epsilon_greedy_action(cur_state)
        next_state, reward, done, _ = env.step(action)
        greedy_choice = agent.greedy_action(next_state)

        # Update value function
        eligibility, tde = q_learning_update(gamma, alpha, lambd, agent.q_func,
                                             eligibility, cur_state, action,
                                             next_state, reward, done, stp,
                                             old_greedy_choice, old_action, old_state)

        # Extend replay buffer with the transition that was just executed:
        # `action` was taken in cur_state (not in old_state, which is the state
        # one step earlier and None on the first step of an episode).
        er_buffer.push(ep_id, cur_state, action, reward, next_state, done)

        # Update variables
        old_state = cur_state
        old_action = action
        old_greedy_choice = greedy_choice
        cur_state = next_state

        # Update counters
        stp += 1
        tot_td += tde
        rewards.append(reward)

        # Go to next episode if successfully ended
        if done:
            break

    # Periodic greedy evaluation of the current policy
    if ep_id % log_freq == 0:
        avg_steps, sd_steps, avg_ret, sd_ret, success_rate = greedy_eval(env, agent, gamma,
                                                                         max_steps, log_episodes)
        hist[log_counter,:] = np.array([ep_id, avg_steps, sd_steps,
                                        avg_ret, sd_ret, success_rate])
        log_counter += 1

        if verbose:
            # The second "Avg/Std" pair reports the return, so its spread is sd_ret.
            print(log_template.format(ep_id + 1, avg_steps, sd_steps,
                                      avg_ret, sd_ret, success_rate))

In [None]:
# Display the raw contents of the replay buffer filled by the loop above.
# NOTE(review): this shows the whole buffer; consider displaying only a few
# entries once the container type of `buffer` is confirmed.
er_buffer.buffer