# Action Grammars: A Grammar-Induction Based Method for Learning Temporally-Extended Actions
## Authors: Robert Lange and Aldo Faisal | January 2019

In [None]:
!pip install -r requirements.txt --quiet

In [1]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [2]:
import os
import time
import numpy as np

import gym
import gym_hanoi

In [3]:
from agents import *
from utils.learning import QTable, q_learning_update
from utils.learning_params import *
from utils.general import *
from utils.plotting import *

The text.latex.unicode rcparam was deprecated in Matplotlib 2.2 and will be removed in 3.1.
  "2.2", name=key, obj_type="rcparam", addendum=addendum)


In [4]:
def _ensure_dir(name, label):
    """Return the path of sub-directory `name` of the CWD, creating it if absent.

    Args:
        name: Directory name relative to the current working directory.
        label: Human-readable label used in the "Created New ... Directory"
            message that is printed only when the directory is newly created.

    Returns:
        The directory path with a trailing path separator, so file names can
        be appended to it by plain string concatenation.

    Raises:
        FileExistsError: If `name` exists but is not a directory.
    """
    path = os.path.join(os.getcwd(), name, "")  # trailing "" keeps the separator
    is_new = not os.path.exists(path)
    # exist_ok=True avoids failing if the directory appears between the check
    # above and the creation (e.g. a parallel run of this notebook).
    os.makedirs(path, exist_ok=True)
    if is_new:
        print("Created New {} Directory".format(label))
    return path


# Learning performance
results_dir = _ensure_dir("results", "Results")

# Log directory
log_dir = _ensure_dir("logs", "Log")

# Figure directory
fig_dir = _ensure_dir("figures", "Fig")

# Towers of Hanoi

In [24]:
# --- Training schedule ---
num_episodes = 2000  # number of training episodes
max_steps = 500      # upper bound on environment steps per episode

# --- Evaluation / logging schedule ---
log_freq = 20  # greedily evaluate policy after amount of episodes
log_episodes = 100  # passed to greedy_eval; presumably the number of evaluation episodes - TODO confirm
learning_times = 5  # not used by the cells visible here - presumably number of repeated runs, TODO confirm

verbose = True
log_template = "Ep: {:>2} | Avg/Std Steps: {:.2f}/{:.2f} | Avg/Std Ret: {:.2f}/{:.2f} | Success R: {:.2f}"

# Number of disks of the Towers of Hanoi environment.
N = 4

# Start from the project defaults, then deliberately override `lambd`
# for this experiment.
alpha, gamma, lambd, epsilon = learning_parameters()
lambd = 0.1

# One row per greedy evaluation with columns
# [ep_id, avg_steps, sd_steps, avg_ret, sd_ret, success_rate].
# Evaluations run on episodes 0, log_freq, 2*log_freq, ... < num_episodes,
# i.e. ceil(num_episodes / log_freq) times; sizing with int(num_episodes / log_freq)
# would be one row short whenever num_episodes is not a multiple of log_freq.
hist = np.zeros((len(range(0, num_episodes, log_freq)), 6))

In [25]:
# Instantiate the Hanoi environment and configure it with N disks and
# no transition noise.
env = gym.make("Hanoi-v0")
env.set_env_parameters(env_noise=0, num_disks=N)

# Q-table built from the environment's movability map.
movability_map = env.get_movability_map()
Q = QTable(movability_map)

# Reset the environment before constructing the Q-learning agent on top of it.
old_state = env.reset()
agent = Agent_Q(env, Q)

# Size the replay buffer by the largest number of transitions training can
# produce (presumably the argument is the buffer capacity - TODO confirm).
buffer_capacity = num_episodes * max_steps
er_buffer = ReplayBuffer(buffer_capacity)

Hanoi Environment Parameters have been set to:
	 Number of Disks: 4
	 Transition Failure Probability: 0


In [None]:
# Train the agent with q_learning_update for `num_episodes` episodes and,
# every `log_freq` episodes, evaluate the greedy policy and store the result
# in the next free row of `hist`.
log_counter = 0  # index of the next row of `hist` to fill

for ep_id in range(num_episodes):

    cur_state = env.reset()

    # Per-episode bookkeeping.
    stp = 0
    tot_td = 0
    rewards = []

    # Fresh eligibility array (same shape as the Q-table) for every episode.
    eligibility = np.zeros(agent.q_func.table.shape)

    # Quantities of the previous step; None until the first step is done.
    old_greedy_choice = None
    old_action = None
    old_state = None

    for i in range(max_steps):
        action = agent.epsilon_greedy_action(cur_state)
        next_state, reward, done, _ = env.step(action)
        greedy_choice = agent.greedy_action(next_state)

        # Update value function
        eligibility, tde = q_learning_update(gamma, alpha, lambd, agent.q_func,
                                             eligibility, cur_state, action,
                                             next_state, reward, done, stp,
                                             old_greedy_choice, old_action, old_state)

        # Extend replay buffer with the transition that was just experienced.
        # The source state is `cur_state` (where `action` was taken), not
        # `old_state`, which is None on the first step and otherwise lags one
        # step behind.
        er_buffer.push(ep_id, cur_state, action, reward, next_state, done)

        # Update variables
        old_state = cur_state
        old_action = action
        old_greedy_choice = greedy_choice
        cur_state = next_state

        # Update counters
        stp += 1
        tot_td += tde
        rewards.append(reward)

        # Go to next episode if successfully ended
        if done:
            break

    if ep_id % log_freq == 0:
        avg_steps, sd_steps, avg_ret, sd_ret, success_rate = greedy_eval(env, agent, gamma,
                                                                         max_steps, log_episodes)
        hist[log_counter, :] = np.array([ep_id, avg_steps, sd_steps,
                                         avg_ret, sd_ret, success_rate])
        log_counter += 1

        if verbose:
            # The fourth and fifth slots of the template are Avg/Std *return*,
            # so report sd_ret there (not sd_steps).
            print(log_template.format(ep_id + 1, avg_steps, sd_steps,
                                      avg_ret, sd_ret, success_rate))

Ep:  1 | Avg/Std Steps: 15.00/0.00 | Avg/Std Ret: 48.77/0.00 | Success R: 1.00
Ep: 21 | Avg/Std Steps: 15.00/0.00 | Avg/Std Ret: 48.77/0.00 | Success R: 1.00
Ep: 41 | Avg/Std Steps: 15.00/0.00 | Avg/Std Ret: 48.77/0.00 | Success R: 1.00
Ep: 61 | Avg/Std Steps: 15.00/0.00 | Avg/Std Ret: 48.77/0.00 | Success R: 1.00
Ep: 81 | Avg/Std Steps: 15.00/0.00 | Avg/Std Ret: 48.77/0.00 | Success R: 1.00
Ep: 101 | Avg/Std Steps: 15.00/0.00 | Avg/Std Ret: 48.77/0.00 | Success R: 1.00
Ep: 121 | Avg/Std Steps: 15.00/0.00 | Avg/Std Ret: 48.77/0.00 | Success R: 1.00
Ep: 141 | Avg/Std Steps: 15.00/0.00 | Avg/Std Ret: 48.77/0.00 | Success R: 1.00
Ep: 161 | Avg/Std Steps: 15.00/0.00 | Avg/Std Ret: 48.77/0.00 | Success R: 1.00
Ep: 181 | Avg/Std Steps: 16.00/0.00 | Avg/Std Ret: 46.33/0.00 | Success R: 1.00
Ep: 201 | Avg/Std Steps: 15.00/0.00 | Avg/Std Ret: 48.77/0.00 | Success R: 1.00
Ep: 221 | Avg/Std Steps: 15.00/0.00 | Avg/Std Ret: 48.77/0.00 | Success R: 1.00
Ep: 241 | Avg/Std Steps: 15.00/0.00 | Avg/Std