# Action Grammars: A Grammar-Induction Based Method for Learning Temporally-Extended Actions
## Authors: Robert Lange and Aldo Faisal | January 2019

In [None]:
!pip install -r requirements.txt --quiet

In [28]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [29]:
import os
import time
import numpy as np

import gym
import gym_hanoi

In [30]:
from agents import *
from utils.learning import QTable, q_learning_update
from utils.learning_params import *
from utils.general import *

In [31]:
# Create directory - Learning performance
results_dir = os.getcwd() + "/results/"
if not os.path.exists(results_dir):
    os.makedirs(results_dir)
    print("Created New Results Directory")

# Create directory - Log directory
log_dir = os.getcwd() + "/logs/"
if not os.path.exists(log_dir):
    os.makedirs(log_dir)
    print("Created New Log Directory")
    
# Create directory - Figure directory
fig_dir = os.getcwd() + "/figures/"
if not os.path.exists(fig_dir):
    os.makedirs(fig_dir)
    print("Created New Fig Directory")

# Towers of Hanoi

In [41]:
num_episodes = 2000
max_steps = 100

log_freq = 100  # greedily evaluate policy after amount of episodes
log_episodes = 100
learning_times = 5

verbose = True
log_template = "Ep: {:>2} | Avg/Std Steps: {:.2f}/{:.2f} | Avg/Std Ret: {:.2f}/{:.2f} | Success R: {:.2f}"

N = 3

alpha, gamma, lambd, epsilon = learning_parameters()
hist = np.zeros((int(num_episodes/log_freq), 6))

In [42]:
env = gym.make("Hanoi-v0")
env.set_env_parameters(num_disks=N, env_noise=0)

Q = QTable(env.get_movability_map())
old_state = env.reset()
agent = Agent_Q(env, Q)
er_buffer = ReplayBuffer(num_episodes*max_steps)

Hanoi Environment Parameters have been set to:
	 Number of Disks: 3
	 Transition Failure Probability: 0


In [43]:
log_counter = 0

for ep_id in range(num_episodes):
    
    old_state = env.reset()
    
    stp = 0
    tot_td = 0
    rewards = []
        
    for i in range(max_steps):
        action = agent.epsilon_greedy_action(old_state)
        new_state, reward, done, _ = env.step(action)
                
        # Update value function
        tde = q_learning_update(gamma, alpha, agent.q_func,
                                old_state, action, new_state, reward)
        
        # Extend replay buffer
        er_buffer.push(ep_id, old_state, action, reward, new_state, done)
        old_state = new_state
        
        # Update counters
        stp += 1
        tot_td += tde
        rewards.append(reward)
        
        # Go to next episode if successfully ended
        if done:
            break
    
    if ep_id % log_freq == 0:
        avg_steps, sd_steps, avg_ret, sd_steps, success_rate = greedy_eval(env, agent, gamma, max_steps, log_episodes)
        hist[log_counter,:] = np.array([ep_id, avg_steps, sd_steps,
                                        avg_ret, sd_steps, success_rate])
        log_counter += 1
        
        if verbose:
            print(log_template.format(ep_id + 1, avg_steps, sd_steps,
                                      avg_ret, sd_steps, success_rate))

Ep:  1 | Avg/Std Steps: 48.54/18.04 | Avg/Std Ret: 17.51/18.04 | Success R: 0.50
Ep: 101 | Avg/Std Steps: 7.00/0.00 | Avg/Std Ret: 73.51/0.00 | Success R: 1.00
Ep: 201 | Avg/Std Steps: 7.00/0.00 | Avg/Std Ret: 73.51/0.00 | Success R: 1.00
Ep: 301 | Avg/Std Steps: 7.00/0.00 | Avg/Std Ret: 73.51/0.00 | Success R: 1.00
Ep: 401 | Avg/Std Steps: 7.00/0.00 | Avg/Std Ret: 73.51/0.00 | Success R: 1.00
Ep: 501 | Avg/Std Steps: 7.00/0.00 | Avg/Std Ret: 73.51/0.00 | Success R: 1.00
Ep: 601 | Avg/Std Steps: 7.00/0.00 | Avg/Std Ret: 73.51/0.00 | Success R: 1.00
Ep: 701 | Avg/Std Steps: 7.00/0.00 | Avg/Std Ret: 73.51/0.00 | Success R: 1.00
Ep: 801 | Avg/Std Steps: 7.00/0.00 | Avg/Std Ret: 73.51/0.00 | Success R: 1.00
Ep: 901 | Avg/Std Steps: 7.00/0.00 | Avg/Std Ret: 73.51/0.00 | Success R: 1.00
Ep: 1001 | Avg/Std Steps: 7.00/0.00 | Avg/Std Ret: 73.51/0.00 | Success R: 1.00
Ep: 1101 | Avg/Std Steps: 7.00/0.00 | Avg/Std Ret: 73.51/0.00 | Success R: 1.00
Ep: 1201 | Avg/Std Steps: 7.00/0.00 | Avg/Std Re

In [None]:
Q.table

In [None]:
allowed = env.get_movability_map()


In [None]:
idx = np.where(allowed[(0,0,1)] != -np.inf)[0]

In [None]:
np.random.choice(idx)

In [None]:
idx