# Action Grammars: A Grammar-Induction Based Method for Learning Temporal Abstractions
## Authors: Robert Lange and Aldo Faisal | April 2019

In [None]:
# Uncomment to install the project dependencies into the kernel's environment:
# %pip install -r requirements.txt --quiet

In [1]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [2]:
import os
import time
import numpy as np

import gym
import gym_hanoi

In [5]:
from agents.q_agent import Agent_Q
from agents.smdp_q_agent import SMDP_Agent_Q, Macro, SMDPQTable
# from agents.a2c_agent import ActorCritic, train_a2c_agent

from learning.q_learning import  q_learning
from learning.smdp_q_learning import smdp_q_learning, smdp_q_online_learning

from learning.learning_params import *
from learning.run_learning import *

from utils.general import *

In [6]:
def _ensure_dir(folder, label):
    """Return `<cwd>/<folder>/`, creating the directory if it does not exist.

    Args:
        folder: Directory name relative to the current working directory.
        label: Word used in the "Created New <label> Directory" message.

    Returns:
        The directory path as a string with a trailing slash.
    """
    path = os.getcwd() + "/" + folder + "/"
    if not os.path.exists(path):
        # exist_ok=True keeps this from crashing if the directory appears
        # between the existence check and the creation call.
        os.makedirs(path, exist_ok=True)
        print("Created New " + label + " Directory")
    return path


# Create directory - Learning performance
results_dir = _ensure_dir("results", "Results")

# Create directory - Log directory
log_dir = _ensure_dir("logs", "Log")

# Create directory - Figure directory
fig_dir = _ensure_dir("figures", "Fig")

## Towers of Hanoi - Setup

In [7]:
# Show the hyperparameter dictionaries for the baseline learner and the
# imitation SMDP learner, one per line.
print(*map(learning_parameters, ("Q-Learning", "Imitation-SMDP-Q-Learning")),
      sep="\n")

{'alpha': 0.8, 'gamma': 0.95, 'lambd': 0.1, 'epsilon': 0.1}
{'alpha': 0.8, 'gamma': 0.95, 'lambd': 0.0, 'epsilon': 0.1}


In [8]:
# Training budget per puzzle size, keyed by the number of disks.
# Each row is (number of disks, number of episodes, step cap per episode).
learning_setup = {
    disks: {"num_episodes": episodes, "max_steps": steps}
    for disks, episodes, steps in (
        (4, 300, 500),
        (5, 1250, 1000),
        (6, 5000, 2000),
        (7, 10000, 4000),
        (8, 20000, 8000),
    )
}

### Simple TD($\lambda$) Learning

In [9]:
# Setup for the N=6 disk Towers of Hanoi environment
# (the previous comment said N=4, which did not match the code below).

# Logging controls handed to the learners.
# NOTE(review): exact semantics live in the learning modules - presumably
# `log_freq` is the episode interval between log lines and `log_episodes` the
# number of evaluation rollouts per log line; confirm.
log_episodes = 10
log_freq = 50

# Problem size and the matching training budget from `learning_setup`.
N = 6
num_episodes = learning_setup[N]["num_episodes"]
max_steps = learning_setup[N]["max_steps"]

# Noise-free environment; `num_disks` is passed by keyword like in the other
# setup cells of this notebook.
env = gym.make("Hanoi-v0")
env.set_env_parameters(num_disks=N, env_noise=0, verbose=False)

  result = entry_point.load(False)


In [10]:
# One verbose Q-learning run on the environment configured in the setup cell.
# `hist` and `er_buffer` are the two values returned by `q_learning`.
params = learning_parameters(l_type="Q-Learning")
agent = Agent_Q(env)
hist, er_buffer = q_learning(
    env, agent, num_episodes, max_steps,
    log_freq=log_freq, log_episodes=log_episodes, verbose=True,
    **params)

KeyboardInterrupt: 

In [9]:
# Q-learning baseline on the 4-disk puzzle, repeated `num_times` times.
# The statistics are also written to the file given by `save_fname`.
num_times, num_disks = 5, 4
num_episodes, max_steps = (
    learning_setup[num_disks][key] for key in ("num_episodes", "max_steps")
)
env, agent, stats_4_q = run_learning(
    "Q-Learning", num_times, num_disks, num_episodes, max_steps,
    log_episodes, log_freq, save_fname="results/4_disks_q.txt")

4 Disks - Q-Learning: Run 1/5 Done - Time: 1.41
4 Disks - Q-Learning: Run 2/5 Done - Time: 1.15
4 Disks - Q-Learning: Run 3/5 Done - Time: 1.13
4 Disks - Q-Learning: Run 4/5 Done - Time: 2.41
4 Disks - Q-Learning: Run 5/5 Done - Time: 0.88
Outfiled the results to results/4_disks_q.txt.


In [10]:
# Q-learning baseline on the 5-disk puzzle; `num_times` is reused from the
# 4-disk cell above.
num_disks = 5
num_episodes, max_steps = (
    learning_setup[num_disks][key] for key in ("num_episodes", "max_steps")
)
env, agent, stats_5_q = run_learning(
    "Q-Learning", num_times, num_disks, num_episodes, max_steps,
    log_episodes, log_freq, save_fname="results/5_disks_q.txt")

5 Disks - Q-Learning: Run 1/5 Done - Time: 16.61
5 Disks - Q-Learning: Run 2/5 Done - Time: 20.75
5 Disks - Q-Learning: Run 3/5 Done - Time: 17.17
5 Disks - Q-Learning: Run 4/5 Done - Time: 16.8
5 Disks - Q-Learning: Run 5/5 Done - Time: 26.66
Outfiled the results to results/5_disks_q.txt.


In [None]:
# Q-learning baseline on the 6-disk puzzle; `num_times` is reused from the
# 4-disk cell above.
num_disks = 6
num_episodes, max_steps = (
    learning_setup[num_disks][key] for key in ("num_episodes", "max_steps")
)
env, agent, stats_6_q = run_learning(
    "Q-Learning", num_times, num_disks, num_episodes, max_steps,
    log_episodes, log_freq, save_fname="results/6_disks_q.txt")

In [11]:
# Return a greedy rollout Experience Replay Episode.
# In the recorded output this is a deque of tuples that look like
# (step index, state, action, next state).
# NOTE(review): `env` and `agent` are whatever the most recently executed cell
# left behind (the recorded output shows 5-element states) - confirm which
# training run this is meant to inspect.
get_rollout_policy(env, agent, max_steps, grammar=False)

deque([(0, (0, 0, 0, 0, 0), 1, (2, 0, 0, 0, 0)),
       (1, (2, 0, 0, 0, 0), 0, (2, 1, 0, 0, 0)),
       (2, (2, 1, 0, 0, 0), 5, (1, 1, 0, 0, 0)),
       (3, (1, 1, 0, 0, 0), 1, (1, 1, 2, 0, 0)),
       (4, (1, 1, 2, 0, 0), 2, (0, 1, 2, 0, 0)),
       (5, (0, 1, 2, 0, 0), 3, (0, 2, 2, 0, 0)),
       (6, (0, 2, 2, 0, 0), 1, (2, 2, 2, 0, 0)),
       (7, (2, 2, 2, 0, 0), 0, (2, 2, 2, 1, 0)),
       (8, (2, 2, 2, 1, 0), 5, (1, 2, 2, 1, 0)),
       (9, (1, 2, 2, 1, 0), 4, (1, 0, 2, 1, 0)),
       (10, (1, 0, 2, 1, 0), 2, (0, 0, 2, 1, 0)),
       (11, (0, 0, 2, 1, 0), 5, (0, 0, 1, 1, 0)),
       (12, (0, 0, 1, 1, 0), 1, (2, 0, 1, 1, 0)),
       (13, (2, 0, 1, 1, 0), 0, (2, 1, 1, 1, 0)),
       (14, (2, 1, 1, 1, 0), 5, (1, 1, 1, 1, 0)),
       (15, (1, 1, 1, 1, 0), 1, (1, 1, 1, 1, 2)),
       (16, (1, 1, 1, 1, 2), 2, (0, 1, 1, 1, 2)),
       (17, (0, 1, 1, 1, 2), 3, (0, 2, 1, 1, 2)),
       (18, (0, 2, 1, 1, 2), 1, (2, 2, 1, 1, 2)),
       (19, (2, 2, 1, 1, 2), 2, (2, 2, 0, 1, 2)),
       (20

In [12]:
# Same rollout call with grammar=True: the recorded result is a single string
# with one letter (a-f) per action - presumably the encoding that the grammar
# inference cells below expect; confirm against utils.
get_rollout_policy(env, agent, max_steps, grammar=True)

'bafbcdbafecfbafbcdbcefbcedabdafecdabd'

### Check Context-Free Grammar Inference

In [13]:
# Reference action strings per number of disks, one letter (a-f) per move.
# Each string has 2**N - 1 characters (15, 31, 63, 127).
optimal_policies = {
    4: "abdaefabdcedabd",
    5: "bafbcdbafecfbafbcdbcfecdbafbcdb",
    6: "".join((
        "abdaefabdcedabdaefaedcefabdaefabdcedabdce",
        "faedcedabdaefabdcedabd",
    )),
    7: "".join((
        "bafbcdbafecfbafbcdbcfecdbafbcdbafecfbafec",
        "dbcfecfbafbcdbafecfbafbcdbcfecdbafbcdbcfe",
        "cfbafecdbcfecdbafbcdbafecfbafbcdbcfecdbafbcdb",
    )),
}

In [14]:
from grammars.cfg_grammar import *

In [15]:
# Infer macro-actions from the 4-disk reference action string using the
# "sequitur" option. The recorded output prints the induced productions and
# returns ['abd'].
# NOTE(review): the meaning of the positional arguments "all", 6 and 2 is
# defined in the imported project modules and is not visible here - confirm
# before changing them.
get_macros("all", optimal_policies[4], 6, "sequitur", 2)

['1 a e f 1 c e d 1 \\n ', 'a b d ']


['abd']

In [16]:
# get_macros("all", optimal_policies[4], 6, "lexis", 2)

### Imitation SMDP-Q-Learning

In [17]:
# Logging controls and problem size for the imitation SMDP-Q experiment.
log_episodes, log_freq = 10, 20

# 4-disk puzzle with the matching training budget from `learning_setup`.
N = 4
num_episodes, max_steps = (
    learning_setup[N][key] for key in ("num_episodes", "max_steps")
)

# Noise-free Towers of Hanoi environment.
env = gym.make("Hanoi-v0")
env.set_env_parameters(num_disks=N, env_noise=0, verbose=False)

In [18]:
# One verbose SMDP-Q-learning run whose agent is built with the macros that
# `get_optimal_macros` returns for the same number of disks ("Sequitur" option).
params = learning_parameters(l_type="Imitation-SMDP-Q-Learning")
macros = get_optimal_macros(env, N, "Sequitur")
agent = SMDP_Agent_Q(env, macros)
hist, er_buffer = smdp_q_learning(
    env, agent, num_episodes, max_steps,
    log_freq=log_freq, log_episodes=log_episodes, verbose=True,
    **params)

Ep:  1 | Avg/Std Steps: 182.86/96.46 | Avg/Std Ret: 0.28/96.46 | Success R: 0.70
Ep: 21 | Avg/Std Steps: 146.33/103.63 | Avg/Std Ret: 4.00/103.63 | Success R: 0.30
Ep: 41 | Avg/Std Steps: 19.00/0.00 | Avg/Std Ret: 39.72/0.00 | Success R: 1.00
Ep: 61 | Avg/Std Steps: 19.00/0.00 | Avg/Std Ret: 39.72/0.00 | Success R: 1.00
Ep: 81 | Avg/Std Steps: 18.00/0.00 | Avg/Std Ret: 41.81/0.00 | Success R: 1.00
Ep: 101 | Avg/Std Steps: 18.00/0.00 | Avg/Std Ret: 41.81/0.00 | Success R: 1.00
Ep: 121 | Avg/Std Steps: 18.00/0.00 | Avg/Std Ret: 41.81/0.00 | Success R: 1.00
Ep: 141 | Avg/Std Steps: 17.00/0.00 | Avg/Std Ret: 44.01/0.00 | Success R: 1.00
Ep: 161 | Avg/Std Steps: 17.00/0.00 | Avg/Std Ret: 44.01/0.00 | Success R: 1.00
Ep: 181 | Avg/Std Steps: 17.00/0.00 | Avg/Std Ret: 44.01/0.00 | Success R: 1.00
Ep: 201 | Avg/Std Steps: 17.00/0.00 | Avg/Std Ret: 44.01/0.00 | Success R: 1.00
Ep: 221 | Avg/Std Steps: 17.00/0.00 | Avg/Std Ret: 44.01/0.00 | Success R: 1.00
Ep: 241 | Avg/Std Steps: 17.00/0.00 | A

In [19]:
# Imitation SMDP-Q-learning on the 4-disk puzzle, repeated `num_times` times.
# The statistics are also written to the file given by `save_fname`.
num_times, num_disks = 5, 4
num_episodes, max_steps = (
    learning_setup[num_disks][key] for key in ("num_episodes", "max_steps")
)
env, agent, stats_4_smdp_imi = run_learning(
    "Imitation-SMDP-Q-Learning", num_times, num_disks,
    num_episodes, max_steps, log_episodes, log_freq,
    save_fname="results/4_disks_smdp_imi.txt")

4 Disks - Imitation-SMDP-Q-Learning: Run 1/5 Done - Time: 1.3
4 Disks - Imitation-SMDP-Q-Learning: Run 2/5 Done - Time: 1.07
4 Disks - Imitation-SMDP-Q-Learning: Run 3/5 Done - Time: 1.12
4 Disks - Imitation-SMDP-Q-Learning: Run 4/5 Done - Time: 1.39
4 Disks - Imitation-SMDP-Q-Learning: Run 5/5 Done - Time: 1.32
Outfiled the results to results/4_disks_smdp_imi.txt.


In [21]:
# Imitation SMDP-Q-learning on the 5-disk puzzle; `num_times` is reused from
# the 4-disk cell above.
num_disks = 5
num_episodes, max_steps = (
    learning_setup[num_disks][key] for key in ("num_episodes", "max_steps")
)
env, agent, stats_5_smdp_imi = run_learning(
    "Imitation-SMDP-Q-Learning", num_times, num_disks,
    num_episodes, max_steps, log_episodes, log_freq,
    save_fname="results/5_disks_smdp_imi.txt")

5 Disks - Imitation-SMDP-Q-Learning: Run 1/5 Done - Time: 23.63
5 Disks - Imitation-SMDP-Q-Learning: Run 2/5 Done - Time: 21.87
5 Disks - Imitation-SMDP-Q-Learning: Run 3/5 Done - Time: 23.22
5 Disks - Imitation-SMDP-Q-Learning: Run 4/5 Done - Time: 21.47
5 Disks - Imitation-SMDP-Q-Learning: Run 5/5 Done - Time: 23.14
Outfiled the results to results/5_disks_smdp_imi.txt.


In [94]:
# Imitation SMDP-Q-learning on the 6-disk puzzle; `num_times` is reused from
# the 4-disk cell above.
num_disks = 6
num_episodes, max_steps = (
    learning_setup[num_disks][key] for key in ("num_episodes", "max_steps")
)
env, agent, stats_6_smdp_imi = run_learning(
    "Imitation-SMDP-Q-Learning", num_times, num_disks,
    num_episodes, max_steps, log_episodes, log_freq,
    save_fname="results/6_disks_smdp_imi.txt")

KeyboardInterrupt: 

## Transfer Learning Analysis

In [22]:
# Logging controls and problem size for the transfer-learning experiment.
log_episodes, log_freq = 10, 20

# 5-disk puzzle with the matching training budget from `learning_setup`.
N = 5
num_episodes, max_steps = (
    learning_setup[N][key] for key in ("num_episodes", "max_steps")
)

# Noise-free Towers of Hanoi environment.
env = gym.make("Hanoi-v0")
env.set_env_parameters(num_disks=N, env_noise=0, verbose=False)

In [23]:
# One verbose SMDP-Q-learning run on the N-disk environment, with the agent
# built from the macros requested for the (N - 1)-disk puzzle.
params = learning_parameters(l_type="Transfer-SMDP-Q-Learning")
macros = get_optimal_macros(env, N - 1, "Sequitur")
agent = SMDP_Agent_Q(env, macros)
hist, er_buffer = smdp_q_learning(
    env, agent, num_episodes, max_steps,
    log_freq=log_freq, log_episodes=log_episodes, verbose=True,
    **params)

Ep:  1 | Avg/Std Steps: 1017.00/0.00 | Avg/Std Ret: 0.00/0.00 | Success R: 0.10
Ep: 21 | Avg/Std Steps: 309.50/18.50 | Avg/Std Ret: 0.00/18.50 | Success R: 0.20
Ep: 41 | Avg/Std Steps: 683.67/370.92 | Avg/Std Ret: 0.01/370.92 | Success R: 0.30
Ep: 61 | Avg/Std Steps: 1000.00/0.00 | Avg/Std Ret: 0.00/0.00 | Success R: 0.00
Ep: 81 | Avg/Std Steps: 344.75/138.37 | Avg/Std Ret: 0.00/138.37 | Success R: 0.40
Ep: 101 | Avg/Std Steps: 127.00/58.70 | Avg/Std Ret: 1.31/58.70 | Success R: 0.70
Ep: 121 | Avg/Std Steps: 60.50/19.41 | Avg/Std Ret: 6.48/19.41 | Success R: 1.00
Ep: 141 | Avg/Std Steps: 49.00/0.00 | Avg/Std Ret: 8.53/0.00 | Success R: 1.00
Ep: 161 | Avg/Std Steps: 47.00/0.00 | Avg/Std Ret: 9.45/0.00 | Success R: 1.00
Ep: 181 | Avg/Std Steps: 47.00/0.00 | Avg/Std Ret: 9.45/0.00 | Success R: 1.00
Ep: 201 | Avg/Std Steps: 47.00/0.00 | Avg/Std Ret: 9.45/0.00 | Success R: 1.00
Ep: 221 | Avg/Std Steps: 47.00/0.00 | Avg/Std Ret: 9.45/0.00 | Success R: 1.00
Ep: 241 | Avg/Std Steps: 47.00/0.00

In [24]:
# Transfer SMDP-Q-learning on the 5-disk puzzle with the 4-disk grammar
# (transfer_distance=1), repeated `num_times` times.
num_times, num_disks = 5, 5
num_episodes, max_steps = (
    learning_setup[num_disks][key] for key in ("num_episodes", "max_steps")
)
env, agent, stats_smdp_trans_5 = run_learning(
    "Transfer-SMDP-Q-Learning", num_times, num_disks,
    num_episodes, max_steps, log_episodes, log_freq,
    transfer_distance=1,
    save_fname="results/5_disks_smdp_transfer_4_disks.txt")

5 Disks - Transfer-SMDP-Q-Learning: Run 1/5 Done - Time: 24.96
5 Disks - Transfer-SMDP-Q-Learning: Run 2/5 Done - Time: 24.05
5 Disks - Transfer-SMDP-Q-Learning: Run 3/5 Done - Time: 25.51
5 Disks - Transfer-SMDP-Q-Learning: Run 4/5 Done - Time: 25.25
5 Disks - Transfer-SMDP-Q-Learning: Run 5/5 Done - Time: 27.24
Outfiled the results to results/5_disks_smdp_transfer_4_disks.txt.


In [None]:
# Transfer experiments on the 6-disk puzzle: the same learner is run twice,
# once per transfer distance, and each call writes its statistics to its own
# results file.
num_times = 5
num_disks = 6
num_episodes = learning_setup[num_disks]["num_episodes"]
max_steps = learning_setup[num_disks]["max_steps"]

# Run Learning 5 times for 6 Disk Environment with 5 disk grammar
# NOTE(review): `stats_smdp_trans_5` is also the name bound by the 5-disk
# transfer cell earlier in this notebook, so running this cell overwrites
# those in-memory statistics (the saved results files are distinct). Here the
# suffix refers to the grammar size, there to the environment size - consider
# a unique name once downstream usage has been checked.
env, agent, stats_smdp_trans_5 = run_learning("Transfer-SMDP-Q-Learning", num_times, num_disks,
                                              num_episodes, max_steps,
                                              log_episodes, log_freq,
                                              transfer_distance=1,
                                              save_fname="results/6_disks_smdp_transfer_5_disks.txt")

# Run Learning 5 times for 6 Disk Environment with 4 disk grammar
# (transfer_distance=2; per the cell's own comments, two disks fewer than the environment)
env, agent, stats_smdp_trans_4 = run_learning("Transfer-SMDP-Q-Learning", num_times, num_disks,
                                              num_episodes, max_steps,
                                              log_episodes, log_freq,
                                              transfer_distance=2,
                                              save_fname="results/6_disks_smdp_transfer_4_disks.txt")

## Online-Grammar-Macro-SMDP Learning

In [101]:
# Setup for N=4 Disk Towers of Hanoi Environment
log_episodes, log_freq = 10, 20

# Schedule arguments for the online learner.
# NOTE(review): by their names these are the initial Q-learning episodes, the
# episodes between grammar updates and the number of grammar updates - confirm
# against learning.smdp_q_learning.
init_q_eps, inter_update_eps, num_grammar_updates = 20, 100, 5

N = 4
max_steps = learning_setup[N]["max_steps"]

env = gym.make("Hanoi-v0")
env.set_env_parameters(N, env_noise=0, verbose=False)

# The online learner is run with the plain "Q-Learning" hyperparameters.
params = learning_parameters(l_type="Q-Learning")
hist = smdp_q_online_learning(
    env, init_q_eps, inter_update_eps, num_grammar_updates, max_steps,
    log_freq=log_freq, log_episodes=log_episodes, verbose=True,
    **params)

Ep:  1 | Avg/Std Steps: 177.33/144.50 | Avg/Std Ret: 1.88/144.50 | Success R: 0.30
Ep:  1 | Avg/Std Steps: 500.00/0.00 | Avg/Std Ret: 0.00/0.00 | Success R: 0.00
Ep: 21 | Avg/Std Steps: 500.00/0.00 | Avg/Std Ret: 0.00/0.00 | Success R: 0.00
Ep: 41 | Avg/Std Steps: 19.00/0.00 | Avg/Std Ret: 39.72/0.00 | Success R: 1.00
Ep: 61 | Avg/Std Steps: 18.00/0.00 | Avg/Std Ret: 41.81/0.00 | Success R: 1.00
Ep: 81 | Avg/Std Steps: 18.00/0.00 | Avg/Std Ret: 41.81/0.00 | Success R: 1.00
Ep:  1 | Avg/Std Steps: 231.14/110.41 | Avg/Std Ret: 0.16/110.41 | Success R: 0.70
Ep: 21 | Avg/Std Steps: 500.00/0.00 | Avg/Std Ret: 0.00/0.00 | Success R: 0.00
Ep: 41 | Avg/Std Steps: 20.00/0.00 | Avg/Std Ret: 37.74/0.00 | Success R: 1.00
Ep: 61 | Avg/Std Steps: 20.00/0.00 | Avg/Std Ret: 37.74/0.00 | Success R: 1.00
Ep: 81 | Avg/Std Steps: 19.00/0.00 | Avg/Std Ret: 39.72/0.00 | Success R: 1.00
Ep:  1 | Avg/Std Steps: 241.00/151.92 | Avg/Std Ret: 1.37/151.92 | Success R: 0.70
Ep: 21 | Avg/Std Steps: 500.00/0.00 | Av

In [102]:
# Run online SMDP-Q-learning 5 times for the 4 Disk Environment
# (the previous comment said 5 disks, which did not match `num_disks`).
#
# The run configuration is defined here rather than inherited from whichever
# cell ran last: previously `num_times`, `num_episodes` and `max_steps` were
# leftovers from earlier cells, so on a top-to-bottom run `num_episodes` held
# the 6-disk value while `num_disks` was 4.
num_times = 5
num_disks = 4
num_episodes = learning_setup[num_disks]["num_episodes"]
max_steps = learning_setup[num_disks]["max_steps"]

# NOTE(review): the other `run_learning` cells unpack three return values
# (env, agent, stats); here the whole return value is bound to one name -
# confirm what the online variant returns.
stats_smdp_online = run_learning("Online-SMDP-Q-Learning", num_times, num_disks,
                                 num_episodes, max_steps,
                                 log_episodes, log_freq,
                                 save_fname="results/4_disks_smdp_online_no_transfer.txt")

4 Disks - Online-SMDP-Q-Learning: Run 1/5 Done - Time: 72.96
4 Disks - Online-SMDP-Q-Learning: Run 2/5 Done - Time: 58.4
4 Disks - Online-SMDP-Q-Learning: Run 3/5 Done - Time: 22.89
4 Disks - Online-SMDP-Q-Learning: Run 4/5 Done - Time: 60.61
4 Disks - Online-SMDP-Q-Learning: Run 5/5 Done - Time: 66.61
Outfiled the results to results/4_disks_smdp_online_no_transfer.txt.
