# Action Grammars: A Grammar-Induction Based Method for Learning Temporal Abstractions
## Authors: Robert Lange and Aldo Faisal | April 2019

In [None]:
# !pip install -r requirements.txt --quiet

In [1]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [2]:
import os
import time
import numpy as np

import gym
import gym_hanoi

In [3]:
from agents.q_agent import Agent_Q
from agents.smdp_q_agent import SMDP_Agent_Q, Macro, SMDPQTable
# from agents.a2c_agent import ActorCritic, train_a2c_agent

from learning.q_learning import  q_learning
from learning.smdp_q_learning import smdp_q_learning, smdp_q_online_learning

from learning.learning_params import *
from learning.run_learning import *

from utils.general import *

In [4]:
# Create directory - Learning performance
results_dir = os.getcwd() + "/results/"
if not os.path.exists(results_dir):
    os.makedirs(results_dir)
    print("Created New Results Directory")

# Create directory - Log directory
log_dir = os.getcwd() + "/logs/"
if not os.path.exists(log_dir):
    os.makedirs(log_dir)
    print("Created New Log Directory")
    
# Create directory - Figure directory
fig_dir = os.getcwd() + "/figures/"
if not os.path.exists(fig_dir):
    os.makedirs(fig_dir)
    print("Created New Fig Directory")

# Towers of Hanoi - Setup

In [5]:
print(learning_parameters("Q-Learning"))
print(learning_parameters("Imitation-SMDP-Q-Learning"))

{'alpha': 0.8, 'gamma': 0.95, 'lambd': 0.1, 'epsilon': 0.1}
{'alpha': 0.8, 'gamma': 0.95, 'lambd': 0.0, 'epsilon': 0.1}


In [6]:
learning_setup = {4: {"num_episodes": 1000,
                      "max_steps": 500},
                  5: {"num_episodes": 1250,
                      "max_steps": 1000},
                  6: {"num_episodes": 5000,
                      "max_steps": 2000},
                  7: {"num_episodes": 10000,
                      "max_steps": 4000},
                  8: {"num_episodes": 20000,
                      "max_steps": 8000}}

### Simple TD($\lambda$) Learning

In [7]:
# Setup for N=4 Disk Towers of Hanoi Environment
log_episodes = 1
log_freq = 20

N = 4
num_episodes = learning_setup[N]["num_episodes"]
max_steps = learning_setup[N]["max_steps"]

env = gym.make("Hanoi-v0")
env.set_env_parameters(N, env_noise=0, verbose=False)

  result = entry_point.load(False)


In [8]:
agent = Agent_Q(env)
params = learning_parameters(l_type="Q-Learning")
hist, er_buffer = q_learning(env, agent, num_episodes, max_steps,
                             **params, log_freq=log_freq,
                             log_episodes=log_episodes, verbose=True)

Ep:  1 | Avg/Std Steps: 500.00/0.00 | Avg/Std Ret: 0.00/0.00 | Success R: 0.00
Ep: 21 | Avg/Std Steps: 500.00/0.00 | Avg/Std Ret: 0.00/0.00 | Success R: 0.00
Ep: 41 | Avg/Std Steps: 500.00/0.00 | Avg/Std Ret: 0.00/0.00 | Success R: 0.00
Ep: 61 | Avg/Std Steps: 500.00/0.00 | Avg/Std Ret: 0.00/0.00 | Success R: 0.00
Ep: 81 | Avg/Std Steps: 500.00/0.00 | Avg/Std Ret: 0.00/0.00 | Success R: 0.00
Ep: 101 | Avg/Std Steps: 500.00/0.00 | Avg/Std Ret: 0.00/0.00 | Success R: 0.00
Ep: 121 | Avg/Std Steps: 500.00/0.00 | Avg/Std Ret: 0.00/0.00 | Success R: 0.00
Ep: 141 | Avg/Std Steps: 500.00/0.00 | Avg/Std Ret: 0.00/0.00 | Success R: 0.00
Ep: 161 | Avg/Std Steps: 500.00/0.00 | Avg/Std Ret: 0.00/0.00 | Success R: 0.00
Ep: 181 | Avg/Std Steps: 22.00/0.00 | Avg/Std Ret: 34.06/0.00 | Success R: 1.00
Ep: 201 | Avg/Std Steps: 500.00/0.00 | Avg/Std Ret: 0.00/0.00 | Success R: 0.00
Ep: 221 | Avg/Std Steps: 500.00/0.00 | Avg/Std Ret: 0.00/0.00 | Success R: 0.00
Ep: 241 | Avg/Std Steps: 21.00/0.00 | Avg/Std

In [9]:
# Run Learning 5 times for 4 Disk Environment
num_times = 5
num_disks = 4
num_episodes = learning_setup[num_disks]["num_episodes"]
max_steps = learning_setup[num_disks]["max_steps"]
stats_4_q = run_learning("Q-Learning", num_times, num_disks,
                         num_episodes, max_steps,
                         log_episodes, log_freq,
                         save_fname="results/4_disks_q.txt")

4 Disks - Q-Learning: Run 1/5 Done - Time: 1.14
4 Disks - Q-Learning: Run 2/5 Done - Time: 1.8
4 Disks - Q-Learning: Run 3/5 Done - Time: 3.47
4 Disks - Q-Learning: Run 4/5 Done - Time: 2.56
4 Disks - Q-Learning: Run 5/5 Done - Time: 1.15
Outfiled the results to results/4_disks_q.txt.


In [None]:
# Run Learning 5 times for 5 Disk Environment
num_times = 5
num_disks = 5
num_episodes = learning_setup[num_disks]["num_episodes"]
max_steps = learning_setup[num_disks]["max_steps"]
stats_5_q = run_learning("Q-Learning", num_times, num_disks,
                         num_episodes, max_steps,
                         log_episodes, log_freq,
                         save_fname="results/5_disks_q.txt")

In [None]:
# Run Learning 5 times for 6 Disk Environment
num_times = 5
num_disks = 6
num_episodes = learning_setup[num_disks]["num_episodes"]
max_steps = learning_setup[num_disks]["max_steps"]
stats_6_q = run_learning("Q-Learning", num_times, num_disks,
                         num_episodes, max_steps,
                         log_episodes, log_freq,
                         save_fname="results/6_disks_q.txt")

In [None]:
# Return a greedy rollout Experience Replay Episode
get_rollout_policy(env, agent, max_steps, grammar=False)

In [None]:
get_rollout_policy(env, agent, max_steps, grammar=True)

### Check Context-Free Grammar Inference

In [None]:
optimal_policies = {4: "abdaefabdcedabd",
                    5: "bafbcdbafecfbafbcdbcfecdbafbcdb",
                    6: "abdaefabdcedabdaefaedcefabdaefabdcedabdce"\
                        "faedcedabdaefabdcedabd",
                    7: "bafbcdbafecfbafbcdbcfecdbafbcdbafecfbafec"\
                        "dbcfecfbafbcdbafecfbafbcdbcfecdbafbcdbcfe"\
                        "cfbafecdbcfecdbafbcdbafecfbafbcdbcfecdbafbcdb"}

In [None]:
from grammars.cfg_grammar import *

In [None]:
get_macros("all", optimal_policies[4], 6, "sequitur", 2)

In [None]:
get_macros("all", optimal_policies[4], 6, "lexis", 2)

### Imitation SMDP-Q-Learning

In [37]:
log_episodes = 10
log_freq = 20

N = 4
num_episodes = learning_setup[N]["num_episodes"]
max_steps = learning_setup[N]["max_steps"]

env = gym.make("Hanoi-v0")
env.set_env_parameters(num_disks=N, env_noise=0, verbose=False)

macros = get_optimal_macros(env, N, "Sequitur")
agent = SMDP_Agent_Q(env, macros)

In [38]:
params = learning_parameters(l_type="Imitation-SMDP-Q-Learning")
hist, er_buffer = smdp_q_learning(env, agent, num_episodes, max_steps,
                                  **params,
                                  log_freq=log_freq,
                                  log_episodes=log_episodes, verbose=True)

Ep:  1 | Avg/Std Steps: 219.56/119.91 | Avg/Std Ret: 0.83/119.91 | Success R: 0.90
Ep: 21 | Avg/Std Steps: 169.14/139.64 | Avg/Std Ret: 5.68/139.64 | Success R: 0.70
Ep: 41 | Avg/Std Steps: 15.00/0.00 | Avg/Std Ret: 48.77/0.00 | Success R: 1.00
Ep: 61 | Avg/Std Steps: 15.00/0.00 | Avg/Std Ret: 48.77/0.00 | Success R: 1.00
Ep: 81 | Avg/Std Steps: 15.00/0.00 | Avg/Std Ret: 48.77/0.00 | Success R: 1.00
Ep: 101 | Avg/Std Steps: 15.00/0.00 | Avg/Std Ret: 48.77/0.00 | Success R: 1.00
Ep: 121 | Avg/Std Steps: 15.00/0.00 | Avg/Std Ret: 48.77/0.00 | Success R: 1.00
Ep: 141 | Avg/Std Steps: 15.00/0.00 | Avg/Std Ret: 48.77/0.00 | Success R: 1.00
Ep: 161 | Avg/Std Steps: 15.00/0.00 | Avg/Std Ret: 48.77/0.00 | Success R: 1.00
Ep: 181 | Avg/Std Steps: 15.00/0.00 | Avg/Std Ret: 48.77/0.00 | Success R: 1.00
Ep: 201 | Avg/Std Steps: 15.00/0.00 | Avg/Std Ret: 48.77/0.00 | Success R: 1.00
Ep: 221 | Avg/Std Steps: 15.00/0.00 | Avg/Std Ret: 48.77/0.00 | Success R: 1.00
Ep: 241 | Avg/Std Steps: 15.00/0.00 |

In [39]:
# Run Learning 5 times for 4 Disk Environment
num_times = 5
num_disks = 4
num_episodes = learning_setup[num_disks]["num_episodes"]
max_steps = learning_setup[num_disks]["max_steps"]
stats_4_smdp_imi = run_learning("Imitation-SMDP-Q-Learning", num_times, num_disks,
                                num_episodes, max_steps,
                                log_episodes, log_freq,
                                save_fname="results/4_disks_smdp_imi.txt")

4 Disks - Imitation-SMDP-Q-Learning: Run 1/5 Done - Time: 1.2
4 Disks - Imitation-SMDP-Q-Learning: Run 2/5 Done - Time: 1.07
4 Disks - Imitation-SMDP-Q-Learning: Run 3/5 Done - Time: 1.08
4 Disks - Imitation-SMDP-Q-Learning: Run 4/5 Done - Time: 1.09
4 Disks - Imitation-SMDP-Q-Learning: Run 5/5 Done - Time: 1.37
Outfiled the results to results/4_disks_smdp_imi.txt.


In [40]:
# Run Learning 5 times for 5 Disk Environment
num_disks = 5
num_episodes = learning_setup[num_disks]["num_episodes"]
max_steps = learning_setup[num_disks]["max_steps"]
env, agent, stats_5_smdp_imi = run_learning("Imitation-SMDP-Q-Learning", num_times, num_disks,
                                            num_episodes, max_steps,
                                            log_episodes, log_freq,
                                            save_fname="results/5_disks_smdp_imi.txt")

5 Disks - Imitation-SMDP-Q-Learning: Run 1/5 Done - Time: 4.72
5 Disks - Imitation-SMDP-Q-Learning: Run 2/5 Done - Time: 3.3
5 Disks - Imitation-SMDP-Q-Learning: Run 3/5 Done - Time: 3.5
5 Disks - Imitation-SMDP-Q-Learning: Run 4/5 Done - Time: 3.47
5 Disks - Imitation-SMDP-Q-Learning: Run 5/5 Done - Time: 3.19
Outfiled the results to results/5_disks_smdp_imi.txt.


ValueError: too many values to unpack (expected 3)

In [None]:
# Run Learning 5 times for 6 Disk Environment
num_disks = 6
num_episodes = learning_setup[num_disks]["num_episodes"]
max_steps = learning_setup[num_disks]["max_steps"]
stats_6_smdp_imi = run_learning("Imitation-SMDP-Q-Learning", num_times, num_disks,
                                num_episodes, max_steps,
                                log_episodes, log_freq,
                                save_fname="results/6_disks_smdp_imi.txt")

## Transfer Learning Analysis

In [None]:
log_episodes = 10
log_freq = 20

N = 5
num_episodes = learning_setup[N]["num_episodes"]
max_steps = learning_setup[N]["max_steps"]

env = gym.make("Hanoi-v0")
env.set_env_parameters(num_disks=N, env_noise=0, verbose=False)

In [None]:
macros = get_optimal_macros(env, N-1, "Sequitur")
agent = SMDP_Agent_Q(env, macros)
params = learning_parameters(l_type="Transfer-SMDP-Q-Learning")
hist, er_buffer = smdp_q_learning(env, agent, num_episodes, max_steps,
                                  **params,
                                  log_freq=log_freq,
                                  log_episodes=log_episodes, verbose=True)

In [None]:
# Run Learning 5 times for 5 Disk Environment with 4 disk grammar
num_times = 5
num_disks = 5
num_episodes = learning_setup[num_disks]["num_episodes"]
max_steps = learning_setup[num_disks]["max_steps"]
env, agent, stats_smdp_trans_5 = run_learning("Transfer-SMDP-Q-Learning", num_times, num_disks,
                                              num_episodes, max_steps,
                                              log_episodes, log_freq,
                                              transfer_distance=1,
                                              save_fname="results/5_disks_smdp_transfer_4_disks.txt")

In [None]:
num_times = 5
num_disks = 6
num_episodes = learning_setup[num_disks]["num_episodes"]
max_steps = learning_setup[num_disks]["max_steps"]

# Run Learning 5 times for 6 Disk Environment with 5 disk grammar
env, agent, stats_smdp_trans_5 = run_learning("Transfer-SMDP-Q-Learning", num_times, num_disks,
                                              num_episodes, max_steps,
                                              log_episodes, log_freq,
                                              transfer_distance=1,
                                              save_fname="results/6_disks_smdp_transfer_5_disks.txt")

# Run Learning 5 times for 6 Disk Environment with 4 disk grammar
env, agent, stats_smdp_trans_4 = run_learning("Transfer-SMDP-Q-Learning", num_times, num_disks,
                                              num_episodes, max_steps,
                                              log_episodes, log_freq,
                                              transfer_distance=2,
                                              save_fname="results/6_disks_smdp_transfer_4_disks.txt")

## Online-Grammar-Macro-SMDP Learning

In [None]:
# Setup for N=4 Disk Towers of Hanoi Environment
log_episodes = 10
log_freq = 20

init_q_eps = 20
inter_update_eps=100
num_grammar_updates=5

N = 4
max_steps = learning_setup[N]["max_steps"]

env = gym.make("Hanoi-v0")
env.set_env_parameters(N, env_noise=0, verbose=False)

params = learning_parameters(l_type="Q-Learning")
hist = smdp_q_online_learning(env, init_q_eps, inter_update_eps,
                              num_grammar_updates, max_steps, **params,
                              log_freq=log_freq, log_episodes=log_episodes,
                              verbose=True)

In [None]:
# Run Learning 5 times for 5 Disk Environment
num_disks = 4
stats_smdp_online = run_learning("Online-SMDP-Q-Learning", num_times, num_disks,
                                 num_episodes, max_steps,
                                 log_episodes, log_freq,
                                 save_fname="results/4_disks_smdp_online_no_transfer.txt")