In [16]:
import herringbone as hb

In [None]:

# Create the MDP that every later cell in this notebook operates on.
# Maps are chosen by index into map_names; index 3 selects "danger_holes".
map_names = ["-", "example", "easy", "danger_holes", "double_fish", "wall_of_death", "example2", "mega"]
selected_map_id = 3

# NOTE(review): these are relative paths, so the notebook presumably has to be
# run with the repository root as working directory — confirm.
state_path = "herringbone/env_core/config/state_config.json"
map_path = f"herringbone/env_core/maps/{map_names[selected_map_id]}.csv"
action_path = "herringbone/env_core/config/action_config.json"

# seed=42 is presumably there to make runs reproducible — TODO confirm against hb.MDP.
demo_mdp = hb.MDP(state_path, map_path, action_path, seed=42)


In [18]:

# Demo rollout with an epsilon-greedy policy at epsilon=1
# (presumably fully exploratory, i.e. random, as the variable name suggests — confirm).
random_policy = hb.EpsilonGreedyPolicy(mdp=demo_mdp, epsilon=1)
# max_depth=1000 presumably caps the number of steps in the episode — TODO confirm.
episode = hb.Episode(mdp=demo_mdp, policy=random_policy, max_depth=1000)
# NOTE(review): peek() appears to render the board shown at the top of the cell output — confirm.
episode.peek()
# "sar" presumably selects the state / action / reward fields logged per step — confirm in hb.Episode.run.
episode.run("sar")
print(episode.trajectory)

╔════════╦════════╦════════╦════════╦════════╗
║ [34m      [0m ║ [34m      [0m ║ [31m hole [0m ║ [34m      [0m ║ [34m<✰))><[0m ║
╠════════╬════════╬════════╬════════╬════════╣
║ [34m      [0m ║ [34m      [0m ║ [31m hole [0m ║ [34m      [0m ║ [34m      [0m ║
╠════════╬════════╬════════╬════════╬════════╣
║ [34m      [0m ║ [34m      [0m ║ [31m hole [0m ║ [34m      [0m ║ [34m      [0m ║
╠════════╬════════╬════════╬════════╬════════╣
║ [34m      [0m ║ [34m      [0m ║ [31m hole [0m ║ [34m      [0m ║ [34m      [0m ║
╠════════╬════════╬════════╬════════╬════════╣
║ [34m      [0m ║ [34m      [0m ║ [34m      [0m ║ [34m      [0m ║ [34m      [0m ║
╚════════╩════════╩════════╩════════╩════════╝
t: 0 | S: [0, 0], R: nan, A: ↑
t: 1 | S: [0, 0], R: -1, A: ↓
t: 2 | S: [1, 0], R: -1, A: ↑
t: 3 | S: [0, 0], R: -1, A: ↑
t: 4 | S: [0, 0], R: -1, A: ↑
t: 5 | S: [0, 0], R: -1, A: ↓
t: 6 | S: [1, 0], R: -1, A: ↑
t: 7 | S: [0, 0], R: -1, A: →
t: 8 | S: [0, 1],

In [19]:
# Render helper: preview a learned state-value function as a text grid.
def preview_V(mdp, learned_V):
    """Print the values in ``learned_V`` laid out with the board's dimensions.

    Args:
        mdp: Object whose ``get_board().states`` is a 2D sequence (a sequence
            of rows). Only its dimensions are used: the number of rows and
            the length of the first row.
        learned_V: Mapping whose values are numbers. The values are taken in
            the mapping's iteration order and cut into consecutive chunks of
            one row width each.
            NOTE(review): this assumes the mapping iterates in row-major
            board order — confirm against the code that builds it.

    Returns:
        None. One Python list of ``'%.2f'``-formatted strings is printed per
        board row. If the mapping holds fewer values than the board has
        cells, the trailing rows are printed short or empty.
    """
    board_rows = mdp.get_board().states
    n_rows = len(board_rows)
    # An empty board has no first row to measure; use zero columns rather
    # than failing with IndexError on board_rows[0].
    n_cols = len(board_rows[0]) if n_rows else 0

    formatted = ['%.2f' % value for value in learned_V.values()]

    # Slice the flat value list into one chunk per board row and print it.
    for row_index in range(n_rows):
        print(formatted[row_index * n_cols:(row_index + 1) * n_cols])

In [20]:
# Baseline policy for the evaluation cells below; the printout lists all four
# actions in every cell of the grid.
policy = hb.EpsilonGreedyPolicy(mdp=demo_mdp, epsilon=1)  # defaults to uniform
print(policy)

╔═════════╦═════════╦═════════╦═════════╦═════════╗
║ ↑/↓/←/→ ║ ↑/↓/←/→ ║ ↑/↓/←/→ ║ ↑/↓/←/→ ║ ↑/↓/←/→ ║
╠═════════╬═════════╬═════════╬═════════╬═════════╣
║ ↑/↓/←/→ ║ ↑/↓/←/→ ║ ↑/↓/←/→ ║ ↑/↓/←/→ ║ ↑/↓/←/→ ║
╠═════════╬═════════╬═════════╬═════════╬═════════╣
║ ↑/↓/←/→ ║ ↑/↓/←/→ ║ ↑/↓/←/→ ║ ↑/↓/←/→ ║ ↑/↓/←/→ ║
╠═════════╬═════════╬═════════╬═════════╬═════════╣
║ ↑/↓/←/→ ║ ↑/↓/←/→ ║ ↑/↓/←/→ ║ ↑/↓/←/→ ║ ↑/↓/←/→ ║
╠═════════╬═════════╬═════════╬═════════╬═════════╣
║ ↑/↓/←/→ ║ ↑/↓/←/→ ║ ↑/↓/←/→ ║ ↑/↓/←/→ ║ ↑/↓/←/→ ║
╚═════════╩═════════╩═════════╩═════════╩═════════╝


In [21]:
# New episode driven by the baseline `policy`; this rebinds `episode` from the earlier demo cell.
episode = hb.Episode(mdp=demo_mdp, policy=policy, max_depth=1000)

In [22]:
# Roll out the episode; the per-step log (t, S, R, A) appears as the cell output.
episode.run("sar")

t: 0 | S: [0, 0], R: nan, A: ↓
t: 1 | S: [1, 0], R: -1, A: ←
t: 2 | S: [1, 0], R: -1, A: →
t: 3 | S: [1, 1], R: -1, A: →


In [23]:
# Discount passed as `discount=` to the Monte Carlo predictor and controller cells below.
DISCOUNT = 0.9

In [24]:
# Monte Carlo prediction for the baseline `policy`.
N = 1000  # passed as n_samples; presumably the number of sampled episodes — confirm
mc_predictor = hb.MonteCarloPredictor(demo_mdp, discount=DISCOUNT)
# Results are read from mc_predictor.value_functions in the next cell.
mc_predictor.evaluate_policy(policy, n_samples=N)

In [25]:
# Print the predictor's value estimates as a grid, one board row per line.
preview_V(demo_mdp, mc_predictor.value_functions)

['-5.62', '-4.97', '0.00', '-3.76', '0.00']
['-5.84', '-5.11', '0.00', '-2.97', '-3.20']
['-5.99', '-5.18', '0.00', '-3.23', '-3.18']
['-6.18', '-5.51', '0.00', '-5.83', '-6.25']
['-6.58', '-5.63', '-6.19', '-6.21', '-7.03']


In [29]:

# Monte Carlo control: train a policy on the same MDP with the same discount.
N = 1000  # passed as n_episodes to train()
# NOTE(review): epsilon=0.25 is presumably the exploration rate, and
# start_coords=(0,0) presumably the start cell of every training episode — confirm.
mc_control = hb.MonteCarloController(demo_mdp, discount=DISCOUNT, epsilon=0.25, start_coords=(0,0))
mc_control.train(n_episodes=N)
# Keep a handle on the controller's policy for the rollout and evaluation cells below.
trained_policy = mc_control.policy

In [30]:
# Raw view: the output is a mapping from each state to its per-action probabilities.
print(trained_policy.get_policy())
# Rendered view — presumably the same arrow-grid format as the baseline policy printout; confirm.
print(trained_policy)

{[0, 0]: {↑: 0.0625, ↓: 0.8125, ←: 0.0625, →: 0.0625}, [0, 1]: {↑: 0.0625, ↓: 0.0625, ←: 0.8125, →: 0.0625}, [0, 2]: {↑: 0.25, ↓: 0.25, ←: 0.25, →: 0.25}, [0, 3]: {↑: 0.0854196364216137, ↓: 0.07342895886356467, ←: 0.060680322166050904, →: 0.7804710825487706}, [0, 4]: {↑: 0.25, ↓: 0.25, ←: 0.25, →: 0.25}, [1, 0]: {↑: 0.0625, ↓: 0.8125, ←: 0.0625, →: 0.0625}, [1, 1]: {↑: 0.0625, ↓: 0.0625, ←: 0.8125, →: 0.0625}, [1, 2]: {↑: 0.25, ↓: 0.25, ←: 0.25, →: 0.25}, [1, 3]: {↑: 0.0625, ↓: 0.0625, ←: 0.0625, →: 0.8125}, [1, 4]: {↑: 0.8125, ↓: 0.0625, ←: 0.0625, →: 0.0625}, [2, 0]: {↑: 0.0625, ↓: 0.8125, ←: 0.0625, →: 0.0625}, [2, 1]: {↑: 0.0625, ↓: 0.0625, ←: 0.8125, →: 0.0625}, [2, 2]: {↑: 0.25, ↓: 0.25, ←: 0.25, →: 0.25}, [2, 3]: {↑: 0.0625, ↓: 0.0625, ←: 0.0625, →: 0.8125}, [2, 4]: {↑: 0.8125, ↓: 0.0625, ←: 0.0625, →: 0.0625}, [3, 0]: {↑: 0.0625, ↓: 0.8125, ←: 0.0625, →: 0.0625}, [3, 1]: {↑: 0.0625, ↓: 0.8125, ←: 0.0625, →: 0.0625}, [3, 2]: {↑: 0.25, ↓: 0.25, ←: 0.25, →: 0.25}, [3, 3]: {↑: 0.06

In [None]:
# Roll out one episode under the trained policy (rebinds `episode` again) and log each step.
episode = hb.Episode(mdp=demo_mdp, policy=trained_policy, max_depth=1000)
episode.run("sar")

t: 0 | S: [0, 0], R: nan, A: →
t: 1 | S: [0, 1], R: -1, A: →
t: 2 | S: [0, 2], R: -1, A: →
t: 3 | S: [0, 3], R: -1, A: ↓
t: 4 | S: [1, 3], R: -1, A: →
t: 5 | S: [1, 4], R: -1, A: ↓


In [None]:
# Repeat Monte Carlo prediction, this time for the trained policy.
N = 1000  # passed as n_samples; presumably the number of sampled episodes — confirm
# Rebinds `mc_predictor`: the predictor built for the baseline policy is no longer reachable under this name.
mc_predictor = hb.MonteCarloPredictor(demo_mdp, discount=DISCOUNT)
mc_predictor.evaluate_policy(trained_policy, n_samples=N)

In [None]:
# Print the value estimates for the trained policy as a grid, for comparison with the baseline grid.
preview_V(demo_mdp, mc_predictor.value_functions)

['-4.95', '-4.31', '-3.53', '-2.64', '-1.60']
['-4.46', '-3.69', '-2.65', '-1.63', '-2.11']
['-3.63', '-2.75', '-1.72', '-2.15', '0.00']
