In [2]:
import herringbone as hb

All initialization tests passed.
imported herringbone without any errors :)


In [3]:

# Build the MDP that the rest of the notebook works with.

# Available maps; the selected name is spliced into the CSV path below.
map_names = [
    "slides",
    "example",
    "easy",
    "danger_holes",
    "double_fish",
    "wall_of_death",
    "example2",
    "mega",
]
selected_map_id = 0  # index into map_names; 0 selects "slides"

# Passed to hb.MDP as `gamma` (presumably the discount factor; 1 would mean
# future rewards are not discounted -- confirm against hb.MDP).
GAMMA = 1

# Relative paths to the action/state configuration and the chosen map file.
action_path = "herringbone/env_core/config/action_config.json"
state_path = "herringbone/env_core/config/state_config.json"
map_path = f"herringbone/env_core/maps/{map_names[selected_map_id]}.csv"

# Fixed seed so repeated runs of the notebook construct the MDP the same way.
demo_mdp = hb.MDP(
    state_path,
    map_path,
    action_path,
    seed=42,
    gamma=GAMMA,
)


In [4]:

# Roll out a single episode on demo_mdp and print what happened.
# NOTE(review): hb.Policy is built from the MDP alone; the variable name
# suggests this yields a uniform-random policy -- confirm in hb.Policy.
random_policy = hb.Policy(mdp=demo_mdp)
# max_depth=1000 is presumably an upper bound on episode length -- TODO confirm.
episode = hb.Episode(mdp=demo_mdp, policy=random_policy, max_depth=1000)
# Appears to print the board before any step is taken (the grid in the output).
episode.peek()
# Run the episode; with "sar" each step is logged as state, reward and action.
episode.run("sar")
# Print the recorded trajectory (its repr lists states, actions and rewards).
print(episode.trajectory)

╔═══════╦═══════╦═══════╦═══════╗
║ [32m  -1 [0m ║ [34m  -1 [0m ║ [34m  -1 [0m ║ [34m  -1 [0m ║
╠═══════╬═══════╬═══════╬═══════╣
║ [34m  -1 [0m ║ [34m  -1 [0m ║ [34m  -1 [0m ║ [34m  -1 [0m ║
╠═══════╬═══════╬═══════╬═══════╣
║ [34m  -1 [0m ║ [34m  -1 [0m ║ [34m  -1 [0m ║ [34m  -1 [0m ║
╠═══════╬═══════╬═══════╬═══════╣
║ [34m  -1 [0m ║ [34m  -1 [0m ║ [34m  -1 [0m ║ [32m  -1 [0m ║
╚═══════╩═══════╩═══════╩═══════╝[0]
t: 0 | S: [2, 3], R: -1, A: ↑
t: 1 | S: [1, 3], R: -1, A: ←
t: 2 | S: [1, 2], R: -1, A: ↑
t: 3 | S: [0, 2], R: -1, A: ↑
t: 4 | S: [0, 2], R: -1, A: ↑
t: 5 | S: [0, 2], R: -1, A: ←
t: 6 | S: [0, 1], R: -1, A: ←
t: 7 | S: [0, 0], R: -1, A: None
Trajectory(states=[[2, 3], [1, 3], [1, 2], [0, 2], [0, 2], [0, 2], [0, 1], [0, 0]], actions=[↑, ←, ↑, ↑, ↑, ←, ←], rewards=[nan, -1, -1, -1, -1, -1, -1, -1])


In [10]:
# Monte Carlo prediction: estimate state values of random_policy on demo_mdp.
# N is forwarded as n_samples (presumably the number of sampled episodes --
# confirm against MonteCarloPredictor.evaluate_policy).
N = 1000
mc_predictor = hb.MonteCarloPredictor(demo_mdp)
mc_predictor.evaluate_policy(random_policy, n_samples=N)

In [11]:
# Show the value estimates held by mc_predictor as a grid laid out like demo_mdp's map.
hb.Render.preview_V(mdp=demo_mdp, learned_V=mc_predictor.value_functions)

╔════════╦════════╦════════╦════════╗
║  0.00  ║ -14.12 ║ -19.58 ║ -22.17 ║
╠════════╬════════╬════════╬════════╣
║ -12.35 ║ -16.91 ║ -19.64 ║ -19.92 ║
╠════════╬════════╬════════╬════════╣
║ -19.65 ║ -20.10 ║ -18.41 ║ -13.74 ║
╠════════╬════════╬════════╬════════╣
║ -21.25 ║ -19.96 ║ -14.22 ║  0.00  ║
╚════════╩════════╩════════╩════════╝


In [27]:

# Monte Carlo control: train a policy on demo_mdp for N episodes.
# Note that N is rebound here; it is used as n_episodes in this cell.
N = 1000
# NOTE(review): epsilon=0.25 looks like an epsilon-greedy exploration rate
# (the printed policy puts 0.25/4 = 0.0625 on each non-preferred action) --
# confirm against hb.MonteCarloController.
mc_control = hb.MonteCarloController(demo_mdp, epsilon=0.25)
mc_control.train(n_episodes=N)
# Keep a reference to the controller's policy for use in later cells.
trained_policy = mc_control.policy

In [28]:
# Inspect the trained policy twice: first the raw mapping returned by
# get_policy() (per-state action probabilities), then the policy's own
# string form, which prints as a grid of arrows.
print(trained_policy.get_policy())
print(trained_policy)

{[0, 0]: {↑: 0.25, ↓: 0.25, ←: 0.25, →: 0.25}, [0, 1]: {↑: 0.0625, ↓: 0.0625, ←: 0.8125, →: 0.0625}, [0, 2]: {↑: 0.0625, ↓: 0.0625, ←: 0.8125, →: 0.0625}, [0, 3]: {↑: 0.0625, ↓: 0.0625, ←: 0.8125, →: 0.0625}, [1, 0]: {↑: 0.8125, ↓: 0.0625, ←: 0.0625, →: 0.0625}, [1, 1]: {↑: 0.8125, ↓: 0.0625, ←: 0.0625, →: 0.0625}, [1, 2]: {↑: 0.8125, ↓: 0.0625, ←: 0.0625, →: 0.0625}, [1, 3]: {↑: 0.0625, ↓: 0.8125, ←: 0.0625, →: 0.0625}, [2, 0]: {↑: 0.0625, ↓: 0.0625, ←: 0.0625, →: 0.8125}, [2, 1]: {↑: 0.8125, ↓: 0.0625, ←: 0.0625, →: 0.0625}, [2, 2]: {↑: 0.0625, ↓: 0.8125, ←: 0.0625, →: 0.0625}, [2, 3]: {↑: 0.0625, ↓: 0.8125, ←: 0.0625, →: 0.0625}, [3, 0]: {↑: 0.8125, ↓: 0.0625, ←: 0.0625, →: 0.0625}, [3, 1]: {↑: 0.0625, ↓: 0.0625, ←: 0.0625, →: 0.8125}, [3, 2]: {↑: 0.0625, ↓: 0.0625, ←: 0.0625, →: 0.8125}, [3, 3]: {↑: 0.25, ↓: 0.25, ←: 0.25, →: 0.25}}
╔═════════╦═════════╦═════════╦═════════╗
║ ↑/↓/←/→ ║    ←    ║    ←    ║    ←    ║
╠═════════╬═════════╬═════════╬═════════╣
║    ↑    ║    ↑    ║    

In [29]:
# Roll out one episode under the trained policy; this rebinds `episode`.
# max_depth=1000 is presumably an upper bound on episode length -- TODO confirm.
episode = hb.Episode(mdp=demo_mdp, policy=trained_policy, max_depth=1000)
# With "sar" each step is logged as state, reward and action.
episode.run("sar")

t: 0 | S: [0, 2], R: -1, A: ←
t: 1 | S: [0, 1], R: -1, A: ↓
t: 2 | S: [1, 1], R: -1, A: ↑
t: 3 | S: [0, 1], R: -1, A: ←
t: 4 | S: [0, 0], R: -1, A: None
