In [9]:
import herringbone as hb

In [10]:

# Build the demo MDP from one grid map plus the state and action config files.
# Change `selected_map_id` to pick a different entry of `map_names`.
map_names = [
    "slides",
    "example",
    "easy",
    "danger_holes",
    "double_fish",
    "wall_of_death",
    "example2",
    "mega",
]
selected_map_id = 0

# NOTE(review): these are relative paths — presumably resolved against the
# kernel's working directory when hb.MDP reads them; confirm.
state_path = "herringbone/env_core/config/state_config.json"
action_path = "herringbone/env_core/config/action_config.json"
map_path = f"herringbone/env_core/maps/{map_names[selected_map_id]}.csv"

# Passed to hb.MDP as `gamma` (presumably the discount factor, so 1 would mean
# undiscounted returns — confirm against the hb.MDP documentation).
GAMMA = 1

demo_mdp = hb.MDP(
    state_path,
    map_path,
    action_path,
    seed=42,
    gamma=GAMMA,
)


In [11]:

# Roll out a single episode on demo_mdp under a default-constructed policy.
random_policy = hb.Policy(mdp=demo_mdp)
episode = hb.Episode(
    mdp=demo_mdp,
    policy=random_policy,
    max_depth=1000,
)

# NOTE(review): peek() looks like it prints the board before the rollout — confirm.
episode.peek()
# NOTE(review): "sar" presumably selects the S / R / A fields logged per step — confirm.
episode.run("sar")
print(episode.trajectory)

╔═══════╦═══════╦═══════╦═══════╗
║ [32m  -1 [0m ║ [34m  -1 [0m ║ [34m  -1 [0m ║ [34m  -1 [0m ║
╠═══════╬═══════╬═══════╬═══════╣
║ [34m  -1 [0m ║ [34m  -1 [0m ║ [34m  -1 [0m ║ [34m  -1 [0m ║
╠═══════╬═══════╬═══════╬═══════╣
║ [34m  -1 [0m ║ [34m  -1 [0m ║ [34m  -1 [0m ║ [34m  -1 [0m ║
╠═══════╬═══════╬═══════╬═══════╣
║ [34m  -1 [0m ║ [34m  -1 [0m ║ [34m  -1 [0m ║ [32m  -1 [0m ║
╚═══════╩═══════╩═══════╩═══════╝[0]
t: 0 | S: [2, 3], R: -1, A: ↑
t: 1 | S: [1, 3], R: -1, A: ←
t: 2 | S: [1, 2], R: -1, A: ↑
t: 3 | S: [0, 2], R: -1, A: ↑
t: 4 | S: [0, 2], R: -1, A: ↑
t: 5 | S: [0, 2], R: -1, A: ←
t: 6 | S: [0, 1], R: -1, A: ←
t: 7 | S: [0, 0], R: -1, A: None
Trajectory(states=[[2, 3], [1, 3], [1, 2], [0, 2], [0, 2], [0, 2], [0, 1], [0, 0]], actions=[↑, ←, ↑, ↑, ↑, ←, ←], rewards=[nan, -1, -1, -1, -1, -1, -1, -1])


In [12]:
# Monte Carlo evaluation of random_policy on demo_mdp, using N samples.
N = 1_000
mc_predictor = hb.MonteCarloPredictor(demo_mdp)
mc_predictor.evaluate_policy(
    random_policy,
    n_samples=N,
)

In [13]:
# Render the value estimates held by mc_predictor for demo_mdp.
hb.Render.preview_V(
    mdp=demo_mdp,
    learned_V=mc_predictor.value_functions,
)

╔════════╦════════╦════════╦════════╗
║  0.00  ║ -14.82 ║ -20.57 ║ -22.16 ║
╠════════╬════════╬════════╬════════╣
║ -14.09 ║ -18.29 ║ -19.50 ║ -19.95 ║
╠════════╬════════╬════════╬════════╣
║ -20.01 ║ -20.95 ║ -17.90 ║ -13.00 ║
╠════════╬════════╬════════╬════════╣
║ -22.03 ║ -19.57 ║ -13.13 ║  0.00  ║
╚════════╩════════╩════════╩════════╝


In [14]:

# Train a Monte Carlo controller on demo_mdp and keep the policy it holds afterwards.
# N is rebound here: it now counts training episodes, not evaluation samples.
N = 10_000
# NOTE(review): epsilon is presumably the exploration rate of an epsilon-greedy
# policy — confirm against the MonteCarloController documentation.
mc_control = hb.MonteCarloController(demo_mdp, epsilon=0.1)
mc_control.train(n_episodes=N)
trained_policy = mc_control.policy

In [15]:
# Print the raw result of get_policy() first, then the policy's own string form.
policy_table = trained_policy.get_policy()
print(policy_table)
print(trained_policy)

{[0, 0]: {↑: 0.25, ↓: 0.25, ←: 0.25, →: 0.25}, [0, 1]: {↑: 0.025, ↓: 0.025, ←: 0.925, →: 0.025}, [0, 2]: {↑: 0.025, ↓: 0.025, ←: 0.925, →: 0.025}, [0, 3]: {↑: 0.025, ↓: 0.025, ←: 0.925, →: 0.025}, [1, 0]: {↑: 0.925, ↓: 0.025, ←: 0.025, →: 0.025}, [1, 1]: {↑: 0.925, ↓: 0.025, ←: 0.025, →: 0.025}, [1, 2]: {↑: 0.925, ↓: 0.025, ←: 0.025, →: 0.025}, [1, 3]: {↑: 0.025, ↓: 0.925, ←: 0.025, →: 0.025}, [2, 0]: {↑: 0.925, ↓: 0.025, ←: 0.025, →: 0.025}, [2, 1]: {↑: 0.925, ↓: 0.025, ←: 0.025, →: 0.025}, [2, 2]: {↑: 0.025, ↓: 0.025, ←: 0.025, →: 0.925}, [2, 3]: {↑: 0.025, ↓: 0.925, ←: 0.025, →: 0.025}, [3, 0]: {↑: 0.025, ↓: 0.025, ←: 0.025, →: 0.925}, [3, 1]: {↑: 0.025, ↓: 0.025, ←: 0.025, →: 0.925}, [3, 2]: {↑: 0.025, ↓: 0.025, ←: 0.025, →: 0.925}, [3, 3]: {↑: 0.25, ↓: 0.25, ←: 0.25, →: 0.25}}
╔═════════╦═════════╦═════════╦═════════╗
║ ↑/↓/←/→ ║    ←    ║    ←    ║    ←    ║
╠═════════╬═════════╬═════════╬═════════╣
║    ↑    ║    ↑    ║    ↑    ║    ↓    ║
╠═════════╬═════════╬═════════╬════════

In [16]:
# Fresh rollout on demo_mdp, this time under trained_policy.
# This rebinds `episode`, replacing the earlier random-policy rollout.
episode = hb.Episode(
    mdp=demo_mdp,
    policy=trained_policy,
    max_depth=1000,
)
episode.run("sar")

t: 0 | S: [1, 2], R: -1, A: ↑
t: 1 | S: [0, 2], R: -1, A: ←
t: 2 | S: [0, 1], R: -1, A: ←
t: 3 | S: [0, 0], R: -1, A: None
