In [1]:
import herringbone as hb

All initialization tests passed.
imported herringbone without any errors :)


### Creating an MDP

In [2]:
# Names of the map CSV files that can be selected; each one is resolved to
# herringbone/env_core/maps/<name>.csv below.
map_names = [
    "slides",
    "example",
    "easy",
    "danger_holes",
    "double_fish",
    "wall_of_death",
    "example2",
    "mega",
]
selected_map_id = 1  # index into map_names; 1 selects "example"

# Config files for states and actions, plus the CSV of the chosen map.
# The paths are relative, so the notebook has to be run from the directory
# that contains the `herringbone` folder.
state_path = "herringbone/env_core/config/state_config.json"
action_path = "herringbone/env_core/config/action_config.json"
map_path = f"herringbone/env_core/maps/{map_names[selected_map_id]}.csv"

# Forwarded to hb.MDP as `gamma`; the value tables shown further down are
# consistent with it acting as the discount factor.
gamma = 0.9

# seed=42 is a fixed value (presumably so repeated runs are reproducible).
mdp = hb.MDP(
    state_path,
    map_path,
    action_path,
    seed=42,
    gamma=gamma,
)

### Previewing the board

In [3]:
# Render modes listed for reference; index 2 ("ascii") is the one used here.
render_modes = ["sar", "rewards", "ascii"]
hb.Render.preview_frame(
    board=mdp.get_board(),
    agent_state=None,  # no agent state is supplied for this preview
    render_mode=render_modes[2],
)

╔═════════╦═════════╦═════════╦═════════╦═════════╗
║         ║         ║         ║         ║         ║
╠═════════╬═════════╬═════════╬═════════╬═════════╣
║         ║         ║         ║         ║         ║
╠═════════╬═════════╬═════════╬═════════╬═════════╣
║         ║         ║         ║         ║ <x)))>< ║
╚═════════╩═════════╩═════════╩═════════╩═════════╝[0]


### Previewing a policy

In [4]:
# Create a uniform random policy: hb.Policy is given only the MDP, and the
# printed grid below lists all four actions (↑/↓/←/→) in every cell.
policy = hb.Policy(mdp=mdp)

# print() is used (rather than a bare expression) so the policy's string
# form, the arrow grid, is what gets shown.
print(policy)

╔═════════╦═════════╦═════════╦═════════╦═════════╗
║ ↑/↓/←/→ ║ ↑/↓/←/→ ║ ↑/↓/←/→ ║ ↑/↓/←/→ ║ ↑/↓/←/→ ║
╠═════════╬═════════╬═════════╬═════════╬═════════╣
║ ↑/↓/←/→ ║ ↑/↓/←/→ ║ ↑/↓/←/→ ║ ↑/↓/←/→ ║ ↑/↓/←/→ ║
╠═════════╬═════════╬═════════╬═════════╬═════════╣
║ ↑/↓/←/→ ║ ↑/↓/←/→ ║ ↑/↓/←/→ ║ ↑/↓/←/→ ║ ↑/↓/←/→ ║
╚═════════╩═════════╩═════════╩═════════╩═════════╝


### Policy Iteration



In [13]:
# Threshold handed to the solvers as `theta_threshold` (presumably the
# convergence tolerance of the value updates). The Value Iteration section
# reuses this same variable.
theta = 1e-5

policy_iteration = hb.PolicyIteration(
    mdp=mdp,
    theta_threshold=theta,
)

In [14]:
# Run policy iteration. run() returns a pair, unpacked here as the resulting
# policy and the state values; both are displayed in the next two cells.
pi_optimal_policy, pi_state_values = policy_iteration.run()

In [15]:
# Show the policy returned by policy iteration; it prints as a grid with one
# arrow per cell.
print(pi_optimal_policy)

╔═══╦═══╦═══╦═══╦═══╗
║ ↓ ║ ↓ ║ ↓ ║ ↓ ║ ↓ ║
╠═══╬═══╬═══╬═══╬═══╣
║ ↓ ║ ↓ ║ ↓ ║ ↓ ║ ↓ ║
╠═══╬═══╬═══╬═══╬═══╣
║ → ║ → ║ → ║ → ║ ↓ ║
╚═══╩═══╩═══╩═══╩═══╝


In [16]:
# Render the state values returned by policy iteration as a grid over the board.
hb.Render.preview_V(mdp=mdp, learned_V=pi_state_values)

╔════════╦════════╦════════╦════════╦════════╗
║ 48.46  ║ 54.95  ║ 62.17  ║ 70.19  ║ 79.10  ║
╠════════╬════════╬════════╬════════╬════════╣
║ 54.95  ║ 62.17  ║ 70.19  ║ 79.10  ║ 89.00  ║
╠════════╬════════╬════════╬════════╬════════╣
║ 62.17  ║ 70.19  ║ 79.10  ║ 89.00  ║ 100.00 ║
╚════════╩════════╩════════╩════════╩════════╝


### Value Iteration

In [17]:
# Build the value-iteration solver on the same MDP. `theta` is the threshold
# defined in the Policy Iteration section, so that cell must have run first.
value_iteration = hb.ValueIteration(mdp=mdp, theta_threshold=theta)

In [18]:
# Run value iteration. run() returns a pair, unpacked here as the resulting
# policy and the state values.
vi_optimal_policy, vi_state_values = value_iteration.run()

In [19]:
# Show the policy returned by value iteration; in the saved output it matches
# the grid printed for policy iteration.
print(vi_optimal_policy)

╔═══╦═══╦═══╦═══╦═══╗
║ ↓ ║ ↓ ║ ↓ ║ ↓ ║ ↓ ║
╠═══╬═══╬═══╬═══╬═══╣
║ ↓ ║ ↓ ║ ↓ ║ ↓ ║ ↓ ║
╠═══╬═══╬═══╬═══╬═══╣
║ → ║ → ║ → ║ → ║ ↓ ║
╚═══╩═══╩═══╩═══╩═══╝


In [20]:
# Render the state values returned by value iteration; in the saved output
# they match the policy-iteration values.
hb.Render.preview_V(mdp=mdp, learned_V=vi_state_values)

╔════════╦════════╦════════╦════════╦════════╗
║ 48.46  ║ 54.95  ║ 62.17  ║ 70.19  ║ 79.10  ║
╠════════╬════════╬════════╬════════╬════════╣
║ 54.95  ║ 62.17  ║ 70.19  ║ 79.10  ║ 89.00  ║
╠════════╬════════╬════════╬════════╬════════╣
║ 62.17  ║ 70.19  ║ 79.10  ║ 89.00  ║ 100.00 ║
╚════════╩════════╩════════╩════════╩════════╝
