In [18]:
import algorithmes.policy_iteration as pi
import algorithmes.sarsa as sa
import algorithmes.dyna_q as dq
import environnements.lineworld as lw
import environnements.gridworld as gw
from utils import load_config


In [19]:
# Path of the YAML configuration file passed to load_config for each environment.
CONFIG_FILE = "config.yaml"
# Misspelled legacy name, kept as an alias because later cells still use it.
congig_file = CONFIG_FILE

# LineWorld Environment

## Create LineWorld environment

In [20]:
# Environment name handed to the solvers that take a `game` argument.
game = "lineworld"
# Settings loaded for the "LineWorld" entry of the configuration file.
config_lineworld = load_config(congig_file, "LineWorld")

In [21]:
# Pull the four LineWorld settings out of the config in one unpacking:
# S = states, A = actions, R = rewards, T = terminals.
S, A, R, T = (
    config_lineworld[key]
    for key in ("states", "actions", "rewards", "terminals")
)

In [22]:
# Build the LineWorld model from S, A and R; the solvers and play_game below receive it.
lineworld_mdp = lw.create_lineworld(S, A, R)

## Dynamic Programming

### Policy Iteration

In [23]:
# Run policy iteration on LineWorld (gamma=0.999), then show the returned
# policy and the per-state values V.
policy, V = pi.policy_iteration(
    game, lineworld_mdp, S, A, R, T, gamma=0.999
)
print("Optimal Policy:", policy, "Value Function:", V, sep="\n")

Iteration: 1
__X__
_X___
X____
Steps: [2, np.int64(1), np.int64(0)]
Total Reward: -1
Iteration: 2
__X__
___X_
____X
Steps: [2, np.int64(3), np.int64(4)]
Total Reward: 1
Optimal Policy:
[[1. 0.]
 [0. 1.]
 [0. 1.]
 [0. 1.]
 [1. 0.]]
Value Function:
[0.       0.998001 0.999    1.       0.      ]


In [24]:
# Roll out the policy-iteration policy and report the visited states and reward.
steps, total_reward = lw.play_game(policy, lineworld_mdp, R, T)
print(
    f"Final Steps: {steps}",
    f"Final Total Reward: {total_reward}",
    sep="\n",
)

__X__
___X_
____X
Final Steps: [2, np.int64(3), np.int64(4)]
Final Total Reward: 1


## Temporal Difference Learning

### Sarsa


In [25]:
# Show the LineWorld states, actions, rewards and terminals, one per line.
print(S, A, R, T, sep="\n")

[0, 1, 2, 3, 4]
[0, 1]
[-1, 0, 1]
[0, 4]


In [26]:
# Train SARSA on LineWorld for 10 episodes starting from state 1, then show
# the returned policy and the Q table (printed under the "Value Function:" label).
policy, Q = sa.sarsa(
    game,
    S,
    A,
    R,
    lineworld_mdp,
    T,
    num_episodes=10,
    gamma=0.999,
    alpha=0.1,
    epsilon=0.1,
    start_state=1,
)
print("Optimal Policy:", policy, "Value Function:", Q, sep="\n")

Optimal Policy:
[[1. 0.]
 [0. 1.]
 [0. 1.]
 [0. 1.]
 [1. 0.]]
Value Function:
[[ 0.          0.        ]
 [-0.19        0.03801564]
 [-0.00999     0.18670837]
 [ 0.          0.56953279]
 [ 0.          0.        ]]


In [27]:
# Roll out the policy returned by SARSA and report the visited states and reward.
steps, total_reward = lw.play_game(policy, lineworld_mdp, R, T)
print(
    f"Final Steps: {steps}",
    f"Final Total Reward: {total_reward}",
    sep="\n",
)

__X__
___X_
____X
Final Steps: [2, np.int64(3), np.int64(4)]
Final Total Reward: 1


## Planning

### Dyna-Q

In [28]:
# Run Dyna-Q on LineWorld (100 episodes, 100 planning steps, start state 1),
# then show the returned policy and the Q table.
policy, Q = dq.dyna_q(
    S,
    A,
    R,
    lineworld_mdp,
    T,
    n_episodes=100,
    n_planning_steps=100,
    alpha=0.1,
    gamma=0.999,
    epsilon=0.1,
    start_state=1,
)
print("Optimal Policy:", policy, "Value Function:", Q, sep="\n")

Optimal Policy:
[[1. 0.]
 [0. 1.]
 [0. 1.]
 [0. 1.]
 [1. 0.]]
Value Function:
[[ 0.        0.      ]
 [-1.        0.998001]
 [ 0.997003  0.999   ]
 [ 0.998001  1.      ]
 [ 0.        0.      ]]


In [29]:
# Roll out the policy returned by Dyna-Q and report the visited states and reward.
steps, total_reward = lw.play_game(policy, lineworld_mdp, R, T)
print(
    f"Final Steps: {steps}",
    f"Final Total Reward: {total_reward}",
    sep="\n",
)

__X__
___X_
____X
Final Steps: [2, np.int64(3), np.int64(4)]
Final Total Reward: 1


# GridWorld Environment


## Create GridWorld environment

In [30]:
# Switch the environment name to GridWorld; this rebinds the `game` used earlier.
game = "gridworld"
# Settings loaded for the "GridWorld" entry of the configuration file.
config_gridworld = load_config(congig_file, "GridWorld")

In [31]:
# Rebind S, A, R, T to the GridWorld settings (they previously held LineWorld's):
# S = states, A = actions, R = rewards, T = terminals.
S, A, R, T = (
    config_gridworld[key]
    for key in ("states", "actions", "rewards", "terminals")
)

In [32]:
# Build the GridWorld model from S, A and R; the solvers and play_game below receive it.
gridworld_mdp = gw.create_gridworld(S, A, R)

## Dynamic Programming

### Policy Iteration

In [33]:
# Run policy iteration on GridWorld (gamma=0.999), then show the returned
# policy and the values V.
policy, V = pi.policy_iteration(
    game, gridworld_mdp, S, A, R, T, gamma=0.999
)
print("Optimal Policy:", policy, "Value Function:", V, sep="\n")

Iteration: 1
_ _ _ _ _
_ _ _ _ _
_ _ X _ _
_ _ _ _ _
_ _ _ _ _
******************************
_ _ _ _ _
_ _ X _ _
_ _ _ _ _
_ _ _ _ _
_ _ _ _ _
******************************
******************************
_ _ X _ _
_ _ _ _ _
_ _ _ _ _
_ _ _ _ _
_ _ _ _ _
******************************
Steps: [12, np.int64(7), np.int64(2)]
Total Reward: -1
Iteration: 2
_ _ _ _ _
_ _ _ _ _
_ _ X _ _
_ _ _ _ _
_ _ _ _ _
******************************
_ _ _ _ _
_ _ _ _ _
_ _ _ _ _
_ _ X _ _
_ _ _ _ _
******************************
******************************
_ _ _ _ _
_ _ _ _ _
_ _ _ _ _
_ _ _ X _
_ _ _ _ _
******************************
Steps: [12, np.int64(17), np.int64(18)]
Total Reward: 1
Iteration: 3
_ _ _ _ _
_ _ _ _ _
_ _ X _ _
_ _ _ _ _
_ _ _ _ _
******************************
_ _ _ _ _
_ _ _ _ _
_ _ _ _ _
_ _ X _ _
_ _ _ _ _
******************************
******************************
_ _ _ _ _
_ _ _ _ _
_ _ _ _ _
_ _ _ X _
_ _ _ _ _
******************************
Steps: [12, np.int64(17), np

In [34]:
# Roll out the policy-iteration policy on GridWorld from state 6 and report
# the visited states and reward.
steps, total_reward = gw.play_game(policy, gridworld_mdp, R, T, 6)
print(
    f"Final Steps: {steps}",
    f"Final Total Reward: {total_reward}",
    sep="\n",
)

_ _ _ _ _
_ X _ _ _
_ _ _ _ _
_ _ _ _ _
_ _ _ _ _
******************************
_ _ _ _ _
_ _ _ _ _
_ X _ _ _
_ _ _ _ _
_ _ _ _ _
******************************
******************************
_ _ _ _ _
_ _ _ _ _
_ _ _ _ _
_ X _ _ _
_ _ _ _ _
******************************
******************************
_ _ _ _ _
_ _ _ _ _
_ _ _ _ _
_ _ X _ _
_ _ _ _ _
******************************
******************************
_ _ _ _ _
_ _ _ _ _
_ _ _ _ _
_ _ _ X _
_ _ _ _ _
******************************
Final Steps: [6, np.int64(11), np.int64(16), np.int64(17), np.int64(18)]
Final Total Reward: 1


## Temporal Difference Learning

### Sarsa

In [35]:
# Train SARSA on GridWorld for 1000 episodes starting from state 6, then show
# the returned policy and the Q table (printed under the "Value Function:" label).
policy, Q = sa.sarsa(
    game,
    S,
    A,
    R,
    gridworld_mdp,
    T,
    num_episodes=1000,
    gamma=0.999,
    alpha=0.1,
    epsilon=0.1,
    start_state=6,
)
print("Optimal Policy:", policy, "Value Function:", Q, sep="\n")

Optimal Policy:
[[1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [0. 1. 0. 0.]
 [0. 1. 0. 0.]
 [0. 1. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [0. 0. 1. 0.]
 [0. 1. 0. 0.]
 [0. 1. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [0. 0. 1. 0.]
 [0. 0. 1. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]]
Value Function:
[[ 0.          0.          0.          0.        ]
 [ 0.          0.          0.          0.        ]
 [ 0.          0.          0.          0.        ]
 [ 0.          0.          0.          0.        ]
 [ 0.          0.          0.          0.        ]
 [ 0.          0.          0.          0.        ]
 [-0.92023356  0.70268378  0.51792654 -0.96566316]
 [-0.19        0.76136231  0.          0.08270315]
 [-0.1         0.          0.          0.        ]
 [ 0.          0.          0.          0.        ]
 [ 0.          0.          0.          0.        ]
 [ 0.59389261  0.61784818  0.94

In [36]:
# Roll out the policy returned by SARSA on GridWorld from state 6 and report
# the visited states and reward.
steps, total_reward = gw.play_game(policy, gridworld_mdp, R, T, 6)
print(
    f"Final Steps: {steps}",
    f"Final Total Reward: {total_reward}",
    sep="\n",
)

_ _ _ _ _
_ X _ _ _
_ _ _ _ _
_ _ _ _ _
_ _ _ _ _
******************************
_ _ _ _ _
_ _ _ _ _
_ X _ _ _
_ _ _ _ _
_ _ _ _ _
******************************
******************************
_ _ _ _ _
_ _ _ _ _
_ _ X _ _
_ _ _ _ _
_ _ _ _ _
******************************
******************************
_ _ _ _ _
_ _ _ _ _
_ _ _ _ _
_ _ X _ _
_ _ _ _ _
******************************
******************************
_ _ _ _ _
_ _ _ _ _
_ _ _ _ _
_ _ _ X _
_ _ _ _ _
******************************
Final Steps: [6, np.int64(11), np.int64(12), np.int64(17), np.int64(18)]
Final Total Reward: 1


## Planning

### Dyna-Q

In [37]:
# Run Dyna-Q on GridWorld (100 episodes, 100 planning steps, start state 6),
# then show the returned policy and the Q table.
policy, Q = dq.dyna_q(
    S,
    A,
    R,
    gridworld_mdp,
    T,
    n_episodes=100,
    n_planning_steps=100,
    alpha=0.1,
    gamma=0.999,
    epsilon=0.1,
    start_state=6,
)
print("Optimal Policy:", policy, "Value Function:", Q, sep="\n")

Optimal Policy:
[[1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [0. 1. 0. 0.]
 [0. 1. 0. 0.]
 [0. 1. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [0. 0. 1. 0.]
 [0. 0. 1. 0.]
 [0. 1. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]]
Value Function:
[[ 0.        0.        0.        0.      ]
 [ 0.        0.        0.        0.      ]
 [ 0.        0.        0.        0.      ]
 [ 0.        0.        0.        0.      ]
 [ 0.        0.        0.        0.      ]
 [ 0.        0.        0.        0.      ]
 [-1.        0.997003  0.997003 -1.      ]
 [-1.        0.998001  0.998001  0.996006]
 [-1.        0.999     0.        0.997003]
 [ 0.        0.        0.        0.      ]
 [ 0.        0.        0.        0.      ]
 [ 0.996006  0.996006  0.998001 -1.      ]
 [ 0.997003  0.997003  0.999     0.997003]
 [ 0.998001  1.       -1.        

In [38]:
# Roll out the policy returned by Dyna-Q on GridWorld from state 6 and report
# the visited states and reward.
steps, total_reward = gw.play_game(policy, gridworld_mdp, R, T, 6)
print(
    f"Final Steps: {steps}",
    f"Final Total Reward: {total_reward}",
    sep="\n",
)

_ _ _ _ _
_ X _ _ _
_ _ _ _ _
_ _ _ _ _
_ _ _ _ _
******************************
_ _ _ _ _
_ _ _ _ _
_ X _ _ _
_ _ _ _ _
_ _ _ _ _
******************************
******************************
_ _ _ _ _
_ _ _ _ _
_ _ X _ _
_ _ _ _ _
_ _ _ _ _
******************************
******************************
_ _ _ _ _
_ _ _ _ _
_ _ _ X _
_ _ _ _ _
_ _ _ _ _
******************************
******************************
_ _ _ _ _
_ _ _ _ _
_ _ _ _ _
_ _ _ X _
_ _ _ _ _
******************************
Final Steps: [6, np.int64(11), np.int64(12), np.int64(13), np.int64(18)]
Final Total Reward: 1
