In [1]:
import algorithmes.policy_iteration as pi
import algorithmes.sarsa as sa
import environnements.lineworld as lw
import environnements.gridworld as gw
from utils import load_config

In [2]:
# Path of the configuration file that describes every environment used in this notebook.
CONFIG_FILE = "config.yaml"
# Backward-compatible alias: the original (misspelled) name is still read by the
# cells that call load_config(), so it must keep pointing at the same path.
congig_file = CONFIG_FILE

# LineWorld Environment

## Create LineWorld environment

In [3]:
# Load the "LineWorld" settings from the config file. The result is indexed by
# the keys "states", "actions", "rewards" and "terminals" further down.
# NOTE(review): how load_config resolves its second argument is defined in
# utils.load_config and is not visible here.
config_lineworld = load_config(congig_file, "LineWorld")
# Environment identifier passed as the first argument to policy_iteration / sarsa.
game = "lineworld"

In [4]:
# Pull the four MDP ingredients out of the LineWorld config in one assignment:
# S = states, A = actions, R = rewards, T = terminal states.
S, A, R, T = (
    config_lineworld["states"],
    config_lineworld["actions"],
    config_lineworld["rewards"],
    config_lineworld["terminals"],
)

In [5]:
# Build the LineWorld MDP from the state, action and reward lists. The returned
# object is what policy_iteration, sarsa and play_game receive below.
# NOTE(review): its exact structure is defined in environnements.lineworld and
# is not visible from this notebook.
lineworld_mdp = lw.create_lineworld(S, A, R)

## Dynamic Programming

### Policy Iteration

In [6]:
# Run policy iteration on LineWorld (discount 0.999) and show the resulting
# policy followed by the state-value vector, each under its own heading.
policy, V = pi.policy_iteration(
    game, lineworld_mdp, S, A, R, T, gamma=0.999
)
print("Optimal Policy:", policy, "Value Function:", V, sep="\n")

Iteration: 1
__X__
_X___
X____
Steps: [2, np.int64(1), np.int64(0)]
Total Reward: -1
Iteration: 2
__X__
___X_
____X
Steps: [2, np.int64(3), np.int64(4)]
Total Reward: 1
Optimal Policy:
[[1. 0.]
 [0. 1.]
 [0. 1.]
 [0. 1.]
 [1. 0.]]
Value Function:
[0.       0.998001 0.999    1.       0.      ]


In [7]:
# Play the game with the optimal policy
steps, total_reward = lw.play_game(policy, lineworld_mdp, R, T)
print(f"Final Steps: {steps}")
print(f"Final Total Reward: {total_reward}")

__X__
___X_
____X
Final Steps: [2, np.int64(3), np.int64(4)]
Final Total Reward: 1


## Temporal Difference Learning

### Sarsa


In [8]:
# Sanity check: show states, actions, rewards and terminal states, one per line.
print(S, A, R, T, sep="\n")

[0, 1, 2, 3, 4]
[0, 1]
[-1, 0, 1]
[0, 4]


In [9]:
# Learn a policy on LineWorld with SARSA (10 episodes, starting from state 1).
# NOTE(review): SARSA explores epsilon-greedily, so results can vary between
# runs unless the random generator used inside algorithmes.sarsa is seeded.
policy, Q = sa.sarsa(
    game, S, A, R, lineworld_mdp, T,
    num_episodes=10, gamma=0.999, alpha=0.1, epsilon=0.1, start_state=1,
)
print("Optimal Policy:")
print(policy)
# Q holds one row per state and one column per action, i.e. it is the
# action-value function, not the state-value function V, so label it as such.
print("Action-Value Function (Q):")
print(Q)

38
Optimal Policy:
[[1. 0.]
 [0. 1.]
 [0. 1.]
 [0. 1.]
 [1. 0.]]
Value Function:
[[ 0.          0.        ]
 [-0.3439      0.01581832]
 [-0.00999     0.11415074]
 [ 0.          0.468559  ]
 [ 0.          0.        ]]


In [10]:
# Play the game with the optimal policy
steps, total_reward = lw.play_game(policy, lineworld_mdp, R, T)
print(f"Final Steps: {steps}")
print(f"Final Total Reward: {total_reward}")

__X__
___X_
____X
Final Steps: [2, np.int64(3), np.int64(4)]
Final Total Reward: 1


# GridWorld Environment


## Create GridWorld environment

In [11]:
# Load the "GridWorld" settings from the config file. The result is indexed by
# the keys "states", "actions", "rewards" and "terminals" further down.
# NOTE(review): how load_config resolves its second argument is defined in
# utils.load_config and is not visible here.
config_gridworld = load_config(congig_file, "GridWorld")
# Rebinds the environment identifier used by policy_iteration / sarsa; from here
# on `game` no longer refers to LineWorld.
game = "gridworld"

In [12]:
# Rebind the four MDP ingredients to the GridWorld values in one assignment:
# S = states, A = actions, R = rewards, T = terminal states.
S, A, R, T = (
    config_gridworld["states"],
    config_gridworld["actions"],
    config_gridworld["rewards"],
    config_gridworld["terminals"],
)

In [13]:
# Build the GridWorld MDP from the state, action and reward lists. The returned
# object is what policy_iteration, sarsa and play_game receive below.
# NOTE(review): its exact structure is defined in environnements.gridworld and
# is not visible from this notebook.
gridworld_mdp = gw.create_gridworld(S, A, R)

## Dynamic Programming

### Policy Iteration

In [15]:
# Run policy iteration on GridWorld (discount 0.999) and show the resulting
# policy followed by the state-value vector, each under its own heading.
# NOTE(review): the saved output of this cell ends in KeyboardInterrupt, so the
# last recorded run never reached the prints; check whether policy_iteration
# terminates for this environment before relying on `policy` / `V` from here.
policy, V = pi.policy_iteration(
    game, gridworld_mdp, S, A, R, T, gamma=0.999
)
print("Optimal Policy:", policy, "Value Function:", V, sep="\n")

Iteration: 1
_ _ _ _ _
_ _ _ _ _
_ _ X _ _
_ _ _ _ _
_ _ _ _ _
******************************
_ _ _ _ _
_ _ X _ _
_ _ _ _ _
_ _ _ _ _
_ _ _ _ _
******************************
******************************
_ _ X _ _
_ _ _ _ _
_ _ _ _ _
_ _ _ _ _
_ _ _ _ _
******************************
Steps: [12, np.int64(7), np.int64(2)]
Total Reward: -1


KeyboardInterrupt: 

In [None]:
# Roll out one GridWorld episode following `policy`. The trailing 6 appears to
# be the start state (the saved run of the same call further down begins its
# step list with 6).
# NOTE(review): this cell has no recorded execution. `policy` is whatever the
# most recently executed cell assigned, so make sure the GridWorld policy
# iteration above actually completed before running it.
steps, total_reward = gw.play_game(policy, gridworld_mdp, R, T, 6)
print(f"Final Steps: {steps}", f"Final Total Reward: {total_reward}", sep="\n")

## Temporal Difference Learning

### Sarsa

In [17]:
# Learn a policy on GridWorld with SARSA (1000 episodes, starting from state 6).
# NOTE(review): SARSA explores epsilon-greedily, so results can vary between
# runs unless the random generator used inside algorithmes.sarsa is seeded.
policy, Q = sa.sarsa(
    game, S, A, R, gridworld_mdp, T,
    num_episodes=1000, gamma=0.999, alpha=0.1, epsilon=0.1, start_state=6,
)
print("Optimal Policy:")
print(policy)
# Q holds one row per state and one column per action, i.e. it is the
# action-value function, not the state-value function V, so label it as such.
print("Action-Value Function (Q):")
print(Q)

4468
Optimal Policy:
[[1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [0. 0. 1. 0.]
 [0. 0. 1. 0.]
 [0. 1. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [0. 1. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]]
Value Function:
[[ 0.          0.          0.          0.        ]
 [ 0.          0.          0.          0.        ]
 [ 0.          0.          0.          0.        ]
 [ 0.          0.          0.          0.        ]
 [ 0.          0.          0.          0.        ]
 [ 0.          0.          0.          0.        ]
 [-0.9282102   0.49655813  0.70857835 -0.97218716]
 [-0.92023356  0.49510014  0.7548882   0.63541024]
 [-0.95760884  0.98801749 -0.9282102   0.72415366]
 [ 0.          0.          0.          0.        ]
 [ 0.          0.          0.          0.        ]
 [ 0.68547361  0.         

In [18]:
# Play the game with the optimal policy
steps, total_reward = gw.play_game(policy, gridworld_mdp, R, T, 6)
print(f"Final Steps: {steps}")
print(f"Final Total Reward: {total_reward}")

_ _ _ _ _
_ X _ _ _
_ _ _ _ _
_ _ _ _ _
_ _ _ _ _
******************************
_ _ _ _ _
_ _ X _ _
_ _ _ _ _
_ _ _ _ _
_ _ _ _ _
******************************
******************************
_ _ _ _ _
_ _ _ X _
_ _ _ _ _
_ _ _ _ _
_ _ _ _ _
******************************
******************************
_ _ _ _ _
_ _ _ _ _
_ _ _ X _
_ _ _ _ _
_ _ _ _ _
******************************
******************************
_ _ _ _ _
_ _ _ _ _
_ _ _ _ _
_ _ _ X _
_ _ _ _ _
******************************
Final Steps: [6, np.int64(7), np.int64(8), np.int64(13), np.int64(18)]
Final Total Reward: 1
