In [1]:
import algorithmes.sarsa as sa
import algorithmes.dyna_q as dq
import algorithmes.q_learning as ql

## Dynamic Programming

### Policy Iteration

Iteration: 1
__X__
_X___
X____
Steps: [2, np.int64(1), np.int64(0)]
Total Reward: -1
Iteration: 2
__X__
___X_
____X
Steps: [2, np.int64(3), np.int64(4)]
Total Reward: 1
Optimal Policy:
[[1. 0.]
 [0. 1.]
 [0. 1.]
 [0. 1.]
 [1. 0.]]
Value Function:
[0.       0.998001 0.999    1.       0.      ]


__X__
___X_
____X
Final Steps: [2, np.int64(3), np.int64(4)]
Final Total Reward: 1


## Temporal Difference Learning

### Q-Learning

In [2]:
# Hyper-parameter grid for the Q-Learning runs: the full cross product of
# 4 alpha x 4 epsilon x 4 nb_iter values (64 settings), with gamma held at 0.999.
# Ordering is alpha-major, then epsilon, then nb_iter (fastest varying).
# NOTE(review): alpha/epsilon/gamma are presumably learning rate, exploration
# rate and discount factor -- confirm against algorithmes.q_learning.play_game.
param_combinations = [
    dict(alpha=alpha, epsilon=epsilon, gamma=0.999, nb_iter=nb_iter)
    for alpha in (0.01, 0.05, 0.1, 0.2)
    for epsilon in (0.01, 0.1, 0.2, 0.5)
    for nb_iter in (100, 1000, 10000, 100000)
]

# Environment names handed to play_game, in the order they are run.
games = [
    "LineWorld",
    "GridWorld",
    "SecretEnv0",
    "SecretEnv1",
]

In [3]:
# Run Q-Learning for every hyper-parameter combination on every game.
# The results file depends only on the game, so its path is built once per game.
# NOTE(review): every combination of a game is passed the same results file;
# whether play_game appends to it or overwrites it is not visible here -- confirm.
for game in games:
    results_path = f"./result/{game}_q_learning.pkl"
    for params in param_combinations:
        ql.play_game(game, params, results_path)

100%|██████████| 100/100 [00:00<?, ?it/s]
100%|██████████| 1000/1000 [00:00<?, ?it/s]
100%|██████████| 10000/10000 [00:00<00:00, 79677.06it/s]
100%|██████████| 100000/100000 [00:01<00:00, 73799.30it/s]
100%|██████████| 100/100 [00:00<?, ?it/s]
100%|██████████| 1000/1000 [00:00<00:00, 63994.14it/s]
100%|██████████| 10000/10000 [00:00<00:00, 70721.90it/s]
100%|██████████| 100000/100000 [00:01<00:00, 86220.31it/s]
100%|██████████| 100/100 [00:00<?, ?it/s]
100%|██████████| 1000/1000 [00:00<00:00, 31994.39it/s]
100%|██████████| 10000/10000 [00:00<00:00, 52360.27it/s]
100%|██████████| 100000/100000 [00:01<00:00, 76911.46it/s]
100%|██████████| 100/100 [00:00<?, ?it/s]
100%|██████████| 1000/1000 [00:00<00:00, 31600.99it/s]
100%|██████████| 10000/10000 [00:00<00:00, 62861.16it/s]
100%|██████████| 100000/100000 [00:01<00:00, 62247.21it/s]
100%|██████████| 100/100 [00:00<00:00, 3618.81it/s]
100%|██████████| 1000/1000 [00:00<00:00, 64000.98it/s]
100%|██████████| 10000/10000 [00:00<00:00, 63680.70i

### Sarsa


In [2]:
# Hyper-parameter grid for the Sarsa runs: every (alpha, epsilon, nb_iter)
# triple from the value lists below -- 4 x 4 x 4 = 64 settings -- with gamma
# fixed at 0.999. alpha varies slowest, nb_iter fastest.
# NOTE(review): parameter meanings are inferred from their names only;
# verify against algorithmes.sarsa.play_game.
param_combinations = [
    dict(alpha=alpha, epsilon=epsilon, gamma=0.999, nb_iter=nb_iter)
    for alpha in (0.01, 0.05, 0.1, 0.2)
    for epsilon in (0.01, 0.1, 0.2, 0.5)
    for nb_iter in (100, 1000, 10000, 100000)
]

# Environments to evaluate, in execution order.
games = [
    "LineWorld",
    "GridWorld",
    "SecretEnv0",
    "SecretEnv1",
]

In [3]:
# Run Sarsa for every hyper-parameter combination on every game.
# Results go to a Sarsa-specific file: the previous path reused the
# "{game}_q_learning.pkl" name, which sent Sarsa runs into the Q-Learning
# results file. The name now follows the per-algorithm pattern used for
# Dyna-Q ("{game}_dyna_q.pkl").
# NOTE(review): every combination of a game is passed the same results file;
# whether play_game appends to it or overwrites it is not visible here -- confirm.
for game in games:
    results_path = f"./result/{game}_sarsa.pkl"
    for params in param_combinations:
        sa.play_game(game, params, results_path)

100%|██████████| 100/100 [00:00<00:00, 9517.80it/s]
100%|██████████| 1000/1000 [00:00<00:00, 63970.72it/s]
100%|██████████| 10000/10000 [00:00<00:00, 53061.76it/s]
100%|██████████| 100000/100000 [00:00<00:00, 115265.37it/s]
100%|██████████| 100/100 [00:00<?, ?it/s]
100%|██████████| 1000/1000 [00:00<?, ?it/s]
100%|██████████| 10000/10000 [00:00<00:00, 106223.90it/s]
100%|██████████| 100000/100000 [00:00<00:00, 112993.44it/s]
100%|██████████| 100/100 [00:00<00:00, 6400.59it/s]
100%|██████████| 1000/1000 [00:00<00:00, 42320.11it/s]
100%|██████████| 10000/10000 [00:00<00:00, 106228.21it/s]
 76%|███████▋  | 76357/100000 [00:00<00:00, 110834.19it/s]


KeyboardInterrupt: 

## Planning

### Dyna-Q

In [28]:
# Hyper-parameter grid for the Dyna-Q runs: the cross product of
# 4 alpha x 4 epsilon x 4 nb_iter x 4 n_planning values (256 settings), with
# gamma held at 0.999. alpha varies slowest and n_planning fastest.
# NOTE(review): n_planning is presumably the number of planning updates per
# real step -- confirm against algorithmes.dyna_q.play_game.
param_combinations = [
    dict(
        alpha=alpha,
        epsilon=epsilon,
        gamma=0.999,
        nb_iter=nb_iter,
        n_planning=n_planning,
    )
    for alpha in (0.01, 0.05, 0.1, 0.2)
    for epsilon in (0.01, 0.1, 0.2, 0.5)
    for nb_iter in (100, 1000, 10000, 100000)
    for n_planning in (1, 10, 100, 500)
]

# Environment names handed to play_game, in the order they are run.
games = [
    "LineWorld",
    "GridWorld",
    "SecretEnv0",
    "SecretEnv1",
]

Optimal Policy:
[[1. 0.]
 [0. 1.]
 [0. 1.]
 [0. 1.]
 [1. 0.]]
Value Function:
[[ 0.        0.      ]
 [-1.        0.998001]
 [ 0.997003  0.999   ]
 [ 0.998001  1.      ]
 [ 0.        0.      ]]


In [29]:
# Run Dyna-Q for every hyper-parameter combination on every game.
# The results file depends only on the game, so its path is built once per game.
# NOTE(review): every combination of a game is passed the same results file;
# whether play_game appends to it or overwrites it is not visible here -- confirm.
for game in games:
    results_path = f"./result/{game}_dyna_q.pkl"
    for params in param_combinations:
        dq.play_game(game, params, results_path)

__X__
___X_
____X
Final Steps: [2, np.int64(3), np.int64(4)]
Final Total Reward: 1
