In [None]:
import algorithmes.policy_iteration as pi
import algorithmes.sarsa as sa
import algorithmes.dyna_q as dq
import algorithmes.dyna_q_plus as dqp
import environnements.lineworld as lw
import environnements.gridworld as gw
from utils import load_config
import secret_envs_wrapper as envs

In [None]:
# Name of the configuration file handed to load_config() by the environment setup cells.
# NOTE(review): the variable name is misspelled ("congig" instead of "config"); it is kept
# unchanged because later cells reference it by this exact name.
congig_file = "config.yaml"

# LineWorld Environment

## Create the LineWorld environment

In [None]:
# Ask the project's load_config helper for the "LineWorld" entry of the config file
# (presumably the matching section of config.yaml — confirm in utils.load_config).
config_lineworld = load_config(congig_file, "LineWorld")
# Environment identifier; passed as the first argument to policy_iteration and sarsa below.
game = "lineworld"

In [None]:
# Pull the four LineWorld settings out of the loaded configuration in one step:
# S <- "states", A <- "actions", R <- "rewards", T <- "terminals".
S, A, R, T = (
    config_lineworld[field]
    for field in ("states", "actions", "rewards", "terminals")
)

In [None]:
# Build the LineWorld model object from the configured S, A and R; the result is
# passed to every LineWorld algorithm and play_game call below.
lineworld_mdp = lw.create_lineworld(S, A, R)

## Dynamic Programming

### Policy Iteration

In [None]:
# Run policy iteration on the LineWorld model with gamma=0.999.
policy, V = pi.policy_iteration(
    game,
    lineworld_mdp,
    S,
    A,
    R,
    T,
    gamma=0.999,
)

# Display the returned policy and V, each under its own heading line.
print("Optimal Policy:", policy, sep="\n")
print("Value Function:", V, sep="\n")

In [None]:
# Roll out the current `policy` (the policy-iteration result when cells run in order)
# in LineWorld and report the step count and total reward returned by play_game.
steps, total_reward = lw.play_game(policy, lineworld_mdp, R, T)
print(
    f"Final Steps: {steps}",
    f"Final Total Reward: {total_reward}",
    sep="\n",
)

## Temporal Difference Learning

### Sarsa


In [None]:
# Show the LineWorld configuration values (S, A, R, T), one per line, for inspection.
print(S, A, R, T, sep="\n")

In [None]:
# Train Sarsa on LineWorld: 10 episodes, beginning from start_state=1.
policy, Q = sa.sarsa(
    game,
    S,
    A,
    R,
    lineworld_mdp,
    T,
    num_episodes=10,
    gamma=0.999,
    alpha=0.1,
    epsilon=0.1,
    start_state=1,
)

# Display the returned policy and Q.
# NOTE(review): the second heading reads "Value Function" although the object printed
# is Q (presumably an action-value table) — consider relabelling.
print("Optimal Policy:", policy, sep="\n")
print("Value Function:", Q, sep="\n")

In [None]:
# Roll out the current `policy` (the Sarsa result when cells run in order) in
# LineWorld and report the step count and total reward returned by play_game.
steps, total_reward = lw.play_game(policy, lineworld_mdp, R, T)
print(
    f"Final Steps: {steps}",
    f"Final Total Reward: {total_reward}",
    sep="\n",
)

## Planning

### Dyna-Q

In [None]:
# Train Dyna-Q on LineWorld: 100 episodes with 100 planning steps, from start_state=1.
# Unlike policy_iteration and sarsa, this call takes no `game` argument.
policy, Q = dq.dyna_q(
    S,
    A,
    R,
    lineworld_mdp,
    T,
    n_episodes=100,
    n_planning_steps=100,
    alpha=0.1,
    gamma=0.999,
    epsilon=0.1,
    start_state=1,
)

# Display the returned policy and Q, each under its own heading line.
print("Optimal Policy:", policy, sep="\n")
print("Value Function:", Q, sep="\n")

In [None]:
# Roll out the current `policy` (the Dyna-Q result when cells run in order) in
# LineWorld and report the step count and total reward returned by play_game.
steps, total_reward = lw.play_game(policy, lineworld_mdp, R, T)
print(
    f"Final Steps: {steps}",
    f"Final Total Reward: {total_reward}",
    sep="\n",
)

### Dyna-Q+

In [None]:
# Train Dyna-Q+ on LineWorld: 500 episodes with 300 planning steps and k=0.03,
# from start_state=1.
policy, Q = dqp.dyna_q_plus(
    S,
    A,
    R,
    lineworld_mdp,
    T,
    n_episodes=500,
    n_planning_steps=300,
    alpha=0.1,
    gamma=0.999,
    k=0.03,
    epsilon=0.1,
    start_state=1,
)

# Display the returned policy and Q, each under its own heading line.
print("Optimal Policy:", policy, sep="\n")
print("Value Function:", Q, sep="\n")

In [None]:
# Roll out the current `policy` (the Dyna-Q+ result when cells run in order) in
# LineWorld and report the step count and total reward returned by play_game.
steps, total_reward = lw.play_game(policy, lineworld_mdp, R, T)
print(
    f"Final Steps: {steps}",
    f"Final Total Reward: {total_reward}",
    sep="\n",
)

# GridWorld Environment


## Create the GridWorld environment

In [None]:
# Ask the project's load_config helper for the "GridWorld" entry of the config file
# (presumably the matching section of config.yaml — confirm in utils.load_config).
config_gridworld = load_config(congig_file, "GridWorld")
# Environment identifier; this rebinds `game`, which held "lineworld" in the LineWorld
# section, so LineWorld cells re-run after this point will see "gridworld".
game = "gridworld"

In [None]:
# Pull the four GridWorld settings out of the loaded configuration in one step:
# S <- "states", A <- "actions", R <- "rewards", T <- "terminals".
# These rebind the same S/A/R/T names used by the LineWorld section.
S, A, R, T = (
    config_gridworld[field]
    for field in ("states", "actions", "rewards", "terminals")
)

In [None]:
# Build the GridWorld model object from the configured S, A and R; the result is
# passed to every GridWorld algorithm and play_game call below.
gridworld_mdp = gw.create_gridworld(S, A, R)

## Dynamic Programming

### Policy Iteration

In [None]:
# Run policy iteration on the GridWorld model with gamma=0.999.
policy, V = pi.policy_iteration(
    game,
    gridworld_mdp,
    S,
    A,
    R,
    T,
    gamma=0.999,
)

# Display the returned policy and V, each under its own heading line.
print("Optimal Policy:", policy, sep="\n")
print("Value Function:", V, sep="\n")

In [None]:
# Roll out the current `policy` (the policy-iteration result when cells run in order)
# in GridWorld and report the step count and total reward returned by play_game.
# The trailing 6 matches the start_state used by the GridWorld training cells —
# presumably the start state; confirm against gw.play_game.
steps, total_reward = gw.play_game(policy, gridworld_mdp, R, T, 6)
print(
    f"Final Steps: {steps}",
    f"Final Total Reward: {total_reward}",
    sep="\n",
)

## Temporal Difference Learning

### Sarsa

In [None]:
# Train Sarsa on GridWorld: 1000 episodes, beginning from start_state=6.
policy, Q = sa.sarsa(
    game,
    S,
    A,
    R,
    gridworld_mdp,
    T,
    num_episodes=1000,
    gamma=0.999,
    alpha=0.1,
    epsilon=0.1,
    start_state=6,
)

# Display the returned policy and Q.
# NOTE(review): the second heading reads "Value Function" although the object printed
# is Q (presumably an action-value table) — consider relabelling.
print("Optimal Policy:", policy, sep="\n")
print("Value Function:", Q, sep="\n")

In [None]:
# Roll out the current `policy` (the Sarsa result when cells run in order) in
# GridWorld and report the step count and total reward returned by play_game.
# The trailing 6 matches start_state=6 from training — presumably the start state.
steps, total_reward = gw.play_game(policy, gridworld_mdp, R, T, 6)
print(
    f"Final Steps: {steps}",
    f"Final Total Reward: {total_reward}",
    sep="\n",
)

## Planning

### Dyna-Q

In [None]:
# Train Dyna-Q on GridWorld: 100 episodes with 100 planning steps, from start_state=6.
# Unlike policy_iteration and sarsa, this call takes no `game` argument.
policy, Q = dq.dyna_q(
    S,
    A,
    R,
    gridworld_mdp,
    T,
    n_episodes=100,
    n_planning_steps=100,
    alpha=0.1,
    gamma=0.999,
    epsilon=0.1,
    start_state=6,
)

# Display the returned policy and Q, each under its own heading line.
print("Optimal Policy:", policy, sep="\n")
print("Value Function:", Q, sep="\n")

In [None]:
# Roll out the current `policy` (the Dyna-Q result when cells run in order) in
# GridWorld and report the step count and total reward returned by play_game.
# The trailing 6 matches start_state=6 from training — presumably the start state.
steps, total_reward = gw.play_game(policy, gridworld_mdp, R, T, 6)
print(
    f"Final Steps: {steps}",
    f"Final Total Reward: {total_reward}",
    sep="\n",
)

### Dyna-Q+

In [None]:
# Train Dyna-Q+ on GridWorld: 100 episodes with 100 planning steps and k=0.5,
# from start_state=6.
policy, Q = dqp.dyna_q_plus(
    S,
    A,
    R,
    gridworld_mdp,
    T,
    n_episodes=100,
    n_planning_steps=100,
    alpha=0.1,
    gamma=0.999,
    k=0.5,
    epsilon=0.1,
    start_state=6,
)

# Display the returned policy and Q, each under its own heading line.
print("Optimal Policy:", policy, sep="\n")
print("Value Function:", Q, sep="\n")

In [None]:
# Roll out the current `policy` (the Dyna-Q+ result when cells run in order) in
# GridWorld and report the step count and total reward returned by play_game.
# The trailing 6 matches start_state=6 from training — presumably the start state.
steps, total_reward = gw.play_game(policy, gridworld_mdp, R, T, 6)
print(
    f"Final Steps: {steps}",
    f"Final Total Reward: {total_reward}",
    sep="\n",
)

# Secret Environments

## Secret Environment 1