# **Temporal difference learning on the Portal GridWorld**


In [1]:
import plotly
plotly.offline.init_notebook_mode()

from package.dyna_q_agent import Dyna_Q_Agent
from package.dyna_q_plus_agent import Dyna_Q_plus_Agent
from package.env import Env
from package.q_learning_agent import Q_learning_Agent
from package.plots import animated_heatmap, plot_average_reward, plot_steps_per_episode

In [9]:
# Instantiate the GridWorld environment with its default configuration.
env = Env()
# Bare last expression: the notebook renders the grid layout as the cell output.
env.grid

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
0,0,0,0,0,0,0,0,0,T,0,0,P
1,0,0,0,0,0,0,0,0,T,0,0,0
2,0,LP,0,0,0,0,0,0,T,G,0,0
3,W,W,W,0,0,0,0,0,T,T,T,T
4,0,0,0,0,0,0,0,0,0,0,0,0
5,0,0,0,W,0,0,0,0,0,0,0,0
6,0,A,0,W,0,0,0,0,0,0,P,0
7,0,0,0,W,0,0,0,0,0,0,0,0


# ***Results***

## ***Q-Learning***

In [10]:
# Q-learning baseline: build the agent, train it, then animate its value estimates.
q_learning = Q_learning_Agent(
    epsilon=0.1,
    gamma=0.9,
    step_size=0.25,
)
# NOTE(review): log_progress=[400] presumably targets the last of the 401
# episodes (0-based) — confirm against the agent's fit() implementation.
q_learning.fit(
    n_episode=401,
    log_progress=[400],
    plot=True,
)
# Kept as the final expression so any returned figure is rendered by the notebook.
animated_heatmap(
    state_value_dict=q_learning.value_estimates,
    agent_name=q_learning.name,
)

## ***Dyna-Q***

In [11]:
# Dyna-Q: build the agent, train it, then animate its value estimates.
# NOTE(review): planning_steps is not passed here, so the class default applies;
# the Dyna-Q+ cell and the comparison cell both pass planning_steps=100 —
# confirm the default matches if these results are meant to be comparable.
dyna_q = Dyna_Q_Agent(
    epsilon=0.1,
    gamma=0.9,
    step_size=0.25,
)
dyna_q.fit(
    n_episode=401,
    log_progress=[400],
    plot=True,
)
# Kept as the final expression so any returned figure is rendered by the notebook.
animated_heatmap(
    state_value_dict=dyna_q.value_estimates,
    agent_name=dyna_q.name,
)

KeyboardInterrupt: 

## ***Dyna-Q+***

In [None]:
# Dyna-Q+: build the agent, train it, then animate its value estimates.
# NOTE(review): step_size is 0.125 here but 0.25 for the same agent class in the
# comparison cell — confirm the difference is intentional.
dyna_q_plus = Dyna_Q_plus_Agent(
    planning_steps=100,
    epsilon=0.1,
    gamma=0.9,
    step_size=0.125,
)
dyna_q_plus.fit(
    n_episode=401,
    log_progress=[400],
    plot=True,
)
# Kept as the final expression so any returned figure is rendered by the notebook.
animated_heatmap(
    state_value_dict=dyna_q_plus.value_estimates,
    agent_name=dyna_q_plus.name,
)

# ***Comparison***

In [4]:
import numpy as np
import pandas as pd
import os 
from tqdm.auto import tqdm


# Constructor keyword arguments for each agent class in the comparison.
# All agents share epsilon, gamma and step_size; the two Dyna variants also
# receive the same number of planning steps.
agents_parameters = {
    Q_learning_Agent: {
        "epsilon": 0.1,
        "gamma": 0.9,
        "step_size": 0.25,
    },
    Dyna_Q_Agent: {
        "planning_steps": 100,
        "epsilon": 0.1,
        "gamma": 0.9,
        "step_size": 0.25,
    },
    Dyna_Q_plus_Agent: {
        "planning_steps": 100,
        "epsilon": 0.1,
        "gamma": 0.9,
        "step_size": 0.25,
    },
}

num_runs = 100
num_episodes = 250
random_seeds = np.arange(num_runs) + 100  # avoid seed 17, only used for testing

# Create the results directory once, up front: it does not depend on the agent,
# so there is no reason to repeat the call on every iteration of the outer loop.
os.makedirs("results", exist_ok=True)

# WARNING: this is a long computation (the recorded outputs show several minutes
# for Q-learning and roughly 47-48 minutes for each Dyna variant).
for agent_class, agent_parameters in agents_parameters.items():
    # Resolve the agent name once, from an agent built with the same parameters
    # as the runs. It is used for both the progress label and the CSV file name,
    # so the file name no longer relies on a variable leaked from the inner loop.
    agent_name = agent_class(**agent_parameters).name
    print(agent_name)
    agent_results = []

    for seed in tqdm(random_seeds, position=0, leave=True):
        # Instantiate a new agent for each run so no learning carries over.
        agent = agent_class(**agent_parameters)
        # Use a different random seed for each run.
        agent.random_generator = np.random.RandomState(seed=seed)
        agent.fit(
            n_episode=num_episodes, log_progress=[num_episodes - 1], plot=False
        )

        # Keep this run's per-episode results.
        agent_results.append(agent.episodes)

    # Stack the runs into one frame, keyed by run number on the outer index level.
    agent_results_concatenated = pd.concat(
        agent_results, keys=range(num_runs), names=["Run", "episode"]
    )

    # Write the concatenated results of all runs to one CSV file per agent.
    results_file = f"results/{agent_name}_results.csv"
    agent_results_concatenated.to_csv(
        results_file,
        header=["steps", "reward", "is_optimal"],
        index_label=["Run", "episode"],
    )

# Compare the agents' performances.
# NOTE(review): the plot helpers receive default-constructed agents; presumably
# they only use each agent's name to locate its results CSV — confirm.
plot_average_reward(Q_learning_Agent(), Dyna_Q_Agent(), Dyna_Q_plus_Agent())
plot_steps_per_episode(Q_learning_Agent(), Dyna_Q_Agent(), Dyna_Q_plus_Agent())

Q-learning


100%|██████████| 100/100 [04:18<00:00,  2.59s/it]


Dyna-Q


100%|██████████| 100/100 [47:05<00:00, 28.26s/it]


Dyna-Q_plus


100%|██████████| 100/100 [48:08<00:00, 28.88s/it]
